Created
July 3, 2024 08:25
-
-
Save happyme531/65f3703d9bc9b764d165edd7115db51f to your computer and use it in GitHub Desktop.
考虑中文数字的自然排序
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import re | |
| import pypinyin | |
| from functools import cmp_to_key | |
# Numeric value of every Chinese numeral character handled by this module:
# the digits 零-九 first, then the unit characters 十, 百, 千, 万, 亿 and 兆.
dic = {digit: value for value, digit in enumerate("零一二三四五六七八九")}
dic.update({
    "十": 10,
    "百": 100,
    "千": 1000,
    "万": 10000,
    "亿": 100000000,
    "兆": 1000000000000,
})
def chi_num(text):
    """Convert a string of Chinese numeral characters to an integer.

    The text is handled in one of four ways:

    * an empty string yields 0;
    * text without any unit character (十 百 千 万 亿 兆) is read
      positionally by ``simply_convert`` ("一二三" -> 123);
    * text containing 兆, 亿 or 万 is split at the LAST occurrence of the
      largest such unit; the part before it is the multiplier and the part
      after it is added, both evaluated recursively;
    * anything else (only 十/百/千 units) goes to ``decwise_add``.

    Splitting at the last occurrence and evaluating the multiplier
    recursively makes stacked units work ("四万万" -> 400000000,
    "一百二十万亿" -> 1200000 * 100000000), which a first-occurrence split
    cannot handle.

    Args:
        text: String made only of characters that are keys of ``dic``.

    Returns:
        The integer value of ``text``.

    Raises:
        KeyError: If ``text`` contains a character that is not in ``dic``.
    """
    if not text:
        return 0
    # No unit characters at all: plain digit-by-digit reading.
    if not set(text).intersection("十百千万亿兆"):
        return simply_convert(text)
    # Try the big units from largest to smallest.
    for unit in "兆亿万":
        if unit in text:
            multiplier, remainder = text.rsplit(unit, 1)
            # A multiplier that evaluates to 0 (for instance the empty string
            # in a bare "万") is treated as 1, so "万" alone is 10000.
            return max(chi_num(multiplier), 1) * dic[unit] + chi_num(remainder)
    return decwise_add(text)
def simply_convert(text):
    """Read Chinese digits positionally, e.g. "一二三" -> 123.

    Each character is replaced by the decimal text of its ``dic`` value and
    the concatenation is parsed as one integer.

    Raises:
        KeyError: If a character is not a key of ``dic``.
        ValueError: If ``text`` is empty (there is nothing to parse).
    """
    pieces = [str(dic[char]) for char in text]
    return int("".join(pieces))
# Numbers below ten thousand (such as "一千二百三十四") are evaluated as
# 1*1000 + 2*100 + 3*10 + 4.
def decwise_add(text):
    """Evaluate a Chinese numeral by multiplying units with their digit and summing.

    Every character is mapped to its ``dic`` value. A unit (a value above 9)
    absorbs the value immediately before it by multiplication, but only when
    that value is smaller than the unit. A unit that follows a larger unit is
    simply added, so an omitted "一" is read correctly:
    "一百十" -> 100 + 10 = 110 (as in the chapter heading "第一百十回"),
    rather than 100 * 10.

    Args:
        text: String made only of characters that are keys of ``dic``.

    Returns:
        The integer value; 0 for an empty string.

    Raises:
        KeyError: If a character is not a key of ``dic``.
    """
    # Map each character to its numeric value.
    values = [dic[char] for char in text]
    for i in range(1, len(values)):
        previous, current = values[i - 1], values[i]
        # Only a smaller preceding value acts as the unit's multiplier.
        if current > 9 and previous < current:
            values[i] = current * previous
            values[i - 1] = 0
    return sum(values)
def replace_cn_num(input_str):
    """Return ``input_str`` with every run of Chinese numerals written in Arabic digits.

    Each maximal run of the characters 零-九, 十, 百, 千, 万, 亿, 兆 is passed
    to ``chi_num`` and replaced by the decimal text of the result; all other
    characters are left untouched.
    """
    def to_arabic(match):
        return str(chi_num(match.group()))

    return re.sub(r'[零一二三四五六七八九十百千万亿兆]+', to_arabic, input_str)
def iscjk(char):
    """Return True when the single character lies in U+4E00..U+9FFF."""
    code_point = ord(char)
    return code_point >= 0x4E00 and code_point <= 0x9FFF
def isEn(char):
    """Return True when the single character is an ASCII letter (A-Z or a-z)."""
    code_point = ord(char)
    if 0x0041 <= code_point <= 0x005A:
        # Upper-case range.
        return True
    # Otherwise it must fall in the lower-case range.
    return 0x0061 <= code_point <= 0x007A
def isDigit(char):
    """Return True when the single character is an ASCII digit (0-9)."""
    # range() excludes its end, so 0x3A makes 0x39 ("9") the last accepted value.
    return ord(char) in range(0x0030, 0x003A)
def cn_natcompare(s1, s2):
    """Three-way natural-order comparison of two strings.

    Both strings first have their Chinese numerals rewritten as Arabic digits
    (``replace_cn_num``) and are then walked character by character:

    * a string that runs out first sorts first;
    * a CJK character (per ``iscjk``) sorts after any non-CJK character;
    * two CJK characters are ordered by their pinyin strings;
    * when both positions hold digits, the whole digit runs are compared as
      integers;
    * every other pair (English letters included) is ordered by plain
      character comparison.

    Returns:
        -1 if ``s1`` sorts before ``s2``, 1 if after, 0 if no difference is
        found. Suitable for ``functools.cmp_to_key``.
    """
    # 1. Chinese numerals -> Arabic digits, then break into single characters.
    chars1 = list(replace_cn_num(s1))
    chars2 = list(replace_cn_num(s2))
    # 2. Pinyin lookup tables, indexed like the character lists.
    # NOTE(review): assumes lazy_pinyin yields exactly one entry per input
    # character - confirm against the pypinyin documentation.
    pinyin1 = pypinyin.lazy_pinyin(chars1)
    pinyin2 = pypinyin.lazy_pinyin(chars2)

    end1, end2 = len(chars1), len(chars2)
    pos1 = pos2 = 0
    # 3. Walk both strings until a difference is found.
    while True:
        # The exhausted (shorter) string sorts first.
        done1, done2 = pos1 == end1, pos2 == end2
        if done1 and done2:
            return 0
        if done1:
            return -1
        if done2:
            return 1

        c1, c2 = chars1[pos1], chars2[pos2]
        cjk1, cjk2 = iscjk(c1), iscjk(c2)

        # Exactly one side is CJK: the CJK side sorts later.
        if cjk1 != cjk2:
            return 1 if cjk1 else -1

        if cjk1:
            # Both CJK: order by pinyin; identical pinyin means keep scanning.
            if pinyin1[pos1] != pinyin2[pos2]:
                return -1 if pinyin1[pos1] < pinyin2[pos2] else 1
        elif isDigit(c1) and isDigit(c2):
            # Both digits: consume the complete digit run on each side and
            # compare the runs by numeric value.
            stop1 = pos1
            while stop1 < end1 and isDigit(chars1[stop1]):
                stop1 += 1
            stop2 = pos2
            while stop2 < end2 and isDigit(chars2[stop2]):
                stop2 += 1
            value1 = int("".join(chars1[pos1:stop1]))
            value2 = int("".join(chars2[pos2:stop2]))
            pos1, pos2 = stop1, stop2
            if value1 != value2:
                return -1 if value1 < value2 else 1
            # Equal numbers: the positions already point past the runs.
            continue
        elif c1 != c2:
            # English letters and all remaining characters: plain comparison.
            return -1 if c1 < c2 else 1

        pos1 += 1
        pos2 += 1
# Collect the entries of the current working directory and order them with
# the Chinese-aware natural comparison.
files = sorted(os.listdir('.'), key=cmp_to_key(cn_natcompare))
# Print the sorted names, one per line.
for file in files:
    print(file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment