Skip to content

Instantly share code, notes, and snippets.

@happyme531
Created July 3, 2024 08:25
Show Gist options
  • Select an option

  • Save happyme531/65f3703d9bc9b764d165edd7115db51f to your computer and use it in GitHub Desktop.

Select an option

Save happyme531/65f3703d9bc9b764d165edd7115db51f to your computer and use it in GitHub Desktop.
考虑中文数字的自然排序
import os
import re
import pypinyin
from functools import cmp_to_key
# Numeric value of every Chinese numeral character the converter understands.
# Values 0-9 are digits; everything from 10 upwards is a multiplier unit.
dic = {"零":0,"一":1,"二":2,"三":3,"四":4,"五":5,"六":6,"七":7,"八":8,"九":9,"十":10,"百":100,"千":1000,"万":10000,"亿":100000000,"兆":1000000000000}


def chi_num(text):
    """Convert a string of Chinese numeral characters to an int.

    Args:
        text: A string made only of characters that are keys of ``dic``.

    Returns:
        The integer value; 0 for an empty string.

    Raises:
        KeyError: If ``text`` contains a character that is not in ``dic``.
    """
    if len(text) == 0:
        return 0
    # No unit characters at all: read the digits positionally ("二零二四" -> 2024).
    if not set(text).intersection("十百千万亿兆"):
        return simply_convert(text)
    # Split on the LAST occurrence of the largest unit present. Whatever
    # precedes it is that unit's multiplier and may itself contain smaller
    # (or repeated) large units, so it is converted recursively:
    # "十二万亿" -> 120000 * 10**8, "四万万" -> 40000 * 10**4.
    for unit in "兆亿万":
        if unit in text:
            head, _, tail = text.rpartition(unit)
            # An empty (or zero) multiplier means a bare unit, i.e. 1 * unit.
            return max(chi_num(head), 1) * dic[unit] + chi_num(tail)
    # Only 十/百/千 are left: plain digit-times-unit accumulation.
    return decwise_add(text)


def simply_convert(text):
    """Read ``text`` digit by digit, concatenating each character's value.

    For example "一零" becomes 10. Raises ``ValueError`` for an empty string
    (``int("")``) and ``KeyError`` for a character that is not in ``dic``.
    """
    return int("".join(str(dic[ch]) for ch in text))


# Numbers below ten thousand (e.g. "一千二百三十四") are converted as
# 1*1000 + 2*100 + 3*10 + 4.
def decwise_add(text):
    """Sum ``text`` as digit-times-unit terms.

    A unit multiplies the digit written directly before it. A unit with no
    digit before it (at the start, or right after another unit) counts as
    one of that unit, so "十二" is 12 and "一百十" is 110. Digits that are
    not followed by a unit are added as they are.

    Raises:
        KeyError: If ``text`` contains a character that is not in ``dic``.
    """
    total = 0
    pending_digit = None  # most recent digit not yet claimed by a unit
    for ch in text:
        value = dic[ch]
        if value > 9:
            # Unit: pair it with the pending digit, or let it stand alone.
            total += value if pending_digit is None else pending_digit * value
            pending_digit = None
        else:
            # Digit: the previous digit (if any) had no unit, so add it as-is.
            if pending_digit is not None:
                total += pending_digit
            pending_digit = value
    if pending_digit is not None:
        total += pending_digit
    return total
def replace_cn_num(input_str):
    """Return ``input_str`` with every run of Chinese numeral characters
    replaced by the decimal digits of the value ``chi_num`` gives for it."""

    def _as_arabic(match):
        # Convert one matched run of numeral characters to its digit string.
        return str(chi_num(match.group()))

    return re.sub(r'[零一二三四五六七八九十百千万亿兆]+', _as_arabic, input_str)
def iscjk(char):
    """Return True if ``char`` has a code point in U+4E00..U+9FFF (inclusive)."""
    code_point = ord(char)
    return code_point >= 0x4E00 and code_point <= 0x9FFF
def isEn(char):
    """Return True if ``char`` is an ASCII letter (A-Z or a-z)."""
    code_point = ord(char)
    is_upper = 0x0041 <= code_point <= 0x005A
    is_lower = 0x0061 <= code_point <= 0x007A
    return is_upper or is_lower
def isDigit(char):
    """Return True if ``char`` is an ASCII digit (0-9)."""
    code_point = ord(char)
    return code_point >= 0x0030 and code_point <= 0x0039
def cn_natcompare(s1, s2):
    """Three-way natural-order comparison that is aware of Chinese text.

    Chinese numerals in both strings are first rewritten as Arabic digits.
    The strings are then walked in parallel:

    * a CJK character sorts after any non-CJK character;
    * two CJK characters are ordered by their pinyin;
    * two runs of ASCII digits are ordered by numeric value;
    * any other pair is ordered by plain character comparison.

    If one string runs out first, the shorter one sorts first.

    Returns:
        -1 if ``s1`` sorts before ``s2``, 1 if after, 0 if no difference
        was found.
    """
    # Step 1: Chinese numerals -> Arabic digits, then split into characters.
    left = list(replace_cn_num(s1))
    right = list(replace_cn_num(s2))
    # Step 2: pinyin for each character, indexed in parallel with the
    # character lists (assumes lazy_pinyin yields one entry per list element).
    left_pinyin = pypinyin.lazy_pinyin(left)
    right_pinyin = pypinyin.lazy_pinyin(right)
    left_len = len(left)
    right_len = len(right)

    def _order(a, b):
        # Only called with a != b.
        return -1 if a < b else 1

    # Step 3: walk both character lists.
    li = 0
    ri = 0
    while True:
        # Whichever side is exhausted first sorts first.
        left_done = li == left_len
        right_done = ri == right_len
        if left_done and right_done:
            return 0
        if left_done:
            return -1
        if right_done:
            return 1

        lch = left[li]
        rch = right[ri]
        l_cjk = iscjk(lch)
        r_cjk = iscjk(rch)

        # Exactly one side is CJK: the CJK side sorts later.
        if l_cjk != r_cjk:
            return 1 if l_cjk else -1

        # Both sides start a digit run: compare the whole runs as integers.
        if not l_cjk and isDigit(lch) and isDigit(rch):
            l_start = li
            while li < left_len and isDigit(left[li]):
                li += 1
            r_start = ri
            while ri < right_len and isDigit(right[ri]):
                ri += 1
            l_num = int("".join(left[l_start:li]))
            r_num = int("".join(right[r_start:ri]))
            if l_num != r_num:
                return _order(l_num, r_num)
            # Equal values: both cursors already sit past their runs.
            continue

        # Both CJK: order by pinyin string. Otherwise (letters or any other
        # characters): order by the characters themselves.
        if l_cjk:
            l_key, r_key = left_pinyin[li], right_pinyin[ri]
        else:
            l_key, r_key = lch, rch
        if l_key != r_key:
            return _order(l_key, r_key)

        li += 1
        ri += 1
# List the entries of the current directory, order them with the
# Chinese-aware natural comparison, and print one name per line.
files = sorted(os.listdir('.'), key=cmp_to_key(cn_natcompare))
for file in files:
    print(file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment