Skip to content

Instantly share code, notes, and snippets.

@rains31
Last active August 29, 2015 14:05
Show Gist options
  • Select an option

  • Save rains31/e3cba90963d9590551c0 to your computer and use it in GitHub Desktop.

Select an option

Save rains31/e3cba90963d9590551c0 to your computer and use it in GitHub Desktop.
mdcode.py
#!/usr/bin/python2
#coding: utf-8
#这个函数是从网上摘的
def is_zh (c):
try:
x = ord (c)
except:
return False
# Punct & Radicals
if x >= 0x2e80 and x <= 0x33ff:
return True
# Fullwidth Latin Characters
elif x >= 0xff00 and x <= 0xffef:
return True
# CJK Unified Ideographs &
# CJK Unified Ideographs Extension A
elif x >= 0x4e00 and x <= 0x9fbb:
return True
# CJK Compatibility Ideographs
elif x >= 0xf900 and x <= 0xfad9:
return True
# CJK Unified Ideographs Extension B
elif x >= 0x20000 and x <= 0x2a6d6:
return True
# CJK Compatibility Supplement
elif x >= 0x2f800 and x <= 0x2fa1d:
return True
else:
return False
def mdcode( str, encoding='utf-8' ):
if isinstance(str, unicode):
return str.encode(encoding)
for c in ('utf-8', 'gb18030', 'gbk', 'gb2312','utf-16'):
try:
if encoding == 'unicode':
return str.decode(c)
else:
return str.decode(c).encode( encoding )
except:
pass
raise 'Unknown charset'
def mdcode_char( str, encoding='utf-8' ):
str_list = list( str )
#print str_list
str_result = []
pos = 0
while pos < len(str_list):
#print pos
#小于127的属于字母、数字等
if ord( str_list[pos] ) <= 127 :
str_result.append(str_list[pos])
pos += 1
continue
#test utf8 3个字符
#utf8 占3个字节,先取三个字节看是不是utf8
c = ''.join( str_list[pos:pos+3] )
try:
zh = c.decode('utf-8')
flag = True
except:
flag = False
zh = ''
#如果可以用decode对3个字节用utf8解码,则确定可能是utf8的汉字
#再用 is_zh 函数 判断是否符合汉字的编码范围
#print flag,zh,is_zh(zh)
if flag and is_zh( zh ):
pos += 3
str_result.append(zh)
continue
#test gb18030 2个字符
#gb18030 占2个字节,先取两个字节看是不是gb18030
c = ''.join( str_list[pos:pos+2] )
try:
zh = c.decode('gb18030')
flag = True
except:
flag = False
#如果可以用decode对2个字节用gb18030解码,则确定可能是gb18030的汉字
#再用 is_zh 函数 判断是否符合汉字的编码范围
#print flag,zh,is_zh(zh)
if flag and is_zh( zh ):
pos += 2
str_result.append(zh)
continue
else:
#raise 'Unknown charset_char'
pos += 1
return ''.join( [ i.encode( encoding ) for i in str_result ] )
if __name__ == '__main__':
#测试字符串
#转换成utf8
sutf8 = '''123测试中文abc测试'''
sgbk = mdcode( sutf8, 'gbk')
#这个字符串包含了gbk和utf8的汉字
sall = "%s%s%s" % (sgbk, sutf8, sgbk)
print sall
#转换编码
sresult = mdcode_char( sall, 'utf-8' )
#sresult = mdcode( sall, 'gbk' )
print mdcode(sresult,'utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment