Skip to content

Instantly share code, notes, and snippets.

@cnDelbert
Forked from evi1m0/PicConverText.py
Last active August 29, 2015 14:17
Show Gist options
  • Select an option

  • Save cnDelbert/104e8cdf47b9dcff1aa8 to your computer and use it in GitHub Desktop.

Select an option

Save cnDelbert/104e8cdf47b9dcff1aa8 to your computer and use it in GitHub Desktop.

Revisions

  1. @evi1m0 evi1m0 created this gist Mar 17, 2015.
    145 changes: 145 additions & 0 deletions PicConverText.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,145 @@
    #!/usr/bin/env python
    # coding=utf8
    # author=evi1m0
    # website=linux.im

    '''
    12306 Captcha Picture:
    author: Evi1m0@20150316
    1. Download Captcha
    2. Pic Conver Text
    3. Return result
    '''

    import re
    import time
    import json
    import urllib
    import urllib2
    import requests

    from PIL import Image


    def downloadImg():
    pic_file = int(time.time())
    pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand"
    print '[+] Download Picture: {}'.format(pic_url)
    try:
    resp = requests.get(pic_url, verify=False, timeout=5)
    except:
    resp = requests.get(pic_url, verify=False, timeout=3)
    with open("./12306_pic/%s.jpg"%pic_file, 'wb') as fp:
    fp.write(resp.content)
    return pic_file

    def imgCut():
    pic_file = downloadImg()
    pic_path = "./12306_pic/%s.jpg" % pic_file
    pic_text_path = './12306_pic/%s_text.jpg' % pic_file
    pic_obj = Image.open(pic_path)
    box = (120,0,290,25)
    region = pic_obj.crop(box)
    region.save(pic_text_path)
    print '[*] Picture Text Picture: {}'.format(pic_text_path)
    return pic_path, pic_text_path

    def ocrApi(filename):
    # Text picture conver text.
    upload_pic_url = "http://cn.docs88.com/pdftowordupload2.php"
    headers_fake = {
    'ccept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'cn.docs88.com',
    'Origin': 'http://cn.docs88.com',
    'User-Agent': 'Mozilla/5.0 (KHTML, like Gecko) Chrome/41.0.2272.89',
    'X-Requested-With': 'ShockwaveFlash/17.0.0.134',
    }
    filename_tmp = filename.split('/')[-1]
    pic_text_content = open(filename).read()
    para = {'Filename': filename_tmp,
    'sourcename': filename_tmp,
    'sourcelanguage': 'cn',
    'desttype': 'txt',
    'Upload': 'Submit Query',}
    upload_pic = requests.post(upload_pic_url, data=para, files={"Filedata" : open(filename, 'rb')}, headers=headers_fake)
    time.sleep(2)
    text_result_url = 'http://cn.docs88.com/' + upload_pic.content[3:]
    text_result = requests.get(text_result_url)
    if text_result.status_code == 200:
    print '[*] Text: {}'.format(text_result.content)
    else:
    print '[-] False'
    return text_result.content


    '''
    baidu stu
    author: andelf
    '''
    def baidu_stu_html_extract(html):
    pattern = re.compile(r"keywords:'(.*?)'")
    matches = pattern.findall(html)
    if not matches:
    return '[UNKOWN]'
    json_str = matches[0]
    json_str = json_str.replace('\\x22', '"').replace('\\\\', '\\')
    result = [item['keyword'] for item in json.loads(json_str)]
    return '|'.join(result) if result else '[UNKOWN]'

    def baidu_stu_lookup(im):
    url = ("http://stu.baidu.com/n/image?fr=html5&needRawImageUrl=true&id="
    "WU_FILE_0&name=233.png&type=image%2Fpng&lastModifiedDate=Mon+Mar"
    "+16+2015+20%3A49%3A11+GMT%2B0800+(CST)&size=")
    im.save("./query_temp_img.png")
    raw = open("./query_temp_img.png", 'rb').read()
    url = url + str(len(raw))
    req = urllib2.Request(url, raw, {'Content-Type':'image/png', 'User-Agent':UA})
    resp = urllib2.urlopen(req)
    resp_url = resp.read() # return a pure url
    url = "http://stu.baidu.com/n/searchpc?queryImageUrl=" + urllib.quote(resp_url)
    req = urllib2.Request(url, headers={'User-Agent':UA})
    resp = urllib2.urlopen(req)
    html = resp.read()
    return baidu_stu_html_extract(html)

    def get_sub_img(pic_text_path, x, y):
    im = Image.open(pic_text_path)
    assert 0 <= x <= 3
    assert 0 <= y <= 2
    WITH = HEIGHT = 68
    left = 5 + (67 + 5) * x
    top = 41 + (67 + 5) * y
    right = left + 67
    bottom = top + 67
    return im.crop((left, top, right, bottom))


    if __name__ == '__main__':
    UA = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"
    pic_path, pic_text_path = imgCut()
    captcha_text = ocrApi(pic_text_path)
    dict_list = {}
    count = 0
    for y in range(2):
    for x in range(4):
    count += 1
    im2 = get_sub_img(pic_path, x, y)
    result = baidu_stu_lookup(im2)
    dict_list[count] = result
    print (y,x), result
    if captcha_text.strip() > 2:
    print '\n[*] Maybe the result of the:'
    maybe_result = []
    for v in dict_list:
    for c in range(len(unicode(captcha_text.strip(), 'utf8'))):
    text = unicode(captcha_text, 'utf8')[c]
    if text in dict_list[v]:
    _str_res = '%s --- %s' % (v, dict_list[v])
    maybe_result.append(_str_res)
    for r in list(set(maybe_result)):
    print r
    else:
    print '[-] False'