Created
February 6, 2019 07:57
-
-
Save DCMMC/32b1a713c15636de850b177e6922bbe8 to your computer and use it in GitHub Desktop.
ctrip flight spider
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat Mar 17 23:22:36 2018 | |
| @author: DCMMC | |
| """ | |
| import urllib | |
| import urllib.request | |
| from lxml import etree | |
| import json | |
| import random | |
def get_json2(date, rk, CK, r, threshold=1000):
    """Fetch the SHA -> BJS flight list for *date* and print the cheap flights.

    Builds the search URL from the given tokens, requests it with
    browser-like headers, decodes the GB2312 JSON response, and prints one
    line per flight whose price is strictly below *threshold*.

    Args:
        date: Departure date string, for example ``2018-05-13``.
        rk: ``rk`` query parameter (see ``get_parameter``).
        CK: ``CK`` query parameter (see ``get_parameter``).
        r: ``r`` query parameter (see ``get_parameter``).
        threshold: Price threshold; flights with ``lp`` >= this are skipped.

    Returns:
        None. Matching flights are written to stdout.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        KeyError: If the response lacks the ``fis``/``als`` keys or a flight
            entry lacks one of the fields read below.
    """
    url = ("http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights"
           "?DCity1=SHA&ACity1=BJS&SearchType=S&DDate1=%s"
           "&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s") % (date, rk, CK, r)
    headers = {
        'Host': "flights.ctrip.com",
        'User-Agent': ("Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                       " AppleWebKit/537.36 (KHTML, like Gecko)"
                       " Chrome/64.0.3282.186 Safari/537.36"),
        # The referer carries the queried date.
        # NOTE(review): presumably the endpoint checks this header - confirm.
        'referer': ("http://flights.ctrip.com/booking/SHA-BJS-day-1.html"
                    "?SortByPrice=true&ddate=%s") % date,
    }
    req = urllib.request.Request(url, headers=headers)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as res:
        content = res.read()
    # json.loads() no longer accepts an ``encoding`` keyword (removed in
    # Python 3.9); the payload is already decoded to str here, so the
    # keyword was never needed.
    dict_content = json.loads(content.decode("gb2312"))
    for flight in dict_content['fis']:
        # lp  - price
        # alc - carrier abbreviation; the full carrier name is looked up in
        #       dict_content['als'][abbreviation]
        # fn  - flight number
        # dt / at - departure and arrival time
        if flight['lp'] < threshold:
            print(flight['lp'], '\t',
                  dict_content['als'][flight['alc']], ' ',
                  flight['fn'], '\t',
                  flight['dt'], '\t',
                  flight['at'], sep='')
def get_parameter(date):
    """Scrape the booking page for *date* and derive the search tokens.

    Downloads the SHA-BJS booking page, takes the text of the first
    ``<script>`` under ``<body>``, splits it on whitespace, and cuts the
    ``rk``, ``CK`` and ``r`` values out of fixed character positions.

    NOTE(review): the offsets below are tied to the page's inline script
    layout at the time of writing - verify against the live page before
    relying on them.

    Args:
        date: Date string, for example ``2018-05-13``.

    Returns:
        A tuple ``(rk, CK, r)`` of strings, in the order ``get_json2``
        accepts them.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
        IndexError: If the page has no matching script text, or the script
            tokens are shorter than the fixed offsets assume.
    """
    url = ('http://flights.ctrip.com/booking/SHA-BJS-day-1.html'
           '?SortByPrice=true&ddate1=%s') % date
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        page = response.read()
    tree = etree.HTML(page)
    tokens = tree.xpath('//body/script[1]/text()')[0].split()
    last_token = tokens[-1]

    # Up to 32 characters near the end of the fourth token. The character
    # at index 13 is moved to index 5 to form CK.
    # NOTE(review): presumably mirrors a transformation done by the page's
    # own JavaScript - confirm.
    ck_original = tokens[3][-34:-2]
    CK = (ck_original[0:5] + ck_original[13]
          + ck_original[5:13] + ck_original[14:])

    # rk is a random number in [0, 10) printed with 15 decimals, followed
    # by six characters taken from the last script token.
    rk = "%.15f" % (random.random() * 10) + last_token[18:24]

    # r is the last token from offset 27 up to its final three characters.
    r = last_token[27:-3]
    return rk, CK, r
if __name__ == '__main__':
    # For each departure date: print a header, scrape the request tokens
    # from the booking page, then fetch and print the flights for that date.
    for travel_date in ['2018-05-18']:
        print('时间:', travel_date)
        print('价格\t班次\t起飞时间\t到达时间')
        tokens = get_parameter(travel_date)  # (rk, CK, r)
        get_json2(travel_date, *tokens)
        # Separator line printed after each date's results.
        print("----------------------------------------------------"
              "--------------------")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment