Skip to content

Instantly share code, notes, and snippets.

@DCMMC
Created February 6, 2019 07:57
Show Gist options
  • Select an option

  • Save DCMMC/32b1a713c15636de850b177e6922bbe8 to your computer and use it in GitHub Desktop.

Select an option

Save DCMMC/32b1a713c15636de850b177e6922bbe8 to your computer and use it in GitHub Desktop.
ctrip flight spider
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 17 23:22:36 2018
@author: DCMMC
"""
import urllib
import urllib.request
from lxml import etree
import json
import random
def get_json2(date,rk,CK,r, threshold=1000):
    """Fetch the flight list for ``date`` and print flights cheaper than ``threshold``.

    The request targets the hard-coded SHA -> BJS route of the Ctrip
    ``SearchFirstRouteFlights`` endpoint.

    Args:
        date: Departure date as a string, e.g. ``2018-05-13``.
        rk, CK, r: Request tokens placed verbatim into the query string
            (in this file they come from ``get_parameter``).
        threshold: Price threshold; only flights whose ``lp`` value is
            strictly below it are printed.

    Returns:
        None. One line per matching flight is written to stdout in the form
        ``price<TAB>carrier flight-number<TAB>departure<TAB>arrival``.
    """
    url = "http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights?DCity1=SHA&ACity1=BJS&SearchType=S&DDate1=%s&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s" % (date, rk, CK, r)
    headers = {
        'Host': "flights.ctrip.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                      " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        # The referer carries the queried date.
        'referer': "http://flights.ctrip.com/booking/SHA-BJS-day-1.html?SortByPrice=true&ddate=%s" % date,
    }
    req = urllib.request.Request(url, headers=headers)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as res:
        content = res.read()
    # json.loads() no longer accepts an ``encoding`` keyword (removed in
    # Python 3.9); decoding the bytes explicitly is all that is needed.
    dict_content = json.loads(content.decode("gb2312"))
    for flight in dict_content['fis']:
        # lp  - price
        # alc - carrier abbreviation; the full carrier name is stored in
        #       dict_content['als'][<abbreviation>]
        # fn  - flight number
        # dt / at - departure and arrival times
        if flight['lp'] < threshold:
            print(flight['lp'], '\t',
                  dict_content['als'][flight['alc']], ' ',
                  flight['fn'], '\t',
                  flight['dt'], '\t',
                  flight['at'], sep='')
def get_parameter(date):
    """Scrape the request tokens needed by the flight-search endpoint.

    Downloads the SHA -> BJS booking page for ``date`` and slices the tokens
    out of the text of the first ``<script>`` element inside ``<body>``.
    The slice offsets are tied to the exact layout of that inline script.

    Args:
        date: Departure date as a string, e.g. ``2018-05-13``.

    Returns:
        A tuple ``(rk, CK, r)``:
        ``rk`` is a random number in [0, 10) formatted with 15 decimals,
        followed by six characters taken from the script; ``CK`` is a
        32-character slice of the script with one character moved;
        ``r`` is another slice of the script's last whitespace-separated token.

    Raises:
        IndexError: If the page has no inline script in the expected place,
            or the script has fewer tokens than expected.
    """
    url = 'http://flights.ctrip.com/booking/SHA-BJS-day-1.html?SortByPrice=true&ddate1=%s' % date
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        page = response.read()
    tree = etree.HTML(page)
    scripts = tree.xpath('''//body/script[1]/text()''')
    if not scripts:
        # Same exception type as the bare index lookup, but with context.
        raise IndexError("no inline <script> found in <body> of %s" % url)
    pp = scripts[0].split()
    last_token = pp[-1]

    # CK: take 32 characters from the fourth token, then move the character
    # at index 13 so that it sits right after the first five characters.
    ck_original = pp[3][-34:-2]
    CK = ck_original[0:5] + ck_original[13] + ck_original[5:13] + ck_original[14:]

    # rk: random prefix with 15 decimals followed by six characters
    # from the last token.
    rk = "%.15f" % (random.random() * 10) + last_token[18:24]

    # r: the last token without its first 27 and final 3 characters.
    r = last_token[27:-3]
    return rk, CK, r
if __name__ == '__main__':
    # Departure dates to query, formatted YYYY-MM-DD.
    dates = ['2018-05-18']
    separator = ("----------------------------------------------------"
                 "--------------------")
    for date in dates:
        # Per-date banner followed by the column header row.
        print('时间:', date)
        print('价格\t班次\t起飞时间\t到达时间')
        # Scrape fresh request tokens, then fetch and print the flights.
        rk, CK, r = get_parameter(date)
        get_json2(date, rk, CK, r)
        print(separator)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment