Skip to content

Instantly share code, notes, and snippets.

@luliangce
Created May 9, 2019 13:51
Show Gist options
  • Select an option

  • Save luliangce/edaba2f498968012eac1c0c4a53219a4 to your computer and use it in GitHub Desktop.

Select an option

Save luliangce/edaba2f498968012eac1c0c4a53219a4 to your computer and use it in GitHub Desktop.
一个学信网院校库采集脚本
from typing import Generator, List
import requests
import xlwt
from lxml import etree
# Search endpoint that every page request is sent to.
base_url = "https://gaokao.chsi.com.cn/sch/search.do"

# Number of result pages to request; crawl() iterates range(max_page).
max_page = 137

# One shared session so all requests carry the same browser-like User-Agent.
session = requests.Session()
session.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.131 Safari/537.36"
    ),
})
def handle_line(node: "etree._Element") -> dict:
    """Convert one row of the result table into a flat record.

    Args:
        node: The row element; its ``td`` children are read by position
            through relative XPath expressions.

    Returns:
        A dict with the keys ``name``, ``location``, ``father``, ``type``,
        ``degree``, ``tags`` and ``score`` (all ``str``; several text
        fragments are joined with ``","``) plus ``has_graduate`` (``bool``).
    """

    def gettext(path: str) -> str:
        # Strip each text fragment on its own and drop blank ones; stripping
        # only the joined string would leave whitespace and empty entries
        # between the commas when a cell yields several text nodes.
        fragments = (piece.strip() for piece in node.xpath(path))
        return ",".join(piece for piece in fragments if piece)

    return {
        # A few schools have no <a> tag, so fall back to the cell's own text.
        "name": gettext("td[1]/a/text()") or gettext("td[1]/text()"),
        "location": gettext("td[2]/text()"),
        "father": gettext("td[3]/text()"),
        "type": gettext("td[4]/text()"),
        "degree": gettext("td[5]/text()"),
        "tags": gettext("td[6]/span/text()"),
        # True only when the <i> element holds the character U+E664
        # (presumably the icon-font glyph for "has a graduate school").
        "has_graduate": gettext("td[7]/i/text()") == "\ue664",
        "score": gettext("td[8]/a/text()"),
    }
def handle_page(text: str) -> List[dict]:
    """Parse one result page and return a record for each data row.

    Args:
        text: HTML source of a search-result page.

    Returns:
        One dict per ``//table/tr`` row after the first, as produced by
        ``handle_line``.
    """
    document = etree.HTML(text)
    table_rows = document.xpath("//table/tr")
    records = []
    # The first row is the table header, so skip it.
    for table_row in table_rows[1:]:
        records.append(handle_line(table_row))
    return records
def crawl_page(page: int, timeout: float = 10.0) -> str:
    """Download one page of search results and return its HTML.

    Args:
        page: Zero-based page index; it is converted to the ``start`` query
            parameter as ``page * 20``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
    """
    start = page * 20
    response = session.get(base_url, params={"start": start}, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an empty
    # result, which would silently drop that page's rows.
    response.raise_for_status()
    return response.text
def crawl() -> Generator[dict, None, None]:
    """Yield every record from pages ``0`` to ``max_page - 1`` in order.

    Each page is fetched with ``crawl_page`` and parsed with ``handle_page``
    only when the consumer reaches it.
    """
    for page_index in range(max_page):
        page_html = crawl_page(page_index)
        for record in handle_page(page_html):
            yield record
def main() -> None:
    """Crawl all pages and write the records to an ``.xls`` workbook.

    Row 0 holds the column titles and each record goes on the next row.
    The workbook is saved in a ``finally`` clause, so rows collected before
    an error are still written to disk.
    """
    # (record key, column title) pairs, in output column order.
    cols = [
        ("name", "院校名称"),
        ("location", "院校所在地"),
        ("father", "院校隶属"),
        ("type", "院校类型"),
        ("degree", "学历层次"),
        ("tags", "院校特性"),
        ("has_graduate", "研究生院"),
        ("score", "满意度"),
    ]
    book = xlwt.Workbook()
    sheet = book.add_sheet("院校名称")

    # Header row.
    for col_idx, (_, title) in enumerate(cols):
        sheet.write(0, col_idx, title)

    try:
        # Data rows start at 1, directly below the header.
        for row, line in enumerate(crawl(), start=1):
            for col_idx, (key, _) in enumerate(cols):
                sheet.write(row, col_idx, line[key])
            print("当前行数:{row},校名:{name}".format(row=row, **line))
    finally:
        book.save("院校.xls")
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment