Created
May 9, 2019 13:51
-
-
Save luliangce/edaba2f498968012eac1c0c4a53219a4 to your computer and use it in GitHub Desktop.
一个学信网院校库采集脚本
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import Generator, List | |
| import requests | |
| import xlwt | |
| from lxml import etree | |
# Search endpoint of the school directory; paging is done via its "start"
# query parameter.
base_url = "https://gaokao.chsi.com.cn/sch/search.do"

# Number of result pages that crawl() walks through.
max_page = 137

# One shared HTTP session so every request carries the same headers.
session = requests.Session()
session.headers.update({
    # Present the crawler as a desktop Chrome browser.
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.131 Safari/537.36"
    ),
})
def handle_line(node: "etree._Element") -> dict:
    """Convert one row (``<tr>``) of the result table into a record dict.

    Args:
        node: The row element; only its ``xpath`` method is used, with
            paths relative to the row (``td[1]`` ... ``td[8]``).

    Returns:
        A dict with the keys ``name``, ``location``, ``father``, ``type``,
        ``degree``, ``tags``, ``has_graduate`` (bool) and ``score``. Cells
        that yield no text produce an empty string.
    """

    def gettext(path: str) -> str:
        # Strip every text fragment on its own and drop the empty ones, so
        # whitespace-only text nodes around child tags do not leak into the
        # value as stray commas or embedded newlines.
        fragments = (piece.strip() for piece in node.xpath(path))
        return ",".join(piece for piece in fragments if piece)

    return {
        # A few schools have no <a> tag around their name, so fall back to
        # the cell's own text.
        "name": gettext("td[1]/a/text()") or gettext("td[1]/text()"),
        "location": gettext("td[2]/text()"),
        "father": gettext("td[3]/text()"),
        "type": gettext("td[4]/text()"),
        "degree": gettext("td[5]/text()"),
        "tags": gettext("td[6]/span/text()"),
        # The cell is compared against the private-use glyph U+E664
        # (presumably an icon-font check mark -- confirm against the page).
        "has_graduate": gettext("td[7]/i/text()") == "\ue664",
        "score": gettext("td[8]/a/text()"),
    }
def handle_page(text: str) -> List[dict]:
    """Parse one result page and return a record dict for each data row.

    Args:
        text: HTML source of a search-result page.

    Returns:
        One dict per ``<tr>`` matched by ``//table/tr``, excluding the
        first match.
    """
    document = etree.HTML(text)
    table_rows = document.xpath("//table/tr")
    records = []
    # The first row is the table header, so skip it.
    for table_row in table_rows[1:]:
        records.append(handle_line(table_row))
    return records
def crawl_page(page: int, timeout: float = 30) -> str:
    """Download one page of search results and return its HTML text.

    Args:
        page: Zero-based page index. It is sent to the server as the
            ``start`` query parameter, computed as ``page * 20``
            (presumably 20 rows per page -- confirm against the site).
        timeout: Seconds to wait for the server. Without a timeout a
            stalled connection would block the whole crawl indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    start = page * 20
    response = session.get(base_url, params={"start": start}, timeout=timeout)
    return response.text
def crawl() -> Generator[dict, None, None]:
    """Yield every record from pages ``0`` to ``max_page - 1``, in order.

    Pages are fetched lazily: the next page is only downloaded once all
    records of the current page have been consumed.
    """
    page = 0
    while page < max_page:
        for record in handle_page(crawl_page(page)):
            yield record
        page += 1
def main() -> None:
    """Crawl all records and write them to the spreadsheet ``院校.xls``.

    Row 0 of the sheet holds the column titles; each crawled record fills
    one following row. The workbook is saved even when the crawl is
    interrupted by an exception, so rows collected so far are kept.
    """
    # (record key, column title) pairs in output column order.
    cols = [
        ("name", "院校名称"),
        ("location", "院校所在地"),
        ("father", "院校隶属"),
        ("type", "院校类型"),
        ("degree", "学历层次"),
        ("tags", "院校特性"),
        ("has_graduate", "研究生院"),
        ("score", "满意度"),
    ]
    book = xlwt.Workbook()
    sheet = book.add_sheet("院校名称")

    # Header row.
    for column, (_, title) in enumerate(cols):
        sheet.write(0, column, title)

    try:
        # Data rows start directly below the header.
        for row, line in enumerate(crawl(), start=1):
            for column, (key, _) in enumerate(cols):
                sheet.write(row, column, line[key])
            print("当前行数:{row},校名:{name}".format(row=row, **line))
    finally:
        # Persist whatever has been written, even after a failure.
        book.save("院校.xls")
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment