Last active
February 4, 2020 05:55
-
-
Save chinoll/e79005589056d2ea5309c7c03b3b40de to your computer and use it in GitHub Desktop.
机械工业出版社电子书爬虫 (China Machine Press e-book crawler)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import copy
import glob
import json
import logging
import os
import re
import subprocess
import sys
import time

import tqdm
from bs4 import BeautifulSoup as bs4
from httpx import Client as Session
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
| HEADERS = { | |
| "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", | |
| "Accept-Encoding":"gzip, deflate", | |
| "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6", | |
| "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36", | |
| } | |
| proxies = {"http":"http://39.137.69.8:8080"} | |
| proxies = {"socks5h":"socks5://127.0.0.1:1080"} | |
| proxies = {} | |
| class Downloader: | |
| baseurl="http://ebooks.cmanuf.com/detail?id=" | |
| bookurl="http://hzcourse.com/resource/readBook?path=" | |
| imgurl="openresources/teach_ebook/uncompressed" | |
| basere = "pdfReader\?id=[a-zA-Z0-9]*" | |
| session = Session() | |
| bookre = "/openresources/teach_ebook/uncompressed/[0-9]*/OEBPS/Text/chapter[0-9]*.xhtml" | |
| def __init__(self): | |
| #self.option = webdriver.FirefoxOptions() | |
| #self.option.add_argument("--proxy-server=223.242.225.254:9999") | |
| proxy = Proxy() | |
| proxy.proxy_type = ProxyType.MANUAL | |
| proxy.socks_proxy = "127.0.0.1:1080" | |
| proxy.http_proxy = "127.0.0.1:1080" | |
| proxy.ssl_proxy = "127.0.0.1:1080" | |
| capabilities = webdriver.DesiredCapabilities.FIREFOX | |
| proxy.add_to_capabilities(capabilities) | |
| self.driver = webdriver.Firefox(desired_capabilities=capabilities) | |
| self.driver.get("https://www.baidu.com/s?wd=ip") | |
| time.sleep(10) | |
| #self.driver = webdriver.Chrome() | |
| self.driver.maximize_window() | |
| #self.session.proxies = proxies | |
| def __del__(self): | |
| try: | |
| self.driver.close() | |
| self.driver.quit() | |
| except: | |
| pass | |
| def download_img(self,source,url): | |
| img_links = [] | |
| s1 = copy.deepcopy(source) | |
| img = bs4(s1) | |
| for img_link in img.find_all('img'): | |
| name = re.search("[a-zA-Z0-9\-]*.(png|jpg|jpeg)",img_link.get("src")).group() | |
| bid = re.search("/[0-9]*/",url).group() | |
| with open(name,"wb") as f: | |
| f.write(self.session.get(self.bookurl + self.imgurl + bid + "OEBPS/Text/" + img_link.get("src"),proxies=proxies).content) | |
| return s1.replace("../Images/","") | |
| def download_html(self,bookid): | |
| btext = self.session.get(self.baseurl+str(bookid),headers = HEADERS).text | |
| if(re.search("\"success\"\:false",btext)): | |
| print("服务器已炸") | |
| exit() | |
| bid = re.search(self.basere,btext).group() | |
| self.session.get("http://ebooks.cmanuf.com/"+bid,headers=HEADERS) | |
| self.driver.get("http://ebooks.cmanuf.com/"+bid) | |
| time.sleep(3) | |
| contentText = self.driver.page_source | |
| url = self.driver.current_url | |
| chapterlist = re.compile(self.bookre).findall(contentText) | |
| namelist = [] | |
| for i in bs4(contentText).find_all("li"): | |
| namelist.append(i.get_text()) | |
| namelist = namelist[3:-2] | |
| try: | |
| with open("breakpoint." + str(bookid),"r") as file: | |
| namelist = json.loads(file.read()) | |
| os.system("rm breakpoint." + str(bookid)) | |
| except: | |
| pass | |
| for i in tqdm.tqdm(range(len(chapterlist))): | |
| if i % 7 == 0: | |
| time.sleep(0.5) | |
| with open(namelist[i].replace("/","\\") + ".xhtml","w") as file: | |
| try: | |
| file.write(self.download_img(self.session.get(self.bookurl+chapterlist[i],headers=HEADERS).text,chapterlist[i])) | |
| except Exception as e: | |
| with open("breakpoint." + str(bookid),"w") as file: | |
| file.write(json.dumps(namelist[i:],ensure_ascii=False)) | |
| raise e | |
| namelist[i] = namelist[i] + ".xhtml" | |
| return namelist | |
| def download_cover(self,bookid): | |
| coverurl = bs4(self.session.get(self.baseurl+bookid,headers=HEADERS).text).find_all("img")[-1].get("src") | |
| with open("cover.jpg","wb") as f: | |
| try: | |
| f.write(self.session.get(coverurl,headers=HEADERS).content) | |
| except: | |
| pass | |
| def download(self,bookid,name): | |
| namelist = self.download_html(bookid) | |
| with open("info.json","w") as f: | |
| f.write(json.dumps(namelist,ensure_ascii=False)) | |
| strl = "" | |
| for i in namelist: | |
| strl += i + " " | |
| self.download_cover(bookid) | |
| os.system("zip " + name + ".zip" + " *.xhtml *.jpg info.json") | |
| os.system("rm *.xhtml *.jpg info.json") | |
| download = Downloader() | |
| #书籍的id,名字 | |
| download.download(sys.argv[1],sys.argv[2]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment