Skip to content

Instantly share code, notes, and snippets.

@chinoll
Last active February 4, 2020 05:55
Show Gist options
  • Select an option

  • Save chinoll/e79005589056d2ea5309c7c03b3b40de to your computer and use it in GitHub Desktop.

Select an option

Save chinoll/e79005589056d2ea5309c7c03b3b40de to your computer and use it in GitHub Desktop.
机械工业出版社电子书爬虫
from httpx import Client as Session
import os
from bs4 import BeautifulSoup as bs4
import time
import logging
import re
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy,ProxyType
import sys
import copy
import json
import tqdm
# Default browser-like request headers sent with every plain HTTP request.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
}

# Alternate proxy configurations kept for reference; only the final (empty)
# assignment is in effect, i.e. requests go out directly with no proxy.
# proxies = {"http": "http://39.137.69.8:8080"}
# proxies = {"socks5h": "socks5://127.0.0.1:1080"}
proxies = {}
class Downloader:
    """Downloads an ebook (chapter XHTML files, images, cover) from ebooks.cmanuf.com.

    The table of contents is rendered by JavaScript, so the reader page is
    loaded in a Selenium-driven Firefox; chapters and images are then fetched
    with a plain HTTP session and written into the current working directory.
    """

    baseurl = "http://ebooks.cmanuf.com/detail?id="
    bookurl = "http://hzcourse.com/resource/readBook?path="
    imgurl = "openresources/teach_ebook/uncompressed"
    # Raw strings with escaped metacharacters: the originals used plain
    # strings ("pdfReader\?..."), which are invalid escape sequences, and an
    # unescaped '.' that matched any character.
    basere = r"pdfReader\?id=[a-zA-Z0-9]*"
    session = Session()
    bookre = r"/openresources/teach_ebook/uncompressed/[0-9]*/OEBPS/Text/chapter[0-9]*\.xhtml"

    def __init__(self):
        # Route the browser through a local proxy at 127.0.0.1:1080
        # (SOCKS + HTTP + SSL all pointed at the same endpoint).
        proxy = Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        proxy.socks_proxy = "127.0.0.1:1080"
        proxy.http_proxy = "127.0.0.1:1080"
        proxy.ssl_proxy = "127.0.0.1:1080"
        capabilities = webdriver.DesiredCapabilities.FIREFOX
        proxy.add_to_capabilities(capabilities)
        self.driver = webdriver.Firefox(desired_capabilities=capabilities)
        # Load an IP-echo search so the operator can visually confirm the
        # proxy is active before the scrape starts.
        self.driver.get("https://www.baidu.com/s?wd=ip")
        time.sleep(10)
        self.driver.maximize_window()

    def __del__(self):
        # Best-effort teardown: the driver may already be gone (or never have
        # been created) by the time the interpreter shuts down.
        try:
            self.driver.close()
            self.driver.quit()
        except Exception:
            pass

    def download_img(self, source, url):
        """Download every image referenced by a chapter and rewrite its paths.

        source -- the chapter's XHTML text.
        url    -- the chapter's server path (used to extract the numeric book id).

        Each <img> is saved flat into the CWD under its base file name, and
        the returned XHTML has the "../Images/" prefix stripped so the saved
        chapter resolves images next to itself.
        """
        soup = bs4(source, "html.parser")
        # The book id segment ("/1234/") is the same for every image in the
        # chapter, so extract it once outside the loop.
        bid = re.search(r"/[0-9]*/", url).group()
        for img_tag in soup.find_all("img"):
            src = img_tag.get("src")
            # Escaped dot: match a real extension, not any character.
            match = re.search(r"[a-zA-Z0-9\-]*\.(png|jpg|jpeg)", src)
            if match is None:
                continue  # unrecognized image reference; skip instead of crashing
            # Fetch first, then open: a failed GET must not leave an empty file.
            data = self.session.get(
                self.bookurl + self.imgurl + bid + "OEBPS/Text/" + src,
                proxies=proxies,
            ).content
            with open(match.group(), "wb") as f:
                f.write(data)
        return source.replace("../Images/", "")

    def download_html(self, bookid):
        """Download all chapters of *bookid* as .xhtml files in the CWD.

        On failure the names of the not-yet-downloaded chapters are persisted
        to breakpoint.<bookid>; a later run picks that file up to resume.
        Returns the list of written file names (chapter title + ".xhtml").
        """
        btext = self.session.get(self.baseurl + str(bookid), headers=HEADERS).text
        # The API answers {"success":false,...} when the book is unavailable.
        if re.search(r'"success":false', btext):
            print("服务器已炸")
            sys.exit()
        bid = re.search(self.basere, btext).group()
        # Hit the reader URL with the plain session first (session warm-up),
        # then render it in the browser so the JS-built TOC appears.
        self.session.get("http://ebooks.cmanuf.com/" + bid, headers=HEADERS)
        self.driver.get("http://ebooks.cmanuf.com/" + bid)
        time.sleep(3)
        page = self.driver.page_source
        chapterlist = re.compile(self.bookre).findall(page)
        # Chapter titles come from the TOC <li> items; the first three and the
        # last two list items are site chrome, not chapters.
        namelist = [li.get_text() for li in bs4(page, "html.parser").find_all("li")][3:-2]
        # Resume from a previous interrupted run if a breakpoint file exists.
        # NOTE(review): the saved list is the unfinished *tail*, while
        # chapterlist is rebuilt full-length — the pairing below looks
        # misaligned after a resume; preserved as-is, worth confirming.
        breakpoint_file = "breakpoint." + str(bookid)
        try:
            with open(breakpoint_file, "r", encoding="utf-8") as fp:
                namelist = json.loads(fp.read())
            os.remove(breakpoint_file)
        except (OSError, ValueError):
            pass
        for i in tqdm.tqdm(range(len(chapterlist))):
            if i % 7 == 0:
                time.sleep(0.5)  # gentle throttling every 7 requests
            # '/' is illegal in file names; replace it like the site titles expect.
            fname = namelist[i].replace("/", "\\") + ".xhtml"
            try:
                chapter = self.session.get(self.bookurl + chapterlist[i], headers=HEADERS).text
                with open(fname, "w", encoding="utf-8") as fp:
                    fp.write(self.download_img(chapter, chapterlist[i]))
            except Exception as e:
                # Persist the unfinished tail so the next run can resume.
                with open(breakpoint_file, "w", encoding="utf-8") as fp:
                    fp.write(json.dumps(namelist[i:], ensure_ascii=False))
                raise e
            namelist[i] = fname
        return namelist

    def download_cover(self, bookid):
        """Save the book's cover (last <img> on the detail page) as cover.jpg."""
        detail = self.session.get(self.baseurl + bookid, headers=HEADERS).text
        coverurl = bs4(detail, "html.parser").find_all("img")[-1].get("src")
        try:
            data = self.session.get(coverurl, headers=HEADERS).content
            with open("cover.jpg", "wb") as f:
                f.write(data)
        except Exception:
            # The cover is optional; a failed fetch must not abort the book.
            pass

    def download(self, bookid, name):
        """Download the whole book *bookid* and pack it as <name>.zip.

        bookid -- the site's numeric book id (string or int).
        name   -- output archive base name (user supplied).
        """
        import shlex  # local: only needed here, to quote the shell argument

        namelist = self.download_html(bookid)
        with open("info.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(namelist, ensure_ascii=False))
        self.download_cover(bookid)
        # Quote the user-supplied name so it cannot inject shell commands;
        # the glob patterns must stay unquoted for the shell to expand them.
        os.system("zip " + shlex.quote(name + ".zip") + " *.xhtml *.jpg info.json")
        os.system("rm *.xhtml *.jpg info.json")
if __name__ == "__main__":
    # CLI usage: python <script> <book_id> <output_name>
    # Guarded so importing this module does not launch a browser session.
    download = Downloader()
    download.download(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment