Last active
February 4, 2020 05:55
-
-
Save chinoll/e79005589056d2ea5309c7c03b3b40de to your computer and use it in GitHub Desktop.
机械工业出版社电子书爬虫 (China Machine Press e-book crawler)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import copy
import glob
import json
import logging
import os
import re
import subprocess
import sys
import time

import tqdm
from bs4 import BeautifulSoup as bs4
from httpx import Client as Session
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
| HEADERS = { | |
| "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", | |
| "Accept-Encoding":"gzip, deflate", | |
| "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6", | |
| "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36", | |
| } | |
| proxies = {"http":"http://39.137.69.8:8080"} | |
| proxies = {"socks5h":"socks5://127.0.0.1:1080"} | |
| proxies = {} | |
| class Downloader: | |
| baseurl="http://ebooks.cmanuf.com/detail?id=" | |
| bookurl="http://hzcourse.com/resource/readBook?path=" | |
| imgurl="openresources/teach_ebook/uncompressed" | |
| basere = "pdfReader\?id=[a-zA-Z0-9]*" | |
| session = Session() | |
| bookre = "/openresources/teach_ebook/uncompressed/[0-9]*/OEBPS/Text/chapter[0-9]*.xhtml" | |
| def __init__(self): | |
| #self.option = webdriver.FirefoxOptions() | |
| #self.option.add_argument("--proxy-server=223.242.225.254:9999") | |
| proxy = Proxy() | |
| proxy.proxy_type = ProxyType.MANUAL | |
| proxy.socks_proxy = "127.0.0.1:1080" | |
| proxy.http_proxy = "127.0.0.1:1080" | |
| proxy.ssl_proxy = "127.0.0.1:1080" | |
| capabilities = webdriver.DesiredCapabilities.FIREFOX | |
| proxy.add_to_capabilities(capabilities) | |
| self.driver = webdriver.Firefox(desired_capabilities=capabilities) | |
| self.driver.get("https://www.baidu.com/s?wd=ip") | |
| time.sleep(10) | |
| #self.driver = webdriver.Chrome() | |
| self.driver.maximize_window() | |
| #self.session.proxies = proxies | |
| def __del__(self): | |
| try: | |
| self.driver.close() | |
| self.driver.quit() | |
| except: | |
| pass | |
| def download_img(self,source,url): | |
| img_links = [] | |
| s1 = copy.deepcopy(source) | |
| img = bs4(s1) | |
| for img_link in img.find_all('img'): | |
| name = re.search("[a-zA-Z0-9\-]*.(png|jpg|jpeg)",img_link.get("src")).group() | |
| bid = re.search("/[0-9]*/",url).group() | |
| with open(name,"wb") as f: | |
| f.write(self.session.get(self.bookurl + self.imgurl + bid + "OEBPS/Text/" + img_link.get("src"),proxies=proxies).content) | |
| return s1.replace("../Images/","") | |
| def download_html(self,bookid): | |
| btext = self.session.get(self.baseurl+str(bookid),headers = HEADERS).text | |
| if(re.search("\"success\"\:false",btext)): | |
| print("服务器已炸") | |
| exit() | |
| bid = re.search(self.basere,btext).group() | |
| self.session.get("http://ebooks.cmanuf.com/"+bid,headers=HEADERS) | |
| self.driver.get("http://ebooks.cmanuf.com/"+bid) | |
| time.sleep(3) | |
| contentText = self.driver.page_source | |
| url = self.driver.current_url | |
| chapterlist = re.compile(self.bookre).findall(contentText) | |
| namelist = [] | |
| for i in bs4(contentText).find_all("li"): | |
| namelist.append(i.get_text()) | |
| namelist = namelist[3:-2] | |
| try: | |
| with open("breakpoint." + str(bookid),"r") as file: | |
| namelist = json.loads(file.read()) | |
| os.system("rm breakpoint." + str(bookid)) | |
| except: | |
| pass | |
| for i in tqdm.tqdm(range(len(chapterlist))): | |
| if i % 7 == 0: | |
| time.sleep(0.5) | |
| with open(namelist[i].replace("/","\\") + ".xhtml","w") as file: | |
| try: | |
| file.write(self.download_img(self.session.get(self.bookurl+chapterlist[i],headers=HEADERS).text,chapterlist[i])) | |
| except Exception as e: | |
| with open("breakpoint." + str(bookid),"w") as file: | |
| file.write(json.dumps(namelist[i:],ensure_ascii=False)) | |
| raise e | |
| namelist[i] = namelist[i] + ".xhtml" | |
| return namelist | |
| def download_cover(self,bookid): | |
| coverurl = bs4(self.session.get(self.baseurl+bookid,headers=HEADERS).text).find_all("img")[-1].get("src") | |
| with open("cover.jpg","wb") as f: | |
| try: | |
| f.write(self.session.get(coverurl,headers=HEADERS).content) | |
| except: | |
| pass | |
| def download(self,bookid,name): | |
| namelist = self.download_html(bookid) | |
| with open("info.json","w") as f: | |
| f.write(json.dumps(namelist,ensure_ascii=False)) | |
| strl = "" | |
| for i in namelist: | |
| strl += i + " " | |
| self.download_cover(bookid) | |
| os.system("zip " + name + ".zip" + " *.xhtml *.jpg info.json") | |
| os.system("rm *.xhtml *.jpg info.json") | |
| download = Downloader() | |
| #书籍的id,名字 | |
| download.download(sys.argv[1],sys.argv[2]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment