Last active
December 27, 2016 08:34
-
-
Save 3rogue/f86157e01b5fc0aee4e0 to your computer and use it in GitHub Desktop.
Revisions
-
3rogue revised this gist
Dec 27, 2016. 2 changed files with 64 additions and 64 deletions. There are no files selected for viewing
# GitHub page notice: This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Original file line number / Diff line number / Diff line change @@ -0,0 +1,64 @@
"""Download the cover images listed on an avmo.pw "star" listing.

The listing pages are discovered by probing page numbers, each page is
scraped for (image URL, title) pairs, and every image is saved as
``./<keyword>/<title>.jpg``. Scraping and downloading both run on a thread
pool because the work is network-bound.
"""
from pyquery import PyQuery as pq
import requests
import os
import os.path
import time
from multiprocessing.dummy import Pool as ThreadPool

# Seconds to wait on the server before giving up on a request. Without a
# timeout, a stalled connection would block its worker thread indefinitely.
REQUEST_TIMEOUT = 30


def get_urls():
    """Return the URLs of all listing pages that exist for ``keyword``.

    Pages are probed in order starting at 1; the first page that does not
    answer with HTTP 200 ends the probe. Reads the module-level ``keyword``
    that ``main`` sets.
    """
    urls = []
    pagenum = 1
    while True:
        url = 'https://avmo.pw/cn/star/{}/page/{}'.format(keyword, pagenum)
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        if page.status_code != requests.codes.ok:
            break
        urls.append(url)
        pagenum += 1
    return urls


def get_imginfo(url):
    """Scrape one listing page.

    Returns a list of ``[image_url, title]`` pairs, one per movie box found
    on the page at ``url``.
    """
    d = pq(requests.get(url, timeout=REQUEST_TIMEOUT).text)
    # 's.jpg' is rewritten to 'l.jpg' in every image URL (presumably
    # thumbnail -> large image; TODO confirm against the site).
    img_urls = [
        d(img).attr('src').replace('s.jpg', 'l.jpg')
        for img in d('.movie-box .photo-frame img')
    ]
    titles = [
        d(node).text()
        for node in d('.movie-box .photo-info date:eq(0)')
    ]
    # zip() stops at the shorter list, so a page where the two selectors
    # match a different number of nodes no longer raises IndexError.
    return [[img_url, title] for img_url, title in zip(img_urls, titles)]


def write_img(writelist):
    """Download every ``(url, title)`` pair to ``./<keyword>/<title>.jpg``.

    A failed download is reported and skipped so that one bad image does
    not abort the rest of the batch.
    """
    for url, title in writelist:
        print('download {} {}...'.format(title, url))
        # Fetch before opening the target file, so a failed request does
        # not leave an empty .jpg behind.
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            print('failed {} {}: {}'.format(title, url, err))
            continue
        with open(os.path.join('.', keyword, title + '.jpg'), 'wb') as f:
            f.write(response.content)


def main():
    """Scrape and download everything for the hard-coded ``keyword``."""
    starttime = time.time()
    # keyword=9oz e.g "https://avmo.pw/cn/star/9oz"
    global keyword
    keyword = '9qp'
    os.makedirs(keyword, exist_ok=True)

    urls = get_urls()
    poolnum = 8
    pool = ThreadPool(poolnum)
    try:
        # One result list per page; each is handed to one write_img call.
        writelist = pool.map(get_imginfo, urls)
        pool.map(write_img, writelist)
    finally:
        # Always release the worker threads, even when a task raised.
        pool.close()
        pool.join()

    endtime = time.time()
    print('Ok, all down.................')
    print('耗时{:.2f}秒'.format(endtime - starttime))


if __name__ == '__main__':
    main()

# GitHub page notice: This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below.
To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Original file line number / Diff line number / Diff line change @@ -1,64 +0,0 @@ -
3rogue created this gist
Aug 10, 2015. There are no files selected for viewing
# GitHub page notice: This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Original file line number / Diff line number / Diff line change @@ -0,0 +1,64 @@
#coding=utf-8
"""Download the cover images listed on a javfee.com "star" listing.

Python 2 script: it prompts for an output directory and a star key, counts
the listing pages, scrapes every page for (image URL, file path) pairs and
downloads the images on a thread pool. Failed downloads are appended to
``error_list.txt``.
"""
import requests
import re
import os
import time
from multiprocessing.dummy import Pool as ThreadPool

# Seconds to wait on the server before giving up on a request. Without a
# timeout, a stalled connection would block its worker thread indefinitely.
REQUEST_TIMEOUT = 30

# Compiled once; every worker thread reuses the same patterns.
_IMG_URL_RE = re.compile(r'http://.*?video.*?jpg')
_TITLE_RE = re.compile(
    r'<span>(.*?)(<i.*?</i>)?<br><date>(.*?)</date>.*?<date>(.*?)</date>')
# Anchored and escaped so only the trailing "s.jpg" of a URL is rewritten;
# the unanchored pattern 's.j' could also rewrite the host or path.
_THUMB_SUFFIX_RE = re.compile(r's\.jpg$')

# (image_url, file_path) pairs collected by get_img_url across all pages.
img_title_queue = []


def get_pagenum(url):
    """Return how many consecutive listing pages answer with HTTP 200.

    ``url`` itself is probed as page 1, then ``url/currentPage/2``,
    ``url/currentPage/3`` and so on until a request is not OK.
    """
    pagenum = 1
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    while page.status_code == requests.codes.ok:
        pagenum += 1
        fullurl = url + '/currentPage/%d' % pagenum
        page = requests.get(fullurl, timeout=REQUEST_TIMEOUT)
    return pagenum - 1


def get_img_url(url):
    """Scrape one listing page and append its results to ``img_title_queue``.

    Each appended item is an ``(image_url, file_path)`` tuple. Pages that do
    not answer with HTTP 200 contribute nothing. Reads the module-level
    ``dirname`` set in the ``__main__`` section.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    if page.status_code != requests.codes.ok:
        return
    html = page.content

    # Presumably "...s.jpg" is the thumbnail and "...l.jpg" the large
    # image -- TODO confirm against the site.
    img_queue = [
        _THUMB_SUFFIX_RE.sub('l.jpg', img_url)
        for img_url in _IMG_URL_RE.findall(html)
    ]
    # NOTE(review): files are named *.png although the matched URLs end in
    # "jpg"; kept unchanged so existing output names stay the same.
    path_queue = [
        './' + dirname + '/' + title[0].decode('utf-8').replace(' ', '')
        + '_' + title[2] + '_' + title[3] + '.png'
        for title in _TITLE_RE.findall(html)
    ]
    # zip() stops at the shorter list, so a page where the two patterns
    # match a different number of times no longer raises IndexError.
    img_title_queue.extend(zip(img_queue, path_queue))


def write_img(imginfo):
    """Download one image; log it to ``error_list.txt`` on failure.

    ``imginfo`` is an ``(image_url, file_path)`` pair as produced by
    ``get_img_url``.
    """
    img_url, path = imginfo[0], imginfo[1]
    try:
        print(path)
        # Fetch before opening the target file, so a failed request does
        # not leave an empty file behind.
        data = requests.get(img_url, timeout=REQUEST_TIMEOUT).content
        with open(path, 'wb') as f:
            f.write(data)
    except Exception:
        # NOTE(review): the original indentation was lost in this copy;
        # the error log is assumed to belong to the except branch.
        with open('error_list.txt', 'a') as f:
            # The path is unicode and is written as UTF-8. Both lines go
            # out in one call to reduce the chance that records from
            # different threads interleave.
            f.write(path.encode('utf-8') + '\n' + img_url + '\n')


if __name__ == '__main__':
    dirname = raw_input('input the dirname:')
    urlkey = raw_input('input the key afer the "star". e.g "http://www.javfee.com/cn/star/9oz" input 9oz:')
    if not os.path.isdir(dirname):
        os.makedirs(dirname)

    start = time.time()
    url = "http://www.javfee.com/cn/star/%s" % urlkey
    pagenum = get_pagenum(url)
    urls = [url + '/currentPage/%i' % i for i in range(1, pagenum + 1)]

    pool = ThreadPool(15)
    try:
        # Phase 1 fills img_title_queue; phase 2 downloads what it holds.
        pool.map(get_img_url, urls)
        pool.map(write_img, img_title_queue)
    finally:
        # Always release the worker threads, even when a task raised.
        pool.close()
        pool.join()

    end = time.time()
    print("耗时.......%d秒" % (end - start))