Skip to content

Instantly share code, notes, and snippets.

@3rogue
Last active December 27, 2016 08:34
Show Gist options
  • Select an option

  • Save 3rogue/f86157e01b5fc0aee4e0 to your computer and use it in GitHub Desktop.

Select an option

Save 3rogue/f86157e01b5fc0aee4e0 to your computer and use it in GitHub Desktop.

Revisions

  1. 3rogue revised this gist Dec 27, 2016. 2 changed files with 64 additions and 64 deletions.
    64 changes: 64 additions & 0 deletions avmo2.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    from pyquery import PyQuery as pq
    import requests
    import os
    import os.path
    import time
    from multiprocessing.dummy import Pool as ThreadPool

    def get_urls():
        """Collect the listing-page URLs that exist for the current star.

        Pages are probed one at a time, starting at page 1, using the
        module-level ``keyword`` that ``main`` assigns. Probing stops at the
        first response whose status code is not OK.

        Returns:
            list: URLs of every page that answered with an OK status, in
            page order.
        """
        found = []
        page_index = 1
        while True:
            candidate = 'https://avmo.pw/cn/star/{}/page/{}'.format(keyword, page_index)
            response = requests.get(candidate)
            if response.status_code != requests.codes.ok:
                # The first non-OK page marks the end of the listing.
                return found
            found.append(candidate)
            page_index += 1

    def get_imginfo(url):
        """Scrape one listing page into ``[image_url, title]`` pairs.

        Args:
            url: Address of the listing page to fetch.

        Returns:
            list: ``[image_url, title]`` lists, one per matched image/title
            pair. Each image URL is the ``src`` attribute with ``s.jpg``
            replaced by ``l.jpg``.
        """
        d = pq(requests.get(url).text)
        # Distinct loop names: the original reused ``url`` as the loop
        # variable and shadowed the function parameter.
        img_urls = [
            d(img).attr('src').replace('s.jpg', 'l.jpg')
            for img in d('.movie-box .photo-frame img')
        ]
        titles = [
            d(node).text()
            for node in d('.movie-box .photo-info date:eq(0)')
        ]
        # zip() stops at the shorter list, so a page that yields fewer titles
        # than images no longer raises IndexError (which would abort the
        # caller's pool.map for every page).
        return [[img, title] for img, title in zip(img_urls, titles)]

    def write_img(writelist):
        """Download every image in ``writelist`` into the ``keyword`` directory.

        Args:
            writelist: Iterable of ``(url, title)`` pairs. Each image is saved
                as ``./<keyword>/<title>.jpg``, where ``keyword`` is the
                module-level name assigned by ``main``.
        """
        for url, title in writelist:
            print('download {} {}...'.format(title, url))
            try:
                response = requests.get(url)
                # Treat HTTP error statuses as failures instead of saving the
                # error body as if it were an image.
                response.raise_for_status()
            except requests.RequestException as err:
                # Skip this image rather than abandoning the rest of the batch.
                print('failed {} {}: {}'.format(title, url, err))
                continue
            # Open the file only after the download succeeded, so a failed
            # request never leaves an empty .jpg behind.
            with open(r'./' + keyword + '/' + title + '.jpg', 'wb') as f:
                f.write(response.content)

    def main():
        """Download all images for the star identified by ``keyword``.

        Sets the module-level ``keyword`` (read by ``get_urls`` and
        ``write_img``), creates the output directory if it is missing,
        scrapes every listing page with a thread pool, downloads the images
        with the same pool, and prints the elapsed time.
        """
        starttime = time.time()
        # keyword is the path segment after /star/,
        # e.g. '9oz' for "https://avmo.pw/cn/star/9oz"
        global keyword
        keyword = '9qp'
        if not os.path.exists(keyword):
            os.makedirs(keyword)

        urls = get_urls()
        poolnum = 8

        # One pool serves both phases and is always shut down. The original
        # created two pools and never closed or joined the first one.
        pool = ThreadPool(poolnum)
        try:
            writelist = pool.map(get_imginfo, urls)
            pool.map(write_img, writelist)
        finally:
            pool.close()
            pool.join()

        endtime = time.time()
        print('Ok, all down.................')
        print('耗时{:.2f}秒'.format(endtime-starttime))

    # Run the downloader only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
    main()
    64 changes: 0 additions & 64 deletions javfee2.py
    Original file line number Diff line number Diff line change
    @@ -1,64 +0,0 @@
    #coding=utf-8
    import requests, re, os, time
    from multiprocessing.dummy import Pool as ThreadPool

    def get_pagenum(url):
        """Count the consecutive listing pages that respond with an OK status.

        The base ``url`` is requested first, then ``url + '/currentPage/N'``
        for N = 2, 3, ... until a request returns a non-OK status.

        Args:
            url: Base listing URL.

        Returns:
            int: Number of pages that answered OK (0 if the base URL did not).
        """
        ok_pages = 0
        response = requests.get(url)
        while response.status_code == requests.codes.ok:
            ok_pages += 1
            next_url = url + '/currentPage/%d' % (ok_pages + 1)
            response = requests.get(next_url)
        return ok_pages

    # Shared list of (image_url, save_path) pairs, filled by get_img_url.
    img_title_queue = []

    def get_img_url(url):
        """Scrape one listing page and queue ``(image_url, save_path)`` pairs.

        Results are appended to the module-level ``img_title_queue``. Save
        paths are built under the module-level ``dirname`` that the script's
        ``__main__`` section reads from the user. Nothing is queued when the
        page does not return an OK status.

        Args:
            url: Address of the listing page to fetch.
        """
        page = requests.get(url)
        if page.status_code != requests.codes.ok:
            return
        # Plain str.replace instead of re.sub('s.j', 'l.j', ...): the
        # unescaped dot in that pattern matched any character, so unrelated
        # "s?j" sequences elsewhere in the URL were rewritten as well.
        img_queue = [
            found.replace('s.jpg', 'l.jpg')
            for found in re.findall('http://.*?video.*?jpg', page.content)
        ]
        title_list = re.findall('<span>(.*?)(<i.*?</i>)?<br><date>(.*?)</date>.*?<date>(.*?)</date>', page.content)
        # Path layout: ./<dirname>/<title without spaces>_<date 1>_<date 2>.png
        path_queue = [
            './' + dirname + '/' + title[0].decode('utf-8').replace(' ', '') + '_' + title[2] + '_' + title[3] + '.png'
            for title in title_list
        ]
        # zip() stops at the shorter list, so a page with fewer titles than
        # images no longer raises IndexError inside the worker thread.
        img_title_queue.extend(zip(img_queue, path_queue))

    def write_img(imginfo):
        """Download one image to disk, logging failures to ``error_list.txt``.

        Args:
            imginfo: ``(image_url, save_path)`` pair as queued by
                ``get_img_url``.
        """
        try:
            print(imginfo[1])
            response = requests.get(imginfo[0])
            # Treat HTTP error statuses as failures instead of saving the
            # error body as if it were an image.
            response.raise_for_status()
            # Open the file only after the download succeeded, so a failed
            # request never leaves an empty file behind.
            with open(imginfo[1], 'wb') as f:
                f.write(response.content)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            with open('error_list.txt', 'a') as f:
                # Encode the path as UTF-8 before writing it to the log.
                f.write(imginfo[1].encode('utf-8') + '\n')
                f.write(imginfo[0] + '\n')

    if __name__ == '__main__':
        # Script entry point: ask for the output directory and the star key,
        # count the listing pages, scrape them in parallel, then download
        # every queued image in parallel and report the elapsed time.
        dirname = raw_input('input the dirname:')
        urlkey = raw_input('input the key afer the "star". e.g "http://www.javfee.com/cn/star/9oz" input 9oz:')
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

        start = time.time()
        url = "http://www.javfee.com/cn/star/%s" % urlkey
        pagenum = get_pagenum(url)

        urls = [url + '/currentPage/%i' % i for i in range(1, pagenum + 1)]
        # One pool serves both phases and is always shut down. The original
        # created two pools and never closed or joined the first one.
        pool = ThreadPool(15)
        try:
            pool.map(get_img_url, urls)
            pool.map(write_img, img_title_queue)
        finally:
            pool.close()
            pool.join()
        end = time.time()
        print("耗时.......%d秒" % (end - start))
  2. 3rogue created this gist Aug 10, 2015.
    64 changes: 64 additions & 0 deletions javfee2.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    #coding=utf-8
    import requests, re, os, time
    from multiprocessing.dummy import Pool as ThreadPool

    def get_pagenum(url):
        """Count the consecutive listing pages that respond with an OK status.

        The base ``url`` is requested first, then ``url + '/currentPage/N'``
        for N = 2, 3, ... until a request returns a non-OK status.

        Args:
            url: Base listing URL.

        Returns:
            int: Number of pages that answered OK (0 if the base URL did not).
        """
        ok_pages = 0
        response = requests.get(url)
        while response.status_code == requests.codes.ok:
            ok_pages += 1
            next_url = url + '/currentPage/%d' % (ok_pages + 1)
            response = requests.get(next_url)
        return ok_pages

    # Shared list of (image_url, save_path) pairs, filled by get_img_url.
    img_title_queue = []

    def get_img_url(url):
        """Scrape one listing page and queue ``(image_url, save_path)`` pairs.

        Results are appended to the module-level ``img_title_queue``. Save
        paths are built under the module-level ``dirname`` that the script's
        ``__main__`` section reads from the user. Nothing is queued when the
        page does not return an OK status.

        Args:
            url: Address of the listing page to fetch.
        """
        page = requests.get(url)
        if page.status_code != requests.codes.ok:
            return
        # Plain str.replace instead of re.sub('s.j', 'l.j', ...): the
        # unescaped dot in that pattern matched any character, so unrelated
        # "s?j" sequences elsewhere in the URL were rewritten as well.
        img_queue = [
            found.replace('s.jpg', 'l.jpg')
            for found in re.findall('http://.*?video.*?jpg', page.content)
        ]
        title_list = re.findall('<span>(.*?)(<i.*?</i>)?<br><date>(.*?)</date>.*?<date>(.*?)</date>', page.content)
        # Path layout: ./<dirname>/<title without spaces>_<date 1>_<date 2>.png
        path_queue = [
            './' + dirname + '/' + title[0].decode('utf-8').replace(' ', '') + '_' + title[2] + '_' + title[3] + '.png'
            for title in title_list
        ]
        # zip() stops at the shorter list, so a page with fewer titles than
        # images no longer raises IndexError inside the worker thread.
        img_title_queue.extend(zip(img_queue, path_queue))

    def write_img(imginfo):
        """Download one image to disk, logging failures to ``error_list.txt``.

        Args:
            imginfo: ``(image_url, save_path)`` pair as queued by
                ``get_img_url``.
        """
        try:
            print(imginfo[1])
            response = requests.get(imginfo[0])
            # Treat HTTP error statuses as failures instead of saving the
            # error body as if it were an image.
            response.raise_for_status()
            # Open the file only after the download succeeded, so a failed
            # request never leaves an empty file behind.
            with open(imginfo[1], 'wb') as f:
                f.write(response.content)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            with open('error_list.txt', 'a') as f:
                # Encode the path as UTF-8 before writing it to the log.
                f.write(imginfo[1].encode('utf-8') + '\n')
                f.write(imginfo[0] + '\n')

    if __name__ == '__main__':
        # Script entry point: ask for the output directory and the star key,
        # count the listing pages, scrape them in parallel, then download
        # every queued image in parallel and report the elapsed time.
        dirname = raw_input('input the dirname:')
        urlkey = raw_input('input the key afer the "star". e.g "http://www.javfee.com/cn/star/9oz" input 9oz:')
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

        start = time.time()
        url = "http://www.javfee.com/cn/star/%s" % urlkey
        pagenum = get_pagenum(url)

        urls = [url + '/currentPage/%i' % i for i in range(1, pagenum + 1)]
        # One pool serves both phases and is always shut down. The original
        # created two pools and never closed or joined the first one.
        pool = ThreadPool(15)
        try:
            pool.map(get_img_url, urls)
            pool.map(write_img, img_title_queue)
        finally:
            pool.close()
            pool.join()
        end = time.time()
        print("耗时.......%d秒" % (end - start))