Skip to content

Instantly share code, notes, and snippets.

@cnDelbert
Last active August 29, 2015 14:13
Show Gist options
  • Select an option

  • Save cnDelbert/a4b62515ec89dcb1596b to your computer and use it in GitHub Desktop.

Select an option

Save cnDelbert/a4b62515ec89dcb1596b to your computer and use it in GitHub Desktop.

Revisions

  1. cnDelbert revised this gist Jan 19, 2015. 1 changed file with 27 additions and 13 deletions.
    40 changes: 27 additions & 13 deletions DownTraverse.py
    Original file line number Diff line number Diff line change
    @@ -1,15 +1,20 @@
    # -*- coding: utf-8 -*-
    __author__ = 'Delbert'
    # Download files from an http server which allows traversing.

    # Python 3 Only.
    # requests and BeautifulSoup4 are required.

    from bs4 import BeautifulSoup
    import urllib
    import requests
    import os


    def init():
        """Configure the crawl and start the recursive download.

        Sets the remote base URL, the local destination directory and the
        global ignore set, then hands control to parse().
        (The diff-flattened original assigned basepath/downpath twice; the
        first pair were dead stores and are removed here.)
        """
        global ignoredDir
        basepath = "HTTP SERVER ADDRESS"  # Begin with http:// or ftp:// or https://
        downpath = "DIRECTORY TO STORE"  # Relative is preferred
        ignoredDir = {'IGNORED FILE OR PATH'}  # Shown text only, not relative directory
        parse(basepath, downpath)


    @@ -32,6 +37,9 @@ def parse(baseurl, localpath):

    if d_link["href"].startswith("?C=") or d_link.text == "Parent Directory": # If it's an empty directory
    continue

    if d_link.text in ignoredDir:
    continue

    if d_link.text.endswith('/'): # A link to a child directory
    if not os.path.exists(currentLocalPath + d_link.text):
    @@ -44,18 +52,24 @@ def parse(baseurl, localpath):


    def download(downloadUrl, saveFile):
        """Fetch one URL and save it under *saveFile*.

        Skips files that already exist locally with a non-zero size, and
        appends every fetched URL (percent-decoded) to ./furl.txt as a
        crawl log.  The flattened diff mixed old and new revision lines
        (duplicate guard, duplicate log writes); this is the cleaned-up
        final version.
        """
        print(urllib.parse.unquote(downloadUrl))
        # Skip files already fetched: present on disk and non-empty.
        target = urllib.parse.unquote(saveFile)
        if os.path.isfile(target) and os.path.getsize(target) > 0:
            return

        # Append the decoded URL to the crawl log; `with` guarantees close.
        with open("./furl.txt", "at", encoding='utf-8') as furl:
            furl.write(urllib.parse.unquote(downloadUrl) + '\n')

        r = requests.get(downloadUrl)
        content_type = r.headers.get('content-type', 'unknown').lower()
        if content_type.startswith("text"):  # If it's a text file
            # requests has already decoded the body to str; write it back
            # out as UTF-8.  The original `r.text.decode()` can never
            # succeed on Python 3 (str has no .decode) and always fell
            # into the bare-except gb18030 round-trip, corrupting text.
            with open(target, "wt", encoding='utf-8') as dfile:
                dfile.write(r.text)
        else:
            # Binary responses are written verbatim.  Use the same decoded
            # path as the text branch (the original inconsistently used the
            # still-quoted saveFile here).
            with open(target, "wb") as dfile:
                dfile.write(r.content)
  2. cnDelbert revised this gist Jan 18, 2015. 1 changed file with 11 additions and 6 deletions.
    17 changes: 11 additions & 6 deletions DownTraverse.py
    Original file line number Diff line number Diff line change
    @@ -8,13 +8,13 @@
    import os

    def init():
        """Set the remote root URL and local mirror directory, then crawl.

        The diff-flattened original assigned basepath/downpath twice; the
        first placeholder pair were dead stores and are removed here.
        """
        basepath = "http://graphics.csie.ntu.edu.tw/~apfelpuff/"
        downpath = "./apfelpuff/"
        parse(basepath, downpath)


    def parse(baseurl, localpath):
    print localpath
    print(localpath)
    currentUrl = baseurl
    currentLocalPath = localpath
    req = requests.get(currentUrl)
    @@ -38,22 +38,27 @@ def parse(baseurl, localpath):
    os.mkdir(currentLocalPath + d_link.text)
    parse(currentUrl + d_link.text, currentLocalPath + d_link.text)
    else:
    if not os.path.exists(currentLocalPath):
    os.mkdir(currentLocalPath)
    download(currentUrl + d_link["href"], currentLocalPath + d_link["href"])


    def download(downloadUrl, saveFile):
        """Fetch *downloadUrl* and store it at *saveFile*, logging the URL.

        Cleaned-up version of the flattened diff, which interleaved the old
        and new revision lines (two different furl logs, a duplicated
        dfile.write(r.content)).
        """
        r = requests.get(downloadUrl)
        # Crawl log of every URL fetched; `with` guarantees the handle closes.
        with open("./furl.txt", "at", encoding='utf-8') as furl:
            furl.write(downloadUrl + '\n')
        print(downloadUrl)
        # print(saveFile)
        # print(r.headers['content-type'])
        if os.path.isfile(saveFile):
            return
        if r.headers['content-type'].startswith("text"):  # If it's a text file
            # requests already decoded the body to str, so write r.text
            # directly; the original r.text.decode().encode('utf-8') raises
            # on Python 3 (str has no .decode) and would write bytes to a
            # text-mode file anyway.
            with open(saveFile, "wt") as dfile:
                dfile.write(r.text)
        else:
            with open(saveFile, "wb") as dfile:
                dfile.write(r.content)


  3. cnDelbert created this gist Jan 17, 2015.
    65 changes: 65 additions & 0 deletions DownTraverse.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,65 @@
    # -*- coding: utf-8 -*-
    __author__ = 'Delbert'
    # Download files from an http server which allows traversing.


    from bs4 import BeautifulSoup
    import requests
    import os

    def init():
        """Entry point: choose the remote root and local target, then crawl."""
        remote_root = "http server address"
        local_root = "Folder to save files"
        parse(remote_root, local_root)


    def parse(baseurl, localpath):
        """Recursively walk an auto-index directory page, mirroring it locally.

        baseurl   -- directory URL on the server (expected to end with '/')
        localpath -- matching local directory (expected to end with '/')

        Fix: the original `print localpath` is Python-2 statement syntax and
        a SyntaxError on Python 3 (the rest of the file uses print());
        revision 2 of this gist applies the same fix.
        """
        print(localpath)
        currentUrl = baseurl
        currentLocalPath = localpath
        req = requests.get(currentUrl)
        raw_data = BeautifulSoup(req.text)
        all_link = raw_data.find_all("a")

        if not all_link:  # If the directory contains an empty index.html or others
            return

        # First link is not a "?C=" sort anchor: this is a real page rather
        # than an auto-index listing, so save it as index.html and stop.
        if not all_link[0]["href"].startswith("?C="):
            download(currentUrl, currentLocalPath + "index.html")
            return

        for d_link in all_link:

            # Skip the sort-order anchors and the parent-directory link.
            if d_link["href"].startswith("?C=") or d_link.text == "Parent Directory":
                continue

            if d_link.text.endswith('/'):  # A link to a child directory
                if not os.path.exists(currentLocalPath + d_link.text):
                    os.mkdir(currentLocalPath + d_link.text)
                parse(currentUrl + d_link.text, currentLocalPath + d_link.text)
            else:
                download(currentUrl + d_link["href"], currentLocalPath + d_link["href"])


    def download(downloadUrl, saveFile):
        """Fetch *downloadUrl* and write it to *saveFile*, logging the URL.

        Fixes two Python-3 TypeErrors in the original:
        - `downloadUrl.encode('utf-8') + '\\n'` concatenated bytes with str;
          the log file is now opened with an explicit encoding and receives
          the plain str URL.
        - the text branch opened the file in "wt" mode but then wrote the
          bytes `r.content`; it now writes the decoded `r.text`.
        `with` blocks replace the manual close() calls.
        """
        r = requests.get(downloadUrl)
        # Crawl log of every URL fetched ("+" in the old "at+" mode was
        # unused -- nothing ever read the handle).
        with open("./down/furl", "at", encoding='utf-8') as furl:
            furl.write(downloadUrl + '\n')
        print(downloadUrl)
        # print(saveFile)
        # print(r.headers['content-type'])
        if r.headers['content-type'].startswith("text"):  # If it's a text file
            with open(saveFile, "wt") as dfile:
                dfile.write(r.text)
        else:
            with open(saveFile, "wb") as dfile:
                dfile.write(r.content)


    def main():
        """Script entry point: delegate to init()."""
        init()


    # Run the crawler only when executed as a script, not when imported.
    if __name__ == '__main__':
    main()