Last active
August 29, 2015 14:13
-
-
Save cnDelbert/a4b62515ec89dcb1596b to your computer and use it in GitHub Desktop.
Revisions
-
cnDelbert revised this gist
Jan 19, 2015 . 1 changed file with 27 additions and 13 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,15 +1,20 @@ # -*- coding: utf-8 -*- __author__ = 'Delbert' # Download files from an http server which allows traversing. # Python 3 Only. # requests and BeautifulSoup4 are required. from bs4 import BeautifulSoup import urllib import requests import os def init(): global ignoredDir basepath = "HTTP SERVER ADDRESS" # Begin with http:// or ftp:// or https:// downpath = "DIRECTORY TO STORE" # Relative is preferred ignoredDir = {'IGNORED FILE OR PATH'} # Shown text only, not relative directory parse(basepath, downpath) @@ -32,6 +37,9 @@ def parse(baseurl, localpath): if d_link["href"].startswith("?C=") or d_link.text == "Parent Directory": # If it's an empty directory continue if d_link.text in ignoredDir: continue if d_link.text.endswith('/'): # A link to a child directory if not os.path.exists(currentLocalPath + d_link.text): @@ -44,18 +52,24 @@ def parse(baseurl, localpath): def download(downloadUrl, saveFile): print(urllib.parse.unquote(downloadUrl)) if os.path.isfile(urllib.parse.unquote(saveFile)) and os.path.getsize(urllib.parse.unquote(saveFile)) > 0: return furl = open("./furl.txt", "at", encoding='utf-8') furl.write(urllib.parse.unquote(downloadUrl) + '\n') furl.close() r = requests.get(downloadUrl) # print(r.headers.get('content-type', 'unknown').lower()) content_type = r.headers.get('content-type', 'unknown').lower() if content_type.startswith("text"): # If it's a text file dfile = open(urllib.parse.unquote(saveFile), "wt") try: temp = r.text.decode().encode('utf-8', 'ignore') except: temp = r.text.encode('utf-8').decode('gb18030') dfile.write(temp) else: dfile = open(saveFile, "wb") dfile.write(r.content) -
cnDelbert revised this gist
Jan 18, 2015 . 1 changed file with 11 additions and 6 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -8,13 +8,13 @@ import os def init(): basepath = "http://graphics.csie.ntu.edu.tw/~apfelpuff/" downpath = "./apfelpuff/" parse(basepath, downpath) def parse(baseurl, localpath): print(localpath) currentUrl = baseurl currentLocalPath = localpath req = requests.get(currentUrl) @@ -38,22 +38,27 @@ def parse(baseurl, localpath): os.mkdir(currentLocalPath + d_link.text) parse(currentUrl + d_link.text, currentLocalPath + d_link.text) else: if not os.path.exists(currentLocalPath): os.mkdir(currentLocalPath) download(currentUrl + d_link["href"], currentLocalPath + d_link["href"]) def download(downloadUrl, saveFile): r = requests.get(downloadUrl) furl = open("./furl.txt", "at", encoding='utf-8') furl.write(downloadUrl + '\n') furl.close() print(downloadUrl) # print(saveFile) # print(r.headers['content-type']) if os.path.isfile(saveFile): return if r.headers['content-type'].startswith("text"): # If it's a text file dfile = open(saveFile, "wt") dfile.write(r.text.decode().encode('utf-8')) else: dfile = open(saveFile, "wb") dfile.write(r.content) dfile.close() -
cnDelbert created this gist
Jan 17, 2015. There are no files selected for viewing
# -*- coding: utf-8 -*-
"""Mirror files from an HTTP server that exposes browsable directory listings.

requests and BeautifulSoup4 are required.
"""
__author__ = 'Delbert'
# Download files from an http server which allows traversing.

import os

import requests
from bs4 import BeautifulSoup

# Append-only log of every URL handed to download().
URL_LOG_PATH = "./down/furl"


def init():
    """Configure the server address and the target folder, then start mirroring.

    Both values are joined to link names by plain string concatenation in
    parse(), so each one should end with a '/'.
    """
    basepath = "http server address"
    downpath = "Folder to save files"

    parse(basepath, downpath)


def parse(baseurl, localpath):
    """Walk the listing at *baseurl* and download everything below it.

    Args:
        baseurl: URL of the directory to read; child names are appended to it.
        localpath: Local directory prefix that mirrors *baseurl*.

    A page with no anchors is skipped. A page whose first anchor is not a
    "?C=" link is treated as a regular default page and saved as
    ``index.html``. Otherwise every anchor is either recursed into (link text
    ends with '/') or downloaded as a file.
    """
    print(localpath)

    req = requests.get(baseurl)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    raw_data = BeautifulSoup(req.text, "html.parser")
    all_link = raw_data.find_all("a")

    if not all_link:
        # The directory contains an empty index.html or similar.
        return

    # "?C=" links are presumably the column-sort links of an auto-generated
    # index; without one in first position the page is a default page.
    if not all_link[0].get("href", "").startswith("?C="):
        download(baseurl, localpath + "index.html")
        return

    for d_link in all_link:
        href = d_link.get("href")
        if href is None:
            # Anchor without a target (e.g. a named anchor): nothing to fetch.
            continue

        if href.startswith("?C=") or d_link.text == "Parent Directory":
            # Sorting links and the way back up are not content.
            continue

        if d_link.text.endswith('/'):
            # A link to a child directory.
            # NOTE(review): the link text, not the href, is used for both the
            # URL and the folder name -- confirm the server never truncates it.
            child_dir = localpath + d_link.text
            os.makedirs(child_dir, exist_ok=True)
            parse(baseurl + d_link.text, child_dir)
        else:
            download(baseurl + href, localpath + href)


def download(downloadUrl, saveFile):
    """Fetch *downloadUrl*, record it in the URL log, and save it to *saveFile*.

    Args:
        downloadUrl: Full URL of the resource to fetch.
        saveFile: Local path the response body is written to.

    The body is always written as raw bytes, so the stored file is identical
    to what the server sent regardless of its content type.
    """
    r = requests.get(downloadUrl)

    # The log lives in its own folder, which may not exist on a first run.
    log_dir = os.path.dirname(URL_LOG_PATH)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(URL_LOG_PATH, "a", encoding="utf-8") as furl:
        furl.write(downloadUrl + '\n')

    print(downloadUrl)

    # The top-level target folder is never created by parse(), so make sure
    # the destination directory exists before opening the file.
    target_dir = os.path.dirname(saveFile)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # r.content is bytes; writing it in binary mode works for text and binary
    # responses alike and needs no content-type header.
    with open(saveFile, "wb") as dfile:
        dfile.write(r.content)


def main():
    """Script entry point."""
    init()


if __name__ == '__main__':
    main()