Skip to content

Instantly share code, notes, and snippets.

@cnDelbert
Last active August 29, 2015 14:13
Show Gist options
  • Select an option

  • Save cnDelbert/a4b62515ec89dcb1596b to your computer and use it in GitHub Desktop.

Select an option

Save cnDelbert/a4b62515ec89dcb1596b to your computer and use it in GitHub Desktop.

Revisions

  1. cnDelbert revised this gist Jan 19, 2015. 1 changed file with 27 additions and 13 deletions.
    40 changes: 27 additions & 13 deletions DownTraverse.py
    Original file line number Diff line number Diff line change
    @@ -1,15 +1,20 @@
    # -*- coding: utf-8 -*-
    __author__ = 'Delbert'
    # Download files from an http server which allows traversing.

    # Python 3 Only.
    # requests and BeautifulSoup4 are required.

    from bs4 import BeautifulSoup
    import urllib
    import requests
    import os


    def init():
        """Configure the crawl and start the recursive download.

        Sets the remote base URL, the local destination directory and the
        global ignore set, then hands control to parse().
        (The diff-flattened original assigned basepath/downpath twice; the
        first pair were dead stores and are removed here.)
        """
        global ignoredDir
        basepath = "HTTP SERVER ADDRESS"  # Begin with http:// or ftp:// or https://
        downpath = "DIRECTORY TO STORE"  # Relative is preferred
        ignoredDir = {'IGNORED FILE OR PATH'}  # Shown text only, not relative directory
        parse(basepath, downpath)


    @@ -32,6 +37,9 @@ def parse(baseurl, localpath):

    if d_link["href"].startswith("?C=") or d_link.text == "Parent Directory": # If it's an empty directory
    continue

    if d_link.text in ignoredDir:
    continue

    if d_link.text.endswith('/'): # A link to a child directory
    if not os.path.exists(currentLocalPath + d_link.text):
    @@ -44,18 +52,24 @@ def parse(baseurl, localpath):


    def download(downloadUrl, saveFile):
        """Fetch one URL and save it under *saveFile*.

        Skips files that already exist locally with a non-zero size, and
        appends every fetched URL (percent-decoded) to ./furl.txt as a
        crawl log.  The flattened diff mixed old and new revision lines
        (duplicate guard, duplicate log writes); this is the cleaned-up
        final version.
        """
        print(urllib.parse.unquote(downloadUrl))
        # Skip files already fetched: present on disk and non-empty.
        target = urllib.parse.unquote(saveFile)
        if os.path.isfile(target) and os.path.getsize(target) > 0:
            return

        # Append the decoded URL to the crawl log; `with` guarantees close.
        with open("./furl.txt", "at", encoding='utf-8') as furl:
            furl.write(urllib.parse.unquote(downloadUrl) + '\n')

        r = requests.get(downloadUrl)
        content_type = r.headers.get('content-type', 'unknown').lower()
        if content_type.startswith("text"):  # If it's a text file
            # requests has already decoded the body to str; write it back
            # out as UTF-8.  The original `r.text.decode()` can never
            # succeed on Python 3 (str has no .decode) and always fell
            # into the bare-except gb18030 round-trip, corrupting text.
            with open(target, "wt", encoding='utf-8') as dfile:
                dfile.write(r.text)
        else:
            # Binary responses are written verbatim.  Use the same decoded
            # path as the text branch (the original inconsistently used the
            # still-quoted saveFile here).
            with open(target, "wb") as dfile:
                dfile.write(r.content)
  2. cnDelbert revised this gist Jan 18, 2015. 1 changed file with 11 additions and 6 deletions.
    17 changes: 11 additions & 6 deletions DownTraverse.py
    Original file line number Diff line number Diff line change
    @@ -8,13 +8,13 @@
    import os

    def init():
        """Set the remote root URL and local mirror directory, then crawl.

        The diff-flattened original assigned basepath/downpath twice; the
        first placeholder pair were dead stores and are removed here.
        """
        basepath = "http://graphics.csie.ntu.edu.tw/~apfelpuff/"
        downpath = "./apfelpuff/"
        parse(basepath, downpath)


    def parse(baseurl, localpath):
    print localpath
    print(localpath)
    currentUrl = baseurl
    currentLocalPath = localpath
    req = requests.get(currentUrl)
    @@ -38,22 +38,27 @@ def parse(baseurl, localpath):
    os.mkdir(currentLocalPath + d_link.text)
    parse(currentUrl + d_link.text, currentLocalPath + d_link.text)
    else:
    if not os.path.exists(currentLocalPath):
    os.mkdir(currentLocalPath)
    download(currentUrl + d_link["href"], currentLocalPath + d_link["href"])


    def download(downloadUrl, saveFile):
        """Fetch *downloadUrl* and store it at *saveFile*, logging the URL.

        Cleaned-up version of the flattened diff, which interleaved the old
        and new revision lines (two different furl logs, a duplicated
        dfile.write(r.content)).
        """
        r = requests.get(downloadUrl)
        # Crawl log of every URL fetched; `with` guarantees the handle closes.
        with open("./furl.txt", "at", encoding='utf-8') as furl:
            furl.write(downloadUrl + '\n')
        print(downloadUrl)
        # print(saveFile)
        # print(r.headers['content-type'])
        if os.path.isfile(saveFile):
            return
        if r.headers['content-type'].startswith("text"):  # If it's a text file
            # requests already decoded the body to str, so write r.text
            # directly; the original r.text.decode().encode('utf-8') raises
            # on Python 3 (str has no .decode) and would write bytes to a
            # text-mode file anyway.
            with open(saveFile, "wt") as dfile:
                dfile.write(r.text)
        else:
            with open(saveFile, "wb") as dfile:
                dfile.write(r.content)


  3. cnDelbert created this gist Jan 17, 2015.
    65 changes: 65 additions & 0 deletions DownTraverse.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,65 @@
    # -*- coding: utf-8 -*-
    __author__ = 'Delbert'
    # Download files from an http server which allows traversing.


    from bs4 import BeautifulSoup
    import requests
    import os

    def init():
        """Entry point: choose the remote root and local target, then crawl."""
        remote_root = "http server address"
        local_root = "Folder to save files"
        parse(remote_root, local_root)


    def parse(baseurl, localpath):
        """Recursively walk an auto-index directory page, mirroring it locally.

        baseurl   -- directory URL on the server (expected to end with '/')
        localpath -- matching local directory (expected to end with '/')

        Fix: the original `print localpath` is Python-2 statement syntax and
        a SyntaxError on Python 3 (the rest of the file uses print());
        revision 2 of this gist applies the same fix.
        """
        print(localpath)
        currentUrl = baseurl
        currentLocalPath = localpath
        req = requests.get(currentUrl)
        raw_data = BeautifulSoup(req.text)
        all_link = raw_data.find_all("a")

        if not all_link:  # If the directory contains an empty index.html or others
            return

        # First link is not a "?C=" sort anchor: this is a real page rather
        # than an auto-index listing, so save it as index.html and stop.
        if not all_link[0]["href"].startswith("?C="):
            download(currentUrl, currentLocalPath + "index.html")
            return

        for d_link in all_link:

            # Skip the sort-order anchors and the parent-directory link.
            if d_link["href"].startswith("?C=") or d_link.text == "Parent Directory":
                continue

            if d_link.text.endswith('/'):  # A link to a child directory
                if not os.path.exists(currentLocalPath + d_link.text):
                    os.mkdir(currentLocalPath + d_link.text)
                parse(currentUrl + d_link.text, currentLocalPath + d_link.text)
            else:
                download(currentUrl + d_link["href"], currentLocalPath + d_link["href"])


    def download(downloadUrl, saveFile):
        """Fetch *downloadUrl* and write it to *saveFile*, logging the URL.

        Fixes two Python-3 TypeErrors in the original:
        - `downloadUrl.encode('utf-8') + '\\n'` concatenated bytes with str;
          the log file is now opened with an explicit encoding and receives
          the plain str URL.
        - the text branch opened the file in "wt" mode but then wrote the
          bytes `r.content`; it now writes the decoded `r.text`.
        `with` blocks replace the manual close() calls.
        """
        r = requests.get(downloadUrl)
        # Crawl log of every URL fetched ("+" in the old "at+" mode was
        # unused -- nothing ever read the handle).
        with open("./down/furl", "at", encoding='utf-8') as furl:
            furl.write(downloadUrl + '\n')
        print(downloadUrl)
        # print(saveFile)
        # print(r.headers['content-type'])
        if r.headers['content-type'].startswith("text"):  # If it's a text file
            with open(saveFile, "wt") as dfile:
                dfile.write(r.text)
        else:
            with open(saveFile, "wb") as dfile:
                dfile.write(r.content)


    def main():
        """Script entry point: delegate to init()."""
        init()


    # Run the crawler only when executed as a script, not when imported.
    if __name__ == '__main__':
    main()