Skip to content

Instantly share code, notes, and snippets.

@cnDelbert
Last active August 29, 2015 14:13
Show Gist options
  • Select an option

  • Save cnDelbert/a4b62515ec89dcb1596b to your computer and use it in GitHub Desktop.

Select an option

Save cnDelbert/a4b62515ec89dcb1596b to your computer and use it in GitHub Desktop.
Download files from an http server which allows traversing.
# -*- coding: utf-8 -*-
__author__ = 'Delbert'
# Recursively download files from an HTTP server that exposes browsable directory listings.
from bs4 import BeautifulSoup
import requests
import os
def init(basepath="http://graphics.csie.ntu.edu.tw/~apfelpuff/",
         downpath="./apfelpuff/"):
    """Mirror the directory listing at *basepath* into *downpath*.

    Args:
        basepath: URL of the remote directory to start from. A trailing
            slash is appended when missing, because child URLs are built
            by plain string concatenation in ``parse``.
        downpath: Local directory that receives the files. A trailing
            slash is appended when missing, for the same reason. An empty
            value means the current directory.
    """
    if not basepath.endswith("/"):
        basepath += "/"
    if not downpath:
        downpath = "./"
    elif not downpath.endswith(("/", os.sep)):
        downpath += "/"

    # Create the download root up front (including missing parents) so the
    # first file or sub-directory written below it has somewhere to go.
    os.makedirs(downpath, exist_ok=True)
    parse(basepath, downpath)
def parse(baseurl, localpath):
    """Recursively mirror the directory listing served at *baseurl*.

    The page is fetched and its ``<a href>`` links are inspected:

    * If there are no links, nothing is downloaded.
    * If the first link is not a ``?C=`` column-sort link, the page is
      treated as a default page and saved as ``index.html``.
    * Otherwise every entry is visited: links ending in ``/`` are recursed
      into as sub-directories, everything else is passed to ``download``.

    Args:
        baseurl: URL of the remote directory; expected to end with ``/``
            because child URLs are built by concatenation.
        localpath: Local directory prefix; expected to end with ``/`` (or
            be empty) because child paths are built by concatenation.
    """
    # Local import keeps this function self-contained; the module's import
    # block does not pull in urllib.
    from urllib.parse import unquote

    print(localpath)
    response = requests.get(baseurl)
    if not response.ok:
        # An error page (403/404/...) is not a listing; don't mirror it.
        return

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True drops anchors that have no href attribute at all, which
    # would otherwise raise KeyError below.
    links = soup.find_all("a", href=True)
    if not links:  # Empty index.html or a page without any links.
        return

    if localpath:
        os.makedirs(localpath, exist_ok=True)

    if not links[0]["href"].startswith("?C="):
        # No column-sort links: the directory serves a default page.
        download(baseurl, localpath + "index.html")
        return

    for link in links:
        href = link["href"]
        if href.startswith("?C=") or link.text == "Parent Directory":
            continue

        # hrefs are percent-encoded; decode them for the local file name.
        name = unquote(href)
        # The listing is remote, untrusted input: only follow plain relative
        # child links so a crafted href cannot leave baseurl or write outside
        # localpath.
        if (
            not href
            or href.startswith(("/", "#", "?"))
            or "://" in href
            or ".." in name.split("/")
        ):
            continue

        local_target = localpath + name
        # Use the href, not the visible text, to detect and follow
        # sub-directories: the displayed text may be an abbreviated name.
        if href.endswith("/"):
            os.makedirs(local_target, exist_ok=True)
            parse(baseurl + href, local_target)
        else:
            download(baseurl + href, local_target)
def download(downloadUrl, saveFile):
    """Fetch *downloadUrl* and store it at *saveFile*.

    Every URL passed in is appended to ``./furl.txt`` and printed, whether
    or not it ends up being fetched. If *saveFile* already exists the
    function returns without contacting the server. Responses whose
    ``Content-Type`` starts with ``text`` are written as UTF-8 text;
    anything else is written as raw bytes.

    Args:
        downloadUrl: URL of the file to fetch.
        saveFile: Local path to write; its directory must already exist.
    """
    with open("./furl.txt", "at", encoding="utf-8") as url_log:
        url_log.write(downloadUrl + "\n")
    print(downloadUrl)

    # Check before fetching so that re-running the script does not download
    # files that are already on disk.
    if os.path.isfile(saveFile):
        return

    response = requests.get(downloadUrl)
    # The header may be absent; treat that as "not text" and save raw bytes.
    content_type = response.headers.get("content-type", "")
    if content_type.startswith("text"):
        # response.text is already a decoded str, so let the file object
        # encode it. newline="" keeps the server's line endings untouched.
        with open(saveFile, "wt", encoding="utf-8", newline="") as out:
            out.write(response.text)
    else:
        with open(saveFile, "wb") as out:
            out.write(response.content)
def main():
    """Script entry point: kick off the download run via ``init``."""
    init()
# Start the download only when run as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment