Last active
October 27, 2018 02:20
-
-
Save kylemcdonald/0d769bdab6b0251f4644 to your computer and use it in GitHub Desktop.
Download all the content from Seene.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from os.path import basename | |
| from urlparse import urlparse | |
| from os.path import splitext, basename | |
| from multiprocessing.dummy import Pool | |
| from urllib3 import HTTPConnectionPool | |
| from tqdm import tqdm | |
| import fnmatch | |
| import json | |
| import os | |
| import errno | |
def mkdir_p(path):
    """Create `path` and any missing parent directories, like ``mkdir -p``.

    An already existing directory is not an error.  Any other failure
    (including `path` existing as a regular file) re-raises the OSError.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        # Only "already exists, and it is a directory" is acceptable.
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
def list_all_files(directory, extensions=None, limit=None):
    """Yield the path of every file below `directory`, with a tqdm counter.

    directory  -- root of the tree walked with os.walk.
    extensions -- collection of lower-case extensions including the dot
                  (e.g. ['.json']); None accepts every file.
    limit      -- stop after this many paths have been yielded; None means
                  no limit.

    A KeyboardInterrupt raised while a directory is being processed ends
    the walk instead of propagating.
    """
    # NOTE(review): the original indentation was lost in the paste; the
    # progress/limit bookkeeping is assumed to apply to matching files only.
    yielded = 0
    with tqdm() as pbar:
        for root, _dirnames, names in os.walk(directory):
            try:
                for name in names:
                    suffix = os.path.splitext(name)[1].lower()
                    if extensions is not None and suffix not in extensions:
                        continue
                    yield os.path.join(root, name)
                    pbar.update(1)
                    yielded += 1
                    if yielded == limit:
                        return
            except KeyboardInterrupt:
                break
def build_identifiers():
    """Write every scene identifier found under ``users/`` to identifiers.txt.

    Each ``.json`` file below ``users`` is loaded, and the ``identifier`` of
    every entry in its ``scenes`` list is written on its own line to
    ``identifiers.txt`` in the current directory (the file is overwritten).

    Files that cannot be read, parsed, or that lack the expected keys are
    skipped; the number of skipped files is printed at the end so failures
    are no longer invisible.  Ctrl-C stops processing early and keeps what
    has been written so far.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Loading filenames...')
    all_files = list(list_all_files('users', extensions=['.json'], limit=None))
    print('Processing files...')
    skipped = 0
    with open('identifiers.txt', 'w') as out:
        for fn in tqdm(all_files):
            try:
                with open(fn, 'r') as f:
                    user = json.load(f)
                for scene in user['scenes']:
                    out.write(scene['identifier'] + '\n')
            except KeyboardInterrupt:
                break
            except Exception:
                # Best effort: one bad file must not abort the whole run,
                # but unlike a bare `except` this lets SystemExit through.
                skipped += 1
    if skipped:
        print('Skipped %d unreadable or malformed files.' % skipped)
# Building the identifier list means parsing every user file, so the result
# is cached in identifiers.txt and only rebuilt when that file is missing.
if not os.path.exists('identifiers.txt'):
    print('Building identifiers.')
    build_identifiers()
with open('identifiers.txt') as f:
    identifiers = f.read().splitlines()
# Uncomment to try the download on a small sample first:
# identifiers = identifiers[:128]

# Number of worker threads downloading in parallel.
n_connections = 64
domain = 'd2qkfprjkxv2r7.cloudfront.net'
# urllib3 keeps only `maxsize` connections alive (default 1).  With the
# default, all but one of the worker threads would open and discard a new
# connection for every request, so size the pool to match the thread count.
http_pool = HTTPConnectionPool(domain, maxsize=n_connections)
def download(url, fn):
    """Fetch `url` through the shared `http_pool` and save the body to `fn`.

    url -- path on the pool's host, e.g. '/uploads/scene/poster/<id>/poster.jpg'.
    fn  -- destination file; if it already exists nothing is requested.

    Returns True when `fn` exists after the call (already present or just
    downloaded) and False when the server did not answer with status 200.
    """
    if os.path.isfile(fn):
        return True
    r = http_pool.urlopen('GET', url)
    if r.status != 200:
        # urlopen does not raise on 4xx/5xx.  Saving the error body would
        # leave a bogus file that later runs treat as already downloaded.
        return False
    with open(fn, 'wb') as f:
        f.write(r.data)
    return True
# One tick per identifier; shared by all worker threads running job().
pbar = tqdm(total=len(identifiers), leave=True)


def job(identifier):
    """Download the model and the poster image of one scene.

    Files are stored under ``scenes/<first two characters of identifier>/``
    as ``<identifier>.oemodel`` and ``<identifier>.jpg``.  The shared
    progress bar advances by one once both downloads have been attempted.
    """
    dir_name = os.path.join('scenes', identifier[:2])
    mkdir_p(dir_name)
    # (remote path, local file) pairs, fetched in this order.
    targets = (
        ('/uploads/scene/model/%s/scene.oemodel' % identifier,
         os.path.join(dir_name, '%s.oemodel' % identifier)),
        ('/uploads/scene/poster/%s/poster.jpg' % identifier,
         os.path.join(dir_name, '%s.jpg' % identifier)),
    )
    for remote_path, local_fn in targets:
        download(remote_path, local_fn)
    pbar.update(1)
# Run job() for every identifier on a pool of n_connections threads.
pool = Pool(n_connections)
try:
    pool.map(job, identifiers)
finally:
    # Shut the worker threads down and finish the progress bar even when a
    # job raised or the run was interrupted.
    pool.close()
    pool.join()
    pbar.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Fetch the scene listing of every Seene user ID in [FIRST, LAST].
# The ID range is split into roughly N chunks; one background curl per chunk
# writes users/<chunk start>/<user id>.json.  Progress is shown by counting
# the JSON files written so far.
FIRST=1
LAST=641400 # 641415 total as of march 19
N=100
echo "Starting $N instances of curl..."
STEP=$((LAST/N))
# A zero step (LAST < N) would make seq fail or loop forever.
if [ "$STEP" -lt 1 ]; then
  STEP=1
fi
for START in `seq $FIRST $STEP $LAST`; do
  END=$((START+STEP-1))
  # Do not request IDs past LAST when the range does not divide evenly.
  if [ "$END" -gt "$LAST" ]; then
    END=$LAST
  fi
  # echo "$START to $END"
  mkdir -p "users/$START"
  curl -s "https://seene.co/api/seene/-/users/[$START-$END]/scenes?count=1000" -o "users/$START/#1.json" &
done
# FIRST and LAST are both inclusive, so the expected file count is LAST-FIRST+1.
TOTAL=$((LAST-FIRST+1))
while true; do
  # Count only inside users/ so unrelated .json files are not included.
  CUR=`find users -name "*.json" | wc -l`
  PCT=$(((100*CUR)/TOTAL))
  echo -en "\r$PCT% ($CUR/$TOTAL)"
  sleep 3
  # -ge rather than -eq: the count can jump past TOTAL between two polls.
  if [ "$CUR" -ge "$TOTAL" ]; then
    break
  fi
done
# Let the background curls finish writing their last files before exiting.
wait
echo
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment