Skip to content

Instantly share code, notes, and snippets.

@kylemcdonald
Last active October 27, 2018 02:20
Show Gist options
  • Select an option

  • Save kylemcdonald/0d769bdab6b0251f4644 to your computer and use it in GitHub Desktop.

Select an option

Save kylemcdonald/0d769bdab6b0251f4644 to your computer and use it in GitHub Desktop.
Download all the content from Seene.
from os.path import basename
from urlparse import urlparse
from os.path import splitext, basename
from multiprocessing.dummy import Pool
from urllib3 import HTTPConnectionPool
from tqdm import tqdm
import fnmatch
import json
import os
import errno
def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    An ``OSError`` with ``errno.EEXIST`` is ignored when *path* already is
    a directory; every other ``OSError`` is re-raised unchanged.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_directory = (
            err.errno == errno.EEXIST and os.path.isdir(path)
        )
        if not already_a_directory:
            raise
def list_all_files(directory, extensions=None, limit=None):
    """Yield the paths of files found under *directory*, with a progress bar.

    Args:
        directory: Root of the tree walked with ``os.walk``.
        extensions: Container of lower-case extensions including the dot
            (e.g. ``['.json']``); ``None`` accepts every file.
        limit: Stop after this many paths have been yielded; ``None``
            means no limit.

    A ``KeyboardInterrupt`` raised while a directory is being processed
    ends the walk instead of propagating.
    """
    yielded = 0
    with tqdm() as pbar:
        for root, dirnames, filenames in os.walk(directory):
            try:
                for name in filenames:
                    if extensions is not None:
                        suffix = os.path.splitext(name)[1].lower()
                        if suffix not in extensions:
                            continue
                    yield os.path.join(root, name)
                    pbar.update(1)
                    yielded += 1
                    if yielded == limit:
                        return
            except KeyboardInterrupt:
                break
def build_identifiers():
    """Collect every scene identifier from ``users/**/*.json``.

    Each JSON file is expected to hold an object whose ``'scenes'`` entry
    is a list of objects with an ``'identifier'`` string.  The identifiers
    are written one per line to ``identifiers.txt`` in the current
    directory.

    Files that cannot be read or do not have the expected shape are
    skipped (best effort) and counted; Ctrl-C stops processing early and
    keeps whatever has been written so far.
    """
    print('Loading filenames...')
    # Materialised so tqdm below knows the total and can show a percentage.
    all_files = list(list_all_files('users', extensions=['.json'], limit=None))
    print('Processing files...')
    skipped = 0
    with open('identifiers.txt', 'w') as out:
        for fn in tqdm(all_files):
            try:
                with open(fn, 'r') as f:
                    user = json.load(f)
                for scene in user['scenes']:
                    identifier = scene['identifier']
                    out.write(identifier + '\n')
            except KeyboardInterrupt:
                break
            except (IOError, OSError, ValueError, KeyError, TypeError):
                # Unreadable file, invalid JSON, or unexpected structure:
                # skip this user but keep going with the rest.
                skipped += 1
    if skipped:
        print('Skipped %d unreadable files.' % skipped)
# Build the identifier list only once; later runs reuse the cached file.
if not os.path.exists('identifiers.txt'):
    print('Building identifiers.')
    build_identifiers()

with open('identifiers.txt') as f:
    identifiers = f.read().splitlines()
# Uncomment to try the script on a small subset first:
# identifiers = identifiers[:128]

n_connections = 64
domain = 'd2qkfprjkxv2r7.cloudfront.net'
# A urllib3 pool keeps only one connection unless told otherwise; size it
# to the number of worker threads so connections are actually reused
# instead of being opened and discarded for almost every request.
http_pool = HTTPConnectionPool(domain, maxsize=n_connections)
def download(url, fn):
    """Fetch *url* through the shared ``http_pool`` and save it as *fn*.

    Nothing is requested when *fn* already exists, so an interrupted run
    can simply be restarted.

    Args:
        url: Path part of the URL, requested from the pool's host.
        fn: Destination file path.

    Returns:
        True if *fn* exists when the call returns, False if the server
        answered with a status other than 200 and nothing was saved.
    """
    if os.path.isfile(fn):
        return True
    r = http_pool.urlopen('GET', url)
    if r.status != 200:
        # Saving an error body under the final name would make the
        # isfile() check above skip this file on every later run.
        return False
    with open(fn, 'wb') as f:
        f.write(r.data)
    return True
# Progress bar for the download phase: one tick per identifier, advanced
# by job() below.
pbar = tqdm(total=len(identifiers), leave=True)
def job(identifier):
    """Download the model and the poster image of one scene.

    Files are stored as ``scenes/<first two characters>/<identifier>.oemodel``
    and ``.jpg``; the shared progress bar advances by one when both
    downloads have returned.
    """
    dir_name = os.path.join('scenes', identifier[:2])
    mkdir_p(dir_name)
    # (remote path, local file) pairs, fetched in this order.
    targets = (
        ('/uploads/scene/model/%s/scene.oemodel' % identifier,
         os.path.join(dir_name, '%s.oemodel' % identifier)),
        ('/uploads/scene/poster/%s/poster.jpg' % identifier,
         os.path.join(dir_name, '%s.jpg' % identifier)),
    )
    for remote_path, local_fn in targets:
        download(remote_path, local_fn)
    pbar.update(1)
# multiprocessing.dummy.Pool is a thread pool: one worker thread per
# connection, each running job() on the identifiers handed to it.
pool = Pool(n_connections)
pool.map(job, identifiers)
# Release the worker threads and let the progress bar draw its final state.
pool.close()
pool.join()
pbar.close()
#!/usr/bin/env bash
# Fetch the scene listing of every user ID in FIRST..LAST, split into
# contiguous ID ranges that are each handled by one background curl process.
FIRST=1
LAST=641400 # 641415 total as of march 19
N=100
echo "Starting $N instances of curl..."
# Ceiling division: never more than N ranges, even when the number of IDs
# is not a multiple of N (plain LAST/N would spawn an extra range).
STEP=$(( (LAST - FIRST + N) / N ))
for START in $(seq "$FIRST" "$STEP" "$LAST"); do
  END=$((START + STEP - 1))
  # The final range must not run past LAST.
  if [ "$END" -gt "$LAST" ]; then
    END=$LAST
  fi
  # echo "$START to $END"
  mkdir -p "users/$START"
  # curl expands the [START-END] range itself; "#1" in the output name is
  # replaced by the ID currently being fetched.
  curl -s "https://seene.co/api/seene/-/users/[$START-$END]/scenes?count=1000" -o "users/$START/#1.json" &
done
# Report progress until every expected file exists or all background curl
# processes have exited (some requests may fail and never produce a file).
# IDs FIRST..LAST inclusive, hence the +1.
TOTAL=$((LAST - FIRST + 1))
while true; do
  # Count only the downloaded user files, not other JSON under the cwd.
  CUR=$(find users -name "*.json" 2>/dev/null | wc -l | tr -d '[:space:]')
  PCT=$(((100 * CUR) / TOTAL))
  echo -en "\r$PCT% ($CUR/$TOTAL)"
  # -ge rather than -eq: the count can jump past TOTAL between two polls.
  if [ "$CUR" -ge "$TOTAL" ]; then
    break
  fi
  # No running background jobs left: nothing more will arrive.
  if [ -z "$(jobs -pr)" ]; then
    break
  fi
  sleep 3
done
echo
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment