Skip to content

Instantly share code, notes, and snippets.

@jesteria
Created March 1, 2020 18:41
Show Gist options
  • Select an option

  • Save jesteria/968f96a884817ac495c423fca52314c0 to your computer and use it in GitHub Desktop.

Select an option

Save jesteria/968f96a884817ac495c423fca52314c0 to your computer and use it in GitHub Desktop.
pandas chunk-read resource
import ohio
def open_url_lazy(url):
    """Return a file-like text object over the resource at the given url.

    The chunks generated by ``request_chunks(url)`` are handed to
    ``ohio.IteratorTextIO``, and that wrapper is returned to the caller.

    url: resource locator (str)
    """
    return ohio.IteratorTextIO(request_chunks(url))
# pandas doesn't seem to know how to stream/chunk from an *online* resource
# TODO: confirm that chunksize doesn't fix this
import pandas as pd
# Location of the CSV resource loaded below.
DIVVY_DATA_URL = 'https://data.cityofchicago.org/api/views/fg6s-gzvg/rows.csv'

# Columns requested from the CSV via ``usecols``.
_TRIP_COLUMNS = [
    'TRIP ID',
    'START TIME',
    'STOP TIME',
    'TRIP DURATION',
    'USER TYPE',
    'GENDER',
    'BIRTH YEAR',
    'FROM LOCATION',
    'TO LOCATION',
]

# Columns handed to ``parse_dates``.
_TIMESTAMP_COLUMNS = ['START TIME', 'STOP TIME']

# Read from the lazily-opened url, indexed by trip id and capped by ``nrows``.
df = pd.read_csv(
    open_url_lazy(DIVVY_DATA_URL),
    usecols=_TRIP_COLUMNS,
    index_col='TRIP ID',
    parse_dates=_TIMESTAMP_COLUMNS,
    nrows=500_000,
)
import requests
def request_chunks(url, chunk_size=None):
    """Generate decoded text chunks of the resource at the given url.

    The body is requested with ``stream=True`` and decoded as UTF-8 with an
    incremental decoder, so a multi-byte character that straddles two
    network chunks is decoded correctly instead of raising
    ``UnicodeDecodeError``.

    The response is closed once the generator is exhausted or closed.

    url: resource locator (str)
    chunk_size: limit size of chunks (int); passed through unchanged to
        ``response.iter_content``

    Raises ``requests.HTTPError`` for a 4xx/5xx response, and
    ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # Function-scope import keeps this block self-contained.
    import codecs

    decoder = codecs.getincrementaldecoder('utf-8')()

    # The context manager releases the connection even if the consumer
    # stops iterating early (e.g. pandas hitting ``nrows``).
    with requests.get(url, stream=True) as response:
        # Fail loudly rather than streaming an error page as if it were data.
        response.raise_for_status()

        for chunk in response.iter_content(chunk_size=chunk_size):
            # filter out keep-alive responses
            if not chunk:
                continue

            text = decoder.decode(chunk)
            # The decoder returns '' while it is holding back an incomplete
            # character; don't emit empty strings downstream.
            if text:
                yield text

        # Flush the decoder; raises if the body ended mid-character.
        tail = decoder.decode(b'', final=True)
        if tail:
            yield tail
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment