jesteria · March 1, 2020 18:41
diff --git a/open_url_lazy.py b/open_url_lazy.py
 import ohio


 def open_url_lazy(url):
    """wrap the resource at given url in a readable, file-like text object

    The chunk generator returned by ``request_chunks`` is handed straight
    to ``ohio.IteratorTextIO``; creating the generator does not itself
    start the download (presumably chunks are pulled as the object is
    read -- see the ohio documentation).

    url: resource locator (str)

    """
    return ohio.IteratorTextIO(request_chunks(url))
diff --git a/pandas_read_csv_chunks.py b/pandas_read_csv_chunks.py
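 # pandas.read_csv doesn't seem to know how to stream/chunk when handed the URL
 # of an *online* resource, so a lazy file-like object is passed to it instead
 # TODO: confirm that read_csv's `chunksize` option doesn't fix this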


 import pandas as pd


 # CSV export endpoint of the dataset (hosted on data.cityofchicago.org)
 DIVVY_DATA_URL = 'https://data.cityofchicago.org/api/views/fg6s-gzvg/rows.csv'


 # Parse from the file-like object built by open_url_lazy rather than giving
 # pandas the URL itself; nrows caps the number of rows that get parsed.
 df = pd.read_csv(
    open_url_lazy(DIVVY_DATA_URL),
    nrows=500_000,
    index_col='TRIP ID',
    parse_dates=['START TIME', 'STOP TIME'],
    # only these columns are kept
    usecols=[
        'TRIP ID', 'START TIME', 'STOP TIME',
        'TRIP DURATION', 'USER TYPE', 'GENDER',
        'BIRTH YEAR', 'FROM LOCATION', 'TO LOCATION',
    ],
 )
diff --git a/request_chunks.py b/request_chunks.py
 import requests


 def request_chunks(url, chunk_size=None):
    """generate decoded text chunks of resource at given url

    The response body is streamed rather than downloaded up front, and is
    decoded as UTF-8 *incrementally*: a multi-byte character split across
    two network chunks is reassembled instead of raising
    ``UnicodeDecodeError`` (which decoding each chunk on its own would do).

    url: resource locator (str)
    chunk_size: limit size of chunks, in bytes (int); None yields data in
        whatever sizes it arrives

    Yields non-empty ``str`` chunks.

    Raises ``UnicodeDecodeError`` if the body is not valid UTF-8, including
    when it ends in the middle of a character.

    """
    import codecs

    # stateful decoder: holds back a trailing partial character until the
    # bytes that complete it arrive in a later chunk
    decoder = codecs.getincrementaldecoder('utf-8')()

    # the context manager releases the connection even when the consumer
    # stops reading early and the generator is closed before exhaustion
    with requests.get(url, stream=True) as response:
        for chunk in response.iter_content(chunk_size=chunk_size):
            text = decoder.decode(chunk)
            # filter out keep-alive (empty) chunks, and chunks that so far
            # contain only part of a character
            if text:
                yield text

        # flush the decoder; raises if bytes of an unfinished character
        # are still pending at end of stream
        tail = decoder.decode(b'', final=True)
        if tail:
            yield tail
	import ohio


	def open_url_lazy(url):
	    """construct file-like object for resource at given url

	    The chunk generator from ``request_chunks`` is wrapped in
	    ``ohio.IteratorTextIO`` so that it can be read like a text file.

	    url: resource locator (str)

	    """
	    chunks = request_chunks(url)
	    return ohio.IteratorTextIO(chunks)
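	# pandas.read_csv doesn't seem to know how to stream/chunk when handed the URL
	# of an online resource, so a lazy file-like object is passed to it instead
	# TODO: confirm that read_csv's `chunksize` option doesn't fix this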


	import pandas as pd


	# CSV export endpoint of the dataset (hosted on data.cityofchicago.org)
	DIVVY_DATA_URL = 'https://data.cityofchicago.org/api/views/fg6s-gzvg/rows.csv'


	# Parse from the file-like object built by open_url_lazy rather than giving
	# pandas the URL itself; nrows caps the number of rows that get parsed.
	df = pd.read_csv(
	    open_url_lazy(DIVVY_DATA_URL),
	    nrows=500_000,
	    index_col='TRIP ID',
	    parse_dates=['START TIME', 'STOP TIME'],
	    # only these columns are kept
	    usecols=[
	        'TRIP ID', 'START TIME', 'STOP TIME',
	        'TRIP DURATION', 'USER TYPE', 'GENDER',
	        'BIRTH YEAR', 'FROM LOCATION', 'TO LOCATION',
	    ],
	)
	import requests


	def request_chunks(url, chunk_size=None):
	    """generate decoded text chunks of resource at given url

	    The response body is streamed rather than downloaded up front, and is
	    decoded as UTF-8 *incrementally*: a multi-byte character split across
	    two network chunks is reassembled instead of raising
	    ``UnicodeDecodeError`` (which decoding each chunk on its own would do).

	    url: resource locator (str)
	    chunk_size: limit size of chunks, in bytes (int); None yields data in
	        whatever sizes it arrives

	    Yields non-empty ``str`` chunks.

	    Raises ``UnicodeDecodeError`` if the body is not valid UTF-8, including
	    when it ends in the middle of a character.

	    """
	    import codecs

	    # stateful decoder: holds back a trailing partial character until the
	    # bytes that complete it arrive in a later chunk
	    decoder = codecs.getincrementaldecoder('utf-8')()

	    # the context manager releases the connection even when the consumer
	    # stops reading early and the generator is closed before exhaustion
	    with requests.get(url, stream=True) as response:
	        for chunk in response.iter_content(chunk_size=chunk_size):
	            text = decoder.decode(chunk)
	            # filter out keep-alive (empty) chunks, and chunks that so far
	            # contain only part of a character
	            if text:
	                yield text

	        # flush the decoder; raises if bytes of an unfinished character
	        # are still pending at end of stream
	        tail = decoder.decode(b'', final=True)
	        if tail:
	            yield tail