{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Some timings with GeoPandas' new Parquet and Feather file format support" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import geopandas" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# ignore the warnings of it being experimental\n", "import warnings\n", "warnings.filterwarnings(\"ignore\", \"this is an initial implementation of Parquet file support\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test case 1: Natural Earth 1:10m Admin 1 – States, Provinces\n", "\n", "https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-1-states-provinces/" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = geopandas.read_file(\"zip+https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_1_states_provinces.zip\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Writing" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.85 s, sys: 67.3 ms, total: 1.91 s\n", "Wall time: 1.91 s\n" ] } ], "source": [ "%time df.to_file(\"test_ne_10m.shp\", driver='ESRI Shapefile')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.53 s, sys: 146 ms, total: 1.68 s\n", "Wall time: 2.45 s\n" ] } ], "source": [ "%time df.to_file(\"test_ne_10m.gpkg\", driver=\"GPKG\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 209 ms, sys: 28.2 ms, total: 237 ms\n", "Wall time: 236 ms\n" ] } ], "source": [ "%time df.to_parquet(\"test_ne_10m.parquet\")" ] }, { "cell_type": "code", "execution_count": 8, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 212 ms, sys: 19.2 ms, total: 231 ms\n", "Wall time: 215 ms\n" ] } ], "source": [ "%time df.to_feather(\"test_ne_10m.feather\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Reading" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "825 ms ± 8.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit geopandas.read_file(\"test_ne_10m.shp\", driver='ESRI Shapefile')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "729 ms ± 4.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit geopandas.read_file(\"test_ne_10m.gpkg\", driver='GPKG')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "161 ms ± 2.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit geopandas.read_parquet(\"test_ne_10m.parquet\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "134 ms ± 839 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit geopandas.read_feather(\"test_ne_10m.feather\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### File sizes" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 joris joris 10 Mai 20 20:35 test_ne_10m.cpg\n", "-rw-r--r-- 1 joris joris 23M Mai 20 20:35 test_ne_10m.dbf\n", "-rw-r--r-- 1 joris joris 19M Mai 20 20:35 test_ne_10m.feather\n", "-rw-r--r-- 1 joris joris 27M Mai 20 20:35 test_ne_10m.gpkg\n", "-rw-r--r-- 1 joris joris 20M Mai 20 20:35 test_ne_10m.parquet\n", "-rw-r--r-- 1 joris joris 145 Mai 20 20:35 test_ne_10m.prj\n", "-rw-r--r-- 1 joris joris 21M Mai 20 20:35 test_ne_10m.shp\n", "-rw-r--r-- 1 joris joris 36K Mai 20 20:35 test_ne_10m.shx\n" ] } ], "source": [ "!ls test_ne_10m.* -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test case 2: OpenStreetMap buildings" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pyrosm" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Download pbf data\n", "fp = pyrosm.get_data(\"London\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Initialize the OSM object\n", "osm = pyrosm.OSM(fp)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "buildings = osm.get_buildings()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df = buildings[[\"id\", \"osm_type\", \"building\", \"amenity\", \"addr:street\", \"timestamp\", \"geometry\"]].rename(columns={\"id\": \"osm_id\"})" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | osm_id | \n", "osm_type | \n", "building | \n", "amenity | \n", "addr:street | \n", "timestamp | \n", "geometry | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "2956186 | \n", "way | \n", "block | \n", "None | \n", "None | \n", "0 | \n", "POLYGON ((-0.02162 51.44472, -0.02033 51.44469... | \n", "
| 1 | \n", "2956187 | \n", "way | \n", "yes | \n", "townhall | \n", "Catford Broadway | \n", "0 | \n", "POLYGON ((-0.02110 51.44523, -0.02132 51.44508... | \n", "
| 2 | \n", "2956188 | \n", "way | \n", "yes | \n", "theatre | \n", "None | \n", "0 | \n", "POLYGON ((-0.02004 51.44536, -0.02006 51.44528... | \n", "
| 3 | \n", "2956192 | \n", "way | \n", "store | \n", "None | \n", "None | \n", "0 | \n", "POLYGON ((-0.01900 51.44462, -0.01864 51.44458... | \n", "
| 4 | \n", "2956193 | \n", "way | \n", "store | \n", "None | \n", "None | \n", "0 | \n", "POLYGON ((-0.01752 51.44542, -0.01815 51.44551... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 584187 | \n", "266218115929 | \n", "relation | \n", "residential | \n", "None | \n", "Bedford Gardens | \n", "0 | \n", "POLYGON ((-0.19751 51.50561, -0.19750 51.50562... | \n", "
| 584188 | \n", "266229200037 | \n", "relation | \n", "residential | \n", "None | \n", "Bedford Gardens | \n", "0 | \n", "MULTIPOLYGON (((-0.19738 51.50565, -0.19730 51... | \n", "
| 584189 | \n", "266395470798 | \n", "relation | \n", "yes | \n", "None | \n", "None | \n", "0 | \n", "POLYGON ((-0.11464 51.45445, -0.11467 51.45450... | \n", "
| 584190 | \n", "266406556085 | \n", "relation | \n", "yes | \n", "None | \n", "None | \n", "0 | \n", "POLYGON ((-0.11409 51.45358, -0.11412 51.45362... | \n", "
| 584191 | \n", "266417641373 | \n", "relation | \n", "yes | \n", "None | \n", "None | \n", "0 | \n", "POLYGON ((-0.11420 51.45375, -0.11422 51.45378... | \n", "
584192 rows × 7 columns
\n", "| \n", " | osm_id | \n", "osm_type | \n", "amenity | \n", "addr:street | \n", "timestamp | \n", "geometry | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "108042 | \n", "node | \n", "pub | \n", "University Street | \n", "NaN | \n", "POINT (-0.13551 51.52356) | \n", "
| 1 | \n", "108539 | \n", "node | \n", "bicycle_rental | \n", "None | \n", "NaN | \n", "POINT (-0.09339 51.52913) | \n", "
| 2 | \n", "109575 | \n", "node | \n", "advice | \n", "None | \n", "NaN | \n", "POINT (-0.14312 51.52826) | \n", "
| 3 | \n", "110075 | \n", "node | \n", "bicycle_parking | \n", "None | \n", "NaN | \n", "POINT (-0.14028 51.53426) | \n", "
| 4 | \n", "451152 | \n", "node | \n", "pub | \n", "Regents Park Road | \n", "NaN | \n", "POINT (-0.19461 51.60084) | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 134244 | \n", "260535140001 | \n", "relation | \n", "school | \n", "Brick Lane | \n", "0.0 | \n", "MULTIPOLYGON (((-0.07169 51.51895, -0.07171 51... | \n", "
| 134245 | \n", "261175302034 | \n", "relation | \n", "college | \n", "None | \n", "0.0 | \n", "MULTIPOLYGON (((0.00889 51.54038, 0.00842 51.5... | \n", "
| 134246 | \n", "261594886517 | \n", "relation | \n", "school | \n", "None | \n", "0.0 | \n", "MULTIPOLYGON (((0.03907 51.51750, 0.04075 51.5... | \n", "
| 134247 | \n", "262136116245 | \n", "relation | \n", "school | \n", "None | \n", "0.0 | \n", "MULTIPOLYGON (((-0.03010 51.51048, -0.02994 51... | \n", "
| 134248 | \n", "262334979711 | \n", "relation | \n", "school | \n", "None | \n", "0.0 | \n", "MULTIPOLYGON (((-0.02641 51.51923, -0.02532 51... | \n", "
134249 rows × 6 columns
\n", "| \n", " | ZCTA5CE10 | \n", "GEOID10 | \n", "CLASSFP10 | \n", "MTFCC10 | \n", "FUNCSTAT10 | \n", "ALAND10 | \n", "AWATER10 | \n", "INTPTLAT10 | \n", "INTPTLON10 | \n", "geometry | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "43451 | \n", "43451 | \n", "B5 | \n", "G6350 | \n", "S | \n", "63484186 | \n", "157689 | \n", "+41.3183010 | \n", "-083.6174935 | \n", "POLYGON ((-83.70873 41.32733, -83.70815 41.327... | \n", "
| 1 | \n", "43452 | \n", "43452 | \n", "B5 | \n", "G6350 | \n", "S | \n", "121522304 | \n", "13721730 | \n", "+41.5157923 | \n", "-082.9809454 | \n", "POLYGON ((-83.08698 41.53780, -83.08256 41.537... | \n", "
| 2 | \n", "43456 | \n", "43456 | \n", "B5 | \n", "G6350 | \n", "S | \n", "9320975 | \n", "1003775 | \n", "+41.6318300 | \n", "-082.8393923 | \n", "MULTIPOLYGON (((-82.83558 41.71082, -82.83515 ... | \n", "
| 3 | \n", "43457 | \n", "43457 | \n", "B5 | \n", "G6350 | \n", "S | \n", "48004681 | \n", "0 | \n", "+41.2673301 | \n", "-083.4274872 | \n", "POLYGON ((-83.49650 41.25371, -83.48382 41.253... | \n", "
| 4 | \n", "43458 | \n", "43458 | \n", "B5 | \n", "G6350 | \n", "S | \n", "2573816 | \n", "39915 | \n", "+41.5304461 | \n", "-083.2133648 | \n", "POLYGON ((-83.22229 41.53102, -83.22228 41.532... | \n", "