Last active
September 29, 2025 13:46
-
-
Save jpswinski/13074fc773f92a529f98b274e5ad5283 to your computer and use it in GitHub Desktop.
Revisions
-
jpswinski revised this gist
Jan 17, 2023 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -18,10 +18,10 @@ * * Notes: * 1. The output of the program is a GeoParquet file called "myfile.parquet" written to the directory * that the executable is run from. * 2. The data written to the GeoParquet file consists of * - a single data column of 1 byte integers * - a timestamp column consisting of GPS times in seconds * (i.e. number of seconds since GPS epoch of Jan 6, 1980) * - a geometry column of longitude,latitude points conforming * to the GeoParquet specification -
jpswinski revised this gist
Jan 17, 2023 . 1 changed file with 5 additions and 5 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -31,7 +31,7 @@ * >>> gdf = geopandas.read_parquet("myfile.parquet") * * Todo: * 1. Use an arrow::date64() type for the timestamp column */ /* @@ -131,14 +131,14 @@ int main(int argc, char* argv[]) /* Data */ const int NUM_ROWS = 10; int8_t data[NUM_ROWS] = {0,1,2,3,4,5,6,7,8,9}; uint64_t timestamps[NUM_ROWS] = {1358002370, 1358002371, 1358002372, 1358002373, 1358002374, 1358002375, 1358002376, 1358002377, 1358002378, 1358002379}; double latitude[NUM_ROWS] = {60.2, 61.1, 63.4, 63.9, 64.7, 65.0, 66.8, 67.1, 67.2, 69.4}; double longitude[NUM_ROWS] = {142.0, 142.1, 142.2, 142.3, 142.4, 142.5, 142.6, 142.7, 142.8, 142.9}; /* Build Schema */ vector<shared_ptr<arrow::Field>> schema_vector; schema_vector.push_back(arrow::field("data", arrow::int8())); schema_vector.push_back(arrow::field("timestamp", arrow::int64())); schema_vector.push_back(arrow::field("geometry", arrow::binary())); shared_ptr<arrow::Schema> schema = make_shared<arrow::Schema>(schema_vector); @@ -193,11 +193,11 @@ int main(int argc, char* argv[]) /* Write Timestamps */ { shared_ptr<arrow::Array> column; arrow::Int64Builder builder; (void)builder.Reserve(NUM_ROWS); for(int row = 0; row < NUM_ROWS; row++) { builder.UnsafeAppend(timestamps[row]); } (void)builder.Finish(&column); columns.push_back(column); -
jpswinski revised this gist
Jan 17, 2023 . 1 changed file with 4 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -29,6 +29,9 @@ * 4. The file can be read into a GeoDataFrame in Python with the following Python code * >>> import geopandas * >>> gdf = geopandas.read_parquet("myfile.parquet") * * Todo: * 1. Account for leap seconds in GPS to Unix time conversion */ /* @@ -194,7 +197,7 @@ int main(int argc, char* argv[]) (void)builder.Reserve(NUM_ROWS); for(int row = 0; row < NUM_ROWS; row++) { builder.UnsafeAppend(timestamps[row] + 315964800); } (void)builder.Finish(&column); columns.push_back(column); -
jpswinski revised this gist
Jan 17, 2023 . 1 changed file with 0 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -117,7 +117,6 @@ const char* buildGeoMetaData (void) } new_str[j] = '\0'; return new_str; } -
jpswinski created this gist
Jan 17, 2023 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,242 @@ /* * File: geoparquet.cpp * * Purpose: A minimal example to build a GeoParquet file using Apache Arrow. * * Prerequisites: The Apache Arrow library is needed and can be installed as follows * $ git clone https://github.com/apache/arrow.git * $ cd arrow/cpp * $ mkdir build * $ cd build * $ cmake .. -DARROW_PARQUET=ON -DARROW_WITH_ZLIB=ON * $ make -j8 * $ sudo make install * * Building: gcc geoparquet.cpp -Wl,-lstdc++ -Wl,/usr/local/lib/libparquet.so -Wl,/usr/local/lib/libarrow.so * * Running: ./a.out * * Notes: * 1. The output of the program is a GeoParquet file called "myfile.parquet" written to the directory * that the executable is run from. * 2. The data written to the GeoParquet file consists of * - a single data column of 1 byte integers * - a timestamp column consisting of GPS times in seconds * (i.e. number of seconds since GPS epoch of Jan 6, 1980) * - a geometry column of longitude,latitude points conforming * to the GeoParquet specification * 3. To get a quick look into "myfile.parquet", use the parquet-tools (installed via pip) * 4. 
The file can be read into a GeoDataFrame in Python with the following Python code * >>> import geopandas * >>> gdf = geopandas.read_parquet("myfile.parquet") */ /* * Includes */ #include <iostream> #include <arrow/builder.h> #include <arrow/table.h> #include <arrow/io/file.h> #include <arrow/util/key_value_metadata.h> #include <parquet/arrow/writer.h> #include <parquet/arrow/schema.h> #include <parquet/properties.h> #include <parquet/file_writer.h> /* * Namespaces */ using std::shared_ptr; using std::unique_ptr; using std::make_shared; using std::vector; /* * Function: Build GeoParquet Metadata String */ const char* buildGeoMetaData (void) { const char* str = R"json({ "version": "1.0.0-beta.1", "primary_column": "geometry", "columns": { "geometry": { "encoding": "WKB", "geometry_types": ["Point"], "crs": { "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", "type": "GeographicCRS", "name": "WGS 84 longitude-latitude", "datum": { "type": "GeodeticReferenceFrame", "name": "World Geodetic System 1984", "ellipsoid": { "name": "WGS 84", "semi_major_axis": 6378137, "inverse_flattening": 298.257223563 } }, "coordinate_system": { "subtype": "ellipsoidal", "axis": [ { "name": "Geodetic longitude", "abbreviation": "Lon", "direction": "east", "unit": "degree" }, { "name": "Geodetic latitude", "abbreviation": "Lat", "direction": "north", "unit": "degree" } ] }, "id": { "authority": "OGC", "code": "CRS84" } }, "edges": "planar", "bbox": [-180.0, -90.0, 180.0, 90.0], "epoch": 2018.0 } } })json"; int len = strlen(str) + 1; char* new_str = new char [len]; int i = 0, j = 0; while(i < len) { if((i < len-4) && (str[i] == ' ' && str[i+1] == ' ' && str[i+2] == ' ' && str[i+3] == ' ')) i += 4; else if(str[i] == '\n') i += 1; else new_str[j++] = str[i++]; } new_str[j] = '\0'; printf("|%s|\n", new_str); return new_str; } /* * Function: Main */ int main(int argc, char* argv[]) { /* Data */ const int NUM_ROWS = 10; int8_t data[NUM_ROWS] = {0,1,2,3,4,5,6,7,8,9}; uint64_t 
timestamps[NUM_ROWS] = {1358002370347, 1358002370348, 1358002370349, 1358002370350, 1358002370351, 1358002370352, 1358002370353, 1358002370354, 1358002370355, 1358002370356}; double latitude[NUM_ROWS] = {60.2, 61.1, 63.4, 63.9, 64.7, 65.0, 66.8, 67.1, 67.2, 69.4}; double longitude[NUM_ROWS] = {142.0, 142.1, 142.2, 142.3, 142.4, 142.5, 142.6, 142.7, 142.8, 142.9}; /* Build Schema */ vector<shared_ptr<arrow::Field>> schema_vector; schema_vector.push_back(arrow::field("data", arrow::int8())); schema_vector.push_back(arrow::field("time", arrow::date64())); schema_vector.push_back(arrow::field("geometry", arrow::binary())); shared_ptr<arrow::Schema> schema = make_shared<arrow::Schema>(schema_vector); /* Create Arrow Output Stream */ shared_ptr<arrow::io::FileOutputStream> file_output_stream; PARQUET_ASSIGN_OR_THROW(file_output_stream, arrow::io::FileOutputStream::Open("myfile.parquet")); /* Create Writer Properties */ parquet::WriterProperties::Builder writer_props_builder; writer_props_builder.compression(parquet::Compression::GZIP); shared_ptr<parquet::WriterProperties> writer_props = writer_props_builder.build(); /* Create Arrow Writer Properties */ auto arrow_writer_props = parquet::ArrowWriterProperties::Builder().store_schema()->build(); /* Build GeoParquet MetaData */ auto metadata = schema->metadata() ? 
schema->metadata()->Copy() : std::make_shared<arrow::KeyValueMetadata>(); const char* metadata_str = buildGeoMetaData(); metadata->Append("geo", metadata_str); schema = schema->WithMetadata(metadata); delete [] metadata_str; /* Create Parquet Writer */ unique_ptr<parquet::arrow::FileWriter> parquetWriter; arrow::Result<unique_ptr<parquet::arrow::FileWriter>> result = parquet::arrow::FileWriter::Open(*schema, ::arrow::default_memory_pool(), file_output_stream, writer_props, arrow_writer_props); if(result.ok()) { parquetWriter = std::move(result).ValueOrDie(); } else { printf("Failed to open parquet writer: %s", result.status().ToString().c_str()); return 1; } /* Initialize Columns */ vector<shared_ptr<arrow::Array>> columns; /* Write Data */ { shared_ptr<arrow::Array> column; arrow::Int8Builder builder; (void)builder.Reserve(NUM_ROWS); for(int row = 0; row < NUM_ROWS; row++) { builder.UnsafeAppend(data[row]); } (void)builder.Finish(&column); columns.push_back(column); } /* Write Timestamps */ { shared_ptr<arrow::Array> column; arrow::Date64Builder builder; (void)builder.Reserve(NUM_ROWS); for(int row = 0; row < NUM_ROWS; row++) { builder.UnsafeAppend(timestamps[row]); } (void)builder.Finish(&column); columns.push_back(column); } /* Write Geometry Column */ { typedef struct WKBPoint { uint8_t byteOrder; uint32_t wkbType; double x; double y; } __attribute__((packed)) wkbpoint_t; shared_ptr<arrow::Array> column; arrow::BinaryBuilder builder; for(int row = 0; row < NUM_ROWS; row++) { wkbpoint_t point = { #ifdef __be__ .byteOrder = 0, #else .byteOrder = 1, #endif .wkbType = 1, .x = longitude[row], .y = latitude[row] }; (void)builder.Append((uint8_t*)&point, sizeof(wkbpoint_t)); } (void)builder.Finish(&column); columns.push_back(column); } /* Build and Write Table */ shared_ptr<arrow::Table> table = arrow::Table::Make(schema, columns); (void)parquetWriter->WriteTable(*table, NUM_ROWS); /* Close Parquet Writer */ (void)parquetWriter->Close(); /* Return Success */ return 0; 
}