Skip to content

Instantly share code, notes, and snippets.

@vojtatom
Forked from jpswinski/geoparquet.cpp
Created September 29, 2025 13:46
Show Gist options
  • Select an option

  • Save vojtatom/3f3fb485466ecb7471fe7f8b6812ff94 to your computer and use it in GitHub Desktop.

Select an option

Save vojtatom/3f3fb485466ecb7471fe7f8b6812ff94 to your computer and use it in GitHub Desktop.

Revisions

  1. @jpswinski jpswinski revised this gist Jan 17, 2023. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions geoparquet.cpp
    Original file line number Diff line number Diff line change
    @@ -18,10 +18,10 @@
    *
    * Notes:
    * 1. The output of the program is a GeoParquet file called "myfile.parquet" written to the directory
    * that the executable is run from..
    * that the executable is run from.
    * 2. The data written to the GeoParquet file consists of
    * - a single data column of 1 byte integers
    * - a timestamp column consisting GPS times in seconds
    * - a timestamp column consisting of GPS times in seconds
    * (i.e. number of seconds since GPS epoch of Jan 6, 1980)
    * - a geometry column of longitude,latitude points conforming
    * to the GeoParquet specification
  2. @jpswinski jpswinski revised this gist Jan 17, 2023. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions geoparquet.cpp
    Original file line number Diff line number Diff line change
    @@ -31,7 +31,7 @@
    * >>> gdf = geopandas.read_parquet("myfile.parquet")
    *
    * Todo:
    * 1. Account for leap seconds in GPS to Unix time conversion
    * 1. Use an arrow::date64() type for the timestamp column
    */

    /*
    @@ -131,14 +131,14 @@ int main(int argc, char* argv[])
    /* Data */
    const int NUM_ROWS = 10;
    int8_t data[NUM_ROWS] = {0,1,2,3,4,5,6,7,8,9};
    uint64_t timestamps[NUM_ROWS] = {1358002370347, 1358002370348, 1358002370349, 1358002370350, 1358002370351, 1358002370352, 1358002370353, 1358002370354, 1358002370355, 1358002370356};
    uint64_t timestamps[NUM_ROWS] = {1358002370, 1358002371, 1358002372, 1358002373, 1358002374, 1358002375, 1358002376, 1358002377, 1358002378, 1358002379};
    double latitude[NUM_ROWS] = {60.2, 61.1, 63.4, 63.9, 64.7, 65.0, 66.8, 67.1, 67.2, 69.4};
    double longitude[NUM_ROWS] = {142.0, 142.1, 142.2, 142.3, 142.4, 142.5, 142.6, 142.7, 142.8, 142.9};

    /* Build Schema */
    vector<shared_ptr<arrow::Field>> schema_vector;
    schema_vector.push_back(arrow::field("data", arrow::int8()));
    schema_vector.push_back(arrow::field("time", arrow::date64()));
    schema_vector.push_back(arrow::field("timestamp", arrow::int64()));
    schema_vector.push_back(arrow::field("geometry", arrow::binary()));
    shared_ptr<arrow::Schema> schema = make_shared<arrow::Schema>(schema_vector);

    @@ -193,11 +193,11 @@ int main(int argc, char* argv[])
    /* Write Timestamps */
    {
    shared_ptr<arrow::Array> column;
    arrow::Date64Builder builder;
    arrow::Int64Builder builder;
    (void)builder.Reserve(NUM_ROWS);
    for(int row = 0; row < NUM_ROWS; row++)
    {
    builder.UnsafeAppend(timestamps[row] + 315964800);
    builder.UnsafeAppend(timestamps[row]);
    }
    (void)builder.Finish(&column);
    columns.push_back(column);
  3. @jpswinski jpswinski revised this gist Jan 17, 2023. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion geoparquet.cpp
    Original file line number Diff line number Diff line change
    @@ -29,6 +29,9 @@
    * 4. The file can be read into a GeoDataFrame in Python with the following Python code
    * >>> import geopandas
    * >>> gdf = geopandas.read_parquet("myfile.parquet")
    *
    * Todo:
    * 1. Account for leap seconds in GPS to Unix time conversion
    */

    /*
    @@ -194,7 +197,7 @@ int main(int argc, char* argv[])
    (void)builder.Reserve(NUM_ROWS);
    for(int row = 0; row < NUM_ROWS; row++)
    {
    builder.UnsafeAppend(timestamps[row]);
    builder.UnsafeAppend(timestamps[row] + 315964800);
    }
    (void)builder.Finish(&column);
    columns.push_back(column);
  4. @jpswinski jpswinski revised this gist Jan 17, 2023. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion geoparquet.cpp
    Original file line number Diff line number Diff line change
    @@ -117,7 +117,6 @@ const char* buildGeoMetaData (void)
    }
    new_str[j] = '\0';

    printf("|%s|\n", new_str);
    return new_str;
    }

  5. @jpswinski jpswinski created this gist Jan 17, 2023.
    242 changes: 242 additions & 0 deletions geoparquet.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,242 @@
    /*
    * File: geoparquet.cpp
    *
    * Purpose: A minimal example to build a GeoParquet file using Apache Arrow.
    *
    * Prerequisites: The Apache Arrow library is needed and can be installed as follows
    * $ git clone https://github.com/apache/arrow.git
    * $ cd arrow/cpp
    * $ mkdir build
    * $ cd build
    * $ cmake .. -DARROW_PARQUET=ON -DARROW_WITH_ZLIB=ON
    * $ make -j8
    * $ sudo make install
    *
    * Building: gcc geoparquet.cpp -Wl,-lstdc++ -Wl,/usr/local/lib/libparquet.so -Wl,/usr/local/lib/libarrow.so
    *
    * Running: ./a.out
    *
    * Notes:
    * 1. The output of the program is a GeoParquet file called "myfile.parquet" written to the directory
    * that the executable is run from.
    * 2. The data written to the GeoParquet file consists of
    * - a single data column of 1 byte integers
    * - a timestamp column consisting of GPS times in seconds
    * (i.e. number of seconds since GPS epoch of Jan 6, 1980)
    * - a geometry column of longitude,latitude points conforming
    * to the GeoParquet specification
    * 3. To get a quick look into "myfile.parquet", use the parquet-tools (installed via pip)
    * 4. The file can be read into a GeoDataFrame in Python with the following Python code
    * >>> import geopandas
    * >>> gdf = geopandas.read_parquet("myfile.parquet")
    */

    /*
    * Includes
    */
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <iostream>
    #include <memory>
    #include <vector>
    #include <arrow/builder.h>
    #include <arrow/table.h>
    #include <arrow/io/file.h>
    #include <arrow/util/key_value_metadata.h>
    #include <parquet/arrow/writer.h>
    #include <parquet/arrow/schema.h>
    #include <parquet/properties.h>
    #include <parquet/file_writer.h>

    /*
    * Namespaces
    */
    using std::shared_ptr;
    using std::unique_ptr;
    using std::make_shared;
    using std::vector;

    /*
    * Function: Build GeoParquet Metadata String
    *
    * Purpose: Produces the JSON text stored under the "geo" key of the file
    * metadata. The JSON is kept below as a readable multi-line
    * literal and compacted before it is returned: every run of
    * four consecutive spaces and every newline is dropped, all
    * other characters are copied through unchanged.
    *
    * Returns: A newly allocated, NUL terminated string. Ownership passes
    * to the caller, who must release it with delete [].
    */
    const char* buildGeoMetaData (void)
    {
        const char* str = R"json({
    "version": "1.0.0-beta.1",
    "primary_column": "geometry",
    "columns": {
    "geometry": {
    "encoding": "WKB",
    "geometry_types": ["Point"],
    "crs": {
    "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json",
    "type": "GeographicCRS",
    "name": "WGS 84 longitude-latitude",
    "datum": {
    "type": "GeodeticReferenceFrame",
    "name": "World Geodetic System 1984",
    "ellipsoid": {
    "name": "WGS 84",
    "semi_major_axis": 6378137,
    "inverse_flattening": 298.257223563
    }
    },
    "coordinate_system": {
    "subtype": "ellipsoidal",
    "axis": [
    {
    "name": "Geodetic longitude",
    "abbreviation": "Lon",
    "direction": "east",
    "unit": "degree"
    },
    {
    "name": "Geodetic latitude",
    "abbreviation": "Lat",
    "direction": "north",
    "unit": "degree"
    }
    ]
    },
    "id": {
    "authority": "OGC",
    "code": "CRS84"
    }
    },
    "edges": "planar",
    "bbox": [-180.0, -90.0, 180.0, 90.0],
    "epoch": 2018.0
    }
    }
    })json";

        /* Worst case nothing is stripped, so reserve the full length plus the terminator */
        const std::size_t len = std::strlen(str);
        char* new_str = new char [len + 1];

        /* Walk the payload only (not its terminator) so the single write
         * of '\0' below always lands inside the buffer */
        std::size_t i = 0;
        std::size_t j = 0;
        while(i < len)
        {
            if((i + 4 <= len) && (std::strncmp(&str[i], "    ", 4) == 0)) i += 4; /* indentation unit */
            else if(str[i] == '\n') i += 1;                                        /* line break */
            else new_str[j++] = str[i++];                                          /* payload */
        }
        new_str[j] = '\0';

        return new_str;
    }

    /*
    * Function: Main
    */
    int main(int argc, char* argv[])
    {
    /* Data */
    const int NUM_ROWS = 10;
    int8_t data[NUM_ROWS] = {0,1,2,3,4,5,6,7,8,9};
    uint64_t timestamps[NUM_ROWS] = {1358002370347, 1358002370348, 1358002370349, 1358002370350, 1358002370351, 1358002370352, 1358002370353, 1358002370354, 1358002370355, 1358002370356};
    double latitude[NUM_ROWS] = {60.2, 61.1, 63.4, 63.9, 64.7, 65.0, 66.8, 67.1, 67.2, 69.4};
    double longitude[NUM_ROWS] = {142.0, 142.1, 142.2, 142.3, 142.4, 142.5, 142.6, 142.7, 142.8, 142.9};

    /* Build Schema */
    vector<shared_ptr<arrow::Field>> schema_vector;
    schema_vector.push_back(arrow::field("data", arrow::int8()));
    schema_vector.push_back(arrow::field("time", arrow::date64()));
    schema_vector.push_back(arrow::field("geometry", arrow::binary()));
    shared_ptr<arrow::Schema> schema = make_shared<arrow::Schema>(schema_vector);

    /* Create Arrow Output Stream */
    shared_ptr<arrow::io::FileOutputStream> file_output_stream;
    PARQUET_ASSIGN_OR_THROW(file_output_stream, arrow::io::FileOutputStream::Open("myfile.parquet"));

    /* Create Writer Properties */
    parquet::WriterProperties::Builder writer_props_builder;
    writer_props_builder.compression(parquet::Compression::GZIP);
    shared_ptr<parquet::WriterProperties> writer_props = writer_props_builder.build();

    /* Create Arrow Writer Properties */
    auto arrow_writer_props = parquet::ArrowWriterProperties::Builder().store_schema()->build();

    /* Build GeoParquet MetaData */
    auto metadata = schema->metadata() ? schema->metadata()->Copy() : std::make_shared<arrow::KeyValueMetadata>();
    const char* metadata_str = buildGeoMetaData();
    metadata->Append("geo", metadata_str);
    schema = schema->WithMetadata(metadata);
    delete [] metadata_str;

    /* Create Parquet Writer */
    unique_ptr<parquet::arrow::FileWriter> parquetWriter;
    arrow::Result<unique_ptr<parquet::arrow::FileWriter>> result = parquet::arrow::FileWriter::Open(*schema, ::arrow::default_memory_pool(), file_output_stream, writer_props, arrow_writer_props);
    if(result.ok())
    {
    parquetWriter = std::move(result).ValueOrDie();
    }
    else
    {
    printf("Failed to open parquet writer: %s", result.status().ToString().c_str());
    return 1;
    }

    /* Initialize Columns */
    vector<shared_ptr<arrow::Array>> columns;

    /* Write Data */
    {
    shared_ptr<arrow::Array> column;
    arrow::Int8Builder builder;
    (void)builder.Reserve(NUM_ROWS);
    for(int row = 0; row < NUM_ROWS; row++)
    {
    builder.UnsafeAppend(data[row]);
    }
    (void)builder.Finish(&column);
    columns.push_back(column);
    }

    /* Write Timestamps */
    {
    shared_ptr<arrow::Array> column;
    arrow::Date64Builder builder;
    (void)builder.Reserve(NUM_ROWS);
    for(int row = 0; row < NUM_ROWS; row++)
    {
    builder.UnsafeAppend(timestamps[row]);
    }
    (void)builder.Finish(&column);
    columns.push_back(column);
    }

    /* Write Geometry Column */
    {
    typedef struct WKBPoint {
    uint8_t byteOrder;
    uint32_t wkbType;
    double x;
    double y;
    } __attribute__((packed)) wkbpoint_t;

    shared_ptr<arrow::Array> column;
    arrow::BinaryBuilder builder;
    for(int row = 0; row < NUM_ROWS; row++)
    {
    wkbpoint_t point = {
    #ifdef __be__
    .byteOrder = 0,
    #else
    .byteOrder = 1,
    #endif
    .wkbType = 1,
    .x = longitude[row],
    .y = latitude[row]
    };
    (void)builder.Append((uint8_t*)&point, sizeof(wkbpoint_t));
    }
    (void)builder.Finish(&column);
    columns.push_back(column);
    }

    /* Build and Write Table */
    shared_ptr<arrow::Table> table = arrow::Table::Make(schema, columns);
    (void)parquetWriter->WriteTable(*table, NUM_ROWS);

    /* Close Parquet Writer */
    (void)parquetWriter->Close();

    /* Return Success */
    return 0;
    }