|
|
@@ -0,0 +1,242 @@ |
|
|
/* |
|
|
* File: geoparquet.cpp |
|
|
* |
|
|
* Purpose: A minimal example to build a GeoParquet file using Apache Arrow. |
|
|
* |
|
|
* Prerequisites: The Apache Arrow library is needed and can be installed as follows |
|
|
* $ git clone https://github.com/apache/arrow.git |
|
|
* $ cd arrow/cpp |
|
|
* $ mkdir build |
|
|
* $ cd build |
|
|
* $ cmake .. -DARROW_PARQUET=ON -DARROW_WITH_ZLIB=ON |
|
|
* $ make -j8 |
|
|
* $ sudo make install |
|
|
* |
|
|
* Building: gcc geoparquet.cpp -Wl,-lstdc++ -Wl,/usr/local/lib/libparquet.so -Wl,/usr/local/lib/libarrow.so |
|
|
* |
|
|
* Running: ./a.out |
|
|
* |
|
|
* Notes: |
|
|
* 1. The output of the program is a GeoParquet file called "myfile.parquet" written to the directory |
|
|
 *                that the executable is run from.
|
|
* 2. The data written to the GeoParquet file consists of |
|
|
* - a single data column of 1 byte integers |
|
|
 *                  - a timestamp column consisting of GPS times in seconds
|
|
* (i.e. number of seconds since GPS epoch of Jan 6, 1980) |
|
|
* - a geometry column of longitude,latitude points conforming |
|
|
* to the GeoParquet specification |
|
|
* 3. To get a quick look into "myfile.parquet", use the parquet-tools (installed via pip) |
|
|
* 4. The file can be read into a GeoDataFrame in Python with the following Python code |
|
|
* >>> import geopandas |
|
|
* >>> gdf = geopandas.read_parquet("myfile.parquet") |
|
|
*/ |
|
|
|
|
|
/* |
|
|
* Includes |
|
|
*/ |
|
|
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <arrow/builder.h>
#include <arrow/table.h>
#include <arrow/io/file.h>
#include <arrow/util/key_value_metadata.h>
#include <parquet/arrow/writer.h>
#include <parquet/arrow/schema.h>
#include <parquet/properties.h>
#include <parquet/file_writer.h>
|
|
|
|
|
/* |
|
|
* Namespaces |
|
|
*/ |
|
|
using std::shared_ptr; |
|
|
using std::unique_ptr; |
|
|
using std::make_shared; |
|
|
using std::vector; |
|
|
|
|
|
/* |
|
|
* Function: Build GeoParquet Metadata String |
|
|
*/ |
|
|
/*
 * Builds the value stored under the "geo" key of the file metadata.
 *
 * The JSON document is kept in a readable, indented raw string literal and
 * then compacted: every newline and every run of four spaces is dropped,
 * everything else is copied verbatim. The compacted string is also echoed
 * to stdout between '|' characters.
 *
 * Returns: a NUL-terminated buffer allocated with new[]; the caller owns it
 *          and must release it with delete[].
 */
const char* buildGeoMetaData (void)
{
    const char* str = R"json({
    "version": "1.0.0-beta.1",
    "primary_column": "geometry",
    "columns": {
        "geometry": {
            "encoding": "WKB",
            "geometry_types": ["Point"],
            "crs": {
                "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json",
                "type": "GeographicCRS",
                "name": "WGS 84 longitude-latitude",
                "datum": {
                    "type": "GeodeticReferenceFrame",
                    "name": "World Geodetic System 1984",
                    "ellipsoid": {
                        "name": "WGS 84",
                        "semi_major_axis": 6378137,
                        "inverse_flattening": 298.257223563
                    }
                },
                "coordinate_system": {
                    "subtype": "ellipsoidal",
                    "axis": [
                        {
                            "name": "Geodetic longitude",
                            "abbreviation": "Lon",
                            "direction": "east",
                            "unit": "degree"
                        },
                        {
                            "name": "Geodetic latitude",
                            "abbreviation": "Lat",
                            "direction": "north",
                            "unit": "degree"
                        }
                    ]
                },
                "id": {
                    "authority": "OGC",
                    "code": "CRS84"
                }
            },
            "edges": "planar",
            "bbox": [-180.0, -90.0, 180.0, 90.0],
            "epoch": 2018.0
        }
    }
    })json";

    /* The compacted text can never be longer than the source text, so
     * strlen(str) characters plus one terminator is always enough room. */
    const size_t len = strlen(str);
    char* new_str = new char [len + 1];

    /* Walk only the characters of the string (not its terminator) so the
     * single terminator written after the loop always lands inside the buffer. */
    size_t i = 0, j = 0;
    while(i < len)
    {
        if((i + 4 <= len) && (strncmp(&str[i], "    ", 4) == 0)) i += 4;  /* indentation */
        else if(str[i] == '\n') i += 1;                                    /* line break */
        else new_str[j++] = str[i++];
    }
    new_str[j] = '\0';

    printf("|%s|\n", new_str);
    return new_str;
}
|
|
|
|
|
/* |
|
|
* Function: Main |
|
|
*/ |
|
|
int main(int argc, char* argv[]) |
|
|
{ |
|
|
/* Data */ |
|
|
const int NUM_ROWS = 10; |
|
|
int8_t data[NUM_ROWS] = {0,1,2,3,4,5,6,7,8,9}; |
|
|
uint64_t timestamps[NUM_ROWS] = {1358002370347, 1358002370348, 1358002370349, 1358002370350, 1358002370351, 1358002370352, 1358002370353, 1358002370354, 1358002370355, 1358002370356}; |
|
|
double latitude[NUM_ROWS] = {60.2, 61.1, 63.4, 63.9, 64.7, 65.0, 66.8, 67.1, 67.2, 69.4}; |
|
|
double longitude[NUM_ROWS] = {142.0, 142.1, 142.2, 142.3, 142.4, 142.5, 142.6, 142.7, 142.8, 142.9}; |
|
|
|
|
|
/* Build Schema */ |
|
|
vector<shared_ptr<arrow::Field>> schema_vector; |
|
|
schema_vector.push_back(arrow::field("data", arrow::int8())); |
|
|
schema_vector.push_back(arrow::field("time", arrow::date64())); |
|
|
schema_vector.push_back(arrow::field("geometry", arrow::binary())); |
|
|
shared_ptr<arrow::Schema> schema = make_shared<arrow::Schema>(schema_vector); |
|
|
|
|
|
/* Create Arrow Output Stream */ |
|
|
shared_ptr<arrow::io::FileOutputStream> file_output_stream; |
|
|
PARQUET_ASSIGN_OR_THROW(file_output_stream, arrow::io::FileOutputStream::Open("myfile.parquet")); |
|
|
|
|
|
/* Create Writer Properties */ |
|
|
parquet::WriterProperties::Builder writer_props_builder; |
|
|
writer_props_builder.compression(parquet::Compression::GZIP); |
|
|
shared_ptr<parquet::WriterProperties> writer_props = writer_props_builder.build(); |
|
|
|
|
|
/* Create Arrow Writer Properties */ |
|
|
auto arrow_writer_props = parquet::ArrowWriterProperties::Builder().store_schema()->build(); |
|
|
|
|
|
/* Build GeoParquet MetaData */ |
|
|
auto metadata = schema->metadata() ? schema->metadata()->Copy() : std::make_shared<arrow::KeyValueMetadata>(); |
|
|
const char* metadata_str = buildGeoMetaData(); |
|
|
metadata->Append("geo", metadata_str); |
|
|
schema = schema->WithMetadata(metadata); |
|
|
delete [] metadata_str; |
|
|
|
|
|
/* Create Parquet Writer */ |
|
|
unique_ptr<parquet::arrow::FileWriter> parquetWriter; |
|
|
arrow::Result<unique_ptr<parquet::arrow::FileWriter>> result = parquet::arrow::FileWriter::Open(*schema, ::arrow::default_memory_pool(), file_output_stream, writer_props, arrow_writer_props); |
|
|
if(result.ok()) |
|
|
{ |
|
|
parquetWriter = std::move(result).ValueOrDie(); |
|
|
} |
|
|
else |
|
|
{ |
|
|
printf("Failed to open parquet writer: %s", result.status().ToString().c_str()); |
|
|
return 1; |
|
|
} |
|
|
|
|
|
/* Initialize Columns */ |
|
|
vector<shared_ptr<arrow::Array>> columns; |
|
|
|
|
|
/* Write Data */ |
|
|
{ |
|
|
shared_ptr<arrow::Array> column; |
|
|
arrow::Int8Builder builder; |
|
|
(void)builder.Reserve(NUM_ROWS); |
|
|
for(int row = 0; row < NUM_ROWS; row++) |
|
|
{ |
|
|
builder.UnsafeAppend(data[row]); |
|
|
} |
|
|
(void)builder.Finish(&column); |
|
|
columns.push_back(column); |
|
|
} |
|
|
|
|
|
/* Write Timestamps */ |
|
|
{ |
|
|
shared_ptr<arrow::Array> column; |
|
|
arrow::Date64Builder builder; |
|
|
(void)builder.Reserve(NUM_ROWS); |
|
|
for(int row = 0; row < NUM_ROWS; row++) |
|
|
{ |
|
|
builder.UnsafeAppend(timestamps[row]); |
|
|
} |
|
|
(void)builder.Finish(&column); |
|
|
columns.push_back(column); |
|
|
} |
|
|
|
|
|
/* Write Geometry Column */ |
|
|
{ |
|
|
typedef struct WKBPoint { |
|
|
uint8_t byteOrder; |
|
|
uint32_t wkbType; |
|
|
double x; |
|
|
double y; |
|
|
} __attribute__((packed)) wkbpoint_t; |
|
|
|
|
|
shared_ptr<arrow::Array> column; |
|
|
arrow::BinaryBuilder builder; |
|
|
for(int row = 0; row < NUM_ROWS; row++) |
|
|
{ |
|
|
wkbpoint_t point = { |
|
|
#ifdef __be__ |
|
|
.byteOrder = 0, |
|
|
#else |
|
|
.byteOrder = 1, |
|
|
#endif |
|
|
.wkbType = 1, |
|
|
.x = longitude[row], |
|
|
.y = latitude[row] |
|
|
}; |
|
|
(void)builder.Append((uint8_t*)&point, sizeof(wkbpoint_t)); |
|
|
} |
|
|
(void)builder.Finish(&column); |
|
|
columns.push_back(column); |
|
|
} |
|
|
|
|
|
/* Build and Write Table */ |
|
|
shared_ptr<arrow::Table> table = arrow::Table::Make(schema, columns); |
|
|
(void)parquetWriter->WriteTable(*table, NUM_ROWS); |
|
|
|
|
|
/* Close Parquet Writer */ |
|
|
(void)parquetWriter->Close(); |
|
|
|
|
|
/* Return Success */ |
|
|
return 0; |
|
|
} |