-
-
Save Jap8nted/4ae7cbd52b2459da6a7efe698617a0d6 to your computer and use it in GitHub Desktop.
Minimal example C++ code to write a GeoParquet file using Apache Arrow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* File: geoparquet.cpp | |
* | |
* Purpose: A minimal example to build a GeoParquet file using Apache Arrow. | |
* | |
* Prerequisites: The Apache Arrow library is needed and can be installed as follows | |
* $ git clone https://github.com/apache/arrow.git | |
* $ cd arrow/cpp | |
* $ mkdir build | |
* $ cd build | |
* $ cmake .. -DARROW_PARQUET=ON -DARROW_WITH_ZLIB=ON | |
* $ make -j8 | |
* $ sudo make install | |
* | |
* Building: gcc geoparquet.cpp -Wl,-lstdc++ -Wl,/usr/local/lib/libparquet.so -Wl,/usr/local/lib/libarrow.so | |
* | |
* Running: ./a.out | |
* | |
* Notes: | |
* 1. The output of the program is a GeoParquet file called "myfile.parquet" written to the directory | |
* that the executable is run from. | |
* 2. The data written to the GeoParquet file consists of | |
* - a single data column of 1 byte integers | |
* - a timestamp column consisting of GPS times in seconds | |
* (i.e. number of seconds since GPS epoch of Jan 6, 1980) | |
* - a geometry column of longitude,latitude points conforming | |
* to the GeoParquet specification | |
* 3. To get a quick look into "myfile.parquet", use the parquet-tools (installed via pip) | |
* 4. The file can be read into a GeoDataFrame in Python with the following Python code | |
* >>> import geopandas | |
* >>> gdf = geopandas.read_parquet("myfile.parquet") | |
* | |
* Todo: | |
 * 1. Use an arrow::date64() type for the timestamp column
*/ | |
/* | |
* Includes | |
*/ | |
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <arrow/builder.h>
#include <arrow/table.h>
#include <arrow/io/file.h>
#include <arrow/util/key_value_metadata.h>
#include <parquet/arrow/writer.h>
#include <parquet/arrow/schema.h>
#include <parquet/properties.h>
#include <parquet/file_writer.h>
/* | |
* Namespaces | |
*/ | |
using std::shared_ptr; | |
using std::unique_ptr; | |
using std::make_shared; | |
using std::vector; | |
/* | |
* Function: Build GeoParquet Metadata String | |
*/ | |
/*
 * Builds the value stored under the "geo" key of the file metadata: a JSON
 * document describing the geometry column.
 *
 * The JSON is kept pretty-printed in the source for readability and is
 * compacted at run time by dropping every newline and every run of four
 * consecutive spaces (so the indentation below must stay in 4-space units;
 * single spaces inside values are preserved).
 *
 * Returns: a NUL-terminated string allocated with new[]. Ownership passes
 *          to the caller, who must release it with delete [].
 */
const char* buildGeoMetaData (void)
{
    const char* const pretty_json = R"json({
        "version": "1.0.0-beta.1",
        "primary_column": "geometry",
        "columns": {
            "geometry": {
                "encoding": "WKB",
                "geometry_types": ["Point"],
                "crs": {
                    "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json",
                    "type": "GeographicCRS",
                    "name": "WGS 84 longitude-latitude",
                    "datum": {
                        "type": "GeodeticReferenceFrame",
                        "name": "World Geodetic System 1984",
                        "ellipsoid": {
                            "name": "WGS 84",
                            "semi_major_axis": 6378137,
                            "inverse_flattening": 298.257223563
                        }
                    },
                    "coordinate_system": {
                        "subtype": "ellipsoidal",
                        "axis": [
                            {
                                "name": "Geodetic longitude",
                                "abbreviation": "Lon",
                                "direction": "east",
                                "unit": "degree"
                            },
                            {
                                "name": "Geodetic latitude",
                                "abbreviation": "Lat",
                                "direction": "north",
                                "unit": "degree"
                            }
                        ]
                    },
                    "id": {
                        "authority": "OGC",
                        "code": "CRS84"
                    }
                },
                "edges": "planar",
                "bbox": [-180.0, -90.0, 180.0, 90.0],
                "epoch": 2018.0
            }
        }
    })json";

    /* The compacted text is never longer than the input, so length + 1
     * bytes always leaves room for the terminator written after the loop */
    const size_t length = std::strlen(pretty_json);
    char* compact_json = new char [length + 1];

    size_t in = 0;
    size_t out = 0;
    while(in < length)
    {
        /* strncmp stops at the terminator, so this never reads past the end */
        if(std::strncmp(&pretty_json[in], "    ", 4) == 0) in += 4;
        else if(pretty_json[in] == '\n') in += 1;
        else compact_json[out++] = pretty_json[in++];
    }
    compact_json[out] = '\0';

    return compact_json;
}
/* | |
* Function: Main | |
*/ | |
int main(int argc, char* argv[]) | |
{ | |
/* Data */ | |
const int NUM_ROWS = 10; | |
int8_t data[NUM_ROWS] = {0,1,2,3,4,5,6,7,8,9}; | |
uint64_t timestamps[NUM_ROWS] = {1358002370, 1358002371, 1358002372, 1358002373, 1358002374, 1358002375, 1358002376, 1358002377, 1358002378, 1358002379}; | |
double latitude[NUM_ROWS] = {60.2, 61.1, 63.4, 63.9, 64.7, 65.0, 66.8, 67.1, 67.2, 69.4}; | |
double longitude[NUM_ROWS] = {142.0, 142.1, 142.2, 142.3, 142.4, 142.5, 142.6, 142.7, 142.8, 142.9}; | |
/* Build Schema */ | |
vector<shared_ptr<arrow::Field>> schema_vector; | |
schema_vector.push_back(arrow::field("data", arrow::int8())); | |
schema_vector.push_back(arrow::field("timestamp", arrow::int64())); | |
schema_vector.push_back(arrow::field("geometry", arrow::binary())); | |
shared_ptr<arrow::Schema> schema = make_shared<arrow::Schema>(schema_vector); | |
/* Create Arrow Output Stream */ | |
shared_ptr<arrow::io::FileOutputStream> file_output_stream; | |
PARQUET_ASSIGN_OR_THROW(file_output_stream, arrow::io::FileOutputStream::Open("myfile.parquet")); | |
/* Create Writer Properties */ | |
parquet::WriterProperties::Builder writer_props_builder; | |
writer_props_builder.compression(parquet::Compression::GZIP); | |
shared_ptr<parquet::WriterProperties> writer_props = writer_props_builder.build(); | |
/* Create Arrow Writer Properties */ | |
auto arrow_writer_props = parquet::ArrowWriterProperties::Builder().store_schema()->build(); | |
/* Build GeoParquet MetaData */ | |
auto metadata = schema->metadata() ? schema->metadata()->Copy() : std::make_shared<arrow::KeyValueMetadata>(); | |
const char* metadata_str = buildGeoMetaData(); | |
metadata->Append("geo", metadata_str); | |
schema = schema->WithMetadata(metadata); | |
delete [] metadata_str; | |
/* Create Parquet Writer */ | |
unique_ptr<parquet::arrow::FileWriter> parquetWriter; | |
arrow::Result<unique_ptr<parquet::arrow::FileWriter>> result = parquet::arrow::FileWriter::Open(*schema, ::arrow::default_memory_pool(), file_output_stream, writer_props, arrow_writer_props); | |
if(result.ok()) | |
{ | |
parquetWriter = std::move(result).ValueOrDie(); | |
} | |
else | |
{ | |
printf("Failed to open parquet writer: %s", result.status().ToString().c_str()); | |
return 1; | |
} | |
/* Initialize Columns */ | |
vector<shared_ptr<arrow::Array>> columns; | |
/* Write Data */ | |
{ | |
shared_ptr<arrow::Array> column; | |
arrow::Int8Builder builder; | |
(void)builder.Reserve(NUM_ROWS); | |
for(int row = 0; row < NUM_ROWS; row++) | |
{ | |
builder.UnsafeAppend(data[row]); | |
} | |
(void)builder.Finish(&column); | |
columns.push_back(column); | |
} | |
/* Write Timestamps */ | |
{ | |
shared_ptr<arrow::Array> column; | |
arrow::Int64Builder builder; | |
(void)builder.Reserve(NUM_ROWS); | |
for(int row = 0; row < NUM_ROWS; row++) | |
{ | |
builder.UnsafeAppend(timestamps[row]); | |
} | |
(void)builder.Finish(&column); | |
columns.push_back(column); | |
} | |
/* Write Geometry Column */ | |
{ | |
typedef struct WKBPoint { | |
uint8_t byteOrder; | |
uint32_t wkbType; | |
double x; | |
double y; | |
} __attribute__((packed)) wkbpoint_t; | |
shared_ptr<arrow::Array> column; | |
arrow::BinaryBuilder builder; | |
for(int row = 0; row < NUM_ROWS; row++) | |
{ | |
wkbpoint_t point = { | |
#ifdef __be__ | |
.byteOrder = 0, | |
#else | |
.byteOrder = 1, | |
#endif | |
.wkbType = 1, | |
.x = longitude[row], | |
.y = latitude[row] | |
}; | |
(void)builder.Append((uint8_t*)&point, sizeof(wkbpoint_t)); | |
} | |
(void)builder.Finish(&column); | |
columns.push_back(column); | |
} | |
/* Build and Write Table */ | |
shared_ptr<arrow::Table> table = arrow::Table::Make(schema, columns); | |
(void)parquetWriter->WriteTable(*table, NUM_ROWS); | |
/* Close Parquet Writer */ | |
(void)parquetWriter->Close(); | |
/* Return Success */ | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment