Skip to content

Instantly share code, notes, and snippets.

@jpswinski
Last active August 8, 2024 04:23
Show Gist options
  • Save jpswinski/13074fc773f92a529f98b274e5ad5283 to your computer and use it in GitHub Desktop.
Save jpswinski/13074fc773f92a529f98b274e5ad5283 to your computer and use it in GitHub Desktop.
Minimal example C++ code to write a GeoParquet file using Apache Arrow
/*
* File: geoparquet.cpp
*
* Purpose: A minimal example to build a GeoParquet file using Apache Arrow.
*
* Prerequisites: The Apache Arrow library is needed and can be installed as follows
* $ git clone https://github.com/apache/arrow.git
* $ cd arrow/cpp
* $ mkdir build
* $ cd build
* $ cmake .. -DARROW_PARQUET=ON -DARROW_WITH_ZLIB=ON
* $ make -j8
* $ sudo make install
*
* Building: gcc geoparquet.cpp -Wl,-lstdc++ -Wl,/usr/local/lib/libparquet.so -Wl,/usr/local/lib/libarrow.so
*
* Running: ./a.out
*
* Notes:
* 1. The output of the program is a GeoParquet file called "myfile.parquet" written to the directory
* that the executable is run from.
* 2. The data written to the GeoParquet file consists of
* - a single data column of 1 byte integers
* - a timestamp column consisting of GPS times in seconds
* (i.e. number of seconds since GPS epoch of Jan 6, 1980)
* - a geometry column of longitude,latitude points conforming
* to the GeoParquet specification
* 3. To get a quick look into "myfile.parquet", use the parquet-tools (installed via pip)
* 4. The file can be read into a GeoDataFrame in Python with the following Python code
* >>> import geopandas
* >>> gdf = geopandas.read_parquet("myfile.parquet")
*
* Todo:
* 1. Use an arrow::date64() type for the timestamp column
*/
/*
* Includes
*/
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include <arrow/builder.h>
#include <arrow/result.h>
#include <arrow/status.h>
#include <arrow/table.h>
#include <arrow/io/file.h>
#include <arrow/util/key_value_metadata.h>
#include <parquet/arrow/writer.h>
#include <parquet/arrow/schema.h>
#include <parquet/properties.h>
#include <parquet/file_writer.h>
/*
* Namespaces
*/
using std::shared_ptr;
using std::unique_ptr;
using std::make_shared;
using std::vector;
/*
 * Function: Build GeoParquet Metadata String
 *
 * Purpose:  Produce the JSON document stored under the "geo" key of the
 *           file-level key/value metadata. The JSON is written below in a
 *           readable, indented form and then compacted by dropping every
 *           newline and every group of four consecutive spaces.
 *
 * Returns:  A null-terminated string allocated with new[]. Ownership passes
 *           to the caller, who must release it with delete [].
 *
 * Notes:    The compaction is purely textual; it does not parse the JSON, so
 *           a run of four spaces inside a string value would be removed too.
 *           None of the values below contain such a run.
 */
const char* buildGeoMetaData (void)
{
    const char* str = R"json({
    "version": "1.0.0-beta.1",
    "primary_column": "geometry",
    "columns": {
        "geometry": {
            "encoding": "WKB",
            "geometry_types": ["Point"],
            "crs": {
                "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json",
                "type": "GeographicCRS",
                "name": "WGS 84 longitude-latitude",
                "datum": {
                    "type": "GeodeticReferenceFrame",
                    "name": "World Geodetic System 1984",
                    "ellipsoid": {
                        "name": "WGS 84",
                        "semi_major_axis": 6378137,
                        "inverse_flattening": 298.257223563
                    }
                },
                "coordinate_system": {
                    "subtype": "ellipsoidal",
                    "axis": [
                        {
                            "name": "Geodetic longitude",
                            "abbreviation": "Lon",
                            "direction": "east",
                            "unit": "degree"
                        },
                        {
                            "name": "Geodetic latitude",
                            "abbreviation": "Lat",
                            "direction": "north",
                            "unit": "degree"
                        }
                    ]
                },
                "id": {
                    "authority": "OGC",
                    "code": "CRS84"
                }
            },
            "edges": "planar",
            "bbox": [-180.0, -90.0, 180.0, 90.0],
            "epoch": 2018.0
        }
    }
})json";

    /* The compacted text can never be longer than the source text, so
     * strlen(str) characters plus one terminator is always enough room. */
    const std::size_t len = std::strlen(str);
    char* new_str = new char [len + 1];

    std::size_t i = 0; /* read position in str */
    std::size_t j = 0; /* write position in new_str */
    while(i < len)
    {
        /* Only look at str[i..i+3] when all four characters are inside the string */
        if((i + 4 <= len) && (str[i] == ' ' && str[i+1] == ' ' && str[i+2] == ' ' && str[i+3] == ' ')) i += 4;
        else if(str[i] == '\n') i += 1;
        else new_str[j++] = str[i++];
    }

    /* The loop stops before the source terminator, so j <= len here and this
     * single write is guaranteed to stay inside the allocation. */
    new_str[j] = '\0';
    return new_str;
}
/*
* Function: Main
*/
int main(int argc, char* argv[])
{
/* Data */
const int NUM_ROWS = 10;
int8_t data[NUM_ROWS] = {0,1,2,3,4,5,6,7,8,9};
uint64_t timestamps[NUM_ROWS] = {1358002370, 1358002371, 1358002372, 1358002373, 1358002374, 1358002375, 1358002376, 1358002377, 1358002378, 1358002379};
double latitude[NUM_ROWS] = {60.2, 61.1, 63.4, 63.9, 64.7, 65.0, 66.8, 67.1, 67.2, 69.4};
double longitude[NUM_ROWS] = {142.0, 142.1, 142.2, 142.3, 142.4, 142.5, 142.6, 142.7, 142.8, 142.9};
/* Build Schema */
vector<shared_ptr<arrow::Field>> schema_vector;
schema_vector.push_back(arrow::field("data", arrow::int8()));
schema_vector.push_back(arrow::field("timestamp", arrow::int64()));
schema_vector.push_back(arrow::field("geometry", arrow::binary()));
shared_ptr<arrow::Schema> schema = make_shared<arrow::Schema>(schema_vector);
/* Create Arrow Output Stream */
shared_ptr<arrow::io::FileOutputStream> file_output_stream;
PARQUET_ASSIGN_OR_THROW(file_output_stream, arrow::io::FileOutputStream::Open("myfile.parquet"));
/* Create Writer Properties */
parquet::WriterProperties::Builder writer_props_builder;
writer_props_builder.compression(parquet::Compression::GZIP);
shared_ptr<parquet::WriterProperties> writer_props = writer_props_builder.build();
/* Create Arrow Writer Properties */
auto arrow_writer_props = parquet::ArrowWriterProperties::Builder().store_schema()->build();
/* Build GeoParquet MetaData */
auto metadata = schema->metadata() ? schema->metadata()->Copy() : std::make_shared<arrow::KeyValueMetadata>();
const char* metadata_str = buildGeoMetaData();
metadata->Append("geo", metadata_str);
schema = schema->WithMetadata(metadata);
delete [] metadata_str;
/* Create Parquet Writer */
unique_ptr<parquet::arrow::FileWriter> parquetWriter;
arrow::Result<unique_ptr<parquet::arrow::FileWriter>> result = parquet::arrow::FileWriter::Open(*schema, ::arrow::default_memory_pool(), file_output_stream, writer_props, arrow_writer_props);
if(result.ok())
{
parquetWriter = std::move(result).ValueOrDie();
}
else
{
printf("Failed to open parquet writer: %s", result.status().ToString().c_str());
return 1;
}
/* Initialize Columns */
vector<shared_ptr<arrow::Array>> columns;
/* Write Data */
{
shared_ptr<arrow::Array> column;
arrow::Int8Builder builder;
(void)builder.Reserve(NUM_ROWS);
for(int row = 0; row < NUM_ROWS; row++)
{
builder.UnsafeAppend(data[row]);
}
(void)builder.Finish(&column);
columns.push_back(column);
}
/* Write Timestamps */
{
shared_ptr<arrow::Array> column;
arrow::Int64Builder builder;
(void)builder.Reserve(NUM_ROWS);
for(int row = 0; row < NUM_ROWS; row++)
{
builder.UnsafeAppend(timestamps[row]);
}
(void)builder.Finish(&column);
columns.push_back(column);
}
/* Write Geometry Column */
{
typedef struct WKBPoint {
uint8_t byteOrder;
uint32_t wkbType;
double x;
double y;
} __attribute__((packed)) wkbpoint_t;
shared_ptr<arrow::Array> column;
arrow::BinaryBuilder builder;
for(int row = 0; row < NUM_ROWS; row++)
{
wkbpoint_t point = {
#ifdef __be__
.byteOrder = 0,
#else
.byteOrder = 1,
#endif
.wkbType = 1,
.x = longitude[row],
.y = latitude[row]
};
(void)builder.Append((uint8_t*)&point, sizeof(wkbpoint_t));
}
(void)builder.Finish(&column);
columns.push_back(column);
}
/* Build and Write Table */
shared_ptr<arrow::Table> table = arrow::Table::Make(schema, columns);
(void)parquetWriter->WriteTable(*table, NUM_ROWS);
/* Close Parquet Writer */
(void)parquetWriter->Close();
/* Return Success */
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment