Skip to content

Instantly share code, notes, and snippets.

@tai
Created May 30, 2019 04:59
Show Gist options
  • Select an option

  • Save tai/d1d08f2806f1ceb7d4435346ce02817a to your computer and use it in GitHub Desktop.

Select an option

Save tai/d1d08f2806f1ceb7d4435346ce02817a to your computer and use it in GitHub Desktop.
/*BINFMTCXX: -Wall -Wno-switch -lparquet -larrow
*/
#include <stdio.h>
#include <cstdint>
#include <exception>
#include <iostream>
#include <memory>
#include "arrow/api.h"
#include "arrow/io/api.h"
#include "arrow/array.h"
#include "parquet/arrow/reader.h"
#include "parquet/arrow/schema.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/statistics.h"
/**
 * Print the leading values of an Arrow array to stdout, one value per line.
 *
 * Only STRING and INT64 arrays are handled; any other type id writes
 * "unknown type" to stderr and prints nothing.
 *
 * @param arr  Array to print. A null pointer is reported on stderr and ignored.
 * @param n    Maximum number of leading elements to print (default 3). The
 *             count is clamped to the array length, so arrays shorter than
 *             n are never indexed out of bounds; n <= 0 prints nothing.
 */
void
read_data(std::shared_ptr<arrow::Array> arr, int n=3) {
  if (!arr) {
    std::cerr << "null array" << std::endl;
    return;
  }

  // Clamp the requested count so we never read past the end of the array.
  const int64_t length = arr->length();
  const int64_t limit = (n < length) ? static_cast<int64_t>(n) : length;

  switch (arr->type_id()) {
  case arrow::Type::STRING: {
    // type_id() already guarantees the layout, so the existing ArrayData can
    // be viewed directly; no copy or type reassignment is required.
    arrow::StringArray strings(arr->data());
    for (int64_t i = 0; i < limit; i++) {
      if (strings.IsNull(i)) {
        // A null slot has no meaningful payload; mark it explicitly.
        std::cout << "(null)" << std::endl;
      } else {
        std::cout << strings.GetString(i) << std::endl;
      }
    }
    break;
  }
  case arrow::Type::INT64: {
    arrow::NumericArray<arrow::Int64Type> numbers(arr->data());
    for (int64_t i = 0; i < limit; i++) {
      if (numbers.IsNull(i)) {
        // The value buffer content of a null slot is unspecified.
        std::cout << "(null)" << std::endl;
      } else {
        std::cout << numbers.Value(i) << std::endl;
      }
    }
    break;
  }
  default:
    std::cerr << "unknown type" << std::endl;
    break;
  }
}
int
main(int argc, char **argv) {
std::unique_ptr<parquet::arrow::FileReader> reader;
auto pool = arrow::default_memory_pool();
reader.reset(new parquet::arrow::FileReader(
pool,
parquet::ParquetFileReader::OpenFile(argv[1], false)
));
// metadata
auto meta = reader->parquet_reader()->metadata();
auto schema = meta->schema();
int ngrp = reader->num_row_groups();
int ncol = schema->num_columns();
printf("ngrp=%d, ncol=%d\n", ngrp, ncol);
// rowgroup-level metadata
auto rgm = meta->RowGroup(0);
int rg_ncols = rgm->num_columns();
int rg_nrows = rgm->num_rows();
printf("rg_ncol=%d, rg_nrows=%d\n", rg_ncols, rg_nrows);
// read table
auto rg = reader->RowGroup(0);
std::shared_ptr<arrow::Table> t;
rg->ReadTable(&t);
printf("t_ncol=%d, t_nrows=%ld\n", t->num_columns(), t->num_rows());
auto col0 = t->GetColumnByName("PARAMSU");
auto col1 = t->GetColumnByName("PAGELOADTIME");
auto vec0 = col0->data()->chunks();
auto vec1 = col1->data()->chunks();
auto arr0 = vec0[0];
auto arr1 = vec1[0];
printf("arr0.length=%ld\n", arr0->length());
printf("arr1.length=%ld\n", arr1->length());
std::cout << "arr0.type=" << arr0->type()->name() << std::endl;
std::cout << "arr1.type=" << arr1->type()->name() << std::endl;
read_data(arr0);
read_data(arr1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment