Skip to content

Instantly share code, notes, and snippets.

@ianmcook
Created October 11, 2023 21:02
Show Gist options
  • Select an option

  • Save ianmcook/104dfd1a2693c7a715f9da6cd68155e8 to your computer and use it in GitHub Desktop.

Select an option

Save ianmcook/104dfd1a2693c7a715f9da6cd68155e8 to your computer and use it in GitHub Desktop.
Write a very wide Parquet file
#include <iostream>
#include <random>
#include <vector>
#include <string>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/writer.h>
/// Generates every 4-letter lowercase string in lexicographic order.
///
/// The first element is "aaaa" and the last is "zzzz"; the right-most letter
/// varies fastest. The strings are used as unique column names.
///
/// @return A vector holding 26^4 = 456,976 unique 4-letter combinations.
std::vector<std::string> GenerateUniqueStrings() {
  constexpr int kAlphabetSize = 26;
  constexpr int kCombinations =
      kAlphabetSize * kAlphabetSize * kAlphabetSize * kAlphabetSize;

  std::vector<std::string> result;
  // The final size is known up front; reserving avoids repeated reallocation
  // (and the element moves that come with it) while filling the vector.
  result.reserve(static_cast<std::vector<std::string>::size_type>(kCombinations));

  for (int h = 0; h < kAlphabetSize; ++h) {
    for (int i = 0; i < kAlphabetSize; ++i) {
      for (int j = 0; j < kAlphabetSize; ++j) {
        for (int k = 0; k < kAlphabetSize; ++k) {
          // 'a' + offset is an int expression; cast back to char explicitly
          // and build the string in one step instead of four push_backs.
          result.push_back(std::string{static_cast<char>('a' + h),
                                       static_cast<char>('a' + i),
                                       static_cast<char>('a' + j),
                                       static_cast<char>('a' + k)});
        }
      }
    }
  }
  return result;
}
/// Builds a very wide Arrow table and writes it to "test.parquet".
///
/// The table has `num_columns` int32 columns of `num_rows` rows each. Every
/// column shares the same underlying array (filled once from rand()), and the
/// columns are named with the strings produced by GenerateUniqueStrings().
/// The file is written with Snappy compression and with the Arrow schema
/// stored in the Parquet metadata.
///
/// @return arrow::Status::OK() on success; otherwise the first error reported
///         while building the array, opening, writing, or closing the file,
///         or Invalid if there are fewer column names than columns.
arrow::Status WriteTableToParquetFile() {
  const std::size_t num_columns = 450000;
  const int num_rows = 100;

  const auto colnames = GenerateUniqueStrings();
  // Guard the colnames[j] lookups below: raising num_columns past the number
  // of generated names would otherwise read out of bounds.
  if (colnames.size() < num_columns) {
    return arrow::Status::Invalid("Need ", num_columns,
                                  " column names but only ", colnames.size(),
                                  " are available");
  }

  // Build one column of values; it is reused for every column of the table.
  std::shared_ptr<arrow::Array> array;
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.Reserve(num_rows));
  for (int i = 0; i < num_rows; i++) {
    ARROW_RETURN_NOT_OK(builder.Append(rand()));
    // Alternative for an all-null column:
    // ARROW_RETURN_NOT_OK(builder.AppendNull());
  }
  ARROW_RETURN_NOT_OK(builder.Finish(&array));

  // Every column points at the same array, so fill the vector in one step.
  const std::vector<std::shared_ptr<arrow::Array>> arrays(num_columns, array);

  std::vector<std::shared_ptr<arrow::Field>> schema_vector;
  schema_vector.reserve(num_columns);
  for (std::size_t j = 0; j < num_columns; ++j) {
    schema_vector.push_back(arrow::field(colnames[j], arrow::int32()));
  }
  auto schema = std::make_shared<arrow::Schema>(schema_vector);
  std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arrays);

  // Choose compression
  std::shared_ptr<parquet::WriterProperties> props =
      parquet::WriterProperties::Builder()
          .compression(arrow::Compression::SNAPPY)
          ->build();

  // Opt to store Arrow schema for easier reads back into Arrow
  std::shared_ptr<parquet::ArrowWriterProperties> arrow_props =
      parquet::ArrowWriterProperties::Builder().store_schema()->build();

  std::shared_ptr<arrow::io::FileOutputStream> outfile;
  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test.parquet"));
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table,
                                                 arrow::default_memory_pool(), outfile,
                                                 /*chunk_size=*/1000, props, arrow_props));
  // Close explicitly so a failure while flushing/closing the file is reported
  // to the caller instead of being dropped when the stream is destroyed.
  ARROW_RETURN_NOT_OK(outfile->Close());
  return arrow::Status::OK();
}
int main(int, char**) {
auto status = WriteTableToParquetFile();
if (!status.ok()) {
std::cerr << "Error occurred : " << status.message() << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment