Created
October 11, 2023 21:02
-
-
Save ianmcook/104dfd1a2693c7a715f9da6cd68155e8 to your computer and use it in GitHub Desktop.
Write a very wide Parquet file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <random> | |
| #include <vector> | |
| #include <string> | |
| #include <arrow/api.h> | |
| #include <arrow/io/api.h> | |
| #include <parquet/arrow/writer.h> | |
/// Builds every 4-letter lowercase string, from "aaaa" through "zzzz".
///
/// Strings are produced in lexicographic order (the last letter varies
/// fastest), so element 0 is "aaaa", element 1 is "aaab", and so on.
///
/// @return A vector of 26^4 = 456,976 distinct 4-character strings.
std::vector<std::string> GenerateUniqueStrings() {
  constexpr int alphabetSize = 26;
  constexpr int totalStrings =
      alphabetSize * alphabetSize * alphabetSize * alphabetSize;

  std::vector<std::string> result;
  // The final size is known up front; reserving once avoids the repeated
  // reallocate-and-move cycles that growing to ~457k elements would cause.
  result.reserve(totalStrings);

  // One 4-character buffer is reused; each loop level overwrites its own
  // position and the innermost level appends a copy of the finished string.
  std::string unique_string(4, 'a');
  for (int h = 0; h < alphabetSize; ++h) {
    unique_string[0] = static_cast<char>('a' + h);
    for (int i = 0; i < alphabetSize; ++i) {
      unique_string[1] = static_cast<char>('a' + i);
      for (int j = 0; j < alphabetSize; ++j) {
        unique_string[2] = static_cast<char>('a' + j);
        for (int k = 0; k < alphabetSize; ++k) {
          unique_string[3] = static_cast<char>('a' + k);
          result.push_back(unique_string);
        }
      }
    }
  }
  return result;
}
| arrow::Status WriteTableToParquetFile() { | |
| int num_columns = 450000; | |
| int num_rows = 100; | |
| auto colnames = GenerateUniqueStrings(); | |
| std::shared_ptr<arrow::Array> array; | |
| arrow::Int32Builder builder; | |
| for (int i = 0; i < num_rows; i++) { | |
| ARROW_RETURN_NOT_OK(builder.Append(rand())); | |
| //ARROW_RETURN_NOT_OK(builder.AppendNull()); | |
| } | |
| ARROW_RETURN_NOT_OK(builder.Finish(&array)); | |
| std::vector<std::shared_ptr<arrow::Array>> arrays; | |
| for (int j = 0; j < num_columns; j++) { | |
| arrays.push_back(array); | |
| } | |
| std::vector<std::shared_ptr<arrow::Field>> schema_vector; | |
| for (int j = 0; j < num_columns; j++) { | |
| schema_vector.push_back( | |
| arrow::field(colnames[j], arrow::int32()) | |
| ); | |
| } | |
| auto schema = std::make_shared<arrow::Schema>(schema_vector); | |
| std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, arrays); | |
| // Choose compression | |
| std::shared_ptr<parquet::WriterProperties> props = | |
| parquet::WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); | |
| // Opt to store Arrow schema for easier reads back into Arrow | |
| std::shared_ptr<parquet::ArrowWriterProperties> arrow_props = | |
| parquet::ArrowWriterProperties::Builder().store_schema()->build(); | |
| std::shared_ptr<arrow::io::FileOutputStream> outfile; | |
| ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test.parquet")); | |
| ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), | |
| arrow::default_memory_pool(), outfile, | |
| /*chunk_size=*/1000, props, arrow_props)); | |
| return arrow::Status::OK(); | |
| } | |
| int main(int, char**) { | |
| auto status = WriteTableToParquetFile(); | |
| if (!status.ok()) { | |
| std::cerr << "Error occurred : " << status.message() << std::endl; | |
| return EXIT_FAILURE; | |
| } | |
| return EXIT_SUCCESS; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment