Skip to content

Instantly share code, notes, and snippets.

@amoeba
Created September 4, 2024 19:32
Show Gist options
  • Save amoeba/d6acd28aed944480efe40b71f5545ee7 to your computer and use it in GitHub Desktop.
Save amoeba/d6acd28aed944480efe40b71f5545ee7 to your computer and use it in GitHub Desktop.
diff --git forkSrcPrefix/cpp/src/parquet/arrow/reader_writer_benchmark.cc forkDstPrefix/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index 95c4a659297d93f0927a8f36bb25607b15e5a0fa..d450431eae5133daf24066f8d01701cd34af63c9 100644
--- forkSrcPrefix/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ forkDstPrefix/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include <sys/_types/_int64_t.h>
+#include "arrow/io/file.h"
#include "benchmark/benchmark.h"
#include <array>
@@ -247,6 +249,51 @@ BENCHMARK(BM_WriteBinaryColumn)
->Args({50, kInfiniteUniqueValues})
->Args({99, kInfiniteUniqueValues});
+// Benchmarks writing a table to an in-memory Parquet file one record batch at a
+// time through parquet::arrow::FileWriter. state.range(0) is forwarded to
+// ArrowWriterProperties::Builder::set_use_threads (0 = off, non-zero = on).
+//
+// NOTE: the input table is read from a hard-coded local path, so this benchmark
+// only runs on a machine where that file exists.
+static void BM_WriteBinaryColumnMultithreaded(::benchmark::State& state) {
+  // hack: load parquet from disk
+  auto pq_file =
+      ::arrow::io::ReadableFile::Open("/Users/bryce/Work/VESA-189/sample.parquet")
+          .ValueOrDie();
+  std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
+  EXIT_NOT_OK(arrow::OpenFile(pq_file, ::arrow::default_memory_pool(), &arrow_reader));
+
+  std::shared_ptr<::arrow::Table> table;
+  EXIT_NOT_OK(arrow_reader->ReadTable(&table));
+
+  std::shared_ptr<WriterProperties> props = default_writer_properties();
+  // Make the int64 -> bool conversion of the benchmark argument explicit.
+  const bool use_threads = state.range(0) != 0;
+  std::shared_ptr<ArrowWriterProperties> arrow_props =
+      ArrowWriterProperties::Builder().set_use_threads(use_threads)->build();
+
+  // Split the table into record batches once, outside the timed loop.
+  auto batches = ::arrow::TableBatchReader(table).ToRecordBatches().ValueOrDie();
+
+  // Track total bytes written by measuring the output stream
+  int64_t total_bytes = 0;
+
+  while (state.KeepRunning()) {
+    auto output = CreateOutputStream();
+    auto writer = parquet::arrow::FileWriter::Open(*table->schema(),
+                                                   ::arrow::default_memory_pool(), output,
+                                                   props, arrow_props)
+                      .ValueOrDie();
+
+    // Bind by const reference: copying a shared_ptr per batch would add atomic
+    // refcount traffic to the timed region.
+    for (const auto& batch : batches) {
+      EXIT_NOT_OK(writer->WriteRecordBatch(*batch));
+    }
+
+    // Close the writer before finishing the stream so that any buffered data and
+    // the file footer are written inside the timed loop and are included in the
+    // byte count below.
+    EXIT_NOT_OK(writer->Close());
+
+    auto final_buf = output->Finish().ValueOrDie();
+    total_bytes += final_buf->size();
+  }
+
+  state.SetItemsProcessed(table->num_rows() * state.iterations());
+  state.SetBytesProcessed(total_bytes);
+}
+
+BENCHMARK(BM_WriteBinaryColumnMultithreaded)
+    ->ArgNames({"use_threads"})
+    // The single argument is passed to ArrowWriterProperties::set_use_threads so
+    // the threaded and non-threaded write paths can be compared side by side.
+    ->Args({false})
+    ->Args({true});
+
template <typename T>
struct Examples {
static constexpr std::array<T, 2> values() { return {127, 128}; }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment