Skip to content

Instantly share code, notes, and snippets.

View ianmcook's full-sized avatar

Ian Cook ianmcook

View GitHub Profile
@ianmcook
ianmcook / ibis_bigquery_github_nested.py
Created April 14, 2023 17:04
Ibis BigQuery github_nested example query
import google.auth
import ibis
from ibis import _
credentials, billing_project = google.auth.default()
conn = ibis.bigquery.connect(billing_project, 'bigquery-public-data.samples')
t = conn.table('github_nested')
expr = (
@ianmcook
ianmcook / acero_sort.cpp
Created August 17, 2023 21:19
Sort an Arrow Table with Acero
#include <iostream>
#include <arrow/api.h>
#include <arrow/result.h>
#include <arrow/compute/api.h>
#include <arrow/compute/exec/exec_plan.h>
arrow::Status ExecutePlanAndCollectAsTable(
std::shared_ptr<arrow::compute::ExecPlan> plan,
std::shared_ptr<arrow::Schema> schema,
arrow::AsyncGenerator<std::optional<arrow::compute::ExecBatch>> sink_gen) {
@ianmcook
ianmcook / substrait_pyarrow_dataset_expressions.py
Created August 29, 2023 21:39
Use Substrait expressions to filter and project PyArrow datasets
import tempfile
import pathlib
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
import pyarrow.dataset as ds
# create a small dataset for example purposes
@ianmcook
ianmcook / arrow_is_in.cpp
Created September 19, 2023 15:07
Standalone test of the Arrow C++ `is_in` kernel
#include <iostream>
#include <arrow/api.h>
#include <arrow/compute/api.h>
int main(int, char**) {
// lookup set
std::shared_ptr<arrow::Array> array;
arrow::Int32Builder builder;
if (!builder.Append(5).ok()) return 1;
@ianmcook
ianmcook / write_wide_parquet.cpp
Created October 11, 2023 21:02
Write a very wide Parquet file
#include <iostream>
#include <random>
#include <vector>
#include <string>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/writer.h>
std::vector<std::string> GenerateUniqueStrings() {
// generates 26^4 = 456,976 unique 4-letter combinations
@ianmcook
ianmcook / write_parquet_float.cpp
Last active October 13, 2023 18:10
Write Parquet file with float32 column
#include <iostream>
#include <random>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/writer.h>
float GetRandomFloat()
{
static std::default_random_engine e;
@ianmcook
ianmcook / 1-write_parquet_float16.cpp
Last active October 13, 2023 18:04
Test writing and reading a Parquet file with a float16 column
#include <iostream>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <arrow/util/float16.h>
#include <parquet/arrow/writer.h>
arrow::Status WriteTableToParquetFile() {
std::shared_ptr<arrow::Array> array;
arrow::HalfFloatBuilder builder;
@ianmcook
ianmcook / pyarrow_read_write_order_test.py
Created November 7, 2023 20:04
Write and read Parquet files, combine the columns into an Arrow table, and check whether order was preserved
import pyarrow as pa
import pyarrow.parquet as pq
import random
import string
# write parquet files
original = []
for i in range(3):
data = [[random.uniform(0, 1) for _ in range(1000000)]]
original.extend(data)
@ianmcook
ianmcook / acero_tpch_06.cpp
Last active January 22, 2024 23:31
Acero ExecPlan for TPC-H Query 06
#include <iostream>
#include <arrow/api.h>
#include <arrow/type.h>
#include <arrow/result.h>
#include <arrow/io/api.h>
#include <arrow/compute/api.h>
#include <arrow/acero/exec_plan.h>
#include <arrow/acero/options.h>
#include <parquet/arrow/reader.h>
@ianmcook
ianmcook / acero_tpch_06_decl.cpp
Created January 22, 2024 23:22
Acero Declarations for TPC-H Query 06
#include <iostream>
#include <arrow/api.h>
#include <arrow/type.h>
#include <arrow/result.h>
#include <arrow/io/api.h>
#include <arrow/compute/api.h>
#include <arrow/acero/exec_plan.h>
#include <arrow/acero/options.h>
#include <parquet/arrow/reader.h>