Skip to content

Instantly share code, notes, and snippets.

@raulcd
Created June 25, 2025 12:16
Show Gist options
  • Select an option

  • Save raulcd/aa4bda51de0b08b8ac08852cdfb81ece to your computer and use it in GitHub Desktop.

Select an option

Save raulcd/aa4bda51de0b08b8ac08852cdfb81ece to your computer and use it in GitHub Desktop.
Create a Parquet file whose binary min/max statistics are truncated, so that `is_min_value_exact` and `is_max_value_exact` are both `false`
use parquet::arrow::ArrowWriter;
use parquet::basic::{Compression, Encoding};
use parquet::file::properties::WriterProperties;
use arrow::array::{BinaryArray, StringArray};
use arrow::record_batch::RecordBatch;
use arrow::datatypes::{DataType, Field, Schema};
use std::fs::File;
use std::sync::Arc;
/// Writes `binary_truncated_min_max.parquet`, a two-column Parquet file used to
/// exercise truncated min/max statistics.
///
/// The same twelve names are stored twice: as a non-nullable Utf8 column `a`
/// and as a non-nullable Binary column `b`. The writer is configured with a
/// statistics truncate length of 2, dictionary encoding off, PLAIN encoding and
/// no compression. After writing, the file size and the statistics the author
/// expects a reader to find are printed to stdout.
///
/// # Errors
///
/// Propagates any error from building the record batch, creating the file,
/// writing/closing the Parquet writer, or reading the file metadata.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One binding for the output location, shared by create / exists / metadata.
    let output_path = "binary_truncated_min_max.parquet";

    // Several distinct values so that min and max differ and both are longer
    // than the truncate length configured below.
    let names = vec![
        "Blart Versenwald III",
        "Alice Johnson",
        "Bob Smith",
        "Charlie Brown",
        "Diana Prince",
        "Edward Norton",
        "Fiona Apple",
        "George Lucas",
        "Helen Keller",
        "Ivan Drago",
        "Julia Roberts",
        "Kevin Bacon",
    ];
    // Byte view of the very same names; this feeds the Binary column.
    let name_bytes: Vec<&[u8]> = names.iter().map(|name| name.as_bytes()).collect();

    println!("Creating data with string: {:?}", names);
    println!("Creating data with binary: {:?}", name_bytes);

    // Column "a" carries the names as Utf8, column "b" the same bytes as Binary.
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Utf8, false),
        Field::new("b", DataType::Binary, false),
    ]));
    // `names` is cloned because it is still needed for the min/max report below.
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(StringArray::from(names.clone())),
            Arc::new(BinaryArray::from(name_bytes)),
        ],
    )?;
    println!("Created record batch with {} rows", batch.num_rows());

    // Untruncated extremes of the input, for comparison with the file's statistics.
    let smallest = names.iter().min().unwrap();
    let largest = names.iter().max().unwrap();
    println!("Values range from: '{}' to '{}'", smallest, largest);

    // Truncate stored min/max statistics to length 2.
    // NOTE(review): the parquet crate documents this length in bytes; for the
    // ASCII names used here bytes and characters coincide — confirm if the
    // input ever becomes non-ASCII.
    let props = WriterProperties::builder()
        .set_statistics_truncate_length(Some(2))
        .set_dictionary_enabled(false)
        .set_encoding(Encoding::PLAIN)
        .set_compression(Compression::UNCOMPRESSED)
        .build();

    // Write the single batch and finalize the file footer.
    let mut writer = ArrowWriter::try_new(File::create(output_path)?, batch.schema(), Some(props))?;
    writer.write(&batch)?;
    writer.close()?;
    println!("Successfully created binary_truncated_min_max.parquet");

    // Nothing further to report if the file is not there.
    if !std::path::Path::new(output_path).exists() {
        return Ok(());
    }

    let file_size = std::fs::metadata(output_path)?.len();
    println!("File size: {} bytes", file_size);
    println!("Created {} distinct values", names.len());
    // The lines below state what the author expects a reader to find in the
    // column statistics; they are informational and not verified here.
    println!("Expected truncated values:");
    println!(" min should be truncated to: 'Al' (from 'Alice Johnson')");
    println!(" max should be truncated to: 'Kf' (from 'Kevin Bacon' -> 'Ke' -> increment to 'Kf')");
    println!(" is_min_value_exact should be: false");
    println!(" is_max_value_exact should be: false");
    println!(" distinct_count should be: 0 (not calculated by Arrow-rs)");

    Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment