Created
June 25, 2025 12:16
-
-
Save raulcd/aa4bda51de0b08b8ac08852cdfb81ece to your computer and use it in GitHub Desktop.
Create truncated min/max binary parquet file so `is_{max/min}_value_exact=false`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use parquet::arrow::ArrowWriter; | |
| use parquet::basic::{Compression, Encoding}; | |
| use parquet::file::properties::WriterProperties; | |
| use arrow::array::{BinaryArray, StringArray}; | |
| use arrow::record_batch::RecordBatch; | |
| use arrow::datatypes::{DataType, Field, Schema}; | |
| use std::fs::File; | |
| use std::sync::Arc; | |
| fn main() -> Result<(), Box<dyn std::error::Error>> { | |
| // Create schema with string and binary columns (similar to the test) | |
| let string_field = Field::new("a", DataType::Utf8, false); | |
| let binary_field = Field::new("b", DataType::Binary, false); | |
| let schema = Schema::new(vec![string_field, binary_field]); | |
| // Use multiple distinct values to test truncation and distinct_count | |
| let raw_string_values = vec![ | |
| "Blart Versenwald III", | |
| "Alice Johnson", | |
| "Bob Smith", | |
| "Charlie Brown", | |
| "Diana Prince", | |
| "Edward Norton", | |
| "Fiona Apple", | |
| "George Lucas", | |
| "Helen Keller", | |
| "Ivan Drago", | |
| "Julia Roberts", | |
| "Kevin Bacon" | |
| ]; | |
| let raw_binary_values: Vec<Vec<u8>> = raw_string_values | |
| .iter() | |
| .map(|s| s.as_bytes().to_vec()) | |
| .collect(); | |
| let raw_binary_value_refs = raw_binary_values | |
| .iter() | |
| .map(|x| x.as_slice()) | |
| .collect::<Vec<_>>(); | |
| println!("Creating data with string: {:?}", raw_string_values); | |
| println!("Creating data with binary: {:?}", raw_binary_values); | |
| // Create Arrow arrays | |
| let string_values = StringArray::from(raw_string_values.clone()); | |
| let binary_values = BinaryArray::from(raw_binary_value_refs); | |
| // Create record batch | |
| let batch = RecordBatch::try_new( | |
| Arc::new(schema), | |
| vec![Arc::new(string_values), Arc::new(binary_values)], | |
| )?; | |
| println!("Created record batch with {} rows", batch.num_rows()); | |
| println!("Values range from: '{}' to '{}'", | |
| raw_string_values.iter().min().unwrap(), | |
| raw_string_values.iter().max().unwrap()); | |
| // Create writer properties with statistics truncation | |
| // This should cause truncation similar to the test | |
| let props = WriterProperties::builder() | |
| .set_statistics_truncate_length(Some(2)) // Truncate to 2 characters | |
| .set_dictionary_enabled(false) | |
| .set_encoding(Encoding::PLAIN) | |
| .set_compression(Compression::UNCOMPRESSED) | |
| .build(); | |
| // Write the file | |
| let file = File::create("binary_truncated_min_max.parquet")?; | |
| let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props))?; | |
| writer.write(&batch)?; | |
| writer.close()?; | |
| println!("Successfully created binary_truncated_min_max.parquet"); | |
| // Verify the file was created | |
| if std::path::Path::new("binary_truncated_min_max.parquet").exists() { | |
| let metadata = std::fs::metadata("binary_truncated_min_max.parquet")?; | |
| println!("File size: {} bytes", metadata.len()); | |
| println!("Created {} distinct values", raw_string_values.len()); | |
| println!("Expected truncated values:"); | |
| println!(" min should be truncated to: 'Al' (from 'Alice Johnson')"); | |
| println!(" max should be truncated to: 'Kf' (from 'Kevin Bacon' -> 'Ke' -> increment to 'Kf')"); | |
| println!(" is_min_value_exact should be: false"); | |
| println!(" is_max_value_exact should be: false"); | |
| println!(" distinct_count should be: 0 (not calculated by Arrow-rs)"); | |
| } | |
| Ok(()) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment