Created
January 23, 2026 21:46
-
-
Save bdunnette/9431513faaa7942697b125a2f16e6a82 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Script to read all Excel files in the RackScans directory, | |
| combine them into a single Polars DataFrame, and save as a Parquet file. | |
| """ | |
| import polars as pl | |
| from pathlib import Path | |
| from tqdm import tqdm | |
| def main(): | |
| year_dir = Path(__file__).parent | |
| study_year = year_dir.name.lower() | |
| print(f"Compiling rack scans for study year: {study_year}") | |
| # Define the directory containing Excel files | |
| rack_scans_dir = year_dir / "RackScans" | |
| # Get all Excel files in the directory | |
| excel_files = list(rack_scans_dir.glob("*.xlsx")) | |
| if not excel_files: | |
| print("No Excel files found in RackScans directory.") | |
| return | |
| print(f"Found {len(excel_files)} Excel files to process...") | |
| # Read all Excel files and combine them | |
| dataframes = [] | |
| for excel_file in tqdm(excel_files, desc="Reading Excel files"): | |
| try: | |
| df = pl.read_excel(excel_file, infer_schema_length=0) | |
| dataframes.append(df) | |
| except Exception as e: | |
| print(f"Error reading {excel_file.name}: {e}") | |
| if not dataframes: | |
| print("No dataframes to combine.") | |
| return | |
| # Combine all dataframes | |
| print("Combining dataframes...") | |
| combined_df = pl.concat(dataframes, how="vertical_relaxed") | |
| # Drop any rows where Barcode is null or "EMPTY" | |
| print("Filtering invalid barcodes...") | |
| combined_df = combined_df.filter( | |
| (pl.col("Barcode").is_not_null()) & (pl.col("Barcode") != "EMPTY") | |
| ) | |
| print(f"Total valid barcodes after filtering: {combined_df.height:,}") | |
| # Attempt to cast ScanTime to datetime as ScanDateTime | |
| combined_df = combined_df.with_columns( | |
| pl.col("ScanTime").str.strptime(pl.Datetime, strict=False).alias("ScanDateTime") | |
| ) | |
| # Flag the latest scan for each Barcode | |
| print("Flagging latest scans...") | |
| latest_scans = combined_df.group_by("Barcode").agg( | |
| pl.col("ScanDateTime").max().alias("LatestScanDateTime") | |
| ) | |
| combined_df = combined_df.join(latest_scans, on="Barcode", how="left") | |
| combined_df = combined_df.with_columns( | |
| (pl.col("ScanDateTime") == pl.col("LatestScanDateTime")).alias("IsLatestScan") | |
| ).drop("LatestScanDateTime") | |
| print( | |
| f"Total latest scans flagged: {combined_df.filter(pl.col('IsLatestScan')).height:,}" | |
| ) | |
| # Flag any Barcodes that have been scanned into multiple BoxBarcodes and/or TubeLocations | |
| print("Flagging multiple box scans...") | |
| barcode_box_counts = combined_df.group_by("Barcode").agg( | |
| [ | |
| pl.n_unique("BoxBarcode").alias("UniqueBoxCount"), | |
| pl.n_unique("TubeLocation").alias("UniqueLocationCount"), | |
| ] | |
| ) | |
| combined_df = combined_df.join(barcode_box_counts, on="Barcode", how="left") | |
| combined_df = combined_df.with_columns( | |
| pl.when((pl.col("UniqueBoxCount") > 1) | (pl.col("UniqueLocationCount") > 1)) | |
| .then(True) | |
| .otherwise(False) | |
| .alias("IsMultipleBoxScan") | |
| ).drop(["UniqueBoxCount", "UniqueLocationCount"]) | |
| print( | |
| f"Total barcodes with multiple box scans: {combined_df.filter(pl.col('IsMultipleBoxScan')).height:,}" | |
| ) | |
| # Save as Parquet file | |
| output_file = year_dir / f"{study_year}-rack-scans-combined.parquet" | |
| print(f"Saving to {output_file.name}...") | |
| combined_df.write_parquet(output_file) | |
| print(f"Done! Combined {len(dataframes)} files into {output_file.name}") | |
| print(f"Total rows: {combined_df.height:,}") | |
| print(f"Total columns: {combined_df.width}") | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment