Skip to content

Instantly share code, notes, and snippets.

@bdunnette
Created January 23, 2026 21:46
Show Gist options
  • Select an option

  • Save bdunnette/9431513faaa7942697b125a2f16e6a82 to your computer and use it in GitHub Desktop.

Select an option

Save bdunnette/9431513faaa7942697b125a2f16e6a82 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Script to read all Excel files in the RackScans directory,
combine them into a single Polars DataFrame, and save as a Parquet file.
"""
import polars as pl
from pathlib import Path
from tqdm import tqdm
def main():
year_dir = Path(__file__).parent
study_year = year_dir.name.lower()
print(f"Compiling rack scans for study year: {study_year}")
# Define the directory containing Excel files
rack_scans_dir = year_dir / "RackScans"
# Get all Excel files in the directory
excel_files = list(rack_scans_dir.glob("*.xlsx"))
if not excel_files:
print("No Excel files found in RackScans directory.")
return
print(f"Found {len(excel_files)} Excel files to process...")
# Read all Excel files and combine them
dataframes = []
for excel_file in tqdm(excel_files, desc="Reading Excel files"):
try:
df = pl.read_excel(excel_file, infer_schema_length=0)
dataframes.append(df)
except Exception as e:
print(f"Error reading {excel_file.name}: {e}")
if not dataframes:
print("No dataframes to combine.")
return
# Combine all dataframes
print("Combining dataframes...")
combined_df = pl.concat(dataframes, how="vertical_relaxed")
# Drop any rows where Barcode is null or "EMPTY"
print("Filtering invalid barcodes...")
combined_df = combined_df.filter(
(pl.col("Barcode").is_not_null()) & (pl.col("Barcode") != "EMPTY")
)
print(f"Total valid barcodes after filtering: {combined_df.height:,}")
# Attempt to cast ScanTime to datetime as ScanDateTime
combined_df = combined_df.with_columns(
pl.col("ScanTime").str.strptime(pl.Datetime, strict=False).alias("ScanDateTime")
)
# Flag the latest scan for each Barcode
print("Flagging latest scans...")
latest_scans = combined_df.group_by("Barcode").agg(
pl.col("ScanDateTime").max().alias("LatestScanDateTime")
)
combined_df = combined_df.join(latest_scans, on="Barcode", how="left")
combined_df = combined_df.with_columns(
(pl.col("ScanDateTime") == pl.col("LatestScanDateTime")).alias("IsLatestScan")
).drop("LatestScanDateTime")
print(
f"Total latest scans flagged: {combined_df.filter(pl.col('IsLatestScan')).height:,}"
)
# Flag any Barcodes that have been scanned into multiple BoxBarcodes and/or TubeLocations
print("Flagging multiple box scans...")
barcode_box_counts = combined_df.group_by("Barcode").agg(
[
pl.n_unique("BoxBarcode").alias("UniqueBoxCount"),
pl.n_unique("TubeLocation").alias("UniqueLocationCount"),
]
)
combined_df = combined_df.join(barcode_box_counts, on="Barcode", how="left")
combined_df = combined_df.with_columns(
pl.when((pl.col("UniqueBoxCount") > 1) | (pl.col("UniqueLocationCount") > 1))
.then(True)
.otherwise(False)
.alias("IsMultipleBoxScan")
).drop(["UniqueBoxCount", "UniqueLocationCount"])
print(
f"Total barcodes with multiple box scans: {combined_df.filter(pl.col('IsMultipleBoxScan')).height:,}"
)
# Save as Parquet file
output_file = year_dir / f"{study_year}-rack-scans-combined.parquet"
print(f"Saving to {output_file.name}...")
combined_df.write_parquet(output_file)
print(f"Done! Combined {len(dataframes)} files into {output_file.name}")
print(f"Total rows: {combined_df.height:,}")
print(f"Total columns: {combined_df.width}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment