Skip to content

Instantly share code, notes, and snippets.

@mattleblanc
Created July 25, 2025 20:28
Show Gist options
  • Select an option

  • Save mattleblanc/5f410a15d853e51f6be9782bbd34ca93 to your computer and use it in GitHub Desktop.

Select an option

Save mattleblanc/5f410a15d853e51f6be9782bbd34ca93 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Script to generate a local QCD.json file with absolute paths to downloaded files.
This script scans the download directory and creates a new JSON file that points
to local files while preserving all original metadata.
"""
import argparse
import hashlib
import json
import os
import sys
import time
from pathlib import Path
def find_downloaded_files(base_dir, organized_by_dataset=False):
    """
    Find all .root files under *base_dir*, searched recursively.

    Parameters
    ----------
    base_dir : str
        Directory to scan for downloaded files.
    organized_by_dataset : bool
        Accepted for interface compatibility; the recursive scan already
        descends into dataset subdirectories, so this flag is unused here.

    Returns
    -------
    dict
        Mapping of bare filename -> absolute local path.  When the same
        filename appears in more than one directory, the copy with the
        most recent modification time wins.
    """
    downloaded_files = {}
    base_path = Path(base_dir)
    if not base_path.exists():
        print(f"Error: Download directory {base_dir} does not exist")
        return downloaded_files
    # Find all .root files recursively
    for root_file in base_path.rglob("*.root"):
        filename = root_file.name
        full_path = str(root_file.absolute())
        # Handle potential filename conflicts: same basename in two places.
        if filename in downloaded_files:
            # Fix: the warning previously printed a literal placeholder
            # instead of the actual conflicting filename.
            print(f"Warning: Duplicate filename found: {filename}")
            print(f" Existing: {downloaded_files[filename]}")
            print(f" New: {full_path}")
            # Use the one with more recent modification time
            existing_mtime = os.path.getmtime(downloaded_files[filename])
            new_mtime = os.path.getmtime(full_path)
            if new_mtime > existing_mtime:
                downloaded_files[filename] = full_path
                print(f" Using newer file: {full_path}")
            else:
                print(f" Keeping existing file: {downloaded_files[filename]}")
        else:
            downloaded_files[filename] = full_path
    return downloaded_files
def extract_filename_from_url(url):
    """Return the final path component of *url* (the bare file name)."""
    basename = os.path.basename(url)
    return basename
def verify_file_integrity(local_path, expected_size=None):
    """
    Check that *local_path* exists and, when *expected_size* is given,
    that its on-disk size matches.

    Returns a tuple ``(exists, actual_size, is_size_match)``.  A missing
    file yields ``(False, 0, False)``; when no expected size is supplied
    the size check is vacuously True.
    """
    if not os.path.exists(local_path):
        return False, 0, False
    actual_size = os.path.getsize(local_path)
    size_match = expected_size is None or actual_size == expected_size
    return True, actual_size, size_match
def main():
    """Command-line entry point.

    Reads a source QCD JSON catalogue, matches each remote file URL against
    .root files found under ``--download-dir``, and writes a new JSON whose
    file lists point at absolute local paths (``file://`` URLs) while
    preserving the original per-dataset metadata.  Also writes a detailed
    missing-files report when any catalogued file was not found locally.

    Exits non-zero on: unreadable/missing source JSON, unwritable output,
    a missing file under ``--missing-action error``, or user abort.
    """
    parser = argparse.ArgumentParser(description="Generate local QCD.json with absolute paths to downloaded files")
    parser.add_argument("--download-dir", "-d", default=".",
                        help="Directory containing downloaded files (default: current directory)")
    parser.add_argument("--source-json", "-s", default=None,
                        help="Source JSON file (default: auto-detect QCD_working_files.json or QCD.json)")
    parser.add_argument("--output", "-o", default="QCD_local.json",
                        help="Output JSON file name (default: QCD_local.json)")
    parser.add_argument("--organized-by-dataset", action="store_true",
                        help="Files are organized in dataset subdirectories")
    parser.add_argument("--verify-sizes", action="store_true",
                        help="Verify file sizes match original metadata (requires size info in source JSON)")
    parser.add_argument("--missing-action", choices=['skip', 'error', 'warn'], default='warn',
                        help="Action for missing files: skip (exclude), error (abort), warn (include with note)")
    args = parser.parse_args()

    # Determine source JSON file: explicit flag wins, then auto-detection.
    if args.source_json:
        source_json = args.source_json
    elif os.path.exists("QCD_working_files.json"):
        source_json = "QCD_working_files.json"
        print("Using QCD_working_files.json as source")
    elif os.path.exists("QCD.json"):
        source_json = "QCD.json"
        print("Using QCD.json as source")
    else:
        print("Error: No source JSON file found. Please specify with --source-json")
        sys.exit(1)
    if not os.path.exists(source_json):
        print(f"Error: Source JSON file {source_json} not found")
        sys.exit(1)

    # Load source JSON
    try:
        with open(source_json, 'r') as f:
            source_data = json.load(f)
    except Exception as e:
        print(f"Error reading {source_json}: {e}")
        sys.exit(1)

    print(f"Source JSON: {source_json}")
    print(f"Download directory: {os.path.abspath(args.download_dir)}")
    print(f"Output file: {args.output}")
    print(f"Organized by dataset: {args.organized_by_dataset}")

    # Find downloaded files
    downloaded_files = find_downloaded_files(args.download_dir, args.organized_by_dataset)
    print(f"Found {len(downloaded_files)} downloaded .root files")
    if len(downloaded_files) == 0:
        print("Warning: No .root files found in download directory")
        response = input("Continue anyway? (y/N): ")
        if response.lower() not in ['y', 'yes']:
            sys.exit(1)

    # Generate local JSON
    local_data = {}
    total_files_processed = 0
    total_files_found = 0
    total_files_missing = 0
    missing_files_report = []
    size_mismatches = []

    for dataset_name, dataset_info in source_data.items():
        if 'files' not in dataset_info:
            # Preserve datasets without files
            local_data[dataset_name] = dataset_info.copy()
            continue
        print(f"\nProcessing dataset: {dataset_name}")
        original_files = dataset_info['files']
        local_files = []
        dataset_missing = []
        for remote_url in original_files:
            total_files_processed += 1
            filename = extract_filename_from_url(remote_url)
            # Look for the file in downloads
            if args.organized_by_dataset:
                # Look in dataset-specific subdirectory first
                dataset_specific_path = os.path.join(args.download_dir, dataset_name, filename)
                if os.path.exists(dataset_specific_path):
                    local_path = os.path.abspath(dataset_specific_path)
                elif filename in downloaded_files:
                    local_path = downloaded_files[filename]
                else:
                    local_path = None
            else:
                local_path = downloaded_files.get(filename)

            if local_path and os.path.exists(local_path):
                # File found locally
                total_files_found += 1
                # Verify file integrity if requested
                if args.verify_sizes and 'metadata' in dataset_info and 'size' in dataset_info['metadata']:
                    try:
                        expected_total_size = int(dataset_info['metadata']['size'])
                        # This is total dataset size, not individual file size
                        # We'll just check if file is not empty
                        file_size = os.path.getsize(local_path)
                        if file_size == 0:
                            size_mismatches.append({
                                'dataset': dataset_name,
                                'file': filename,
                                'local_path': local_path,
                                'issue': 'File is empty'
                            })
                    except (ValueError, OSError) as e:
                        # Fix: warning previously printed a literal
                        # placeholder instead of the filename.
                        print(f" Warning: Could not verify size for {filename}: {e}")
                local_files.append(f"file://{local_path}")
            else:
                # File not found locally
                total_files_missing += 1
                missing_info = {
                    'dataset': dataset_name,
                    'filename': filename,
                    'original_url': remote_url
                }
                dataset_missing.append(missing_info)
                missing_files_report.append(missing_info)
                if args.missing_action == 'error':
                    print(f"Error: Required file not found: {filename}")
                    sys.exit(1)
                elif args.missing_action == 'warn':
                    print(f" Warning: File not found locally: {filename}")
                    # Include a comment in the JSON
                    local_files.append(f"# MISSING: {remote_url}")
                # For 'skip', we just don't add it to local_files

        # Create dataset entry for local JSON
        if local_files or args.missing_action != 'skip':
            local_dataset = dataset_info.copy()  # Preserve all original metadata
            local_dataset['files'] = local_files
            # Add local processing metadata
            local_metadata = local_dataset.get('metadata', {}).copy()
            local_metadata['local_processing'] = {
                'source_json': source_json,
                'download_directory': os.path.abspath(args.download_dir),
                'generated_timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'original_file_count': str(len(original_files)),
                'local_file_count': str(len([f for f in local_files if not f.startswith('#')])),
                'missing_file_count': str(len(dataset_missing))
            }
            if args.organized_by_dataset:
                local_metadata['local_processing']['organized_by_dataset'] = True
            local_dataset['metadata'] = local_metadata
            local_data[dataset_name] = local_dataset
        found_count = len([f for f in local_files if not f.startswith('#')])
        print(f" Files found: {found_count}/{len(original_files)}")

    # Write local JSON file
    try:
        with open(args.output, 'w') as f:
            json.dump(local_data, f, indent=2)
        print(f"\nLocal JSON file written: {args.output}")
    except Exception as e:
        print(f"Error writing {args.output}: {e}")
        sys.exit(1)

    # Print summary
    print("\n" + "="*60)
    print("LOCAL JSON GENERATION SUMMARY:")
    print("="*60)
    print(f"Source JSON: {source_json}")
    print(f"Output JSON: {args.output}")
    print(f"Total files processed: {total_files_processed}")
    print(f"Files found locally: {total_files_found}")
    print(f"Files missing: {total_files_missing}")
    # Fix: guard against ZeroDivisionError when the source catalogue
    # contained no file entries at all.
    if total_files_processed:
        print(f"Success rate: {(total_files_found/total_files_processed)*100:.1f}%")
    else:
        print("Success rate: n/a (no files processed)")
    print(f"Datasets in output: {len(local_data)}")

    # Report missing files
    if missing_files_report:
        print(f"\nMISSING FILES ({len(missing_files_report)}):")
        print("-" * 50)
        # Group by dataset
        missing_by_dataset = {}
        for missing in missing_files_report:
            dataset = missing['dataset']
            if dataset not in missing_by_dataset:
                missing_by_dataset[dataset] = []
            missing_by_dataset[dataset].append(missing)
        for dataset, missing_list in missing_by_dataset.items():
            print(f" {dataset}: {len(missing_list)} missing files")
        # Write detailed missing files report
        missing_report_file = f"missing_files_{args.output.replace('.json', '.txt')}"
        with open(missing_report_file, 'w') as f:
            f.write("MISSING FILES REPORT\n")
            f.write("=" * 50 + "\n")
            f.write(f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Source JSON: {source_json}\n")
            f.write(f"Download directory: {os.path.abspath(args.download_dir)}\n")
            f.write(f"Total missing files: {len(missing_files_report)}\n\n")
            for dataset, missing_list in missing_by_dataset.items():
                f.write(f"\nDATASET: {dataset} ({len(missing_list)} missing)\n")
                f.write("-" * 40 + "\n")
                for missing in missing_list:
                    f.write(f"File: {missing['filename']}\n")
                    f.write(f"URL: {missing['original_url']}\n")
                    f.write("-" * 20 + "\n")
        print(f"Detailed missing files report: {missing_report_file}")

    # Report size mismatches
    if size_mismatches:
        print(f"\nSIZE ISSUES ({len(size_mismatches)}):")
        print("-" * 50)
        for mismatch in size_mismatches:
            print(f" {mismatch['dataset']}/{mismatch['file']}: {mismatch['issue']}")

    print("\nLocal QCD.json file is ready for use!")
    print("All file paths are absolute and point to your local downloads.")
# Entry-point guard.  `time` is imported at module level (not here) so that
# main() also works when this module is imported and called directly.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment