Created
July 25, 2025 20:28
-
-
Save mattleblanc/5f410a15d853e51f6be9782bbd34ca93 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Script to generate a local QCD.json file with absolute paths to downloaded files. | |
| This script scans the download directory and creates a new JSON file that points | |
| to local files while preserving all original metadata. | |
| """ | |
import argparse
import hashlib
import json
import os
import sys
import time
from pathlib import Path
def find_downloaded_files(base_dir, organized_by_dataset=False):
    """
    Find all .root files under *base_dir*, searching recursively.

    Parameters
    ----------
    base_dir : str or Path
        Root of the download tree to scan.
    organized_by_dataset : bool
        Unused by the scan itself (rglob already descends into dataset
        subdirectories); kept for interface compatibility with callers.

    Returns
    -------
    dict
        Mapping of filename -> absolute local path. When the same basename
        appears in more than one directory, the copy with the more recent
        modification time wins.
    """
    downloaded_files = {}
    base_path = Path(base_dir)
    if not base_path.exists():
        print(f"Error: Download directory {base_dir} does not exist")
        return downloaded_files

    # Find all .root files recursively
    for root_file in base_path.rglob("*.root"):
        filename = root_file.name
        full_path = str(root_file.absolute())
        # Handle potential filename conflicts: keep the newer file.
        if filename in downloaded_files:
            print(f"Warning: Duplicate filename found: {filename}")
            print(f"  Existing: {downloaded_files[filename]}")
            print(f"  New: {full_path}")
            # Use the one with more recent modification time
            existing_mtime = os.path.getmtime(downloaded_files[filename])
            new_mtime = os.path.getmtime(full_path)
            if new_mtime > existing_mtime:
                downloaded_files[filename] = full_path
                print(f"  Using newer file: {full_path}")
            else:
                print(f"  Keeping existing file: {downloaded_files[filename]}")
        else:
            downloaded_files[filename] = full_path
    return downloaded_files
def extract_filename_from_url(url):
    """Return the trailing path component of *url* (the bare filename)."""
    # URLs always use '/' as the separator; keep everything after the last one.
    return url.rsplit("/", 1)[-1]
def verify_file_integrity(local_path, expected_size=None):
    """
    Check that *local_path* exists and, optionally, matches *expected_size*.

    Returns a tuple ``(exists, actual_size, is_size_match)``. A missing file
    yields ``(False, 0, False)``; when *expected_size* is None the size check
    is vacuously True.
    """
    if not os.path.exists(local_path):
        return False, 0, False
    actual_size = os.path.getsize(local_path)
    matches = expected_size is None or actual_size == expected_size
    return True, actual_size, matches
def main():
    """Command-line entry point: build a local-path copy of a QCD JSON catalog.

    Reads a source JSON of datasets (each with a 'files' list of remote URLs),
    matches each URL's filename against .root files found under the download
    directory, and writes a new JSON whose 'files' entries are file:// URIs
    pointing at the local copies. Missing files are skipped, reported, or
    fatal depending on --missing-action; a text report of missing files is
    written alongside the output JSON.
    """
    parser = argparse.ArgumentParser(description="Generate local QCD.json with absolute paths to downloaded files")
    parser.add_argument("--download-dir", "-d", default=".",
                        help="Directory containing downloaded files (default: current directory)")
    parser.add_argument("--source-json", "-s", default=None,
                        help="Source JSON file (default: auto-detect QCD_working_files.json or QCD.json)")
    parser.add_argument("--output", "-o", default="QCD_local.json",
                        help="Output JSON file name (default: QCD_local.json)")
    parser.add_argument("--organized-by-dataset", action="store_true",
                        help="Files are organized in dataset subdirectories")
    parser.add_argument("--verify-sizes", action="store_true",
                        help="Verify file sizes match original metadata (requires size info in source JSON)")
    parser.add_argument("--missing-action", choices=['skip', 'error', 'warn'], default='warn',
                        help="Action for missing files: skip (exclude), error (abort), warn (include with note)")
    args = parser.parse_args()

    # Determine source JSON file: explicit flag wins, then known defaults.
    if args.source_json:
        source_json = args.source_json
    elif os.path.exists("QCD_working_files.json"):
        source_json = "QCD_working_files.json"
        print("Using QCD_working_files.json as source")
    elif os.path.exists("QCD.json"):
        source_json = "QCD.json"
        print("Using QCD.json as source")
    else:
        print("Error: No source JSON file found. Please specify with --source-json")
        sys.exit(1)

    if not os.path.exists(source_json):
        print(f"Error: Source JSON file {source_json} not found")
        sys.exit(1)

    # Load source JSON
    try:
        with open(source_json, 'r') as f:
            source_data = json.load(f)
    except Exception as e:
        print(f"Error reading {source_json}: {e}")
        sys.exit(1)

    print(f"Source JSON: {source_json}")
    print(f"Download directory: {os.path.abspath(args.download_dir)}")
    print(f"Output file: {args.output}")
    print(f"Organized by dataset: {args.organized_by_dataset}")

    # Find downloaded files
    downloaded_files = find_downloaded_files(args.download_dir, args.organized_by_dataset)
    print(f"Found {len(downloaded_files)} downloaded .root files")
    if len(downloaded_files) == 0:
        print("Warning: No .root files found in download directory")
        response = input("Continue anyway? (y/N): ")
        if response.lower() not in ['y', 'yes']:
            sys.exit(1)

    # Generate local JSON
    local_data = {}
    total_files_processed = 0
    total_files_found = 0
    total_files_missing = 0
    missing_files_report = []
    size_mismatches = []

    for dataset_name, dataset_info in source_data.items():
        if 'files' not in dataset_info:
            # Preserve datasets without files
            local_data[dataset_name] = dataset_info.copy()
            continue

        print(f"\nProcessing dataset: {dataset_name}")
        original_files = dataset_info['files']
        local_files = []
        dataset_missing = []

        for remote_url in original_files:
            total_files_processed += 1
            filename = extract_filename_from_url(remote_url)

            # Look for the file in downloads
            if args.organized_by_dataset:
                # Look in the dataset-specific subdirectory first, then fall
                # back to the global filename index.
                dataset_specific_path = os.path.join(args.download_dir, dataset_name, filename)
                if os.path.exists(dataset_specific_path):
                    local_path = os.path.abspath(dataset_specific_path)
                elif filename in downloaded_files:
                    local_path = downloaded_files[filename]
                else:
                    local_path = None
            else:
                local_path = downloaded_files.get(filename)

            if local_path and os.path.exists(local_path):
                # File found locally
                total_files_found += 1

                # Verify file integrity if requested
                if args.verify_sizes and 'metadata' in dataset_info and 'size' in dataset_info['metadata']:
                    try:
                        # The metadata 'size' is the total dataset size, not a
                        # per-file size; parse it only to validate the field,
                        # then sanity-check that the local file is non-empty.
                        int(dataset_info['metadata']['size'])
                        file_size = os.path.getsize(local_path)
                        if file_size == 0:
                            size_mismatches.append({
                                'dataset': dataset_name,
                                'file': filename,
                                'local_path': local_path,
                                'issue': 'File is empty'
                            })
                    except (ValueError, OSError) as e:
                        print(f"  Warning: Could not verify size for {filename}: {e}")

                local_files.append(f"file://{local_path}")
            else:
                # File not found locally
                total_files_missing += 1
                missing_info = {
                    'dataset': dataset_name,
                    'filename': filename,
                    'original_url': remote_url
                }
                dataset_missing.append(missing_info)
                missing_files_report.append(missing_info)

                if args.missing_action == 'error':
                    print(f"Error: Required file not found: {filename}")
                    sys.exit(1)
                elif args.missing_action == 'warn':
                    print(f"  Warning: File not found locally: {filename}")
                    # Include a comment in the JSON
                    local_files.append(f"# MISSING: {remote_url}")
                # For 'skip', we just don't add it to local_files

        # Create dataset entry for local JSON
        if local_files or args.missing_action != 'skip':
            local_dataset = dataset_info.copy()  # Preserve all original metadata
            local_dataset['files'] = local_files

            # Add local processing metadata
            local_metadata = local_dataset.get('metadata', {}).copy()
            local_metadata['local_processing'] = {
                'source_json': source_json,
                'download_directory': os.path.abspath(args.download_dir),
                'generated_timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'original_file_count': str(len(original_files)),
                'local_file_count': str(len([f for f in local_files if not f.startswith('#')])),
                'missing_file_count': str(len(dataset_missing))
            }
            if args.organized_by_dataset:
                local_metadata['local_processing']['organized_by_dataset'] = True
            local_dataset['metadata'] = local_metadata
            local_data[dataset_name] = local_dataset

        found_count = len([f for f in local_files if not f.startswith('#')])
        print(f"  Files found: {found_count}/{len(original_files)}")

    # Write local JSON file
    try:
        with open(args.output, 'w') as f:
            json.dump(local_data, f, indent=2)
        print(f"\nLocal JSON file written: {args.output}")
    except Exception as e:
        print(f"Error writing {args.output}: {e}")
        sys.exit(1)

    # Print summary
    print("\n" + "=" * 60)
    print("LOCAL JSON GENERATION SUMMARY:")
    print("=" * 60)
    print(f"Source JSON: {source_json}")
    print(f"Output JSON: {args.output}")
    print(f"Total files processed: {total_files_processed}")
    print(f"Files found locally: {total_files_found}")
    print(f"Files missing: {total_files_missing}")
    # Guard against division by zero when the source JSON lists no files.
    if total_files_processed:
        print(f"Success rate: {(total_files_found / total_files_processed) * 100:.1f}%")
    else:
        print("Success rate: n/a (no files listed in source JSON)")
    print(f"Datasets in output: {len(local_data)}")

    # Report missing files
    if missing_files_report:
        print(f"\nMISSING FILES ({len(missing_files_report)}):")
        print("-" * 50)

        # Group by dataset
        missing_by_dataset = {}
        for missing in missing_files_report:
            dataset = missing['dataset']
            if dataset not in missing_by_dataset:
                missing_by_dataset[dataset] = []
            missing_by_dataset[dataset].append(missing)

        for dataset, missing_list in missing_by_dataset.items():
            print(f"  {dataset}: {len(missing_list)} missing files")

        # Write detailed missing files report
        missing_report_file = f"missing_files_{args.output.replace('.json', '.txt')}"
        with open(missing_report_file, 'w') as f:
            f.write("MISSING FILES REPORT\n")
            f.write("=" * 50 + "\n")
            f.write(f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Source JSON: {source_json}\n")
            f.write(f"Download directory: {os.path.abspath(args.download_dir)}\n")
            f.write(f"Total missing files: {len(missing_files_report)}\n\n")
            for dataset, missing_list in missing_by_dataset.items():
                f.write(f"\nDATASET: {dataset} ({len(missing_list)} missing)\n")
                f.write("-" * 40 + "\n")
                for missing in missing_list:
                    f.write(f"File: {missing['filename']}\n")
                    f.write(f"URL: {missing['original_url']}\n")
                    f.write("-" * 20 + "\n")
        print(f"Detailed missing files report: {missing_report_file}")

    # Report size mismatches
    if size_mismatches:
        print(f"\nSIZE ISSUES ({len(size_mismatches)}):")
        print("-" * 50)
        for mismatch in size_mismatches:
            print(f"  {mismatch['dataset']}/{mismatch['file']}: {mismatch['issue']}")

    print(f"\nLocal QCD.json file is ready for use!")
    print(f"All file paths are absolute and point to your local downloads.")
if __name__ == "__main__":
    # 'time' is imported at the top of the file so main() also works when
    # this module is imported and called programmatically.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment