Skip to content

Instantly share code, notes, and snippets.

@mattleblanc
Created July 25, 2025 20:28
Show Gist options
  • Select an option

  • Save mattleblanc/5f410a15d853e51f6be9782bbd34ca93 to your computer and use it in GitHub Desktop.

Select an option

Save mattleblanc/5f410a15d853e51f6be9782bbd34ca93 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Script to generate a local QCD.json file with absolute paths to downloaded files.
This script scans the download directory and creates a new JSON file that points
to local files while preserving all original metadata.
"""
import argparse
import hashlib
import json
import os
import sys
import time
from pathlib import Path
def find_downloaded_files(base_dir, organized_by_dataset=False):
    """
    Find all .root files under *base_dir*, searched recursively.

    Parameters
    ----------
    base_dir : str
        Directory to scan for downloaded files.
    organized_by_dataset : bool
        Accepted for interface compatibility; the recursive scan already
        descends into dataset subdirectories, so this flag is unused here.

    Returns
    -------
    dict
        Mapping of bare filename -> absolute local path.  When the same
        filename appears in more than one directory, the copy with the
        most recent modification time wins.
    """
    downloaded_files = {}
    base_path = Path(base_dir)
    if not base_path.exists():
        print(f"Error: Download directory {base_dir} does not exist")
        return downloaded_files
    # Find all .root files recursively
    for root_file in base_path.rglob("*.root"):
        filename = root_file.name
        full_path = str(root_file.absolute())
        # Handle potential filename conflicts: same basename in two places.
        if filename in downloaded_files:
            # Fix: the warning previously printed a literal placeholder
            # instead of the actual conflicting filename.
            print(f"Warning: Duplicate filename found: {filename}")
            print(f" Existing: {downloaded_files[filename]}")
            print(f" New: {full_path}")
            # Use the one with more recent modification time
            existing_mtime = os.path.getmtime(downloaded_files[filename])
            new_mtime = os.path.getmtime(full_path)
            if new_mtime > existing_mtime:
                downloaded_files[filename] = full_path
                print(f" Using newer file: {full_path}")
            else:
                print(f" Keeping existing file: {downloaded_files[filename]}")
        else:
            downloaded_files[filename] = full_path
    return downloaded_files
def extract_filename_from_url(url):
    """Return the final path component of *url* (the bare file name)."""
    basename = os.path.basename(url)
    return basename
def verify_file_integrity(local_path, expected_size=None):
    """
    Check that *local_path* exists and, when *expected_size* is given,
    that its on-disk size matches.

    Returns a tuple ``(exists, actual_size, is_size_match)``.  A missing
    file yields ``(False, 0, False)``; when no expected size is supplied
    the size check is vacuously True.
    """
    if not os.path.exists(local_path):
        return False, 0, False
    actual_size = os.path.getsize(local_path)
    size_match = expected_size is None or actual_size == expected_size
    return True, actual_size, size_match
def main():
    """Command-line entry point.

    Reads a source QCD JSON catalogue, matches each remote file URL against
    .root files found under ``--download-dir``, and writes a new JSON whose
    file lists point at absolute local paths (``file://`` URLs) while
    preserving the original per-dataset metadata.  Also writes a detailed
    missing-files report when any catalogued file was not found locally.

    Exits non-zero on: unreadable/missing source JSON, unwritable output,
    a missing file under ``--missing-action error``, or user abort.
    """
    parser = argparse.ArgumentParser(description="Generate local QCD.json with absolute paths to downloaded files")
    parser.add_argument("--download-dir", "-d", default=".",
                        help="Directory containing downloaded files (default: current directory)")
    parser.add_argument("--source-json", "-s", default=None,
                        help="Source JSON file (default: auto-detect QCD_working_files.json or QCD.json)")
    parser.add_argument("--output", "-o", default="QCD_local.json",
                        help="Output JSON file name (default: QCD_local.json)")
    parser.add_argument("--organized-by-dataset", action="store_true",
                        help="Files are organized in dataset subdirectories")
    parser.add_argument("--verify-sizes", action="store_true",
                        help="Verify file sizes match original metadata (requires size info in source JSON)")
    parser.add_argument("--missing-action", choices=['skip', 'error', 'warn'], default='warn',
                        help="Action for missing files: skip (exclude), error (abort), warn (include with note)")
    args = parser.parse_args()

    # Determine source JSON file: explicit flag wins, then auto-detection.
    if args.source_json:
        source_json = args.source_json
    elif os.path.exists("QCD_working_files.json"):
        source_json = "QCD_working_files.json"
        print("Using QCD_working_files.json as source")
    elif os.path.exists("QCD.json"):
        source_json = "QCD.json"
        print("Using QCD.json as source")
    else:
        print("Error: No source JSON file found. Please specify with --source-json")
        sys.exit(1)
    if not os.path.exists(source_json):
        print(f"Error: Source JSON file {source_json} not found")
        sys.exit(1)

    # Load source JSON
    try:
        with open(source_json, 'r') as f:
            source_data = json.load(f)
    except Exception as e:
        print(f"Error reading {source_json}: {e}")
        sys.exit(1)

    print(f"Source JSON: {source_json}")
    print(f"Download directory: {os.path.abspath(args.download_dir)}")
    print(f"Output file: {args.output}")
    print(f"Organized by dataset: {args.organized_by_dataset}")

    # Find downloaded files
    downloaded_files = find_downloaded_files(args.download_dir, args.organized_by_dataset)
    print(f"Found {len(downloaded_files)} downloaded .root files")
    if len(downloaded_files) == 0:
        print("Warning: No .root files found in download directory")
        response = input("Continue anyway? (y/N): ")
        if response.lower() not in ['y', 'yes']:
            sys.exit(1)

    # Generate local JSON
    local_data = {}
    total_files_processed = 0
    total_files_found = 0
    total_files_missing = 0
    missing_files_report = []
    size_mismatches = []

    for dataset_name, dataset_info in source_data.items():
        if 'files' not in dataset_info:
            # Preserve datasets without files
            local_data[dataset_name] = dataset_info.copy()
            continue
        print(f"\nProcessing dataset: {dataset_name}")
        original_files = dataset_info['files']
        local_files = []
        dataset_missing = []
        for remote_url in original_files:
            total_files_processed += 1
            filename = extract_filename_from_url(remote_url)
            # Look for the file in downloads
            if args.organized_by_dataset:
                # Look in dataset-specific subdirectory first
                dataset_specific_path = os.path.join(args.download_dir, dataset_name, filename)
                if os.path.exists(dataset_specific_path):
                    local_path = os.path.abspath(dataset_specific_path)
                elif filename in downloaded_files:
                    local_path = downloaded_files[filename]
                else:
                    local_path = None
            else:
                local_path = downloaded_files.get(filename)

            if local_path and os.path.exists(local_path):
                # File found locally
                total_files_found += 1
                # Verify file integrity if requested
                if args.verify_sizes and 'metadata' in dataset_info and 'size' in dataset_info['metadata']:
                    try:
                        expected_total_size = int(dataset_info['metadata']['size'])
                        # This is total dataset size, not individual file size
                        # We'll just check if file is not empty
                        file_size = os.path.getsize(local_path)
                        if file_size == 0:
                            size_mismatches.append({
                                'dataset': dataset_name,
                                'file': filename,
                                'local_path': local_path,
                                'issue': 'File is empty'
                            })
                    except (ValueError, OSError) as e:
                        # Fix: warning previously printed a literal
                        # placeholder instead of the filename.
                        print(f" Warning: Could not verify size for {filename}: {e}")
                local_files.append(f"file://{local_path}")
            else:
                # File not found locally
                total_files_missing += 1
                missing_info = {
                    'dataset': dataset_name,
                    'filename': filename,
                    'original_url': remote_url
                }
                dataset_missing.append(missing_info)
                missing_files_report.append(missing_info)
                if args.missing_action == 'error':
                    print(f"Error: Required file not found: {filename}")
                    sys.exit(1)
                elif args.missing_action == 'warn':
                    print(f" Warning: File not found locally: {filename}")
                    # Include a comment in the JSON
                    local_files.append(f"# MISSING: {remote_url}")
                # For 'skip', we just don't add it to local_files

        # Create dataset entry for local JSON
        if local_files or args.missing_action != 'skip':
            local_dataset = dataset_info.copy()  # Preserve all original metadata
            local_dataset['files'] = local_files
            # Add local processing metadata
            local_metadata = local_dataset.get('metadata', {}).copy()
            local_metadata['local_processing'] = {
                'source_json': source_json,
                'download_directory': os.path.abspath(args.download_dir),
                'generated_timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'original_file_count': str(len(original_files)),
                'local_file_count': str(len([f for f in local_files if not f.startswith('#')])),
                'missing_file_count': str(len(dataset_missing))
            }
            if args.organized_by_dataset:
                local_metadata['local_processing']['organized_by_dataset'] = True
            local_dataset['metadata'] = local_metadata
            local_data[dataset_name] = local_dataset
        found_count = len([f for f in local_files if not f.startswith('#')])
        print(f" Files found: {found_count}/{len(original_files)}")

    # Write local JSON file
    try:
        with open(args.output, 'w') as f:
            json.dump(local_data, f, indent=2)
        print(f"\nLocal JSON file written: {args.output}")
    except Exception as e:
        print(f"Error writing {args.output}: {e}")
        sys.exit(1)

    # Print summary
    print("\n" + "="*60)
    print("LOCAL JSON GENERATION SUMMARY:")
    print("="*60)
    print(f"Source JSON: {source_json}")
    print(f"Output JSON: {args.output}")
    print(f"Total files processed: {total_files_processed}")
    print(f"Files found locally: {total_files_found}")
    print(f"Files missing: {total_files_missing}")
    # Fix: guard against ZeroDivisionError when the source catalogue
    # contained no file entries at all.
    if total_files_processed:
        print(f"Success rate: {(total_files_found/total_files_processed)*100:.1f}%")
    else:
        print("Success rate: n/a (no files processed)")
    print(f"Datasets in output: {len(local_data)}")

    # Report missing files
    if missing_files_report:
        print(f"\nMISSING FILES ({len(missing_files_report)}):")
        print("-" * 50)
        # Group by dataset
        missing_by_dataset = {}
        for missing in missing_files_report:
            dataset = missing['dataset']
            if dataset not in missing_by_dataset:
                missing_by_dataset[dataset] = []
            missing_by_dataset[dataset].append(missing)
        for dataset, missing_list in missing_by_dataset.items():
            print(f" {dataset}: {len(missing_list)} missing files")
        # Write detailed missing files report
        missing_report_file = f"missing_files_{args.output.replace('.json', '.txt')}"
        with open(missing_report_file, 'w') as f:
            f.write("MISSING FILES REPORT\n")
            f.write("=" * 50 + "\n")
            f.write(f"Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Source JSON: {source_json}\n")
            f.write(f"Download directory: {os.path.abspath(args.download_dir)}\n")
            f.write(f"Total missing files: {len(missing_files_report)}\n\n")
            for dataset, missing_list in missing_by_dataset.items():
                f.write(f"\nDATASET: {dataset} ({len(missing_list)} missing)\n")
                f.write("-" * 40 + "\n")
                for missing in missing_list:
                    f.write(f"File: {missing['filename']}\n")
                    f.write(f"URL: {missing['original_url']}\n")
                    f.write("-" * 20 + "\n")
        print(f"Detailed missing files report: {missing_report_file}")

    # Report size mismatches
    if size_mismatches:
        print(f"\nSIZE ISSUES ({len(size_mismatches)}):")
        print("-" * 50)
        for mismatch in size_mismatches:
            print(f" {mismatch['dataset']}/{mismatch['file']}: {mismatch['issue']}")

    print("\nLocal QCD.json file is ready for use!")
    print("All file paths are absolute and point to your local downloads.")
# Entry-point guard.  `time` is imported at module level (not here) so that
# main() also works when this module is imported and called directly.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment