@DonRichards · Created July 2, 2024 14:19
Processes multiple JSON files provided by S3 when copying a bucket, extracting and displaying unique `RelativePath` and `SrcChecksum` pairs.
"""
This script processes multiple JSON files to extract and display unique
`RelativePath` and `SrcChecksum` pairs.

1. Loads JSON files from a specified directory.
2. Extracts `RelativePath` and `SrcChecksum` from the `Verified` list in each JSON file.
3. Tracks unique `RelativePath` values and their corresponding `SrcChecksum` to avoid duplicates.
4. Identifies and logs instances where the same `RelativePath` has different `SrcChecksum` values.
5. Displays the unique records in a tabular format and shows the total count of unique files.
6. Alerts if there are duplicate paths with different checksums.

Todo: Utilize this to check that migrated files match the hashes.
"""
import os
import json

import pandas as pd
def load_json_files(file_paths):
    records = []
    seen_paths = {}
    duplicates_with_diff_checksum = []
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            data = json.load(file)
        # Extract Verified records
        for item in data.get('Verified', []):
            relative_path = item.get('RelativePath')
            src_checksum = item.get('SrcChecksum')
            if relative_path in seen_paths:
                if seen_paths[relative_path] != src_checksum:
                    duplicates_with_diff_checksum.append({
                        'RelativePath': relative_path,
                        'ExistingChecksum': seen_paths[relative_path],
                        'NewChecksum': src_checksum
                    })
            else:
                seen_paths[relative_path] = src_checksum
                records.append({
                    'RelativePath': relative_path,
                    'SrcChecksum': src_checksum
                })
    return records, duplicates_with_diff_checksum
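

# A minimal sketch of the Todo in the docstring, not part of the original
# workflow: re-hash migrated files and compare them against the SrcChecksum
# values collected by load_json_files(). It assumes the checksums are
# hex-encoded SHA-256 digests and that migrated files live under
# `migrated_root` at the same RelativePath; adjust both assumptions to match
# the actual manifest format. The import is kept local to the sketch.
import hashlib


def verify_migrated_files(records, migrated_root):
    mismatches = []
    for record in records:
        local_path = os.path.join(migrated_root, record['RelativePath'])
        if not os.path.isfile(local_path):
            mismatches.append({'RelativePath': record['RelativePath'],
                               'Problem': 'file missing'})
            continue
        digest = hashlib.sha256()
        with open(local_path, 'rb') as f:
            # Hash in 1 MiB chunks so large files are not loaded into memory.
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(chunk)
        if digest.hexdigest() != record['SrcChecksum']:
            mismatches.append({'RelativePath': record['RelativePath'],
                               'Problem': 'checksum mismatch'})
    return mismatches
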
def display_data(records, duplicates_with_diff_checksum):
    df = pd.DataFrame(records)
    pd.set_option('display.max_colwidth', None)
    print(df)
    print("\nTotal number of unique files:", len(records))
    if duplicates_with_diff_checksum:
        print("\nDuplicates with different checksums:")
        df_duplicates = pd.DataFrame(duplicates_with_diff_checksum)
        print(df_duplicates)
def main():
    # Specify the directory containing JSON files
    json_directory = '/path/prod-bucket-manifest/'
    # Files within the folder follow this rough naming convention:
    # exec-08765432asdf.files-transferred-v1-00001-112456789013457.json
    # exec-08765432asdf.files-transferred-v1-00001-11245678ed55yt6.json
    # exec-08765432asdf.files-transferred-v1-00001-112453ev67ytre3.json
    # List all JSON files in the directory
    json_files = [os.path.join(json_directory, f)
                  for f in os.listdir(json_directory) if f.endswith('.json')]
    # Load JSON files and extract records
    records, duplicates_with_diff_checksum = load_json_files(json_files)
    # Display the records and total count
    display_data(records, duplicates_with_diff_checksum)
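    # Hypothetical follow-up using the verify_migrated_files() sketch above;
    # '/path/to/migrated/' is a placeholder, not part of the original script.
    # mismatches = verify_migrated_files(records, '/path/to/migrated/')
    # if mismatches:
    #     print("\nMigrated files that do not match their source checksums:")
    #     print(pd.DataFrame(mismatches))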


if __name__ == '__main__':
    main()