@DonRichards · Created July 2, 2024 14:19
Processes multiple JSON files provided by S3 when copying a bucket, extracting and displaying unique `RelativePath` and `SrcChecksum` pairs.
"""
This script processes multiple JSON files to extract and display unique
`RelativePath` and `SrcChecksum` pairs.

1. Loads JSON files from a specified directory.
2. Extracts `RelativePath` and `SrcChecksum` from the `Verified` list in each JSON file.
3. Tracks unique `RelativePath` values and their corresponding `SrcChecksum` to avoid duplicates.
4. Identifies and logs instances where the same `RelativePath` has different `SrcChecksum` values.
5. Displays the unique records in a tabular format and shows the total count of unique files.
6. Alerts if there are duplicate paths with different checksums.

Todo: Utilize this to check that migrated files match the hashes.
"""
import os
import json

import pandas as pd
def load_json_files(file_paths):
    records = []
    seen_paths = {}
    duplicates_with_diff_checksum = []
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            data = json.load(file)
        # Extract Verified records
        for item in data.get('Verified', []):
            relative_path = item.get('RelativePath')
            src_checksum = item.get('SrcChecksum')
            if relative_path in seen_paths:
                if seen_paths[relative_path] != src_checksum:
                    duplicates_with_diff_checksum.append({
                        'RelativePath': relative_path,
                        'ExistingChecksum': seen_paths[relative_path],
                        'NewChecksum': src_checksum
                    })
            else:
                seen_paths[relative_path] = src_checksum
                records.append({
                    'RelativePath': relative_path,
                    'SrcChecksum': src_checksum
                })
    return records, duplicates_with_diff_checksum
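

# A minimal sketch of the Todo in the docstring, not part of the original
# workflow: re-hash migrated files and compare them against the SrcChecksum
# values collected by load_json_files(). It assumes the checksums are
# hex-encoded SHA-256 digests and that migrated files live under
# `migrated_root` at the same RelativePath; adjust both assumptions to match
# the actual manifest format. The import is kept local to the sketch.
import hashlib


def verify_migrated_files(records, migrated_root):
    mismatches = []
    for record in records:
        local_path = os.path.join(migrated_root, record['RelativePath'])
        if not os.path.isfile(local_path):
            mismatches.append({'RelativePath': record['RelativePath'],
                               'Problem': 'file missing'})
            continue
        digest = hashlib.sha256()
        with open(local_path, 'rb') as f:
            # Hash in 1 MiB chunks so large files are not loaded into memory.
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(chunk)
        if digest.hexdigest() != record['SrcChecksum']:
            mismatches.append({'RelativePath': record['RelativePath'],
                               'Problem': 'checksum mismatch'})
    return mismatches
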
def display_data(records, duplicates_with_diff_checksum):
    df = pd.DataFrame(records)
    pd.set_option('display.max_colwidth', None)
    print(df)
    print("\nTotal number of unique files:", len(records))
    if duplicates_with_diff_checksum:
        print("\nDuplicates with different checksums:")
        df_duplicates = pd.DataFrame(duplicates_with_diff_checksum)
        print(df_duplicates)
def main():
    # Specify the directory containing JSON files
    json_directory = '/path/prod-bucket-manifest/'
    # Files within the folder follow this rough naming convention:
    # exec-08765432asdf.files-transferred-v1-00001-112456789013457.json
    # exec-08765432asdf.files-transferred-v1-00001-11245678ed55yt6.json
    # exec-08765432asdf.files-transferred-v1-00001-112453ev67ytre3.json
    # List all JSON files in the directory
    json_files = [os.path.join(json_directory, f)
                  for f in os.listdir(json_directory) if f.endswith('.json')]
    # Load JSON files and extract records
    records, duplicates_with_diff_checksum = load_json_files(json_files)
    # Display the records and total count
    display_data(records, duplicates_with_diff_checksum)
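    # Hypothetical follow-up using the verify_migrated_files() sketch above;
    # '/path/to/migrated/' is a placeholder, not part of the original script.
    # mismatches = verify_migrated_files(records, '/path/to/migrated/')
    # if mismatches:
    #     print("\nMigrated files that do not match their source checksums:")
    #     print(pd.DataFrame(mismatches))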


if __name__ == '__main__':
    main()