Created July 2, 2024 14:19
Processes the manifest JSON files that S3 produces when copying a bucket, extracting and displaying unique `RelativePath` and `SrcChecksum` pairs.
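The manifests the script expects look roughly like the following (a hypothetical excerpt with made-up values; only the `Verified` list with its `RelativePath` and `SrcChecksum` keys matters to the script):

{
  "Verified": [
    {"RelativePath": "images/0001.tif", "SrcChecksum": "9f2c41d8cd98f00b"},
    {"RelativePath": "images/0002.tif", "SrcChecksum": "41d8cd98f00b204e"}
  ]
}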
import os
import json
import pandas as pd

"""
This script processes multiple JSON files to extract and display unique `RelativePath` and `SrcChecksum` pairs.

1. Loads JSON files from a specified directory.
2. Extracts `RelativePath` and `SrcChecksum` from the `Verified` list in each JSON file.
3. Tracks unique `RelativePath` and their corresponding `SrcChecksum` to avoid duplicates.
4. Identifies and logs instances where the same `RelativePath` has different `SrcChecksum` values.
5. Displays the unique records in a tabular format and shows the total count of unique files.
6. Alerts if there are duplicate paths with different checksums.

Todo: Utilize this to check that migrated files match the hashes.
"""

def load_json_files(file_paths):
    records = []
    seen_paths = {}
    duplicates_with_diff_checksum = []
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            data = json.load(file)
            # Extract Verified records
            for item in data.get('Verified', []):
                relative_path = item.get('RelativePath')
                src_checksum = item.get('SrcChecksum')
                if relative_path in seen_paths:
                    if seen_paths[relative_path] != src_checksum:
                        duplicates_with_diff_checksum.append({
                            'RelativePath': relative_path,
                            'ExistingChecksum': seen_paths[relative_path],
                            'NewChecksum': src_checksum
                        })
                else:
                    seen_paths[relative_path] = src_checksum
                    records.append({
                        'RelativePath': relative_path,
                        'SrcChecksum': src_checksum
                    })
    return records, duplicates_with_diff_checksum

def display_data(records, duplicates_with_diff_checksum):
    df = pd.DataFrame(records)
    pd.set_option('display.max_colwidth', None)
    print(df)
    print("\nTotal number of unique files:", len(records))
    if duplicates_with_diff_checksum:
        print("\nDuplicates with different checksums:")
        df_duplicates = pd.DataFrame(duplicates_with_diff_checksum)
        print(df_duplicates)

def main():
    # Specify the directory containing JSON files
    json_directory = '/path/prod-bucket-manifest/'
    # Files within the folder follow this rough naming convention:
    # exec-08765432asdf.files-transferred-v1-00001-112456789013457.json
    # exec-08765432asdf.files-transferred-v1-00001-11245678ed55yt6.json
    # exec-08765432asdf.files-transferred-v1-00001-112453ev67ytre3.json
    # List all JSON files in the directory
    json_files = [os.path.join(json_directory, f) for f in os.listdir(json_directory) if f.endswith('.json')]
    # Load JSON files and extract records
    records, duplicates_with_diff_checksum = load_json_files(json_files)
    # Display the records and total count
    display_data(records, duplicates_with_diff_checksum)

if __name__ == '__main__':
    main()
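The Todo in the docstring points at the natural next step: recompute a hash for each migrated copy and compare it against the recorded `SrcChecksum`. A minimal sketch of that check, assuming the manifests record hex-encoded SHA-256 digests and that the migrated files live under a local `dest_root` (both the algorithm and the `verify_migrated_files` helper are assumptions, not part of the original script):

import hashlib
import os

def verify_migrated_files(records, dest_root, chunk_size=1024 * 1024):
    # Hypothetical helper: compares each record's SrcChecksum against a
    # freshly computed digest of the matching file under dest_root.
    mismatches = []
    for record in records:
        local_path = os.path.join(dest_root, record['RelativePath'])
        digest = hashlib.sha256()  # assumption: SHA-256; use whatever algorithm the transfer tool recorded
        with open(local_path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                digest.update(chunk)
        if digest.hexdigest() != record['SrcChecksum']:
            mismatches.append({
                'RelativePath': record['RelativePath'],
                'Expected': record['SrcChecksum'],
                'Actual': digest.hexdigest()
            })
    return mismatches

Reading in 1 MiB chunks keeps memory flat for large objects; an empty mismatch list means every migrated file hashed to the value its manifest recorded.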