Created
February 21, 2021 09:34
-
-
Save DNGros/1a1bfaaac0544b1b53bf20a301d61b50 to your computer and use it in GitHub Desktop.
CuBERT Manifest Dup Inspect
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path
from typing import Optional

import pandas as pd
from tqdm import tqdm
# Directory holding the downloaded CuBERT manifest files. Get the manifests from
# gs://cubert/20200621_Python/github_python_minus_ethpy150open_deduplicated_manifest
# and replace the placeholder below with the local directory they were saved to.
cubert_py_manifests_root = Path("REPLACE_WITH_PATH_WHERE_DOWNLOAD_MANIFESTS")
def get_manifest_df(num_files_limit: Optional[int] = None) -> pd.DataFrame:
    """Read the manifest files under ``cubert_py_manifests_root`` into one frame.

    Every entry of the directory is parsed as a JSON-lines file (one record
    per line) and the per-file frames are concatenated in file order. Row
    indices are left as produced by each file, so they may repeat.

    Args:
        num_files_limit: Maximum number of manifest files to read. ``None``
            reads every file; ``0`` reads none.

    Returns:
        The concatenated records, or an empty ``DataFrame`` when the limit
        selects no files.

    Raises:
        ValueError: If the manifest directory contains no entries.
    """
    # iterdir() yields entries in arbitrary order; sort so that a limited
    # read always selects the same files from run to run.
    all_manifests = sorted(cubert_py_manifests_root.iterdir())
    if not all_manifests:
        raise ValueError(
            f"No manifest files found in {cubert_py_manifests_root}"
        )

    # Slicing with None keeps everything, so the limit needs no sentinel
    # value and an explicit 0 is honoured rather than meaning "no limit".
    taken_manifests = all_manifests[:num_files_limit]
    if not taken_manifests:
        return pd.DataFrame()

    return pd.concat(
        pd.read_json(fn, orient='records', lines=True)
        for fn in tqdm(taken_manifests, desc="Reading cubert manifests")
    )
if __name__ == "__main__":
    # Load every manifest (no file cap) and report how often each id repeats.
    manifest_df = get_manifest_df(num_files_limit=None)
    print("Total Num In Manifest", len(manifest_df))

    print("Top 20 SHAs:")
    # Occurrences per distinct id, most frequent first.
    id_counts = manifest_df['id'].value_counts()
    id_counts = id_counts.sort_values(ascending=False)
    print(id_counts.head(20))

    # One row per distinct id, so the length is the unique-id count.
    print("Num unique ids", len(id_counts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment