Skip to content

Instantly share code, notes, and snippets.

@DNGros
Created February 21, 2021 09:34
Show Gist options
  • Save DNGros/1a1bfaaac0544b1b53bf20a301d61b50 to your computer and use it in GitHub Desktop.
Save DNGros/1a1bfaaac0544b1b53bf20a301d61b50 to your computer and use it in GitHub Desktop.
CuBERT Manifest Dup Inspect
from pathlib import Path
from typing import Optional

import pandas as pd
from tqdm import tqdm
# Get the manifests from gs://cubert/20200621_Python/github_python_minus_ethpy150open_deduplicated_manifest
# Directory holding the downloaded manifest files; every entry directly inside
# it is read by get_manifest_df. Replace the placeholder with your local path.
cubert_py_manifests_root = Path("REPLACE_WITH_PATH_WHERE_DOWNLOAD_MANIFESTS")
def get_manifest_df(num_files_limit: Optional[int] = None) -> pd.DataFrame:
    """Load the manifest files into a single DataFrame.

    Every entry directly under ``cubert_py_manifests_root`` is read as a
    JSON-lines file (one record per line) and the per-file frames are
    concatenated in sorted path order.

    Args:
        num_files_limit: Maximum number of manifest files to read. ``None``
            (or ``0``) reads every file. Useful for a quick run on a subset.

    Returns:
        The concatenation of the records from the selected files. Each
        file's own row index is kept, so index labels repeat across files.

    Raises:
        ValueError: If no manifest files are selected (e.g. the root
            directory is empty).
    """
    # iterdir() yields entries in arbitrary order; sort so that a limited
    # read always picks the same files from run to run.
    all_manifests = sorted(cubert_py_manifests_root.iterdir())
    # A falsy limit means "no limit". Slicing already clamps to the list
    # length, so no explicit min() or sentinel cap is needed.
    taken_manifests = all_manifests[:num_files_limit or None]
    if not taken_manifests:
        # pd.concat would raise an opaque "No objects to concatenate";
        # fail with the same exception type but a message naming the cause.
        raise ValueError(
            f"No manifest files found in {cubert_py_manifests_root}"
        )
    return pd.concat(
        pd.read_json(fn, orient='records', lines=True)
        for fn in tqdm(taken_manifests, desc="Reading cubert manifests")
    )
if __name__ == "__main__":
    # Read every manifest file (no cap) and report the total record count.
    manifest_df = get_manifest_df(num_files_limit=None)
    print("Total Num In Manifest", len(manifest_df))
    print("Top 20 SHAs:")
    # Count how often each 'id' value occurs, then order the counts from
    # most to least frequent so duplicated ids surface first.
    id_counts = manifest_df['id'].value_counts()
    id_counts = id_counts.sort_values(ascending=False)
    print(id_counts[:20])
    # value_counts has one entry per distinct id, so its length is the
    # number of unique ids.
    print("Num unique ids", len(id_counts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment