Last active
April 10, 2023 16:24
-
-
Save vmarkovtsev/7b0216a6e23fcf879f87a10f0f421915 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip | |
from collections import defaultdict | |
import numpy as np | |
import pandas as pd | |
from tqdm import tqdm | |
# Number of rows in commits.csv.gz. Measure it up front with
#   zcat commits.csv.gz | wc -l
size = 485226041
# Preallocate flat arrays: 20 raw bytes per commit hash plus two
# unsigned 32-bit identity ids per commit.
hashes = np.zeros(size, dtype="S20")
authors = np.zeros(size, dtype="u4")
committers = np.zeros(size, dtype="u4")
# Read line by line and don't use pd.read_csv for less memory.
n_rows = 0
with gzip.open("commits.csv.gz") as fin:
    for i, line in enumerate(tqdm(fin, total=size)):
        parts = line.decode().split(",")
        # parts[1] is the hex hash wrapped in one extra character on each
        # side (presumably quotes - TODO confirm); store the raw bytes.
        hashes[i] = np.frombuffer(bytes.fromhex(parts[1][1:-1]), dtype="S20")
        authors[i] = int(parts[2])
        committers[i] = int(parts[3])
        n_rows = i + 1
# `size` is measured by hand, so the file may hold fewer rows than were
# preallocated. Drop the unused zero-filled tail so that placeholder rows
# never end up in the sorted lookup table.
if n_rows < size:
    hashes = hashes[:n_rows]
    authors = authors[:n_rows]
    committers = committers[:n_rows]
# Sort by commit hash so that np.searchsorted can be used for lookups;
# the id arrays are permuted the same way to stay aligned with `hashes`.
order = np.argsort(hashes)
hashes = hashes[order]
authors = authors[order]
committers = committers[order]
# Load the CSV exported from BigQuery.
# Actually, 6 separate CSV chunks had to be merged here due to
# BigQuery output limitations on Google Drive (less than 1GB).
bq_df = pd.read_csv("bigquery-results.csv.gz")
# A dict keyed by commit hash would consume too much memory,
# so we get away with np.searchsorted over the sorted `hashes` array.
# people:         identity id -> set of names seen for that id
# acc_membership: value of the "f0_" column -> set of identity ids
people = defaultdict(set)
acc_membership = defaultdict(set)
for author, committer, commit, acc in tqdm(
    zip(
        bq_df["name"].values,
        bq_df["name_1"].values,
        bq_df["commit"].values,
        bq_df["f0_"].values,
    ),
    total=len(bq_df),
):
    h = bytes.fromhex(commit)
    index = np.searchsorted(hashes, h)
    # searchsorted returns len(hashes) when h sorts after every stored
    # hash; indexing with that would raise IndexError.
    if index >= len(hashes):
        continue
    # NumPy strips trailing NUL bytes when an "S20" item is read back,
    # so pad the stored value to the query length before comparing;
    # otherwise hashes ending in 0x00 would never match.
    if hashes[index].ljust(len(h), b"\x00") != h:
        continue
    # Bind both ids unconditionally: they come from the commits table and
    # are valid even when the BigQuery name is missing. Binding them only
    # inside the isinstance checks would leave a stale id from a previous
    # row (or an undefined name) for the membership update below.
    author_id = authors[index]
    committer_id = committers[index]
    # check isinstance(..., str) to avoid inserting NANs
    if isinstance(author, str):
        people[author_id].add(author)
    if isinstance(committer, str):
        people[committer_id].add(committer)
    acc_membership[acc].add(author_id)
    acc_membership[acc].add(committer_id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @jmkehayias,
`commits_to_find` is an `np.array(dtype="S20")` - 20-byte hashes.
I think that I missed a rename from `hashes`.
And the line `index = found_indexes[index]` should be removed.
Sorry for the raw code. I fixed it.