import gzip
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

# number of lines, calculated with: zcat commits.csv.gz | wc -l
size = 485226041
hashes = np.zeros(size, dtype="S20")
authors = np.zeros(size, dtype="u4")
committers = np.zeros(size, dtype="u4")
# read line by line instead of pd.read_csv to keep memory usage low
with gzip.open("commits.csv.gz") as fin:
    for i, line in enumerate(tqdm(fin, total=size)):
        parts = line.decode().split(",")
        # parts[1] is the quoted hex commit hash -> 20 raw bytes
        hashes[i] = np.frombuffer(bytes.fromhex(parts[1][1:-1]), dtype="S20")
        authors[i] = int(parts[2])
        committers[i] = int(parts[3])
# sort by commit hash so that the lookups below can binary-search
order = np.argsort(hashes)
hashes = hashes[order]
authors = authors[order]
committers = committers[order]
# load the CSV exported from BigQuery
# actually, I had to merge 6 separate CSV chunks here due to
# BigQuery output limitations on Google Drive (less than 1GB each)
bq_df = pd.read_csv("bigquery-results.csv.gz")
# a dict keyed by commit hash would consume too much memory,
# so we get away with np.searchsorted over the sorted hashes
people = defaultdict(set)
acc_membership = defaultdict(set)
for author, committer, commit, acc in tqdm(zip(
        bq_df["name"].values, bq_df["name_1"].values,
        bq_df["commit"].values, bq_df["f0_"].values),
        total=len(bq_df)):
    h = bytes.fromhex(commit)
    index = np.searchsorted(hashes, h)
    if index >= len(hashes) or hashes[index] != h:
        continue
    author_id = authors[index]
    committer_id = committers[index]
    # check isinstance(..., str) to avoid inserting NaNs from BigQuery
    if isinstance(author, str):
        people[author_id].add(author)
    if isinstance(committer, str):
        people[committer_id].add(committer)
    acc_membership[acc].add(author_id)
    acc_membership[acc].add(committer_id)
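A note on the lookup loop above: instead of building a Python dict over roughly 485 million 20-byte hashes, which would need far more RAM, the script sorts the hashes once and answers "is this commit present, and at which index?" with np.searchsorted, i.e. a binary search. A minimal, self-contained sketch of that pattern (the toy keys below are made up for illustration):

import numpy as np

# toy "database" of fixed-width byte keys, standing in for the sorted hashes array
keys = np.sort(np.array([b"eeee", b"aaaa", b"cccc"], dtype="S4"))

def lookup(sorted_keys, key):
    # binary search; returns the index if the key is present, otherwise None
    index = np.searchsorted(sorted_keys, key)
    if index < len(sorted_keys) and sorted_keys[index] == key:
        return index
    return None

print(lookup(keys, b"cccc"))  # 1
print(lookup(keys, b"bbbb"))  # None; the loop above would `continue`

This trades O(1) dict lookups for O(log n) binary searches, but the sorted array stores the 20-byte hashes contiguously with no per-entry Python object overhead.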
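As a side note, the hard-coded size above comes from the shell one-liner in the comment; if you prefer to stay in Python, the same count can be obtained (slowly, since it streams the whole file once) with something like:

import gzip

# count the lines of the gzipped CSV to preallocate the arrays
with gzip.open("commits.csv.gz") as fin:
    size = sum(1 for _ in fin)
print(size)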
@jmkehayias asked: Can you tell me what the commits_to_find and found_indexes objects are supposed to be? I am trying to learn from examples like this, and I really appreciate you sharing this openly online, but I can't figure out those two objects, so I can't get to the point of playing around with string matching algorithms.

@vmarkovtsev replied: Hi @jmkehayias, commits_to_find is an np.array(dtype="S20") of 20-byte hashes; I missed a rename from hashes. The line index = found_indexes[index] should also be removed. Sorry for the raw code. I fixed it.
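For anyone else reproducing this: the hashes array (commits_to_find in the earlier revision) simply holds the raw 20-byte SHA-1 digests of the commits, obtained by hex-decoding the textual hashes. A small illustration with a made-up hash:

import numpy as np

hex_hash = "00" * 19 + "ff"            # a made-up 40-character hex commit hash
raw = bytes.fromhex(hex_hash)          # 20 raw bytes, like each hashes[i] above
arr = np.array([raw], dtype="S20")     # fixed-width byte strings: compact, sortable, searchable
print(arr.dtype, len(raw))             # |S20 20

Sorting such an array and probing it with np.searchsorted, as in the script, is what replaces a dict keyed by commit hash.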