@vmarkovtsev
Last active April 10, 2023 16:24
import gzip
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
# number of lines, counted beforehand with: zcat commits.csv.gz | wc -l
size = 485226041
hashes = np.zeros(size, dtype="S20")     # raw 20-byte commit hashes
authors = np.zeros(size, dtype="u4")     # author ids as uint32
committers = np.zeros(size, dtype="u4")  # committer ids as uint32
# stream line by line instead of pd.read_csv to keep memory usage low
with gzip.open("commits.csv.gz") as fin:
    for i, line in enumerate(tqdm(fin, total=size)):
        parts = line.decode().split(",")
        # parts[1] is the quoted hex hash; store it as raw 20 bytes
        hashes[i] = np.frombuffer(bytes.fromhex(parts[1][1:-1]), dtype="S20")
        authors[i] = int(parts[2])
        committers[i] = int(parts[3])
# sort by commit hash so the lookups below can use np.searchsorted
order = np.argsort(hashes)
hashes = hashes[order]
authors = authors[order]
committers = committers[order]
# load the CSV from BigQuery
# in fact, I had to merge 6 separate CSV chunks here because BigQuery
# exports to Google Drive are limited to files smaller than 1 GB
bq_df = pd.read_csv("bigquery-results.csv.gz")
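# if the chunks are kept separate, they can also be merged on the fly,
# for example (chunk file names below are hypothetical):
#   import glob
#   bq_df = pd.concat((pd.read_csv(f) for f in sorted(glob.glob("bigquery-results-*.csv.gz"))),
#                     ignore_index=True)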
# a dict keyed by the commit hashes would consume too much memory,
# so we get away with np.searchsorted over the sorted hashes
people = defaultdict(set)          # author/committer id -> set of names seen for it
acc_membership = defaultdict(set)  # acc (the f0_ column) -> set of author/committer ids
for author, committer, commit, acc in tqdm(zip(
        bq_df["name"].values, bq_df["name_1"].values,
        bq_df["commit"].values, bq_df["f0_"].values),
        total=len(bq_df)):
    h = bytes.fromhex(commit)
    # binary search in the sorted hashes; skip commits absent from commits.csv.gz
    index = np.searchsorted(hashes, h)
    if index >= len(hashes) or hashes[index] != h:
        continue
    author_id = authors[index]
    committer_id = committers[index]
    # check isinstance(..., str) to avoid inserting NaNs coming from empty CSV cells
    if isinstance(author, str):
        people[author_id].add(author)
    if isinstance(committer, str):
        people[committer_id].add(committer)
    acc_membership[acc].add(author_id)
    acc_membership[acc].add(committer_id)
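
# illustration of the lookup trick on toy data (values below are made up):
# np.searchsorted returns the insertion position in a *sorted* array, so after
# sorting `hashes`, a membership test is a binary search plus one equality check
toy_hashes = np.array([b"a" * 20, b"c" * 20, b"e" * 20], dtype="S20")  # already sorted
for probe in (b"c" * 20, b"d" * 20):
    idx = np.searchsorted(toy_hashes, probe)
    hit = idx < len(toy_hashes) and toy_hashes[idx] == probe
    print(probe[:1], "found" if hit else "not found")  # prints: b'c' found, b'd' not found
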
@jmkehayias

Can you tell me what the commits_to_find and found_indexes objects are supposed to be? I am trying to learn through examples like this and really appreciate you sharing it openly online, but I can't figure out those two objects, so I can't get to the point of playing around with string matching algorithms.

@vmarkovtsev
Author

Hi @jmkehayias

commits_to_find is an np.array(dtype="S20") of 20-byte hashes. I think I missed a rename from hashes, and the line index = found_indexes[index] should be removed. Sorry for the raw code.
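
With the rename applied, the lookup inside the loop is just (mirroring the code above):

    index = np.searchsorted(hashes, h)
    if index >= len(hashes) or hashes[index] != h:
        continue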

I fixed it.
