Created
February 18, 2022 02:53
-
-
Save moyix/c0e77fe96b0570d8a5a72a29399296f2 to your computer and use it in GitHub Desktop.
Geez this is awful
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Usage: getrepo.sh name url jobslot
#
# Clones one GitHub repository into fast storage and looks up its HEAD commit
# in the refs database. If the commit is already recorded, the clone is
# discarded. Otherwise every commit hash of the repository is imported into
# the database and the working tree (without .git) is archived to slow
# storage as repo.tar.gz. In both cases the URL is then appended to the
# per-jobslot stamp file; error paths exit non-zero without stamping.
#
# Arguments:
#   name    - "user/repo"; when it contains no "/", user and repo are taken
#             from the last two path components of url instead
#   url     - repository URL; only consulted when name contains no "/"
#   jobslot - selects the stamp file stamps.<jobslot>.txt
#
# After argument validation, stdout and stderr are redirected to
# /fastdata/github_repos/logs/<dir>/clone.log.

# Value handed to the sqlite3 shell's ".timeout" command (milliseconds).
readonly DB_TIMEOUT=10000

if [[ $# -ne 3 ]]; then
  echo "getrepo.sh name url jobslot"
  exit 1
fi

jobslot="$3"

if [[ "$1" == *"/"* ]]; then
  user=$(dirname "$1")
  repo=$(basename "$1")
  url=https://github.com/"$1"
else
  # Some weird ones are single words
  url="$2"
  user=$(echo "$url" | awk -F/ '{ print $(NF-1) }')
  repo=$(echo "$url" | awk -F/ '{ print $(NF) }')
  stem=$(echo "$url" | cut -d/ -f1-3)
  if [[ "$stem" != "https://github.com" ]]; then
    echo "Something wonky with this one, aborting: $1 $2"
    # Aborting is a failure; report it as one to the caller.
    exit 1
  fi
fi

if [[ -z "$user" || -z "$repo" ]]; then
  exit 1
fi

# user and repo become path components of a directory that is later removed
# with rm -rf, so refuse anything that could climb out of the storage root.
if [[ "$user" == */* || "$user" == "." || "$user" == ".." ||
  "$repo" == "." || "$repo" == ".." ]]; then
  echo "Refusing unsafe user/repo path components: $1 $2"
  exit 1
fi

# Shard directories by the lowercased first and second characters of the
# user name: <f>/<s>/<user>/<repo>.
fl=${user:0:1}
sl=${user:1:1}
fl=${fl,,}
sl=${sl,,}
dir="${fl}/${sl}/${user}/${repo}"

logdir="/fastdata/github_repos/logs/${dir}"
mkdir -p "${logdir}" || exit 1
# From here on everything (stdout and stderr) goes to the per-repo log.
exec &> "$logdir"/clone.log || exit 1
echo "Will clone ${url} into ${dir}"

cachedir="/fastdata/github_repos/revdb"
hashdb="${cachedir}/refs.db"
stampdir="/fastdata/github_repos/stamps"
stamp="${stampdir}"/stamps.${jobslot}.txt
FAST="/fastdata/github_repos/repos"
SLOW="/data/research/github"

cd "${FAST}" || exit 1
mkdir -p "${dir}"
# insteadOf rewrites any ssh://git@github.com/ URL git encounters to HTTPS.
# A failed clone is detected just below, when HEAD cannot be resolved.
git -c url.https://github.com/.insteadOf=ssh://git@github.com/ \
  clone --filter=blob:limit=1k "${url}" "${dir}"

# Have we seen this repo already?
pushd "${dir}" || exit 1
# --verify prints nothing on failure; a plain "rev-parse HEAD" echoes the
# literal string "HEAD" in a repository without commits, which would slip
# past the emptiness check.
tophash=$(git rev-parse --verify HEAD)
if [[ -z "$tophash" ]]; then
  # Something has gone wrong here
  echo "Couldn't retrieve hash for ${url} in ${dir}"
  popd || exit
  rm -rf "${dir}"
  exit 1
fi

match=$(sqlite3 -init <(echo .timeout "${DB_TIMEOUT}") -batch "${hashdb}" \
  "SELECT EXISTS(SELECT 1 FROM refs WHERE hash = '${tophash}')")
sqlrv=$?
if [[ $sqlrv -ne 0 ]]; then
  echo "Error from SQLite: ${sqlrv}"
  exit 1
fi

if [[ $match -eq 1 ]]; then
  echo "Already seen hash: ${tophash}"
  popd || exit
  rm -rf "${dir}"
else # It's new!
  # Emit one "hash,user,repo" CSV row per commit and import the rows into
  # the refs table; ".import '|cat -'" makes sqlite3 read them from stdin.
  git log --pretty=format:%H,"${user}","${repo}" |
    sqlite3 -init <(echo .timeout "${DB_TIMEOUT}") -batch -csv "${hashdb}" \
      ".import '|cat -' refs"
  sqlrv=$?
  if [[ $sqlrv -ne 0 ]]; then
    echo "Error from SQLite: ${sqlrv}"
    exit 1
  fi

  # Move it to slow storage
  mkdir -p "${SLOW}"/"${dir}"
  popd || exit
  # Remove the .git directory first
  rm -rf "${dir}"/.git
  # Tar it up
  tar -cf - "${dir}" | pigz > "${SLOW}"/"${dir}"/repo.tar.gz
  archive_rv=("${PIPESTATUS[@]}")
  # Only delete the working tree once the archive is known to be good;
  # otherwise the sole copy of the checkout would be lost.
  if [[ ${archive_rv[0]} -ne 0 || ${archive_rv[1]} -ne 0 ]]; then
    echo "Archiving failed (tar: ${archive_rv[0]}, pigz: ${archive_rv[1]})"
    exit 1
  fi
  rm -rf "${dir}"
fi

echo "${url}" >> "${stamp}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment