Skip to content

Instantly share code, notes, and snippets.

@moyix
Created February 18, 2022 02:53
Show Gist options
  • Save moyix/c0e77fe96b0570d8a5a72a29399296f2 to your computer and use it in GitHub Desktop.
Save moyix/c0e77fe96b0570d8a5a72a29399296f2 to your computer and use it in GitHub Desktop.
Geez this is awful
#!/bin/bash
#
# getrepo.sh - clone one GitHub repository, check whether its HEAD commit is
# already recorded in a SQLite table of known hashes, and if it is new, record
# all of its commit hashes and archive the working tree to slow storage.
#
# Usage: getrepo.sh name url jobslot
#   name     "user/repo", or a single word (in which case url is used instead)
#   url      full https://github.com/... URL; only consulted when name has no "/"
#   jobslot  suffix selecting the stamp file stamps.<jobslot>.txt
#
# Once the log directory exists, all stdout/stderr of this script is written
# to <logs>/<f>/<s>/<user>/<repo>/clone.log, where <f>/<s> are the lowercased
# first and second characters of the user name.
#
# Exit status: 0 when the repository was archived or recognised as already
# seen (its URL is then appended to the stamp file); non-zero on any failure.

# Value handed to sqlite3's ".timeout" dot-command on every invocation.
DB_TIMEOUT=10000

if [[ $# -ne 3 ]]; then
  echo "getrepo.sh name url jobslot" >&2
  exit 1
fi
jobslot="$3"

if [[ "$1" == *"/"* ]]; then
  user=$(dirname "$1")
  repo=$(basename "$1")
  url=https://github.com/"$1"
else
  # Some weird ones are single words; fall back to the explicit URL.
  url="$2"
  user=$(echo "$url" | awk -F/ '{ print $(NF-1) }')
  repo=$(echo "$url" | awk -F/ '{ print $(NF) }')
  stem=$(echo "$url" | cut -d/ -f1-3)
  if [[ "$stem" != "https://github.com" ]]; then
    echo "Something wonky with this one, aborting: $1 $2" >&2
    exit 1
  fi
fi
if [[ -z "$user" || -z "$repo" ]]; then
  exit 1
fi

# Shard directories by the lowercased first two characters of the user name.
fl=${user:0:1}
sl=${user:1:1}
fl=${fl,,}
sl=${sl,,}
dir="${fl}/${sl}/${user}/${repo}"

logdir="/fastdata/github_repos/logs/${dir}"
mkdir -p "${logdir}" || exit 1
# From here on, everything (stdout and stderr) goes to the per-repo log.
exec &> "${logdir}/clone.log" || exit 1
echo "Will clone ${url} into ${dir}"

cachedir="/fastdata/github_repos/revdb"
hashdb="${cachedir}/refs.db"
stampdir="/fastdata/github_repos/stamps"
stamp="${stampdir}/stamps.${jobslot}.txt"
FAST="/fastdata/github_repos/repos"
SLOW="/data/research/github"

cd "${FAST}" || exit 1
mkdir -p "${dir}"
# Rewrite ssh://git@github.com/ URLs to HTTPS for this clone.
# The clone's exit status is deliberately not checked here: a clone that left
# no usable HEAD is caught by the rev-parse check below.
git -c url.https://github.com/.insteadOf=ssh://git@github.com/ \
  clone --filter=blob:limit=1k "${url}" "${dir}"

# Have we seen this repo already?
pushd "${dir}" || exit 1
# --verify prints nothing on failure; plain `git rev-parse HEAD` echoes the
# literal word "HEAD" for a repository with no commits, which would slip past
# the emptiness check and be treated as a hash.
tophash=$(git rev-parse --verify HEAD)
if [[ -z "$tophash" ]]; then
  # Something has gone wrong here
  echo "Couldn't retrieve hash for ${url} in ${dir}"
  popd || exit 1
  rm -rf -- "${dir:?}"
  exit 1
fi

match=$(sqlite3 -init <(echo .timeout "${DB_TIMEOUT}") -batch "${hashdb}" \
  "SELECT EXISTS(SELECT 1 FROM refs WHERE hash = '${tophash}')")
sqlrv=$?
if [[ $sqlrv -ne 0 ]]; then
  echo "Error from SQLite: ${sqlrv}"
  exit 1
fi

if [[ "$match" == "1" ]]; then
  echo "Already seen hash: ${tophash}"
  popd || exit 1
  rm -rf -- "${dir:?}"
else # It's new!
  # Record every commit hash of this repo as "hash,user,repo" rows in refs.
  git log --pretty=format:%H,"${user}","${repo}" \
    | sqlite3 -init <(echo .timeout "${DB_TIMEOUT}") -batch -csv "${hashdb}" \
      ".import '|cat -' refs"
  import_status=("${PIPESTATUS[@]}")
  if [[ ${import_status[0]} -ne 0 ]]; then
    # Without this check a failing git log would import a truncated list.
    echo "Error from git log: ${import_status[0]}"
    exit 1
  fi
  sqlrv=${import_status[1]}
  if [[ $sqlrv -ne 0 ]]; then
    echo "Error from SQLite: ${sqlrv}"
    exit 1
  fi

  # Move it to slow storage
  mkdir -p "${SLOW}/${dir}"
  popd || exit 1
  # Remove the .git directory first
  rm -rf -- "${dir:?}/.git"
  # Tar it up
  tar -cf - "${dir}" | pigz > "${SLOW}/${dir}/repo.tar.gz"
  archive_status=("${PIPESTATUS[@]}")
  if [[ ${archive_status[0]} -ne 0 || ${archive_status[1]} -ne 0 ]]; then
    # Never delete the only copy when the archive was not written correctly.
    # The hashes are already in the database at this point, so the working
    # tree is left in fast storage for manual recovery.
    echo "Failed to archive ${dir} (tar=${archive_status[0]}, pigz=${archive_status[1]})"
    rm -f -- "${SLOW}/${dir}/repo.tar.gz"
    exit 1
  fi
  rm -rf -- "${dir:?}"
fi

# Mark this URL as processed for this job slot.
echo "${url}" >> "${stamp}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment