Skip to content

Instantly share code, notes, and snippets.

@db48x
Last active December 18, 2016 19:42
Show Gist options
  • Save db48x/66f41a91120266fe195aacec4c5531c2 to your computer and use it in GitHub Desktop.
Save db48x/66f41a91120266fe195aacec4c5531c2 to your computer and use it in GitHub Desktop.
work-in-progress script for updating an iabak shard
#!/bin/bash
# Usage:
# ./add-shardmeta-branch.sh <shard> <collection> [<collection> ...]
# <shard> is the path of an existing shard (a git repository)
# <collection> is the name of any collection of items on IA
#
# This script updates an existing shard so that it has a
# shardmeta branch with a get_all_items.sh script.
set -e
if [[ $# -lt 1 ]]; then
  # Without this check the "shift" below would abort silently under set -e.
  echo "usage: ${0} <shard> <collection> [<collection> ...]" >&2
  exit 1
fi
shard=${1}
shift
# Build the search expression "collection:a OR collection:b ...".
criteria=
for c in "$@"; do
  if [[ -n "${criteria}" ]]; then
    criteria+=" OR "
  fi
  criteria+="collection:${c}"
done
if [[ ! -d "${shard}/.git" ]]; then
  echo "${shard} doesn't exist" >&2
  exit 1
fi
if [[ -z "${criteria}" ]]; then
  echo "you must specify at least one collection to put in the shard" >&2
  exit 1
fi
scriptdir="$(readlink --canonicalize-existing "$(dirname "${0}")")"
pushd "${shard}"
# Ask git for the exact branch ref; grepping "git branch" output would also
# match any branch whose name merely contains "shardmeta".
if git show-ref --verify --quiet refs/heads/shardmeta; then
  echo "${shard} already has a shardmeta branch; skipping it"
  exit 0
fi
git checkout --orphan shardmeta
# "&>-" would create a file literally named "-"; discard output instead.
# Failure is tolerated on purpose (there may be nothing to remove).
git rm -rf . &>/dev/null || true
echo "*~" > .gitignore
echo "\#*" >> .gitignore
# The heredoc is unquoted so that ${criteria} is expanded now and the
# generated script carries the literal query.
cat <<-EOF > get_all_items.sh
#!/bin/sh
ia search --itemlist "${criteria}"
EOF
chmod u+x get_all_items.sh
git add .gitignore get_all_items.sh
git commit -m "creating shard metadata branch"
git checkout master
popd
#!/bin/bash
# Test driver setup: abort on the first unexpected failure, locate this
# script's directory, create a scratch directory for the shard under test
# and zero the result counters.
set -e
scriptdir="$(readlink --canonicalize-existing "$(dirname "${0}")")"
sharddir="$(mktemp -d --suffix=.testshard)"
total=0
fail=0
pass=0
# exists FILE
# Counts one test. It passes when FILE is a symlink and
# "git-annex whereis FILE" succeeds; otherwise it fails.
# Prints "." (pass) or "x" (fail) without a newline and updates the
# global counters total/pass/fail. Also sets the global "filename".
function exists
{
  total=$((total+1))
  filename="${1}"
  # "&>-" redirects to a file literally named "-" (it is not a close);
  # send the command's output to /dev/null instead.
  if [[ -h "${filename}" ]] && git-annex whereis "${filename}" &>/dev/null; then
    echo -n .
    pass=$((pass+1))
  else
    echo -n x
    fail=$((fail+1))
  fi
}
# doesntexist FILE
# Counts one test that passes only when FILE is not a symlink.
# Prints "." or "x" without a newline and bumps total plus pass or fail.
doesntexist() {
  filename="${1}"
  total=$((total+1))
  if [[ -h "${filename}" ]]; then
    fail=$((fail+1))
    echo -n x
  else
    pass=$((pass+1))
    echo -n .
  fi
}
# exists_nourl FILE
# Counts one test. It passes when FILE is a symlink and
# "git-annex whereis FILE" fails; otherwise it fails.
# Prints "." (pass) or "x" (fail) without a newline and updates the
# global counters total/pass/fail. Also sets the global "filename".
function exists_nourl
{
  total=$((total+1))
  filename="${1}"
  # "&>-" redirects to a file literally named "-" (it is not a close);
  # send the command's output to /dev/null instead.
  if [[ -h "${filename}" ]] && ! git-annex whereis "${filename}" &>/dev/null; then
    echo -n .
    pass=$((pass+1))
  else
    echo -n x
    fail=$((fail+1))
  fi
}
# keyis FILE KEY
# Counts one test that passes when FILE is a symlink and
# "git-annex lookupkey FILE" prints exactly KEY. The lookup is only
# attempted for symlinks. Prints "." or "x" and updates the counters.
keyis() {
  total=$((total+1))
  filename="${1}"
  key="${2}"
  if [[ ! -h "${filename}" ]]; then
    echo -n x
    fail=$((fail+1))
    return
  fi
  if [[ "$(git-annex lookupkey "${filename}")" == "${key}" ]]; then
    echo -n .
    pass=$((pass+1))
  else
    echo -n x
    fail=$((fail+1))
  fi
}
# Test body: create a shard from the recorded "basic-testcase-first" data,
# check its contents, update it from "basic-testcase-second" and check
# again. Output of the helper commands is discarded with "&>/dev/null";
# the previous "&>-" wrote it to files literally named "-" in the current
# directory (including inside the shard under test).
pushd "$(dirname "${sharddir}")" &>/dev/null
echo "create shard"
"${scriptdir}/../make-shard.sh" -t "basic-testcase-first" "${sharddir}" softwarelibrary_win3_showcase &>/dev/null
pushd "${sharddir}" &>/dev/null
exists "softwarelibrary_win3_showcase/win311_masque_chessnet/CHESSNET.ZIP"
keyis "softwarelibrary_win3_showcase/win311_masque_chessnet/win311_masque_chessnet_meta.sqlite" "MD5-s12288--511557d1a393c0079dd782f2abdc8553"
exists "softwarelibrary_win3_showcase/win32c/win32c.zip"
doesntexist "softwarelibrary_win3_showcase/win3_MidMM211/MidMM211.zip"
echo
popd &>/dev/null
echo "update shard"
"${scriptdir}/../update-shard.sh" -t "basic-testcase-second" "${sharddir}" &>/dev/null
pushd "${sharddir}" &>/dev/null
exists_nourl "softwarelibrary_win3_showcase/win311_masque_chessnet/CHESSNET.ZIP"
keyis "softwarelibrary_win3_showcase/win311_masque_chessnet/win311_masque_chessnet_meta.sqlite" "MD5-s12288--000000000123456789abcdef01234567"
exists_nourl "softwarelibrary_win3_showcase/win32c/win32c.zip"
exists "softwarelibrary_win3_showcase/win3_MidMM211/MidMM211.zip"
echo
popd &>/dev/null
echo "${pass} of ${total} tests passed"
if [[ "${fail}" -gt 0 ]]; then
  # Keep the scratch shard around so the failure can be inspected.
  echo "You may wish to examine ${sharddir} to see what went wrong"
  exit 1;
fi
rm -rf "${sharddir}"
# Turns one item-metadata document into tab-separated rows.
# NOTE(review): presumably this is the get_item_updates.jq filter that
# update-shard.sh runs over `ia metadata` output -- confirm.
#
# Row shapes:
#   file <collection> <identifier> <key> <identifier>/<name>
#   dark <collection> <identifier>
#
# Assign identifier and collection to variables for use in final output.
.metadata.identifier as $i |
# Take the first element of .metadata.collection; when that yields nothing
# (an error, null or false) fall back to the whole value.
(.metadata.collection[0]? // .metadata.collection) as $c |
if .is_dark != true then
.files |
# Files whose source is "derivative" are left out.
map(select(.source != "derivative") |
# Names ending in "_files.xml" get the placeholder key "URL--"; all other
# files get a key built from their size and md5 fields.
(if .name | endswith("_files.xml") then
"URL--"
else
"MD5-s\(.size)--\(.md5)"
end) as $key |
["file", $c, $i, $key, "\($i)/\(.name)"])
else
# Dark item: a single row with no per-file columns.
[["dark", $c, $i]]
end |
# Render every row as TSV and emit the rows one at a time.
map(@tsv) | .[]
#!/bin/bash
# Usage:
# ./make-shard.sh [OPTION] <shard> <collection> [<collection> ...]
# -r NAME
# record all output from all calls to the 'ia' tool, for later
# use in test mode.
# -t NAME
# operate in test mode, reading the recorded output from the
# 'ia' tool rather than calling it.
#
# <shard> is the name of a new shard to create
# <collection> is the name of any collection of items on IA
#
# This script creates a new shard containing all items which appear in
# any of the named collections. To avoid overloading the resulting git
# repository, or the hard drives of users who may wish to download the
# complete shard, please avoid creating shards with more than ~100,000
# files or which use more than ~4TB of disk space.
testmode=
recordmode=
mockupname=
while getopts ":t:r:" opt; do
  case "${opt}" in
    t)
      testmode=1
      mockupname="${OPTARG}"
      ;;
    r)
      recordmode=1
      mockupname="${OPTARG}"
      ;;
    \?)
      echo "unknown option -${OPTARG}" >&2
      exit 128
      ;;
    :)
      echo "option -${OPTARG} requires an argument" >&2
      exit 128
      ;;
  esac
done
if [[ -n "${testmode}" ]] && [[ -n "${recordmode}" ]]; then
  echo "cannot enable both test mode and record mode at the same time" >&2
  exit 129
fi
shift $((OPTIND-1))
set -e
if [[ $# -lt 1 ]]; then
  # Without this check the "shift" below would abort silently under set -e.
  echo "usage: ${0} [-t NAME | -r NAME] <shard> <collection> [<collection> ...]" >&2
  exit 1
fi
shard=${1}
shift
# Build the search expression "collection:a OR collection:b ...".
criteria=
for c in "$@"; do
  if [[ -n "${criteria}" ]]; then
    criteria+=" OR "
  fi
  criteria+="collection:${c}"
done
if [[ -d "${shard}/.git" ]]; then
  echo "${shard} already exists" >&2
  exit 1
fi
if [[ -z "${criteria}" ]]; then
  echo "you must specify at least one collection to put in the shard" >&2
  exit 1
fi
scriptdir="$(readlink --canonicalize-existing "$(dirname "${0}")")"
mkdir -p "${shard}"
pushd "${shard}"
git init
git annex init
# The shardmeta branch only carries the script that lists the shard's items.
git checkout --orphan shardmeta
echo "*~" > .gitignore
echo "\#*" >> .gitignore
# The heredoc is unquoted so that ${criteria} is expanded now and the
# generated script carries the literal query.
cat <<-EOF > get_all_items.sh
#!/bin/sh
ia search --itemlist "${criteria}"
EOF
chmod u+x get_all_items.sh
git add .gitignore get_all_items.sh
git commit -m "creating shard metadata branch"
# Leave the repository on an empty master branch for update-shard.sh.
git checkout --orphan master
git rm -rf .
popd
# Forward the test/record option as separate words; an array keeps a
# NAME containing whitespace intact.
args=()
if [[ -n "${testmode}" ]]; then
  args=(-t "${mockupname}")
elif [[ -n "${recordmode}" ]]; then
  args=(-r "${mockupname}")
fi
"${scriptdir}/update-shard.sh" "${args[@]}" "${shard}"
#!/bin/bash
# update-shard.sh [OPTION] SHARD
#
# Brings a shard up to date by looking for:
#   - items uploaded since the shard was created
#   - new files added to existing items
#   - files that were modified (their hash changed)
#   - files that were removed from their item
#   - items that have gone dark since the shard was created
# Not handled yet:
#   - moving an item to another shard when it moves to a completely
#     different collection
#
# Options:
#   -r NAME  record all output from all calls to the 'ia' tool, for
#            later use in test mode.
#   -t NAME  operate in test mode, reading the recorded output from
#            the 'ia' tool rather than calling it.
#
# SHARD names a shard; a clone of it is expected in the current
# directory. The shard must have a shardmeta branch containing a
# get_all_items.sh script that lists the items belonging to the shard;
# not every shard has one yet, so it may have to be created first.
set -e
testmode=
recordmode=
mockupname=
while getopts ":t:r:" opt; do
  case "${opt}" in
    t) testmode=1; mockupname="${OPTARG}" ;;
    r) recordmode=1; mockupname="${OPTARG}" ;;
    \?)
      echo "unknown option -${OPTARG}"
      exit 128
      ;;
    :)
      echo "option -${OPTARG} requires an argument"
      exit 128
      ;;
  esac
done
# -t and -r both claim the mockup name, so they are mutually exclusive.
if [[ -n "${testmode}" && -n "${recordmode}" ]]; then
  echo "cannot enable both test mode and record mode at the same time"
  exit 129
fi
shift $((OPTIND-1))
shard="${1}"
# Absolute directory of this script; helper files are looked up under it.
scriptdir="$(readlink --canonicalize-existing "$(dirname "${0}")")"
# gitannex ARGS...
# Runs git-annex with --quiet followed by the given arguments.
gitannex() {
  git-annex --quiet "${@}"
}
# Location of the real 'ia' executable that the wrapper below hands to mock.
IA=/usr/local/bin/ia
# ia ARGS...
# Shadows the 'ia' command so every call goes through mock, labelled "ia".
ia() {
  mock "${IA}" ia "$@"
}
# mock PROG NAME ARGS...
# Runs PROG ARGS..., or replays/records its output.
# The recording lives at
#   ${scriptdir}/test/mocks/${mockupname}/NAME/ARG1/ARG2/...
#   - recording exists, or test mode is on: print the recording
#   - record mode is on: run PROG, print its output and save it
#   - otherwise: just run PROG
mock() {
  local prog="${1}"
  shift
  # NAME and the arguments together form the recording's path.
  local recording="${scriptdir}/test/mocks/${mockupname}/$(join "/" "${@}")"
  # Drop NAME; what remains are PROG's own arguments.
  shift
  if [[ -f "${recording}" || -n "${testmode}" ]]; then
    cat "${recording}"
    return
  fi
  if [[ -n "${recordmode}" ]]; then
    mkdir -p "$(dirname "${recording}")"
    "${prog}" "$@" | tee "${recording}"
    return
  fi
  "${prog}" "$@"
}
# join SEP WORD...
# Prints the WORDs as one line, separated by the first character of SEP.
join() {
  local IFS="${1}"
  shift
  local joined="${*}"
  echo "${joined}"
}
# applypairs NAME CMD A B
# A small xargs stand-in that never separates a pair of arguments.
# A and B are appended to the array called NAME and their lengths are
# added to the counter NAME_len. Once the counter exceeds 100 KiB the
# whole array is handed to "gitannex CMD --force" and both are reset.
applypairs() {
  local -n queue="${1}"
  local -n queued_bytes="${1}_len"
  local subcommand="${2}"
  queue+=("${3}" "${4}")
  queued_bytes=$((queued_bytes + ${#3} + ${#4}))
  if (( queued_bytes > 100 * 1024 )); then
    gitannex "${subcommand}" --force "${queue[@]}"
    queue=()
    queued_bytes=0
  fi
}
# pairfinish NAME CMD [EXTRA...]
# Flushes what applypairs left queued: if the counter NAME_len is above
# zero, runs "gitannex CMD --force EXTRA... <array NAME>" and resets both
# the array and the counter.
function pairfinish {
  declare -n arr="${1}"
  declare -n length="${1}_len"
  local cmd="${2}"
  shift 2
  # Keep EXTRA as an array. Flattening it into a string and re-splitting
  # on IFS merges several options into one word when IFS has no space
  # (this script sets IFS to a tab) and splits options containing spaces.
  local -a args=("${@}")
  if [[ ${length} -gt 0 ]]; then
    gitannex "${cmd}" --force "${args[@]}" "${arr[@]}"
    arr=()
    length=0
  fi
}
# applypipe NAME CMD A B
# Queues one record for a later batch run: appends the line "A B" to the
# file whose path is stored in the variable called NAME. CMD is accepted
# for symmetry with pipefinish but not used here.
applypipe() {
  local -n queue_file="${1}"
  printf '%s %s\n' "${3}" "${4}" >> "${queue_file}"
}
# pipefinish NAME CMD [EXTRA...]
# Feeds the queue file whose path is stored in the variable called NAME
# through pvn (labelled "git annex CMD") into
# "gitannex CMD --force EXTRA...", then deletes the queue file.
function pipefinish {
  declare -n file="${1}"
  local cmd="${2}"
  shift 2
  # Keep EXTRA as an array. Flattening it into a string and re-splitting
  # on IFS merges several options into one word when IFS has no space
  # (this script sets IFS to a tab) and splits options containing spaces.
  local -a args=("${@}")
  pvn "git annex ${cmd}" < "${file}" | gitannex "${cmd}" --force "${args[@]}"
  rm -- "${file}"
}
# One queue per git-annex subcommand. Each queue consists of
#   - a temporary file whose path is kept in an upper-case variable,
#   - a function that appends one two-field record via applypipe, and
#   - a "-finish" function that flushes the file via pipefinish.
# rekey and rmurl are flushed with --batch; fromkey and registerurl are not.

FROMKEY="$(mktemp --suffix=.fromkey)"
fromkey() { applypipe FROMKEY fromkey "${1}" "${2}"; }
fromkey-finish() { pipefinish FROMKEY fromkey; }

REKEY="$(mktemp --suffix=.rekey)"
rekey() { applypipe REKEY rekey "${1}" "${2}"; }
rekey-finish() { pipefinish REKEY rekey --batch; }

REGISTERURL="$(mktemp --suffix=.registerurl)"
registerurl() { applypipe REGISTERURL registerurl "${1}" "${2}"; }
registerurl-finish() { pipefinish REGISTERURL registerurl; }

RMURL="$(mktemp --suffix=.rmurl)"
rmurl() { applypipe RMURL rmurl "${1}" "${2}"; }
rmurl-finish() { pipefinish RMURL rmurl --batch; }
# collectionFromItem ITEM
# Prints the directory part of the first path matching */ITEM below the
# current directory, i.e. the collection directory ITEM was stored in.
# Prints nothing when there is no match. Always returns 0.
#
# The shell glob is used directly instead of parsing "ls" through xargs,
# so directory names containing whitespace or quotes survive intact and
# stderr no longer has to be closed to hide "no such file" noise.
function collectionFromItem {
  local match
  for match in */"${1}"; do
    # An unmatched glob stays literal; skip it. -h keeps dangling symlinks.
    if [[ -e "${match}" || -h "${match}" ]]; then
      dirname -- "${match}"
      return 0
    fi
  done
  return 0
}
# mineia
# Reads item identifiers from stdin, one per line, and runs
# "ia metadata IDENTIFIER" for each of them; the output of those calls is
# this function's output.
function mineia {
  local i
  # IFS= and -r pass each line through untouched (no backslash processing,
  # no trimming), whatever the caller's IFS is.
  while IFS= read -r i; do
    # A blank line is not an identifier; do not query with an empty name.
    [[ -n "${i}" ]] || continue
    ia metadata "${i}"
  done
}
# parsejson
# Filters stdin through jq using the get_item_updates.jq program that
# sits next to this script, printing raw (unquoted) output.
parsejson() {
  local filter="${scriptdir}/get_item_updates.jq"
  jq --raw-output -f "${filter}"
}
# getitems METADIR
# Prints the item names the shard should be checked for:
#   1. whatever METADIR/get_all_items.sh reports (run through mock under
#      the label "get_all_items.sh"), followed by
#   2. the base name of every directory at depth two or more below the
#      current directory, leaving out anything inside .git.
getitems() {
  local metadir="${1}"
  mock "${metadir}/get_all_items.sh" get_all_items.sh
  find -mindepth 2 -type d -not -path "*/.git/*" \
    | xargs -L1 basename 2>&-
}
# getfiles
# Sends the item list stored in the file ${tmpitems} through mineia and
# then parsejson, printing the resulting rows.
getfiles() {
  mineia < "${tmpitems}" | parsejson
}
# pad WIDTH TEXT
# Prints TEXT in a field of WIDTH characters using printf's %<WIDTH>s
# conversion (right-aligned for a positive WIDTH), without a newline.
pad() {
  local width="${1}"
  local text="${2}"
  printf "%${width}s" "${text}"
}
# pvn LABEL
# Passes stdin to stdout through pv, using LABEL padded to 25 columns as
# the name of the progress line (pv flags: -cltab).
pvn() {
  pv -N "$(pad 25 "${1}")" -cltab
}
# findfiles DIR SUBDIR
# Changes into DIR and prints, sorted, every path found below SUBDIR
# (SUBDIR itself is not listed). Paths are relative to DIR.
# Returns 1 without listing anything when DIR cannot be entered, so a
# missing directory never results in listing some other directory.
# Note: this changes the working directory of the calling shell; the
# callers in this file run it inside a process substitution.
function findfiles {
  cd -- "${1}" || return 1
  find "${2}" -mindepth 1 | sort
}
# listfiles NAME
# Prints the elements of the array called NAME, one per line, sorted.
# An empty array yields a single empty line.
# printf is used rather than echo so that an element such as "-n" or "-e"
# is printed as data instead of being taken for an echo option.
function listfiles {
  declare -n filelist="${1}"
  printf '%s\n' "${filelist[@]}" | sort
}
# archiveurl PATH
# Prints the archive.org download URL for PATH, with PATH encoded by
# "urlencode -m".
archiveurl() {
  local base="https://archive.org/download"
  printf '%s/%s\n' "${base}" "$(urlencode -m "${1}")"
}
# addrmfiles_sweep COLLECTION ITEM
# Private helper of addrmfiles. Queues an rmurl for every local path under
# COLLECTION/ITEM that is not listed in the caller's saveditem_files
# array, i.e. for files that are no longer part of the item's metadata.
# Does nothing when ITEM is empty or COLLECTION/ITEM is not a directory.
function addrmfiles_sweep {
  local sweepcollection="${1}"
  local sweepitem="${2}"
  local stale
  if [[ -n "${sweepitem}" ]] && [[ -d "${sweepcollection}/${sweepitem}" ]]; then
    comm -23 <(findfiles "${sweepcollection}" "${sweepitem}") <(listfiles saveditem_files) |
      while IFS= read -r stale; do
        rmurl "${sweepcollection}/${stale}" "$(archiveurl "${stale}")"
      done
  fi
}

# addrmfiles
# Reads rows "update collection item key filename" from stdin, splitting
# on the caller's IFS (the script sets it to a tab before calling), and
# queues the git-annex work needed to make the shard match them:
#   - "dark" rows: rmurl for every file stored for the item
#   - other rows:  fromkey + registerurl for files not present yet,
#                  rekey for files whose key changed
#   - after the last row of each item: rmurl for local files the
#     metadata no longer lists
# Rows of one item are expected to be adjacent.
function addrmfiles {
  local savedcollection=
  local saveditem=
  local saveditem_files=()
  local update collection item key filename
  local currentcollection url path file
  # No -r on purpose: the rows are produced with jq's @tsv, which writes a
  # backslash as "\\"; plain read turns that back into a single backslash.
  while read update collection item key filename; do
    # The $collection here is simply the first collection
    # in the list of collections that this item belongs
    # to. We want to put the item in a stable location
    # even if that list changes, so we first look to see
    # what directory we put it in last time, if any.
    currentcollection="$(collectionFromItem "${item}")"
    if [[ -z "${currentcollection}" ]]; then
      currentcollection="${collection}"
    fi
    url="$(archiveurl "${filename}")"
    path="${currentcollection}/${filename}"
    if [[ "dark" = "${update:-x}" ]]; then
      # The metadata for dark items doesn't include
      # a file list, so we have to enumerate the
      # files on disk directly. The queued path is
      # relative to the shard root (collection/item/file),
      # like every other queued path; the URL is built
      # from item/file.
      # NOTE(review): the sweep at the end of the item
      # queues the same files again because the item's
      # file list is empty -- presumably harmless for
      # "git annex rmurl", confirm.
      for file in "${currentcollection}/${item}"/*; do
        # An unmatched glob stays literal; skip it.
        [[ -e "${file}" || -h "${file}" ]] || continue
        rmurl "${file}" "$(archiveurl "${file#"${currentcollection}/"}")"
      done
    else
      # Calculates the key from the url exactly as
      # git-annex will do it.
      if [[ "${key}" = "URL--" ]]; then
        if [[ "${#url}" -gt 64 ]]; then
          key="URL--${url:0:31}-$(echo "${url}" | md5sum - | cut -d ' ' -f 1)"
        else
          key="URL--${url}"
        fi
      fi
      if [[ ! -h "${path}" ]]; then
        fromkey "${key}" "${path}"
        registerurl "${key}" "${url}"
      elif [[ "x$(git annex lookupkey "${path}")" != "x${key}" ]]; then
        rekey "${path}" "${key}"
      fi
    fi
    # Once we've been through all the current files in the
    # item, we need to enumerate the local files for the
    # item and see if we have any that aren't in the
    # metadata; this will indicate that they were at some
    # point removed from the item. Of course this would
    # be simpler if we still had everything grouped by
    # item.
    if [[ -z "${saveditem}" ]]; then
      savedcollection="${currentcollection}"
      saveditem="${item}"
    fi
    if [[ "x${item}" = "x${saveditem}" ]]; then
      saveditem_files+=("${filename}")
    else
      addrmfiles_sweep "${savedcollection}" "${saveditem}"
      savedcollection="${currentcollection}"
      saveditem="${item}"
      # Start a fresh list for the new item instead of
      # letting it grow across all items seen so far.
      saveditem_files=("${filename}")
    fi
  done
  # The last item has no following row to trigger its sweep.
  addrmfiles_sweep "${savedcollection}" "${saveditem}"
}
# Main flow: enter the shard, make sure we are on master, list the items
# the shard should contain, translate their metadata into queued
# git-annex operations, run the queues and commit the result.
if [[ ! -d "${shard}" ]]; then
  echo "No shard named '${shard}' exists"
  exit 1
fi
pushd "${shard}"
commit_msg="updating ${shard}"
# Ask git for the exact branch ref. Grepping "git branch -v" output would
# also match other branch names and commit subjects containing "master".
if ! git show-ref --verify --quiet refs/heads/master; then
  commit_msg="creating ${shard}"
  git checkout --orphan master
  # Discard the noise instead of closing stderr ("2>&-"), which leaves the
  # command without a usable stderr. Failure is tolerated on purpose.
  git rm -rf . 2>/dev/null || true
elif [[ "x$(git rev-parse --abbrev-ref HEAD)" != "xmaster" ]]; then
  git checkout master
fi
# Check the shardmeta branch out into a scratch clone so that its
# get_all_items.sh can be run without touching the working tree.
tmpdir="$(mktemp -d)"
git clone --shared --branch shardmeta -- . "${tmpdir}"
tmpitems="$(mktemp)"
getitems "${tmpdir}" | pvn "search for items" | sort -u > "${tmpitems}"
# addrmfiles splits the tab-separated rows from getfiles on IFS.
IFS=$'\t'
getfiles | pvn "enumerate files" | addrmfiles
fromkey-finish
rekey-finish
registerurl-finish
rmurl-finish
rm -rf -- "${tmpdir}" "${tmpitems}"
git annex merge
git commit --quiet -a -m "${commit_msg}"
popd
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment