Last active
December 18, 2016 19:42
-
-
Save db48x/66f41a91120266fe195aacec4c5531c2 to your computer and use it in GitHub Desktop.
work-in-progress script for updating an iabak shard
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Usage:
#   ./add-shardmeta-branch.sh <shard> <collection> [<collection> ...]
#     <shard> is the directory of an existing shard (a git repository)
#     <collection> is the name of any collection of items on IA
#
# This script updates an existing shard so that it has a
# shardmeta branch with a get_all_items.sh script.

set -e

if [[ $# -lt 1 ]]; then
    echo "usage: ${0} <shard> <collection> [<collection> ...]" >&2
    exit 1
fi

shard=${1}
shift

# Build the search expression: "collection:a OR collection:b ...".
criteria=
for c in "$@"; do
    if [[ -n "${criteria}" ]]; then
        criteria+=" OR "
    fi
    criteria+="collection:${c}"
done

if [[ ! -d "${shard}/.git" ]]; then
    echo "${shard} doesn't exist" >&2
    exit 1
fi

if [[ -z "${criteria}" ]]; then
    echo "you must specify at least one collection to put in the shard" >&2
    exit 1
fi

pushd "${shard}"

# Test for the exact branch name; a substring grep over `git branch`
# would also match names such as "old-shardmeta".
if git show-ref --verify --quiet refs/heads/shardmeta; then
    echo "${shard} already has a shardmeta branch; skipping it"
    exit 0
fi

git checkout --orphan shardmeta
# Empty the index/work tree of the new orphan branch. Output goes to
# /dev/null ("&>-" would create a file literally named "-").
git rm -rf . &>/dev/null || true

printf '%s\n' '*~' > .gitignore
printf '%s\n' '\#*' >> .gitignore

# ${criteria} is expanded now, so the generated script carries the
# literal search expression.
cat <<EOF > get_all_items.sh
#!/bin/sh
ia search --itemlist "${criteria}"
EOF
chmod u+x get_all_items.sh

git add .gitignore get_all_items.sh
git commit -m "creating shard metadata branch"
git checkout master
popd
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Test driver setup: locate the directory holding this script, create a
# scratch directory for the shard under test, and zero the counters
# that the assertion helpers update.
set -e

scriptdir="$(readlink --canonicalize-existing "$(dirname "${0}")")"
sharddir="$(mktemp --directory --suffix=.testshard)"

pass=0
fail=0
total=0
# Assertion: ${1} is a symlink and `git-annex whereis` succeeds for it.
# Prints "." on success or "x" on failure (no newline) and updates the
# global counters total, pass and fail.
function exists
{
    local filename="${1}"
    total=$((total+1))
    # Discard git-annex's output; "&>-" would write it to a file named "-".
    if [[ -h "${filename}" ]] && git-annex whereis "${filename}" &>/dev/null; then
        echo -n .
        pass=$((pass+1))
    else
        echo -n x
        fail=$((fail+1))
    fi
}
# Assertion: ${1} is NOT a symlink.
# Prints "." on success or "x" on failure (no newline) and updates the
# global counters total, pass and fail.
function doesntexist
{
    local filename="${1}"
    total=$((total+1))
    if [[ ! -h "${filename}" ]]; then
        echo -n .
        pass=$((pass+1))
    else
        echo -n x
        fail=$((fail+1))
    fi
}
# Assertion: ${1} is a symlink but `git-annex whereis` FAILS for it.
# Prints "." on success or "x" on failure (no newline) and updates the
# global counters total, pass and fail.
function exists_nourl
{
    local filename="${1}"
    total=$((total+1))
    # Discard git-annex's output; "&>-" would write it to a file named "-".
    if [[ -h "${filename}" ]] && ! git-annex whereis "${filename}" &>/dev/null; then
        echo -n .
        pass=$((pass+1))
    else
        echo -n x
        fail=$((fail+1))
    fi
}
# Assertion: ${1} is a symlink whose `git-annex lookupkey` output
# equals ${2}.
# Prints "." on success or "x" on failure (no newline) and updates the
# global counters total, pass and fail.
function keyis {
    local filename="${1}"
    local key="${2}"
    total=$((total+1))
    if [[ -h "${filename}" ]] && [[ "$(git-annex lookupkey "${filename}")" == "${key}" ]]; then
        echo -n .
        pass=$((pass+1))
    else
        echo -n x
        fail=$((fail+1))
    fi
}
# Run the scenario: create a shard from the "basic-testcase-first"
# recording, check its contents, update it from the
# "basic-testcase-second" recording, and check again.
#
# All command output is discarded via /dev/null; the original "&>-"
# redirects to a file literally named "-" rather than discarding.
pushd "$(dirname "${sharddir}")" &>/dev/null

echo "create shard"
"${scriptdir}/../make-shard.sh" -t "basic-testcase-first" "${sharddir}" softwarelibrary_win3_showcase &>/dev/null
pushd "${sharddir}" &>/dev/null
exists "softwarelibrary_win3_showcase/win311_masque_chessnet/CHESSNET.ZIP"
keyis "softwarelibrary_win3_showcase/win311_masque_chessnet/win311_masque_chessnet_meta.sqlite" "MD5-s12288--511557d1a393c0079dd782f2abdc8553"
exists "softwarelibrary_win3_showcase/win32c/win32c.zip"
doesntexist "softwarelibrary_win3_showcase/win3_MidMM211/MidMM211.zip"
echo
popd &>/dev/null

echo "update shard"
"${scriptdir}/../update-shard.sh" -t "basic-testcase-second" "${sharddir}" &>/dev/null
pushd "${sharddir}" &>/dev/null
exists_nourl "softwarelibrary_win3_showcase/win311_masque_chessnet/CHESSNET.ZIP"
keyis "softwarelibrary_win3_showcase/win311_masque_chessnet/win311_masque_chessnet_meta.sqlite" "MD5-s12288--000000000123456789abcdef01234567"
exists_nourl "softwarelibrary_win3_showcase/win32c/win32c.zip"
exists "softwarelibrary_win3_showcase/win3_MidMM211/MidMM211.zip"
echo
popd &>/dev/null

echo "${pass} of ${total} tests passed"
if [[ "${fail}" -gt 0 ]]; then
    # Keep the scratch shard around so the failure can be inspected.
    echo "You may wish to examine ${sharddir} to see what went wrong"
    exit 1
fi
rm -rf "${sharddir}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Input: one item-metadata JSON document.
# Output: tab-separated rows, one per line:
#   file <collection> <identifier> <key> <identifier>/<name>
#     for every file whose source is not "derivative", or
#   dark <collection> <identifier>
#     a single row when the item is dark.

# Assign identifier and collection to variables for use in final output.
# The collection may be an array (take the first entry) or a plain
# string (used as-is).
.metadata.identifier as $i |
(.metadata.collection[0]? // .metadata.collection) as $c |
if .is_dark != true then
  # Treat a missing file list as empty instead of raising an error.
  (.files // []) |
  map(select(.source != "derivative") |
      # *_files.xml gets the bare "URL--" placeholder instead of a
      # checksum key; every other file is keyed by size and MD5.
      (if .name | endswith("_files.xml") then
         "URL--"
       else
         "MD5-s\(.size)--\(.md5)"
       end) as $key |
      ["file", $c, $i, $key, "\($i)/\(.name)"])
else
  [["dark", $c, $i]]
end |
map(@tsv) | .[]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Usage:
#   ./make-shard.sh [OPTION] <shard> <collection> [<collection> ...]
#     -r NAME
#        record all output from all calls to the 'ia' tool, for later
#        use in test mode.
#     -t NAME
#        operate in test mode, reading the recorded output from the
#        'ia' tool rather than calling it.
#
#     <shard> is the name of a new shard to create
#     <collection> is the name of any collection of items on IA
#
# This script creates a new shard containing all items which appear in
# any of the named collections. To avoid overloading the resulting git
# repository, or the hard drives of users who may wish to download the
# complete shard, please avoid creating shards with more than ~100,000
# files or which use more than ~4TB of disk space.

testmode=
recordmode=
mockupname=

while getopts ":t:r:" opt; do
    case "${opt}" in
        t)
            testmode=1
            mockupname="${OPTARG}"
            ;;
        r)
            recordmode=1
            mockupname="${OPTARG}"
            ;;
        \?)
            echo "unknown option -${OPTARG}" >&2
            exit 128
            ;;
        :)
            echo "option -${OPTARG} requires an argument" >&2
            exit 128
            ;;
    esac
done

if [[ -n "${testmode}" ]] && [[ -n "${recordmode}" ]]; then
    echo "cannot enable both test mode and record mode at the same time" >&2
    exit 129
fi

shift $((OPTIND-1))

set -e

if [[ $# -lt 1 ]]; then
    echo "usage: ${0} [-t NAME | -r NAME] <shard> <collection> [<collection> ...]" >&2
    exit 1
fi

shard=${1}
shift

# Build the search expression: "collection:a OR collection:b ...".
criteria=
for c in "$@"; do
    if [[ -n "${criteria}" ]]; then
        criteria+=" OR "
    fi
    criteria+="collection:${c}"
done

if [[ -d "${shard}/.git" ]]; then
    echo "${shard} already exists" >&2
    exit 1
fi

if [[ -z "${criteria}" ]]; then
    echo "you must specify at least one collection to put in the shard" >&2
    exit 1
fi

# Resolve the script directory before changing directory, since ${0}
# may be a relative path.
scriptdir="$(readlink --canonicalize-existing "$(dirname "${0}")")"

mkdir -p "${shard}"
pushd "${shard}"
git init
git annex init

# The shardmeta branch holds only the script that lists the shard's items.
git checkout --orphan shardmeta
printf '%s\n' '*~' > .gitignore
printf '%s\n' '\#*' >> .gitignore
# ${criteria} is expanded now, so the generated script carries the
# literal search expression.
cat <<EOF > get_all_items.sh
#!/bin/sh
ia search --itemlist "${criteria}"
EOF
chmod u+x get_all_items.sh
git add .gitignore get_all_items.sh
git commit -m "creating shard metadata branch"

# Leave the repository on an empty, unborn master branch.
git checkout --orphan master
git rm -rf .
popd

# Forward the mode flag as separate words so a NAME containing
# whitespace stays a single argument.
args=()
if [[ -n "${testmode}" ]]; then
    args=(-t "${mockupname}")
elif [[ -n "${recordmode}" ]]; then
    args=(-r "${mockupname}")
fi

"${scriptdir}/update-shard.sh" "${args[@]}" "${shard}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# This script updates a shard by:
#   Looking for items that have been uploaded since the shard was created
#   Looking for new files added to existing items
#   Looking for files which have been modified (the hash has changed)
#   Looking for files which have been removed from their item
#   Looking for items which have gone dark since the shard was created
# It does not yet:
#   Move items to a different shard if they are moved to a completely different collection
#
# Usage:
#   ./update-shard.sh [OPTION] SHARD
#     -r NAME
#        record all output from all calls to the 'ia' tool, for later
#        use in test mode.
#     -t NAME
#        operate in test mode, reading the recorded output from the
#        'ia' tool rather than calling it.
#
#     SHARD is the name of a shard, and the script assumes that you
#     have a clone of the shard in the current directory. Note that
#     this script also assumes that there is a shardmeta branch
#     containing a get_all_items.sh script that can return a list of
#     items that ought to be in the shard; not all shards have this
#     yet so you might need to create one.

set -e

testmode=
recordmode=
mockupname=

while getopts ":t:r:" opt; do
    case "${opt}" in
        t)
            testmode=1
            mockupname="${OPTARG}"
            ;;
        r)
            recordmode=1
            mockupname="${OPTARG}"
            ;;
        \?)
            echo "unknown option -${OPTARG}" >&2
            exit 128
            ;;
        :)
            echo "option -${OPTARG} requires an argument" >&2
            exit 128
            ;;
    esac
done

if [[ -n "${testmode}" ]] && [[ -n "${recordmode}" ]]; then
    echo "cannot enable both test mode and record mode at the same time" >&2
    exit 129
fi

shift $((OPTIND-1))

shard=${1}
scriptdir="$(readlink --canonicalize-existing "$(dirname "${0}")")"
# Run git-annex with --quiet, forwarding all arguments unchanged.
function gitannex {
    git-annex --quiet "$@"
}
# Path of the real 'ia' client. Defaults to /usr/local/bin/ia but can
# be overridden through the environment.
IA="${IA:-/usr/local/bin/ia}"

# Wrapper for the 'ia' client that routes every call through mock, so
# its output can be recorded or replayed. The mock name is "ia".
function ia {
    mock "${IA}" ia "${@}"
}
# Run, record or replay an external command.
#   $1      program to execute
#   $2      name under which recordings of this program are filed
#   $3...   arguments passed to the program
# The recording lives at
#   ${scriptdir}/test/mocks/${mockupname}/<name>/<arg>/<arg>/...
# Behaviour:
#   - recording exists, or test mode: print the recording (in test mode
#     a missing recording makes cat fail rather than running the program)
#   - record mode: run the program, print its output and save it
#   - otherwise: just run the program
# NOTE(review): an existing recording is replayed even outside test
# mode; confirm that this is intended.
function mock {
    local prog="${1}"
    shift
    # "$@" still starts with the mock name here, so the name becomes
    # the first path component below the recording set.
    local path
    path="${scriptdir}/test/mocks/${mockupname}/$(join "/" "${@}")"
    shift
    if [[ -f "${path}" ]] || [[ -n "${testmode}" ]]; then
        cat "${path}"
    elif [[ -n "${recordmode}" ]]; then
        mkdir -p "$(dirname "${path}")"
        "${prog}" "$@" | tee "${path}"
    else
        "${prog}" "$@"
    fi
}
# Join arguments $2... into one line, separated by the first character
# of $1.
function join {
    local IFS="${1}"
    shift
    # printf instead of echo, so a result such as "-n" is printed
    # rather than interpreted as an echo option.
    printf '%s\n' "${*}"
}
# applypairs is just a pseudo-xargs that doesn't split up pairs of arguments | |
# applypairs is just a pseudo-xargs that doesn't split up pairs of arguments
#   $1  name of the array variable that accumulates the arguments; the
#       variable "<name>_len" tracks their combined length
#   $2  git-annex subcommand to run
#   $3  first half of the pair
#   $4  second half of the pair
# The pair is appended to the array; once the accumulated length
# exceeds 100 KiB the subcommand is run with --force and everything
# queued so far, and the array and counter are reset.
function applypairs {
    local -n arr="${1}"
    local -n length="${1}_len"
    local cmd="${2}"
    arr+=("${3}" "${4}")
    # An unset counter counts as zero.
    length=$(( ${length:-0} + ${#3} + ${#4} ))
    if [[ ${length} -gt $((100*1024)) ]]; then
        gitannex "${cmd}" --force "${arr[@]}"
        arr=()
        length=0
    fi
}
# Flush whatever applypairs has left queued.
#   $1     name of the array variable (and of "<name>_len")
#   $2     git-annex subcommand to run
#   $3...  extra options placed between --force and the queued arguments
# Does nothing when the recorded length is zero.
function pairfinish {
    local -n arr="${1}"
    local -n length="${1}_len"
    local cmd="${2}"
    shift 2
    # Keep the extra options as an array: flattening them to a string
    # and expanding it unquoted would re-split them on IFS.
    local -a extra=("${@}")
    if [[ ${length:-0} -gt 0 ]]; then
        gitannex "${cmd}" --force "${extra[@]}" "${arr[@]}"
        arr=()
        length=0
    fi
}
# applypipe saves the args to a file and then pipes it to the command to finish | |
# applypipe saves the args to a file and then pipes it to the command to finish
#   $1  name of the variable holding the queue file's path
#   $2  git-annex subcommand (not used here; kept so the call shape
#       matches pipefinish)
#   $3  first field of the queued line
#   $4  second field of the queued line
# Appends the line "<$3> <$4>" to the queue file.
function applypipe {
    local -n queue="${1}"
    printf '%s %s\n' "${3}" "${4}" >> "${queue}"
}
# Feed a queue file written by applypipe to git-annex, then delete it.
#   $1     name of the variable holding the queue file's path
#   $2     git-annex subcommand to run
#   $3...  extra options placed after --force
# The queue passes through pvn (labelled "git annex <subcommand>") on
# its way to the subcommand's standard input.
function pipefinish {
    local -n queue="${1}"
    local cmd="${2}"
    shift 2
    # Keep the extra options as an array: flattening them to a string
    # and expanding it unquoted would re-split them on IFS.
    local -a extra=("${@}")
    pvn "git annex ${cmd}" < "${queue}" | gitannex "${cmd}" --force "${extra[@]}"
    rm "${queue}"
}
# One queue file per git-annex subcommand. Each "<cmd>" function
# appends a two-field line to its queue via applypipe, and the matching
# "<cmd>-finish" function hands the queue to git-annex via pipefinish.
# rekey and rmurl are run with --batch; fromkey and registerurl are run
# without extra options.

FROMKEY="$(mktemp --suffix=.fromkey)"
# fromkey KEY FILE
function fromkey {
    applypipe FROMKEY fromkey "$1" "$2"
}
function fromkey-finish {
    pipefinish FROMKEY fromkey
}

REKEY="$(mktemp --suffix=.rekey)"
# rekey FILE KEY
function rekey {
    applypipe REKEY rekey "$1" "$2"
}
function rekey-finish {
    pipefinish REKEY rekey --batch
}

REGISTERURL="$(mktemp --suffix=.registerurl)"
# registerurl KEY URL
function registerurl {
    applypipe REGISTERURL registerurl "$1" "$2"
}
function registerurl-finish {
    pipefinish REGISTERURL registerurl
}

RMURL="$(mktemp --suffix=.rmurl)"
# rmurl FILE URL
function rmurl {
    applypipe RMURL rmurl "$1" "$2"
}
function rmurl-finish {
    pipefinish RMURL rmurl --batch
}
# Print the name of the first top-level directory (in glob order) of
# the current directory that contains an entry named ${1}.
# Prints nothing when ${1} is empty or no such directory exists.
# Always returns 0.
function collectionFromItem {
    local item="${1}"
    local candidate
    [[ -n "${item}" ]] || return 0
    # Glob directly rather than parsing `ls` output.
    for candidate in */"${item}"; do
        # An unmatched glob stays literal; skip it. -L keeps dangling
        # symlinks, which -e alone would reject.
        if [[ -e "${candidate}" ]] || [[ -L "${candidate}" ]]; then
            printf '%s\n' "${candidate%/*}"
            return 0
        fi
    done
    return 0
}
# Read item identifiers from stdin, one per line, and run
# `ia metadata` on each; the combined output goes to stdout.
function mineia {
    local item
    # The || clause also handles a last line that lacks a newline.
    while IFS= read -r item || [[ -n "${item}" ]]; do
        # Give the child its own stdin so it cannot consume the
        # remaining identifiers.
        ia metadata "${item}" < /dev/null
    done
}
# Filter stdin through jq using the get_item_updates.jq program that
# sits next to this script; --raw-output prints strings unquoted.
function parsejson {
    jq --raw-output -f "${scriptdir}/get_item_updates.jq"
}
# Print candidate item identifiers (unsorted, possibly duplicated):
#   1. the output of the shard's get_all_items.sh, found in directory
#      ${1} and run through mock under the name "get_all_items.sh";
#   2. the name of every directory exactly two levels below the
#      current directory (<collection>/<item>), ignoring .git.
function getitems {
    mock "${1}/get_all_items.sh" get_all_items.sh
    # -maxdepth 2 keeps subdirectories inside an item from being
    # reported as items. -printf '%f\n' prints the base name without
    # going through xargs/basename.
    find . -mindepth 2 -maxdepth 2 -type d -not -path "*/.git/*" -printf '%f\n'
}
# Feed the item list stored in the file ${tmpitems} through mineia and
# then parsejson, printing parsejson's output.
function getfiles {
    mineia < "${tmpitems}" | parsejson
}
# Print ${2} right-aligned in a field ${1} characters wide (a negative
# width left-aligns). No trailing newline is written.
function pad {
    # Pass the width as an argument ('*') instead of splicing it into
    # the format string.
    printf '%*s' "${1}" "${2}"
}
# Copy stdin to stdout through `pv`, labelled with ${1} padded to 25
# columns. If pv is not available the data is passed through
# unmodified, so the surrounding pipeline still receives its input.
function pvn {
    local msg
    msg="$(pad 25 "${1}")"
    if command -v pv > /dev/null 2>&1; then
        pv -N "${msg}" -cltab
    else
        cat
    fi
}
# List, sorted, every non-directory entry below ${2}, with paths
# relative to directory ${1} (so each line starts with "${2}/").
# Runs in a subshell so the caller's working directory is untouched;
# prints nothing and returns non-zero if ${1} cannot be entered.
function findfiles {
    (
        cd "${1}" || exit 1
        # Directories are skipped: only file entries are meaningful
        # when comparing against an item's file list.
        find "${2}" -mindepth 1 ! -type d | sort
    )
}
# Print the elements of the array whose NAME is given in ${1}, one per
# line, sorted. An empty array produces no output.
function listfiles {
    local -n names="${1}"
    if [[ ${#names[@]} -eq 0 ]]; then
        return 0
    fi
    printf '%s\n' "${names[@]}" | sort
}
# Print the archive.org download URL for the path ${1}
# ("<item>/<file>"), encoding the path with `urlencode -m`.
function archiveurl {
    printf 'https://archive.org/download/%s\n' "$(urlencode -m "${1}")"
}
# Helper for addrmfiles: queue an rmurl for every file found on disk
# under <collection>/<item> that is not listed in the named array.
#   $1  collection directory
#   $2  item directory inside the collection
#   $3  NAME of the array holding the item's current "<item>/<file>" paths
# Does nothing when the item is empty or has no directory on disk.
function _addrmfiles_prune {
    local collection="${1}"
    local item="${2}"
    local listname="${3}"
    local file
    if [[ -z "${item}" ]] || [[ ! -d "${collection}/${item}" ]]; then
        return 0
    fi
    # comm -23 keeps the lines that appear only in the on-disk listing.
    comm -23 <(findfiles "${collection}" "${item}") <(listfiles "${listname}") |
        while IFS= read -r file; do
            rmurl "${collection}/${file}" "$(archiveurl "${file}")"
        done
}

# Read the rows produced by parsejson on stdin and queue the git-annex
# operations needed to bring the work tree in line with them. Fields
# are split on the caller's IFS (the script sets it to a tab):
#   <update> <collection> <item> [<key> <filename>]
# where <update> is "file" or "dark". For each row:
#   - dark item: queue rmurl for every file on disk under the item;
#   - file that is not yet a symlink: queue fromkey and registerurl;
#   - file whose annex key differs from <key>: queue rekey.
# Whenever the item changes (and once more at end of input) files that
# exist on disk but were not listed for the finished item are queued
# for rmurl. This assumes rows of one item arrive together.
function addrmfiles {
    local savedcollection=
    local saveditem=
    local saveditem_files=()
    local update collection item key filename
    local currentcollection url path file

    # No -r on purpose: read then collapses each "\\" back to "\".
    # NOTE(review): jq's @tsv output is expected to escape backslashes
    # that way - confirm.
    while read update collection item key filename; do
        # The $collection here is simply the first collection
        # in the list of collections that this item belongs
        # to. We want to put the item in a stable location
        # even if that list changes, so we first look to see
        # what directory we put it in last time, if any.
        currentcollection="$(collectionFromItem "${item}")"
        if [[ -z "${currentcollection}" ]]; then
            currentcollection="${collection}"
        fi

        if [[ "dark" = "${update:-x}" ]]; then
            # The metadata for dark items doesn't include
            # a file list, so we have to enumerate the
            # files on disk directly. Paths are queued
            # relative to the shard root (collection
            # included), like every other rmurl entry; an
            # item that was never on disk is skipped.
            if [[ -d "${currentcollection}/${item}" ]]; then
                for file in "${currentcollection}/${item}"/*; do
                    # Skip the literal pattern left by an empty directory.
                    if [[ ! -e "${file}" ]] && [[ ! -L "${file}" ]]; then
                        continue
                    fi
                    rmurl "${file}" "$(archiveurl "${file#"${currentcollection}/"}")"
                done
            fi
        else
            url="$(archiveurl "${filename}")"
            path="${currentcollection}/${filename}"
            # Expand the "URL--" placeholder into a full key:
            # long URLs are cut to 31 characters and
            # suffixed with an MD5. echo's trailing newline
            # is part of the hashed input; changing that
            # would change every key derived this way.
            # NOTE(review): meant to mirror git-annex's own
            # URL key naming - confirm.
            if [[ "${key}" = "URL--" ]]; then
                if [[ "${#url}" -gt 64 ]]; then
                    key="URL--${url:0:31}-$(echo "${url}" | md5sum - | cut -d ' ' -f 1)"
                else
                    key="URL--${url}"
                fi
            fi
            if [[ ! -h "${path}" ]]; then
                fromkey "${key}" "${path}"
                registerurl "${key}" "${url}"
            elif [[ "x$(git annex lookupkey "${path}")" != "x${key}" ]]; then
                rekey "${path}" "${key}"
            fi
        fi

        # Once we've been through all the current files in the
        # item, we need to enumerate the local files for the
        # item and see if we have any that aren't in the
        # metadata; this will indicate that they were at some
        # point removed from the item.
        if [[ -z "${saveditem}" ]]; then
            savedcollection="${currentcollection}"
            saveditem="${item}"
        fi
        if [[ "x${item}" != "x${saveditem}" ]]; then
            _addrmfiles_prune "${savedcollection}" "${saveditem}" saveditem_files
            savedcollection="${currentcollection}"
            saveditem="${item}"
            # Start a fresh list for the new item rather than
            # carrying over every earlier item's files.
            saveditem_files=()
        fi
        saveditem_files+=("${filename}")
    done

    # Handle the last item seen.
    _addrmfiles_prune "${savedcollection}" "${saveditem}" saveditem_files
}
# Main sequence: enter the shard, make sure master is checked out,
# collect the item list, queue and apply the git-annex operations, and
# commit the result.
if [[ ! -d "${shard}" ]]; then
    echo "No shard named '${shard}' exists" >&2
    exit 1
fi

pushd "${shard}"

commit_msg="updating ${shard}"
# Test for the exact branch ref; grepping `git branch -v` would also
# match "master" inside another branch name or a commit subject.
if ! git show-ref --verify --quiet refs/heads/master; then
    commit_msg="creating ${shard}"
    git checkout --orphan master
    git rm -rf . 2>/dev/null || true
elif [[ "x$(git rev-parse --abbrev-ref HEAD)" != "xmaster" ]]; then
    git checkout master
fi

# Check the shardmeta branch out into a scratch clone so that its
# get_all_items.sh can be run without disturbing the work tree.
tmpdir="$(mktemp -d)"
git clone --shared --branch shardmeta -- . "${tmpdir}"

tmpitems="$(mktemp)"
getitems "${tmpdir}" | pvn "search for items" | sort -u > "${tmpitems}"

# The rows from parsejson are tab-separated; addrmfiles reads them
# with this IFS. The setting stays in effect for the rest of the script.
IFS=$'\t'
getfiles | pvn "enumerate files" | addrmfiles

fromkey-finish
rekey-finish
registerurl-finish
rmurl-finish

rm -rf -- "${tmpdir}" "${tmpitems}"

git annex merge

# `git commit` exits non-zero when there is nothing to commit, which
# would fail the whole run under `set -e`; an update that finds no
# changes is not an error.
if [[ -n "$(git status --porcelain --untracked-files=no)" ]]; then
    git commit --quiet -a -m "${commit_msg}"
else
    echo "no changes to commit for ${shard}"
fi

popd
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment