Skip to content

Instantly share code, notes, and snippets.

@db48x
Last active November 10, 2016 13:48
Show Gist options
  • Select an option

  • Save db48x/a1a8847916ab149abbfce25517944bdc to your computer and use it in GitHub Desktop.

Select an option

Save db48x/a1a8847916ab149abbfce25517944bdc to your computer and use it in GitHub Desktop.
# Emit one TSV row per non-derivative file of an item, with the columns
#   md5, size, collection, url
# The input is a single item record carrying `.metadata` and `.files`.
# NOTE(review): presumably one record of ia-mine output -- confirm with caller.
#
# Remember the identifier; it is needed to build each download URL.
.metadata.identifier as $i |
# Indexing a string with [0] is a jq error, so accept both shapes here: take
# the first element of an array, or the value itself when it is not an array.
# NOTE(review): the metadata API appears to return single-valued fields as a
# plain string rather than a one-element array -- confirm.
(.metadata.collection | if type == "array" then .[0] else . end) as $c |
# Filter out any items that do not have files metadata.
select(.files != null) |
.files |
map(
  # Keep only files that are not derivatives.
  select(.source != "derivative") |
  [
    .md5,
    # Files without a size (i.e. files.xml) are reported with size 0.
    (if .size == null then 0 else (.size | tonumber) end),
    $c,
    "https://archive.org/download/\($i)/\(.name)"
  ] |
  @tsv
) |
# Unpack the per-item array so every row becomes its own output.
.[]
#!/bin/sh
# mkSHARD: build a bare git-annex repository from a list file.
#
# Usage: mkSHARD collection-list N
#
# Steps, all relative to the current directory:
#   1. copy collection-list to SHARD<N>.list
#   2. create the repository SHARD<N> and run ../importlist on the copied list
#   3. commit, then clone the result as the bare repository SHARD<N>.git
#   4. remove the working repository SHARD<N>
set -e
set -x

# Print the usage line on stderr and abort.
usage() {
  echo "usage: mkSHARD collection-list N" >&2
  exit 1
}

list=$1
num=$2

# Both arguments are mandatory.
[ -n "${list}" ] || usage
[ -n "${num}" ] || usage

# Restrict word splitting to newlines.
IFS='
'

# Every path below is derived from this one name.
shard="SHARD${num}"

cp "${list}" "${shard}.list"
wc -l "${shard}.list"

git init "${shard}"
cd "${shard}"
git annex init
# importlist and the list file both live one level up from the new repository.
../importlist "../${shard}.list"
git commit --quiet -a -m "creating ${shard}"
git annex dead .
git gc --aggressive
git annex info .
cd ..

# Keep only the bare clone; the working repository is deleted afterwards.
git clone "${shard}" "${shard}.git" --bare
rm -rf "${shard}"
#!/bin/sh
# Ask ia-mine for the item list of a collection, split that list into chunks
# and run ia-mine once per chunk.
#
# Usage: <this script> COLLECTION
#
# Files written to the current directory:
#   COLLECTION-ids.txt       the complete item list
#   COLLECTION-meta-NN.txt   the item list split into chunks (NN = 00, 01, ...)
#   COLLECTION-files-NN.txt  ia-mine output for chunk NN
set -e
set -x

collection=${1}
if [ -z "${collection}" ]; then
  echo "usage: ${0##*/} collection" >&2
  exit 1
fi

itemfile=${collection}-ids.txt
ia-mine --secure -c -s "collection:${collection}" --itemlist >"${itemfile}"

# Feed wc through stdin so it prints the bare count: no file name to cut away,
# and leading padding (which some wc implementations add) is harmless inside
# the arithmetic expansion below.
lines=$(wc -l <"${itemfile}")
if [ "${lines}" -eq 0 ]; then
  echo "no items found for collection ${collection}" >&2
  exit 1
fi

# One chunk per full 100000 lines, but never fewer than one: the plain
# division yields 0 for short lists and split rejects a chunk count of 0.
chunks=$((lines / 100000))
if [ "${chunks}" -lt 1 ]; then
  chunks=1
fi
split -n "l/${chunks}" -d --additional-suffix=.txt "${itemfile}" "${collection}-meta-"

# split named the chunks ${collection}-meta-NN.txt, so those are the files to
# iterate over.
for f in "${collection}"-meta-*.txt; do
  # Skip the literal pattern if the glob matched nothing.
  [ -e "${f}" ] || continue
  # Extract NN with POSIX prefix/suffix stripping; ${var/pat/repl} is not
  # available in every /bin/sh.
  chunk=${f#"${collection}-meta-"}
  chunk=${chunk%.txt}
  ia-mine --secure -c "${f}" >"${collection}-files-${chunk}.txt"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment