Last active
August 29, 2015 14:16
-
-
Save jjjake/161b318d9d5114051cd6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Summarise one item-metadata record: keep only its non-derivative files and
# report their combined size.

# Capture the identifier and collection up front; the pipeline below narrows
# `.` to the files array, and the final object still needs both values.
.metadata.identifier as $i |
.metadata.collection as $c |

# Items without files metadata produce no output.
select(.files != null) |

# Keep non-derivative files only and slim each entry down to four fields.
# Entries whose size is null (i.e. files.xml) are counted as 0 bytes.
.files |
map(
  select(.source != "derivative") |
  {
    "name": .name,
    "size": (if .size == null then 0 else (.size | tonumber) end),
    "format": .format,
    "md5": .md5
  }
) |

# Total size of the kept files (per item); `// 0` keeps the total at 0 when
# no files remain after filtering.
(map(.size) | add // 0) as $ts |

# Final output (per item).
{"id": $i, "collection": $c, "total_size": $ts, "files": .}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Mine item metadata, reduce each item to a file-size summary, and gzip the
# result.
#
# ia-mine binaries available here: https://archive.org/details/iamine-pex
#
# Usage: <this-script> [STAMP]
#   STAMP  timestamp shared by the identifier list (metamgr-norm-ids-STAMP.txt)
#          and the output file (indexed-item-size-md_STAMP.json.gz).
#          Defaults to 20150304205357.

# Without pipefail the exit status of the pipeline is gzip's alone, so a
# failed mine or parse stage would still be reported as success.
set -o pipefail

stamp="${1:-20150304205357}"

# concurrently retrieve metadata from items.
./ia-mine-0.5-py3.3.pex "metamgr-norm-ids-${stamp}.txt" --workers 600 2>/dev/null |
    # mine progress stats.
    pv -lacbrN 'mine' |
    # parse JSON in parallel.
    ./parallel-chunks.sh jq -c -r -f get_file_size_md.jq |
    # JSON parsing progress stats.
    pv -lacbrN 'parse' |
    # gzip output.
    gzip > "indexed-item-size-md_${stamp}.json.gz"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""jq alternative for getting non-derived file size_md | |
""" | |
import sys | |
import ujson | |
import json | |
def summarize_item(item):
    """Return the file-size summary for one parsed item record.

    Only files whose ``source`` is not ``'derivative'`` are kept. A file
    whose ``size`` is missing or null counts as 0 bytes, which matches the
    jq version of this filter.

    Args:
        item: Dict decoded from one line of input. The ``metadata`` and
            ``files`` keys are read; both may be absent.

    Returns:
        A dict with the keys ``id``, ``files``, ``collection`` and
        ``total_size``, or ``None`` when the item has no files.

    Raises:
        ValueError, TypeError: If a file's ``size`` cannot be converted
            with ``int()``.
    """
    metadata = item.get('metadata', {})
    source_files = item.get('files', [])
    if not source_files:
        return None

    files = []
    total_size = 0
    for f in source_files:
        if f.get('source') == 'derivative':
            continue
        # An explicit null size must not abort the whole item.
        raw_size = f.get('size')
        size = 0 if raw_size is None else int(raw_size)
        total_size += size
        files.append({
            'name': f.get('name'),
            'md5': f.get('md5'),
            'format': f.get('format'),
            'size': size,
        })

    return {
        'id': metadata.get('identifier'),
        'files': files,
        'collection': metadata.get('collection'),
        'total_size': total_size,
    }


def main():
    """Read one JSON item per line from stdin and print one summary per line.

    Blank lines and items without files are skipped silently. Any error
    while handling a line is written to stderr and processing continues
    with the next line.
    """
    # TODO: this doesn't work for JSON with newline chars.
    for line in sys.stdin:
        try:
            # Lines read from stdin keep their trailing newline, so a blank
            # line is only detectable after stripping.
            if not line.strip():
                continue
            summary = summarize_item(ujson.loads(line))
            if summary is not None:
                print(ujson.dumps(summary))
        except Exception as exc:
            # Best effort: one bad record must not stop the stream.
            sys.stderr.write(str(exc) + '\n')


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Feed stdin to a command through `parallel`: the input is split into blocks
# (--pipe --block 1M) and each block is piped to the given command, with each
# job's output grouped (--group).
#
# Usage: parallel-chunks.sh COMMAND [ARG...]
set -e

# The command and its arguments are handed to parallel as one
# space-joined string.
cmd="$*"
parallel --pipe --group --block 1M "$cmd"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment