Created
September 22, 2014 22:07
-
-
Save jjjake/6aa11ba871763ed703e0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Assert that a web item is fully derived by comparing the number of
# WARCs to the number of CDXs. An item that is not fully derived will
# have fewer CDXs than WARCs.
#
# Requires: https://github.com/jjjake/ia-wrapper
#
# Usage:
#   $ ./assert_web_item_is_fully_derived $IDENTIFIER
#   $ parallel './assert_web_item_is_fully_derived {}' < itemlist.txt
#
# Exit status:
#   0  the item has as many CDXs as WARCs
#   1  the WARC and CDX counts differ
#   2  the check could not be performed (missing argument, `ia` not
#      installed, or `ia ls` exited non-zero)
#
set -u

#######################################
# Print a diagnostic to stderr and abort with status 2, the status
# reserved for "could not perform the check".
# Arguments: $* - message text
#######################################
die() {
  printf 'error: %s\n' "$*" >&2
  exit 2
}

#######################################
# Count the lines of a listing that contain a literal string.
# Arguments: $1 - literal string to look for
#            $2 - listing text, one entry per line
# Outputs:   the number of matching lines on stdout
#######################################
count_format() {
  # grep -c still prints 0 when nothing matches, but exits 1; that is a
  # valid count here, not an error, so the status is deliberately ignored.
  grep -c -F -- "$1" <<<"$2" || true
}

#######################################
# Compare the WARC and CDX counts of one item and report the result.
# Arguments: $1 - item identifier
# Outputs:   a one-line verdict on stdout
# Returns:   exits 0 when the counts match, 1 when they differ,
#            2 when the listing could not be obtained
#######################################
main() {
  local identifier="${1:-}"
  local listing warc_count cdx_count

  [[ -n "${identifier}" ]] \
    || die "missing IDENTIFIER argument; usage: ${0##*/} IDENTIFIER"
  command -v ia >/dev/null 2>&1 \
    || die "'ia' not found; see https://github.com/jjjake/ia-wrapper"

  # Fetch the listing once so both counts come from the same data, and
  # stop if the listing fails: otherwise both counts would be 0 and the
  # item would be wrongly reported as fully derived.
  listing="$(ia ls "${identifier}" --columns format)" \
    || die "could not list the files of ${identifier}"

  warc_count="$(count_format 'Web ARChive GZ' "${listing}")"
  cdx_count="$(count_format 'WARC CDX Index' "${listing}")"

  # The verdict lines stay on stdout, as before, so existing callers
  # that capture the output keep working.
  if (( warc_count == cdx_count )); then
    echo "${identifier} is fully derived."
    exit 0
  else
    echo "error: ${identifier} has ${warc_count} WARCs and ${cdx_count} CDXs"
    exit 1
  fi
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment