Skip to content

Instantly share code, notes, and snippets.

@jjjake
Last active December 10, 2015 23:59
Show Gist options
  • Select an option

  • Save jjjake/4513524 to your computer and use it in GitHub Desktop.

Select an option

Save jjjake/4513524 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import datetime
import logging
import sys
import ujson
# parallel_md_get.py available here: https://gist.github.com/3784845
from parallel_md_get import metadata_record_iterator
# UTC date stamp (YYYY-MM-DD) shared by the log file name and the output file.
date = datetime.datetime.utcnow().strftime("%Y-%m-%d")

# Derive the log file name from this script's path with the ".py" suffix
# removed.  str.strip('./py') must NOT be used for this: it removes any of
# the characters '.', '/', 'p', 'y' from both ends, which mangles names such
# as "yearly.py" and drops the leading "/" of an absolute path.
script_path = __file__
if script_path.endswith('.py'):
    script_path = script_path[:-len('.py')]
log_filename = "%s-%s.log" % (script_path, date)

# Tab-separated records: timestamp, level, message.
logging_format = "%(asctime)s\t%(levelname)s\t%(message)s"
logging.basicConfig(
    filename=log_filename,
    level=logging.WARNING,
    format=logging_format,
)
#_______________________________________________________________________________
def get_arcs(files):
    """Return the entries of ``files`` that are ARC/WARC archives.

    An entry is kept when its ``'format'`` value is one of the archive
    format names listed below.  Input order is preserved and a new list
    is returned.  Each entry is indexed with ``entry['format']``, so an
    entry without that key raises ``KeyError``.
    """
    archive_formats = (
        'Internet Archive ARC GZ',
        'Encrypted Internet Archive ARC GZ',
        'Web ARChive GZ',
    )
    selected = []
    for entry in files:
        if entry['format'] in archive_formats:
            selected.append(entry)
    return selected
def get_cdxs(files):
    """Return the entries of ``files`` that are CDX index files.

    An entry is kept when its ``'format'`` value is ``'ARC CDX Index'`` or
    ``'WARC CDX Index'``.  Input order is preserved and a new list is
    returned.  Each entry is indexed with ``entry['format']``, so an entry
    without that key raises ``KeyError``.
    """
    index_formats = (
        'ARC CDX Index',
        'WARC CDX Index',
    )
    selected = []
    for entry in files:
        if entry['format'] in index_formats:
            selected.append(entry)
    return selected
def all_arcs_have_cdxs(files):
    """Tell whether every ARC/WARC file in ``files`` has a matching CDX.

    An archive (as selected by ``get_arcs``) counts as indexed when some
    CDX entry (as selected by ``get_cdxs``) has an ``'original'`` value
    equal to the archive's ``'name'``.

    Returns ``False`` as soon as an archive without a matching CDX is
    found, and ``True`` otherwise -- including when ``files`` contains no
    archives at all.  The success case returns an explicit ``True`` rather
    than falling off the end of the function, so the result is safe to use
    in a plain truthiness test as well as in an ``is False`` comparison.
    """
    arcs = get_arcs(files)
    cdxs = get_cdxs(files)
    # Collect the indexed names once instead of rescanning every CDX entry
    # for each archive.  CDX entries lacking 'original' contribute None.
    indexed_names = [cdx.get('original') for cdx in cdxs]
    for arc in arcs:
        if arc['name'] not in indexed_names:
            return False
    return True
#_______________________________________________________________________________
# Driver: read item identifiers from the file named by the first command-line
# argument, fetch each item's metadata through metadata_record_iterator, and
# append to a dated text file the identifier of every item that has at least
# one ARC/WARC file without a matching CDX index.
if len(sys.argv) < 2:
    sys.exit("usage: %s <file-of-item-identifiers>" % sys.argv[0])

# The output name only depends on the run date, so build it once.
output_file = 'underived-thumper-items_%s.txt' % date

# Keep the identifier file open for the whole loop (the iterator is handed
# the open file object) and make sure it is closed afterwards.
with open(sys.argv[1]) as ids:
    results = metadata_record_iterator(ids, workers=20)
    for _index, identifier, md_json in results:
        try:
            metadata = ujson.loads(md_json)
            files = metadata.get('files')
            if not files:
                logging.warning("item has no files!\t%s", identifier)
                continue
            # Compare with `is False` explicitly so that any other result
            # (True or None) is treated as "every archive is indexed".
            if all_arcs_have_cdxs(files) is False:
                with open(output_file, 'a') as f:
                    f.write('%s\n' % identifier)
        # Per-item boundary: log the failure and move on to the next item.
        # `except ... as ...` is valid on Python 2.6+ and Python 3, unlike
        # the comma form.
        except Exception as error:
            logging.error("%s:\t%s", identifier, error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment