Created
August 10, 2015 15:12
-
-
Save codersquid/c6a82d151a7f4b5f3b33 to your computer and use it in GitHub Desktop.
Example scripts showing how to create an item on archive.org and attach metadata to that item.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
I had a script to upload some conference files to archive.org a while back, and I can't remember | |
if this is the file I used. It could be. It's very krufty. Also, the internetarchive package has | |
reved a few times since. | |
this is only for an example | |
""" | |
import argparse | |
import json | |
import logging | |
import os | |
import sys | |
from internetarchive import get_item | |
# Log everything (DEBUG and up) to archive.log with timestamped lines
# formatted as "YYYY-MM-DD HH:MM|LEVEL|message".
logging.basicConfig(
    format='%(asctime)s|%(levelname)s|%(message)s',
    datefmt='%Y-%m-%d %H:%M',
    filename='archive.log',
    level=logging.DEBUG,
)
""" | |
This is a convenience script that wrapps ia-wrapper calls | |
http://archive.org/help/abouts3.txt | |
MEH boto escapes things and the internetarchive library handles things ok | |
""" | |
ACCESS_KEY="example" | |
SECRET_KEY="example" | |
def prepare():
    """Collect everything needed for an upload run.

    Returns a dict with:
        files    -- entry names found in the local ``videos`` directory
        path     -- absolute path of that directory
        schedule -- parsed contents of ``schedule.json`` in the cwd
    """
    video_dir = os.path.abspath('videos')
    with open('schedule.json') as schedule_fh:
        event_schedule = json.load(schedule_fh)
    return dict(
        files=os.listdir(video_dir),
        path=video_dir,
        schedule=event_schedule,
    )
def upload_all(path, schedule):
    """Upload every ``<event_id>.mp4`` in *path* to archive.org, one item each.

    Parameters
    ----------
    path : str
        Directory containing video files named ``<event_id>.mp4``.
    schedule : dict
        Mapping of event_id -> metadata dict; each entry must carry a
        'conference' key, used to build the archive.org item name.

    Files with no schedule entry are skipped with a warning; a failure on
    one event is logged (with traceback) and does not abort the rest.
    """
    for filename in os.listdir(path):
        event_id = filename.replace('.mp4', '')
        if event_id not in schedule:
            logging.warning('SKIPPING: %s', event_id)
            continue
        md = schedule[event_id]
        # Item name e.g. "europython_2014_event_1234".
        name = '%s_event_%s' % (md['conference'].lower().replace(' ', '_'), event_id)
        try:
            item = get_item(name)
            logging.debug("UPLOADING: event %s %s", event_id, name)
            uploaded = item.upload(os.path.join(path, filename), metadata=md,
                                   access_key=ACCESS_KEY, secret_key=SECRET_KEY)
            if uploaded:
                logging.info("SUCCESS: event %s %s", event_id, name)
            else:
                logging.info("FAILURE: event %s %s", event_id, name)
        # Bug fix: the original bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt and threw away the traceback, logging only the
        # exception type. Catch Exception and log the full traceback instead.
        except Exception:
            logging.exception("FAILURE: event %s %s", event_id, name)
if __name__ == '__main__':
    # CLI entry point: read the schedule JSON and upload every video found
    # in the given directory.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--schedule', default='schedule.json')
    arg_parser.add_argument('--videos', default='videos')
    opts = arg_parser.parse_args()
    with open(opts.schedule) as schedule_fh:
        event_schedule = json.load(schedule_fh)
    upload_all(os.path.abspath(opts.videos), event_schedule)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import argparse | |
import json | |
import logging | |
""" | |
Takes schedule xml from frab conference system for EuroPython2014 and PyData Berlin 2014 | |
and creates json schedule with metadata fields for internet archive | |
http://archive.org/help/abouts3.txt | |
http://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/ | |
via email from [email protected] | |
The mediatypes are: | |
texts | |
audio | |
movies | |
image | |
The Community collections are: | |
opensource (this is for texts) | |
opensource_audio | |
opensource_movies | |
opensource_media (for items that are not text, audio or movies in type) | |
""" | |
# Log everything (DEBUG and up) to frab2json.log with timestamped lines
# formatted as "YYYY-MM-DD HH:MM|LEVEL|message".
logging.basicConfig(
    format='%(asctime)s|%(levelname)s|%(message)s',
    datefmt='%Y-%m-%d %H:%M',
    filename='frab2json.log',
    level=logging.DEBUG,
)
from bs4 import BeautifulSoup | |
def get_common_metadata():
    """Metadata fields shared by every uploaded item (see module note on
    archive.org mediatypes and community collections)."""
    common = dict(mediatype='movies')
    common['collection'] = 'opensource_movies'
    common['type'] = 'conference'
    return common
def get_conference_metadata(event_id):
    """Per-conference metadata, specific to europython2014 and pydataberlin2014.

    frab event ids below 20000 belong to EuroPython 2014; everything else
    is PyData Berlin 2014.
    """
    is_europython = int(event_id) < 20000
    if not is_europython:
        return {
            'is-part-of': 'http://pydata.org/berlin2014/',
            'conference': 'PyData Berlin 2014',
            'subject': 'python; pydata; pydataberlin2014',
            'year': '2014',
            'location': 'Berlin, Germany',
        }
    return {
        'is-part-of': 'https://ep2014.europython.eu/en/',
        'conference': 'EuroPython 2014',
        'subject': 'python; europython2014',
        'year': '2014',
        'location': 'Berlin, Germany',
    }
def schedule_xml_to_dict(filename):
    """Parse a frab schedule XML file into ``{event_id: metadata_dict}``.

    Each event becomes a dict of archive.org metadata fields (title, date,
    speakers, ...) merged with the per-conference and common metadata.
    Events without a title, and duplicate event ids, are skipped with a
    warning.

    Parameters
    ----------
    filename : str
        Path to the frab ``schedule.xml`` export.

    Raises
    ------
    KeyError
        If an <event> element has no ``id`` attribute (intentional).
    """
    with open(filename, 'r') as fh:
        soup = BeautifulSoup(fh)
    events = soup.find_all('event')
    schedule = {}
    for e in events:
        event_id = e.attrs['id']  # blow up if there is no key
        if not e.title:
            logging.warning('skipping event %s with no title', event_id)
            continue
        # Bug fix: the original tested ``e in schedule`` -- the tag object,
        # which can never match the string keys -- so duplicates were never
        # actually skipped. Compare the id instead.
        if event_id in schedule:
            # ignore duplicates
            logging.warning('skipping duplicate event %s', event_id)
            continue
        persons = e.find_all('person')
        event = {
            'extent': e.duration.text,
            'title': e.title.text,
            'date': e.date.text,
            'speakers': [p.text for p in persons],
            'schedule_event_type': e.type.text,
            'schedule_event_id': event_id,
        }
        # Optional elements: only include them when present in the XML.
        if e.language:
            event['language'] = e.language.text
        if e.abstract:
            event['abstract'] = e.abstract.text
        if e.description:
            event['description'] = e.description.text
        event.update(get_conference_metadata(event_id))
        event.update(get_common_metadata())
        schedule[event_id] = event
    return schedule
if __name__ == '__main__':
    # CLI entry point: convert a frab XML schedule into the JSON file
    # consumed by the uploader script.
    cli = argparse.ArgumentParser()
    cli.add_argument('--frabxml', '-f', default='schedule.xml')
    cli.add_argument('--output', '-o', default='schedule.json')
    opts = cli.parse_args()
    parsed_schedule = schedule_xml_to_dict(opts.frabxml)
    with open(opts.output, 'w') as out_fh:
        json.dump(parsed_schedule, out_fh)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment