Created
August 10, 2015 15:12
-
-
Save codersquid/c6a82d151a7f4b5f3b33 to your computer and use it in GitHub Desktop.
Example scripts showing how to create an item on archive.org and attach metadata to that item.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
I had a script to upload some conference files to archive.org a while back, and I can't remember | |
if this is the file I used. It could be. It's very krufty. Also, the internetarchive package has | |
reved a few times since. | |
this is only for an example | |
""" | |
import argparse | |
import json | |
import logging | |
import os | |
import sys | |
from internetarchive import get_item | |
# Log everything (DEBUG and up) to archive.log with timestamped lines
# formatted as "YYYY-MM-DD HH:MM|LEVEL|message".
logging.basicConfig(
    format='%(asctime)s|%(levelname)s|%(message)s',
    datefmt='%Y-%m-%d %H:%M',
    filename='archive.log',
    level=logging.DEBUG,
)
""" | |
This is a convenience script that wrapps ia-wrapper calls | |
http://archive.org/help/abouts3.txt | |
MEH boto escapes things and the internetarchive library handles things ok | |
""" | |
ACCESS_KEY="example" | |
SECRET_KEY="example" | |
def prepare():
    """Collect everything needed for an upload run.

    Returns a dict with:
        files    -- entry names found in the local ``videos`` directory
        path     -- absolute path of that directory
        schedule -- parsed contents of ``schedule.json`` in the cwd
    """
    video_dir = os.path.abspath('videos')
    with open('schedule.json') as schedule_fh:
        event_schedule = json.load(schedule_fh)
    return dict(
        files=os.listdir(video_dir),
        path=video_dir,
        schedule=event_schedule,
    )
def upload_all(path, schedule):
    """Upload every ``<event_id>.mp4`` in *path* to archive.org, one item each.

    Parameters
    ----------
    path : str
        Directory containing video files named ``<event_id>.mp4``.
    schedule : dict
        Mapping of event_id -> metadata dict; each entry must carry a
        'conference' key, used to build the archive.org item name.

    Files with no schedule entry are skipped with a warning; a failure on
    one event is logged (with traceback) and does not abort the rest.
    """
    for filename in os.listdir(path):
        event_id = filename.replace('.mp4', '')
        if event_id not in schedule:
            logging.warning('SKIPPING: %s', event_id)
            continue
        md = schedule[event_id]
        # Item name e.g. "europython_2014_event_1234".
        name = '%s_event_%s' % (md['conference'].lower().replace(' ', '_'), event_id)
        try:
            item = get_item(name)
            logging.debug("UPLOADING: event %s %s", event_id, name)
            uploaded = item.upload(os.path.join(path, filename), metadata=md,
                                   access_key=ACCESS_KEY, secret_key=SECRET_KEY)
            if uploaded:
                logging.info("SUCCESS: event %s %s", event_id, name)
            else:
                logging.info("FAILURE: event %s %s", event_id, name)
        # Bug fix: the original bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt and threw away the traceback, logging only the
        # exception type. Catch Exception and log the full traceback instead.
        except Exception:
            logging.exception("FAILURE: event %s %s", event_id, name)
if __name__ == '__main__':
    # CLI entry point: read the schedule JSON and upload every video found
    # in the given directory.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--schedule', default='schedule.json')
    arg_parser.add_argument('--videos', default='videos')
    opts = arg_parser.parse_args()
    with open(opts.schedule) as schedule_fh:
        event_schedule = json.load(schedule_fh)
    upload_all(os.path.abspath(opts.videos), event_schedule)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import argparse | |
import json | |
import logging | |
""" | |
Takes schedule xml from frab conference system for EuroPython2014 and PyData Berlin 2014 | |
and creates json schedule with metadata fields for internet archive | |
http://archive.org/help/abouts3.txt | |
http://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/ | |
via email from [email protected] | |
The mediatypes are: | |
texts | |
audio | |
movies | |
image | |
The Community collections are: | |
opensource (this is for texts) | |
opensource_audio | |
opensource_movies | |
opensource_media (for items that are not text, audio or movies in type) | |
""" | |
# Log everything (DEBUG and up) to frab2json.log with timestamped lines
# formatted as "YYYY-MM-DD HH:MM|LEVEL|message".
logging.basicConfig(
    format='%(asctime)s|%(levelname)s|%(message)s',
    datefmt='%Y-%m-%d %H:%M',
    filename='frab2json.log',
    level=logging.DEBUG,
)
from bs4 import BeautifulSoup | |
def get_common_metadata():
    """Metadata fields shared by every uploaded item (see module note on
    archive.org mediatypes and community collections)."""
    common = dict(mediatype='movies')
    common['collection'] = 'opensource_movies'
    common['type'] = 'conference'
    return common
def get_conference_metadata(event_id):
    """Per-conference metadata, specific to europython2014 and pydataberlin2014.

    frab event ids below 20000 belong to EuroPython 2014; everything else
    is PyData Berlin 2014.
    """
    is_europython = int(event_id) < 20000
    if not is_europython:
        return {
            'is-part-of': 'http://pydata.org/berlin2014/',
            'conference': 'PyData Berlin 2014',
            'subject': 'python; pydata; pydataberlin2014',
            'year': '2014',
            'location': 'Berlin, Germany',
        }
    return {
        'is-part-of': 'https://ep2014.europython.eu/en/',
        'conference': 'EuroPython 2014',
        'subject': 'python; europython2014',
        'year': '2014',
        'location': 'Berlin, Germany',
    }
def schedule_xml_to_dict(filename):
    """Parse a frab schedule XML file into ``{event_id: metadata_dict}``.

    Each event becomes a dict of archive.org metadata fields (title, date,
    speakers, ...) merged with the per-conference and common metadata.
    Events without a title, and duplicate event ids, are skipped with a
    warning.

    Parameters
    ----------
    filename : str
        Path to the frab ``schedule.xml`` export.

    Raises
    ------
    KeyError
        If an <event> element has no ``id`` attribute (intentional).
    """
    with open(filename, 'r') as fh:
        soup = BeautifulSoup(fh)
    events = soup.find_all('event')
    schedule = {}
    for e in events:
        event_id = e.attrs['id']  # blow up if there is no key
        if not e.title:
            logging.warning('skipping event %s with no title', event_id)
            continue
        # Bug fix: the original tested ``e in schedule`` -- the tag object,
        # which can never match the string keys -- so duplicates were never
        # actually skipped. Compare the id instead.
        if event_id in schedule:
            # ignore duplicates
            logging.warning('skipping duplicate event %s', event_id)
            continue
        persons = e.find_all('person')
        event = {
            'extent': e.duration.text,
            'title': e.title.text,
            'date': e.date.text,
            'speakers': [p.text for p in persons],
            'schedule_event_type': e.type.text,
            'schedule_event_id': event_id,
        }
        # Optional elements: only include them when present in the XML.
        if e.language:
            event['language'] = e.language.text
        if e.abstract:
            event['abstract'] = e.abstract.text
        if e.description:
            event['description'] = e.description.text
        event.update(get_conference_metadata(event_id))
        event.update(get_common_metadata())
        schedule[event_id] = event
    return schedule
if __name__ == '__main__':
    # CLI entry point: convert a frab XML schedule into the JSON file
    # consumed by the uploader script.
    cli = argparse.ArgumentParser()
    cli.add_argument('--frabxml', '-f', default='schedule.xml')
    cli.add_argument('--output', '-o', default='schedule.json')
    opts = cli.parse_args()
    parsed_schedule = schedule_xml_to_dict(opts.frabxml)
    with open(opts.output, 'w') as out_fh:
        json.dump(parsed_schedule, out_fh)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment