Created
August 23, 2015 02:49
-
-
Save codersquid/e7b5237c397b0fcd6662 to your computer and use it in GitHub Desktop.
pyconza 2014 scratchpad
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
from __future__ import unicode_literals | |
from collections import defaultdict | |
from steve.util import ( | |
#get_from_config, | |
get_project_config, | |
save_json_files, | |
#load_json_files, | |
) | |
import json | |
import internetarchive | |
def files_by_format(item):
    """Group the files of *item* by their ``format`` attribute.

    *item* must expose ``iter_files()``; every object it yields must have a
    ``format`` attribute.  (The script passes ``internetarchive.Item``
    instances.)

    Returns a ``defaultdict(list)`` mapping each format to the files that
    have it, in iteration order.
    """
    grouped = defaultdict(list)
    for entry in item.iter_files():
        bucket = grouped[entry.format]
        bucket.append(entry)
    return grouped
def get_format_url(files_lookup, fmt):
    """Return the ``url`` of the first file stored under *fmt*.

    *files_lookup* is a mapping from format name to a list of file objects.
    ``.get`` is used for the lookup, so a missing format does not add a key
    even when the mapping is a ``defaultdict``.

    Returns the empty string when the format is absent or its list is empty.
    """
    candidates = files_lookup.get(fmt)
    return candidates[0].url if candidates else ''
def subject2tags(metadata):
    """Convert the ``subject`` metadata field into a list of tag strings.

    *metadata* is a mapping.  Its ``subject`` value may be:

    * missing or empty -- an empty list is returned;
    * a single string of tags separated by ``;``;
    * a list/tuple of such strings (NOTE(review): repeated metadata fields
      are expected to arrive as a list -- confirm against the metadata
      source).

    Each tag is stripped of surrounding whitespace.  Empty tags, such as
    those produced by a trailing or doubled ``;``, are dropped.
    """
    subject = metadata.get('subject')
    if not subject:
        return []

    # Normalise to a sequence so a list-valued field no longer fails on
    # ``.split``.  Testing for list/tuple rather than a string type keeps
    # this working on both Python 2 and Python 3.
    entries = subject if isinstance(subject, (list, tuple)) else [subject]

    tags = []
    for entry in entries:
        for piece in entry.split(';'):
            tag = piece.strip()
            if tag:
                tags.append(tag)
    return tags
def creator2speakers(metadata):
    """Return the speakers named by the ``creator`` metadata field.

    *metadata* is a mapping.  When ``creator`` is absent, an empty list is
    returned.  A scalar value is wrapped in a one-element list.  A
    list/tuple value is returned as a flat list instead of being nested
    inside another list (NOTE(review): repeated metadata fields are
    expected to arrive as a list -- confirm against the metadata source).
    """
    if 'creator' not in metadata:
        return []

    creator = metadata['creator']
    if isinstance(creator, (list, tuple)):
        # Already a collection of names: copy it rather than wrapping it,
        # which would yield [[name, ...]].
        return list(creator)
    return [creator]
def language2language(metadata):
    """Return the ``language`` entry of *metadata* unchanged.

    Raises ``KeyError`` when the mapping has no ``language`` key.
    """
    # TODO: need to lookup 3 letter codes
    language = metadata['language']
    return language
def item2source_url(item):
    """Build the archive.org details-page URL for *item*.

    Reads ``item.protocol`` and ``item.identifier``.  The protocol is placed
    directly in front of ``//``, so it presumably already ends with a colon
    (e.g. ``'https:'``) -- confirm against the item class.
    """
    template = '{}//archive.org/details/{}'
    return template.format(item.protocol, item.identifier)
def item2video(item, category, language):
    """Build a video record (plain dict) describing *item*.

    Parameters:
        item: object exposing ``exists``, ``metadata``, ``protocol``,
            ``identifier`` and ``iter_files()``.  The script passes
            ``internetarchive.Item`` instances.
        category: value stored under ``'category'``.
        language: value stored under ``'language'``.

    Returns an empty dict when ``item.exists`` is falsy, otherwise the
    populated record.

    Raises ``KeyError`` when the item metadata has no ``'title'``.
    """
    if not item.exists:
        return {}

    metadata = item.metadata
    record = {
        'category': category,
        # NOTE(review): 2 is a status code whose meaning is defined by the
        # consumer of these records -- confirm.
        'state': 2,
        'title': metadata['title'],
        # The description is used for both the description and the summary.
        'description': metadata.get('description', ''),
        'summary': metadata.get('description', ''),
        'tags': subject2tags(metadata),
        'speakers': creator2speakers(metadata),
        'language': language,
        'copyright_text': metadata.get('licenseurl', ''),
        'recorded': metadata.get('date', ''),
        'whiteboard': 'ia scrape',
        'source_url': item2source_url(item),
    }

    # Media URLs: only thumbnail, Ogg and MP4 are looked up among the item's
    # files; the WebM and FLV URLs are always left empty.
    by_format = files_by_format(item)
    record['thumbnail_url'] = get_format_url(by_format, 'Thumbnail')
    record['video_ogv_url'] = get_format_url(by_format, 'Ogg Video')
    record['video_ogv_download_only'] = False
    record['video_mp4_url'] = get_format_url(by_format, 'MPEG4')
    record['video_mp4_download_only'] = False
    record['video_webm_download_only'] = False
    record['video_webm_url'] = ''
    record['video_flv_download_only'] = False
    record['video_flv_url'] = ''
    return record
if __name__ == "__main__":
    # Scrape every item tagged pyconza2014 and save one JSON record per
    # existing item through steve's save_json_files.
    cfg = get_project_config()
    videos = []

    # Collect all identifiers from the search before fetching any item.
    search = internetarchive.search_items('subject:pyconza2014')
    identifiers = [result['identifier'] for result in search]

    for identifier in identifiers:
        item = internetarchive.Item(identifier)
        video = item2video(item, 'PyCon ZA 2014', 'English')
        if not item.exists:
            continue
        filename = 'json/{}.json'.format(identifier)
        videos.append((filename, video))

    save_json_files(cfg, videos)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment