Last active
December 12, 2015 01:48
-
-
Save rlskoeser/4693891 to your computer and use it in GitHub Desktop.
Script to generate GeoRSS from CSV files generated by NameDropper lookup-names script. See http://disc.library.emory.edu/networkingbelfast/places-in-around-the-world-in-80-days/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# to get dependencies: pip install unicodecsv eulxml namedropper
# as of 2012/02/01 this requires the development version of namedropper,
# but should work with the 0.3 version once it is released

# standard library
import glob
import logging
import sys
from collections import OrderedDict
from datetime import datetime
from os.path import basename, splitext

# third-party
import unicodecsv
from eulxml import xmlmap
from namedropper.spotlight import DBpediaResource

logging.basicConfig(level=logging.INFO)
def main(): | |
filelist = glob.iglob('./chapter??.csv') | |
# order based on when a name first occurs | |
ids = OrderedDict() | |
file_count = 0 | |
for csv_filename in filelist: | |
file_label, ext = splitext(basename(csv_filename)) | |
# get number from filename (chapter##) | |
chapter_number = int(file_label[len('chapter'):]) + 1 | |
print >> sys.stderr, "Chapter %d" % chapter_number | |
file_count += 1 | |
with open(csv_filename, 'rb') as csvfile: | |
# infer field names from header | |
csvreader = unicodecsv.DictReader(csvfile) | |
for row in csvreader: | |
uri = row['URI'] | |
if uri not in ids: | |
ids[uri] = { | |
'count': 1, | |
'label': row['Name'], # store surface form in case label lookup fails | |
'text': '<p><a href="%s">DBpedia record</a></p>' \ | |
% (uri) | |
} | |
# verbose mode | |
print >> sys.stderr, uri | |
else: | |
ids[uri]['count'] += 1 | |
# highlight surface form of the annotation in context | |
txt = row['Context'].replace(row['Name'], | |
'<b>%s</b>' % row['Name']) | |
ids[uri]['text'] += '<p><i>Chapter %d</i>: ..%s..</p>\n' % \ | |
(chapter_number, txt) | |
print >> sys.stderr, 'Found %s unique ids in %s files' % \ | |
(len(ids.keys()), file_count) | |
feed = GeoRSSFeed(version='2.0', | |
title='Places in "Around the World in 80 Days"', | |
description='Places mentioned in the text of Jules Verne\'s' + \ | |
'"Around the World in 80 Days" as identified by DBpedia Spotlight', | |
pub_date=datetime.now() | |
) | |
for uri, data in ids.iteritems(): | |
res = DBpediaResource(uri) | |
feed.items.append(GeoRSSEntry( | |
title=res.label or data['label'], # note: could include count of occurrences in label... | |
description=data['text'], | |
latitude=res.latitude, | |
longitude=res.longitude | |
)) | |
print feed.serialize(pretty=True) | |
# georss xml objects | |
# georss xml objects
class GeoRSSEntry(xmlmap.XmlObject):
    """A single RSS ``<item>`` with W3C geo coordinates for one place.

    Declarative eulxml mapping: each field maps an attribute to an XPath
    relative to the ``<item>`` root node.
    """
    # bind the 'geo' prefix used by the lat/long fields below
    ROOT_NAMESPACES = {
        'geo': 'http://www.w3.org/2003/01/geo/wgs84_pos#'
    }
    ROOT_NAME = 'item'
    title = xmlmap.StringField('title')
    link = xmlmap.StringField('link')
    # WGS84 position of the place this entry describes
    latitude = xmlmap.StringField('geo:lat')
    longitude = xmlmap.StringField('geo:long')
    description = xmlmap.StringField('description')
class GeoRSSFeed(xmlmap.XmlObject):
    """An RSS 2.0 feed (``<rss>`` root) whose items carry geo coordinates.

    Channel metadata fields map into the single ``<channel>`` child; the
    ``items`` list maps each ``channel/item`` node to a ``GeoRSSEntry``.
    """
    # geo namespace declared on the root so item-level geo:lat/geo:long
    # elements serialize with a bound prefix
    ROOT_NAMESPACES = {
        'geo': 'http://www.w3.org/2003/01/geo/wgs84_pos#'
    }
    ROOT_NAME = 'rss'
    version = xmlmap.StringField('@version')
    title = xmlmap.StringField('channel/title')
    description = xmlmap.StringField('channel/description')
    link = xmlmap.StringField('channel/link')
    pub_date = xmlmap.DateTimeField('channel/pubDate')
    items = xmlmap.NodeListField('channel/item', GeoRSSEntry)
# script entry point: build and print the feed only when run directly
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment