Created
November 23, 2013 22:59
-
-
Save uogbuji/7621132 to your computer and use it in GitHub Desktop.
Take a spreadsheet from the Britburn project <http://www.britburn.co.uk/> of UK music chart history and creates a smaller spreadsheet suitable for import into Viewshare/Freemix. Grabs some good info for exercising various facets, and also uses DBPedia for adding more, e.g. artist link, image and birthplace info. python britburn4freemix.py Britbu…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*-#import itertools | |
#Useful re DBPedia / SPARQL: http://markwatson.com/blog/2013-08/python-sparql-client-example.html | |
#Useful re DBPedia alternate lookup names & image retrieval: http://www.snee.com/bobdc.blog/2011/05/using-sparql-to-find-the-right.html | |
import sys | |
import time | |
from amara.thirdparty import httplib2, json | |
from datachef.squaredata import dict_from_xls | |
from datachef.exhibit import emitter # https://github.com/uogbuji/datachef | |
import xlwt | |
from SPARQLWrapper import SPARQLWrapper, SPARQLExceptions, JSON # https://pypi.python.org/pypi/SPARQLWrapper | |
ezxf = xlwt.easyxf | |
SPARQL = SPARQLWrapper("http://dbpedia.org/sparql") | |
#"Lucky DBPedia" query | |
LUCKY_DBPEDIA_TPL = u'''SELECT DISTINCT ?uri ?txt WHERE {{ | |
?uri rdfs:label ?txt . | |
?txt bif:contains '"{qstr}"' . | |
}} LIMIT 1''' | |
PROPERTIES_TPL = u'''SELECT DISTINCT ?property WHERE {{ | |
<{uri}> ?property ?value . | |
}}''' | |
#Useful: http://stackoverflow.com/questions/5161568/how-to-properly-use-sparql-optional-to-retrieve-attributes-for-a-resource-that-m | |
ARTIST_INFO_TPL = u'''\ | |
PREFIX db: <http://dbpedia.org/ontology/> | |
PREFIX dbp: <http://dbpedia.org/property/> | |
PREFIX foaf: <http://xmlns.com/foaf/0.1/> | |
SELECT DISTINCT * WHERE {{ | |
?res rdfs:label '{label}'@en . | |
?res foaf:depiction ?pictureURI . | |
?res db:birthPlace ?birthplace . | |
OPTIONAL {{ | |
?birthplace rdfs:label ?birthplace_city . | |
}} | |
OPTIONAL {{ | |
?birthplace dbp:country ?birthplace_country . | |
}} | |
FILTER (LANG(?birthplace_city) = 'en') | |
}} LIMIT 2 | |
''' | |
def build_query(template, **kwargs): | |
trace = kwargs.get('_sparql_trace', False) | |
qstr = template.format(**kwargs) | |
SPARQL.setQuery(qstr) | |
SPARQL.setReturnFormat(JSON) | |
try: | |
results = SPARQL.query().convert() | |
except SPARQLExceptions.QueryBadFormed: | |
print >> sys.stderr, qstr | |
raise | |
if trace: | |
print >> sys.stderr, results | |
return results | |
def lookup_dbpedia_resource(name): | |
results = build_query(LUCKY_DBPEDIA_TPL, qstr=name) | |
if results[u'results'][u'bindings']: | |
return results[u'results'][u'bindings'][0][u'uri'][u'value'], results[u'results'][u'bindings'][0][u'txt'][u'value'] | |
else: | |
return None | |
def dig_dbpedia_obj(uri): | |
#DESCRIBE <uri> | |
results = build_query(PROPERTIES_TPL, uri=uri, _sparql_trace=True) | |
properties = [ r[u'property'][u'value'] for r in results[u'results'][u'bindings'] ] | |
return properties | |
def lookup_artist_info(label): | |
''' | |
>>> u, l = lookup_dbpedia_resource(u'Elton John') | |
>>> lookup_artist_info(u) | |
(u'http://dbpedia.org/resource/Elton_John', u'http://upload.wikimedia.org/wikipedia/commons/d/d1/Elton_John_2011_Shankbone_2.JPG') | |
''' | |
results = build_query(ARTIST_INFO_TPL, label=label) # , _sparql_trace=True | |
if results[u'results'][u'bindings']: | |
info = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][0].items() ]) | |
if len(results[u'results'][u'bindings']) > 1: | |
part2 = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][1].items() ]) | |
part2.update(info) | |
info = part2 | |
return info | |
else: | |
return {} | |
def run(source=None, book=None, limit=None): | |
heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center') | |
kinds = 'text text int int text text text text text text date date'.split() | |
kind_to_xf_map = { | |
'date': ezxf(num_format_str='yyyy-mm-dd'), | |
'int': ezxf(num_format_str='#,##0'), | |
'money': ezxf('font: italic on; pattern: pattern solid, fore-colour grey25', | |
num_format_str='$#,##0.00'), | |
'price': ezxf(num_format_str='#0.000000'), | |
'text': ezxf(), | |
} | |
data_xfs = [kind_to_xf_map[k] for k in kinds] | |
headings = [ | |
u'ID', | |
u'Weeks on chart', | |
u'High position', | |
u'Artist', | |
u'Artist Link', | |
u'Artist Image', | |
u'Artist Birth City', | |
u'Artist Birth Country', | |
u'Title', | |
u'Label', | |
u'Date Entered', | |
u'Date Left', | |
] | |
for colx, val in enumerate(headings): | |
sheet.write(0, colx, val, heading_xf) | |
count = 1 | |
for rowx, row in enumerate(source): | |
#if row[u'Target Type'] == u'Starting url': | |
artist_name = row[u'Artist'] | |
title = row[u'Title'] | |
print >> sys.stderr, artist_name, u'-', title | |
artist_query_name = artist_name.split(u' ft ')[0] | |
try: | |
item = lookup_dbpedia_resource(artist_query_name) | |
u, l = item or (None, None) | |
artist_info = lookup_artist_info(l) | |
except SPARQLExceptions.QueryBadFormed: | |
artist_info = {} | |
print >> sys.stderr, (artist_info.values()) if artist_info else None | |
time.sleep(1) | |
if not artist_info: | |
continue | |
# *sigh* sometimes DBPedia has a URL for country. More often a literal string | |
country = artist_info.get(u'birthplace_country', u'').split(u'/')[-1].replace(u'_', u' ') | |
row_data = [ | |
row[u'Prefix'], | |
row[u'CH'], | |
row[u'High'], | |
artist_name, | |
artist_info.get(u'res', u''), | |
artist_info.get(u'pictureURI', u''), | |
artist_info.get(u'birthplace_city', u''), | |
, | |
title, | |
row[u'Label'], | |
row[u'Date Entered'], | |
row[u'Date Left'], | |
] | |
for colx, val in enumerate(row_data): | |
sheet.write(count, colx, val, data_xfs[colx]) | |
print >> sys.stderr, "Added ({0}).".format(count) | |
count += 1 | |
if count > limit: break | |
return | |
# Handle command-line arguments | |
import sys | |
from akara.thirdparty import argparse # Sorry PEP 8 ;) | |
if __name__ == '__main__': | |
# python wayin2ejson.py < wayinbites-sample.json | |
book = xlwt.Workbook() | |
sheet = book.add_sheet('britburn') | |
bb_199x = dict_from_xls(file(sys.argv[1]), 3) | |
run(source=bb_199x , book=book, limit=int(sys.argv[3])) | |
book.save(sys.argv[2]) | |
sys.exit(0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment