Skip to content

Instantly share code, notes, and snippets.

@uogbuji
Created November 23, 2013 22:59
Show Gist options
  • Save uogbuji/7621132 to your computer and use it in GitHub Desktop.
Save uogbuji/7621132 to your computer and use it in GitHub Desktop.
Take a spreadsheet from the Britburn project <http://www.britburn.co.uk/> of UK music chart history and creates a smaller spreadsheet suitable for import into Viewshare/Freemix. Grabs some good info for exercising various facets, and also uses DBPedia for adding more, e.g. artist link, image and birthplace info. python britburn4freemix.py Britbu…
#!/usr/bin/env python
# -*- coding: utf-8 -*-#import itertools
#Useful re DBPedia / SPARQL: http://markwatson.com/blog/2013-08/python-sparql-client-example.html
#Useful re DBPedia alternate lookup names & image retrieval: http://www.snee.com/bobdc.blog/2011/05/using-sparql-to-find-the-right.html
import sys
import time
from amara.thirdparty import httplib2, json
from datachef.squaredata import dict_from_xls
from datachef.exhibit import emitter # https://github.com/uogbuji/datachef
import xlwt
from SPARQLWrapper import SPARQLWrapper, SPARQLExceptions, JSON # https://pypi.python.org/pypi/SPARQLWrapper
ezxf = xlwt.easyxf
SPARQL = SPARQLWrapper("http://dbpedia.org/sparql")
#"Lucky DBPedia" query
LUCKY_DBPEDIA_TPL = u'''SELECT DISTINCT ?uri ?txt WHERE {{
?uri rdfs:label ?txt .
?txt bif:contains '"{qstr}"' .
}} LIMIT 1'''
PROPERTIES_TPL = u'''SELECT DISTINCT ?property WHERE {{
<{uri}> ?property ?value .
}}'''
#Useful: http://stackoverflow.com/questions/5161568/how-to-properly-use-sparql-optional-to-retrieve-attributes-for-a-resource-that-m
ARTIST_INFO_TPL = u'''\
PREFIX db: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT DISTINCT * WHERE {{
?res rdfs:label '{label}'@en .
?res foaf:depiction ?pictureURI .
?res db:birthPlace ?birthplace .
OPTIONAL {{
?birthplace rdfs:label ?birthplace_city .
}}
OPTIONAL {{
?birthplace dbp:country ?birthplace_country .
}}
FILTER (LANG(?birthplace_city) = 'en')
}} LIMIT 2
'''
def build_query(template, **kwargs):
trace = kwargs.get('_sparql_trace', False)
qstr = template.format(**kwargs)
SPARQL.setQuery(qstr)
SPARQL.setReturnFormat(JSON)
try:
results = SPARQL.query().convert()
except SPARQLExceptions.QueryBadFormed:
print >> sys.stderr, qstr
raise
if trace:
print >> sys.stderr, results
return results
def lookup_dbpedia_resource(name):
results = build_query(LUCKY_DBPEDIA_TPL, qstr=name)
if results[u'results'][u'bindings']:
return results[u'results'][u'bindings'][0][u'uri'][u'value'], results[u'results'][u'bindings'][0][u'txt'][u'value']
else:
return None
def dig_dbpedia_obj(uri):
#DESCRIBE <uri>
results = build_query(PROPERTIES_TPL, uri=uri, _sparql_trace=True)
properties = [ r[u'property'][u'value'] for r in results[u'results'][u'bindings'] ]
return properties
def lookup_artist_info(label):
'''
>>> u, l = lookup_dbpedia_resource(u'Elton John')
>>> lookup_artist_info(u)
(u'http://dbpedia.org/resource/Elton_John', u'http://upload.wikimedia.org/wikipedia/commons/d/d1/Elton_John_2011_Shankbone_2.JPG')
'''
results = build_query(ARTIST_INFO_TPL, label=label) # , _sparql_trace=True
if results[u'results'][u'bindings']:
info = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][0].items() ])
if len(results[u'results'][u'bindings']) > 1:
part2 = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][1].items() ])
part2.update(info)
info = part2
return info
else:
return {}
def run(source=None, book=None, limit=None):
heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center')
kinds = 'text text int int text text text text text text date date'.split()
kind_to_xf_map = {
'date': ezxf(num_format_str='yyyy-mm-dd'),
'int': ezxf(num_format_str='#,##0'),
'money': ezxf('font: italic on; pattern: pattern solid, fore-colour grey25',
num_format_str='$#,##0.00'),
'price': ezxf(num_format_str='#0.000000'),
'text': ezxf(),
}
data_xfs = [kind_to_xf_map[k] for k in kinds]
headings = [
u'ID',
u'Weeks on chart',
u'High position',
u'Artist',
u'Artist Link',
u'Artist Image',
u'Artist Birth City',
u'Artist Birth Country',
u'Title',
u'Label',
u'Date Entered',
u'Date Left',
]
for colx, val in enumerate(headings):
sheet.write(0, colx, val, heading_xf)
count = 1
for rowx, row in enumerate(source):
#if row[u'Target Type'] == u'Starting url':
artist_name = row[u'Artist']
title = row[u'Title']
print >> sys.stderr, artist_name, u'-', title
artist_query_name = artist_name.split(u' ft ')[0]
try:
item = lookup_dbpedia_resource(artist_query_name)
u, l = item or (None, None)
artist_info = lookup_artist_info(l)
except SPARQLExceptions.QueryBadFormed:
artist_info = {}
print >> sys.stderr, (artist_info.values()) if artist_info else None
time.sleep(1)
if not artist_info:
continue
# *sigh* sometimes DBPedia has a URL for country. More often a literal string
country = artist_info.get(u'birthplace_country', u'').split(u'/')[-1].replace(u'_', u' ')
row_data = [
row[u'Prefix'],
row[u'CH'],
row[u'High'],
artist_name,
artist_info.get(u'res', u''),
artist_info.get(u'pictureURI', u''),
artist_info.get(u'birthplace_city', u''),
,
title,
row[u'Label'],
row[u'Date Entered'],
row[u'Date Left'],
]
for colx, val in enumerate(row_data):
sheet.write(count, colx, val, data_xfs[colx])
print >> sys.stderr, "Added ({0}).".format(count)
count += 1
if count > limit: break
return
# Handle command-line arguments
import sys
from akara.thirdparty import argparse # Sorry PEP 8 ;)
if __name__ == '__main__':
# python wayin2ejson.py < wayinbites-sample.json
book = xlwt.Workbook()
sheet = book.add_sheet('britburn')
bb_199x = dict_from_xls(file(sys.argv[1]), 3)
run(source=bb_199x , book=book, limit=int(sys.argv[3]))
book.save(sys.argv[2])
sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment