uogbuji · November 23, 2013 22:59
diff --git a/britburn4freemix.py b/britburn4freemix.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-#import itertools

 #Useful re DBPedia / SPARQL: http://markwatson.com/blog/2013-08/python-sparql-client-example.html
 #Useful re DBPedia alternate lookup names & image retrieval: http://www.snee.com/bobdc.blog/2011/05/using-sparql-to-find-the-right.html

 import sys
 import time

 from amara.thirdparty import httplib2, json

 from datachef.squaredata import dict_from_xls
 from datachef.exhibit import emitter  # https://github.com/uogbuji/datachef
 import xlwt
 from SPARQLWrapper import SPARQLWrapper, SPARQLExceptions, JSON # https://pypi.python.org/pypi/SPARQLWrapper


 ezxf = xlwt.easyxf

 SPARQL = SPARQLWrapper("http://dbpedia.org/sparql")

 #"Lucky DBPedia" query
 LUCKY_DBPEDIA_TPL = u'''SELECT DISTINCT ?uri ?txt WHERE {{
                  ?uri rdfs:label ?txt .
                  ?txt bif:contains '"{qstr}"' .
                 }} LIMIT 1'''


 PROPERTIES_TPL = u'''SELECT DISTINCT ?property WHERE {{
                  <{uri}> ?property ?value .
                 }}'''

 #Useful: http://stackoverflow.com/questions/5161568/how-to-properly-use-sparql-optional-to-retrieve-attributes-for-a-resource-that-m
 ARTIST_INFO_TPL = u'''\
 PREFIX db: <http://dbpedia.org/ontology/>
 PREFIX dbp: <http://dbpedia.org/property/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>

 SELECT DISTINCT * WHERE {{
    ?res rdfs:label '{label}'@en .
    ?res foaf:depiction ?pictureURI .
    ?res db:birthPlace ?birthplace .
    OPTIONAL {{
        ?birthplace rdfs:label ?birthplace_city .
    }}
    OPTIONAL {{
        ?birthplace dbp:country ?birthplace_country .
    }}
    FILTER (LANG(?birthplace_city) = 'en')
 }} LIMIT 2
 '''


 def build_query(template, **kwargs):
    trace = kwargs.get('_sparql_trace', False)
    qstr = template.format(**kwargs)
    SPARQL.setQuery(qstr)
    SPARQL.setReturnFormat(JSON)

    try:
        results = SPARQL.query().convert()
    except SPARQLExceptions.QueryBadFormed:
        print >> sys.stderr, qstr
        raise

    if trace:
        print >> sys.stderr, results
    return results


 def lookup_dbpedia_resource(name):
    results = build_query(LUCKY_DBPEDIA_TPL, qstr=name)
    if results[u'results'][u'bindings']:
        return results[u'results'][u'bindings'][0][u'uri'][u'value'], results[u'results'][u'bindings'][0][u'txt'][u'value']
    else:
        return None


 def dig_dbpedia_obj(uri):
    #DESCRIBE <uri>
    results = build_query(PROPERTIES_TPL, uri=uri, _sparql_trace=True)
    properties = [ r[u'property'][u'value'] for r in results[u'results'][u'bindings'] ]
    return properties


 def lookup_artist_info(label):
    '''
    >>> u, l = lookup_dbpedia_resource(u'Elton John')
    >>> lookup_artist_info(u)
 (u'http://dbpedia.org/resource/Elton_John', u'http://upload.wikimedia.org/wikipedia/commons/d/d1/Elton_John_2011_Shankbone_2.JPG')
    '''
    results = build_query(ARTIST_INFO_TPL, label=label) # , _sparql_trace=True
    if results[u'results'][u'bindings']:
        info = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][0].items() ])
        if len(results[u'results'][u'bindings']) > 1:
            part2 = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][1].items() ])
            part2.update(info)
            info = part2
        return info
    else:
        return {}

 def run(source=None, book=None, limit=None):
    heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center')
    kinds = 'text text int int text text text text text text date date'.split()
    kind_to_xf_map = {
        'date': ezxf(num_format_str='yyyy-mm-dd'),
        'int': ezxf(num_format_str='#,##0'),
        'money': ezxf('font: italic on; pattern: pattern solid, fore-colour grey25',
            num_format_str='$#,##0.00'),
        'price': ezxf(num_format_str='#0.000000'),
        'text': ezxf(),
        }

    data_xfs = [kind_to_xf_map[k] for k in kinds]
    headings = [
        u'ID',
        u'Weeks on chart',
        u'High position',
        u'Artist',
        u'Artist Link',
        u'Artist Image',
        u'Artist Birth City',
        u'Artist Birth Country',
        u'Title',
        u'Label',
        u'Date Entered',
        u'Date Left',
    ]
    for colx, val in enumerate(headings):
        sheet.write(0, colx, val, heading_xf)

    count = 1
    for rowx, row in enumerate(source):
        #if row[u'Target Type'] == u'Starting url':
        artist_name = row[u'Artist']
        title = row[u'Title']

        print >> sys.stderr, artist_name, u'-', title

        artist_query_name = artist_name.split(u' ft ')[0]
        try:
            item = lookup_dbpedia_resource(artist_query_name)
            u, l = item or (None, None)
            artist_info = lookup_artist_info(l)
        except SPARQLExceptions.QueryBadFormed:
            artist_info = {}
        print >> sys.stderr, (artist_info.values()) if artist_info else None

        time.sleep(1)
        if not artist_info:
            continue

        # *sigh* sometimes DBPedia has a URL for country. More often a literal string
        country = artist_info.get(u'birthplace_country', u'').split(u'/')[-1].replace(u'_', u' ')
        row_data = [
            row[u'Prefix'],
            row[u'CH'],
            row[u'High'],
            artist_name,
            artist_info.get(u'res', u''),
            artist_info.get(u'pictureURI', u''),
            artist_info.get(u'birthplace_city', u''),
            ,
            title,
            row[u'Label'],
            row[u'Date Entered'],
            row[u'Date Left'],
        ]
        for colx, val in enumerate(row_data):
            sheet.write(count, colx, val, data_xfs[colx])
        print >> sys.stderr, "Added ({0}).".format(count)
        count += 1
        if count > limit: break

    return

 # Handle command-line arguments
 import sys
 from akara.thirdparty import argparse  # Sorry PEP 8 ;)

 if __name__ == '__main__':
    # python wayin2ejson.py < wayinbites-sample.json

    book = xlwt.Workbook()
    sheet = book.add_sheet('britburn')
    bb_199x = dict_from_xls(file(sys.argv[1]), 3)

    run(source=bb_199x , book=book, limit=int(sys.argv[3]))

    book.save(sys.argv[2])

    sys.exit(0)
	#!/usr/bin/env python
	# -- coding: utf-8 --#import itertools

	#Useful re DBPedia / SPARQL: http://markwatson.com/blog/2013-08/python-sparql-client-example.html
	#Useful re DBPedia alternate lookup names & image retrieval: http://www.snee.com/bobdc.blog/2011/05/using-sparql-to-find-the-right.html

	import sys
	import time

	from amara.thirdparty import httplib2, json

	from datachef.squaredata import dict_from_xls
	from datachef.exhibit import emitter # https://github.com/uogbuji/datachef
	import xlwt
	from SPARQLWrapper import SPARQLWrapper, SPARQLExceptions, JSON # https://pypi.python.org/pypi/SPARQLWrapper


	ezxf = xlwt.easyxf

	SPARQL = SPARQLWrapper("http://dbpedia.org/sparql")

	#"Lucky DBPedia" query
	LUCKY_DBPEDIA_TPL = u'''SELECT DISTINCT ?uri ?txt WHERE {{
	?uri rdfs:label ?txt .
	?txt bif:contains '"{qstr}"' .
	}} LIMIT 1'''


	PROPERTIES_TPL = u'''SELECT DISTINCT ?property WHERE {{
	<{uri}> ?property ?value .
	}}'''

	#Useful: http://stackoverflow.com/questions/5161568/how-to-properly-use-sparql-optional-to-retrieve-attributes-for-a-resource-that-m
	ARTIST_INFO_TPL = u'''\
	PREFIX db: <http://dbpedia.org/ontology/>
	PREFIX dbp: <http://dbpedia.org/property/>
	PREFIX foaf: <http://xmlns.com/foaf/0.1/>

	SELECT DISTINCT * WHERE {{
	?res rdfs:label '{label}'@en .
	?res foaf:depiction ?pictureURI .
	?res db:birthPlace ?birthplace .
	OPTIONAL {{
	?birthplace rdfs:label ?birthplace_city .
	}}
	OPTIONAL {{
	?birthplace dbp:country ?birthplace_country .
	}}
	FILTER (LANG(?birthplace_city) = 'en')
	}} LIMIT 2
	'''


	def build_query(template, **kwargs):
	trace = kwargs.get('_sparql_trace', False)
	qstr = template.format(**kwargs)
	SPARQL.setQuery(qstr)
	SPARQL.setReturnFormat(JSON)

	try:
	results = SPARQL.query().convert()
	except SPARQLExceptions.QueryBadFormed:
	print >> sys.stderr, qstr
	raise

	if trace:
	print >> sys.stderr, results
	return results


	def lookup_dbpedia_resource(name):
	results = build_query(LUCKY_DBPEDIA_TPL, qstr=name)
	if results[u'results'][u'bindings']:
	return results[u'results'][u'bindings'][0][u'uri'][u'value'], results[u'results'][u'bindings'][0][u'txt'][u'value']
	else:
	return None


	def dig_dbpedia_obj(uri):
	#DESCRIBE <uri>
	results = build_query(PROPERTIES_TPL, uri=uri, _sparql_trace=True)
	properties = [ r[u'property'][u'value'] for r in results[u'results'][u'bindings'] ]
	return properties


	def lookup_artist_info(label):
	'''
	>>> u, l = lookup_dbpedia_resource(u'Elton John')
	>>> lookup_artist_info(u)
	(u'http://dbpedia.org/resource/Elton_John', u'http://upload.wikimedia.org/wikipedia/commons/d/d1/Elton_John_2011_Shankbone_2.JPG')
	'''
	results = build_query(ARTIST_INFO_TPL, label=label) # , _sparql_trace=True
	if results[u'results'][u'bindings']:
	info = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][0].items() ])
	if len(results[u'results'][u'bindings']) > 1:
	part2 = dict([ (k, v[u'value']) for (k, v) in results[u'results'][u'bindings'][1].items() ])
	part2.update(info)
	info = part2
	return info
	else:
	return {}

	def run(source=None, book=None, limit=None):
	heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center')
	kinds = 'text text int int text text text text text text date date'.split()
	kind_to_xf_map = {
	'date': ezxf(num_format_str='yyyy-mm-dd'),
	'int': ezxf(num_format_str='#,##0'),
	'money': ezxf('font: italic on; pattern: pattern solid, fore-colour grey25',
	num_format_str='$#,##0.00'),
	'price': ezxf(num_format_str='#0.000000'),
	'text': ezxf(),
	}

	data_xfs = [kind_to_xf_map[k] for k in kinds]
	headings = [
	u'ID',
	u'Weeks on chart',
	u'High position',
	u'Artist',
	u'Artist Link',
	u'Artist Image',
	u'Artist Birth City',
	u'Artist Birth Country',
	u'Title',
	u'Label',
	u'Date Entered',
	u'Date Left',
	]
	for colx, val in enumerate(headings):
	sheet.write(0, colx, val, heading_xf)

	count = 1
	for rowx, row in enumerate(source):
	#if row[u'Target Type'] == u'Starting url':
	artist_name = row[u'Artist']
	title = row[u'Title']

	print >> sys.stderr, artist_name, u'-', title

	artist_query_name = artist_name.split(u' ft ')[0]
	try:
	item = lookup_dbpedia_resource(artist_query_name)
	u, l = item or (None, None)
	artist_info = lookup_artist_info(l)
	except SPARQLExceptions.QueryBadFormed:
	artist_info = {}
	print >> sys.stderr, (artist_info.values()) if artist_info else None

	time.sleep(1)
	if not artist_info:
	continue

	# sigh sometimes DBPedia has a URL for country. More often a literal string
	country = artist_info.get(u'birthplace_country', u'').split(u'/')[-1].replace(u'_', u' ')
	row_data = [
	row[u'Prefix'],
	row[u'CH'],
	row[u'High'],
	artist_name,
	artist_info.get(u'res', u''),
	artist_info.get(u'pictureURI', u''),
	artist_info.get(u'birthplace_city', u''),
	,
	title,
	row[u'Label'],
	row[u'Date Entered'],
	row[u'Date Left'],
	]
	for colx, val in enumerate(row_data):
	sheet.write(count, colx, val, data_xfs[colx])
	print >> sys.stderr, "Added ({0}).".format(count)
	count += 1
	if count > limit: break

	return

	# Handle command-line arguments
	import sys
	from akara.thirdparty import argparse # Sorry PEP 8 ;)

	if __name__ == '__main__':
	# python wayin2ejson.py < wayinbites-sample.json

	book = xlwt.Workbook()
	sheet = book.add_sheet('britburn')
	bb_199x = dict_from_xls(file(sys.argv[1]), 3)

	run(source=bb_199x , book=book, limit=int(sys.argv[3]))

	book.save(sys.argv[2])

	sys.exit(0)