Last active
June 18, 2018 07:30
-
-
Save ettorerizza/6f4ea1b8a2f2fe9f0d79ddd66e24f421 to your computer and use it in GitHub Desktop.
Match values against Wikidata using the OpenRefine Wikidata reconciliation API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Fri Jan 5 12:35:54 2018 | |
| @author: ettor | |
| """ | |
| import pandas as pd | |
| import requests | |
| import requests_cache | |
| requests_cache.install_cache('wikidata_cache') | |
def get_wikidata(value, type_id, prop_id='', prop_value='', lang="en"):
    """Return the best Wikidata match for *value* from the OpenRefine-Wikidata API.

    The reconciliation service is queried for entities named *value* whose
    type is *type_id*, optionally constrained by one property/value pair.

    Args:
        value: Label to reconcile (e.g. ``'Binche'``).
        type_id: QID of the expected type (e.g. ``'Q618123'``).
        prop_id: Optional property id (e.g. ``'P17'``) used as an extra
            constraint.
        prop_value: Optional QID the property should point to (e.g. ``'Q31'``).
            The property constraint is sent as soon as either *prop_id* or
            *prop_value* is non-empty.
        lang: Language code inserted into the service URL.

    Returns:
        A tuple ``(main_type, match, name, qid, score)`` describing the best
        candidate, or the string ``"No match"`` when the service returns no
        candidate. Candidates flagged ``match`` by the service are preferred;
        otherwise the highest-scoring candidate is returned. ``main_type`` is
        ``None`` when the candidate carries no type information.

    Raises:
        KeyError: If the response has no ``'result'`` field or a candidate
            lacks one of the ``id``/``name``/``score``/``match`` fields.

    Example : get_wikidata('Binche', 'Q618123', 'P17', 'Q31')
    Result : ('municipality of Belgium', False, 'Binche', 'Q95121', 100.0)
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import block.
    import json

    base_url = "https://tools.wmflabs.org/openrefine-wikidata/%s/api" % (lang)

    # Build the payload as a dict and serialise it with json.dumps so that
    # quotes, backslashes or non-ASCII characters in `value` cannot produce
    # malformed JSON (string interpolation did not escape them).
    payload = {"query": value, "limit": 0, "type": type_id}
    if prop_id or prop_value:
        payload["properties"] = [{"pid": prop_id, "v": {"id": prop_value}}]
    query = {"query": json.dumps(payload)}

    r = requests.get(base_url, params=query)
    json_result = r.json()

    # Assemble each candidate explicitly in the documented field order rather
    # than relying on DataFrame column ordering, which depends on the pandas
    # version when the frame is built from a dict.
    candidates = []
    for d in json_result['result']:
        types = d.get('type') or []
        # A candidate without type info must not discard the whole result set.
        main_type = types[0]['name'] if types else None
        candidates.append(
            (main_type, d['match'], d['name'], d['id'], d['score']))

    if not candidates:
        return "No match"

    # Highest score first; the sort is stable, so ties keep the service's
    # own ranking.
    candidates.sort(key=lambda candidate: candidate[4], reverse=True)

    # Prefer the best-scoring candidate the service flagged as a sure match.
    for candidate in candidates:
        if candidate[1]:
            return candidate
    return candidates[0]
if __name__ == '__main__':
    # Demo run: reconcile 'Binche' once with a property constraint through
    # the French endpoint, then once with the type constraint only.
    demo_calls = (
        ('Binche', 'Q618123', 'P31', 'Q15273785', "fr"),
        ('Binche', 'Q618123'),
    )
    for demo_args in demo_calls:
        print(get_wikidata(*demo_args))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment