Skip to content

Instantly share code, notes, and snippets.

@mdamien
Created August 20, 2015 15:28
Show Gist options
  • Save mdamien/d1f1dfa6b36fc20701aa to your computer and use it in GitHub Desktop.
Save mdamien/d1f1dfa6b36fc20701aa to your computer and use it in GitHub Desktop.
find companies in wikidata
import json
from decimal import Decimal
from pprint import pprint as pp

from ijson import items
# Wikidata numeric item ids (the number after the "Q") that mark an entity
# as some kind of company.
COMPANIES = [
    783794,  # company
    279014,  # european society
    134161,  # joint-stock company
    33685,   # limited company
]
def parse_the_shit(path='dump.json'):
    """Stream the elements of the top-level JSON array stored in *path*.

    ``items(f, 'item')`` from ijson parses the file incrementally, so the
    dump is never loaded into memory as a whole.

    Args:
        path: Location of the dump file. Defaults to ``'dump.json'``, the
            path that used to be hard-coded.

    Yields:
        One decoded object per element of the top-level array.
    """
    # Binary mode lets ijson do the UTF-8 decoding itself instead of relying
    # on the platform's default text encoding. The ``with`` block closes the
    # file once the generator is exhausted or closed.
    with open(path, 'rb') as f:
        yield from items(f, 'item')
def name(it):
    """Return the English label of entity *it*, or ``'<-nope->'`` if absent."""
    labels = it.get('labels', {})
    english = labels.get('en', {})
    return english.get('value', '<-nope->')
def is_comp(it):
    """Return the company class id that entity *it* is tagged with, if any.

    The claims for properties P910, P1454 and P31 are inspected in that
    order. The first statement whose ``numeric-id`` is listed in
    ``COMPANIES`` is reported on stdout and its id is returned. ``None`` is
    returned when no statement matches.
    """
    all_claims = it.get('claims', {})
    for prop in ('P910', 'P1454', 'P31'):
        for statement in all_claims.get(prop) or ():
            snak = statement.get('mainsnak', {})
            target = snak.get('datavalue', {}).get('value', {})
            val = target.get('numeric-id')
            if not val or val not in COMPANIES:
                continue
            print(name(it),'(',it['id'],')','is comp',val,'via',prop)
            return val
    return None
def is_sub(it):
    """Return the id of the first item linked from *it* through claim P355.

    The P355 statements of the entity are scanned in order. The first one
    that carries a ``numeric-id`` is reported on stdout and that id is
    returned.

    Args:
        it: Decoded entity dict; ``it['id']`` is read only when a match is
            reported.

    Returns:
        The ``numeric-id`` of the first usable P355 statement, or ``None``
        when the entity has no such statement.
    """
    statements = it.get('claims', {}).get('P355') or ()
    for statement in statements:
        snak = statement.get('mainsnak', {})
        val = snak.get('datavalue', {}).get('value', {}).get('numeric-id')
        if not val:
            # A statement without a target id says nothing useful; keep
            # looking instead of reporting "subsidiary of None" and giving
            # up on the statements that follow.
            continue
        print(name(it),'(',it['id'],')','subsidiary of',val)
        return val
    return None
# Highest zero-based entity index that is still examined before the scan
# stops; keeps a run over the dump short.
SCAN_LIMIT = 30000


def _json_default(obj):
    """Convert values the ``json`` encoder cannot handle; used as ``default=``."""
    if isinstance(obj, Decimal):
        # ijson decodes non-integer JSON numbers as Decimal, which json.dump
        # rejects. Whole numbers stay ints so ids are not turned into floats.
        if obj.is_finite() and obj == obj.to_integral_value():
            return int(obj)
        return float(obj)
    raise TypeError('Object of type %s is not JSON serializable'
                    % type(obj).__name__)


# Collect every scanned entity that either has a P355 link or is tagged as a
# company, then write the matches to companies.json.
data = []
for i, it in enumerate(parse_the_shit()):
    if is_sub(it) or is_comp(it):
        data.append(it)
    if i > SCAN_LIMIT:
        break
print('found',len(data),'comps!')
with open('companies.json','w') as f:
    json.dump(data, f, indent=2, default=_json_default)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment