Skip to content

Instantly share code, notes, and snippets.

@ettorerizza
Last active September 9, 2017 15:56
Show Gist options
  • Save ettorerizza/00693ff2adaa9d061344ad6ac7765f2c to your computer and use it in GitHub Desktop.
Save ettorerizza/00693ff2adaa9d061344ad6ac7765f2c to your computer and use it in GitHub Desktop.
Python parser for Wikidata NECKAR dumps (http://event.ifi.uni-heidelberg.de/?page_id=429)
import pandas as pd
import simplejson as json
import gzip
def getTargetIds(jsonData):
    """Extract the exported fields from one JSON-encoded record.

    Args:
        jsonData: JSON text encoding a single object.

    Returns:
        A tuple of eight strings, in this order: ``id``, ``norm_name``,
        ``description``, ``date_birth``, ``date_death``, ``gender``,
        ``en_sitelink`` and the ``alias`` entries joined with ``"||"``.
        A key that is absent from the record yields the placeholder
        ``'null'``.
    """
    data = json.loads(jsonData)

    scalar_keys = (
        'id',
        'norm_name',
        'description',
        'date_birth',
        'date_death',
        'gender',
        'en_sitelink',
    )
    fields = [str(data.get(key, 'null')) for key in scalar_keys]

    # The alias value is joined, so it must not default to the *string*
    # 'null': joining a string iterates its characters and would produce
    # "n||u||l||l" for every record that has no aliases.
    aliases = data.get('alias')
    if aliases is None:
        alias_field = 'null'
    elif isinstance(aliases, str):
        # A bare string is kept whole rather than split into characters.
        alias_field = aliases
    else:
        # str() guards against non-string items, which str.join rejects.
        alias_field = "||".join(str(alias) for alias in aliases)
    fields.append(alias_field)

    return tuple(fields)
# Step 1: stream the gzipped dump line by line, convert each line with
# getTargetIds, and write the fields to an intermediate text file, one
# record per line, fields separated by "::".
# NOTE(review): a field value that itself contains "::" or a newline would
# shift or split columns in result.txt -- confirm the dump cannot contain
# them, or switch to a properly quoted format.
with gzip.open('WikidataNE_20170320_Persons_NECKAR_1_0.json_.gz', "r") as infile, \
        open(r'result.txt', "w", encoding="utf8") as outfile:
    for row in infile:
        outfile.write("::".join(getTargetIds(row)) + "\n")

# Step 2: load the intermediate file into a DataFrame. A separator longer
# than one character is only supported by pandas' python parsing engine;
# selecting it explicitly avoids the ParserWarning emitted when pandas has
# to fall back from the default C engine.
df = pd.read_csv(
    r'result.txt',
    sep="::",
    index_col=False,
    header=None,
    encoding="utf8",
    engine="python",
)

# Step 3: name the columns (same order as the tuple built by getTargetIds)
# and export the table as CSV.
df.columns = ['id', 'nom', 'description', 'birth', 'death', 'gender', 'en_wiki', 'alias']
df.to_csv(r'wikidata_per_neckar.csv', encoding="utf8")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment