Last active
September 9, 2017 15:56
-
-
Save ettorerizza/00693ff2adaa9d061344ad6ac7765f2c to your computer and use it in GitHub Desktop.
Python parser for Wikidata NECKAR dumps (http://event.ifi.uni-heidelberg.de/?page_id=429)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import simplejson as json | |
import gzip | |
def getTargetIds(jsonData):
    """Extract the exported fields from one JSON record of the dump.

    Args:
        jsonData: A JSON document (``str`` or ``bytes``) encoding one object.

    Returns:
        An 8-tuple of strings, in this order: ``id``, ``norm_name``,
        ``description``, ``date_birth``, ``date_death``, ``gender``,
        ``en_sitelink`` and the ``alias`` entries joined with ``"||"``.
        A key that is absent from the record yields the literal ``'null'``.
    """
    data = json.loads(jsonData)

    scalar_keys = (
        'id',
        'norm_name',
        'description',
        'date_birth',
        'date_death',
        'gender',
        'en_sitelink',
    )
    fields = [str(data.get(key, 'null')) for key in scalar_keys]

    aliases = data.get('alias')
    if aliases is None:
        # A string default must not be fed to str.join: joining the
        # characters of 'null' would produce 'n||u||l||l'.
        alias_field = 'null'
    elif isinstance(aliases, str):
        # A single alias given as a plain string is kept as-is rather than
        # being split into characters.
        alias_field = aliases
    else:
        # str() guards against non-string items, which str.join rejects.
        alias_field = "||".join(str(alias) for alias in aliases)

    return tuple(fields) + (alias_field,)
# Pass 1: stream the gzipped dump (one JSON object per line) and write one
# "::"-separated text row per record to result.txt.
# "rb" is spelled out because gzip.open's "r" already means binary mode; each
# row is handed to the JSON parser as bytes.
with gzip.open('WikidataNE_20170320_Persons_NECKAR_1_0.json_.gz', "rb") as infile, \
        open(r'result.txt', "w", encoding="utf8") as outfile:
    for row in infile:
        if not row.strip():
            # A blank line is not a JSON document; skip it instead of crashing.
            continue
        outfile.write("::".join(getTargetIds(row)) + "\n")

# Pass 2: re-read the intermediate file as a table and export it as CSV.
# A multi-character separator is treated by pandas as a regular expression and
# is only supported by the Python parsing engine; naming the engine explicitly
# avoids the ParserWarning emitted when pandas has to fall back to it.
# NOTE(review): a field value that itself contains "::" would add columns to
# its row and break this parse - confirm the dump never contains it.
df = pd.read_csv(
    r'result.txt',
    sep="::",
    index_col=False,
    header=None,
    encoding="utf8",
    engine="python",
)
df.columns = ['id', 'nom', 'description', 'birth', 'death', 'gender', 'en_wiki', 'alias']
df.to_csv(r'wikidata_per_neckar.csv', encoding="utf8")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment