Created
March 26, 2021 05:51
-
-
Save Ladsgroup/b870447728f251605dadf0e688e9e95e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import os | |
import bz2 | |
import gzip | |
def read_dump(path):
    """Yield one parsed JSON value per line of a dump file.

    The reader is picked from the file name's extension: ``.gz`` is read
    through ``gzip``, ``.bz2`` through ``bz2`` and ``.json`` as a plain file.
    All three are decoded as UTF-8 so the result does not depend on the
    platform's default encoding or on how the dump was compressed.

    Each line is stripped of surrounding whitespace and commas before being
    parsed. Lines that are not valid JSON on their own (for example a lone
    ``[`` or ``]``, or a blank line) are skipped silently.

    Args:
        path: Path to the dump file.

    Yields:
        The value decoded from each line that holds valid JSON.

    Raises:
        NotImplementedError: If the file name does not end in ``.gz``,
            ``.bz2`` or ``.json``. Because this is a generator, the error
            surfaces when iteration starts, not when the function is called.
    """
    file_ = os.path.split(path)[-1]
    if file_.endswith('.gz'):
        opener = gzip.open
    elif file_.endswith('.bz2'):
        opener = bz2.open
    elif file_.endswith('.json'):
        opener = open
    else:
        raise NotImplementedError(f'Reading file {file_} is not supported')

    # Text mode with an explicit encoding gives every format the same decoding;
    # the context manager closes the file even if the consumer stops early.
    with opener(path, 'rt', encoding='utf-8') as f:
        for line in f:
            try:
                yield json.loads(line.strip().strip(','))
            except json.JSONDecodeError:
                # Best-effort reader: non-JSON lines are deliberately ignored.
                continue
def _claim_target_ids(item, prop):
    """Return the ``id`` of the value of every ``prop`` claim on ``item``.

    Claims that do not have the ``mainsnak -> datavalue -> value -> id`` path
    are skipped.
    """
    targets = []
    for claim in item.get('claims', {}).get(prop, []):
        try:
            targets.append(claim['mainsnak']['datavalue']['value']['id'])
        except (KeyError, TypeError):
            # Incomplete or differently shaped claim: nothing to record.
            continue
    return targets


# Export script: for every dump item that has both P31 and P279 claims, write
# one "<item> <target>" line per P31 target followed by one per P279 target.
# The first character of each id is dropped (presumably the "Q" prefix --
# confirm against the dump format). The output file is opened once, truncating
# any previous content, rather than being re-opened for every item.
with open('P279.txt', 'w') as f:
    for item in read_dump(sys.argv[1]):
        id_ = item['id']
        claims = item.get('claims', {})
        if 'P279' not in claims or 'P31' not in claims:
            continue
        p31s = _claim_target_ids(item, 'P31')
        p279s = _claim_target_ids(item, 'P279')
        for target in p31s + p279s:
            f.write(id_[1:] + ' ' + target[1:] + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment