Last active
August 29, 2015 14:25
-
-
Save AngryLoki/86f5ad48fafe526d8269 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
from pprint import pprint | |
import itertools | |
import re | |
import requests | |
# Autolist endpoint used to obtain the list of Wikidata item ids to process.
url = "https://tools.wmflabs.org/autolist/index.php"
logging.basicConfig(level=logging.INFO)

# One HTTP session is shared by every request this script makes.
s = requests.Session()

# NOTE(review): "wdq": "claim[716]" combined with "mode_wdq": "not" presumably
# selects category members that do NOT yet have a P716 claim -- confirm
# against the Autolist documentation.
autolist_query = {
    "project": "wikipedia",
    # Alternative source category kept for reference:
    # "language": "eo",
    # "category": "Asteroidoj de la ĉefa zono",
    "language": "min",
    "category": "Rancangan batopik asteroid",
    "depth": "0",
    "wdq": "claim[716]",
    "statementlist": "",
    "run": "Run",
    "mode_manual": "or",
    "mode_cat": "or",
    "mode_wdq": "not",
    "mode_find": "or",
    "chunk_size": "100000",
    "download": "1",
}
r = s.get(url, params=autolist_query)

# The download is expected to start with an item id ("Q..."); anything else is
# dumped for inspection and treated as a failure.
autolist_body = r.text
if not autolist_body.startswith("Q"):
    pprint(autolist_body)
    raise RuntimeError("Invalid text")

# One item id per whitespace-separated token.
items = [token.strip() for token in autolist_body.split()]
print(len(items))
# From here on `url` points at the Wikidata API instead of Autolist.
url = "https://www.wikidata.org/w/api.php"

# Base query for wbgetentities; "ids" starts empty and is overwritten with a
# "|"-joined batch of item ids before each request.
params = dict(
    action="wbgetentities",
    ids="",
    props="sitelinks|labels",
    format="json",
    formatversion="2",
)
def grouper(n, iterable):
    """Yield successive tuples of up to *n* items drawn from *iterable*.

    Every tuple except possibly the last holds exactly *n* items; the last
    one holds whatever remains. Nothing is yielded for an empty iterable.
    """
    source = iter(iterable)

    def take_batch():
        return tuple(itertools.islice(source, n))

    # Two-argument iter() keeps calling take_batch() until it returns the
    # sentinel -- an empty tuple -- which means `source` is exhausted.
    for batch in iter(take_batch, ()):
        yield batch
def label_to_code(label):
    """Derive a numeric code from a label that starts with a number.

    The label must begin with a run of digits, optionally wrapped in
    parentheses, followed by a single space -- e.g. "(433) Eros" or
    "433 Eros".

    Args:
        label: The label text to inspect.

    Returns:
        2000000 plus the leading number, or None when the label does not
        start with such a number.
    """
    # Raw string: "\(" and "\d" are regex escapes, not string escapes, and a
    # non-raw literal triggers invalid-escape-sequence warnings.
    id_search = re.search(r'^\(?(\d+)\)? ', label)
    if id_search is None:
        return None
    # NOTE(review): the 2000000 offset presumably follows the JPL SPK-ID
    # numbering for numbered asteroids -- confirm.
    return 2000000 + int(id_search.group(1))
def _parenthesized_number(text):
    """Return the integer N from a leading "(N) " prefix of *text*, else None."""
    found = re.search(r'^\((\d+)\) ', text)
    if found is None:
        return None
    return int(found.group(1))


def labels_to_code(labels, sitelinks):
    """Work out the numeric code for one Wikidata entity.

    The code is taken from the first source that yields one:

    1. any label whose value starts with "(N) ",
    2. any sitelink whose title starts with "(N) ",
    3. a lookup of the entity's 'min' label in the JPL small-body database
       page, from which the number following "SPK-ID:" is scraped.

    Args:
        labels: Mapping of language code to a dict with a 'value' entry.
        sitelinks: Mapping of site id to a dict with a 'title' entry.

    Returns:
        The integer code, or None when no code could be determined (no
        usable name, request failure, or no SPK-ID found in the page).

    Raises:
        AssertionError: If the scraped SPK-ID is not greater than 2015.
    """
    # NOTE(review): the 2000000 offset presumably mirrors the JPL SPK-ID
    # numbering for numbered asteroids -- confirm.
    for label in labels.values():
        number = _parenthesized_number(label['value'])
        if number is not None:
            return 2000000 + number
    for sitelink in sitelinks.values():
        number = _parenthesized_number(sitelink['title'])
        if number is not None:
            return 2000000 + number

    # The web lookup is keyed on the 'min' label; without one there is
    # nothing to search for, so report it instead of failing with KeyError.
    min_label = labels.get('min')
    if min_label is None:
        logging.warning("no code and no 'min' label to look up")
        return None
    name = min_label['value']
    logging.warning("no code for " + name)

    try:
        # `s` is the module-level requests session.
        page = s.get('http://ssd.jpl.nasa.gov/sbdb.cgi', params={
            'sstr': name
        }).text
    except requests.RequestException:
        # Best effort: a failed request skips this entity. Only request
        # errors are caught, so Ctrl-C still stops the script.
        logging.warning("skipping " + name)
        return None

    c_s = re.search(
        r'SPK-ID:</b> </font><font face="times,serif" size="-1">(\d+)',
        page, re.IGNORECASE)
    if c_s is None:
        print("!!! ", labels, sitelinks, name)
        return None

    code = int(c_s.group(1))
    # Explicit check instead of `assert`, so it still runs under `python -O`.
    # NOTE(review): the 2015 threshold presumably rejects tiny or year-like
    # numbers captured by mistake -- confirm.
    if code <= 2015:
        raise AssertionError("implausible SPK-ID %d for %s" % (code, name))
    return code
#if code: ['minwiki']['title'] | |
# return code | |
#for lang in labels.values(): | |
# code = label_to_code(lang['value']) | |
# if code: | |
# return code | |
#pprint(labels) | |
# raise RuntimeError("Invalid name") | |
# Number of item ids sent per wbgetentities request.
# NOTE(review): 50 presumably matches the API's per-request id limit -- confirm.
BATCH_SIZE = 50

# Index into `items` at which processing starts; earlier items are skipped.
# NOTE(review): looks like a resume offset left over from an interrupted run
# (the output file is opened in append mode) -- confirm before reusing.
RESUME_FROM = 3600

# The context manager guarantees the output file is closed even when a
# request or a lookup raises part-way through.
with open("jpl4.txt", "a", encoding="utf-8") as jpl_out:
    for group in grouper(BATCH_SIZE, items[RESUME_FROM:]):
        params['ids'] = "|".join(group)
        wd = s.get(url, params=params).json()
        if 'entities' not in wd:
            # Dump the unexpected response so the failure can be diagnosed.
            pprint(wd)
            raise RuntimeError("'entities' not in wd")

        for ent in wd['entities'].values():
            c = labels_to_code(ent['labels'], ent['sitelinks'])
            if not c:
                continue
            # Tab-separated statement line: item, P716, quoted code, then a
            # S143/Q4026990 source pair.
            # NOTE(review): looks like QuickStatements input -- confirm.
            out_line = "%s\tP716\t\"%d\"\tS143\tQ4026990" % (ent['id'], c)
            print(out_line)
            print(out_line, file=jpl_out)
            # Flush per record so written lines survive an interruption.
            jpl_out.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment