Skip to content

Instantly share code, notes, and snippets.

@AngryLoki
Last active August 29, 2015 14:25
Show Gist options
  • Save AngryLoki/86f5ad48fafe526d8269 to your computer and use it in GitHub Desktop.
Save AngryLoki/86f5ad48fafe526d8269 to your computer and use it in GitHub Desktop.
import logging
from pprint import pprint
import itertools
import re
import requests
# Autolist endpoint used to obtain the list of candidate item ids.
url = "https://tools.wmflabs.org/autolist/index.php"
logging.basicConfig(level=logging.INFO)

# A single session is shared by every HTTP request this script makes.
s = requests.Session()

# Category members of the "min" Wikipedia combined with the WDQ expression
# "claim[716]" in "not" mode.
# NOTE(review): presumably this selects pages whose item does not yet carry
# P716 -- confirm against the Autolist documentation.
# Alternative (disabled) source:
#   language="eo", category="Asteroidoj de la ĉefa zono"
r = s.get(
    url,
    params=dict(
        project="wikipedia",
        language="min",
        category="Rancangan batopik asteroid",
        depth="0",
        wdq="claim[716]",
        statementlist="",
        run="Run",
        mode_manual="or",
        mode_cat="or",
        mode_wdq="not",
        mode_find="or",
        chunk_size="100000",
        download="1",
    ),
)

# A usable response is expected to start with an item id ("Q...");
# anything else is dumped for inspection and aborts the run.
if r.text.startswith("Q") is False:
    pprint(r.text)
    raise RuntimeError("Invalid text")

# split() with no argument already discards all surrounding whitespace,
# so each token is a bare id.
items = r.text.split()
print(len(items))
# From here on ``url`` points at the Wikidata API instead of Autolist.
url = "https://www.wikidata.org/w/api.php"

# Base query for ``wbgetentities``.  "ids" starts empty and is overwritten
# with a "|"-joined batch of item ids before each request.
params = dict(
    action="wbgetentities",
    ids="",
    props="|".join(("sitelinks", "labels")),
    format="json",
    formatversion="2",
)
def grouper(n, iterable):
    """Yield consecutive tuples of at most ``n`` items taken from ``iterable``.

    Every tuple holds ``n`` items except possibly the last one, which holds
    whatever is left over.  Nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps calling the lambda until it returns the
    # empty tuple, i.e. until ``source`` has been fully consumed.
    yield from iter(lambda: tuple(itertools.islice(source, n)), ())
def label_to_code(label):
    """Turn a label that starts with a number into a numeric code.

    The label must begin with a run of digits, optionally wrapped in an
    opening and/or closing parenthesis, followed by a single space
    (for example ``"(433) Eros"`` or ``"433 Eros"``).

    Args:
        label: Text to inspect.

    Returns:
        ``2000000 + <leading number>`` when the label matches, otherwise
        ``None``.
        NOTE(review): the 2000000 offset is presumably the JPL SPK-ID
        convention for numbered minor planets -- confirm.
    """
    # Raw string: "\(" and "\d" are invalid escape sequences in an ordinary
    # string literal and trigger a warning on current Python versions.
    # re.match anchors at the start of the string, like the "^" it replaces.
    id_search = re.match(r'\(?(\d+)\)? ', label)
    if id_search is None:
        return None
    return 2000000 + int(id_search.group(1))
# Leading parenthesised number followed by a space, e.g. "(433) Eros".
_PAREN_NUMBER_RE = re.compile(r'^\((\d+)\) ')

# Captures the digits that follow the "SPK-ID:" markup on the page returned
# by the JPL lookup.
_SPK_ID_RE = re.compile(
    r'SPK-ID:</b> </font><font face="times,serif" size="-1">(\d+)',
    re.IGNORECASE,
)


def labels_to_code(labels, sitelinks):
    """Work out the numeric code for one entity.

    The code is taken from the first label value, then the first sitelink
    title, that starts with ``"(<number>) "``; the result is
    ``2000000 + <number>``.  If none matches, the entity's name is looked up
    at ``ssd.jpl.nasa.gov/sbdb.cgi`` through the module-level session ``s``
    and the SPK-ID found on the returned page is used.

    Args:
        labels: Mapping of language code to a dict with a ``'value'`` key.
        sitelinks: Mapping of site id to a dict with a ``'title'`` key.

    Returns:
        The code as an ``int``, or ``None`` when no code could be determined
        (no usable name, request failure, or no SPK-ID on the page).

    Raises:
        AssertionError: If the SPK-ID read from the page is not above 2015.
    """
    # Generators keep the lookup lazy: sitelinks are only touched when no
    # label matched.
    candidates = itertools.chain(
        (label['value'] for label in labels.values()),
        (sitelink['title'] for sitelink in sitelinks.values()),
    )
    for title in candidates:
        found = _PAREN_NUMBER_RE.search(title)
        if found is not None:
            return 2000000 + int(found.group(1))

    # Name for the remote lookup.  An entity without a 'min' label used to
    # crash with KeyError here; fall back to its minwiki page title instead.
    name = (labels.get('min', {}).get('value')
            or sitelinks.get('minwiki', {}).get('title'))
    if not name:
        logging.warning("no code and no 'min' label or 'minwiki' title to look up")
        return None
    logging.warning("no code for %s", name)

    try:
        # .text yields the decoded body; str() on the raw bytes would search
        # their "b'...'" repr instead.
        page = s.get('http://ssd.jpl.nasa.gov/sbdb.cgi', params={
            'sstr': name,
        }).text
    except Exception:
        # The lookup is best effort, so a failure skips this entity.  Unlike
        # a bare ``except:``, Ctrl-C and SystemExit still propagate.
        logging.warning("skipping %s", name)
        return None

    spk = _SPK_ID_RE.search(page)
    if spk is None:
        print("!!! ", labels, sitelinks, name)
        return None

    code = int(spk.group(1))
    # Raised explicitly because a plain ``assert`` disappears under
    # ``python -O``.
    # NOTE(review): the 2015 threshold presumably rejects a captured year or
    # other small number -- confirm.
    if code <= 2015:
        raise AssertionError("implausible SPK-ID %d for %s" % (code, name))
    return code
# Append mode keeps the lines written by earlier runs.  The ``with`` block
# closes the file even when a request or lookup raises part-way through.
with open("jpl4.txt", "a") as jpl_out:
    # NOTE(review): the 3600 offset looks like a manual resume point left
    # over from an interrupted run -- confirm before re-running from scratch.
    for group in grouper(50, items[3600:]):
        # Each request asks for one batch of 50 ids, joined with "|".
        params['ids'] = "|".join(group)
        wd = s.get(url, params=params).json()
        if 'entities' not in wd:
            # Dump the unexpected response before aborting.
            pprint(wd)
            raise RuntimeError("'entities' not in wd")
        for ent in wd['entities'].values():
            c = labels_to_code(ent['labels'], ent['sitelinks'])
            if not c:
                continue
            # Tab-separated statement line: item id, P716, quoted code,
            # S143, Q4026990.
            out_line = "%s\tP716\t\"%d\"\tS143\tQ4026990" % (ent['id'], c)
            print(out_line)
            print(out_line, file=jpl_out)
            # Flush per line so results survive a crash later in the run.
            jpl_out.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment