Skip to content

Instantly share code, notes, and snippets.

@nyimbi
Created January 21, 2021 18:25
Show Gist options
  • Select an option

  • Save nyimbi/304226c4cec9e169dae9078fd0fec4f2 to your computer and use it in GitHub Desktop.

Select an option

Save nyimbi/304226c4cec9e169dae9078fd0fec4f2 to your computer and use it in GitHub Desktop.
import re
import sys
import json
from wikipedia2vec import Wikipedia2Vec
# Wikipedia2Vec model loaded once, at import time, from a file in the current
# working directory. The functions below only call `ev.get_entity_vector(...)`
# and treat a KeyError as "this entity is not in the model".
ev = Wikipedia2Vec.load("./enwiki_20180420_100d.pkl")
# Lazily captures whatever sits between `[[` and `]]` (wiki-link markup).
regex = r'\[\[(.+?)\]\]'
# Lazily captures whatever sits between two triple-single-quote markers.
regex_first = r"'''(.+?)'''"
# Captures the content of a `<title>...</title>` element found on one line.
regex_title = r'<title>(.+?)</title>'
# Matches a `{{...}}` span; `.` does not cross newlines, so single-line only.
regex_kakko = r'{{.+?}}'
# Matches from an opening parenthesis to the nearest closing parenthesis.
# NOTE(review): each character class lists the same ASCII parenthesis twice;
# possibly one of each pair was meant to be a full-width parenthesis that got
# normalised somewhere along the way -- confirm against the upstream source.
regex_kakko2 = r'[((].+?[))]'
# One or more consecutive space characters (collapsed to a single space later).
regex_space =r'[ ]+'
def _count_link(m2e, mention, entity):
    """Increment the count of ``entity`` as a target of ``mention`` in ``m2e``."""
    targets = m2e.setdefault(mention, {})
    targets[entity] = targets.get(entity, 0) + 1


def _extract_links(text, title, m2e):
    """Strip wiki markup from one line and resolve the mentions it contains.

    Args:
        text: One line of wiki markup.
        title: Title of the page the line belongs to. Text wrapped in
            triple single quotes is counted as a mention of this title.
        m2e: Mapping ``mention -> {entity: count}``. It is updated in place,
            so counts accumulate across calls that share the same dict.

    Returns:
        A dict with two keys: ``"text"`` is the line with the handled markup
        removed and runs of spaces collapsed; ``"m2e"`` is a list of
        ``{"m": mention, "e": entity}`` dicts, one for every mention known to
        ``m2e`` that occurs in the cleaned text and whose most frequent
        entity is accepted by ``ev.get_entity_vector``.
    """
    out = []
    # Drop {{...}} spans and parenthesised spans before looking for mentions.
    text = re.sub(regex_kakko, ' ', text)
    text = re.sub(regex_kakko2, ' ', text)

    # Triple-quoted text is recorded as a mention of the page's own title.
    for match in re.finditer(regex_first, text):
        mention = match.group(1).strip()
        if mention:
            _count_link(m2e, mention, title)
    text = re.sub(regex_first, r'\1', text)

    # finditer keeps scanning the string it was given, so rebinding `text`
    # inside the loop does not disturb the iteration.
    for match in re.finditer(regex, text):
        parts = match.group(1).split("|")
        entity = parts[0]
        if not entity.strip():
            continue
        if len(parts) == 1:
            mention = entity
            text = text.replace("[[{}]]".format(entity), entity)
        elif len(parts) == 2:
            mention = parts[1]
            text = text.replace("[[{}|{}]]".format(*parts), mention)
        else:
            # More than one "|" inside the brackets: markup is left as is.
            continue
        if not mention.strip():
            # A link such as [[Foo|]] has no anchor text. Recording it would
            # create an empty mention, and "" is a substring of every line,
            # so it would be emitted for this and all later lines.
            continue
        _count_link(m2e, mention, entity)

    text = re.sub(regex_space, ' ', text)
    for mention, targets in m2e.items():
        # Also guards against blank mentions or empty target tables that a
        # caller-supplied m2e might already contain.
        if not mention.strip() or not targets or mention not in text:
            continue
        # Most frequent target; ties go to the one recorded first.
        entity = max(targets, key=targets.get).strip()
        try:
            ev.get_entity_vector(entity)
        except KeyError:
            # The model has no vector for this entity: skip the mention.
            continue
        out.append({"m": mention.strip(), "e": entity})
    return {"text": text.strip(), "m2e": out}
def main():
    """Stream a page-structured XML dump from stdin and print JSON lines.

    The input is scanned line by line for ``<page``, ``<title``, ``<text``,
    ``</text`` and ``</page`` markers (presumably a MediaWiki XML dump --
    confirm against the data actually fed in). For every non-blank line seen
    while inside a ``<text`` section of a page whose title is accepted by
    ``ev.get_entity_vector``, the line is passed to ``_extract_links`` and
    the result is printed as one JSON object, provided at least one mention
    was resolved. Pages containing ``#REDIRECT`` are dropped from that line on.

    Note that the line carrying ``</text`` is never processed, so a text
    section that opens and closes on the same line yields nothing.
    """
    page = False
    title = False
    text = False
    m2e = {}
    for line in sys.stdin:
        if not page:
            # Outside a page: wait for the next opening tag. The opening
            # line itself is not inspected any further.
            if "<page" in line:
                page = True
            continue

        if "#REDIRECT" in line:
            page, title, text, m2e = False, False, False, {}
            continue

        if "<title" in line:
            # re.search instead of next(re.finditer(...)): a "<title" line
            # without a complete <title>...</title> used to raise
            # StopIteration and abort the whole run. Now the title is simply
            # left unchanged (unset for a fresh page, which skips the page).
            found = re.search(regex_title, line)
            if found:
                title = found.group(1)
        if "<text" in line:
            text = True
        if "</text" in line:
            text = False
        if "</page" in line:
            page, title, text, m2e = False, False, False, {}

        if not (text and line.strip()):
            continue
        if not title:
            # Text without a title: abandon the page. Everything is reset,
            # not just `page`; otherwise `text` stays True and the metadata
            # lines of the next page would be treated as article text.
            page, title, text, m2e = False, False, False, {}
            continue
        try:
            ev.get_entity_vector(title.strip())
        except KeyError:
            # The page's own title has no vector in the model: skip the line.
            continue
        result = _extract_links(line, title, m2e)
        if result["m2e"]:
            print(json.dumps(result, ensure_ascii=False))
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment