Created
January 21, 2021 18:25
-
-
Save nyimbi/304226c4cec9e169dae9078fd0fec4f2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import sys | |
| import json | |
| from wikipedia2vec import Wikipedia2Vec | |
# Pretrained Wikipedia2Vec model, loaded once when the module is imported.
# It is only used through ev.get_entity_vector() to test whether an entity
# title is known to the model.
ev = Wikipedia2Vec.load("./enwiki_20180420_100d.pkl")

# --- Patterns for markup that is removed or collapsed -----------------------
# Double-brace spans such as {{...}} on a single line.
regex_kakko = r'{{.+?}}'
# Parenthesised spans.
# NOTE(review): each character class lists the same parenthesis twice;
# presumably one of each pair was a full-width parenthesis that was lost in
# transcription -- confirm against the original file.
regex_kakko2 = r'[((].+?[))]'
# Runs of spaces, collapsed to a single space.
regex_space = r'[ ]+'

# --- Patterns for markup that is captured -----------------------------------
# [[target]] or [[target|label]] links; group 1 is everything between the
# double brackets.
regex = r'\[\[(.+?)\]\]'
# '''bold''' spans; group 1 is the bold text.
regex_first = r"'''(.+?)'''"
# <title>...</title> element on one line; group 1 is the page title.
regex_title = r'<title>(.+?)</title>'
def _count_link(m2e, mention, entity):
    """Record one more occurrence of *mention* referring to *entity*."""
    counts = m2e.setdefault(mention, {})
    counts[entity] = counts.get(entity, 0) + 1


def _extract_links(text, title, m2e):
    """Strip wiki markup from one line and list the known mentions it contains.

    Args:
        text: One line of wiki markup.
        title: Title of the page the line belongs to. Bold spans
            ('''...''') are counted as mentions of this title.
        m2e: Mapping ``mention -> {entity: count}``. It is updated in place
            with the links found on this line, so mentions collected on
            earlier lines are still recognised on later ones.

    Returns:
        A dict with two keys: ``"text"`` is the line with brace/parenthesis
        spans removed, bold and link markup replaced by their visible text
        and runs of spaces collapsed; ``"m2e"`` is a list of
        ``{"m": mention, "e": entity}`` dicts, one for every mention in
        ``m2e`` that occurs in the cleaned text and whose most frequent
        entity is accepted by ``ev.get_entity_vector``.
    """
    text = re.sub(regex_kakko, ' ', text)
    text = re.sub(regex_kakko2, ' ', text)

    # Bold spans are treated as alternative names for the page itself.
    for match in re.finditer(regex_first, text):
        mention = match.group(1).strip()
        if mention:
            _count_link(m2e, mention, title)
    text = re.sub(regex_first, r'\1', text)

    # finditer keeps scanning the string object it was given, so rebinding
    # ``text`` inside the loop does not disturb the iteration.
    for match in re.finditer(regex, text):
        parts = match.group(1).split("|")
        if not parts[0].strip():
            continue
        if len(parts) == 1:
            # [[target]]: the target doubles as the visible text.
            entity = mention = parts[0]
        elif len(parts) == 2:
            # [[target|label]]
            entity, mention = parts
        else:
            # More than one pipe: the link is left in the text untouched.
            continue
        # Replace every occurrence of this exact link by its visible text.
        text = text.replace(match.group(0), mention)
        _count_link(m2e, mention, entity)

    text = re.sub(regex_space, ' ', text)

    out = []
    for mention, counts in m2e.items():
        if mention not in text:
            continue
        # Most frequent entity for this mention; ties go to the first seen.
        entity = max(counts, key=counts.get).strip()
        try:
            ev.get_entity_vector(entity)
        except KeyError:
            # The model does not accept this entity: skip the mention.
            continue
        out.append({"m": mention.strip(), "e": entity})
    return {"text": text.strip(), "m2e": out}
def main():
    """Read a wiki XML dump from stdin and print annotated text lines as JSON.

    The dump is scanned line by line with a small state machine that tracks
    whether the current line is inside a ``<page>`` and inside its ``<text>``
    element. Every non-blank text line of a page whose title is accepted by
    ``ev.get_entity_vector`` is passed to ``_extract_links``; results that
    contain at least one mention are printed as one JSON object per line.

    Pages containing ``#REDIRECT`` are abandoned at that line.

    NOTE: a line containing ``</text`` is never processed, so the closing
    line of an article (and an article whose text fits on a single line) is
    skipped.
    """
    page = False   # currently inside a <page> element
    title = False  # title of the current page; False until one is parsed
    text = False   # currently inside the <text> element
    m2e = {}       # mention -> {entity: count}, accumulated over one page

    for line in sys.stdin:
        if not page:
            # Ignore everything up to the next page opening; the opening
            # line itself is not inspected any further.
            if "<page" in line:
                page = True
            continue

        if "#REDIRECT" in line:
            page, title, text, m2e = False, False, False, {}
            continue

        if "<title" in line:
            # re.search returns None instead of raising when the line has
            # no complete <title>...</title>; the page is then treated as
            # having no title.
            found = re.search(regex_title, line)
            title = found.group(1) if found else False
        if "<text" in line:
            text = True
        if "</text" in line:
            text = False
        if "</page" in line:
            page, title, text, m2e = False, False, False, {}

        if not (text and line.strip()):
            continue

        if not title:
            # Text without a usable title: drop the page. The whole state is
            # reset so a stale ``text`` flag cannot leak into the next page.
            page, title, text, m2e = False, False, False, {}
            continue

        try:
            ev.get_entity_vector(title.strip())
        except KeyError:
            # The model does not accept this page title: skip the line.
            continue

        record = _extract_links(line, title, m2e)
        if record["m2e"]:
            print(json.dumps(record, ensure_ascii=False))
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment