nyimbi · January 21, 2021 18:25
diff --git a/extractor_for_entitylinking.py b/extractor_for_entitylinking.py
 import re
 import sys
 import json
 from wikipedia2vec import Wikipedia2Vec

 # Load the pretrained Wikipedia2Vec model at import time. `ev` is used
 # further down only through `ev.get_entity_vector(...)`, which raises
 # KeyError for titles the model does not know.
 ev = Wikipedia2Vec.load("./enwiki_20180420_100d.pkl")

 # [[target]] or [[target|anchor]] link; group 1 is everything between the brackets.
 regex = r'\[\[(.+?)\]\]'
 # '''bold''' span; group 1 is the bold text.
 regex_first = r"'''(.+?)'''"
 # <title>...</title> element on a single line; group 1 is the title.
 regex_title = r'<title>(.+?)</title>'
 # {{...}} span (shortest match on one line).
 regex_kakko = r'{{.+?}}'
 # Parenthesised span opened/closed by ASCII or full-width parentheses.
 regex_kakko2 = r'[(（].+?[)）]'
 # Run of one or more space characters.
 regex_space =r'[ ]+'

 def _count_link(m2e, entity, mention):
    """Record one more sighting of *mention* pointing at *entity* in *m2e*."""
    per_entity = m2e.setdefault(mention, {})
    per_entity[entity] = per_entity.get(entity, 0) + 1


 def _extract_links(text, title, m2e):
    """Clean one line of wiki-style markup and report the mentions it contains.

    ``{{...}}`` spans and parenthesised spans are replaced by a space, then
    ``'''bold'''`` spans and ``[[target]]`` / ``[[target|anchor]]`` links are
    unwrapped to their visible text. Every bold span is counted as a mention
    of *title*; every link is counted as a mention (its anchor text) of its
    target. Counts accumulate in *m2e*, which the caller passes in again for
    later lines, so a mention recorded earlier is also reported when it
    shows up later as plain text.

    Args:
        text: One line of article text.
        title: Title of the page the line belongs to.
        m2e: Dict mapping mention -> {entity: count}; updated in place.

    Returns:
        A dict with ``"text"`` (the cleaned, stripped line) and ``"m2e"``
        (a list of ``{"m": mention, "e": entity}`` dicts, one per known
        mention that occurs in the cleaned text and whose most frequent
        entity has a vector in ``ev``).
    """
    text = re.sub(regex_kakko, ' ', text)
    text = re.sub(regex_kakko2, ' ', text)

    # Bold spans are treated as alternative names of the page itself.
    for match in re.finditer(regex_first, text):
        mention = match.group(1).strip()
        if mention:
            _count_link(m2e, title, mention)
    text = re.sub(regex_first, r'\1', text)

    # finditer keeps scanning the string it was given, so rebinding `text`
    # inside the loop does not disturb the iteration.
    for match in re.finditer(regex, text):
        parts = match.group(1).split("|")
        if not parts[0].strip():
            continue
        if len(parts) == 1:
            entity = mention = parts[0]
        elif len(parts) == 2:
            entity, mention = parts
        else:
            # Links with more than one "|" are neither unwrapped nor counted.
            continue
        # match.group(0) is exactly the "[[...]]" markup of this link.
        text = text.replace(match.group(0), mention)
        _count_link(m2e, entity, mention)

    text = re.sub(regex_space, ' ', text)

    out = []
    for mention, counts in m2e.items():
        shown = mention.strip()
        # An empty or blank mention is a substring of almost every line and
        # would be reported as {"m": ""}; it carries no information.
        if not shown or mention not in text:
            continue
        entity = max(counts.items(), key=lambda kv: kv[1])[0].strip()
        try:
            ev.get_entity_vector(entity)
        except KeyError:
            # Entity unknown to the embedding model: leave it out.
            continue
        out.append({"m": shown, "e": entity})
    return {"text": text.strip(), "m2e": out}

 def main():
    """Read wiki-style XML from stdin and print one JSON object per useful line.

    The input is scanned line by line with three pieces of state: whether a
    ``<page`` has been opened, the current page title, and whether the
    scanner is inside the ``<text`` element. A page is dropped as soon as a
    line containing ``#REDIRECT`` is seen. Every non-blank line inside the
    text element of a page whose title has a vector in ``ev`` is passed to
    ``_extract_links``; the result is printed as JSON when it contains at
    least one linked mention.
    """
    page = False
    title = False
    text = False
    m2e = {}
    for line in sys.stdin:
        if not page:
            # The line that opens a page is not examined any further.
            if "<page" in line:
                page = True
            continue

        if "#REDIRECT" in line:
            page = title = text = False
            m2e = {}
            continue

        if "<title" in line:
            # re.search returns None instead of raising when the line does
            # not hold a complete <title>...</title> element.
            found = re.search(regex_title, line)
            title = found.group(1) if found else False
        if "<text" in line:
            text = True
        # NOTE(review): the flag is cleared before the check below, so a
        # line containing "</text" is never passed to _extract_links.
        if "</text" in line:
            text = False
        if "</page" in line:
            page = title = text = False
            m2e = {}

        if not (text and line.strip()):
            continue
        if not title:
            # Text without a usable title: abandon the page and clear all
            # state so nothing leaks into the next page.
            page = title = text = False
            m2e = {}
            continue
        try:
            ev.get_entity_vector(title.strip())
        except KeyError:
            # The page itself is unknown to the embedding model.
            continue
        record = _extract_links(line, title, m2e)
        if record["m2e"]:
            print(json.dumps(record, ensure_ascii=False))


 # Run the extractor only when this file is executed as a script.
 if __name__ == "__main__":
    main()
	import re
	import sys
	import json
	from wikipedia2vec import Wikipedia2Vec

	# Load the pretrained Wikipedia2Vec model at import time. `ev` is used
	# further down only through `ev.get_entity_vector(...)`, which raises
	# KeyError for titles the model does not know.
	ev = Wikipedia2Vec.load("./enwiki_20180420_100d.pkl")

	# [[target]] or [[target|anchor]] link; group 1 is everything between the brackets.
	regex = r'\[\[(.+?)\]\]'
	# '''bold''' span; group 1 is the bold text.
	regex_first = r"'''(.+?)'''"
	# <title>...</title> element on a single line; group 1 is the title.
	regex_title = r'<title>(.+?)</title>'
	# {{...}} span (shortest match on one line).
	regex_kakko = r'{{.+?}}'
	# Parenthesised span opened/closed by ASCII or full-width parentheses.
	regex_kakko2 = r'[(（].+?[)）]'
	# Run of one or more space characters.
	regex_space =r'[ ]+'

	def _count_link(m2e, entity, mention):
	    """Record one more sighting of *mention* pointing at *entity* in *m2e*."""
	    per_entity = m2e.setdefault(mention, {})
	    per_entity[entity] = per_entity.get(entity, 0) + 1


	def _extract_links(text, title, m2e):
	    """Clean one line of wiki-style markup and report the mentions it contains.

	    ``{{...}}`` spans and parenthesised spans are replaced by a space, then
	    ``'''bold'''`` spans and ``[[target]]`` / ``[[target|anchor]]`` links are
	    unwrapped to their visible text. Every bold span is counted as a mention
	    of *title*; every link is counted as a mention (its anchor text) of its
	    target. Counts accumulate in *m2e*, which the caller passes in again for
	    later lines, so a mention recorded earlier is also reported when it
	    shows up later as plain text.

	    Args:
	        text: One line of article text.
	        title: Title of the page the line belongs to.
	        m2e: Dict mapping mention -> {entity: count}; updated in place.

	    Returns:
	        A dict with ``"text"`` (the cleaned, stripped line) and ``"m2e"``
	        (a list of ``{"m": mention, "e": entity}`` dicts, one per known
	        mention that occurs in the cleaned text and whose most frequent
	        entity has a vector in ``ev``).
	    """
	    text = re.sub(regex_kakko, ' ', text)
	    text = re.sub(regex_kakko2, ' ', text)

	    # Bold spans are treated as alternative names of the page itself.
	    for match in re.finditer(regex_first, text):
	        mention = match.group(1).strip()
	        if mention:
	            _count_link(m2e, title, mention)
	    text = re.sub(regex_first, r'\1', text)

	    # finditer keeps scanning the string it was given, so rebinding `text`
	    # inside the loop does not disturb the iteration.
	    for match in re.finditer(regex, text):
	        # Target and anchor are separated by a plain "|" character.
	        parts = match.group(1).split("|")
	        if not parts[0].strip():
	            continue
	        if len(parts) == 1:
	            entity = mention = parts[0]
	        elif len(parts) == 2:
	            entity, mention = parts
	        else:
	            # Links with more than one "|" are neither unwrapped nor counted.
	            continue
	        # match.group(0) is exactly the "[[...]]" markup of this link.
	        text = text.replace(match.group(0), mention)
	        _count_link(m2e, entity, mention)

	    text = re.sub(regex_space, ' ', text)

	    out = []
	    for mention, counts in m2e.items():
	        shown = mention.strip()
	        # An empty or blank mention is a substring of almost every line and
	        # would be reported as {"m": ""}; it carries no information.
	        if not shown or mention not in text:
	            continue
	        entity = max(counts.items(), key=lambda kv: kv[1])[0].strip()
	        try:
	            ev.get_entity_vector(entity)
	        except KeyError:
	            # Entity unknown to the embedding model: leave it out.
	            continue
	        out.append({"m": shown, "e": entity})
	    return {"text": text.strip(), "m2e": out}

	def main():
	    """Read wiki-style XML from stdin and print one JSON object per useful line.

	    The input is scanned line by line with three pieces of state: whether a
	    ``<page`` has been opened, the current page title, and whether the
	    scanner is inside the ``<text`` element. A page is dropped as soon as a
	    line containing ``#REDIRECT`` is seen. Every non-blank line inside the
	    text element of a page whose title has a vector in ``ev`` is passed to
	    ``_extract_links``; the result is printed as JSON when it contains at
	    least one linked mention.
	    """
	    page = False
	    title = False
	    text = False
	    m2e = {}
	    for line in sys.stdin:
	        if not page:
	            # The line that opens a page is not examined any further.
	            if "<page" in line:
	                page = True
	            continue

	        if "#REDIRECT" in line:
	            page = title = text = False
	            m2e = {}
	            continue

	        if "<title" in line:
	            # re.search returns None instead of raising when the line does
	            # not hold a complete <title>...</title> element.
	            found = re.search(regex_title, line)
	            title = found.group(1) if found else False
	        if "<text" in line:
	            text = True
	        # NOTE(review): the flag is cleared before the check below, so a
	        # line containing "</text" is never passed to _extract_links.
	        if "</text" in line:
	            text = False
	        if "</page" in line:
	            page = title = text = False
	            m2e = {}

	        if not (text and line.strip()):
	            continue
	        if not title:
	            # Text without a usable title: abandon the page and clear all
	            # state so nothing leaks into the next page.
	            page = title = text = False
	            m2e = {}
	            continue
	        try:
	            ev.get_entity_vector(title.strip())
	        except KeyError:
	            # The page itself is unknown to the embedding model.
	            continue
	        record = _extract_links(line, title, m2e)
	        if record["m2e"]:
	            print(json.dumps(record, ensure_ascii=False))


	# Run the extractor only when this file is executed as a script.
	if __name__ == "__main__":
	    main()
No results found