Last active
June 6, 2017 12:38
-
-
Save alexeyev/b2835b03d03ef07a6176b27e87bea887 to your computer and use it in GitHub Desktop.
Извлечение именованных сущностей из текстов, уложенных в .docx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
"""
Script for extracting people's names from a collection of texts about museums.
"""
import csv
import os

from docx import Document
from natasha import Combinator
from natasha.grammars import Person
from natasha.grammars.person import PersonObject
from natasha.grammars.person.grammars import ProbabilisticPerson
from yargy.interpretation import InterpretationEngine
class Txt(object):
    """Plain record describing one source document.

    Attributes:
        path: location of the file the text was read from.
        date: value stored as the document's date.
        author: value stored as the document's author.
        text: body of the document.
    """

    def __init__(self, path, date, author, txt):
        self.path = path
        self.date = date
        self.author = author
        # The constructor argument is named ``txt`` but is exposed as ``text``.
        self.text = txt
# Parse the .docx files.
# Expected layout: TEXTS_DIR/<dir>/<subdir>/<file>.docx
TEXTS_DIR = "texts/"
texts = []
for top_name in os.listdir(TEXTS_DIR):
    # Paths are built with "/" on purpose: the CSV writer further down
    # splits ``Txt.path`` on "/" to recover the directory components.
    top_path = TEXTS_DIR + top_name
    if not os.path.isdir(top_path):
        continue
    for sub_name in os.listdir(top_path):
        sub_path = top_path + "/" + sub_name
        if not os.path.isdir(sub_path):
            continue
        for file_name in os.listdir(sub_path):
            # Names starting with "~" are skipped (presumably editor
            # lock/temporary files -- confirm against the data set).
            if not file_name.endswith(".docx") or file_name.startswith("~"):
                continue
            filepath = sub_path + "/" + file_name
            paragraphs = [para.text for para in Document(filepath).paragraphs]
            # Pad short documents so that the date/author lookups below do
            # not raise IndexError; missing fields become empty strings.
            paragraphs += [""] * (2 - len(paragraphs))
            texts.append(
                Txt(filepath,
                    paragraphs[0],  # first paragraph is stored as the date
                    paragraphs[1],  # second paragraph is stored as the author
                    "\n".join(paragraphs[2:])))
# Named-entity extraction.
# TODO: some tuning may be needed here.
# Grammars that the combinator matches against the input text.
combinator = Combinator([
    Person,
    ProbabilisticPerson,
])
# Engine that turns grammar matches into PersonObject instances.
persons_engine = InterpretationEngine(PersonObject)
def extract(text):
    """Extract person names from ``text``.

    Runs the module-level ``combinator`` over the text, resolves the matches
    non-strictly and interprets them with ``persons_engine``.

    Returns:
        A list of dicts, one per interpreted person, with the keys
        ``"firstname"``, ``"middlename"`` and ``"lastname"`` holding the
        normalized name parts (a part may be empty/None -- callers in this
        file pass the values through ``empty_if_none``).
    """
    resolved = combinator.resolve_matches(combinator.extract(text), strict=False)
    # Convert the parser output into a more readable shape:
    # (rule, [list, of, original, matches])
    matches = [(grammar, list(tokens)) for (grammar, tokens) in resolved]
    return [
        {
            "firstname": person.normalized_firstname,
            "middlename": person.normalized_middlename,
            "lastname": person.normalized_lastname,
        }
        for person in persons_engine.extract(matches)
    ]
def empty_if_none(s):
    """Return ``s`` unchanged when it is truthy, otherwise an empty string.

    Despite the name, every falsy value (``None``, ``""``, ``0``, ...) is
    mapped to ``""``, not only ``None``.
    """
    return s if s else ""
# Write one CSV row per extracted person.
# newline='' is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate "\n" end up with extra blank rows.
with open('museums_ner2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Each stored path looks like "texts/<dir>/<subdir>/<file>"; dropping the
    # first component leaves three path columns, so the header names all
    # three instead of a single "path" column that would misalign the rest.
    writer.writerow(["dir", "subdir", "file",
                     "date", "author", "firstname", "middlename", "lastname"])
    for txt in texts:
        location = txt.path.split("/")[1:]
        for person in extract(txt.text):
            writer.writerow(
                location +
                [txt.date,
                 txt.author,
                 empty_if_none(person["firstname"]),
                 empty_if_none(person["middlename"]),
                 empty_if_none(person["lastname"])])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment