Skip to content

Instantly share code, notes, and snippets.

@alexeyev
Last active June 6, 2017 12:38
Show Gist options
  • Save alexeyev/b2835b03d03ef07a6176b27e87bea887 to your computer and use it in GitHub Desktop.
Save alexeyev/b2835b03d03ef07a6176b27e87bea887 to your computer and use it in GitHub Desktop.
Извлечение именованных сущностей из текстов, уложенных в .docx
# coding: utf-8
"""
Script for extracting people's names from a collection of texts about museums.
"""
import os
from docx import Document
from natasha import Combinator
from natasha.grammars import Person
from natasha.grammars.person import PersonObject
from natasha.grammars.person.grammars import ProbabilisticPerson
from yargy.interpretation import InterpretationEngine
import csv
class Txt(object):
    """Plain record holding the four values passed to the constructor.

    Attributes: ``path``, ``date``, ``author`` and ``text``.
    """

    def __init__(self, path, date, author, txt):
        # The ``txt`` argument is exposed under the attribute name ``text``.
        self.path, self.date, self.author, self.text = path, date, author, txt
# Parse the .docx files.
TEXTS_DIR = "texts/"


def _iter_docx_paths(root):
    """Yield paths of .docx files located exactly two directory levels below *root*.

    *root* must end with "/" because the first level is appended to it
    directly. Paths are built with "/" separators on purpose: the CSV export
    later splits ``Txt.path`` on "/".
    """
    for top_name in os.listdir(root):
        top_path = root + top_name
        if not os.path.isdir(top_path):
            continue
        for sub_name in os.listdir(top_path):
            sub_path = top_path + "/" + sub_name
            if not os.path.isdir(sub_path):
                continue
            for file_name in os.listdir(sub_path):
                # Names starting with "~" are ignored (presumably Word's
                # temporary/lock files -- confirm).
                if file_name.endswith(".docx") and not file_name.startswith("~"):
                    yield sub_path + "/" + file_name


texts = []
for filepath in _iter_docx_paths(TEXTS_DIR):
    paragraphs = [para.text for para in Document(filepath).paragraphs]
    if len(paragraphs) < 2:
        # The first two paragraphs are used as the date and author fields.
        # A document with fewer paragraphs has neither a full header nor any
        # body text, so skip it instead of failing with IndexError.
        continue
    # NOTE(review): the original carried a commented-out `.split(": ")[1]` on
    # both header fields, so the raw paragraphs may look like "label: value";
    # they are stored unmodified here, as before.
    texts.append(
        Txt(filepath,
            paragraphs[0],
            paragraphs[1],
            "\n".join(paragraphs[2:])))
# Named-entity extraction.
# TODO: some tuning may be needed here.
combinator = Combinator([Person, ProbabilisticPerson])
persons_engine = InterpretationEngine(PersonObject)
def extract(text):
    """Extract person names from *text*.

    Runs the module-level ``combinator`` over the text, resolves the matches
    with ``strict=False`` and passes the resulting (grammar, tokens) pairs to
    the module-level ``persons_engine``.

    Returns a list of dicts with the keys "firstname", "middlename" and
    "lastname", filled from the ``normalized_*`` attributes of each object the
    engine produces.
    """
    raw_matches = combinator.extract(text)
    resolved = combinator.resolve_matches(raw_matches, strict=False)
    # Turn the parser output into a more readable shape:
    # (rule, [list, of, original, matches])
    pairs = [(grammar, list(tokens)) for grammar, tokens in resolved]
    found = list(persons_engine.extract(pairs))
    result = []
    for person in found:
        result.append({
            "firstname": person.normalized_firstname,
            "middlename": person.normalized_middlename,
            "lastname": person.normalized_lastname,
        })
    return result
def empty_if_none(s):
    """Return *s* unchanged when it is truthy, otherwise the empty string.

    Despite the name, every falsy value (``None``, ``""``, ``0``, ...) maps
    to ``""``.
    """
    return s or ""
# Write one CSV row per extracted person.
# newline='' is what the csv module requires for files handed to csv.writer
# (otherwise extra blank lines can appear on some platforms); the explicit
# UTF-8 encoding keeps Cyrillic names writable regardless of the locale.
with open('museums_ner2.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Each data row starts with the path components after the first one.
    # For paths of the form "texts/<dir>/<subdir>/<file>" that is three
    # cells, so the header names three path columns rather than a single
    # "path" column, keeping header and data aligned.
    writer.writerow(["dir", "subdir", "file",
                     "date", "author", "firstname", "middlename", "lastname"])
    for txt in texts:
        path_cells = txt.path.split("/")[1:]
        for e in extract(txt.text):
            writer.writerow(
                path_cells +
                [txt.date,
                 txt.author,
                 empty_if_none(e["firstname"]),
                 empty_if_none(e["middlename"]),
                 empty_if_none(e["lastname"])])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment