alexeyev · June 6, 2017 12:38
diff --git a/museums_ner.py b/museums_ner.py
 # coding: utf-8
 """
    Script for extracting people's names from a collection of texts about museums
 """

 import os

 from docx import Document
 from natasha import Combinator
 from natasha.grammars import Person
 from natasha.grammars.person import PersonObject
 from natasha.grammars.person.grammars import ProbabilisticPerson
 from yargy.interpretation import InterpretationEngine

 import csv


 class Txt(object):
    """Plain record describing one parsed document.

    The constructor arguments are stored unchanged as the attributes
    ``path``, ``date``, ``author`` and ``text`` (the latter from ``txt``).
    """

    def __init__(self, path, date, author, txt):
        # Note the rename: the ``txt`` argument is exposed as ``self.text``.
        self.path, self.date = path, date
        self.author, self.text = author, txt


 # Parsing of the .docx files.

 TEXTS_DIR = "texts/"
 texts = []


 def _iter_docx_paths(root):
    """Yield paths of the .docx files found exactly two directory levels below *root*.

    *root* must end with "/" because paths are built by plain concatenation.
    "/" is used on purpose instead of ``os.path.join``: the CSV-writing code
    later splits ``Txt.path`` on "/" to recover the directory components.
    Directory listings are sorted so that the traversal order (and therefore
    the output order) is reproducible; ``os.listdir`` itself gives no
    ordering guarantee.
    """
    for group in sorted(os.listdir(root)):
        group_path = root + group
        if not os.path.isdir(group_path):
            continue
        for subgroup in sorted(os.listdir(group_path)):
            subgroup_path = group_path + "/" + subgroup
            if not os.path.isdir(subgroup_path):
                continue
            for filename in sorted(os.listdir(subgroup_path)):
                # Names starting with "~" are skipped
                # (presumably Word lock/temporary files -- TODO confirm).
                if filename.endswith(".docx") and not filename.startswith("~"):
                    yield subgroup_path + "/" + filename


 for filepath in _iter_docx_paths(TEXTS_DIR):
    paragraphs = [para.text for para in Document(filepath).paragraphs]

    # The first paragraph is stored as the date and the second as the author.
    # Pad with empty strings so that a document with fewer than two paragraphs
    # does not abort the whole run with an IndexError.
    # NOTE(review): earlier commented-out code applied `.split(": ")[1]` to
    # both values, which suggests they look like "label: value" -- confirm
    # whether the label should be stripped.
    date, author = (paragraphs[:2] + ["", ""])[:2]

    texts.append(Txt(filepath, date, author, "\n".join(paragraphs[2:])))

 # Named-entity extraction.

 # TODO: some tuning may be needed here
 combinator = Combinator([Person, ProbabilisticPerson])

 persons_engine = InterpretationEngine(PersonObject)


 def extract(text):
    """Extract person names from *text*.

    The text is run through the module-level ``combinator``; its matches are
    resolved with ``strict=False`` and passed to ``persons_engine``.

    Returns a list of dicts with the keys "firstname", "middlename" and
    "lastname", filled from the ``normalized_*`` attributes of every
    extracted person object.
    """
    resolved = combinator.resolve_matches(combinator.extract(text), strict=False)

    # Convert the parser output into a more readable form:
    # (rule, [list, of, original, matches])
    pairs = [(grammar, list(tokens)) for grammar, tokens in resolved]

    found = []
    for person in persons_engine.extract(pairs):
        found.append({
            "firstname": person.normalized_firstname,
            "middlename": person.normalized_middlename,
            "lastname": person.normalized_lastname,
        })
    return found


 def empty_if_none(s):
    """Return *s* unchanged when it is truthy, otherwise an empty string.

    Despite the name, every falsy value (None, "", 0, ...) maps to "".
    """
    return s or ""


 # Write one CSV row per extracted person.
 # newline='' is what the csv module requires for files handed to csv.writer
 # (otherwise extra blank lines appear on some platforms); the explicit
 # encoding keeps Cyrillic names independent of the locale default.
 # NOTE(review): this assumes Python 3 -- confirm before running on Python 2.
 with open('museums_ner2.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Every data row carries three path components followed by five fields,
    # so the header needs eight columns (it previously had six and did not
    # line up with the data).
    writer.writerow([
        "dir", "subdir", "file",
        "date", "author",
        "firstname", "middlename", "lastname",
    ])

    for txt in texts:
        # "texts/<dir>/<subdir>/<file>" -> [<dir>, <subdir>, <file>]
        location = txt.path.split("/")[1:]
        for person in extract(txt.text):
            writer.writerow(
                location +
                [txt.date,
                 txt.author,
                 empty_if_none(person["firstname"]),
                 empty_if_none(person["middlename"]),
                 empty_if_none(person["lastname"])])
	# coding: utf-8
	"""
	Script for extracting people's names from a collection of texts about museums
	"""

	import os

	from docx import Document
	from natasha import Combinator
	from natasha.grammars import Person
	from natasha.grammars.person import PersonObject
	from natasha.grammars.person.grammars import ProbabilisticPerson
	from yargy.interpretation import InterpretationEngine

	import csv


	class Txt(object):
	    """Plain record describing one parsed document.

	    The constructor arguments are stored unchanged as the attributes
	    ``path``, ``date``, ``author`` and ``text`` (the latter from ``txt``).
	    """

	    def __init__(self, path, date, author, txt):
	        self.path = path
	        self.date = date
	        self.author = author
	        # Note the rename: the ``txt`` argument is exposed as ``self.text``.
	        self.text = txt


	# Parsing of the .docx files.

	TEXTS_DIR = "texts/"
	texts = []


	def _iter_docx_paths(root):
	    """Yield paths of the .docx files found exactly two directory levels below *root*.

	    *root* must end with "/" because paths are built by plain concatenation.
	    "/" is used on purpose instead of ``os.path.join``: the CSV-writing code
	    later splits ``Txt.path`` on "/" to recover the directory components.
	    Directory listings are sorted so that the traversal order (and therefore
	    the output order) is reproducible; ``os.listdir`` itself gives no
	    ordering guarantee.
	    """
	    for group in sorted(os.listdir(root)):
	        group_path = root + group
	        if not os.path.isdir(group_path):
	            continue
	        for subgroup in sorted(os.listdir(group_path)):
	            subgroup_path = group_path + "/" + subgroup
	            if not os.path.isdir(subgroup_path):
	                continue
	            for filename in sorted(os.listdir(subgroup_path)):
	                # Names starting with "~" are skipped
	                # (presumably Word lock/temporary files -- TODO confirm).
	                if filename.endswith(".docx") and not filename.startswith("~"):
	                    yield subgroup_path + "/" + filename


	for filepath in _iter_docx_paths(TEXTS_DIR):
	    paragraphs = [para.text for para in Document(filepath).paragraphs]

	    # The first paragraph is stored as the date and the second as the author.
	    # Pad with empty strings so that a document with fewer than two paragraphs
	    # does not abort the whole run with an IndexError.
	    # NOTE(review): earlier commented-out code applied `.split(": ")[1]` to
	    # both values, which suggests they look like "label: value" -- confirm
	    # whether the label should be stripped.
	    date, author = (paragraphs[:2] + ["", ""])[:2]

	    texts.append(Txt(filepath, date, author, "\n".join(paragraphs[2:])))

	# Named-entity extraction.

	# TODO: some tuning may be needed here
	combinator = Combinator([Person, ProbabilisticPerson])

	persons_engine = InterpretationEngine(PersonObject)


	def extract(text):
	    """Extract person names from *text*.

	    The text is run through the module-level ``combinator``; its matches are
	    resolved with ``strict=False`` and passed to ``persons_engine``.

	    Returns a list of dicts with the keys "firstname", "middlename" and
	    "lastname", filled from the ``normalized_*`` attributes of every
	    extracted person object.
	    """
	    matches = combinator.resolve_matches(combinator.extract(text), strict=False)

	    # Convert the parser output into a more readable form:
	    # (rule, [list, of, original, matches])
	    matches = [(grammar, list(tokens)) for (grammar, tokens) in matches]

	    persons = list(persons_engine.extract(matches))

	    return [{"firstname": p.normalized_firstname,
	             "middlename": p.normalized_middlename,
	             "lastname": p.normalized_lastname} for p in persons]


	def empty_if_none(s):
	    """Return *s* unchanged when it is truthy, otherwise an empty string.

	    Despite the name, every falsy value (None, "", 0, ...) maps to "".
	    """
	    if s:
	        return s
	    return ""


	# Write one CSV row per extracted person.
	# newline='' is what the csv module requires for files handed to csv.writer
	# (otherwise extra blank lines appear on some platforms); the explicit
	# encoding keeps Cyrillic names independent of the locale default.
	# NOTE(review): this assumes Python 3 -- confirm before running on Python 2.
	with open('museums_ner2.csv', 'w', newline='', encoding='utf-8') as csvfile:
	    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
	    # Every data row carries three path components followed by five fields,
	    # so the header needs eight columns (it previously had six and did not
	    # line up with the data).
	    writer.writerow([
	        "dir", "subdir", "file",
	        "date", "author",
	        "firstname", "middlename", "lastname",
	    ])

	    for txt in texts:
	        # "texts/<dir>/<subdir>/<file>" -> [<dir>, <subdir>, <file>]
	        location = txt.path.split("/")[1:]
	        for person in extract(txt.text):
	            writer.writerow(
	                location +
	                [txt.date,
	                 txt.author,
	                 empty_if_none(person["firstname"]),
	                 empty_if_none(person["middlename"]),
	                 empty_if_none(person["lastname"])])