Created
January 30, 2023 13:54
-
-
Save kingjr/700eb43eabc2a020ffb7c76c379ec741 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install benepar
# pip install protobuf==3.20.0
import numpy as np
import spacy
class Parser():
    """Base class wrapping a spaCy pipeline that handles one sentence at a time.

    Parameters
    ----------
    model : str
        Name of the spaCy model package to load. It is downloaded with
        ``spacy.cli.download`` first when ``spacy.util.is_package`` reports
        that it is not installed. Defaults to ``'fr_core_news_sm'``.
    """

    def __init__(self, model='fr_core_news_sm'):
        if not spacy.util.is_package(model):
            spacy.cli.download(model)
        self.nlp = spacy.load(model)

    def process(self, sentence):
        """Run the pipeline on `sentence` and return its single sentence span.

        Parameters
        ----------
        sentence : str
            Text that the pipeline must segment into exactly one sentence.

        Returns
        -------
        The only element of ``doc.sents`` for the processed text.

        Raises
        ------
        AssertionError
            If the pipeline finds zero or several sentences.
        """
        # Materialise the sentence iterator once instead of once per use.
        sents = list(self.nlp(sentence).sents)
        if len(sents) != 1:
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped under `python -O`; the exception type is kept so
            # that existing callers catching AssertionError still work.
            raise AssertionError(
                f"expected exactly one sentence, found {len(sents)}"
            )
        return sents[0]
class DependencyParser(Parser):
    """Count, word by word, how many dependency subtrees are complete."""

    def parse(self, sentence):
        """Pair every token of `sentence` with a per-step closing count.

        For each step ``1 .. len(tokens)`` the number of tokens for which
        `_is_closed` holds at that step is counted. The value paired with
        token ``k`` is the difference between the counts of two consecutive
        steps (``counts[k + 1] - counts[k]``); the value paired with the
        last token is the count of the final step itself.

        Returns
        -------
        list of (token, count) tuples, one per token of the sentence.
        """
        tokens = self.process(sentence)
        counts = [
            sum(self._is_closed(token, step) for token in tokens)
            for step in range(1, len(tokens) + 1)
        ]
        closing = np.r_[np.diff(counts), counts[-1]]
        return list(zip(tokens, closing))

    def _is_closed(self, node, position):
        """Tell whether `node` and all its descendants lie at or before `position`.

        Indices are compared through the ``.i`` attribute of each node;
        descendants are reached recursively through ``.children``.
        """
        if node.i > position:
            return False
        return all(
            child.i <= position and self._is_closed(child, position)
            for child in node.children
        )
class ConstituentParser(Parser):
    """Parser that adds the benepar constituency component to the pipeline."""

    def __init__(self):
        import os
        # protobuf only honours this variable if it is set before protobuf
        # is first imported, so set it before importing benepar.
        os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
        import benepar

        super().__init__()
        ben_model = 'benepar_fr2'
        benepar.download(ben_model)
        if spacy.__version__.startswith('2'):
            # spaCy v2 expects the component object itself.
            self.nlp.add_pipe(benepar.BeneparComponent(ben_model))
        else:
            # spaCy v3 rejects component objects and expects the name of the
            # factory that benepar registers, plus its configuration.
            self.nlp.add_pipe('benepar', config={'model': ben_model})

    def parse(self, sentence):
        """Return each word of `sentence` with its number of closing brackets.

        The bracketed parse string of the sentence (``sent._.parse_string``)
        is cut at every ``'('``. A piece holding at least two
        whitespace-separated fields is read as ``label word)...``: the word
        is the second field with its surrounding ``')'`` stripped, and the
        count is the number of ``')'`` characters in that field.

        Returns
        -------
        list of (word, n_closing) tuples, in parse-string order.
        """
        sent = self.process(sentence)
        labels = []
        for chunk in sent._.parse_string.split('('):
            fields = chunk.split()
            # Pieces with fewer than two fields carry no word and are skipped.
            if len(fields) > 1:
                leaf = fields[1]
                labels.append((leaf.strip(')'), leaf.count(')')))
        return labels
# Demo: run both parsers on one French sentence and print their output.
sentence = "les petits chats de Mamie suivent la souris verte."

# The constituency parser is built first, then the dependency parser.
const_parser = ConstituentParser()
dep_parser = DependencyParser()

constituent_labels = const_parser.parse(sentence)
print(constituent_labels)
dependency_labels = dep_parser.parse(sentence)
print(dependency_labels)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment