A Python class to match elements (from a terminology) in a text, using the spaCy module.
# coding: utf-8
import warnings
import importlib

import pandas as pd
import numpy as np
from tqdm import tqdm
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
def match_sequence(seq, dataset):
    """
    Return the start and end positions of every occurrence of a sequence in a dataset.

    Parameters
    ----------
    seq : list
        sequence to search for
    dataset : list
        token sequence to search in

    Returns
    -------
    list
        list of [sequence, start, end] entries (end is exclusive)
    """
    N = len(seq)
    if N < 1:
        raise ValueError("Sequence is empty !")
    if isinstance(dataset, list):
        dataset = np.asarray(dataset)
    if isinstance(seq, list):
        seq = np.asarray(seq, dtype=dataset.dtype)
    # Candidate positions: every index where the first token of `seq` appears.
    prefix_ind = np.where(dataset == seq[0])[0]
    results = []
    for idx in prefix_ind:
        start, end = idx, idx + N
        if dataset[start:end].tolist() == seq.tolist():
            results.append([seq, start, end])
    return results
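
# Illustration (not part of the original gist): each hit is [sequence, start, end],
# with `end` exclusive, e.g.
#   [(start, end) for _, start, end in match_sequence(["green", "tea"], ["I", "drink", "green", "tea"])]
#   -> [(2, 4)]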
def match_sequences(seqs, dataset):
    """
    Return the start and end positions of every occurrence of a list of sequences in a dataset.

    Parameters
    ----------
    seqs : list
        sequences to search for
    dataset : list
        token sequence to search in

    Returns
    -------
    list
        list of [sequence index, start, end] entries (end is exclusive)
    """
    if len(seqs) < 1:
        warnings.warn("Sequence list is empty")
        return []
    if isinstance(dataset, list):
        dataset = np.asarray(dataset)
    # Index the candidate sequences by their first token. Several sequences can
    # share a first token, so each prefix maps to a *list* of candidates.
    prefixes_dict = {}
    for i, seq in enumerate(seqs):
        if len(seq) > 0:
            prefixes_dict.setdefault(seq[0], []).append((list(seq), i, len(seq)))
    prefix_ind = np.where(np.isin(dataset, list(prefixes_dict.keys())))[0]
    results = []
    for idx in prefix_ind:
        for seq, seq_index, length in prefixes_dict[dataset[idx]]:
            start, end = idx, idx + length
            if dataset[start:end].tolist() == seq:
                results.append([seq_index, start, end])
    return results
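
# Illustration (not part of the original gist): the result carries the *index*
# of the matched sequence in `seqs`, not the sequence itself, e.g.
#   match_sequences([["green", "tea"], ["black", "tea"]],
#                   ["I", "drink", "green", "tea"])
#   -> [[0, 2, 4]]   (sequence 0 matched tokens 2..4)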
def get_lemmatizer(lang):
    # Works with spaCy 2.x, where the lookup lemmatization table is exposed
    # as a plain dict in spacy.lang.<lang>.lemmatizer.LOOKUP.
    i = importlib.import_module("spacy.lang.{0}.lemmatizer".format(lang))
    return i.LOOKUP
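
# The lookup table is a plain {inflected form: lemma} dict; e.g., with the
# French table on spaCy 2.x one would expect get_lemmatizer("fr")["chevaux"]
# to return "cheval".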
class Matcher():
    """
    A class responsible for matching terms in a text.
    """

    def __init__(self, lang, use_lower=True, use_lemma=True, use_singular=True, use_plural=True):
        """
        Constructor

        Parameters
        ----------
        lang : str
            language
        use_lower : bool, optional
            match on the lowercase form (the default is True)
        use_lemma : bool, optional
            match on the lemma (the default is True)
        use_singular : bool, optional
            match on the singular form (the default is True)
        use_plural : bool, optional
            match on the plural form (the default is True)
        """
        if lang == "fr":
            self.blobber = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
        else:
            self.blobber = Blobber()
        # Maps the internal term index to the caller-supplied identifier.
        self.index_to_id = {}
        self.basic_form = []
        self.term_lower_form = []
        self.term_singular_form = []
        self.term_plural_form = []
        self.term_lemma_form = []
        self.use_lemma, self.use_lower, self.use_singular, self.use_plural = use_lemma, use_lower, use_singular, use_plural
        self.lemmatizer = get_lemmatizer(lang)
        self.N = 0
    def add(self, id_, func, basic_form, lower_form=[], lemma_form=[], singular_form=[], plural_form=[]):
        """
        Add a term and its variants.

        Parameters
        ----------
        id_ : str or int
            identifier
        func : func
            unused callback placeholder (NotImplemented)
        basic_form : list of str
            initial form
        lower_form : list, optional
            lowercase form (the default is [])
        lemma_form : list, optional
            lemma form (the default is [])
        singular_form : list, optional
            singular form (the default is [])
        plural_form : list, optional
            plural form (the default is [])
        """
        self.index_to_id[self.N] = id_
        self.basic_form.append(basic_form)
        self.term_lower_form.append(lower_form)
        self.term_singular_form.append(singular_form)
        self.term_plural_form.append(plural_form)
        self.term_lemma_form.append(lemma_form)
        self.N += 1
    def match(self, text):
        """
        Execute the matching.

        Parameters
        ----------
        text : str
            input text

        Returns
        -------
        list
            list of matches (id, start, end)
        """
        results = []
        doc = self.blobber(text)
        tokens = list(doc.tokenize())
        tokens_lower = list(doc.lower().tokenize())
        # Lemmatize from the lowercase tokens; fall back to the token itself
        # when it is missing from the lookup table.
        tokens_lemma = [self.lemmatizer.get(token, token) for token in tokens_lower]
        results.extend(match_sequences(self.basic_form, tokens))
        if self.use_lemma:
            results.extend(match_sequences(self.term_lemma_form, tokens_lemma))
        if self.use_lower:
            results.extend(match_sequences(self.term_lower_form, tokens_lower))
        if self.use_singular:
            results.extend(match_sequences(self.term_singular_form, tokens_lower))
        if self.use_plural:
            results.extend(match_sequences(self.term_plural_form, tokens_lower))
        return self.parse_results(results)
    def parse_results(self, results):
        """
        Parse raw match results.

        Parameters
        ----------
        results : list
            raw match results

        Returns
        -------
        list
            reformatted matches, as [id, start, end]
        """
        return [[self.index_to_id[res[0]], res[1], res[2]] for res in results]

    def __call__(self, text):
        return self.match(text)
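
# Minimal usage sketch for Matcher (illustrative, not part of the original gist):
#
#   m = Matcher("fr")
#   m.add("T1", None, ["équipe"], lower_form=["équipe"], plural_form=["équipes"])
#   m("Plusieurs équipes travaillent ici")
#   -> [["T1", 1, 2]]   (exact positions depend on TextBlob's tokenization)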
class TerminologyMatcher:
    """
    A Matcher used to detect words from a terminology in a text. It relies on the Matcher class
    defined above, which mimics the **spacy** Matcher interface.
    The terminology can be given as a simple 1D array (`list`, `numpy.ndarray`), a Python `dict`
    or a `pandas.DataFrame`.

    Usage:
    >>> terminology = ["Agroforesterie", "équipe"]
    >>> matcher = TerminologyMatcher(terminology)
    >>> matcher("Cet homme travaille dans le domaine de l'agroforesterie. Plusieurs équipes du CIRAD travaillent dans ce domaine.")
    [[0, 8, 9], [1, 11, 12]]
    """
    def __init__(self, terminology_data, lang="fr", column_id="id", column_label="label", column_alt_label="alt_labels"):
        """
        TerminologyMatcher constructor

        Parameters
        ----------
        terminology_data : terminology container
            iterable object
        lang : str, optional
            language of the terminology (the default is "fr", which is French)
        column_id : str, optional
            name of the column that contains the term id (only for pandas.DataFrame input) (the default is "id")
        column_label : str, optional
            name of the column that contains the preferred label (the default is "label")
        column_alt_label : str, optional
            name of the column that contains the alternate labels (must be iterable) (the default is "alt_labels")

        Raises
        ------
        ValueError
            if the terminology variable is not iterable
        """
        self.terminology_data = terminology_data
        if not hasattr(terminology_data, '__iter__'):  # Checking the vocabulary
            raise ValueError("The 'terminology_data' arg must be an iterable!")
        self.is_dict = isinstance(terminology_data, dict)
        self.is_panda = isinstance(terminology_data, pd.DataFrame)
        # Only meaningful for DataFrame input: does the id column exist?
        self.column_id_in = self.is_panda and column_id in terminology_data
        self.column_id = column_id
        self.column_label = column_label
        self.lang = lang
        self.lemmatizer = get_lemmatizer(self.lang)  # cached once to speed up pattern generation
        self.matcher = Matcher(lang)
        self.inflector = None
        try:
            from inflector import French, English
            if self.lang == "fr":
                self.inflector = French()
            elif self.lang == "en":
                self.inflector = English()
        except ImportError:
            raise ImportError("You must install the module `inflector` from https://github.com/Jacobe2169/Python-Inflector")
        if self.is_panda:
            for index, row in tqdm(self.terminology_data.iterrows(), desc="Composing the thematic matcher..."):
                term_id = row[column_id] if self.column_id_in else index
                self.matcher.add(
                    term_id,
                    None,
                    *self.generate_input_matcher(row[column_label])
                )
                # Register each alternate label under the same identifier.
                for label in row[column_alt_label]:
                    self.matcher.add(
                        term_id,
                        None,
                        *self.generate_input_matcher(label)
                    )
        elif self.is_dict:
            for key, value in tqdm(self.terminology_data.items(), desc="Composing the thematic matcher..."):
                try:
                    self.matcher.add(
                        key,
                        None,
                        *self.generate_input_matcher(value)
                    )
                except Exception:
                    pass  # skip entries the inflector/lemmatizer cannot handle
        else:
            for i, word in tqdm(enumerate(self.terminology_data), desc="Composing the thematic matcher..."):
                try:
                    self.matcher.add(
                        i,
                        None,
                        *self.generate_input_matcher(word)
                    )
                except Exception:
                    pass  # skip entries the inflector/lemmatizer cannot handle
    def __call__(self, doc):
        """
        Override __call__ so the object can be used like a spacy.matcher.Matcher.

        Parameters
        ----------
        doc : str
            text

        Returns
        -------
        list
            list of match positions found in the text
        """
        return self.matcher.match(doc)
    def generate_input_matcher(self, doc):
        """
        Generate the patterns that enable the identification of a term (and its variations) by the matcher.

        Parameters
        ----------
        doc : str
            term label

        Returns
        -------
        tuple
            (basic, lower, lemma, singular, plural) token-list patterns, in the order expected by Matcher.add
        """
        doc = doc.split()
        basic_form = doc
        lemma_ = [self.lemmatizer.get(token.lower(), token.lower()) for token in doc]
        lower_ = [token.lower() for token in doc]
        singular_ = [self.inflector.singularize(token.lower()) for token in doc]
        plural_ = [self.inflector.pluralize(token.lower()) for token in doc]
        return (basic_form, lower_, lemma_, singular_, plural_)
    def get_word(self, key):
        """
        Return the string representation for an id taken from the matcher results.

        Parameters
        ----------
        key : int or str
            id

        Returns
        -------
        str
            string representation for the id
        """
        if self.is_panda:
            if self.column_id_in:
                rows = self.terminology_data[self.terminology_data[self.column_id] == key]
                return rows.iloc[0][self.column_label]
            return self.terminology_data.iloc[key][self.column_label]
        return self.terminology_data[key]
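

if __name__ == "__main__":
    # Small smoke test (an illustrative sketch, not part of the original gist).
    # It assumes textblob, textblob-fr, spaCy 2.x with its French lookup data,
    # and the `inflector` fork linked above are installed.
    terminology = ["Agroforesterie", "équipe"]
    matcher = TerminologyMatcher(terminology, lang="fr")
    text = ("Cet homme travaille dans le domaine de l'agroforesterie. "
            "Plusieurs équipes du CIRAD travaillent dans ce domaine.")
    for term_id, start, end in matcher(text):
        print(term_id, matcher.get_word(term_id), (start, end))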