Created
June 12, 2018 12:44
-
-
Save piyushrj/29f35793949a44e17ca0f9a08b98226c to your computer and use it in GitHub Desktop.
Adding custom Matcher rules to identify date patterns, in addition to the dates already identified by spaCy's NER.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy, re, dateparser | |
from spacy.matcher import Matcher | |
from spacy.tokenizer import Tokenizer | |
from spacy import displacy | |
def custom_tokenizer(nlp, infix_reg):
    """
    Build a spaCy tokenizer that splits tokens on a custom infix regex.

    Only the infix rule is supplied to the Tokenizer; no other tokenizer
    arguments are passed.

    PARAMETERS
    ----------
    nlp : Language
        A spaCy language object with a loaded model; its vocab is shared
        with the new tokenizer
    infix_reg : regular expression object
        The compiled infix regular expression that decides where tokens are
        split internally

    RETURNS
    -------
    Tokenizer : Tokenizer object
        The spaCy tokenizer built from the infix regex.
    """
    find_infixes = infix_reg.finditer
    shared_vocab = nlp.vocab
    return Tokenizer(shared_vocab, infix_finditer=find_infixes)
def is_valid_date(matcher, doc, i, matches):
    """
    on_match callback that reports whether a matched span is an actual date.

    PARAMETERS
    ----------
    matcher : Matcher
        The Matcher instance (not used by this callback)
    doc : Doc
        The document the matcher was used on
    i : int
        Index of the current match
    matches : list
        A list of (match_id, start, end) tuples describing the matches. A
        matched tuple describes the span doc[start:end]

    RETURNS
    -------
    The function doesn't return a value. When dateparser can parse the
    matched text, it prints that text followed by 'valid'; otherwise it
    prints nothing.
    """
    _, start, end = matches[i]
    candidate = doc[start:end].text
    if dateparser.parse(candidate):
        # One pre-formatted argument prints the same line whether `print` is
        # the Python 2 statement or the Python 3 function.
        print('%s valid' % candidate)
def add_date_ent(matcher, doc, i, matches):
    """
    on_match callback that labels a valid matched date as a DATE entity.
    For reference see https://spacy.io/usage/linguistic-features#on_match

    PARAMETERS
    ----------
    matcher : Matcher
        The Matcher instance (not used by this callback)
    doc : Doc
        The document the matcher was used on
    i : int
        Index of the current match
    matches : list
        A list of (match_id, start, end) tuples describing the matches. A
        matched tuple describes the span doc[start:end]

    RETURNS
    -------
    The function doesn't return a value. It always prints the matched text
    followed by 'Suspect'. If dateparser can parse the text, the span is
    added to doc.ents as a DATE entity and 'VALID' is printed; otherwise
    'INVALID' is printed and doc.ents is left untouched.
    """
    _, start, end = matches[i]
    match_str = doc[start:end].text
    # Single pre-formatted argument so the output is identical under the
    # Python 2 print statement and the Python 3 print function.
    print('%s Suspect' % match_str)

    if not dateparser.parse(match_str):
        print('%s INVALID' % match_str)
        return

    # Resolve the label from the document's own vocab instead of relying on a
    # module-level DATE global, which only exists when the file is run as a
    # script.
    date_label = doc.vocab.strings['DATE']

    # Keep only the existing entities that do not share a token with the new
    # span, so the validated date replaces any partial entity over the same
    # tokens.
    # NOTE(review): spaCy documents that a token can belong to only one
    # entity and may reject overlapping spans - confirm for the installed
    # version.
    non_overlapping = [
        ent for ent in doc.ents
        if ent.end <= start or ent.start >= end
    ]
    doc.ents = non_overlapping + [(date_label, start, end)]
    print('%s VALID' % match_str)
def add_regex_flag(vocab, pattern_str):
    """
    Register a regex-based boolean flag for use in token patterns.

    PARAMETERS
    ----------
    vocab : Vocab
        The nlp model's vocabulary on which the flag is registered through
        its add_flag method
    pattern_str : String
        The regular expression pattern to create the flag for. The flag
        function is the compiled pattern's `match` method, so the pattern is
        applied from the start of the text it is given.

    RETURNS
    -------
    flag_id : int
        The ID returned by vocab.add_flag, by which the flag value can be
        checked.
    """
    compiled = re.compile(pattern_str)
    flag_getter = compiled.match
    return vocab.add_flag(flag_getter)
if __name__ == '__main__':
    # Treat '-', '/' and ',' as infixes so that the separators inside a date
    # become tokens of their own, which the patterns below rely on.
    infix_re = re.compile(r'[-/,]')
    nlp = spacy.load('en')
    nlp.tokenizer = custom_tokenizer(nlp, infix_re)

    # ID of the 'DATE' string in the model's string store.
    DATE = nlp.vocab.strings['DATE']

    # Flag for ordinal day tokens: 1st, 22nd, 3rd, 15th etc.
    # Raw string so '\d' is a regex escape rather than a string escape. The
    # suffix is limited to the real ordinal endings and anchored with '$', so
    # tokens such as '1ss' or '12stuff' are no longer flagged.
    IS_REGEX_MATCH = add_regex_flag(nlp.vocab, r'\d{1,2}(?:st|nd|rd|th)$')

    # MM/DD/YYYY and YYYY/MM/DD
    pattern_1 = [{'IS_DIGIT': True}, {'ORTH': '/'}, {'IS_DIGIT': True},
                 {'ORTH': '/'}, {'IS_DIGIT': True}]
    # MM-DD-YYYY and YYYY-MM-DD
    pattern_2 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True},
                 {'ORTH': '-'}, {'IS_DIGIT': True}]
    # dates of the form 10-Aug-2018
    pattern_3 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_ALPHA': True},
                 {'ORTH': '-'}, {'IS_DIGIT': True}]
    # dates of the form Aug-10-2018
    pattern_4 = [{'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_DIGIT': True},
                 {'ORTH': '-'}, {'IS_DIGIT': True}]
    # dates of the form 10th August, 2018
    pattern_5 = [{IS_REGEX_MATCH: True}, {'IS_ALPHA': True},
                 {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]
    # dates of the form August 10th, 2018
    pattern_6 = [{'IS_ALPHA': True}, {IS_REGEX_MATCH: True},
                 {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]

    # Every pattern shares the same callback, which validates the matched
    # text and labels it as a DATE entity.
    matcher = Matcher(nlp.vocab)
    matcher.add('Type1', add_date_ent, pattern_1)
    matcher.add('Type2', add_date_ent, pattern_2)
    matcher.add('Type3', add_date_ent, pattern_3)
    matcher.add('Type4', add_date_ent, pattern_4)
    matcher.add('Type5', add_date_ent, pattern_5)
    matcher.add('Type6', add_date_ent, pattern_6)

    # Sample text mixing valid dates, an impossible date (39/02/2011) and an
    # ordinal-style date.
    doc = nlp(u'Today is 06/11/2018 yesterday was 10-Jun-2018 and tomorrow is '
              u'06-12-2018 and I will go home on 7-Jul-2018 but clearly not on '
              u'39/02/2011 and some dates are of the form 12th February,2017')

    # Running the matcher triggers add_date_ent for each match.
    matches = matcher(doc)
    # Uncomment to view the resulting entities in the browser:
    # displacy.serve(doc, style='ent')
Great, thank you!
I am getting an error on the line doc = nlp("....."):
"An integer is required"
It just doesn't work!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Can you please include dot-formatted dates, such as 10.09.2019?