piyushrj · June 12, 2018 12:44 · bgshri · Sep 24, 2019 · stelmath · Sep 23, 2020
diff --git a/date_identification.py b/date_identification.py
 import spacy, re, dateparser
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
 from spacy import displacy


 def custom_tokenizer(nlp, infix_reg):
 	"""
 	Build a Spacy tokenizer whose infix splitting is driven by a custom regex

 	PARAMETERS
 	----------
 	nlp : Language
 	A Spacy language object; only its ``vocab`` is used here.

 	infix_reg : regular expression object
 	Compiled pattern whose ``finditer`` method is handed to the tokenizer
 	as its infix finder.

 	RETURNS
 	-------
 	Tokenizer : Tokenizer object
 	A tokenizer constructed from the vocabulary and the infix finder only.

 	"""
 	find_infixes = infix_reg.finditer
 	return Tokenizer(nlp.vocab, infix_finditer=find_infixes)


 def is_valid_date(matcher, doc, i, matches):
 	"""
 	on_match function that reports a matched span when it parses as an actual date

 	PARAMETERS
 	----------
 	matcher : Matcher
 	The Matcher instance (not used by this callback)

 	doc : Doc
 	The document the matcher was used on

 	i : int
 	Index of the current match

 	matches : list
 	A list of (match_id, start, end) tuples, describing the matches. A matched
 	tuple describes the span doc[start:end]


 	RETURNS:
 	-------
 	The function doesn't return a value; it prints "<matched text> valid" when
 	dateparser can parse the matched text and prints nothing otherwise.
 	"""
 	match_id, start, end = matches[i]
 	match_str = doc[start:end].text
 	if dateparser.parse(match_str):
 		# A single pre-formatted argument prints the same line under both the
 		# Python 2 print statement and the Python 3 print function.
 		print('%s valid' % match_str)


 def add_date_ent(matcher, doc, i, matches):
 	"""
 	on_match function to name the valid date as a DATE entity
 	for reference see https://spacy.io/usage/linguistic-features#on_match


 	PARAMETERS
 	----------
 	matcher : Matcher
 	The Matcher instance (not used by this callback)

 	doc : Doc
 	The document the matcher was used on

 	i : int
 	Index of the current match

 	matches : list
 	A list of (match_id, start, end) tuples, describing the matches. A matched
 	tuple describes the span doc[start:end]


 	RETURNS:
 	-------
 	The function doesn't return a value. It prints the matched text tagged
 	'Suspect', then either appends a DATE entity to doc.ents and prints
 	'VALID' (dateparser could parse the text) or prints 'INVALID'.

 	"""
 	match_id, start, end = matches[i]
 	match_str = doc[start:end].text
 	# Single pre-formatted arguments keep the output identical under the
 	# Python 2 print statement and the Python 3 print function.
 	print('%s Suspect' % match_str)
 	if dateparser.parse(match_str):
 		# Resolve the label from the doc's own vocab instead of a module-level
 		# DATE global, which is only defined when the file runs as a script.
 		date_label = doc.vocab.strings['DATE']
 		doc.ents += ((date_label, start, end),)
 		print('%s VALID' % match_str)
 	else:
 		print('%s INVALID' % match_str)


 def add_regex_flag(vocab, pattern_str):
 	"""
 	Register a regex-backed boolean flag on a vocabulary for token pattern matching


 	Parameters
 	----------
 	vocab : Vocab
 	The vocabulary on which the flag is registered through ``add_flag``

 	pattern_str : String
 	The regular expression source; it is compiled and its ``match`` method
 	(anchored at the start of the string only) becomes the flag's test function

 	RETURNS
 	-------
 	flag_id : int
 	Whatever ``vocab.add_flag`` returns: the ID by which the flag value can be checked.

 	"""
 	matches_pattern = re.compile(pattern_str).match
 	return vocab.add_flag(matches_pattern)


 if __name__ == '__main__':
 	# Demo script: tokenize a sample sentence, match date-like token
 	# sequences and let add_date_ent validate/tag each match.
 	# Indentation is tabs only; the earlier mix of tabs and spaces is a
 	# TabError on Python 3.

 	# Treat '-', '/' and ',' as infixes so the separators become their own
 	# tokens, which the ORTH entries in the patterns below rely on.
 	infix_re = re.compile(r'''[-/,]''')
 	nlp = spacy.load('en')
 	nlp.tokenizer = custom_tokenizer(nlp, infix_re)

 	DATE = nlp.vocab.strings['DATE']

 	# for the token pattern 1st, 22nd, 15th etc
 	# (raw string: '\d' in a plain literal is an invalid escape sequence)
 	IS_REGEX_MATCH = add_regex_flag(nlp.vocab, r'\d{1,2}(?:[stndrh]){2}?')

 	# MM/DD/YYYY and YYYY/MM/DD
 	pattern_1 = [{'IS_DIGIT': True}, {'ORTH': '/'}, {'IS_DIGIT': True}, {'ORTH': '/'}, {'IS_DIGIT': True}]
 	# MM-DD-YYYY and YYYY-MM-DD
 	pattern_2 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
 	# dates of the form 10-Aug-2018
 	pattern_3 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
 	# dates of the form Aug-10-2018
 	pattern_4 = [{'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
 	# dates of the form 10th August, 2018
 	pattern_5 = [{IS_REGEX_MATCH: True}, {'IS_ALPHA': True}, {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]
 	# dates of the form August 10th, 2018
 	pattern_6 = [{'IS_ALPHA': True}, {IS_REGEX_MATCH: True}, {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]

 	matcher = Matcher(nlp.vocab)
 	matcher.add('Type1', add_date_ent, pattern_1)
 	matcher.add('Type2', add_date_ent, pattern_2)
 	matcher.add('Type3', add_date_ent, pattern_3)
 	matcher.add('Type4', add_date_ent, pattern_4)
 	matcher.add('Type5', add_date_ent, pattern_5)
 	matcher.add('Type6', add_date_ent, pattern_6)

 	doc = nlp(u'Today is 06/11/2018 yesterday was 10-Jun-2018 and tomorrow is 06-12-2018 and I will go home on 7-Jul-2018 but clearly not on 39/02/2011 and some dates are of the form 12th February,2017')
 	# Running the matcher fires add_date_ent once per match.
 	matches = matcher(doc)
 	# displacy.serve(doc, style='ent')
	import spacy, re, dateparser
	from spacy.matcher import Matcher
	from spacy.tokenizer import Tokenizer
	from spacy import displacy


	def custom_tokenizer(nlp, infix_reg):
		"""
		Function to return a customized tokenizer based on the infix regex

		PARAMETERS
		----------
		nlp : Language
		A Spacy language object with loaded model; only its ``vocab`` is used.

		infix_reg : regular expression object
		The infix regular expression object based on which the tokenization is to be
		carried out; its ``finditer`` method is passed to the tokenizer.

		RETURNS
		-------
		Tokenizer : Tokenizer object
		The Spacy tokenizer obtained based on the infix regex.

		"""
		return Tokenizer(nlp.vocab, infix_finditer=infix_reg.finditer)


	def is_valid_date(matcher, doc, i, matches):
		"""
		on_match function that reports a matched span when it parses as an actual date

		PARAMETERS
		----------
		matcher : Matcher
		The Matcher instance (not used by this callback)

		doc : Doc
		The document the matcher was used on

		i : int
		Index of the current match

		matches : list
		A list of (match_id, start, end) tuples, describing the matches. A matched
		tuple describes the span doc[start:end]


		RETURNS:
		-------
		The function doesn't return a value; it prints "<matched text> valid" when
		dateparser can parse the matched text and prints nothing otherwise.
		"""
		match_id, start, end = matches[i]
		match_str = doc[start:end].text
		if dateparser.parse(match_str):
			# A single pre-formatted argument prints the same line under both
			# the Python 2 print statement and the Python 3 print function.
			print('%s valid' % match_str)


	def add_date_ent(matcher, doc, i, matches):
		"""
		on_match function to name the valid date as a DATE entity
		for reference see https://spacy.io/usage/linguistic-features#on_match


		PARAMETERS
		----------
		matcher : Matcher
		The Matcher instance (not used by this callback)

		doc : Doc
		The document the matcher was used on

		i : int
		Index of the current match

		matches : list
		A list of (match_id, start, end) tuples, describing the matches. A matched
		tuple describes the span doc[start:end]


		RETURNS:
		-------
		The function doesn't return a value. It prints the matched text tagged
		'Suspect', then either appends a DATE entity to doc.ents and prints
		'VALID' (dateparser could parse the text) or prints 'INVALID'.

		"""
		match_id, start, end = matches[i]
		match_str = doc[start:end].text
		# Single pre-formatted arguments keep the output identical under the
		# Python 2 print statement and the Python 3 print function.
		print('%s Suspect' % match_str)
		if dateparser.parse(match_str):
			# Resolve the label from the doc's own vocab instead of a
			# module-level DATE global, which is only defined when the file
			# runs as a script.
			date_label = doc.vocab.strings['DATE']
			doc.ents += ((date_label, start, end),)
			print('%s VALID' % match_str)
		else:
			print('%s INVALID' % match_str)


	def add_regex_flag(vocab, pattern_str):
		"""
		Function to create a custom regex based flag for token pattern matching


		Parameters
		----------
		vocab : Vocab
		The nlp model's vocabulary; the flag is registered on it through ``add_flag``

		pattern_str : String
		The string regular expression pattern we want to create the flag for. It is
		compiled and its ``match`` method (anchored at the start of the string only)
		is used as the flag's test function.

		RETURNS
		-------
		flag_id : int
		The integer ID by which the flag value can be checked, as returned by
		``vocab.add_flag``.

		"""
		flag_id = vocab.add_flag(re.compile(pattern_str).match)
		return flag_id


	if __name__ == '__main__':
		# Demo script: tokenize a sample sentence, match date-like token
		# sequences and let add_date_ent validate/tag each match.

		# Treat '-', '/' and ',' as infixes so the separators become their own
		# tokens, which the ORTH entries in the patterns below rely on.
		infix_re = re.compile(r'''[-/,]''')
		nlp = spacy.load('en')
		nlp.tokenizer = custom_tokenizer(nlp, infix_re)

		DATE = nlp.vocab.strings['DATE']

		# for the token pattern 1st, 22nd, 15th etc
		# (raw string: '\d' in a plain literal is an invalid escape sequence)
		IS_REGEX_MATCH = add_regex_flag(nlp.vocab, r'\d{1,2}(?:[stndrh]){2}?')

		# MM/DD/YYYY and YYYY/MM/DD
		pattern_1 = [{'IS_DIGIT': True}, {'ORTH': '/'}, {'IS_DIGIT': True}, {'ORTH': '/'}, {'IS_DIGIT': True}]
		# MM-DD-YYYY and YYYY-MM-DD
		pattern_2 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
		# dates of the form 10-Aug-2018
		pattern_3 = [{'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
		# dates of the form Aug-10-2018
		pattern_4 = [{'IS_ALPHA': True}, {'ORTH': '-'}, {'IS_DIGIT': True}, {'ORTH': '-'}, {'IS_DIGIT': True}]
		# dates of the form 10th August, 2018
		pattern_5 = [{IS_REGEX_MATCH: True}, {'IS_ALPHA': True}, {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]
		# dates of the form August 10th, 2018
		pattern_6 = [{'IS_ALPHA': True}, {IS_REGEX_MATCH: True}, {'ORTH': ',', 'OP': '?'}, {'IS_DIGIT': True}]

		matcher = Matcher(nlp.vocab)
		matcher.add('Type1', add_date_ent, pattern_1)
		matcher.add('Type2', add_date_ent, pattern_2)
		matcher.add('Type3', add_date_ent, pattern_3)
		matcher.add('Type4', add_date_ent, pattern_4)
		matcher.add('Type5', add_date_ent, pattern_5)
		matcher.add('Type6', add_date_ent, pattern_6)

		doc = nlp(u'Today is 06/11/2018 yesterday was 10-Jun-2018 and tomorrow is 06-12-2018 and I will go home on 7-Jul-2018 but clearly not on 39/02/2011 and some dates are of the form 12th February,2017')
		# Running the matcher fires add_date_ent once per match.
		matches = matcher(doc)
		# displacy.serve(doc, style='ent')