python preprocessing
import os
import re
import sys

import spacy
from phonenumbers import PhoneNumberMatcher
from spellchecker import SpellChecker
def remove_control_chart(string):
    """Remove literal control-character sequences such as "\\xc2" or "\\xa0".

    Arguments:
        string {str} -- string to process

    Returns:
        str -- string with the control-character sequences removed
    """
    # matches a literal backslash-x escape followed by two characters, e.g. "\xc2"
    return re.sub(r"\\x..", "", string)
def clean_special_charac(string):
    """Clean special characters. To keep the critical pieces of information
    separable, spans are uniformly delimited with "~".

    Arguments:
        string {str} -- string to process

    Returns:
        str -- string after cleaning the special characters
    """
    # string = re.sub(r"[^A-Za-z0-9(),!?’`]", "", string)  # remove everything except A-Za-z0-9(),!?’`
    string = re.sub(
        r"(.)\1+", r"\1\1", string
    )  # collapse runs of the same character to at most two occurrences
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    # string = re.sub(r",", " ~ ", string)
    # string = re.sub(r",", " ", string)
    # string = re.sub(r"!", " ! ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " , ", string)
    # string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " , ", string)
    # string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " , ", string)
    # string = re.sub(r"\?", " ", string)
    # string = re.sub(r"\:", " : ", string)
    string = re.sub(r"\:", " ", string)
    string = re.sub(r"\/", " ", string)
    string = re.sub(r"\\", " ", string)
    string = re.sub(r"\"", " ", string)
    string = re.sub(r"\.", " , ", string)
    string = re.sub(r"\-", " ~ ", string)  # "~" marks the boundary of a span
    string = re.sub(r"–", " ~ ", string)
    string = re.sub(r"\n", " ", string)
    string = re.sub(r"\t", " ", string)
    string = re.sub("à", "~", string)  # for date computation, "à" also marks a span boundary
    string = re.sub("\xa0", u" ", string)
    string = re.sub("\u25cf", u" ", string)
    string = re.sub(r"\s{2,}", " ", string)  # collapse two or more consecutive whitespace characters into one space
    return string.lower()
def remove_stop_word(
    string, plus_stops=None, not_stops={"au", "à", "aujourd'hui", "ce jour", "actuellement"}, lang="fr"
):
    """Remove the stop words from the string.

    Arguments:
        string {str} -- string to process

    Keyword Arguments:
        plus_stops {set} -- additional stop words (default: None)
        not_stops {set} -- words to keep even if spaCy treats them as stop words (default: see signature)
        lang {str} -- language / spaCy model to load (default: {"fr"})

    Returns:
        str -- text without stop words
    """
    nlp = spacy.load(lang)
    if plus_stops is not None:
        nlp.Defaults.stop_words |= plus_stops
    if not_stops is not None:
        nlp.Defaults.stop_words -= not_stops
    doc = nlp(string)
    tokens = [token.text for token in doc if token.text not in nlp.Defaults.stop_words]
    return " ".join(tokens)
def check_spell(string, lang="fr"): | |
"""spell checker for the text list | |
Arguments: | |
text_list {list} -- list contains the string | |
Keyword Arguments: | |
lang {str} -- language (default: {"fr"}) | |
Returns: | |
string -- string after spell checking | |
""" | |
tokens = string.split(" ") | |
spell = SpellChecker(language=lang, distance=1) | |
text = spell.unknown(tokens) | |
for word in text: | |
tokens[tokens.index(word)] = spell.correction(word) | |
return " ".join(tokens) | |
def lemmatize(string, lang="fr"): | |
"""lemmatization of a string | |
Arguments: | |
string {str} -- string to precess | |
Keyword Arguments: | |
lang {str} -- language (default: {"fr"}) | |
Returns: | |
str -- string after being processed | |
""" | |
nlp = spacy.load(lang) | |
doc = nlp(string) | |
text = " ".join([token.lemma_ for token in doc]) | |
# tokens = [token.lemma_ for token in doc] | |
return text | |
def remove_tel(string):
    """Remove phone numbers from the text.

    Arguments:
        string {str} -- text

    Returns:
        str -- text after removing the phone numbers
    """
    # use plain string replacement: raw phone numbers may contain characters
    # such as "+" or "(" that are regex metacharacters
    for match in PhoneNumberMatcher(string, "FR"):
        string = string.replace(match.raw_string, "")
    return string
def remove_email(string):
    """Remove email addresses from the text.

    Arguments:
        string {str} -- text

    Returns:
        str -- text after removing the email addresses
    """
    reg = r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)"
    # substitute on the pattern itself, so every address is removed and no
    # metacharacters from the matched text leak into re.sub
    return re.sub(reg, "", string)
def preprocess(string):
    """Combine all the steps of text preprocessing.

    Arguments:
        string {str} -- text to process

    Returns:
        str -- processed text
    """
    text = remove_control_chart(string)
    # text = lemmatize(text)
    text = clean_special_charac(text)
    # text = check_spell(text)
    text = remove_stop_word(text)
    text = remove_tel(text)
    text = remove_email(text)
    text = lemmatize(text)
    return text
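A minimal usage sketch, assuming a French spaCy model is installed and loadable as "fr" (spaCy 2.x style) together with the phonenumbers and pyspellchecker packages; the sample string is purely illustrative.

if __name__ == "__main__":
    # hypothetical sample text, for illustration only
    sample = "Développeur Python chez ACME à Paris.\nContact : exemple@mail.fr"
    print(preprocess(sample))  # prints the cleaned, stop-word-filtered, lemmatized text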