python preprocessing
import os
import re
import sys

import spacy
from phonenumbers import PhoneNumberMatcher
from spellchecker import SpellChecker
def remove_control_chart(string):
    """Remove literal control-character sequences such as "\\xc2" or "\\xa0".

    Arguments:
        string {str} -- string to process

    Returns:
        str -- string with the control-character sequences removed
    """
    # matches a literal backslash-x escape followed by two characters, e.g. "\xc2"
    return re.sub(r"\\x..", "", string)
def clean_special_charac(string):
    """Clean special characters. To keep the critical pieces of information
    separable, spans are uniformly delimited with "~".

    Arguments:
        string {str} -- string to process

    Returns:
        str -- string after cleaning the special characters
    """
    # string = re.sub(r"[^A-Za-z0-9(),!?’`]", "", string)  # remove everything except A-Za-z0-9(),!?’`
    string = re.sub(
        r"(.)\1+", r"\1\1", string
    )  # collapse runs of the same character to at most two occurrences
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    # string = re.sub(r",", " ~ ", string)
    # string = re.sub(r",", " ", string)
    # string = re.sub(r"!", " ! ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " , ", string)
    # string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " , ", string)
    # string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " , ", string)
    # string = re.sub(r"\?", " ", string)
    # string = re.sub(r"\:", " : ", string)
    string = re.sub(r"\:", " ", string)
    string = re.sub(r"\/", " ", string)
    string = re.sub(r"\\", " ", string)
    string = re.sub(r"\"", " ", string)
    string = re.sub(r"\.", " , ", string)
    string = re.sub(r"\-", " ~ ", string)  # "~" marks the boundary of a span
    string = re.sub(r"–", " ~ ", string)
    string = re.sub(r"\n", " ", string)
    string = re.sub(r"\t", " ", string)
    string = re.sub("à", "~", string)  # for date computation, "à" also marks a span boundary
    string = re.sub("\xa0", u" ", string)
    string = re.sub("\u25cf", u" ", string)
    string = re.sub(r"\s{2,}", " ", string)  # collapse two or more consecutive whitespace characters into one space
    return string.lower()
def remove_stop_word(
    string, plus_stops=None, not_stops={"au", "à", "aujourd'hui", "ce jour", "actuellement"}, lang="fr"
):
    """Remove the stop words from the string.

    Arguments:
        string {str} -- string to process

    Keyword Arguments:
        plus_stops {set} -- additional stop words (default: None)
        not_stops {set} -- words to keep even if spaCy treats them as stop words (default: see signature)
        lang {str} -- language / spaCy model to load (default: {"fr"})

    Returns:
        str -- text without stop words
    """
    nlp = spacy.load(lang)
    if plus_stops is not None:
        nlp.Defaults.stop_words |= plus_stops
    if not_stops is not None:
        nlp.Defaults.stop_words -= not_stops
    doc = nlp(string)
    tokens = [token.text for token in doc if token.text not in nlp.Defaults.stop_words]
    return " ".join(tokens)
def check_spell(string, lang="fr"): | |
"""spell checker for the text list | |
Arguments: | |
text_list {list} -- list contains the string | |
Keyword Arguments: | |
lang {str} -- language (default: {"fr"}) | |
Returns: | |
string -- string after spell checking | |
""" | |
tokens = string.split(" ") | |
spell = SpellChecker(language=lang, distance=1) | |
text = spell.unknown(tokens) | |
for word in text: | |
tokens[tokens.index(word)] = spell.correction(word) | |
return " ".join(tokens) | |
def lemmatize(string, lang="fr"): | |
"""lemmatization of a string | |
Arguments: | |
string {str} -- string to precess | |
Keyword Arguments: | |
lang {str} -- language (default: {"fr"}) | |
Returns: | |
str -- string after being processed | |
""" | |
nlp = spacy.load(lang) | |
doc = nlp(string) | |
text = " ".join([token.lemma_ for token in doc]) | |
# tokens = [token.lemma_ for token in doc] | |
return text | |
def remove_tel(string):
    """Remove phone numbers from the text.

    Arguments:
        string {str} -- text

    Returns:
        str -- text after removing the phone numbers
    """
    # use plain string replacement: raw phone numbers may contain characters
    # such as "+" or "(" that are regex metacharacters
    for match in PhoneNumberMatcher(string, "FR"):
        string = string.replace(match.raw_string, "")
    return string
def remove_email(string):
    """Remove email addresses from the text.

    Arguments:
        string {str} -- text

    Returns:
        str -- text after removing the email addresses
    """
    reg = r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)"
    # substitute on the pattern itself, so every address is removed and no
    # metacharacters from the matched text leak into re.sub
    return re.sub(reg, "", string)
def preprocess(string):
    """Combine all the steps of text preprocessing.

    Arguments:
        string {str} -- text to process

    Returns:
        str -- processed text
    """
    text = remove_control_chart(string)
    # text = lemmatize(text)
    text = clean_special_charac(text)
    # text = check_spell(text)
    text = remove_stop_word(text)
    text = remove_tel(text)
    text = remove_email(text)
    text = lemmatize(text)
    return text
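A minimal usage sketch, assuming a French spaCy model is installed and loadable as "fr" (spaCy 2.x style) together with the phonenumbers and pyspellchecker packages; the sample string is purely illustrative.

if __name__ == "__main__":
    # hypothetical sample text, for illustration only
    sample = "Développeur Python chez ACME à Paris.\nContact : exemple@mail.fr"
    print(preprocess(sample))  # prints the cleaned, stop-word-filtered, lemmatized text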