Skip to content

Instantly share code, notes, and snippets.

@neuromaancer
Last active August 23, 2019 09:42
Show Gist options
  • Save neuromaancer/9f77f8442c4b9c4f36468fc28e1cf3bb to your computer and use it in GitHub Desktop.
Save neuromaancer/9f77f8442c4b9c4f36468fc28e1cf3bb to your computer and use it in GitHub Desktop.
python preprocessing
import os
import re
import sys
import spacy
from phonenumbers import PhoneNumberMatcher
from spellchecker import SpellChecker
def remove_control_chart(string):
    r"""Remove textual byte escapes such as \xc2 or \xa0 from a string.

    These are literal four-character sequences (backslash, "x", two hex
    digits) that typically remain after a bytes object was converted to
    text via its repr, not actual control characters.

    Arguments:
        string {string} -- string to process
    Returns:
        [string] -- string with every backslash-x-hex-hex sequence removed
    """
    # Only strip real hex escapes; a looser "any two characters" pattern
    # would also eat ordinary text that merely follows a "\x".
    return re.sub(r"\\x[0-9a-fA-F]{2}", "", string)
def clean_special_charac(string):
    """Normalize punctuation and whitespace, then lowercase the text.

    The substitutions are applied one after another in the order listed
    below; the order matters because later rules see the output of earlier
    ones. Range separators (hyphen, en dash and the French "à") are unified
    to "~", sentence-like punctuation becomes " , ", and the remaining
    special characters become a single space.

    Arguments:
        string {string} -- the string to process
    Returns:
        string -- lowercased string after cleaning the special characters
    """
    substitutions = (
        # Squeeze any run of one repeated character down to exactly two.
        (r"(.)\1+", r"\1\1"),
        # Detach English clitics from the word they are attached to.
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
        # Punctuation: either dropped (space) or turned into a comma.
        (r"!", " "),
        (r"\(", " , "),
        (r"\)", " , "),
        (r"\?", " , "),
        (r"\:", " "),
        (r"\/", " "),
        (r"\\", " "),
        (r"\"", " "),
        (r"\.", " , "),
        # "~" marks the separation of a span (e.g. a date range).
        (r"\-", " ~ "),
        (r"–", " ~ "),
        (r"\n", " "),
        (r"\t", " "),
        # For date calculations, "à" also separates a span.
        ("à", "~"),
        ("\xa0", u" "),
        ("\u25cf", u" "),
        # Finally collapse every run of two or more whitespace characters.
        (r"\s{2,}", " "),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()
def remove_stop_word(
    string,
    plus_stops=None,
    not_stops=frozenset({"au", "à", "aujourd'hui", "ce jour", "actuellement"}),
    lang="fr",
):
    """Remove stop words from a string.

    The text is tokenized with the spaCy pipeline loaded for ``lang`` and
    every token whose text is in the effective stop-word set is dropped.
    The effective set is the language default, plus ``plus_stops``, minus
    ``not_stops``. It is built as a local copy so the pipeline's shared
    default stop-word set is never modified and calls cannot influence
    each other.

    Arguments:
        string {string} -- the string to process
    Keyword Arguments:
        plus_stops {set} -- additional stop words (default: None)
        not_stops {set} -- words that must NOT be treated as stop words;
            pass None to keep the language defaults untouched
            (default: {"au", "à", "aujourd'hui", "ce jour", "actuellement"})
        lang {string} -- name of the spaCy model to load (default: {"fr"})
    Returns:
        string -- remaining tokens joined with single spaces
    """
    # NOTE(review): the model is loaded on every call; callers processing
    # many strings may want to cache it — confirm against usage.
    nlp = spacy.load(lang)

    # Work on a copy: mutating nlp.Defaults.stop_words in place would leak
    # this call's additions/removals into every later use of the language.
    stop_words = set(nlp.Defaults.stop_words)
    if plus_stops is not None:
        stop_words |= set(plus_stops)
    if not_stops is not None:
        # NOTE(review): the comparison below is per token, so a multi-word
        # entry such as "ce jour" presumably never matches — verify.
        stop_words -= set(not_stops)

    doc = nlp(string)
    return " ".join(token.text for token in doc if token.text not in stop_words)
def check_spell(string, lang="fr"):
    """Spell-check a space-separated string and return the corrected text.

    The string is split on single spaces, every token the spell checker
    does not know is replaced by its most likely correction (edit distance
    1), and the tokens are joined back with single spaces.

    Arguments:
        string {str} -- text to check
    Keyword Arguments:
        lang {str} -- language of the spell-checker dictionary (default: {"fr"})
    Returns:
        string -- string after spell checking
    """
    tokens = string.split(" ")
    spell = SpellChecker(language=lang, distance=1)

    # unknown() reports words in lowercase (the checker is case-insensitive
    # by default), so corrections are keyed by the lowercased form and
    # looked up per token. This handles capitalised tokens and repeated
    # misspellings, which a tokens.index(word) lookup cannot.
    corrections = {word: spell.correction(word) for word in spell.unknown(tokens)}

    checked = []
    for token in tokens:
        suggestion = corrections.get(token.lower())
        # correction() may yield None when no candidate exists (recent
        # pyspellchecker releases); keep the original token in that case.
        checked.append(suggestion if suggestion else token)
    return " ".join(checked)
def lemmatize(string, lang="fr"):
    """Replace every token of a string by its lemma.

    Arguments:
        string {str} -- string to process
    Keyword Arguments:
        lang {str} -- name of the spaCy model to load (default: {"fr"})
    Returns:
        str -- lemmas of all tokens, joined with single spaces
    """
    pipeline = spacy.load(lang)
    lemmas = (token.lemma_ for token in pipeline(string))
    return " ".join(lemmas)
def remove_tel(string):
    """Remove every phone number from the text.

    Numbers are detected with ``PhoneNumberMatcher`` using "FR" as the
    default region. Each match is cut out by its position in the text, so
    any number of matches is handled and characters such as "+", "(" or
    "." inside a number are never interpreted as regex syntax.

    Arguments:
        string {str} -- text
    Returns:
        str -- text after removing the phone numbers; unchanged when no
            number is found
    """
    kept = []
    cursor = 0
    for match in PhoneNumberMatcher(string, "FR"):
        # Keep the text between the previous match and this one, then skip
        # over the matched number itself.
        kept.append(string[cursor:match.start])
        cursor = match.start + len(match.raw_string)
    kept.append(string[cursor:])
    return "".join(kept)
def remove_email(string):
    """Remove every email address from the text.

    Arguments:
        string {str} -- text
    Returns:
        str -- text after removing the email addresses; unchanged when the
            text contains none
    """
    # Substitute with the email pattern itself. Re-using a matched address
    # as a regex would treat its "." and "+" as metacharacters and would
    # only ever handle the first address found.
    email_pattern = r"[a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+"
    return re.sub(email_pattern, "", string)
def preprocess(string):
    """Run the full text-preprocessing pipeline on a string.

    The steps are applied in this order, each with its default settings:
    escape removal, special-character cleaning, stop-word removal, phone
    number removal, email removal, lemmatization.

    Arguments:
        string {[string]} -- text to process
    Returns:
        [string] -- text processed
    """
    steps = (
        remove_control_chart,
        clean_special_charac,
        remove_stop_word,
        remove_tel,
        remove_email,
        lemmatize,
    )
    text = string
    for step in steps:
        text = step(text)
    return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment