Text preprocessing using spaCy
import re
from typing import List

import spacy
from spacy.tokens import Doc
from tqdm import tqdm


class SpacyPreprocessor:
    def __init__(
        self,
        spacy_model=None,
        remove_numbers=False,
        remove_special=True,
        pos_to_remove=None,
        remove_stopwords=False,
        lemmatize=False,
    ):
        """
        Preprocesses text using spaCy
        :param spacy_model: a loaded spaCy pipeline; defaults to en_core_web_sm
        :param remove_numbers: Whether to remove numbers from text
        :param remove_stopwords: Whether to remove stopwords from text
        :param remove_special: Whether to remove special characters (including numbers)
        :param pos_to_remove: list of PoS tags to remove
        :param lemmatize: Whether to apply lemmatization
        """
        self._remove_numbers = remove_numbers
        self._pos_to_remove = pos_to_remove
        self._remove_stopwords = remove_stopwords
        self._remove_special = remove_special
        self._lemmatize = lemmatize
        if not spacy_model:
            self.model = spacy.load("en_core_web_sm")
        else:
            self.model = spacy_model

    @staticmethod
    def download_spacy_model(model="en_core_web_sm"):
        print(f"Downloading spaCy model {model}")
        spacy.cli.download(model)
        print(f"Finished downloading model {model}")

    @staticmethod
    def load_model(model="en_core_web_sm"):
        # NER and the dependency parser are not needed for cleaning,
        # so disabling them speeds up processing.
        return spacy.load(model, disable=["ner", "parser"])
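
    # A sketch of first-time setup (run the download once per environment;
    # the variable names below are illustrative):
    #   SpacyPreprocessor.download_spacy_model("en_core_web_sm")
    #   nlp = SpacyPreprocessor.load_model("en_core_web_sm")
    #   preprocessor = SpacyPreprocessor(spacy_model=nlp)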

    def tokenize(self, text) -> List[str]:
        """
        Tokenize text using a spaCy pipeline
        :param text: Text to tokenize
        :return: list of str
        """
        doc = self.model(text)
        return [token.text for token in doc]
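
    # Example (exact splits depend on the loaded model; shown for en_core_web_sm):
    #   preprocessor.tokenize("Don't panic!")
    #   -> ["Do", "n't", "panic", "!"]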

    def preprocess_text(self, text) -> str:
        """
        Runs a spaCy pipeline and removes unwanted parts from text
        :param text: text string to clean
        :return: str, clean text
        """
        doc = self.model(text)
        return self.__clean(doc)
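
    # Example with default options plus stopword removal (hypothetical input;
    # output may vary with the model version):
    #   SpacyPreprocessor(remove_stopwords=True).preprocess_text("The cats ran!")
    #   -> "cats ran"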

    def preprocess_text_list(self, texts: List[str]) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwanted parts from a list of texts.
        Leverages spaCy's `pipe` for faster batch processing.
        :param texts: List of texts to clean
        :return: List of clean texts
        """
        clean_texts = []
        for doc in tqdm(self.model.pipe(texts)):
            clean_texts.append(self.__clean(doc))
        return clean_texts
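
    # Example (a sketch; tqdm reports progress on stderr):
    #   preprocessor.preprocess_text_list(["First doc!", "Second doc?"])
    #   -> ["first doc", "second doc"]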

    def __clean(self, doc: Doc) -> str:
        tokens = []
        # PoS tag removal
        if self._pos_to_remove:
            for token in doc:
                if token.pos_ not in self._pos_to_remove:
                    tokens.append(token)
        else:
            tokens = list(doc)
        # Remove numbers
        if self._remove_numbers:
            tokens = [
                token for token in tokens if not (token.like_num or token.is_currency)
            ]
        # Remove stopwords
        if self._remove_stopwords:
            tokens = [token for token in tokens if not token.is_stop]
        # Remove unwanted tokens (punctuation, whitespace, quotes, brackets)
        tokens = [
            token
            for token in tokens
            if not (
                token.is_punct or token.is_space or token.is_quote or token.is_bracket
            )
        ]
        # Remove empty tokens
        tokens = [token for token in tokens if token.text.strip() != ""]
        # Lemmatize
        if self._lemmatize:
            text = " ".join([token.lemma_ for token in tokens])
        else:
            text = " ".join([token.text for token in tokens])
        if self._remove_special:
            # Remove non-alphabetic characters (keep apostrophes)
            text = re.sub(r"[^a-zA-Z\']", " ", text)
            # Remove non-ASCII characters
            text = re.sub(r"[^\x00-\x7F]+", "", text)
        text = text.lower()
        return text


if __name__ == "__main__":
    spacy_model = SpacyPreprocessor.load_model()
    preprocessor = SpacyPreprocessor(
        spacy_model=spacy_model, lemmatize=True, remove_numbers=True
    )
    clean_text = preprocessor.preprocess_text("spaCy is awesome! 123")
    print(clean_text)
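
With lemmatize=True and remove_numbers=True, the demo should print something like "spacy be awesome": "!" is dropped as punctuation, "123" as a number-like token, "is" is lemmatized to "be", and the result is lowercased. The exact lemmas can vary slightly across spaCy model versions.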