@omri374
Created May 13, 2020 10:39
Text preprocessing using spaCy
import re
from typing import List
import spacy
from spacy.tokens import Doc
from tqdm import tqdm


class SpacyPreprocessor:
    def __init__(
        self,
        spacy_model=None,
        remove_numbers=False,
        remove_special=True,
        pos_to_remove=None,
        remove_stopwords=False,
        lemmatize=False,
    ):
        """
        Preprocesses text using spaCy
        :param spacy_model: a loaded spaCy model (en_core_web_sm is loaded if None)
        :param remove_numbers: Whether to remove numbers from text
        :param remove_stopwords: Whether to remove stopwords from text
        :param remove_special: Whether to remove special characters (including numbers)
        :param pos_to_remove: list of PoS tags to remove
        :param lemmatize: Whether to apply lemmatization
        """
        self._remove_numbers = remove_numbers
        self._pos_to_remove = pos_to_remove
        self._remove_stopwords = remove_stopwords
        self._remove_special = remove_special
        self._lemmatize = lemmatize
        if not spacy_model:
            self.model = spacy.load("en_core_web_sm")
        else:
            self.model = spacy_model

    @staticmethod
    def download_spacy_model(model="en_core_web_sm"):
        print(f"Downloading spaCy model {model}")
        spacy.cli.download(model)
        print("Finished downloading model")

    @staticmethod
    def load_model(model="en_core_web_sm"):
        # NER and the dependency parser are not needed for cleaning,
        # so disabling them speeds up the pipeline
        return spacy.load(model, disable=["ner", "parser"])

    def tokenize(self, text) -> List[str]:
        """
        Tokenize text using a spaCy pipeline
        :param text: Text to tokenize
        :return: list of str
        """
        doc = self.model(text)
        return [token.text for token in doc]
def preprocess_text(self, text) -> str:
"""
Runs a spaCy pipeline and removes unwanted parts from text
:param text: text string to clean
:return: str, clean text
"""
doc = self.model(text)
return self.__clean(doc)

    def preprocess_text_list(self, texts: List[str]) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwanted parts from a list of texts.
        Leverages spaCy's `pipe` for faster batch processing.
        :param texts: List of texts to clean
        :return: List of clean texts
        """
        clean_texts = []
        for doc in tqdm(self.model.pipe(texts)):
            clean_texts.append(self.__clean(doc))
        return clean_texts

    def __clean(self, doc: Doc) -> str:
        tokens = []
        # POS tag removal
        if self._pos_to_remove:
            for token in doc:
                if token.pos_ not in self._pos_to_remove:
                    tokens.append(token)
        else:
            tokens = doc

        # Remove numbers
        if self._remove_numbers:
            tokens = [
                token for token in tokens if not (token.like_num or token.is_currency)
            ]

        # Remove stopwords
        if self._remove_stopwords:
            tokens = [token for token in tokens if not token.is_stop]

        # Remove punctuation, whitespace, quotes and brackets
        tokens = [
            token
            for token in tokens
            if not (
                token.is_punct or token.is_space or token.is_quote or token.is_bracket
            )
        ]

        # Remove empty tokens
        tokens = [token for token in tokens if token.text.strip() != ""]

        # Lemmatize
        if self._lemmatize:
            text = " ".join([token.lemma_ for token in tokens])
        else:
            text = " ".join([token.text for token in tokens])

        if self._remove_special:
            # Remove non-alphabetic characters
            text = re.sub(r"[^a-zA-Z\']", " ", text)
        # Remove non-ASCII characters
        text = re.sub(r"[^\x00-\x7F]+", "", text)
        text = text.lower()
        return text


if __name__ == "__main__":
    spacy_model = SpacyPreprocessor.load_model()
    preprocessor = SpacyPreprocessor(
        spacy_model=spacy_model, lemmatize=True, remove_numbers=True
    )
    clean_text = preprocessor.preprocess_text("spaCy is awesome! 123")
    print(clean_text)
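
For larger corpora, prefer `preprocess_text_list` over calling `preprocess_text` in a loop, since it streams documents through spaCy's `pipe`. A minimal usage sketch (the sample sentences are illustrative, not part of the gist):

    # SpacyPreprocessor.download_spacy_model()  # run once if en_core_web_sm is not installed
    spacy_model = SpacyPreprocessor.load_model()
    preprocessor = SpacyPreprocessor(
        spacy_model=spacy_model, remove_stopwords=True, lemmatize=True
    )
    samples = ["The 3 dogs were running quickly!", "Prices rose by $5 in May."]
    # Returns one cleaned, lowercased string per input text
    print(preprocessor.preprocess_text_list(samples))
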
@aliforgetti
This is great and exactly what I was looking for. Thank you for this :)
