# https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html

import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin


class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        # Map the Penn Treebank tag prefix to a WordNet POS; default to noun
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
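A minimal usage sketch (not part of the original gist): following the linked tutorial, the transformer can sit at the front of a scikit-learn Pipeline, with an identity function passed to TfidfVectorizer so the pre-tokenized output is used as-is. This assumes the NLTK data packages punkt, stopwords, wordnet, and averaged_perceptron_tagger have been downloaded.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

def identity(words):
    # NLTKPreprocessor already yields token lists, so the vectorizer
    # needs no further tokenization or lowercasing
    return words

model = Pipeline([
    ('preprocessor', NLTKPreprocessor()),
    ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None,
                                   lowercase=False)),
])

docs = ["The quick brown foxes were jumping over the lazy dogs."]
print(NLTKPreprocessor().transform(docs))
# expected: [['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']]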
# https://github.com/ChenglongChen/Kaggle_HomeDepot/blob/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Code/Chenglong/data_processor.py

import nltk


## lemmatizing
class Lemmatizer:
    def __init__(self):
        self.Tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.Lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    def transform(self, text):
        tokens = [self.Lemmatizer.lemmatize(token) for token in self.Tokenizer.tokenize(text)]
        return " ".join(tokens)


## stemming
class Stemmer:
    def __init__(self, stemmer_type="snowball"):
        self.stemmer_type = stemmer_type
        if self.stemmer_type == "porter":
            self.stemmer = nltk.stem.PorterStemmer()
        elif self.stemmer_type == "snowball":
            self.stemmer = nltk.stem.SnowballStemmer("english")

    def transform(self, text):
        tokens = [self.stemmer.stem(token) for token in text.split(" ")]
        return " ".join(tokens)