Skip to content

Instantly share code, notes, and snippets.

@LouisdeBruijn
Last active June 23, 2020 08:08
Show Gist options
  • Select an option

  • Save LouisdeBruijn/17f2e63c7d81883efd430d46bb8787fb to your computer and use it in GitHub Desktop.

Select an option

Save LouisdeBruijn/17f2e63c7d81883efd430d46bb8787fb to your computer and use it in GitHub Desktop.
def tokenize_pos(tokens):
    """Return the tokens with their part-of-speech tag appended.

    ``tokens`` is handed to ``nltk.pos_tag``; every resulting
    ``(token, tag)`` pair becomes the string ``<token>_POS-<tag>``.
    """
    tagged_pairs = nltk.pos_tag(tokens)
    return ["_POS-".join((word, pos)) for word, pos in tagged_pairs]
class LengthFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives length statistics per document.

    Each document is turned into a dict holding its number of items
    (``"words"``) and its number of distinct items (``"unique_words"``).
    """

    def fit(self, x, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def _get_features(self, doc):
        """Return the total and distinct item counts of a single document."""
        total = len(doc)
        distinct = len(set(doc))
        return dict(words=total, unique_words=distinct)

    def transform(self, raw_documents):
        """Return one feature dict per document, in input order."""
        return list(map(self._get_features, raw_documents))
def feature_union(count, tfidf, textstats):
    """Build a Multinomial Naive Bayes pipeline over the selected features.

    Args:
        count: If truthy, add a ``CountVectorizer`` over bigrams
            (``ngram_range=(2, 2)``) of the tokens produced by
            ``tokenize_pos``.
        tfidf: If truthy, add a ``TfidfVectorizer`` that uses each document's
            items as-is (identity preprocessor and tokenizer).
        textstats: If truthy, add the ``LengthFeatures`` counts, vectorized
            with a ``DictVectorizer``.

    Returns:
        A ``Pipeline`` whose ``'vec'`` step is the ``FeatureUnion`` of the
        selected extractors and whose ``'cls'`` step is ``MultinomialNB``.

    Raises:
        SystemExit: With exit status 1 when no feature is selected.
    """
    # Validate up front so nothing is constructed for an invalid selection.
    if not (count or tfidf or textstats):
        critical("Please select one or multiple features.")
        # The bare ``exit()`` helper is injected by the ``site`` module for
        # interactive use (it may be absent, e.g. under ``python -S``) and,
        # called without arguments, terminates with status 0. Raise the same
        # exception type directly, with a failing status.
        raise SystemExit(1)

    features = []
    if count:
        # Identity preprocessor: documents are passed to the tokenizer
        # untouched (presumably already tokenized -- confirm against caller).
        count_vec = CountVectorizer(
            preprocessor=lambda x: x,
            tokenizer=tokenize_pos,
            ngram_range=(2, 2),
        )
        features.append(('count', count_vec))
    if tfidf:
        # Identity preprocessor and tokenizer: each document's items are used
        # as the terms directly.
        tfidf_vec = TfidfVectorizer(
            preprocessor=lambda x: x,
            tokenizer=lambda x: x,
        )
        features.append(('tfidf', tfidf_vec))
    if textstats:
        length_vec = Pipeline([
            ('textstats', LengthFeatures()),
            ('vec', DictVectorizer()),
        ])
        features.append(('textstats', length_vec))

    vec = FeatureUnion(features)
    return Pipeline([('vec', vec), ('cls', MultinomialNB())])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment