Last active
June 23, 2020 08:08
-
-
Save LouisdeBruijn/17f2e63c7d81883efd430d46bb8787fb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenize_pos(tokens):
    """Return every token suffixed with its part-of-speech tag.

    The (token, tag) pairs come from ``nltk.pos_tag(tokens)``; each pair is
    rendered as the single string ``<token>_POS-<tag>``.
    """
    tagged = []
    for word, pos in nltk.pos_tag(tokens):
        # str.join keeps the strictness of "+": non-str parts raise TypeError.
        tagged.append("_POS-".join((word, pos)))
    return tagged
class LengthFeatures(BaseEstimator, TransformerMixin):
    """Transformer that describes each document by two size counts.

    For every document, ``transform`` produces a dict holding ``len(doc)``
    under ``"words"`` and the number of distinct elements of ``doc`` under
    ``"unique_words"``. The counts are over whatever elements ``doc``
    contains (words, if the document is a list of tokens).
    """

    def fit(self, x, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def _get_features(self, doc):
        """Build the count dict for a single document."""
        total = len(doc)
        distinct = len(set(doc))
        return dict(words=total, unique_words=distinct)

    def transform(self, raw_documents):
        """Return a list with one count dict per document, in input order."""
        return list(map(self._get_features, raw_documents))
def feature_union(count, tfidf, textstats):
    """Build a classification pipeline from the selected feature extractors.

    Args:
        count: If truthy, include a ``CountVectorizer`` over bigrams
            (``ngram_range=(2, 2)``) of the tokens returned by
            ``tokenize_pos``.
        tfidf: If truthy, include a ``TfidfVectorizer``.
        textstats: If truthy, include ``LengthFeatures`` followed by a
            ``DictVectorizer``.

    Returns:
        A ``Pipeline`` whose ``'vec'`` step is a ``FeatureUnion`` of the
        selected extractors and whose ``'cls'`` step is ``MultinomialNB``.

    Raises:
        SystemExit: If no feature is selected. The message is reported via
            ``critical`` and the process exits with status 1.
    """
    # Local import so the block does not depend on a module-level import.
    import sys

    features = []
    if count:
        # Identity preprocessor: each document is handed to the tokenizer
        # as-is (presumably an already tokenized list -- confirm with caller).
        count_vec = CountVectorizer(
            preprocessor=lambda x: x,
            tokenizer=tokenize_pos,
            ngram_range=(2, 2),
        )
        features.append(('count', count_vec))
    if tfidf:
        # Identity preprocessor and tokenizer: the vectorizer uses each
        # document exactly as it is passed in.
        tfidf_vec = TfidfVectorizer(
            preprocessor=lambda x: x,
            tokenizer=lambda x: x,
        )
        features.append(('tfidf', tfidf_vec))
    if textstats:
        length_vec = Pipeline([
            ('textstats', LengthFeatures()),
            ('vec', DictVectorizer()),
        ])
        features.append(('textstats', length_vec))

    if not features:
        critical("Please select one or multiple features.")
        # sys.exit instead of the site-provided exit(): the latter is meant
        # for interactive use and may be missing (e.g. ``python -S``); a
        # non-zero status signals the failure to the calling process.
        sys.exit(1)

    vec = FeatureUnion(features)
    classifier = Pipeline([('vec', vec), ('cls', MultinomialNB())])
    return classifier
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment