LouisdeBruijn · June 23, 2020 08:08
diff --git a/feature_union.py b/feature_union.py
 def tokenize_pos(tokens):
    """Return every token suffixed with its NLTK part-of-speech tag.

    Each (token, tag) pair produced by ``nltk.pos_tag`` becomes one string
    of the form ``<token>_POS-<tag>``.
    """
    annotated = []
    for word, pos in nltk.pos_tag(tokens):
        annotated.append("_POS-".join((word, pos)))
    return annotated


 class LengthFeatures(BaseEstimator, TransformerMixin):
    """Transformer that describes each document by two length counts."""

    def fit(self, x, y=None):
        """Learn nothing; hand back the transformer itself."""
        return self

    def _get_features(self, doc):
        """Count the items in ``doc`` and how many of them are distinct."""
        total = len(doc)
        distinct = len(set(doc))
        return {"words": total, "unique_words": distinct}

    def transform(self, raw_documents):
        """Return one feature dict per document, in input order."""
        return list(map(self._get_features, raw_documents))


 def _identity(value):
    """Return ``value`` unchanged (picklable stand-in for ``lambda x: x``)."""
    return value


 def feature_union(count, tfidf, textstats):
    """Build a Naive Bayes pipeline on top of the selected feature extractors.

    The vectorizers receive a pass-through preprocessor (and, for tf-idf, a
    pass-through tokenizer), so each document is presumably already a
    sequence of tokens -- confirm against the caller.

    Args:
        count: If truthy, add a CountVectorizer over bigrams of the
            POS-annotated tokens produced by ``tokenize_pos``.
        tfidf: If truthy, add a TfidfVectorizer over the tokens as given.
        textstats: If truthy, add the ``LengthFeatures`` counts, vectorized
            with a DictVectorizer.

    Returns:
        A Pipeline whose 'vec' step is a FeatureUnion of the selected
        extractors and whose 'cls' step is a MultinomialNB classifier.

    Raises:
        SystemExit: With status 1 when no feature is selected.
    """
    # Named module-level callables instead of lambdas keep the returned
    # pipeline picklable.
    tfidf_vec = TfidfVectorizer(preprocessor=_identity, tokenizer=_identity)
    count_vec = CountVectorizer(preprocessor=_identity, tokenizer=tokenize_pos, ngram_range=(2, 2))
    length_vec = Pipeline([('textstats', LengthFeatures()), ('vec', DictVectorizer())])

    features = []
    if count:
        features.append(('count', count_vec))
    if tfidf:
        features.append(('tfidf', tfidf_vec))
    if textstats:
        features.append(('textstats', length_vec))

    if not features:
        critical("Please select one or multiple features.")
        # The ``exit()`` builtin is installed by the ``site`` module for
        # interactive use and reports status 0; raise SystemExit directly
        # with a failing status instead.
        raise SystemExit(1)

    vec = FeatureUnion(features)
    classifier = Pipeline([('vec', vec), ('cls', MultinomialNB())])

    return classifier
	def tokenize_pos(tokens):
	    """Add POS-tags to each token.

	    Each (token, tag) pair returned by ``nltk.pos_tag`` is rendered as
	    ``<token>_POS-<tag>``.
	    """
	    return [token+"_POS-"+tag for token, tag in nltk.pos_tag(tokens)]


	class LengthFeatures(BaseEstimator, TransformerMixin):
	    """Transformer that emits length counts for each document."""

	    def fit(self, x, y=None):
	        """Nothing is fitted; return the transformer itself."""
	        return self

	    def _get_features(self, doc):
	        """Return the item count and distinct-item count of ``doc``."""
	        return {"words": len(doc), "unique_words": len(set(doc))}

	    def transform(self, raw_documents):
	        """Return one feature dict per document, in input order."""
	        return [self._get_features(doc) for doc in raw_documents]


	def _identity(value):
	    """Return ``value`` unchanged (picklable stand-in for ``lambda x: x``)."""
	    return value


	def feature_union(count, tfidf, textstats):
	    """Add the selected features to a Naive Bayes pipeline.

	    The vectorizers receive a pass-through preprocessor (and, for tf-idf,
	    a pass-through tokenizer), so each document is presumably already a
	    sequence of tokens -- confirm against the caller.

	    Args:
	        count: If truthy, add a CountVectorizer over bigrams of the
	            POS-annotated tokens produced by ``tokenize_pos``.
	        tfidf: If truthy, add a TfidfVectorizer over the tokens as given.
	        textstats: If truthy, add the ``LengthFeatures`` counts,
	            vectorized with a DictVectorizer.

	    Returns:
	        A Pipeline whose 'vec' step is a FeatureUnion of the selected
	        extractors and whose 'cls' step is a MultinomialNB classifier.

	    Raises:
	        SystemExit: With status 1 when no feature is selected.
	    """
	    # Named module-level callables instead of lambdas keep the returned
	    # pipeline picklable.
	    tfidf_vec = TfidfVectorizer(preprocessor=_identity, tokenizer=_identity)
	    count_vec = CountVectorizer(preprocessor=_identity, tokenizer=tokenize_pos, ngram_range=(2, 2))
	    length_vec = Pipeline([('textstats', LengthFeatures()), ('vec', DictVectorizer())])

	    features = []
	    if count:
	        features.append(('count', count_vec))
	    if tfidf:
	        features.append(('tfidf', tfidf_vec))
	    if textstats:
	        features.append(('textstats', length_vec))

	    if not features:
	        critical("Please select one or multiple features.")
	        # The ``exit()`` builtin is installed by the ``site`` module for
	        # interactive use and reports status 0; raise SystemExit directly
	        # with a failing status instead.
	        raise SystemExit(1)

	    vec = FeatureUnion(features)
	    classifier = Pipeline([('vec', vec), ('cls', MultinomialNB())])

	    return classifier
No results found