Last active
April 22, 2021 12:32
-
-
Save deargle/b57738c8ce2b4ed6ca90f86d5422431f to your computer and use it in GitHub Desktop.
Example of TfidfVectorizer with custom tokenizer that does basic stemming
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Example of TfidfVectorizer with a custom tokenizer that does basic
Porter stemming.

Created on Tue Apr 24 16:30:42 2018
@author: deargle
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import nltk
import pandas as pd
import string

# These filenames are artifacts from translating the "predict future sales"
# kaggle competition files.
# (<csv-name>, <column name of thing to tokenize>, <number of features to retain>)
the_things = [
    ('items-translated', 'item_name_translated', 50),
    ('item_categories-translated', 'item_category_name_translated', 10),
    ('shops-translated', 'shop_name_translated', 10),
]

# Translation table mapping every punctuation and digit codepoint to None;
# str.translate() deletes characters mapped to None in a single pass.
trans_table = {ord(c): None for c in string.punctuation + string.digits}

stemmer = PorterStemmer()
def tokenize(text):
    """Tokenize *text* for TfidfVectorizer.

    Deletes punctuation and digits via ``trans_table`` (each matched
    character maps to None, so ``str.translate`` drops it), word-tokenizes
    with NLTK, keeps only tokens of at least two characters, and reduces
    each surviving token to its Porter stem.
    """
    cleaned = text.translate(trans_table)
    # Length filter runs before stemming: single-character tokens are
    # discarded outright.
    return [stemmer.stem(word)
            for word in nltk.word_tokenize(cleaned)
            if len(word) > 1]
def do_the_thing(filename, name_name, feature_cnt):
    """Tf-idf-vectorize one column of a CSV and join the features back on.

    Reads ``<filename>.csv``, fits a TfidfVectorizer (custom stemming
    tokenizer, binary term counts, English stop words) on the column
    *name_name* keeping at most *feature_cnt* features, and returns the
    original DataFrame with one ``tfidf_<term>`` column per retained term.
    """
    things_to_do_it_to = pd.read_csv('%s.csv' % filename)
    tfidf = TfidfVectorizer(tokenizer=tokenize, binary=True,
                            stop_words='english', use_idf=True,
                            max_features=feature_cnt)
    matrix = tfidf.fit_transform(things_to_do_it_to[name_name])
    # Bug fix: the original built column names with
    # 'tfidf_' + name.encode('utf-8'), which is a str + bytes TypeError on
    # Python 3. Feature names are already str, so no encoding is needed.
    columns = ['tfidf_' + name for name in tfidf.get_feature_names()]
    features = pd.DataFrame(matrix.toarray(), columns=columns)
    things_done_to = things_to_do_it_to.join(features)
    return things_done_to
#%%
# To process every dataset listed in `the_things` instead, loop over it:
#   for (filename, name_name, feature_cnt) in the_things:
#       do_the_thing(filename, name_name, feature_cnt)
#%%
# One-off run against pizza.csv: vectorize its 'text' column, keep the top
# 20 features, and write the augmented frame out.
things_done_to = do_the_thing('pizza', 'text', 20)
things_done_to.to_csv('pizza_features.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment