Last active
April 22, 2021 12:32
-
-
Save deargle/b57738c8ce2b4ed6ca90f86d5422431f to your computer and use it in GitHub Desktop.
Example of TfidfVectorizer with custom tokenizer that does basic stemming
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Example of TfidfVectorizer with a custom tokenizer that does basic
Porter stemming.

Created on Tue Apr 24 16:30:42 2018
@author: deargle
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import nltk
import pandas as pd
import string

# These filenames are artifacts from translating the "predict future sales"
# kaggle competition files.
# (<csv-name>, <column name of thing to tokenize>, <number of features to retain>)
the_things = [
    ('items-translated', 'item_name_translated', 50),
    ('item_categories-translated', 'item_category_name_translated', 10),
    ('shops-translated', 'shop_name_translated', 10),
]

# Translation table mapping every punctuation and digit codepoint to None;
# str.translate() deletes characters mapped to None in a single pass.
trans_table = {ord(c): None for c in string.punctuation + string.digits}

stemmer = PorterStemmer()
def tokenize(text):
    """Tokenize *text* for TfidfVectorizer.

    Deletes punctuation and digits via ``trans_table`` (each matched
    character maps to None, so ``str.translate`` drops it), word-tokenizes
    with NLTK, keeps only tokens of at least two characters, and reduces
    each surviving token to its Porter stem.
    """
    cleaned = text.translate(trans_table)
    # Length filter runs before stemming: single-character tokens are
    # discarded outright.
    return [stemmer.stem(word)
            for word in nltk.word_tokenize(cleaned)
            if len(word) > 1]
def do_the_thing(filename, name_name, feature_cnt):
    """Tf-idf-vectorize one column of a CSV and join the features back on.

    Reads ``<filename>.csv``, fits a TfidfVectorizer (custom stemming
    tokenizer, binary term counts, English stop words) on the column
    *name_name* keeping at most *feature_cnt* features, and returns the
    original DataFrame with one ``tfidf_<term>`` column per retained term.
    """
    things_to_do_it_to = pd.read_csv('%s.csv' % filename)
    tfidf = TfidfVectorizer(tokenizer=tokenize, binary=True,
                            stop_words='english', use_idf=True,
                            max_features=feature_cnt)
    matrix = tfidf.fit_transform(things_to_do_it_to[name_name])
    # Bug fix: the original built column names with
    # 'tfidf_' + name.encode('utf-8'), which is a str + bytes TypeError on
    # Python 3. Feature names are already str, so no encoding is needed.
    columns = ['tfidf_' + name for name in tfidf.get_feature_names()]
    features = pd.DataFrame(matrix.toarray(), columns=columns)
    things_done_to = things_to_do_it_to.join(features)
    return things_done_to
#%%
# To process every dataset listed in `the_things` instead, loop over it:
#   for (filename, name_name, feature_cnt) in the_things:
#       do_the_thing(filename, name_name, feature_cnt)
#%%
# One-off run against pizza.csv: vectorize its 'text' column, keep the top
# 20 features, and write the augmented frame out.
things_done_to = do_the_thing('pizza', 'text', 20)
things_done_to.to_csv('pizza_features.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment