import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation

# Stopword list for filtering, with punctuation tokens included
stopwords_list = stopwords.words('english')
stopwords_list += list(punctuation)

# `some_text_data` is a placeholder for a raw text string
tokens = word_tokenize(some_text_data)
# Lowercase before the membership check, since the stopword list is lowercase
stopped_tokens = [w.lower() for w in tokens if w.lower() not in stopwords_list]
# Frequency distribution over the cleaned tokens
freq_dist = nltk.FreqDist(stopped_tokens)
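# Usage sketch: most_common() returns (token, count) pairs sorted by count,
# handy for a quick look at the vocabulary after stopword removal.
freq_dist.most_common(10)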
# http://www.nltk.org/howto/stem.html
from nltk.stem import PorterStemmer, SnowballStemmer
stemmer = PorterStemmer()  # or SnowballStemmer("english", ignore_stopwords=True)
stemmer.stem("generously")
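# Sketch comparing the two stemmers on the same word; Porter tends to be the
# more aggressive of the two, per the NLTK howto linked above.
PorterStemmer().stem("generously")             # 'gener'
SnowballStemmer("english").stem("generously")  # 'generous'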
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('feet')  # -> 'foot'
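# The lemmatizer treats words as nouns by default; passing a part-of-speech
# tag gives better results for verbs.
lemmatizer.lemmatize('running')           # 'running' (treated as a noun)
lemmatizer.lemmatize('running', pos='v')  # 'run'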
# http://www.nltk.org/howto/collocations.html
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(stopped_tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)  # score by raw frequency
finder.apply_freq_filter(5)  # drop bigrams occurring fewer than 5 times
finder.score_ngrams(bigram_measures.pmi)  # rescore by pointwise mutual information
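# Usage sketch: nbest() returns just the top-n bigrams rather than the full
# scored list.
finder.nbest(bigram_measures.pmi, 10)  # ten bigrams with the highest PMI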
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
# `data` is a placeholder for an iterable of raw text documents
tf_idf_data_train = tfidf.fit_transform(data)
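# Sketch of applying the fitted vectorizer to held-out text; `test_data` is a
# hypothetical iterable of raw documents. transform() reuses the vocabulary
# and IDF weights learned during fit_transform().
tf_idf_data_test = tfidf.transform(test_data)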
# keras tokenization
import numpy as np
from keras.preprocessing.text import Tokenizer

# `complaints` is a placeholder for a list of raw text documents
tokenizer = Tokenizer(num_words=2000)  # keep only the 2000 most frequent words
tokenizer.fit_on_texts(complaints)
sequences = tokenizer.texts_to_sequences(complaints)  # lists of word indices
# Similar to sequences, but returns a binary numpy array (documents x words)
one_hot_results = tokenizer.texts_to_matrix(complaints, mode='binary')
word_index = tokenizer.word_index
np.shape(one_hot_results)
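# Sketch: the sequences above have ragged lengths, so models typically need
# them padded or truncated to a fixed size first; maxlen=100 is an arbitrary
# choice here.
from keras.preprocessing.sequence import pad_sequences
padded = pad_sequences(sequences, maxlen=100)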
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# `product` is a placeholder for a list/array of class labels
le = LabelEncoder()
le.fit(product)
product_cat = le.transform(product)  # labels as integer codes
product_onehot = to_categorical(product_cat)  # integer codes as a one-hot matrix
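# Usage sketch: recover the original labels from the integer codes, or from
# the one-hot rows via argmax.
le.inverse_transform(product_cat)
le.inverse_transform(np.argmax(product_onehot, axis=1))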
import string
import gensim

# Strip punctuation, lowercase, and split each document into a token list;
# `data` is a placeholder for an iterable of raw text documents
text = []
for entry in data:
    sentence = entry.translate(str.maketrans('', '', string.punctuation)).split(' ')
    new_sent = []
    for word in sentence:
        new_sent.append(word.lower())
    text.append(new_sent)

model = gensim.models.Word2Vec(text, sg=1)  # sg=1 selects skip-gram (default is CBOW)
# The constructor above already trains; this call continues for additional epochs
model.train(text, total_examples=model.corpus_count, epochs=model.epochs)
model.wv.most_similar('happiness')
# Analogy query: president - usa + germany; negative expects a list, not a string
model.wv.most_similar(positive=['president', 'germany'], negative=['usa'])
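# Sketch of other common KeyedVectors lookups: the raw embedding for a word
# and the cosine similarity between two words (both must be in the vocabulary).
model.wv['happiness']                    # the learned vector
model.wv.similarity('happiness', 'joy')  # cosine similarity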