Skip to content

Instantly share code, notes, and snippets.

View ravishchawla's full-sized avatar

Ravish Chawla ravishchawla

View GitHub Profile
# Ask the trained model for a class index per test sample.
predictions = model.predict_classes(X_test)
# Collapse each Y_test row to a single class index so it lines up with the
# predictions: a 1 in the first position means class 0, anything else class 1.
actuals = [int(row[0] != 1) for row in Y_test]
# Score the two label sequences with scikit-learn's metrics module.
accuracy_score(predictions, actuals)
# Build a word -> coefficient-vector lookup from the pre-trained embeddings
# file. Each record is whitespace separated: the token first, then its values.
embeddings_index = {}
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (assumes the GloVe dump is UTF-8 text -- TODO confirm if a
# different dump is used).
with open('data/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# One row per vocabulary entry, 100 columns per row.
# NOTE(review): if rows are addressed by tokenizer ids that start at 1, the
# matrix needs vocab_size + 1 rows -- confirm against the code that fills it.
vocab_size = len(sequence_dict)
embeddings_matrix = np.zeros((vocab_size, 100))
# Stacked-LSTM classifier with dropout between the recurrent layers:
# embedding -> LSTM (full sequence out) -> dropout -> LSTM -> dense -> softmax.
model = Sequential([
    Embedding(len(word_dict), max_cap, input_length=max_cap),
    # First LSTM returns the whole sequence so the second LSTM can consume it.
    LSTM(60, return_sequences=True, recurrent_dropout=0.5),
    Dropout(0.5),
    LSTM(60, recurrent_dropout=0.5),
    Dense(60, activation='relu'),
    # Two output units, one per class.
    Dense(2, activation='softmax'),
])
print(model.summary())

optimizer = Adam(lr=0.01, decay=0.001)
# Stacked-LSTM classifier without dropout:
# embedding -> LSTM (full sequence out) -> LSTM -> dense -> softmax.
model = Sequential([
    Embedding(len(word_dict), max_cap, input_length=max_cap),
    # First LSTM returns the whole sequence so the second LSTM can consume it.
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    # Two output units, one per class.
    Dense(2, activation='softmax'),
])
print(model.summary())

# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
optimizer = Adam(lr=0.001, decay=0.0001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)
# Bring every encoded review to exactly max_cap entries: longer reviews lose
# their tail (truncating='post'), shorter ones are padded.
max_cap = 60
X = pad_sequences(reviews_encoded, maxlen=max_cap, truncating='post')

# One-hot label rows: a label containing the character '0' becomes [0, 1],
# every other label becomes [1, 0].
Y = np.array([[1, 0] if '0' not in label else [0, 1] for label in labels])

# Fix the seed, then lay out the review positions 0..len(X)-1 that will be
# used for shuffling.
np.random.seed(1024)
random_posits = np.arange(len(X))
# Fit a Keras Tokenizer on the sentences to learn the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Encode every sentence as a list of integer word ids. The encoded sentences
# generally differ in length, so request an object array explicitly: recent
# NumPy releases raise ValueError instead of building a ragged array
# implicitly.
text_sequences = np.array(tokenizer.texts_to_sequences(sentences), dtype=object)

# sequence_dict maps word -> integer encoding (the tokenizer's word_index);
# word_dict is its inverse, mapping encoding -> word.
sequence_dict = tokenizer.word_index
word_dict = {num: val for val, num in sequence_dict.items()}
'''
Clean each document by removing unnecessary characters and splitting by space.
'''
def clean_document(doco):
punctuation = string.punctuation + '\n\n';
punc_replace = ''.join([' ' for s in punctuation]);
doco_clean = doco.replace('-', ' ');
doco_alphas = re.sub(r'\W +', '', doco_clean)
trans_table = str.maketrans(punctuation, punc_replace);
doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
'''
Read reviews from a JSON-formatted file into an array.
'''
lines = [];
num_pos = 0; num_neg = 0; num_total = 75000;
with open('data/review.json', 'r') as f:
for line in f:
if (len(lines) >= (num_total * 2)):
break;
@ravishchawla
ravishchawla / prbot.py
Last active November 15, 2017 20:11
prbot
import time;
import datetime;
import requests;
import json;
import os;
import os.path;
import time;
'''
Bot to post Pull Request changes to a Slack channel.
# coding: utf-8
# # Training a Word2Vec Model on the Reddit Comments Dataset
#
# ### Ravish Chawla
# In[276]:
# Render matplotlib figures inline in the notebook. Uses run_line_magic, the
# supported replacement for the deprecated InteractiveShell.magic() call.
get_ipython().run_line_magic('matplotlib', 'inline')