Training Language Model on Breitbart Comments
import collections

import numpy as np
from keras.utils import Sequence


class BreitbartCommentsSequence(Sequence):
    def __init__(self, comments, vocab_size, previous_words, batch_size):
        # Keep only the vocab_size most frequent words across all comments
        all_comments_text = ' '.join(comments)
        all_words = all_comments_text.split(" ")
        vocabulary = collections.Counter(all_words)
        self.vocabulary = [word for word, count in vocabulary.most_common(vocab_size)]
        self.previous_words = previous_words
        self.batch_size = batch_size
        self.comments = comments

    def __len__(self):
        # Number of full batches per epoch
        return len(self.comments) // self.batch_size

    def __getitem__(self, idx):
        # TODO: preprocess the batch of comments into model inputs and targets
        batch = self.comments[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = []  # TODO: context windows of self.previous_words words each
        batch_y = []  # TODO: the word that follows each context window
        return np.array(batch_x), np.array(batch_y)
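The preprocessing left as a TODO above is not implemented in the gist. A minimal sketch of one way it could work follows, assuming integer-encoded context windows and a shared out-of-vocabulary index; the helper name encode_comments and the encoding scheme are illustrative, not part of the original code.

import numpy as np

def encode_comments(batch, vocabulary, previous_words):
    # Hypothetical helper (not in the gist): turn raw comment strings into
    # (context window, next word) pairs of vocabulary indices.
    word_to_index = {word: i for i, word in enumerate(vocabulary)}
    oov_index = len(vocabulary)  # shared index for out-of-vocabulary words
    contexts, targets = [], []
    for comment in batch:
        indices = [word_to_index.get(word, oov_index) for word in comment.split(" ")]
        # Slide a window of previous_words words over the comment;
        # the word right after each window is the prediction target.
        for start in range(len(indices) - previous_words):
            contexts.append(indices[start:start + previous_words])
            targets.append(indices[start + previous_words])
    return np.array(contexts), np.array(targets)

In practice the word_to_index map would be built once in __init__ rather than per batch; it is rebuilt inside the helper here only to keep the sketch self-contained.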
import pandas

# Load the scraped Breitbart comments from a pickled DataFrame
comments = pandas.read_pickle("breitbart.pickle")
comment_texts = comments['comment_text']

VOCAB_SIZE = 5000
BATCH_SIZE = 16
PREV_WORDS = 16

train_generator = BreitbartCommentsSequence(comment_texts, VOCAB_SIZE, PREV_WORDS, BATCH_SIZE)
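The gist stops after constructing the generator; no model or training call is included. A minimal sketch of how the Sequence could be consumed with the Keras API of that era (fit_generator) follows, assuming integer context inputs and a softmax over the vocabulary (plus one slot for the out-of-vocabulary index from the sketch above); the architecture and hyperparameters are illustrative, not from the original.

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Hypothetical next-word model; layer sizes are illustrative only.
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE + 1, output_dim=128, input_length=PREV_WORDS))
model.add(LSTM(256))
model.add(Dense(VOCAB_SIZE + 1, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# fit_generator was the Keras 2 (2017-era) entry point for Sequence-based training
model.fit_generator(train_generator,
                    steps_per_epoch=len(train_generator),
                    epochs=10)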