Last active
September 19, 2017 10:29
-
-
Save rohit-gupta/1dfe64d55ea61780affbdce932f4ac1f to your computer and use it in GitHub Desktop.
Training Language Model on Breitbart Comments
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections

import numpy as np
from keras.utils import Sequence
class BreitbartCommentsSequence(Sequence):
    """Keras ``Sequence`` that yields next-word-prediction training batches.

    Each batch is built from a slice of the comment corpus: every comment is
    tokenized on single spaces and converted to vocabulary indices, then split
    into sliding windows of ``previous_words`` context indices (``batch_x``)
    each paired with the index of the word that follows (``batch_y``).

    Parameters
    ----------
    comments : sequence of str
        The raw comment texts.
    vocab_size : int
        Keep only this many most-frequent words; everything else maps to a
        reserved out-of-vocabulary index (== ``vocab_size`` when the corpus
        has at least that many distinct words).
    previous_words : int
        Context window length fed to the model.
    batch_size : int
        Number of comments consumed per batch.
    """

    def __init__(self, comments, vocab_size, previous_words, batch_size):
        # Count word frequencies over the whole corpus to pick the vocabulary.
        all_comments_text = ' '.join(comments)
        all_words = all_comments_text.split(" ")
        word_counts = collections.Counter(all_words)
        self.vocabulary = [word for word, count in word_counts.most_common(vocab_size)]
        # word -> integer index; one index past the vocabulary is reserved
        # for out-of-vocabulary words.
        self.word_index = {word: i for i, word in enumerate(self.vocabulary)}
        self.oov_index = len(self.vocabulary)
        self.previous_words = previous_words
        self.batch_size = batch_size
        self.comments = comments

    def __len__(self):
        # Number of full batches per epoch; a trailing partial batch is dropped.
        return len(self.comments) // self.batch_size

    def _encode(self, word):
        # Map a word to its vocabulary index; unknown words get the OOV index.
        return self.word_index.get(word, self.oov_index)

    def __getitem__(self, idx):
        # Fixed: the original indexed the module-level `comments` instead of
        # self.comments, left `batch_x`/`batch_y` as incomplete assignments,
        # and returned leftover image-loading code (imread/resize) that does
        # not belong in a text language-model generator.
        batch = self.comments[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = []
        batch_y = []
        for comment in batch:
            indices = [self._encode(word) for word in comment.split(" ")]
            # Sliding window: previous_words context indices -> next word index.
            for start in range(len(indices) - self.previous_words):
                batch_x.append(indices[start:start + self.previous_words])
                batch_y.append(indices[start + self.previous_words])
        return np.array(batch_x), np.array(batch_y)
# Build the training-data generator from the pickled Breitbart comment dump.
import pandas

# Generator hyperparameters.
VOCAB_SIZE = 5000   # keep the 5000 most frequent words
BATCH_SIZE = 16     # comments consumed per batch
PREV_WORDS = 16     # context-window length fed to the model

# Load the scraped comments and pull out the text column.
comments = pandas.read_pickle("breitbart.pickle")
comment_texts = comments['comment_text']

train_generator = BreitbartCommentsSequence(
    comment_texts, VOCAB_SIZE, PREV_WORDS, BATCH_SIZE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment