# Source: GitHub Gist by @splitline (splitline/04be2ced4d96b53f816b36300cb70391),
# created May 5, 2018 02:36.
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Flatten, Dense, Conv1D, Embedding, Input
from keras import backend as K
def loss(y_true, y_pred):
    """Mean absolute error between targets and predictions.

    Takes the element-wise absolute difference of the two tensors and
    averages it with ``K.mean``; no axis is passed to ``K.mean``.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values.

    Returns:
        The result of ``K.mean`` applied to ``|y_true - y_pred|``.
    """
    abs_error = K.abs(y_true - y_pred)
    return K.mean(abs_error)
# Load the pickled training set; sample(frac=1) returns every row in a
# shuffled order, and random_state=123 makes that order reproducible.
# NOTE: read_pickle unpickles arbitrary objects -- only load trusted files.
train_df = pd.read_pickle("./train.pkl").sample(frac=1, random_state=123)
# Pickled test set (only referenced by the commented-out prediction code below).
test_df = pd.read_pickle("./test.pkl")
# Previously trained word2vec model that supplies the embedding vectors.
w2v_model = Word2Vec.load("./word2vec.model")
# The return value is discarded here, so this line only has a visible effect
# in an interactive session / notebook.
train_df.head()
# Pair every vocabulary word with its trained vector.
# NOTE(review): `wv.vocab` looks like the pre-4.0 gensim API -- confirm the
# gensim version this script is pinned to.
vocab_list = [(word, w2v_model.wv[word]) for word in w2v_model.wv.vocab]

# One row per vocabulary word plus an extra all-zero row at index 0;
# text_to_index() maps out-of-vocabulary words to index 0.
embedding_matrix = np.zeros((len(vocab_list) + 1, w2v_model.vector_size))

# word -> row index in embedding_matrix (indices start at 1).
word2idx = {}
for idx, (word, vec) in enumerate(vocab_list, start=1):
    embedding_matrix[idx] = vec
    word2idx[word] = idx
def text_to_index(corpus, word_to_index=None):
    """Convert tokenised documents into sequences of vocabulary indices.

    Args:
        corpus: Iterable of documents, each an iterable of words.
        word_to_index: Optional mapping from word to integer index. When
            omitted, the module-level ``word2idx`` mapping is used.

    Returns:
        A NumPy array of the indexed documents. Words missing from the
        mapping become ``0``. When every document has the same length the
        result is a regular 2-D array; otherwise it is a 1-D ``object``
        array whose elements are the per-document index lists.
    """
    if word_to_index is None:
        word_to_index = word2idx

    # dict.get replaces the former bare ``except:``, which would also have
    # hidden unrelated errors (including KeyboardInterrupt).
    indexed = [[word_to_index.get(word, 0) for word in doc] for doc in corpus]

    if len({len(doc) for doc in indexed}) <= 1:
        # Uniform (or empty) input: plain conversion, same as before.
        return np.array(indexed)

    # Ragged input: recent NumPy refuses to infer an object array from
    # nested lists of differing lengths, so build it explicitly.
    ragged = np.empty(len(indexed), dtype=object)
    for position, doc in enumerate(indexed):
        ragged[position] = doc
    return ragged
# Length (in token indices) that every document is brought to by pad_sequences.
PADDING_LENGTH = 200

# Index the training texts and pad them to PADDING_LENGTH in a single step.
X = pad_sequences(text_to_index(train_df.text), maxlen=PADDING_LENGTH)

# Quick sanity check of the resulting input matrix.
print("Shape:", X.shape)
print("Sample:", X[0])
# Embedding layer initialised from the word2vec matrix and frozen
# (trainable=False), so the pre-trained vectors are not updated by fit().
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
    # Must equal the length X is padded to; reuse the constant instead of
    # repeating the literal 200 so the two cannot drift apart.
    input_length=PADDING_LENGTH,
)
# Network: frozen embeddings -> 1-D convolution -> flatten -> 2-unit output.
model = Sequential()
model.add(embedding_layer)
# The former `input_shape=(9, 32, 32)` argument was dropped: Sequential only
# consults input_shape on the first layer, and that shape did not describe
# the embedding output this layer actually receives.
# NOTE(review): filters=9 with kernel_size=256 (wider than the 200-step
# input) is unusual -- confirm the two values were not swapped.
model.add(Conv1D(
    filters=9,
    kernel_size=256,
    padding='same',
    activation='relu',
))
model.add(Flatten())
model.add(Dense(2, activation='relu'))
# NOTE(review): 'accuracy' is reported alongside a mean-absolute-error loss;
# confirm that metric is meaningful for these targets.
model.compile(optimizer='adam',
              loss=loss,
              metrics=['accuracy'])
model.summary()
# Targets: every column of the training frame except the raw text.
# `.values` replaces DataFrame.as_matrix(), which was deprecated and later
# removed from pandas; it yields the same NumPy array.
result = train_df.drop(['text'], axis=1).values
# Train with 5% of the samples held out for validation.
model.fit(x=X, y=result, batch_size=3000, epochs=100, validation_split=0.05)
# X_test = text_to_index(test_df.text)
# X_test = pad_sequences(X_test, maxlen=PADDING_LENGTH)
# predictions = model.predict_classes(X_test, verbose=1).flatten()
# submissions = pd.DataFrame(
# {"id": test_df['id'], "good": predictions[0], "bad": predictions[1]})
# submissions.to_csv("survived.csv", index=False, header=True)
# (GitHub page footer) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment