Created
July 8, 2017 00:27
-
-
Save hhl60492/176359706b5534604f7215e6f63a0a10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from process import * | |
import pandas as pd | |
import glob | |
import numpy as np | |
from keras.models import Sequential | |
from keras.layers import Conv2D, Flatten, MaxPool2D, Dense, Dropout | |
from random import shuffle | |
# Number of characters each message is encoded to; passed to
# convert_to_sequence() and used as the first dimension of the network input.
MAX_CHAR_LEN = 140
# Fraction of the loaded rows assigned to the training set.
TRAIN_RATIO = 0.9
# Number of units in the fully connected hidden layer.
DENSE_NUM = 256
# Number of passes made over the training set.
EPOCHS = 5
### read in multiple csvs and concat to one df
path = r'C:\spam'  # directory holding the csv files
# glob returns files in filesystem-dependent order; sort so the combined row
# order (and therefore any positional train/test split) is reproducible.
allFiles = sorted(glob.glob(path + "/*.csv"))
if not allFiles:
    # pd.concat on an empty list fails with an opaque
    # "No objects to concatenate"; report the real problem instead.
    raise FileNotFoundError("No csv files found in " + path)
list_ = [pd.read_csv(file_, index_col=None, header=0) for file_ in allFiles]
# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating every file's own row numbers.
frame = pd.concat(list_, ignore_index=True)
### split into training and test sets (hold out CV)
# Number of leading rows that go to the training set.
stop = int(TRAIN_RATIO * len(frame))
# Extract the two needed columns once, as [content, class] pairs in row
# order, instead of building a row Series for every single lookup.
rows = frame[['CONTENT', 'CLASS']].values.tolist()
train_set = rows[:stop]
# The hold-out set must be the rows *after* the training rows; indexing from
# 0 again would evaluate the model on samples it was trained on.
test_set = rows[stop:]
# NOTE(review): rows are split in load order without shuffling; if the csv
# files differ in class balance the hold-out tail may not be representative
# -- confirm this is intended.
# Normalise the text of both sets: lower-case first, then strip punctuation.
# to_lower / remove_punc are presumably provided by the `process` star
# import and return the transformed sample list -- confirm.
train_set = remove_punc(to_lower(train_set))
test_set = remove_punc(to_lower(test_set))
### build the nnet model
# Two conv/conv/max-pool/dropout stages, then a dense hidden layer and a
# single sigmoid output unit. The input is (MAX_CHAR_LEN, 32, 1); the 32 is
# presumably the per-character encoding width produced by
# convert_to_sequence -- confirm.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(MAX_CHAR_LEN, 32, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    Conv2D(32, (3, 3), activation='relu'),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(DENSE_NUM, activation='sigmoid'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# NOTE(review): mean squared error on a sigmoid output trains, but
# 'binary_crossentropy' is the usual loss for a two-class output -- left
# unchanged here to keep training behaviour identical.
model.compile(
    optimizer='rmsprop',
    loss='mean_squared_error',
    metrics=['accuracy'],
)
### training and validation loop
# NOTE(review): indentation was lost in this copy of the file. Validation is
# assumed to run once per epoch (after that epoch's training pass), as the
# heading above suggests -- confirm against the original.


def _encode(text):
    """Encode one message as a single-sample batch for the network.

    The sequence from convert_to_sequence() is reshaped to
    (1, MAX_CHAR_LEN, 32, 1), i.e. a batch of one input of the shape
    used for both model.fit() and model.predict() below.
    """
    seq = convert_to_sequence(text, MAX_CHAR_LEN)
    return np.reshape(seq, (1, MAX_CHAR_LEN, 32, 1))


def _evaluate(samples):
    """Return the fraction of ``samples`` that ``model`` classifies correctly.

    Each sample is indexed as [text, label]. One line per sample is printed
    in the form ``label|raw_score text``. The raw sigmoid score is
    thresholded at 0.5 to obtain the predicted class. ``samples`` must be
    non-empty.
    """
    correct = 0
    for sample in samples:
        text, label = sample[0], sample[1]
        score = model.predict(_encode(text), batch_size=1)[0][0]
        print(str(np.asarray(label)) + "|" + str(score) + " " + str(text))
        predicted = 0 if score < 0.5 else 1
        if predicted == label:
            correct += 1
    # Convert before dividing so the ratio is not truncated to 0 under
    # Python 2 integer division.
    return float(correct) / len(samples)


for epoch in range(EPOCHS):
    # Present the training samples in a new random order every epoch.
    shuffle(train_set)

    # Online training: one fit() call per sample with a batch size of 1.
    for step, sample in enumerate(train_set):
        target = np.reshape(np.asarray(sample[1]), (1, 1))
        model.fit(_encode(sample[0]), target,
                  batch_size=1, epochs=1, shuffle=False)
        print("Epoch " + str(epoch) + " Iteration " + str(step))

    # do validation
    if test_set:
        print("Test classification accuracy: " + str(_evaluate(test_set)))
    else:
        # Avoid a ZeroDivisionError when there is nothing to evaluate.
        print("Test classification accuracy: n/a (empty test set)")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment