import os
import time
import datetime
from tensorflow import flags
import tensorflow as tf
import numpy as np
class TextCNN(object):
    """
    A CNN for text classification.
    Takes pre-trained word2vec embeddings as input, followed by convolutional, average-pooling and softmax layers.
    <Parameters>
    - sequence_length: maximum sentence length
    - num_classes: number of classes
    - embedding_size: dimensionality of the embedding vector for each word
    - filter_sizes: sizes of the convolutional filters (= how many words each filter spans), e.g. "3, 4, 5"
    - num_filters: number of filters per filter size
    - l2_reg_lambda: strength of L2 regularization on the weights and biases
    """
    def __init__(
            self, sequence_length, num_classes, embedding_size,
            filter_sizes, num_filters, l2_reg_lambda=0.0):

        self.input_x = tf.placeholder(tf.float32, [None, sequence_length, embedding_size], name="input_x")
        input_x_expanded = tf.expand_dims(self.input_x, -1)
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
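        # Note: input_x already holds word2vec vectors of shape [batch, sequence_length, embedding_size];
        # expand_dims adds a single "channel" dimension so tf.nn.conv2d can treat each sentence as a
        # sequence_length x embedding_size image with one channel.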
        l2_loss = tf.constant(0.0)

        # Create a convolution + pooling layer for each filter size
        self.h_outputs = []
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                pad_input = tf.pad(input_x_expanded, [[0, 0], [1, filter_size - 2], [0, 0], [0, 0]],
                                   mode='CONSTANT')
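                # The padding adds (filter_size - 1) rows in total, so the VALID convolution below
                # produces exactly sequence_length output rows; each row of the feature map can
                # therefore be aligned with one word position in the activation-map analysis later.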
                Wc = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                bc = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    pad_input,
                    Wc,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, bc), name="relu")
                # Average pooling over the outputs
                pooled = tf.nn.avg_pool(
                    h,
                    ksize=[1, sequence_length, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                self.h_outputs.append(h)
                pooled_outputs.append(pooled)
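                # pooled has shape [batch, 1, 1, num_filters]: each filter's activations are averaged
                # over all sequence_length positions (average pooling instead of the usual max-over-time).
                # h itself is kept in self.h_outputs so per-word activations can be inspected after training.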
        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)  # TF 1.x argument order: values first, then axis
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
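        # h_pool_flat has shape [batch, num_filters * len(filter_sizes)]: one averaged activation per
        # filter, concatenated across the filter sizes in the order they were defined.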
        # Add dropout
        with tf.name_scope("dropout"):
            h_drop = tf.nn.dropout(h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            self.finW = tf.get_variable(
                "finW",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.constant(0.0, shape=[num_classes], name="b")
            l2_loss += tf.nn.l2_loss(self.finW)
            scores = tf.nn.xw_plus_b(h_drop, self.finW, b, name="scores")
            self.predictions = tf.argmax(scores, 1, name="predictions")
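            # scores are the unnormalized class logits (h_drop . finW + b); predictions take the argmax.
            # Because average pooling is linear, finW can also be applied directly to the per-word
            # activation maps to estimate each word's contribution to the predicted class (done below).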
        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
# data loading
import cnn_movie_advanced_padding_tool_word2vec as tool

data_path = 'C:/Users/ratsgo/GoogleDrive/내폴더/textmining/data/eng_review_corpus.csv'  # English
corpus, points = tool.loading_rdata(data_path, minlength=10, eng=True, num=False, punc=False)
#data_path = 'C:/Users/ratsgo/GoogleDrive/내폴더/textmining/data/watcha_movie_review_spacecorrected_noisedeleted.csv'  # Korean
#corpus, points = tool.loading_rdata(data_path, minlength=10, eng=False, num=False, punc=False)
train_idx, _ = tool.get_train_idx(len(corpus), train_prop=1)

word2vec_path = 'C:/textmining/eng_moviereview_word2vec.pickle'  # English
#word2vec_path = 'C:/textmining/moviereview_word2vec.pickle'  # Korean
embedding_model = tool.load_word2vec(word2vec_path)

max_document_length = 30
num_classes = 2
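# Every review is assumed to be padded/truncated to max_document_length tokens by the tool module,
# and the rating points are mapped to one of num_classes = 2 labels (positive / negative).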
'''
# This block builds the inputs the original way (vocabulary indices instead of word2vec)
rawcontents = tool.cut(corpus, cut=3)
raw_x, vocabulary, vocab_size, vocab_processor = tool.make_raw_input(rawcontents, max_document_length)
print('vocabulary size : %s' % (vocab_size))
y = tool.make_output(points, threshold=2.5)
raw_x_train, raw_x_test, y_train, y_test, train_idx = tool.divide(raw_x, y, train_prop=0.9)
#word2vec_x_train, word2vec_x_test, y_train, y_test, train_idx = tool.divide(word2vec_x, y, train_prop=0.9)
'''
# Model Hyperparameters
flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of embedded vector (default: 128)")
flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
flags.DEFINE_float("l2_reg_lambda", 0.1, "L2 regularization lambda (default: 0.0)")

# Training parameters
flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
flags.DEFINE_integer("num_epochs", 3, "Number of training epochs (default: 200)")
flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

# Misc Parameters
flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
# 3. train the model and test
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(sequence_length=max_document_length,
                      num_classes=num_classes,
                      embedding_size=FLAGS.embedding_dim,
                      filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                      num_filters=FLAGS.num_filters,
                      l2_reg_lambda=FLAGS.l2_reg_lambda)
        '''
        cnn = TextCNN(sequence_length=raw_x_train.shape[1],
                      num_classes=y_train.shape[1],
                      vocab_size=vocab_size,
                      embedding_size=FLAGS.embedding_dim,
                      filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                      num_filters=FLAGS.num_filters,
                      l2_reg_lambda=FLAGS.l2_reg_lambda)
        '''
        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
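        # Standard TF 1.x training setup: Adam with a fixed 1e-3 learning rate; global_step is
        # incremented automatically every time apply_gradients is run.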
        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, loss, accuracy = sess.run(
                [train_op, global_step, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        # Generate batches
        batches = tool.get_batch(list(zip(corpus, points)), FLAGS.batch_size, FLAGS.num_epochs, train_idx,
                                 word2vec=True, max_document_length=max_document_length, word2vec_model=embedding_model)
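        # tool.get_batch is assumed to yield batches of (word2vec matrix, label) pairs: with
        # word2vec=True it presumably looks each token up in embedding_model and pads/cuts every
        # review to max_document_length rows, matching the shape expected by cnn.input_x.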
        # Training loop
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
        # Save activation map * weights (per-word contribution scores)
        import collections
        filter_sizes = list(map(int, FLAGS.filter_sizes.split(",")))
        fin_weights = sess.run(cnn.finW)
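        # Word-importance analysis (class-activation-map style): fin_weights is the trained
        # output-layer weight matrix. For every document, the per-word activation maps of all filter
        # sizes are concatenated and multiplied by the weight column of the predicted class, giving
        # one contribution score per word position.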
        batches = tool.get_batch(list(zip(corpus, points)), FLAGS.batch_size, 1, train_idx,
                                 word2vec=True, max_document_length=max_document_length, word2vec_model=embedding_model)
        results = []
        doc_idx = 0
        for num, batch in enumerate(batches):
            if num % 100 == 0:
                print("batch {}".format(num))
            x_batch, y_batch = zip(*batch)
            if len(x_batch) == FLAGS.batch_size:
                actmaps, predictions = sess.run([cnn.h_outputs, cnn.predictions],
                                                feed_dict={cnn.input_x: x_batch,
                                                           cnn.input_y: y_batch,
                                                           cnn.dropout_keep_prob: 1.0})
                for batch_idx in range(FLAGS.batch_size):
                    combined_actmap = \
                        np.zeros((max_document_length, len(filter_sizes) * FLAGS.num_filters))
                    start = 0
                    end = FLAGS.num_filters
                    for actmap_idx in range(len(actmaps)):
                        combined_actmap[:, start:end] = \
                            actmaps[actmap_idx][batch_idx].reshape(
                                (max_document_length, FLAGS.num_filters))
                        start += FLAGS.num_filters
                        end += FLAGS.num_filters
                    batch_result = np.dot(combined_actmap, fin_weights)
                    batch_result = batch_result[:, predictions[batch_idx]]
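                    # batch_result had shape (max_document_length, num_classes); keeping only the
                    # column of the predicted class yields one contribution score per token position.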
                    fin_result = collections.OrderedDict()
                    text = corpus[train_idx[doc_idx]].split()
                    for word_idx, score in enumerate(batch_result):
                        if word_idx < len(text):
                            fin_result[text[word_idx]] = score
                        else:
                            fin_result[word_idx] = score
                    # dict keys: '실제값' = actual label, '예측값' = predicted label
                    # (kept in Korean because they are reused as keys further below)
                    preinfo = {'실제값': y_batch[batch_idx], '예측값': predictions[batch_idx]}
                    results.append([preinfo, fin_result])
                    doc_idx += 1
        # Mark each document's top-5 words with stars
        star_results = []
        for result_idx, data in enumerate(results):
            if result_idx % 10000 == 0:
                print("data {}".format(result_idx))
            tmp = sorted(data[1].items(), key=lambda x: x[1], reverse=True)[0:5]
            tmp = [score_tuple[0] for score_tuple in tmp]
            star_result = collections.OrderedDict()
            for score_tuple in data[1].items():
                star_result[score_tuple[0]] = ''
            for num in range(len(tmp)):
                star_result[tmp[num]] = '*' * (num + 1)
            star_results.append([data[0], star_result])
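        # star_results maps every word of a document to '' except its five highest-scoring words,
        # which get 1-5 stars in descending score order (the top-scoring word gets a single star).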
        # Build dictionaries of the words that appear in each document's top 5
        positive_dict = collections.defaultdict(int)
        negative_dict = collections.defaultdict(int)
        for result_idx, data in enumerate(results):
            if result_idx % 10000 == 0:
                print("data {}".format(result_idx))
            tmp = sorted(data[1].items(), key=lambda x: x[1], reverse=True)[0:5]
            tmp = [score_tuple[0] for score_tuple in tmp]
            if data[0]['예측값'] == 0:
                for num in range(len(tmp)):
                    if type(tmp[num]) == int:
                        continue
                    positive_dict[tmp[num]] += 1
            else:
                for num in range(len(tmp)):
                    if type(tmp[num]) == int:
                        continue
                    negative_dict[tmp[num]] += 1
        positive_dict = sorted(positive_dict.items(), key=lambda x: x[1], reverse=True)
        negative_dict = sorted(negative_dict.items(), key=lambda x: x[1], reverse=True)
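        # positive_dict / negative_dict count how often each word appears in a document's top 5,
        # split by predicted class (predicted class 0 is treated as positive here); after sorting
        # they are lists of (word, count) tuples rather than dicts.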
        # Save the results
        import pickle
        with open('result_epoch3_minleng10_maxleng30_cut3_word2vec_english.pickle', 'wb') as f:
            pickle.dump(results, f)
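        # A minimal sketch of how the saved results could be inspected later (same file name as above):
        #   import pickle
        #   with open('result_epoch3_minleng10_maxleng30_cut3_word2vec_english.pickle', 'rb') as f:
        #       results = pickle.load(f)
        #   preinfo, word_scores = results[0]
        #   print(preinfo, list(word_scores.items())[:5])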