Preprocessing of Pos/Neg review data
import h5py
import numpy as np
import re
from sklearn.utils import resample

# clean_str below is from process_data.py
def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()
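# A quick sanity check of clean_str (illustrative example, not in the original file):
#   clean_str("I didn't like it!")  ->  "i did n't like it !"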
def fill_and_zero(lol, length):
    """
    Takes a list of lists of word indices and right-pads each row with the
    padding index (1) to form a (num_rows x length) matrix.
    """
    return np.array([xi + [1] * (length - len(xi)) for xi in lol])
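# Illustrative example (not in the original file):
#   fill_and_zero([[3, 4], [5]], 3)  ->  array([[3, 4, 1],
#                                               [5, 1, 1]])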
if __name__ == '__main__':
    words = set()
    dataset = [[], []]  # dataset[0] holds negative reviews, dataset[1] positive
    # preprocess the text so that it's in the proper format
    for i, filename in enumerate(['reviews/rt-polarity.neg', 'reviews/rt-polarity.pos']):
        # first, collect the full set of words that will appear in the data
        rawdata = open(filename, 'r').read()  # should be a simple plain-text file
        data = clean_str(rawdata)
        # create the dataset, one cleaned sentence per line (skip empty lines)
        dataset[i] = [clean_str(line) for line in rawdata.split('\n') if line.strip()]
        # build the vocabulary set for the index map
        words.update(data.split())
    data_size, vocab_size = len(dataset[0]) + len(dataset[1]), len(words)
    print 'data has %d lines, %d unique words.' % (data_size, vocab_size)
    # start word indices at 2 so that index 1 can be reserved for padding
    word_to_idx = {word: i + 2 for i, word in enumerate(words)}
    idx_to_word = {i + 2: word for i, word in enumerate(words)}
    idx_to_word[1] = ""
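    # e.g. if words == {'good', 'bad'}, word_to_idx might be {'good': 2, 'bad': 3}
    # (set iteration order is arbitrary); idx_to_word[1] == "" marks padding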
    # now actually process the data,
    # once for the negative set and once for the positive set;
    # assume for now that a 0.1/0.1/0.8 val/test/train split is good enough
    data = {
        'train': {
            'X': [],
            'Y': []
        },
        'test': {
            'X': [],
            'Y': []
        },
        'val': {
            'X': [],
            'Y': []
        }
    }
    # negative examples: first 10% -> val, next 10% -> test, rest -> train;
    # the negative class gets label 1 (labels here are 1 and 2, not 0 and 1)
    for line in dataset[0][:len(dataset[0])/10]:
        data['val']['X'].append([word_to_idx[word] for word in line.split()])
    data['val']['Y'] = np.zeros(len(data['val']['X'])) + 1
    for line in dataset[0][len(dataset[0])/10:2*len(dataset[0])/10]:
        data['test']['X'].append([word_to_idx[word] for word in line.split()])
    data['test']['Y'] = np.zeros(len(data['test']['X'])) + 1
    for line in dataset[0][2*len(dataset[0])/10:]:
        data['train']['X'].append([word_to_idx[word] for word in line.split()])
    data['train']['Y'] = np.zeros(len(data['train']['X'])) + 1
    # now do the same for the positive set, whose label is 2
    for line in dataset[1][:len(dataset[1])/10]:
        data['val']['X'].append([word_to_idx[word] for word in line.split()])
    data['val']['Y'] = np.append(data['val']['Y'], np.ones(len(dataset[1])/10) + 1)
    for line in dataset[1][len(dataset[1])/10:2*len(dataset[1])/10]:
        data['test']['X'].append([word_to_idx[word] for word in line.split()])
    data['test']['Y'] = np.append(data['test']['Y'], np.ones(2*len(dataset[1])/10 - len(dataset[1])/10) + 1)
    for line in dataset[1][2*len(dataset[1])/10:]:
        data['train']['X'].append([word_to_idx[word] for word in line.split()])
    data['train']['Y'] = np.append(data['train']['Y'], np.ones(len(dataset[1]) - 2*len(dataset[1])/10) + 1)
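    # For the standard rt-polarity files (5331 lines per class), each class is
    # split 533 / 533 / 4265 into val / test / train (Python 2 integer division)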
    # find the longest sentence across all three splits so every row can be
    # padded to the same width
    lenmax = 0
    lenmax = max(max(map(len, data['val']['X'])), lenmax)
    lenmax = max(max(map(len, data['test']['X'])), lenmax)
    lenmax = max(max(map(len, data['train']['X'])), lenmax)
    f = h5py.File('reviews2.h5', 'w')
    # h5py can't store Python dicts directly, so the vocab maps aren't saved here:
    # f['word_to_idx'] = word_to_idx
    # f['idx_to_word'] = idx_to_word
    # f['data'] = data
    data['train']['X'] = fill_and_zero(data['train']['X'], lenmax)
    data['test']['X'] = fill_and_zero(data['test']['X'], lenmax)
    data['val']['X'] = fill_and_zero(data['val']['X'], lenmax)
    # shuffle each split; replace=False makes resample a plain permutation
    # (the default replace=True would bootstrap-sample, dropping some rows)
    f['train_X'], f['train_Y'] = resample(data['train']['X'], data['train']['Y'], replace=False)
    f['test_X'], f['test_Y'] = resample(data['test']['X'], data['test']['Y'], replace=False)
    f['val_X'], f['val_Y'] = resample(data['val']['X'], data['val']['Y'], replace=False)
    f.close()
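For reference, a minimal sketch of reading reviews2.h5 back into numpy arrays
(this assumes the script above has been run; the variable names are illustrative):

import h5py
f = h5py.File('reviews2.h5', 'r')
train_X = f['train_X'][:]  # (num_train, lenmax) matrix of word indices; 1 = padding
train_Y = f['train_Y'][:]  # labels: 1 = negative, 2 = positive
print train_X.shape, train_Y.shape
f.close()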