Last active
October 31, 2018 13:41
-
-
Save georgepar/eba00343e7ddc995898a7f075dcfc445 to your computer and use it in GitHub Desktop.
Read and preprocess IMDB sentiment dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re

import numpy as np

try:
    # Prefer glob2 when it is installed; otherwise fall back to the
    # standard-library glob module.
    import glob2 as glob
except ImportError:
    import glob

# Directory layout of the dataset: <data_dir>/{train,test}/{pos,neg}
data_dir = './aclImdb/'
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
pos_train_dir = os.path.join(train_dir, 'pos')
neg_train_dir = os.path.join(train_dir, 'neg')
pos_test_dir = os.path.join(test_dir, 'pos')
neg_test_dir = os.path.join(test_dir, 'neg')

# For memory limitations. These parameters fit in 8GB of RAM.
# If you have 16GB of RAM you can experiment with the full dataset / W2V.
MAX_NUM_SAMPLES = 5000

# Load first 1M word embeddings. This works because GoogleNews are roughly
# sorted from most frequent to least frequent.
# It may yield much worse results for other embeddings corpora.
NUM_W2V_TO_LOAD = 1000000

SEED = 42

# Fix numpy random seed for reproducibility. Use the SEED constant so that
# changing it in one place actually changes the seeding.
np.random.seed(SEED)
def strip_punctuation(s):
    """Replace each character that is not an ASCII letter or whitespace.

    Every such character (punctuation, but also digits and non-ASCII
    letters) is substituted by a single space, so the length of the string
    is preserved.
    """
    non_letter = re.compile(r'[^a-zA-Z\s]')
    return non_letter.sub(' ', s)
def preprocess(s):
    """Normalize raw text: keep letters only, lowercase, collapse whitespace.

    Non-letter characters are turned into spaces by ``strip_punctuation``,
    the text is lowercased, and every run of whitespace is collapsed into a
    single space. A leading or trailing space may remain where the input
    started or ended with punctuation or whitespace.
    """
    letters_only = strip_punctuation(s).lower()
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', letters_only)
def tokenize(s):
    """Split a string into a list of whitespace-separated tokens.

    Uses ``str.split()`` without an explicit separator so that leading,
    trailing or repeated whitespace never produces empty-string tokens and
    an empty input yields an empty list.
    """
    return s.split()
def preproc_tok(s):
    """Run ``preprocess`` on the text and return the result of ``tokenize``."""
    cleaned = preprocess(s)
    tokens = tokenize(cleaned)
    return tokens
def read_samples(folder, preprocess=lambda x: x):
    """Read the first line of every ``*.txt`` file in ``folder``.

    Args:
        folder: Directory that contains the sample files.
        preprocess: Callable applied to the first line of each file. It
            defaults to the identity function. Note that inside this
            function the parameter shadows the module-level ``preprocess``.

    Returns:
        A list with one preprocessed entry per file. At most
        ``MAX_NUM_SAMPLES`` files are read when that constant is positive.
    """
    # Sort the paths: glob yields files in arbitrary filesystem order, which
    # would make the truncated subset differ between machines and runs.
    paths = sorted(glob.iglob(os.path.join(folder, '*.txt')))
    if MAX_NUM_SAMPLES > 0:
        paths = paths[:MAX_NUM_SAMPLES]

    data = []
    for path in paths:
        # Explicit encoding so the result does not depend on the locale.
        # NOTE(review): assumes the dataset files are UTF-8 - confirm.
        with open(path, 'r', encoding='utf-8') as fd:
            # Only the first line is used; readline() returns '' for an
            # empty file instead of failing with IndexError.
            first_line = fd.readline()
        data.append(preprocess(first_line))
    return data
def create_corpus(pos, neg):
    """Merge positive and negative samples and shuffle them together.

    Args:
        pos: Sequence of positive samples (labelled 1).
        neg: Sequence of negative samples (labelled 0).

    Returns:
        A ``(corpus, labels)`` pair of equally long lists in which
        ``labels[i]`` is the label of ``corpus[i]``. The order is a random
        permutation drawn from numpy's global random state.
    """
    # Keep the samples in a plain Python list. Converting them to a numpy
    # array fails for token lists of different lengths on recent numpy
    # versions, and pads strings to a fixed-width unicode array.
    corpus = list(pos) + list(neg)
    labels = np.array([1] * len(pos) + [0] * len(neg))
    indices = np.arange(labels.shape[0])
    np.random.shuffle(indices)
    return [corpus[i] for i in indices], list(labels[indices])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment