Read and preprocess IMDB sentiment dataset
import os
data_dir = './aclImdb/'
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
pos_train_dir = os.path.join(train_dir, 'pos')
neg_train_dir = os.path.join(train_dir, 'neg')
pos_test_dir = os.path.join(test_dir, 'pos')
neg_test_dir = os.path.join(test_dir, 'neg')
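# Note (added for context): the aclImdb/ layout above matches the Stanford
# Large Movie Review Dataset (https://ai.stanford.edu/~amaas/data/sentiment/):
# train/ and test/ splits, each with pos/ and neg/ subfolders containing one
# review per .txt file.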
# These limits are set for memory reasons; the values below fit in 8GB of RAM.
# If you have 16GB of RAM you can experiment with the full dataset / W2V.
MAX_NUM_SAMPLES = 5000
# Load only the first 1M word embeddings. This works because the GoogleNews
# vectors are roughly sorted from most frequent to least frequent.
# It may yield much worse results for other embedding corpora.
NUM_W2V_TO_LOAD = 1000000
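# Minimal sketch (not part of the original gist) of how NUM_W2V_TO_LOAD is
# typically used: pass it as the `limit` argument of gensim's
# load_word2vec_format so only the first vectors are read. The default path
# below is an assumption; point it at your local copy.
def load_w2v(path='GoogleNews-vectors-negative300.bin'):
    from gensim.models import KeyedVectors
    return KeyedVectors.load_word2vec_format(
        path, binary=True, limit=NUM_W2V_TO_LOAD)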
import numpy as np

SEED = 42
# Fix the numpy random seed for reproducibility
np.random.seed(SEED)
# glob2 is a drop-in replacement for glob; either works for the '*.txt' pattern used below
try:
    import glob2 as glob
except ImportError:
    import glob
import re
def strip_punctuation(s):
    # Replace everything except letters and whitespace with a space
    return re.sub(r'[^a-zA-Z\s]', ' ', s)
def preprocess(s):
    # Lowercase and collapse consecutive whitespace into a single space
    return re.sub(r'\s+', ' ', strip_punctuation(s).lower())
def tokenize(s):
    return s.split(' ')

def preproc_tok(s):
    return tokenize(preprocess(s))
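# Quick illustration (not in the original gist) of the preprocessing pipeline:
# punctuation becomes whitespace, text is lowercased, tokens are split on spaces.
assert preproc_tok("It's a GREAT movie") == ['it', 's', 'a', 'great', 'movie']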
def read_samples(folder, preprocess=lambda x: x):
    samples = glob.iglob(os.path.join(folder, '*.txt'))
    data = []
    for i, sample in enumerate(samples):
        if MAX_NUM_SAMPLES > 0 and i == MAX_NUM_SAMPLES:
            break
        with open(sample, 'r') as fd:
            # Each review is stored as a single line in its own .txt file
            x = [preprocess(l) for l in fd][0]
        data.append(x)
    return data
def create_corpus(pos, neg):
    # Concatenate positive and negative samples, label them (1 = positive,
    # 0 = negative), and shuffle samples and labels in unison
    corpus = np.array(pos + neg)
    y = np.array([1 for _ in pos] + [0 for _ in neg])
    indices = np.arange(y.shape[0])
    np.random.shuffle(indices)
    return list(corpus[indices]), list(y[indices])
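# Usage sketch (not in the original gist): build shuffled train and test
# corpora from the directories defined above. Whether to pass `preprocess` or
# `preproc_tok` depends on what the downstream model expects; the variable
# names here are illustrative.
if __name__ == '__main__':
    train_corpus, y_train = create_corpus(
        read_samples(pos_train_dir, preprocess),
        read_samples(neg_train_dir, preprocess))
    test_corpus, y_test = create_corpus(
        read_samples(pos_test_dir, preprocess),
        read_samples(neg_test_dir, preprocess))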