farizrahman4u · February 19, 2019 11:56
diff --git a/utils b/utils
 from keras.preprocessing.sequence import pad_sequences
 import numpy as np
 import json
 import random

 # Module-level lookup tables shared by the helper functions in this file.
 # All four start out empty.
 word2idx, idx2word = {}, {}  # word -> index, index -> word
 tag2idx, idx2tag = {}, {}    # tag -> index, index -> tag

 def get_word(idx):
    '''Map an integer word index back to its word.

    # Arguments:
        idx: integer word index.

    # Returns:
        `"<UNK>"` when `idx` is 0, otherwise the entry stored in
        `idx2word` under `idx`.

    # Raises:
        KeyError: if `idx` is non-zero and not a key of `idx2word`.
    '''
    # 0 is the index `get_word_idx` reports for words it does not know.
    return "<UNK>" if idx == 0 else idx2word[idx]


 def get_word_idx(word, add_new=False):
    '''Return the integer index associated with `word`.

    # Arguments:
        word: the word to look up (a key of `word2idx`).
        add_new: if True, a word that has no index yet is given a
            fresh one and recorded in both `word2idx` and `idx2word`.

    # Returns:
        The index of `word`, or 0 when the word is unknown and
        `add_new` is False.
    '''
    if word in word2idx:
        return word2idx[word]
    if not add_new:
        return 0
    # Fresh indices are "number of known words + 2", so the first word
    # gets 2 and the values 0 and 1 are never handed out here.
    new_idx = len(word2idx) + 2
    word2idx[word] = new_idx
    idx2word[new_idx] = word
    return new_idx


 def get_tag(idx):
    '''Map an integer tag index back to its tag.

    Unlike `get_word`, index 0 is not special-cased here.

    # Arguments:
        idx: integer tag index (a key of `idx2tag`).

    # Returns:
        The tag stored in `idx2tag` under `idx`.

    # Raises:
        KeyError: if `idx` is not a key of `idx2tag`.
    '''
    tag = idx2tag[idx]
    return tag


 def get_tag_idx(tag, add_new=False):
    '''Return the integer index associated with `tag`.

    # Arguments:
        tag: the tag to look up (a key of `tag2idx`).
        add_new: if True, a tag that has no index yet is given a
            fresh one and recorded in both `tag2idx` and `idx2tag`.

    # Returns:
        The index of `tag`, or 0 when the tag is unknown and
        `add_new` is False.
    '''
    if tag in tag2idx:
        return tag2idx[tag]
    if not add_new:
        return 0
    # Fresh indices are "number of known tags + 1", so the first tag
    # gets 1 and 0 is never handed out here.
    new_idx = len(tag2idx) + 1
    tag2idx[tag] = new_idx
    idx2tag[new_idx] = tag
    return new_idx



 def vectorize_sentence(sentence, add_new=True):
    '''Convert a sentence into a list of integer word indices.

    # Arguments:
        sentence: list of string
        add_new: whether words without an index are given a new one
            (the default, which matches the previous behaviour).
            Pass False to leave the word index untouched; unknown
            words then map to 0.

    # Returns:
        List of integer word indices, one per word, in order.
    '''
    return [get_word_idx(word, add_new) for word in sentence]


 def vectorize_tags(tags, add_new=True):
    '''Convert a tag sequence into a list of integer tag indices.

    # Arguments:
        tags: list of string
        add_new: whether tags without an index are given a new one
            (the default, which matches the previous behaviour).
            Pass False to leave the tag index untouched; unknown
            tags then map to 0.

    # Returns:
        List of integer tag indices, one per tag, in order.
    '''
    return [get_tag_idx(tag, add_new) for tag in tags]



 def get_data(test_split=0.1):
    '''Load the dataset, vectorize it and optionally split off a test set.

    Reads `data/data.json`, converts every sentence and tag sequence to
    integer indices (filling the module-level indexes along the way),
    writes the indexes to `idexes.json`, shuffles the samples and pads
    them at the end with `pad_sequences`.

    # Arguments:
        test_split: fraction of the samples to hold out as test data,
            or None to return the whole dataset unsplit.

    # Returns:
        `(X, Y)` when `test_split` is None, otherwise
        `((X_train, Y_train), (X_test, Y_test))`, where the test part
        is taken from the end of the shuffled data.
    '''
    with open('data/data.json', 'r') as f:
        rows = json.load(f)

    X = []
    Y = []
    # Each row holds (sentence, poss, tags); the middle field is unused.
    for sentence, _poss, tags in rows:
        X.append(vectorize_sentence(sentence))
        Y.append(vectorize_tags(tags))

    # Save all four indexes for later use (previously idx2tag was written
    # twice and idx2word was missing). The file name is left unchanged
    # because other code may read it.
    # NOTE: JSON object keys are strings, so the integer keys of
    # idx2word / idx2tag come back as strings when this file is loaded.
    with open('idexes.json', 'w') as f:
        json.dump([word2idx, idx2word, tag2idx, idx2tag], f)

    # Shuffle X and Y with the same permutation so pairs stay aligned.
    order = list(range(len(X)))
    random.shuffle(order)
    X = [X[i] for i in order]
    Y = [Y[i] for i in order]

    # padding
    X = pad_sequences(X, padding='post')
    Y = pad_sequences(Y, padding='post')

    if test_split is None:
        return X, Y

    num_samples = len(X)
    num_test_samples = int(num_samples * test_split)
    # Keep the count within [0, num_samples] and split on a non-negative
    # index: with zero test samples, `X[:-0]` would be empty and `X[-0:]`
    # would be the whole dataset.
    num_test_samples = min(max(num_test_samples, 0), num_samples)
    split_at = num_samples - num_test_samples
    train_data = X[:split_at], Y[:split_at]
    test_data = X[split_at:], Y[split_at:]
    return train_data, test_data
	from keras.preprocessing.sequence import pad_sequences
	import numpy as np
	import json
	import random

	# Module-level lookup tables shared by the helper functions in this file.
	# All four start out empty.
	word2idx, idx2word = {}, {}  # word -> index, index -> word
	tag2idx, idx2tag = {}, {}    # tag -> index, index -> tag

	def get_word(idx):
	    '''Map an integer word index back to its word.

	    # Arguments:
	        idx: integer word index.

	    # Returns:
	        `"<UNK>"` when `idx` is 0, otherwise the entry stored in
	        `idx2word` under `idx`.

	    # Raises:
	        KeyError: if `idx` is non-zero and not a key of `idx2word`.
	    '''
	    # 0 is the index `get_word_idx` reports for words it does not know.
	    if idx == 0:
	        return "<UNK>"
	    return idx2word[idx]


	def get_word_idx(word, add_new=False):
	    '''Return the integer index associated with `word`.

	    # Arguments:
	        word: the word to look up (a key of `word2idx`).
	        add_new: if True, a word that has no index yet is given a
	            fresh one and recorded in both `word2idx` and `idx2word`.

	    # Returns:
	        The index of `word`, or 0 when the word is unknown and
	        `add_new` is False.
	    '''
	    if word not in word2idx:
	        if add_new:
	            # Fresh indices are "number of known words + 2", so the
	            # values 0 and 1 are never handed out here.
	            idx = len(word2idx) + 2
	            word2idx[word] = idx
	            idx2word[idx] = word
	            return idx
	        else:
	            return 0
	    return word2idx[word]


	def get_tag(idx):
	    '''Map an integer tag index back to its tag.

	    Unlike `get_word`, index 0 is not special-cased here.

	    # Arguments:
	        idx: integer tag index (a key of `idx2tag`).

	    # Returns:
	        The tag stored in `idx2tag` under `idx`.

	    # Raises:
	        KeyError: if `idx` is not a key of `idx2tag`.
	    '''
	    return idx2tag[idx]


	def get_tag_idx(tag, add_new=False):
	    '''Return the integer index associated with `tag`.

	    # Arguments:
	        tag: the tag to look up (a key of `tag2idx`).
	        add_new: if True, a tag that has no index yet is given a
	            fresh one and recorded in both `tag2idx` and `idx2tag`.

	    # Returns:
	        The index of `tag`, or 0 when the tag is unknown and
	        `add_new` is False.
	    '''
	    if tag not in tag2idx:
	        if add_new:
	            # Fresh indices are "number of known tags + 1", so 0 is
	            # never handed out here.
	            idx = len(tag2idx) + 1
	            tag2idx[tag] = idx
	            idx2tag[idx] = tag
	            return idx
	        else:
	            return 0
	    return tag2idx[tag]



	def vectorize_sentence(sentence, add_new=True):
	    '''Convert a sentence into a list of integer word indices.

	    # Arguments:
	        sentence: list of string
	        add_new: whether words without an index are given a new one
	            (the default, which matches the previous behaviour).
	            Pass False to leave the word index untouched; unknown
	            words then map to 0.

	    # Returns:
	        List of integer word indices, one per word, in order.
	    '''
	    return [get_word_idx(word, add_new) for word in sentence]


	def vectorize_tags(tags, add_new=True):
	    '''Convert a tag sequence into a list of integer tag indices.

	    # Arguments:
	        tags: list of string
	        add_new: whether tags without an index are given a new one
	            (the default, which matches the previous behaviour).
	            Pass False to leave the tag index untouched; unknown
	            tags then map to 0.

	    # Returns:
	        List of integer tag indices, one per tag, in order.
	    '''
	    return [get_tag_idx(tag, add_new) for tag in tags]



	def get_data(test_split=0.1):
	    '''Load the dataset, vectorize it and optionally split off a test set.

	    Reads `data/data.json`, converts every sentence and tag sequence to
	    integer indices (filling the module-level indexes along the way),
	    writes the indexes to `idexes.json`, shuffles the samples and pads
	    them at the end with `pad_sequences`.

	    # Arguments:
	        test_split: fraction of the samples to hold out as test data,
	            or None to return the whole dataset unsplit.

	    # Returns:
	        `(X, Y)` when `test_split` is None, otherwise
	        `((X_train, Y_train), (X_test, Y_test))`, where the test part
	        is taken from the end of the shuffled data.
	    '''
	    with open('data/data.json', 'r') as f:
	        rows = json.load(f)

	    X = []
	    Y = []
	    # Each row holds (sentence, poss, tags); the middle field is unused.
	    for sentence, _poss, tags in rows:
	        X.append(vectorize_sentence(sentence))
	        Y.append(vectorize_tags(tags))

	    # Save all four indexes for later use (previously idx2tag was written
	    # twice and idx2word was missing). The file name is left unchanged
	    # because other code may read it.
	    # NOTE: JSON object keys are strings, so the integer keys of
	    # idx2word / idx2tag come back as strings when this file is loaded.
	    with open('idexes.json', 'w') as f:
	        json.dump([word2idx, idx2word, tag2idx, idx2tag], f)

	    # Shuffle X and Y with the same permutation so pairs stay aligned.
	    order = list(range(len(X)))
	    random.shuffle(order)
	    X = [X[i] for i in order]
	    Y = [Y[i] for i in order]

	    # padding
	    X = pad_sequences(X, padding='post')
	    Y = pad_sequences(Y, padding='post')

	    if test_split is None:
	        return X, Y

	    num_samples = len(X)
	    num_test_samples = int(num_samples * test_split)
	    # Keep the count within [0, num_samples] and split on a non-negative
	    # index: with zero test samples, `X[:-0]` would be empty and `X[-0:]`
	    # would be the whole dataset.
	    num_test_samples = min(max(num_test_samples, 0), num_samples)
	    split_at = num_samples - num_test_samples
	    train_data = X[:split_at], Y[:split_at]
	    test_data = X[split_at:], Y[split_at:]
	    return train_data, test_data