Prateek Joshi prateekjoshi565

🎯

Focusing

Data Scientist | AI Engineer | Python

prateekjoshi565 / NMT.py

Created February 2, 2019 07:09

Neural Machine Translation using Keras

	import re
	import string
	from numpy import array, argmax, random, take
	import pandas as pd
	from keras.models import Sequential
	from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
	from keras.preprocessing.text import Tokenizer
	from keras.callbacks import ModelCheckpoint
	from keras.preprocessing.sequence import pad_sequences
	from keras.models import load_model

prateekjoshi565 / import_libraries_MT.py

Created February 6, 2019 09:21

import libraries for Machine Translation

	import string
	import re
	from numpy import array, argmax, random, take
	import pandas as pd
	from keras.models import Sequential
	from keras.layers import Dense, LSTM, Embedding, RepeatVector
	from keras.preprocessing.text import Tokenizer
	from keras.callbacks import ModelCheckpoint
	from keras.preprocessing.sequence import pad_sequences
	from keras.models import load_model

prateekjoshi565 / func_read_text.py

Last active May 23, 2023 14:33

function to read raw text file

	# function to read raw text file
	def read_text(filename):
	    """Return the entire contents of *filename* as a single string.

	    The file is opened in text mode and decoded as UTF-8.
	    """
	    # the context manager closes the file even if read() raises
	    with open(filename, mode='rt', encoding='utf-8') as file:
	        return file.read()

prateekjoshi565 / func_split_text.py

Created February 6, 2019 09:26

split text into sentences

prateekjoshi565 / text_preprocessing.py

Created February 6, 2019 09:36

text preprocessing

	# Remove punctuation from both columns of deu_eng.
	# The translation table is built once and reused instead of being
	# rebuilt for every sentence.
	punct_table = str.maketrans('', '', string.punctuation)
	deu_eng[:, 0] = [s.translate(punct_table) for s in deu_eng[:, 0]]
	deu_eng[:, 1] = [s.translate(punct_table) for s in deu_eng[:, 1]]

	# convert text to lowercase (column by column rather than index by index)
	deu_eng[:, 0] = [s.lower() for s in deu_eng[:, 0]]
	deu_eng[:, 1] = [s.lower() for s in deu_eng[:, 1]]

prateekjoshi565 / sequence_length.py

Last active February 6, 2019 09:39

Text preprocessing 2: compute sentence lengths (in words)

	# Sentence lengths, measured in whitespace-separated words.
	# eng_l is built from column 0 of deu_eng and deu_l from column 1.
	eng_l = [len(sentence.split()) for sentence in deu_eng[:, 0]]
	deu_l = [len(sentence.split()) for sentence in deu_eng[:, 1]]

prateekjoshi565 / sequence_prep.py

Created February 6, 2019 09:44

Sequence preparation

	# function to build a tokenizer
	def tokenization(lines):
	    """Create a new ``Tokenizer``, fit it on *lines*, and return it.

	    ``lines`` is passed straight to ``Tokenizer.fit_on_texts``.
	    """
	    tokenizer = Tokenizer()
	    tokenizer.fit_on_texts(lines)
	    return tokenizer

	# English side: fixed sequence length used when padding
	eng_length = 8

	# fit the English tokenizer on column 0 of deu_eng
	eng_tokenizer = tokenization(deu_eng[:, 0])

	# vocabulary size is the number of entries in word_index plus one
	eng_vocab_size = 1 + len(eng_tokenizer.word_index)

prateekjoshi565 / encode_sequence.py

Created February 6, 2019 09:45

Encode Sequences

	# encode and pad sequences
	def encode_sequences(tokenizer, length, lines):
	    """Integer-encode *lines* with *tokenizer* and pad them to *length*.

	    Each text is converted with ``tokenizer.texts_to_sequences`` and the
	    result is passed to ``pad_sequences`` with ``maxlen=length`` and
	    ``padding='post'``, so padding zeros are appended at the end.
	    """
	    # integer encode sequences
	    seq = tokenizer.texts_to_sequences(lines)
	    # pad sequences with 0 values
	    return pad_sequences(seq, maxlen=length, padding='post')

prateekjoshi565 / split_data.py

Created February 6, 2019 09:46

Split data

	from sklearn.model_selection import train_test_split

	# hold out 20% of deu_eng as the test set; the seed is fixed at 12
	train, test = train_test_split(
	    deu_eng,
	    test_size=0.2,
	    random_state=12,
	)

prateekjoshi565 / data_prep.py

Last active February 6, 2019 09:50

Data preparation

	# training data: X is encoded from column 1 with the German tokenizer,
	# Y from column 0 with the English tokenizer
	trainX, trainY = (
	    encode_sequences(deu_tokenizer, deu_length, train[:, 1]),
	    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
	)

	# validation data, encoded the same way from the held-out split
	testX, testY = (
	    encode_sequences(deu_tokenizer, deu_length, test[:, 1]),
	    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
	)