Last active
May 11, 2021 13:33
-
-
Save vpeopleonatank/69a16ec979d8349e4a53f940d742965d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from pydantic import BaseModel, Field
def tokenize(x):
    """Fit a Keras Tokenizer on the corpus *x*.

    Returns a tuple ``(sequences, tokenizer)`` where *sequences* is the
    integer-id encoding of each sentence and *tokenizer* holds the fitted
    word index for later inverse lookups.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer
def pad(x, length=None):
    """Post-pad every sequence in *x* to a common length.

    When *length* is omitted, the longest sequence in *x* determines the
    target length.
    """
    if length is None:
        length = max(len(seq) for seq in x)
    return pad_sequences(x, maxlen=length, padding='post')
def preprocess(x, y):
    """Tokenize and pad the parallel corpora *x* (source) and *y* (target).

    Returns ``(x_padded, y_padded, x_tokenizer, y_tokenizer)``. The target
    array gains a trailing axis of size 1 because Keras's
    sparse_categorical_crossentropy expects 3-D labels.
    """
    x_seq, x_tokenizer = tokenize(x)
    y_seq, y_tokenizer = tokenize(y)
    x_padded = pad(x_seq)
    y_padded = pad(y_seq)
    # Add the trailing label axis required by sparse_categorical_crossentropy.
    y_padded = y_padded.reshape(*y_padded.shape, 1)
    return x_padded, y_padded, x_tokenizer, y_tokenizer
# Build the tokenized/padded training tensors from the raw parallel corpora.
# NOTE(review): `english_sentences` and `vietnamese_sentences` are not defined
# in this chunk — they must come from earlier in the file; verify.
preproc_english_sentences, preproc_vietnamese_sentences, english_tokenizer, vietnamese_tokenizer =\
    preprocess(english_sentences, vietnamese_sentences)
# Invert the target-language word index so predicted ids map back to words;
# id 0 (never assigned by the tokenizer) is the padding token.
y_id_to_word = {value: key for key, value in vietnamese_tokenizer.word_index.items()}
y_id_to_word[0] = '<PAD>'
# Load the pretrained English -> Vietnamese translation model from disk.
translation_model = keras.models.load_model('en_vi_1.h5')
class EngToViTranslationInput(BaseModel):
    """Request schema for the translation endpoint: the English source text."""

    # Raw English text to translate; capped at 1000 characters.
    text: str = Field(
        ...,
        title="Text Input",
        description="The input text to use as to translate text.",
        max_length=1000,
    )
class EngToViTranslationOutput(BaseModel):
    """Response schema: the generated Vietnamese translation."""

    # Decoded Vietnamese text with padding tokens stripped.
    generated_text: str = Field(...)
# Matches a <PAD> token together with any surrounding whitespace so padding
# can be stripped from decoded output. Raw string avoids the invalid-escape
# DeprecationWarning the original non-raw '\s' triggered on modern Python.
patt = re.compile(r'(\s*)<PAD>(\s*)')
def translate_eng_to_vi(input: EngToViTranslationInput) -> EngToViTranslationOutput:
    """Translate English text to Vietnamese with the loaded seq2seq model.

    Encodes the input with the English tokenizer, pads it to the training
    sequence length, greedy-decodes the model's per-timestep predictions,
    and strips padding tokens from the result.
    """
    # Bug fix: the original called input.split(), but `input` is a pydantic
    # model — the raw string lives in input.text.
    word_index = english_tokenizer.word_index
    # Robustness: unknown words map to id 0 (padding) instead of raising
    # KeyError on out-of-vocabulary input.
    sentence = [word_index.get(word, 0) for word in input.text.split()]
    # Bug fix: the original referenced an undefined name `x`; the intended
    # array is the padded training input, whose last axis is the sequence length.
    max_len = preproc_english_sentences.shape[-1]
    sentence = pad_sequences([sentence], maxlen=max_len, padding='post')
    # Keep the original workaround of batching the query with one training
    # sentence as filler — presumably the model expects batch size >= 2;
    # TODO confirm and drop the filler if unnecessary.
    batch = np.array([sentence[0], preproc_english_sentences[0]])
    predictions = translation_model.predict(batch, len(batch))
    # Greedy decode: highest-probability word id at each timestep.
    translated = ' '.join(y_id_to_word[np.argmax(step)] for step in predictions[0])
    # Remove <PAD> tokens and their surrounding whitespace.
    translated = patt.sub('', translated)
    return EngToViTranslationOutput(generated_text=translated)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment