pvbhanuteja · August 4, 2020 20:26
diff --git a/download data and pre-process b/download data and pre-process
 # Download the spa-eng archive; extract=True asks get_file to unpack it too.
 path_to_zip = tf.keras.utils.get_file(
     'spa-eng.zip',
     origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
     extract=True)

 # The sentence-pair file is looked up in a "spa-eng" folder next to the
 # downloaded zip. os.path.join is used instead of concatenating "/" by hand
 # so the path is built with the platform's separator.
 path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

 # Converts the unicode file to ascii
 def unicode_to_ascii(s):
     """Return *s* with combining marks stripped.

     The string is decomposed to Unicode NFD form and every code point whose
     category is 'Mn' is dropped. All other characters are kept as they are,
     so the result is not guaranteed to be pure ASCII.
     """
     decomposed = unicodedata.normalize('NFD', s)
     kept = []
     for ch in decomposed:
         # 'Mn' code points are the accents split off by the NFD decomposition.
         if unicodedata.category(ch) == 'Mn':
             continue
         kept.append(ch)
     return ''.join(kept)


 def preprocess_sentence(w):
     """Normalise one sentence and wrap it in '<start>' / '<end>' markers.

     The input is lower-cased, stripped, passed through unicode_to_ascii and
     then run through the regex substitutions listed below, in order.
     """
     substitutions = (
         # Put a space on both sides of ? . ! , ¿ so that punctuation is
         # separated from the neighbouring word,
         # eg: "he is a boy." => "he is a boy ."
         # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
         (r"([?.!,¿])", r" \1 "),
         # Collapse every run of spaces and double quotes into one space.
         (r'[" "]+', " "),
         # Replace every run of characters other than
         # (a-z, A-Z, ".", "?", "!", ",", "¿") with a single space.
         (r"[^a-zA-Z?.!,¿]+", " "),
     )

     text = unicode_to_ascii(w.lower().strip())
     for pattern, replacement in substitutions:
         text = re.sub(pattern, replacement, text)

     # The start and end tokens tell the model when to start and stop
     # predicting.
     return '<start> ' + text.strip() + ' <end>'
  
 def create_dataset(path, num_examples):
     """Read a tab-separated text file and return its preprocessed columns.

     Args:
         path: Path of a UTF-8 encoded text file with one example per line
             and tab-separated fields.
         num_examples: Upper bound on the number of leading lines used. It is
             applied as a slice bound, so ``None`` selects every line.

     Returns:
         The iterator produced by ``zip(*word_pairs)``: one tuple per field
         position, each holding that field of every selected line after
         ``preprocess_sentence`` has been applied to it.
     """
     # Use a context manager so the file handle is closed as soon as the
     # contents are read, rather than being left open.
     with io.open(path, encoding='UTF-8') as f:
         lines = f.read().strip().split('\n')

     word_pairs = [[preprocess_sentence(w) for w in line.split('\t')]
                   for line in lines[:num_examples]]

     # Transpose rows of fields into per-field sequences.
     return zip(*word_pairs)
	# Download the spa-eng archive; extract=True asks get_file to unpack it too.
	path_to_zip = tf.keras.utils.get_file(
	    'spa-eng.zip',
	    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
	    extract=True)

	# The sentence-pair file is looked up in a "spa-eng" folder next to the
	# downloaded zip. os.path.join is used instead of concatenating "/" by hand
	# so the path is built with the platform's separator.
	path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

	# Converts the unicode file to ascii
	def unicode_to_ascii(s):
	    """Return *s* with combining marks stripped.

	    The string is decomposed to Unicode NFD form and every code point whose
	    category is 'Mn' is dropped. All other characters are kept as they are,
	    so the result is not guaranteed to be pure ASCII.
	    """
	    return ''.join(
	        c for c in unicodedata.normalize('NFD', s)
	        if unicodedata.category(c) != 'Mn'
	    )


	def preprocess_sentence(w):
	    """Normalise one sentence and wrap it in '<start>' / '<end>' markers.

	    The input is lower-cased, stripped and passed through unicode_to_ascii
	    before the regex clean-up below is applied.
	    """
	    w = unicode_to_ascii(w.lower().strip())

	    # creating a space between a word and the punctuation following it
	    # eg: "he is a boy." => "he is a boy ."
	    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
	    w = re.sub(r"([?.!,¿])", r" \1 ", w)
	    # collapsing every run of spaces and double quotes into one space
	    w = re.sub(r'[" "]+', " ", w)

	    # replacing everything with space except
	    # (a-z, A-Z, ".", "?", "!", ",", "¿")
	    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

	    w = w.strip()

	    # adding a start and an end token to the sentence
	    # so that the model knows when to start and stop predicting.
	    w = '<start> ' + w + ' <end>'
	    return w

	def create_dataset(path, num_examples):
	    """Read a tab-separated text file and return its preprocessed columns.

	    Args:
	        path: Path of a UTF-8 encoded text file with one example per line
	            and tab-separated fields.
	        num_examples: Upper bound on the number of leading lines used. It is
	            applied as a slice bound, so ``None`` selects every line.

	    Returns:
	        The iterator produced by ``zip(*word_pairs)``: one tuple per field
	        position, each holding that field of every selected line after
	        ``preprocess_sentence`` has been applied to it.
	    """
	    # Use a context manager so the file handle is closed as soon as the
	    # contents are read, rather than being left open.
	    with io.open(path, encoding='UTF-8') as f:
	        lines = f.read().strip().split('\n')

	    word_pairs = [[preprocess_sentence(w) for w in line.split('\t')]
	                  for line in lines[:num_examples]]

	    # Transpose rows of fields into per-field sequences.
	    return zip(*word_pairs)