Skip to content

Instantly share code, notes, and snippets.

@pythonlessons
Created August 24, 2023 10:12
Show Gist options
  • Save pythonlessons/04e5c01f3544775a9080580d0ed0ba92 to your computer and use it in GitHub Desktop.
Save pythonlessons/04e5c01f3544775a9080580d0ed0ba92 to your computer and use it in GitHub Desktop.
transformers_nlp_data
# Locations of the parallel corpus files, grouped by split. Within each split
# the ".en" file holds the English side and the ".es" file the Spanish side.
en_training_data_path, es_training_data_path = (
    "Datasets/en-es/opus.en-es-train.en",
    "Datasets/en-es/opus.en-es-train.es",
)
en_validation_data_path, es_validation_data_path = (
    "Datasets/en-es/opus.en-es-dev.en",
    "Datasets/en-es/opus.en-es-dev.es",
)
def read_files(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one string per newline-separated line, newline characters
        removed. A trailing newline at the end of the file does not produce an
        empty final entry; blank lines elsewhere are kept as empty strings.
        An empty file yields an empty list.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Split on the newline character only: str.splitlines() would also
        # break on characters such as form feed or U+2028 that can occur
        # inside a sentence, which would change the line count.
        lines = f.read().split("\n")
    # Drop only the empty string left behind by a terminating newline. An
    # unconditional [:-1] would discard the last real sentence whenever the
    # file does not end with a newline.
    if lines and lines[-1] == "":
        lines.pop()
    return lines
# Load both sides of each split. Line i of a Spanish file is paired with
# line i of the matching English file by zip() in _filter_pairs below.
en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

# Maximum sentence length in characters (it is compared against len() of a
# str). The misspelled name is kept as-is because it is a module-level name
# that code outside this section may reference.
max_lenght = 500


def _filter_pairs(es_sentences, en_sentences, limit):
    """Return [es, en] pairs in which both sentences have at most *limit* chars.

    Sentences are paired by position; zip() stops at the shorter input, so
    any unpaired trailing sentences are left out.
    """
    return [
        [es_sentence, en_sentence]
        for es_sentence, en_sentence in zip(es_sentences, en_sentences)
        if len(es_sentence) <= limit and len(en_sentence) <= limit
    ]


def _split_pairs(pairs):
    """Transpose a list of [es, en] pairs into (es_sentences, en_sentences).

    Returns two empty tuples for an empty list; a bare ``zip(*pairs)`` yields
    nothing in that case and two-name unpacking would raise ValueError.
    """
    if not pairs:
        return (), ()
    es_side, en_side = zip(*pairs)
    return es_side, en_side


train_dataset = _filter_pairs(es_training_data, en_training_data, max_lenght)
val_dataset = _filter_pairs(es_validation_data, en_validation_data, max_lenght)

# Rebind the per-language names to the filtered, still-aligned tuples.
es_training_data, en_training_data = _split_pairs(train_dataset)
es_validation_data, en_validation_data = _split_pairs(val_dataset)

# Quick sanity output: pair counts per split and the first three training pairs.
print(len(es_training_data))
print(len(es_validation_data))
print(es_training_data[:3])
print(en_training_data[:3])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment