Created
July 9, 2021 12:40
-
-
Save mapmeld/e888825777f4e894a5862a27fefdf65c to your computer and use it in GitHub Desktop.
Add text file task to T5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Register a line-oriented text task named "byt5_ex" with the T5 task registry.
# Each split is read from a plain-text file, one example per line.
# NOTE(review): assumes `functools` and `t5` are imported and that
# MEAN_NOISE_SPAN_LENGTH and DEFAULT_BYTE_OUTPUT_FEATURES are defined
# elsewhere in the surrounding module — confirm before running standalone.
t5.data.TaskRegistry.add(
    "byt5_ex",
    t5.data.TextLineTask,
    split_to_filepattern={
        "train": "gs://BUCKET/train_lines.txt",
        "validation": "gs://BUCKET/validation_lines.txt",
    },
    text_preprocessor=[
        # Step 1: parse each line into a single field called 'text'.
        functools.partial(
            t5.data.preprocessors.parse_tsv,
            field_names=['text'],
            # The delimiter must be an ASCII character that never appears in
            # the data files (the default is tab, \t); the files must not
            # contain blank lines (\n\n).
            field_delim='~',
        ),
        # Step 2: rename features so the line becomes "targets".
        # NOTE(review): "inputs" is mapped from None; presumably `rekey`
        # fills it with an empty value — confirm against the t5 docs.
        functools.partial(
            t5.data.preprocessors.rekey,
            key_map={
                "inputs": None,
                "targets": "text"
            }),
    ],
    # Token-level preprocessing: span corruption with the configured mean
    # noise span length.
    token_preprocessor=functools.partial(
        t5.data.preprocessors.span_corruption,
        mean_noise_span_length=MEAN_NOISE_SPAN_LENGTH),
    output_features=DEFAULT_BYTE_OUTPUT_FEATURES,
    # No evaluation metrics are attached to this task.
    metric_fns=[])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment