Last active
January 25, 2023 17:40
-
-
Save ymoslem/73aa029c007976b3e2d23eedc2b07eb1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Subword the source file only, using a trained SentencePiece model.
# Command: python3 subword.py <source_model_file> <source_pred_file>
# Note: If you did not train the model with start and end tokens, remove the '<s>' and '</s>' tokens from the line that encodes each sentence.
import sys

import sentencepiece as spm
def subword_lines(encode, lines):
    """Yield each input line as a space-joined string of subword pieces.

    Every output string is wrapped in the start token ``<s>`` and the end
    token ``</s>``.

    Args:
        encode: Callable taking one string and returning an iterable of
            string pieces (e.g. ``SentencePieceProcessor.encode_as_pieces``).
        lines: Iterable of text lines; a trailing newline on a line is
            dropped before encoding.

    Yields:
        One string per input line, without a trailing newline.
    """
    for line in lines:
        # Drop the line terminator so it is never handed to the tokenizer.
        pieces = ["<s>"] + list(encode(line.rstrip("\n"))) + ["</s>"]
        yield " ".join(pieces)


def main(argv=None):
    """Subword the source file with a SentencePiece model.

    Reads ``<source_pred_file>`` line by line, encodes each line with the
    model at ``<source_model_file>``, and writes the result to
    ``<source_pred_file>.subword``.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Usage: python3 subword.py <source_model_file> <source_pred_file>")

    source_model = argv[1]
    source_raw = argv[2]
    source_subworded = source_raw + ".subword"

    print("Source Model:", source_model)
    print("Source Dataset:", source_raw)

    sp = spm.SentencePieceProcessor()
    sp.load(source_model)

    # Explicit UTF-8 so results do not depend on the platform's default
    # encoding; the output is only written, so plain "w" is sufficient.
    with open(source_raw, encoding="utf-8") as source, \
            open(source_subworded, "w", encoding="utf-8") as source_subword:
        for subworded in subword_lines(sp.encode_as_pieces, source):
            source_subword.write(subworded + "\n")

    print("Done subwording the source file! Output:", source_subworded)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment