ymoslem · January 25, 2023 17:40
diff --git a/subword_source_only.py b/subword_source_only.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 # Subwording the source file only
 # Command: python3 subword_source_only.py <source_model_file> <source_pred_file>
 # Output: <source_pred_file>.subword, one line of space-separated pieces per input line
 # Note: If you did not train the model with start and end tokens, remove ['<s>'] and ['</s>']
 #       from the encoding line inside the loop below


 import sys
 import sentencepiece as spm


 # Exit with a usage message instead of a bare IndexError when arguments are missing
 if len(sys.argv) < 3:
     sys.exit("Usage: python3 subword_source_only.py <source_model_file> <source_pred_file>")

 source_model = sys.argv[1]
 source_raw = sys.argv[2]
 source_subworded = source_raw + ".subword"

 print("Source Model:", source_model)
 print("Source Dataset:", source_raw)


 sp = spm.SentencePieceProcessor()


 # Subwording the source file

 sp.load(source_model)

 # Open both files explicitly as UTF-8 so the result does not depend on the
 # platform's default locale encoding; the output is write-only, so "w" is enough
 with open(source_raw, encoding="utf-8") as source, \
         open(source_subworded, "w", encoding="utf-8") as source_subword:
     for line in source:
         # encode and add start & end tokens
         pieces = ['<s>'] + sp.encode_as_pieces(line) + ['</s>']
         source_subword.write(" ".join(pieces) + "\n")

 print("Done subwording the source file! Output:", source_subworded)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	# Subwording the source file only
	# Command: python3 subword_source_only.py <source_model_file> <source_pred_file>
	# Output: <source_pred_file>.subword, one line of space-separated pieces per input line
	# Note: If you did not train the model with start and end tokens, remove ['<s>'] and ['</s>']
	#       from the encoding line inside the loop below


	import sys
	import sentencepiece as spm


	# Exit with a usage message instead of a bare IndexError when arguments are missing
	if len(sys.argv) < 3:
	    sys.exit("Usage: python3 subword_source_only.py <source_model_file> <source_pred_file>")

	source_model = sys.argv[1]
	source_raw = sys.argv[2]
	source_subworded = source_raw + ".subword"

	print("Source Model:", source_model)
	print("Source Dataset:", source_raw)


	sp = spm.SentencePieceProcessor()


	# Subwording the source file

	sp.load(source_model)

	# Open both files explicitly as UTF-8 so the result does not depend on the
	# platform's default locale encoding; the output is write-only, so "w" is enough
	with open(source_raw, encoding="utf-8") as source, \
	        open(source_subworded, "w", encoding="utf-8") as source_subword:
	    # The loop must be nested inside the `with` so both files are still open
	    for line in source:
	        # encode and add start & end tokens
	        pieces = ['<s>'] + sp.encode_as_pieces(line) + ['</s>']
	        source_subword.write(" ".join(pieces) + "\n")

	print("Done subwording the source file! Output:", source_subworded)