gaphex · May 9, 2019 15:46
diff --git a/read_sentencepiece_vocab.py b/read_sentencepiece_vocab.py
 def read_sentencepiece_vocab(filepath):
   """Load the token column from a SentencePiece ``.vocab`` file.

   Each line is expected to hold a token, optionally followed by further
   tab-separated fields; only the text before the first tab is kept.

   Args:
     filepath: Path of the UTF-8 encoded vocabulary file.

   Returns:
     A list of tokens in file order, without the first entry (the
     ``<unk>`` token, assuming it is listed first in the file).
   """
   with open(filepath, encoding='utf-8') as fi:
     # Drop the line terminator before splitting so that a line with no tab
     # does not keep its trailing newline as part of the token.
     voc = [line.rstrip("\n").split("\t")[0] for line in fi]
   # skip the first <unk> token
   return voc[1:]

 # Load the tokens from the "<MODEL_PREFIX>.vocab" file.
 snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX))
 print("Learnt vocab size: {}".format(len(snt_vocab)))
 # Cap the sample size: random.sample raises ValueError when asked for more
 # items than the population holds.
 print("Sample tokens: {}".format(
   random.sample(snt_vocab, min(10, len(snt_vocab)))))
	def read_sentencepiece_vocab(filepath):
	  """Load the token column from a SentencePiece ``.vocab`` file.

	  Each line is expected to hold a token, optionally followed by further
	  tab-separated fields; only the text before the first tab is kept.

	  Args:
	    filepath: Path of the UTF-8 encoded vocabulary file.

	  Returns:
	    A list of tokens in file order, without the first entry (the
	    ``<unk>`` token, assuming it is listed first in the file).
	  """
	  with open(filepath, encoding='utf-8') as fi:
	    # Drop the line terminator before splitting so that a line with no tab
	    # does not keep its trailing newline as part of the token.
	    voc = [line.rstrip("\n").split("\t")[0] for line in fi]
	  # skip the first <unk> token
	  return voc[1:]

	# Load the tokens from the "<MODEL_PREFIX>.vocab" file.
	snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX))
	print("Learnt vocab size: {}".format(len(snt_vocab)))
	# Cap the sample size: random.sample raises ValueError when asked for more
	# items than the population holds.
	print("Sample tokens: {}".format(
	  random.sample(snt_vocab, min(10, len(snt_vocab)))))
No results found