negedng · October 18, 2020 21:49
diff --git a/nlp_datasets_hfds_tokenizer_freq.py b/nlp_datasets_hfds_tokenizer_freq.py
 # Build the token -> id vocabulary. The special tokens take the lowest ids in
 # this fixed order; the most frequent words fill the ids that remain, so the
 # vocabulary holds at most 20000 entries.
 special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
 vocab_dict = {token: idx for idx, token in enumerate(special_tokens)}
 # Skip any counted word that is literally a special token so that ids stay
 # unique and contiguous.
 frequent_words = [
     word
     for word, _ in vocabulary_counter.most_common(20000 - len(special_tokens))
     if word not in vocab_dict
 ]
 # Start word ids right after the special tokens; starting at 4 would give the
 # most frequent word the same id as "[MASK]".
 vocab_dict.update(
     (word, idx)
     for idx, word in enumerate(frequent_words, start=len(special_tokens))
 )

 tokenizer_2 = BertWordPieceTokenizer(vocab_dict)
	# Build the token -> id vocabulary. The special tokens take the lowest ids in
	# this fixed order; the most frequent words fill the ids that remain, so the
	# vocabulary holds at most 20000 entries.
	special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
	vocab_dict = {token: idx for idx, token in enumerate(special_tokens)}
	# Skip any counted word that is literally a special token so that ids stay
	# unique and contiguous.
	frequent_words = [
	    word
	    for word, _ in vocabulary_counter.most_common(20000 - len(special_tokens))
	    if word not in vocab_dict
	]
	# Start word ids right after the special tokens; starting at 4 would give the
	# most frequent word the same id as "[MASK]".
	vocab_dict.update(
	    (word, idx)
	    for idx, word in enumerate(frequent_words, start=len(special_tokens))
	)

	tokenizer_2 = BertWordPieceTokenizer(vocab_dict)