English vs Portuguese tokenizer on Portuguese Wikipedia (Byte-Level-BPE_universal_tokenizer_but_en_tokenizer.ipynb)
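The code below assumes that path_data, tokenizer_en and tokenizer_pt were defined in earlier notebook cells. A minimal sketch of that setup, assuming both tokenizers are byte-level BPE tokenizers loaded with the Hugging Face transformers library (the data folder and the Portuguese tokenizer directory are hypothetical):

from pathlib import Path
from transformers import GPT2TokenizerFast

path_data = Path('data')                                            # hypothetical folder holding the Wikipedia CSV
tokenizer_en = GPT2TokenizerFast.from_pretrained('gpt2')            # pretrained English byte-level BPE (GPT-2)
tokenizer_pt = GPT2TokenizerFast.from_pretrained('pt_tokenizer')    # hypothetical Portuguese byte-level BPE
# With these tokenizers, encode(text) returns a list of token ids, so len() gives the token count.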
# English vs Portuguese tokenizer on Portuguese Wikipedia
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

lang = 'pt'
fname = f'all_texts_{lang}wiki.csv'
df = pd.read_csv(path_data/fname)
df2 = df.copy()

# Tokenize every Wikipedia text with both tokenizers and keep, per text,
# the token lists and the number of tokens per word.
tokens_en_list = list()
num_token_by_word_en_list = list()
tokens_pt_list = list()
num_token_by_word_pt_list = list()

for index, row in df2.iterrows():
    text = row['text']
    tokens_en = tokenizer_en.encode(text)
    tokens_pt = tokenizer_pt.encode(text)
    tokens_en_list.append(tokens_en)
    tokens_pt_list.append(tokens_pt)
    length_text = len(text.split())
    tokens_by_word_en = len(tokens_en)/length_text
    tokens_by_word_pt = len(tokens_pt)/length_text
    num_token_by_word_en_list.append(tokens_by_word_en)
    num_token_by_word_pt_list.append(tokens_by_word_pt)
df2['tokens_en'] = tokens_en_list
df2['num_token_by_word_en'] = num_token_by_word_en_list
df2['tokens_pt'] = tokens_pt_list
df2['num_token_by_word_pt'] = num_token_by_word_pt_list
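The per-row loop above could also be written column-wise with pandas; a minimal equivalent sketch, assuming encode() returns a list of token ids for each text:

# Same four columns, built without an explicit Python loop
df2['tokens_en'] = df2['text'].apply(tokenizer_en.encode)
df2['tokens_pt'] = df2['text'].apply(tokenizer_pt.encode)
word_counts = df2['text'].str.split().str.len()
df2['num_token_by_word_en'] = df2['tokens_en'].str.len() / word_counts
df2['num_token_by_word_pt'] = df2['tokens_pt'].str.len() / word_counts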
# check min
num_token_by_word_en_min = df2.num_token_by_word_en.min()
num_token_by_word_pt_min = df2.num_token_by_word_pt.min()
print('(en)', round(num_token_by_word_en_min, 2))
print('(pt)', round(num_token_by_word_pt_min, 2))

# check max
num_token_by_word_en_max = df2.num_token_by_word_en.max()
num_token_by_word_pt_max = df2.num_token_by_word_pt.max()
print('(en)', round(num_token_by_word_en_max, 2))
print('(pt)', round(num_token_by_word_pt_max, 2))

# check mean
num_token_by_word_en_mean = df2.num_token_by_word_en.mean()
num_token_by_word_pt_mean = df2.num_token_by_word_pt.mean()
print('(en)', round(num_token_by_word_en_mean, 2))
print('(pt)', round(num_token_by_word_pt_mean, 2))
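The same min/max/mean statistics (plus quartiles and standard deviation) can also be read off a single pandas describe() call; a minimal sketch:

print(df2[['num_token_by_word_en', 'num_token_by_word_pt']].describe().round(2))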
# check the rate of increase and the multiplier coefficient (en vs pt)
increase = 0.
multiplier = 0.
for tok_en, tok_pt in zip(tokens_en_list, tokens_pt_list):
    increase += (len(tok_en) - len(tok_pt)) / len(tok_pt)
    multiplier += len(tok_en) / len(tok_pt)

# Average rate of increase from pt to en: how many more tokens the English
# tokenizer produces on the same Portuguese text, relative to the Portuguese one
increase_pct = increase / len(tokens_en_list)
print('Rate of increase:', round(increase_pct*100, 2), '%')

# Multiplier coefficient = average rate of increase + 1
multiplier_coef = round(increase_pct + 1, 2)
print('Multiplier coefficient:', multiplier_coef)

# Multiplier coefficient in %, averaged directly over the per-text en/pt length ratios
multiplier_pct = round((multiplier/len(tokens_en_list))*100, 2)
print('Multiplier coefficient in %:', multiplier_pct, '%')
# graph: number of tokens per text (en vs pt) against text length in words
len_tokens_text_list = list()
for index, row in df2.iterrows():
    text = row['text']
    length_text = len(text.split())
    len_tokens_text_list.append(length_text)

tokens_en_list = df2.tokens_en.tolist()
len_tokens_en_list = [len(t) for t in tokens_en_list]
tokens_pt_list = df2.tokens_pt.tolist()
len_tokens_pt_list = [len(t) for t in tokens_pt_list]

# reference lines proportional to text length (slopes of 12 for en and 7 for pt)
sorted_len_tokens_text_list = sorted(len_tokens_text_list)
y_len_tokens_en_list = (12*np.array(sorted_len_tokens_text_list)).tolist()
y_len_tokens_pt_list = (7*np.array(sorted_len_tokens_text_list)).tolist()
ax = plt.subplot(111)
ax.scatter(len_tokens_text_list, len_tokens_en_list, label='en')
ax.plot(sorted_len_tokens_text_list, y_len_tokens_en_list)
ax.scatter(len_tokens_text_list, len_tokens_pt_list, label='pt')
ax.plot(sorted_len_tokens_text_list, y_len_tokens_pt_list)
ax.set_xlabel('length of texts (words)')
ax.set_ylabel('number of en and pt tokens')
ax.legend()
ax.set_title('Number of tokens by tokenization method')
plt.show()