pytokenizations demo (https://github.com/tamuhey/tokenizations)
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tokenizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "å\t->\tabc\n",
      "BC\t->\tabc\n",
      "åBC\t<-\tabc\n"
     ]
    }
   ],
"source": [ | |
"# token列同士のアラインメント\n", | |
"\n", | |
"tokens_a = [\"å\", \"BC\"]\n", | |
"tokens_b = [\"abc\"]\n", | |
"a2b, b2a = tokenizations.get_alignments(tokens_a, tokens_b)\n", | |
"# a2b[i] is a list representing the alignment from tokens_a to tokens_b.\n", | |
"for b_component, token_a in zip(a2b, tokens_a):\n", | |
" print( '{}\\t->\\t{}'.format(token_a, ''.join([tokens_b[i] for i in b_component])) )\n", | |
"for a_component, token_b in zip(b2a, tokens_b):\n", | |
" print( '{}\\t<-\\t{}'.format(''.join([tokens_a[i] for i in a_component]), token_b) )\n" | |
] | |
}, | |
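  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Added sketch (not in the original gist): tokens with no counterpart in the other sequence are expected to come back with empty alignment lists, so the joins above simply print an empty string for them. The value in the comment below is an assumption, not a recorded output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: an unalignable token should yield an empty component.\n",
    "noisy_a = [\"foo\", \"qux\"]\n",
    "noisy_b = [\"foo\"]\n",
    "noisy_a2b, _ = tokenizations.get_alignments(noisy_a, noisy_b)\n",
    "print(noisy_a2b)  # expected (assumption): [[0], []] -- \"qux\" aligns to nothing\n"
   ]
  },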
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WORD\t->\tSUBWORDS\n",
      "John\t->\t['john']\n",
      "Johanson\t->\t['johan', '##son']\n",
      "'s\t->\t[\"'\", 's']\n",
      "house\t->\t['house']\n"
     ]
    }
   ],
"source": [ | |
"# 使用例. word-tokenとwordpiece-tokenの対応関係取得\n", | |
"tokens_word = [\"John\", \"Johanson\", \"'s\", \"house\"]\n", | |
"tokens_subword = [\"john\", \"johan\", \"##son\", \"'\", \"s\", \"house\"]\n", | |
"w2s, _ = tokenizations.get_alignments(tokens_word, tokens_subword)\n", | |
"\n", | |
"print('WORD\\t->\\tSUBWORDS')\n", | |
"for s_component, token_word in zip(w2s, tokens_word):\n", | |
" print( '{}\\t->\\t{}'.format(token_word, [tokens_subword[i] for i in s_component]) )" | |
] | |
}, | |
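  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Added sketch (not in the original gist): a common reason to compute w2s is to propagate word-level labels, e.g. NER tags, onto subword tokens. The tags below are made up for illustration, and a real pipeline would also demote a repeated B- tag to I- on continuation pieces."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: project word-level NER tags onto subword tokens via w2s.\n",
    "word_tags = [\"B-PER\", \"I-PER\", \"O\", \"O\"]  # illustrative tags, one per word\n",
    "subword_tags = [\"O\"] * len(tokens_subword)\n",
    "for tag, s_component in zip(word_tags, w2s):\n",
    "    for i in s_component:\n",
    "        subword_tags[i] = tag\n",
    "print(list(zip(tokens_subword, subword_tags)))\n"
   ]
  },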
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "f\t->\tF\n",
      "o\t->\tO\n",
      "o\t->\to\n",
      "b\t->\tB\n",
      "a\t->\tå\n",
      "r\t->\tR\n",
      "b\t->\tb\n",
      "a\t->\ta\n",
      "z\t->\tZ\n",
      "f\t<-\tF\n",
      "o\t<-\tO\n",
      "None\t<-\t \n",
      "o\t<-\to\n",
      "None\t<-\t.\n",
      "b\t<-\tB\n",
      "a\t<-\tå\n",
      "r\t<-\tR\n",
      "None\t<-\t \n",
      "b\t<-\tb\n",
      "a\t<-\ta\n",
      "z\t<-\tZ\n"
     ]
    }
   ],
"source": [ | |
"# 文字列同士のアラインメント\n", | |
"chars_a = \"foobarbaz\"\n", | |
"chars_b = \"FO o.BåR baZ\"\n", | |
"a2b, b2a = tokenizations.get_charmap(chars_a, chars_b)\n", | |
"\n", | |
"for b_component, char_a in zip(a2b, chars_a):\n", | |
" print( '{}\\t->\\t{}'.format(char_a, ''.join([chars_b[i] for i in b_component])) )\n", | |
"# for a_component, token_b in zip(b2a, tokens_b):\n", | |
"# print( '{}\\t<-\\t{}'.format(''.join([tokens_a[i] for i in a_component]), token_b) )\n", | |
"for a_component, char_b in zip(b2a, chars_b):\n", | |
" print( '{}\\t<-\\t{}'.format(''.join([chars_a[i] for i in a_component] if a_component else 'None'), char_b ))" | |
] | |
}, | |
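  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Added sketch (not in the original gist): a small hypothetical helper on top of the charmap above, mapping a single character offset in chars_a to its first aligned offset in chars_b, or None when nothing aligns. map_offset is our own name, not a library API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical helper (not a library API): first aligned position, or None.\n",
    "def map_offset(offset, charmap):\n",
    "    component = charmap[offset]\n",
    "    return component[0] if component else None\n",
    "\n",
    "print(map_offset(3, a2b))  # chars_a[3] == 'b' -> position of 'B' in chars_b\n"
   ]
  },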
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"トークン\t->\t正規化前トークン\n", | |
"foo\t->\tFO o\n", | |
"bar\t->\tBåR\n" | |
     ]
    }
   ],
"source": [ | |
"# トークン列と文字列のアラインメント(スパンで取得)\n", | |
"# 与えられたトークン列に対応する、正規化前文字列のスパンを復元する\n", | |
"tokens = [\"foo\", \"bar\"]\n", | |
"original_text = \"FO o.BåR baZ\"\n", | |
"original_spans = tokenizations.get_original_spans(tokens, original_text)\n", | |
"\n", | |
"print('トークン\\t->\\t正規化前トークン')\n", | |
"for token, (orgs, orge) in zip(tokens, original_spans):\n", | |
" print('{}\\t->\\t{}'.format(token, original_text[orgs:orge]))" | |
] | |
}, | |
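  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Added sketch (not in the original gist): tokens that cannot be located in the original text are expected to come back as None from get_original_spans, so it is safer to guard before slicing. The behavior for the unmatched token is an assumption."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: guard against tokens that have no span in the original text.\n",
    "noisy_tokens = [\"foo\", \"quux\"]\n",
    "noisy_spans = tokenizations.get_original_spans(noisy_tokens, original_text)\n",
    "for token, span in zip(noisy_tokens, noisy_spans):\n",
    "    if span is None:\n",
    "        print('{}\\t->\\t(no span found)'.format(token))\n",
    "    else:\n",
    "        print('{}\\t->\\t{}'.format(token, original_text[span[0]:span[1]]))\n"
   ]
  },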
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "foo -> FO\n",
      "foo -> o\n",
      "bar -> BåR\n"
     ]
    }
   ],
"source": [ | |
"# 文字列間のアラインメントをとり対応するスパンを復元(tokenizations.get_original_spansよりやや厳密な対応を取得)\n", | |
"import textspan\n", | |
"\n", | |
"spans = [(0, 3), (3, 6)]\n", | |
"text = \"foobarbaz\"\n", | |
"\n", | |
"original_text = \"FO o.BåR baZ\"\n", | |
"original_spans = textspan.align_spans(spans, text, original_text)\n", | |
"# 内部的には tokenizations.get_charmap(text, original_text) を経由\n", | |
"for (s, e), orgspans in zip(spans, original_spans):\n", | |
" for orgs, orge in orgspans:\n", | |
" print('{} -> {}'.format(text[s:e], original_text[orgs:orge]))" | |
] | |
} | |
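  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Added sketch (not in the original gist): the same align_spans call can be run in the reverse direction by swapping the two text arguments, here mapping a span annotated on the original text back onto the normalized text. The hand-picked span below is for illustration only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: map spans on the original text back to the normalized text.\n",
    "annotated_spans = [(5, 8)]  # \"BåR\" in original_text\n",
    "back_spans = textspan.align_spans(annotated_spans, original_text, text)\n",
    "for (s, e), mapped in zip(annotated_spans, back_spans):\n",
    "    for ms, me in mapped:\n",
    "        print('{} -> {}'.format(original_text[s:e], text[ms:me]))\n"
   ]
  }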
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}