How to get and use the tokenizer from "universal-sentence-encoder-multilingual".
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5aa8a12c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tf_tokenizer_utils import (\n",
    "    download_thhub_model,\n",
    "    get_path_without_extension,\n",
    "    get_tokenizer_from_saved_model,\n",
    "    parse_saved_model,\n",
    "    tokenize,\n",
    "    unpack_tar,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f62d33d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# init variable\n",
    "thhub_model_url = \"https://tfhub.dev/google/universal-sentence-encoder-multilingual/3\"\n",
    "save_model_path = \".cache/universal-sentence-encoder-multilingual_3.tar\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f7b6bab8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load and unpack model\n",
    "download_thhub_model(\n",
    "    thhub_model_url=thhub_model_url,\n",
    "    save_model_path=save_model_path,\n",
    ")\n",
    "unpack_tar(path=save_model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c5207ad4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get tokenizer\n",
    "tokenizer = get_tokenizer_from_saved_model(\n",
    "    parse_saved_model(\n",
    "        get_path_without_extension(save_model_path)\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "380abd77",
   "metadata": {},
   "source": [
    "### use tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a4fd4212",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Some texts of different lengths.\n",
    "english_sentences = [\"dog\", \"Puppies are nice.\", \"I enjoy taking long walks along the beach with my dog.\"]\n",
    "italian_sentences = [\"cane\", \"I cuccioli sono carini.\", \"Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.\"]\n",
    "japanese_sentences = [\"犬\", \"子犬はいいです\", \"私は犬と一緒にビーチを散歩するのが好きです\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "26ed85da",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dog -> ['▁dog']\n",
      "Puppies are nice. -> ['▁Pupp', 'ies', '▁are', '▁nice', '.']\n",
      "I enjoy taking long walks along the beach with my dog. -> ['▁I', '▁enjoy', '▁taking', '▁long', '▁walk', 's', '▁along', '▁the', '▁beach', '▁with', '▁my', '▁dog', '.']\n"
     ]
    }
   ],
   "source": [
    "for sentence in english_sentences:\n",
    "    tokenized_sentence = tokenize(\n",
    "        sentence=sentence,\n",
    "        tokenizer=tokenizer,\n",
    "    )\n",
    "    print(f\"{sentence} -> {tokenized_sentence}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d788e5bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cane -> ['▁cane']\n",
      "I cuccioli sono carini. -> ['▁I', '▁cu', 'ccioli', '▁sono', '▁car', 'ini', '.']\n",
      "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane. -> ['▁Mi', '▁piace', '▁fare', '▁lunghe', '▁passeggiat', 'e', '▁lungo', '▁la', '▁spiaggia', '▁con', '▁il', '▁mio', '▁cane', '.']\n"
     ]
    }
   ],
   "source": [
    "for sentence in italian_sentences:\n",
    "    tokenized_sentence = tokenize(\n",
    "        sentence=sentence,\n",
    "        tokenizer=tokenizer,\n",
    "    )\n",
    "    print(f\"{sentence} -> {tokenized_sentence}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9664c90f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "犬 -> ['▁', '犬']\n",
      "子犬はいいです -> ['▁', '子', '犬', 'は', 'いい', 'です']\n",
      "私は犬と一緒にビーチを散歩するのが好きです -> ['▁私', 'は', '犬', 'と一緒に', 'ビーチ', 'を', '散', '歩', 'するのが', '好き', 'です']\n"
     ]
    }
   ],
   "source": [
    "for sentence in japanese_sentences:\n",
    "    tokenized_sentence = tokenize(\n",
    "        sentence=sentence,\n",
    "        tokenizer=tokenizer,\n",
    "    )\n",
    "    print(f\"{sentence} -> {tokenized_sentence}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "883211e8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
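For reference, the extracted SentencepieceTokenizer is not limited to the string-level tokenize() helper used above: it also round-trips token ids, which is what the encoder itself consumes. A minimal sketch, assuming the tokenizer object from the notebook above and a working tensorflow_text install; the printed values are illustrative.

# id-level round trip with the extracted tokenizer (sketch, not part of the original notebook)
sentence = "Puppies are nice."

token_ids = tokenizer.tokenize(sentence)    # 1-D int tensor of SentencePiece ids
restored = tokenizer.detokenize(token_ids)  # scalar byte string

print(token_ids.numpy())
print(restored.numpy().decode("utf-8"))         # back to "Puppies are nice."
print(int(tokenizer.vocab_size().numpy()))      # size of the embedded SentencePiece vocabulary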
Pinned dependencies:
jupyter==1.0.0
requests==2.25.1
tensorflow==2.5.0
tensorflow_text==2.5.0
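These pinned versions match the notebook environment above (Python 3.9, TensorFlow 2.5); assuming the list is saved as requirements.txt, it can be installed with pip install -r requirements.txt.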
tf_tokenizer_utils.py (the helper module imported in the first notebook cell):
import tarfile
from contextlib import closing
from pathlib import Path
from typing import List

import requests
from tensorflow.core.protobuf.saved_model_pb2 import SavedModel

# parse_saved_model is re-exported here so the notebook can import it from this module
from tensorflow.python.saved_model.loader_impl import parse_saved_model
from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer


def get_path_without_extension(path: str) -> Path:
    """
    Get path without extension.

    :param str path: path.
    :return: path without extension.
    :rtype: Path
    """
    path = Path(path)
    return path.parent.joinpath(path.stem)


def unpack_tar(path: str) -> None:
    """
    Unpack .tar file given path.

    :param str path: path to .tar file.
    """
    path_without_extension = get_path_without_extension(path)

    if not path_without_extension.exists():
        # https://stackoverflow.com/questions/6086603/statement-with-and-tarfile
        with closing(tarfile.open(path)) as fp:
            fp.extractall(path_without_extension)


def download_thhub_model(
    thhub_model_url: str,
    save_model_path: str,
) -> None:
    """
    Download tf hub model given URL.

    :param str thhub_model_url: tf hub model URL.
    :param str save_model_path: path to save model.
    """
    if not Path(save_model_path).exists():
        # download compressed model
        response = requests.get(f"{thhub_model_url}?tf-hub-format=compressed")

        # make dir if not exists
        # https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
        Path(save_model_path).parent.absolute().mkdir(parents=True, exist_ok=True)

        # save compressed model
        with open(save_model_path, mode="wb") as fp:
            fp.write(response.content)


def get_tokenizer_from_saved_model(saved_model: SavedModel) -> SentencepieceTokenizer:
    """
    Get tokenizer from tf SavedModel.

    :param SavedModel saved_model: tf SavedModel.
    :return: tokenizer.
    :rtype: SentencepieceTokenizer
    """
    # extract functions that contain SentencePiece somewhere in there
    functions_with_sp = [
        f
        for f in saved_model.meta_graphs[0].graph_def.library.function
        if "sentencepiecetokenizeop" in str(f).lower()
    ]
    assert len(functions_with_sp) == 1

    # find SentencePieceOp (contains the model) in the found function
    nodes_with_sp = [
        n for n in functions_with_sp[0].node_def if n.op == "SentencepieceOp"
    ]
    assert len(nodes_with_sp) == 1

    # we can pretty much save the model into a file since it does not change
    model = nodes_with_sp[0].attr["model"].s

    # instantiate the model
    tokenizer = SentencepieceTokenizer(model)

    return tokenizer


def tokenize(
    sentence: str, tokenizer: SentencepieceTokenizer, encoding: str = "utf-8"
) -> List[str]:
    """
    Tokenize sentence given tokenizer.

    :param str sentence: sentence to tokenize.
    :param SentencepieceTokenizer tokenizer: tokenizer.
    :param str encoding: encoding (default: "utf-8").
    :return: tokenized sentence.
    :rtype: List[str]
    """
    tokenized_sentence = []

    token_ids = tokenizer.tokenize(sentence).numpy()
    for token_id in token_ids:
        bytes_token = tokenizer.id_to_string(token_id).numpy()
        token = bytes_token.decode(encoding)
        tokenized_sentence.append(token)

    return tokenized_sentence
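As the comment in get_tokenizer_from_saved_model notes, the serialized SentencePiece model does not change, so it can be written to disk once and the tokenizer rebuilt later without re-parsing the whole SavedModel. A hedged sketch: the helper names and the muse_sentencepiece.model file name are illustrative, and the raw model bytes would first have to be exposed (for example by also returning nodes_with_sp[0].attr["model"].s from the function above).

from pathlib import Path

from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer


def save_sentencepiece_model(model_bytes: bytes, path: str = "muse_sentencepiece.model") -> None:
    """Write the serialized SentencePiece model proto to disk (illustrative helper)."""
    Path(path).write_bytes(model_bytes)


def load_tokenizer(path: str = "muse_sentencepiece.model") -> SentencepieceTokenizer:
    """Rebuild the tokenizer from a previously saved model proto (illustrative helper)."""
    return SentencepieceTokenizer(Path(path).read_bytes())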
REST API for sentence tokenization and embedding using MUSE: https://github.com/dayyass/muse_as_service
Inspired by tensorflow/hub#662