@dayyass
Last active September 5, 2023 08:19
How to get and use tokenizer from "universal-sentence-encoder-multilingual".
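The notebook below is stored as raw JSON; as a quick orientation, here is the same workflow condensed into a plain script. It is only a sketch that mirrors the notebook cells and relies on the helper functions from tf_tokenizer_utils.py further down in this gist; the model URL and cache path are the ones used in the notebook.

from tf_tokenizer_utils import (
    download_thhub_model,
    get_path_without_extension,
    get_tokenizer_from_saved_model,
    parse_saved_model,
    tokenize,
    unpack_tar,
)

thhub_model_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
save_model_path = ".cache/universal-sentence-encoder-multilingual_3.tar"

# download the compressed SavedModel from TF Hub and unpack it next to the archive
download_thhub_model(thhub_model_url=thhub_model_url, save_model_path=save_model_path)
unpack_tar(path=save_model_path)

# parse the SavedModel protobuf and pull the SentencePiece tokenizer out of it
tokenizer = get_tokenizer_from_saved_model(
    parse_saved_model(get_path_without_extension(save_model_path))
)

print(tokenize(sentence="Puppies are nice.", tokenizer=tokenizer))
# expected (from the notebook output): ['▁Pupp', 'ies', '▁are', '▁nice', '.']
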
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "5aa8a12c",
"metadata": {},
"outputs": [],
"source": [
"from tf_tokenizer_utils import (\n",
" download_thhub_model,\n",
" get_path_without_extension,\n",
" get_tokenizer_from_saved_model,\n",
" parse_saved_model,\n",
" tokenize,\n",
" unpack_tar,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f62d33d6",
"metadata": {},
"outputs": [],
"source": [
"# init variable\n",
"thhub_model_url = \"https://tfhub.dev/google/universal-sentence-encoder-multilingual/3\"\n",
"save_model_path = \".cache/universal-sentence-encoder-multilingual_3.tar\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f7b6bab8",
"metadata": {},
"outputs": [],
"source": [
"# load and unpack model\n",
"download_thhub_model(\n",
" thhub_model_url=thhub_model_url,\n",
" save_model_path=save_model_path,\n",
")\n",
"unpack_tar(path=save_model_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c5207ad4",
"metadata": {},
"outputs": [],
"source": [
"# get tokenizer\n",
"tokenizer = get_tokenizer_from_saved_model(\n",
" parse_saved_model(\n",
" get_path_without_extension(save_model_path)\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"id": "380abd77",
"metadata": {},
"source": [
"### use tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a4fd4212",
"metadata": {},
"outputs": [],
"source": [
"# Some texts of different lengths.\n",
"english_sentences = [\"dog\", \"Puppies are nice.\", \"I enjoy taking long walks along the beach with my dog.\"]\n",
"italian_sentences = [\"cane\", \"I cuccioli sono carini.\", \"Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.\"]\n",
"japanese_sentences = [\"犬\", \"子犬はいいです\", \"私は犬と一緒にビーチを散歩するのが好きです\"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "26ed85da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dog -> ['▁dog']\n",
"Puppies are nice. -> ['▁Pupp', 'ies', '▁are', '▁nice', '.']\n",
"I enjoy taking long walks along the beach with my dog. -> ['▁I', '▁enjoy', '▁taking', '▁long', '▁walk', 's', '▁along', '▁the', '▁beach', '▁with', '▁my', '▁dog', '.']\n"
]
}
],
"source": [
"for sentence in english_sentences:\n",
" tokenized_sentence = tokenize(\n",
" sentence=sentence,\n",
" tokenizer=tokenizer,\n",
" )\n",
" print(f\"{sentence} -> {tokenized_sentence}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d788e5bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cane -> ['▁cane']\n",
"I cuccioli sono carini. -> ['▁I', '▁cu', 'ccioli', '▁sono', '▁car', 'ini', '.']\n",
"Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane. -> ['▁Mi', '▁piace', '▁fare', '▁lunghe', '▁passeggiat', 'e', '▁lungo', '▁la', '▁spiaggia', '▁con', '▁il', '▁mio', '▁cane', '.']\n"
]
}
],
"source": [
"for sentence in italian_sentences:\n",
" tokenized_sentence = tokenize(\n",
" sentence=sentence,\n",
" tokenizer=tokenizer,\n",
" )\n",
" print(f\"{sentence} -> {tokenized_sentence}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9664c90f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"犬 -> ['▁', '犬']\n",
"子犬はいいです -> ['▁', '子', '犬', 'は', 'いい', 'です']\n",
"私は犬と一緒にビーチを散歩するのが好きです -> ['▁私', 'は', '犬', 'と一緒に', 'ビーチ', 'を', '散', '歩', 'するのが', '好き', 'です']\n"
]
}
],
"source": [
"for sentence in japanese_sentences:\n",
" tokenized_sentence = tokenize(\n",
" sentence=sentence,\n",
" tokenizer=tokenizer,\n",
" )\n",
" print(f\"{sentence} -> {tokenized_sentence}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "883211e8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
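The notebook cells above print string pieces; the integer SentencePiece ids behind them can be useful too. A minimal sketch that reuses the tokenizer object built in the notebook and the same .numpy()/id_to_string() calls that tokenize() uses internally:

# integer SentencePiece ids for a sentence
token_ids = tokenizer.tokenize("Puppies are nice.").numpy()
print(token_ids)

# map each id back to its string piece, exactly as tokenize() does
print([tokenizer.id_to_string(i).numpy().decode("utf-8") for i in token_ids])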

requirements.txt

jupyter==1.0.0
requests==2.25.1
tensorflow==2.5.0
tensorflow_text==2.5.0

tf_tokenizer_utils.py

import tarfile
from contextlib import closing
from pathlib import Path
from typing import List

import requests
from tensorflow.core.protobuf.saved_model_pb2 import SavedModel
from tensorflow.python.saved_model.loader_impl import parse_saved_model
from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer


def get_path_without_extension(path: str) -> Path:
    """
    Get path without extension.
    :param str path: path.
    :return: path without extension.
    :rtype: Path
    """
    path = Path(path)
    return path.parent.joinpath(path.stem)


def unpack_tar(path: str) -> None:
    """
    Unpack .tar file given path.
    :param str path: path to .tar file.
    """
    path_without_extension = get_path_without_extension(path)
    if not path_without_extension.exists():
        # https://stackoverflow.com/questions/6086603/statement-with-and-tarfile
        with closing(tarfile.open(path)) as fp:
            fp.extractall(path_without_extension)


def download_thhub_model(
    thhub_model_url: str,
    save_model_path: str,
) -> None:
    """
    Download tf hub model given URL.
    :param str thhub_model_url: tf hub model URL.
    :param str save_model_path: path to save model.
    """
    if not Path(save_model_path).exists():
        # download compressed model
        response = requests.get(f"{thhub_model_url}?tf-hub-format=compressed")

        # make dir if not exists
        # https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
        Path(save_model_path).parent.absolute().mkdir(parents=True, exist_ok=True)

        # save compressed model
        with open(save_model_path, mode="wb") as fp:
            fp.write(response.content)


def get_tokenizer_from_saved_model(saved_model: SavedModel) -> SentencepieceTokenizer:
    """
    Get tokenizer from tf SavedModel.
    :param SavedModel saved_model: tf SavedModel.
    :return: tokenizer.
    :rtype: SentencepieceTokenizer
    """
    # extract functions that contain SentencePiece somewhere in there
    functions_with_sp = [
        f
        for f in saved_model.meta_graphs[0].graph_def.library.function
        if "sentencepiecetokenizeop" in str(f).lower()
    ]
    assert len(functions_with_sp) == 1

    # find SentencepieceOp (contains the model) in the found function
    nodes_with_sp = [
        n for n in functions_with_sp[0].node_def if n.op == "SentencepieceOp"
    ]
    assert len(nodes_with_sp) == 1

    # we can pretty much save the model into a file since it does not change
    model = nodes_with_sp[0].attr["model"].s

    # instantiate the tokenizer from the serialized SentencePiece model
    tokenizer = SentencepieceTokenizer(model)

    return tokenizer


def tokenize(
    sentence: str, tokenizer: SentencepieceTokenizer, encoding: str = "utf-8"
) -> List[str]:
    """
    Tokenize sentence given tokenizer.
    :param str sentence: sentence to tokenize.
    :param SentencepieceTokenizer tokenizer: tokenizer.
    :param str encoding: encoding (default: "utf-8").
    :return: tokenized sentence.
    :rtype: List[str]
    """
    tokenized_sentence = []

    token_ids = tokenizer.tokenize(sentence).numpy()
    for token_id in token_ids:
        bytes_token = tokenizer.id_to_string(token_id).numpy()
        token = bytes_token.decode(encoding)
        tokenized_sentence.append(token)

    return tokenized_sentence
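
As the comment in get_tokenizer_from_saved_model notes, the serialized SentencePiece model does not change, so it can be cached on disk once and the tokenizer rebuilt from the raw bytes on later runs, without re-parsing the SavedModel. A sketch under that assumption; the .cache/muse_sentencepiece.model filename is an arbitrary choice, and the other paths mirror the notebook:

from pathlib import Path

from tensorflow.python.saved_model.loader_impl import parse_saved_model
from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer

from tf_tokenizer_utils import get_path_without_extension

save_model_path = ".cache/universal-sentence-encoder-multilingual_3.tar"
sp_model_path = Path(".cache/muse_sentencepiece.model")  # arbitrary cache filename

if not sp_model_path.exists():
    # one-off: pull the serialized SentencePiece model out of the SavedModel graph,
    # exactly as get_tokenizer_from_saved_model does, and persist the raw bytes
    saved_model = parse_saved_model(get_path_without_extension(save_model_path))
    functions_with_sp = [
        f
        for f in saved_model.meta_graphs[0].graph_def.library.function
        if "sentencepiecetokenizeop" in str(f).lower()
    ]
    nodes_with_sp = [
        n for n in functions_with_sp[0].node_def if n.op == "SentencepieceOp"
    ]
    # the [0] indexes assume the same single-match structure the helper asserts
    sp_model_path.write_bytes(nodes_with_sp[0].attr["model"].s)

# later runs: instantiate the tokenizer straight from the cached proto bytes
tokenizer = SentencepieceTokenizer(sp_model_path.read_bytes())
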
dayyass commented Jun 8, 2021

Inspired by tensorflow/hub#662

dayyass commented Jun 12, 2021

REST API for sentence tokenization and embedding using MUSE: https://github.com/dayyass/muse_as_service
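
That project wraps the same MUSE tokenizer and encoder behind HTTP endpoints. A rough sketch of calling such a service with requests; the host, port, route, and parameter name below are hypothetical placeholders, so check the repository's README for the actual API:

import requests

# hypothetical endpoint and parameter, for illustration only; see the repository for the real API
response = requests.get(
    "http://localhost:5000/tokenize",          # placeholder host/port/route
    params={"sentence": "Puppies are nice."},  # placeholder parameter name
)
print(response.json())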
