Skip to content

Instantly share code, notes, and snippets.

@dayyass
Last active September 5, 2023 08:19
Show Gist options
  • Save dayyass/d02036838213fab1f8fab4837279f7b9 to your computer and use it in GitHub Desktop.
Save dayyass/d02036838213fab1f8fab4837279f7b9 to your computer and use it in GitHub Desktop.
How to get and use tokenizer from "universal-sentence-encoder-multilingual".
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
jupyter==1.0.0
requests==2.25.1
tensorflow==2.5.0
tensorflow_text==2.5.0
import tarfile
from contextlib import closing
from pathlib import Path
from typing import List
import requests
from tensorflow.core.protobuf.saved_model_pb2 import SavedModel
from tensorflow.python.saved_model.loader_impl import parse_saved_model
from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer
def get_path_without_extension(path: str) -> Path:
    """
    Strip the final extension from a path.
    :param str path: path.
    :return: path without extension.
    :rtype: Path
    """
    as_path = Path(path)
    # parent / stem drops only the last suffix, e.g. "a/b.tar" -> "a/b"
    return as_path.parent / as_path.stem
def unpack_tar(path: str) -> None:
    """
    Unpack a .tar archive into a sibling directory named after the archive.

    Extraction is skipped if the target directory already exists,
    so repeated calls are cheap no-ops.

    :param str path: path to .tar file.
    """
    path_without_extension = get_path_without_extension(path)
    if not path_without_extension.exists():
        # tarfile.open() is itself a context manager; contextlib.closing()
        # is unnecessary.
        # NOTE(review): extractall() without a `filter` trusts archive member
        # paths (tar path-traversal risk). Safe for archives we download from
        # TF Hub ourselves; pass filter="data" if the runtime is Python >= 3.12.
        with tarfile.open(path) as fp:
            fp.extractall(path_without_extension)
def download_thhub_model(
    thhub_model_url: str,
    save_model_path: str,
) -> None:
    """
    Download a TF Hub model (compressed .tar.gz) given its URL.

    Skips the download entirely if `save_model_path` already exists.

    :param str thhub_model_url: TF Hub model URL.
    :param str save_model_path: path to save the compressed model to.
    :raises requests.HTTPError: if the server returns an error status.
    :raises requests.Timeout: if the server stops responding.
    """
    if not Path(save_model_path).exists():
        # download compressed model; timeout prevents hanging forever on a
        # dead connection (per-socket-op, not total-download, timeout)
        response = requests.get(
            f"{thhub_model_url}?tf-hub-format=compressed", timeout=60
        )
        # fail loudly instead of silently saving an HTML error page as a model
        response.raise_for_status()
        # make dir if not exists
        # https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
        Path(save_model_path).parent.absolute().mkdir(parents=True, exist_ok=True)
        # save compressed model
        with open(save_model_path, mode="wb") as fp:
            fp.write(response.content)
def get_tokenizer_from_saved_model(saved_model: SavedModel) -> SentencepieceTokenizer:
    """
    Extract the SentencePiece tokenizer embedded in a tf SavedModel.

    Searches the SavedModel graph for the single function containing a
    SentencepieceTokenizeOp, pulls the serialized SentencePiece model out of
    its SentencepieceOp node attribute, and instantiates a tokenizer from it.

    :param SavedModel saved_model: tf SavedModel proto.
    :return: tokenizer.
    :rtype: SentencepieceTokenizer
    :raises ValueError: if the model does not contain exactly one
        SentencePiece function/op.
    """
    # extract functions that contain SentencePiece somewhere in there
    functions_with_sp = [
        f
        for f in saved_model.meta_graphs[0].graph_def.library.function
        if "sentencepiecetokenizeop" in str(f).lower()
    ]
    # explicit raise instead of assert: asserts are stripped under `python -O`
    if len(functions_with_sp) != 1:
        raise ValueError(
            f"expected exactly 1 SentencePiece function, found {len(functions_with_sp)}"
        )
    # find SentencePieceOp (contains the model) in the found function
    nodes_with_sp = [
        n for n in functions_with_sp[0].node_def if n.op == "SentencepieceOp"
    ]
    if len(nodes_with_sp) != 1:
        raise ValueError(
            f"expected exactly 1 SentencepieceOp node, found {len(nodes_with_sp)}"
        )
    # we can pretty much save the model into a file since it does not change
    model = nodes_with_sp[0].attr["model"].s
    # instantiate the model
    tokenizer = SentencepieceTokenizer(model)
    return tokenizer
def tokenize(
    sentence: str, tokenizer: SentencepieceTokenizer, encoding: str = "utf-8"
) -> List[str]:
    """
    Split a sentence into string tokens using a SentencePiece tokenizer.

    :param str sentence: sentence to tokenize.
    :param SentencepieceTokenizer tokenizer: tokenizer.
    :param str encoding: encoding (default: "utf-8").
    :return: tokenized sentence.
    :rtype: List[str]
    """
    # tokenize to ids, then map each id back to its (bytes) token string
    return [
        tokenizer.id_to_string(token_id).numpy().decode(encoding)
        for token_id in tokenizer.tokenize(sentence).numpy()
    ]
@dayyass
Copy link
Author

dayyass commented Jun 12, 2021

REST API for sentence tokenization and embedding using MUSE: https://github.com/dayyass/muse_as_service

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment