How to get and use the tokenizer from "universal-sentence-encoder-multilingual".
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5aa8a12c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tf_tokenizer_utils import (\n",
    "    download_thhub_model,\n",
    "    get_path_without_extension,\n",
    "    get_tokenizer_from_saved_model,\n",
    "    parse_saved_model,\n",
    "    tokenize,\n",
    "    unpack_tar,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f62d33d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# init variable\n",
    "thhub_model_url = \"https://tfhub.dev/google/universal-sentence-encoder-multilingual/3\"\n",
    "save_model_path = \".cache/universal-sentence-encoder-multilingual_3.tar\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f7b6bab8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load and unpack model\n",
    "download_thhub_model(\n",
    "    thhub_model_url=thhub_model_url,\n",
    "    save_model_path=save_model_path,\n",
    ")\n",
    "unpack_tar(path=save_model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c5207ad4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get tokenizer\n",
    "tokenizer = get_tokenizer_from_saved_model(\n",
    "    parse_saved_model(\n",
    "        get_path_without_extension(save_model_path)\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "380abd77",
   "metadata": {},
   "source": [
    "### use tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a4fd4212",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Some texts of different lengths.\n",
    "english_sentences = [\"dog\", \"Puppies are nice.\", \"I enjoy taking long walks along the beach with my dog.\"]\n",
    "italian_sentences = [\"cane\", \"I cuccioli sono carini.\", \"Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.\"]\n",
    "japanese_sentences = [\"犬\", \"子犬はいいです\", \"私は犬と一緒にビーチを散歩するのが好きです\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "26ed85da",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dog -> ['▁dog']\n",
      "Puppies are nice. -> ['▁Pupp', 'ies', '▁are', '▁nice', '.']\n",
      "I enjoy taking long walks along the beach with my dog. -> ['▁I', '▁enjoy', '▁taking', '▁long', '▁walk', 's', '▁along', '▁the', '▁beach', '▁with', '▁my', '▁dog', '.']\n"
     ]
    }
   ],
   "source": [
    "for sentence in english_sentences:\n",
    "    tokenized_sentence = tokenize(\n",
    "        sentence=sentence,\n",
    "        tokenizer=tokenizer,\n",
    "    )\n",
    "    print(f\"{sentence} -> {tokenized_sentence}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d788e5bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cane -> ['▁cane']\n",
      "I cuccioli sono carini. -> ['▁I', '▁cu', 'ccioli', '▁sono', '▁car', 'ini', '.']\n",
      "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane. -> ['▁Mi', '▁piace', '▁fare', '▁lunghe', '▁passeggiat', 'e', '▁lungo', '▁la', '▁spiaggia', '▁con', '▁il', '▁mio', '▁cane', '.']\n"
     ]
    }
   ],
   "source": [
    "for sentence in italian_sentences:\n",
    "    tokenized_sentence = tokenize(\n",
    "        sentence=sentence,\n",
    "        tokenizer=tokenizer,\n",
    "    )\n",
    "    print(f\"{sentence} -> {tokenized_sentence}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9664c90f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "犬 -> ['▁', '犬']\n",
      "子犬はいいです -> ['▁', '子', '犬', 'は', 'いい', 'です']\n",
      "私は犬と一緒にビーチを散歩するのが好きです -> ['▁私', 'は', '犬', 'と一緒に', 'ビーチ', 'を', '散', '歩', 'するのが', '好き', 'です']\n"
     ]
    }
   ],
   "source": [
    "for sentence in japanese_sentences:\n",
    "    tokenized_sentence = tokenize(\n",
    "        sentence=sentence,\n",
    "        tokenizer=tokenizer,\n",
    "    )\n",
    "    print(f\"{sentence} -> {tokenized_sentence}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "883211e8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
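For reference, the extracted SentencepieceTokenizer is not limited to the string-level tokenize() helper used above: it also round-trips token ids, which is what the encoder itself consumes. A minimal sketch, assuming the tokenizer object from the notebook above and a working tensorflow_text install; the printed values are illustrative.

# id-level round trip with the extracted tokenizer (sketch, not part of the original notebook)
sentence = "Puppies are nice."

token_ids = tokenizer.tokenize(sentence)    # 1-D int tensor of SentencePiece ids
restored = tokenizer.detokenize(token_ids)  # scalar byte string

print(token_ids.numpy())
print(restored.numpy().decode("utf-8"))         # back to "Puppies are nice."
print(int(tokenizer.vocab_size().numpy()))      # size of the embedded SentencePiece vocabulary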
Pinned dependencies:
jupyter==1.0.0
requests==2.25.1
tensorflow==2.5.0
tensorflow_text==2.5.0
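These pinned versions match the notebook environment above (Python 3.9, TensorFlow 2.5); assuming the list is saved as requirements.txt, it can be installed with pip install -r requirements.txt.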
tf_tokenizer_utils.py (the helper module imported in the first notebook cell):
import tarfile
from contextlib import closing
from pathlib import Path
from typing import List

import requests
from tensorflow.core.protobuf.saved_model_pb2 import SavedModel

# parse_saved_model is re-exported here so the notebook can import it from this module
from tensorflow.python.saved_model.loader_impl import parse_saved_model
from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer


def get_path_without_extension(path: str) -> Path:
    """
    Get path without extension.

    :param str path: path.
    :return: path without extension.
    :rtype: Path
    """
    path = Path(path)
    return path.parent.joinpath(path.stem)


def unpack_tar(path: str) -> None:
    """
    Unpack .tar file given path.

    :param str path: path to .tar file.
    """
    path_without_extension = get_path_without_extension(path)

    if not path_without_extension.exists():
        # https://stackoverflow.com/questions/6086603/statement-with-and-tarfile
        with closing(tarfile.open(path)) as fp:
            fp.extractall(path_without_extension)


def download_thhub_model(
    thhub_model_url: str,
    save_model_path: str,
) -> None:
    """
    Download tf hub model given URL.

    :param str thhub_model_url: tf hub model URL.
    :param str save_model_path: path to save model.
    """
    if not Path(save_model_path).exists():
        # download compressed model
        response = requests.get(f"{thhub_model_url}?tf-hub-format=compressed")

        # make dir if not exists
        # https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
        Path(save_model_path).parent.absolute().mkdir(parents=True, exist_ok=True)

        # save compressed model
        with open(save_model_path, mode="wb") as fp:
            fp.write(response.content)


def get_tokenizer_from_saved_model(saved_model: SavedModel) -> SentencepieceTokenizer:
    """
    Get tokenizer from tf SavedModel.

    :param SavedModel saved_model: tf SavedModel.
    :return: tokenizer.
    :rtype: SentencepieceTokenizer
    """
    # extract functions that contain SentencePiece somewhere in there
    functions_with_sp = [
        f
        for f in saved_model.meta_graphs[0].graph_def.library.function
        if "sentencepiecetokenizeop" in str(f).lower()
    ]
    assert len(functions_with_sp) == 1

    # find SentencePieceOp (contains the model) in the found function
    nodes_with_sp = [
        n for n in functions_with_sp[0].node_def if n.op == "SentencepieceOp"
    ]
    assert len(nodes_with_sp) == 1

    # we can pretty much save the model into a file since it does not change
    model = nodes_with_sp[0].attr["model"].s

    # instantiate the model
    tokenizer = SentencepieceTokenizer(model)

    return tokenizer


def tokenize(
    sentence: str, tokenizer: SentencepieceTokenizer, encoding: str = "utf-8"
) -> List[str]:
    """
    Tokenize sentence given tokenizer.

    :param str sentence: sentence to tokenize.
    :param SentencepieceTokenizer tokenizer: tokenizer.
    :param str encoding: encoding (default: "utf-8").
    :return: tokenized sentence.
    :rtype: List[str]
    """
    tokenized_sentence = []

    token_ids = tokenizer.tokenize(sentence).numpy()
    for token_id in token_ids:
        bytes_token = tokenizer.id_to_string(token_id).numpy()
        token = bytes_token.decode(encoding)
        tokenized_sentence.append(token)

    return tokenized_sentence
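As the comment in get_tokenizer_from_saved_model notes, the serialized SentencePiece model does not change, so it can be written to disk once and the tokenizer rebuilt later without re-parsing the whole SavedModel. A hedged sketch: the helper names and the muse_sentencepiece.model file name are illustrative, and the raw model bytes would first have to be exposed (for example by also returning nodes_with_sp[0].attr["model"].s from the function above).

from pathlib import Path

from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer


def save_sentencepiece_model(model_bytes: bytes, path: str = "muse_sentencepiece.model") -> None:
    """Write the serialized SentencePiece model proto to disk (illustrative helper)."""
    Path(path).write_bytes(model_bytes)


def load_tokenizer(path: str = "muse_sentencepiece.model") -> SentencepieceTokenizer:
    """Rebuild the tokenizer from a previously saved model proto (illustrative helper)."""
    return SentencepieceTokenizer(Path(path).read_bytes())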
REST API for sentence tokenization and embedding using MUSE: https://github.com/dayyass/muse_as_service
Inspired by tensorflow/hub#662