benathi · October 1, 2023 02:21
diff --git a/tiktoken_demo.ipynb b/tiktoken_demo.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d9d06852-51f8-4891-b2b6-e5f9e7c9f5b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting with individual bytes of the input (parts) =  [b'h', b'e', b'l', b'l', b'o']\n",
      "\u001b[48;5;167mh\u001b[48;5;179me\u001b[48;5;185ml\u001b[48;5;77ml\u001b[48;5;80mo\u001b[0m\n",
      "Merging b'e' and b'l' since b'el' has the lowest rank (301) out of all pairs\n",
      "\n",
      "\u001b[48;5;167mh\u001b[48;5;179mel\u001b[48;5;77ml\u001b[48;5;80mo\u001b[0m\n",
      "Merging b'l' and b'o' since b'lo' has the lowest rank (385) out of all pairs\n",
      "\n",
      "\u001b[48;5;167mh\u001b[48;5;179mel\u001b[48;5;77mlo\u001b[0m\n",
      "Merging b'el' and b'lo' since b'ello' has the lowest rank (4896) out of all pairs\n",
      "\n",
      "\u001b[48;5;167mh\u001b[48;5;179mello\u001b[0m\n",
      "Merging b'h' and b'ello' since b'hello' has the lowest rank (15339) out of all pairs\n",
      "\n",
      "\u001b[48;5;167mhello\u001b[0m\n",
      "No pair is found in the vocabulary. BPE step finished for pre-token b'hello'!\n",
      "\n",
      "\n",
      "\n",
      "Starting with individual bytes of the input (parts) =  [b' ', b'w', b'o', b'r', b'l', b'd', b'd', b'd', b'd', b'd']\n",
      "\u001b[48;5;167m \u001b[48;5;179mw\u001b[48;5;185mo\u001b[48;5;77mr\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
      "Merging b'o' and b'r' since b'or' has the lowest rank (269) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m \u001b[48;5;179mw\u001b[48;5;185mor\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
      "Merging b' ' and b'w' since b' w' has the lowest rank (289) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
      "Merging b'l' and b'd' since b'ld' has the lowest rank (509) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
      "Merging b'd' and b'd' since b'dd' has the lowest rank (634) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134mdd\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
      "Merging b'd' and b'd' since b'dd' has the lowest rank (634) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
      "Merging b'or' and b'ld' since b'orld' has the lowest rank (1410) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m w\u001b[48;5;185morld\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
      "Merging b' w' and b'orld' since b' world' has the lowest rank (1917) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m world\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
      "Merging b'dd' and b'dd' since b'dddd' has the lowest rank (65200) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m world\u001b[48;5;134mdddd\u001b[0m\n",
      "No pair is found in the vocabulary. BPE step finished for pre-token b' worlddddd'!\n",
      "\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['hello', ' world', 'dddd']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tiktoken._educational import *\n",
    "enc = SimpleBytePairEncoding.from_tiktoken(\"cl100k_base\")\n",
    "[enc.decode([x]) for x in enc.encode(\"hello worlddddd\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "77246607-5ec4-4c7f-ac47-e670dda21fbf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting with individual bytes of the input (parts) =  [b'\\xf0', b'\\x9f', b'\\x90', b'\\xb1']\n",
      "\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[48;5;77m�\u001b[0m\n",
      "Merging b'\\xf0' and b'\\x9f' since b'\\xf0\\x9f' has the lowest rank (9468) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[0m\n",
      "No pair is found in the vocabulary. BPE step finished for pre-token b'\\xf0\\x9f\\x90\\xb1'!\n",
      "\n",
      "\n",
      "\n",
      "[9468, 238, 109]\n",
      "🐱\n"
     ]
    }
   ],
   "source": [
    "x = enc.encode(\"🐱\")\n",
    "print(x)\n",
    "print(enc.decode(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "883e3267-d19e-4bdd-ba2d-fe060a1c0826",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting with individual bytes of the input (parts) =  [b'\\xe3', b'\\x82', b'\\xab']\n",
      "\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[0m\n",
      "Merging b'\\xe3' and b'\\x82' since b'\\xe3\\x82' has the lowest rank (3484) out of all pairs\n",
      "\n",
      "\u001b[48;5;167m�\u001b[48;5;179m�\u001b[0m\n",
      "Merging b'\\xe3\\x82' and b'\\xab' since b'\\xe3\\x82\\xab' has the lowest rank (71493) out of all pairs\n",
      "\n",
      "\u001b[48;5;167mカ\u001b[0m\n",
      "No pair is found in the vocabulary. BPE step finished for pre-token b'\\xe3\\x82\\xab'!\n",
      "\n",
      "\n",
      "\n",
      "[71493]\n",
      "カ\n"
     ]
    }
   ],
   "source": [
    "x = enc.encode(\"カ\")\n",
    "print(x)\n",
    "print(enc.decode(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b93cb459-4234-43c3-ac13-490f9f624555",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "d9d06852-51f8-4891-b2b6-e5f9e7c9f5b0",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Starting with individual bytes of the input (parts) = [b'h', b'e', b'l', b'l', b'o']\n",
	"\u001b[48;5;167mh\u001b[48;5;179me\u001b[48;5;185ml\u001b[48;5;77ml\u001b[48;5;80mo\u001b[0m\n",
	"Merging b'e' and b'l' since b'el' has the lowest rank (301) out of all pairs\n",
	"\n",
	"\u001b[48;5;167mh\u001b[48;5;179mel\u001b[48;5;77ml\u001b[48;5;80mo\u001b[0m\n",
	"Merging b'l' and b'o' since b'lo' has the lowest rank (385) out of all pairs\n",
	"\n",
	"\u001b[48;5;167mh\u001b[48;5;179mel\u001b[48;5;77mlo\u001b[0m\n",
	"Merging b'el' and b'lo' since b'ello' has the lowest rank (4896) out of all pairs\n",
	"\n",
	"\u001b[48;5;167mh\u001b[48;5;179mello\u001b[0m\n",
	"Merging b'h' and b'ello' since b'hello' has the lowest rank (15339) out of all pairs\n",
	"\n",
	"\u001b[48;5;167mhello\u001b[0m\n",
	"No pair is found in the vocabulary. BPE step finished for pre-token b'hello'!\n",
	"\n",
	"\n",
	"\n",
	"Starting with individual bytes of the input (parts) = [b' ', b'w', b'o', b'r', b'l', b'd', b'd', b'd', b'd', b'd']\n",
	"\u001b[48;5;167m \u001b[48;5;179mw\u001b[48;5;185mo\u001b[48;5;77mr\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
	"Merging b'o' and b'r' since b'or' has the lowest rank (269) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m \u001b[48;5;179mw\u001b[48;5;185mor\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
	"Merging b' ' and b'w' since b' w' has the lowest rank (289) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80ml\u001b[48;5;68md\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
	"Merging b'l' and b'd' since b'ld' has the lowest rank (509) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134md\u001b[48;5;167md\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
	"Merging b'd' and b'd' since b'dd' has the lowest rank (634) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134mdd\u001b[48;5;179md\u001b[48;5;185md\u001b[0m\n",
	"Merging b'd' and b'd' since b'dd' has the lowest rank (634) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m w\u001b[48;5;185mor\u001b[48;5;80mld\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
	"Merging b'or' and b'ld' since b'orld' has the lowest rank (1410) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m w\u001b[48;5;185morld\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
	"Merging b' w' and b'orld' since b' world' has the lowest rank (1917) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m world\u001b[48;5;134mdd\u001b[48;5;179mdd\u001b[0m\n",
	"Merging b'dd' and b'dd' since b'dddd' has the lowest rank (65200) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m world\u001b[48;5;134mdddd\u001b[0m\n",
	"No pair is found in the vocabulary. BPE step finished for pre-token b' worlddddd'!\n",
	"\n",
	"\n",
	"\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"['hello', ' world', 'dddd']"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from tiktoken._educational import *\n",
	"enc = SimpleBytePairEncoding.from_tiktoken(\"cl100k_base\")\n",
	"[enc.decode([x]) for x in enc.encode(\"hello worlddddd\")]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "77246607-5ec4-4c7f-ac47-e670dda21fbf",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Starting with individual bytes of the input (parts) = [b'\\xf0', b'\\x9f', b'\\x90', b'\\xb1']\n",
	"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[48;5;77m�\u001b[0m\n",
	"Merging b'\\xf0' and b'\\x9f' since b'\\xf0\\x9f' has the lowest rank (9468) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[0m\n",
	"No pair is found in the vocabulary. BPE step finished for pre-token b'\\xf0\\x9f\\x90\\xb1'!\n",
	"\n",
	"\n",
	"\n",
	"[9468, 238, 109]\n",
	"🐱\n"
	]
	}
	],
	"source": [
	"x = enc.encode(\"🐱\")\n",
	"print(x)\n",
	"print(enc.decode(x))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "883e3267-d19e-4bdd-ba2d-fe060a1c0826",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Starting with individual bytes of the input (parts) = [b'\\xe3', b'\\x82', b'\\xab']\n",
	"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[48;5;185m�\u001b[0m\n",
	"Merging b'\\xe3' and b'\\x82' since b'\\xe3\\x82' has the lowest rank (3484) out of all pairs\n",
	"\n",
	"\u001b[48;5;167m�\u001b[48;5;179m�\u001b[0m\n",
	"Merging b'\\xe3\\x82' and b'\\xab' since b'\\xe3\\x82\\xab' has the lowest rank (71493) out of all pairs\n",
	"\n",
	"\u001b[48;5;167mカ\u001b[0m\n",
	"No pair is found in the vocabulary. BPE step finished for pre-token b'\\xe3\\x82\\xab'!\n",
	"\n",
	"\n",
	"\n",
	"[71493]\n",
	"カ\n"
	]
	}
	],
	"source": [
	"x = enc.encode(\"カ\")\n",
	"print(x)\n",
	"print(enc.decode(x))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b93cb459-4234-43c3-ac13-490f9f624555",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.18"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}