Created
January 7, 2026 14:25
-
-
Save mehulmpt/27f54783c1106b48e91d50e4d7e744e4 to your computer and use it in GitHub Desktop.
Simple tokenizer in TypeScript. Full video: https://www.youtube.com/watch?v=mRcf5qQSYws
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| In 2026, José said: “Hello, 世界! 👋🚀” while debugging const π = 3.14159; at 03:45 AM. | |
| His café bill was €12.50 (₹1045.75), uptime = 99.99%, latency ≤ 10 ms. | |
| Meanwhile العربية تُكتب من اليمين ← اليسار, हिंदी भी यहाँ है, and emojis like 🤖✨🔥 coexist with math ∑x², arrows → ⇄, and URLs such as https://example.com?q=テスト#α. | |
| Final check: naïve façade coöperate — does it tokenize correctly? ✅ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import fs from 'fs' | |
| import path from 'path' | |
/**
 * Counts how often each adjacent pair of tokens occurs in `data`.
 *
 * @param data - Token ids to scan, e.g. `[65, 32, 80, 114, 111, ...]`.
 * @returns `[count, [left, right]]` entries sorted by descending count.
 *   Pairs with the same count keep the order in which they were first seen.
 *   Empty and single-element inputs yield an empty array.
 */
function getPairStats(data: number[]): [number, [number, number]][] {
  // The pair is stored next to its count so the lookup key never has to be
  // parsed back into numbers (which would break on negative ids, for example).
  const stats = new Map<string, { count: number; pair: [number, number] }>()

  let previous: number | undefined
  for (const current of data) {
    if (previous !== undefined) {
      const key = `${previous},${current}`
      const entry = stats.get(key)
      if (entry === undefined) {
        stats.set(key, { count: 1, pair: [previous, current] })
      } else {
        entry.count += 1
      }
    }
    previous = current
  }

  // Map iteration follows insertion order, i.e. first-seen order of the pairs.
  const finalValue: [number, [number, number]][] = []
  for (const { count, pair } of stats.values()) {
    finalValue.push([count, pair])
  }

  // Array.prototype.sort is stable, so equal counts stay in first-seen order.
  return finalValue.sort((a, b) => b[0] - a[0])
}
/**
 * Replaces every occurrence of `mergePair` in `tokens` with `newTokenId`.
 *
 * The scan runs left to right and matches never overlap: once a pair has been
 * replaced, scanning resumes after its second element. For example, merging
 * `[7, 7]` in `[7, 7, 7]` yields `[newTokenId, 7]`.
 *
 * @param tokens - Token sequence to rewrite; it is not mutated.
 * @param mergePair - The adjacent `[left, right]` tokens to collapse.
 * @param newTokenId - The id that replaces each matched pair.
 * @returns A new array with all matches replaced.
 */
function performTokenSwapping({
  tokens,
  mergePair,
  newTokenId,
}: {
  tokens: number[]
  mergePair: [number, number]
  newTokenId: number
}): number[] {
  const [left, right] = mergePair
  const merged: number[] = []

  let i = 0
  while (i < tokens.length) {
    const current = tokens[i]
    const hasNext = i + 1 < tokens.length
    if (hasNext && current === left && tokens[i + 1] === right) {
      // Found the pair: emit the merged token and skip both source tokens.
      merged.push(newTokenId)
      i += 2
    } else {
      if (current != null) {
        merged.push(current)
      }
      i += 1
    }
  }

  return merged
}
/**
 * Trains a small byte-pair-encoding vocabulary on `data.txt` (read from the
 * directory of this file) and prints the results.
 *
 * Steps:
 *  1. Read the file and turn it into its UTF-8 bytes (token ids 0-255).
 *  2. Repeatedly merge the most frequent adjacent pair into a new token id
 *     (256, 257, ...) until the vocabulary reaches `sizeOfVocab` or no pairs
 *     are left.
 *  3. Log the original and final token counts and the ordered merge table.
 *  4. Log a sample encode/decode round trip of `'hello world!'`.
 */
function tokenize() {
  const str = fs.readFileSync(path.resolve(__dirname, 'data.txt'), 'utf-8')
  const bytes = [...Buffer.from(str, 'utf-8')]

  const sizeOfVocab = 300
  // Ids 0-255 are the raw byte values; every id above that is a learned merge.
  const iterationsRequired = sizeOfVocab - 256

  let tokensToOperateOn = [...bytes]
  // Learned merges, in the order they were learned (which is also their priority).
  const merges: { pair: [number, number]; newTokenId: number }[] = []

  for (let i = 0; i < iterationsRequired; i++) {
    const mostFrequent = getPairStats(tokensToOperateOn)[0]
    if (mostFrequent === undefined) {
      // Fewer than two tokens remain, so there is nothing left to merge.
      break
    }
    const mergePair = mostFrequent[1]
    const newTokenId = i + 256

    tokensToOperateOn = performTokenSwapping({
      tokens: tokensToOperateOn,
      mergePair,
      newTokenId,
    })
    merges.push({ pair: mergePair, newTokenId })
  }

  // Printable view of the merge table: ['left-right', newTokenId] per merge.
  const mergeDictOrdered: [string, number][] = merges.map(
    ({ pair, newTokenId }): [string, number] => [
      `${pair[0]}-${pair[1]}`,
      newTokenId,
    ]
  )

  console.log('Original', bytes.length)
  console.log('Final', tokensToOperateOn.length)
  console.log('Final', mergeDictOrdered)

  /**
   * Encodes `str` into token ids using the learned merges.
   *
   * Each merge is applied as one complete pass, in the order it was learned,
   * so a later merge can match tokens produced by an earlier one. This is the
   * same procedure the training loop uses.
   */
  function encode(str: string) {
    let tokens = [...Buffer.from(str, 'utf-8')]
    for (const { pair, newTokenId } of merges) {
      if (tokens.length < 2) {
        break // no adjacent pair can exist any more
      }
      tokens = performTokenSwapping({ tokens, mergePair: pair, newTokenId })
    }
    return tokens
  }

  /**
   * Decodes token ids back into a string by expanding every merged token
   * into the pair it stands for until only raw bytes remain.
   */
  function decode(tokens: number[]) {
    const expansions = new Map<number, [number, number]>()
    for (const { pair, newTokenId } of merges) {
      expansions.set(newTokenId, pair)
    }

    const bytes: number[] = []
    // Stack of tokens still to be processed; the next token is on top.
    const pending = [...tokens].reverse()
    while (pending.length > 0) {
      const token = pending.pop()
      if (token === undefined) {
        break
      }
      const pair = expansions.get(token)
      if (pair === undefined) {
        bytes.push(token)
      } else {
        // Push right first so left is expanded first; either half may itself
        // be a merged token (merges can be nested several layers deep).
        pending.push(pair[1], pair[0])
      }
    }

    return Buffer.from(bytes).toString('utf-8')
  }

  console.log('Encoding', encode('hello world!'))
  console.log('Decoding', decode(encode('hello world!')))
}

tokenize()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment