Skip to content

Instantly share code, notes, and snippets.

@mehulmpt
Created January 7, 2026 14:25
Show Gist options
  • Select an option

  • Save mehulmpt/27f54783c1106b48e91d50e4d7e744e4 to your computer and use it in GitHub Desktop.

Select an option

Save mehulmpt/27f54783c1106b48e91d50e4d7e744e4 to your computer and use it in GitHub Desktop.
Simple tokenizer in TypeScript. Full video: https://www.youtube.com/watch?v=mRcf5qQSYws
In 2026, José said: “Hello, 世界! 👋🚀” while debugging const π = 3.14159; at 03:45 AM.
His café bill was €12.50 (₹1045.75), uptime = 99.99%, latency ≤ 10 ms.
Meanwhile العربية تُكتب من اليمين ← اليسار, हिंदी भी यहाँ है, and emojis like 🤖✨🔥 coexist with math ∑x², arrows → ⇄, and URLs such as [https://example.com?q=テスト#α](https://example.com?q=テスト#α).
Final check: naïve façade coöperate — does it tokenize correctly? ✅
import fs from 'fs'
import path from 'path'
/**
 * Counts how often each adjacent pair of token ids occurs in `data`.
 *
 * Every index contributes the pair formed with its right-hand neighbour, so
 * overlapping occurrences are all counted (`[5, 5, 5]` yields the pair
 * `5, 5` twice).
 *
 * @param data - Sequence of token ids, e.g. `[65, 32, 80, 114, ...]`.
 * @returns Entries shaped `[count, [first, second]]`, ordered by `count`
 *   descending. Pairs with equal counts stay in first-seen order.
 */
function getPairStats(data: number[]) {
  // Keyed by "first-second"; a Map iterates in insertion order.
  const occurrences = new Map<string, number>()
  for (let pos = 1; pos < data.length; pos++) {
    const pairKey = `${data[pos - 1]}-${data[pos]}`
    occurrences.set(pairKey, (occurrences.get(pairKey) ?? 0) + 1)
  }

  const ranked = Array.from(
    occurrences,
    ([pairKey, count]): [number, [number, number]] => [
      count,
      // Turn the "first-second" key back into its two numeric ids.
      pairKey.split('-').map((part) => parseInt(part, 10)) as [number, number],
    ]
  )
  ranked.sort((left, right) => right[0] - left[0])
  return ranked
}
/**
 * Replaces occurrences of `mergePair` in `tokens` with `newTokenId`.
 *
 * The scan runs left to right and matches never overlap: once two tokens
 * are merged, the second of them cannot start another match
 * (`[5, 5, 5]` with pair `[5, 5]` becomes `[newTokenId, 5]`).
 *
 * @param tokens - Token ids to scan; this array is left untouched.
 * @param mergePair - The two adjacent ids to collapse.
 * @param newTokenId - Id written in place of each matched pair.
 * @returns A new array with every match collapsed into `newTokenId`.
 */
function performTokenSwapping({
  tokens,
  mergePair,
  newTokenId,
}: {
  tokens: number[]
  mergePair: [number, number]
  newTokenId: number
}): number[] {
  const [first, second] = mergePair
  const merged: number[] = []
  // True when the current token is the right half of a pair already merged.
  let consumedByPreviousMatch = false

  tokens.forEach((token, index) => {
    if (consumedByPreviousMatch) {
      consumedByPreviousMatch = false
      return
    }
    const hasRightNeighbour = index + 1 < tokens.length
    if (hasRightNeighbour && token === first && tokens[index + 1] === second) {
      merged.push(newTokenId)
      consumedByPreviousMatch = true
    } else {
      merged.push(token)
    }
  })

  return merged
}
/**
 * Trains a small byte-pair-encoding vocabulary on `data.txt` (resolved next
 * to this file) and prints a demo.
 *
 * Steps:
 * 1. Read `data.txt` and turn it into its UTF-8 bytes (ids 0-255).
 * 2. Up to `sizeOfVocab - 256` times, find the most frequent adjacent pair
 *    and replace it with a fresh id starting at 256. Training stops early
 *    when no adjacent pair is left.
 * 3. Log the original and final token counts and the ordered merge table.
 * 4. Log the encoding of `'hello world!'` and its decoded round trip.
 *
 * Relies on `getPairStats` and `performTokenSwapping` from this file.
 */
function tokenize() {
  const str = fs.readFileSync(path.resolve(__dirname, 'data.txt'), 'utf-8')
  const bytes = [...Buffer.from(str, 'utf-8')]
  const sizeOfVocab = 300
  // Ids 0-255 are the raw byte values; each training round adds one id above them.
  const iterationsRequired = sizeOfVocab - 256
  let tokensToOperateOn = [...bytes]
  // Merge table in the order the merges were learned, keyed "first-second" (this is what gets logged).
  const mergeDictOrdered: [`${number}-${number}`, number][] = []
  // The same merges in numeric form, so encode/decode need no string parsing.
  const mergeRules: { pair: [number, number]; id: number }[] = []

  for (let i = 0; i < iterationsRequired; i++) {
    const mostFrequent = getPairStats(tokensToOperateOn)[0]
    // Fewer than two tokens remain, so there is nothing left to merge.
    if (mostFrequent == null) break

    const mergePair = mostFrequent[1]
    const newTokenId = i + 256
    tokensToOperateOn = performTokenSwapping({
      tokens: tokensToOperateOn,
      mergePair,
      newTokenId,
    })
    mergeDictOrdered.push([`${mergePair[0]}-${mergePair[1]}`, newTokenId])
    mergeRules.push({ pair: mergePair, id: newTokenId })
  }

  console.log('Original', bytes.length)
  console.log('Final', tokensToOperateOn.length)
  console.log('Final', mergeDictOrdered)

  // Lookup from a merged id back to the two ids it stands for.
  const reverseDictionary = new Map<number, [number, number]>()
  for (const { pair, id } of mergeRules) {
    reverseDictionary.set(id, pair)
  }

  /**
   * Encodes `str` by applying the learned merges, in training order, to its
   * UTF-8 bytes.
   *
   * @param str - Text to encode.
   * @returns Token ids (raw bytes 0-255 and merged ids from 256 up).
   */
  function encode(str: string) {
    let tokens = [...Buffer.from(str, 'utf-8')]
    for (const { pair, id } of mergeRules) {
      // Each rule must run on the compacted output of the previous one.
      // Otherwise a freshly merged token is not adjacent to its right
      // neighbour and later rules that build on it can never match.
      tokens = performTokenSwapping({
        tokens,
        mergePair: pair,
        newTokenId: id,
      })
    }
    return tokens
  }

  /**
   * Decodes token ids produced by `encode` back into text.
   *
   * @param tokens - Token ids to expand.
   * @returns The UTF-8 decoded string.
   */
  function decode(tokens: number[]) {
    const bytes: number[] = []
    // Work stack with the leftmost token on top. A merged id is replaced by
    // its two halves, which may themselves be merged ids (layers of merges).
    const pending = [...tokens].reverse()
    for (
      let token = pending.pop();
      token !== undefined;
      token = pending.pop()
    ) {
      const halves = reverseDictionary.get(token)
      if (halves == null) {
        bytes.push(token)
      } else {
        // Push the right half first so the left half is expanded next.
        pending.push(halves[1], halves[0])
      }
    }
    return Buffer.from(bytes).toString('utf-8')
  }

  console.log('Encoding', encode('hello world!'))
  console.log('Decoding', decode(encode('hello world!')))
}
// Script entry point: runs the training and demo as soon as the file is executed.
tokenize()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment