Skip to content

Instantly share code, notes, and snippets.

@Fashad-Ahmed
Created October 27, 2025 18:48
Show Gist options
  • Select an option

  • Save Fashad-Ahmed/5161b30a76e13614754dd220df5dde16 to your computer and use it in GitHub Desktop.

Select an option

Save Fashad-Ahmed/5161b30a76e13614754dd220df5dde16 to your computer and use it in GitHub Desktop.
/**
* Example implementation of proper tokenization for Llama and Gemini
* This file shows how to implement Option 4 with full model-specific tokenization
*
* To use this:
* 1. Install required dependencies: npm install @huggingface/tokenizers
* 2. Replace the current token-counter.util.ts with this implementation
* 3. Update all countTokens calls to handle async properly
*/
import { encoding_for_model, get_encoding, type Tiktoken } from 'tiktoken';
import { Model } from '../../debate/enums/model.enum';
// import { Tokenizer } from '@huggingface/tokenizers'; // Uncomment when ready
export class TokenCounter {
  /** Rough characters-per-token ratio used when no real tokenizer is available. */
  private static readonly FALLBACK_CHARS_PER_TOKEN = 4;

  /** Cache of tiktoken encoders keyed by model so each encoder is built only once. */
  private static encodingCache = new Map<string, Tiktoken>();

  // private static llamaTokenizer: Tokenizer | null = null;
  // private static geminiTokenizer: any = null;

  /**
   * Load the Llama tokenizer from Hugging Face.
   *
   * Currently a stub: returns null so callers fall back to approximation.
   * Uncomment the @huggingface/tokenizers wiring below once the dependency
   * is installed (see the file header for setup steps).
   */
  private static async getLlamaTokenizer() {
    // const { Tokenizer } = await import('@huggingface/tokenizers');
    // if (!this.llamaTokenizer) {
    //   this.llamaTokenizer = await Tokenizer.from_pretrained("meta-llama/Llama-4");
    // }
    // return this.llamaTokenizer;
    // For now, return null to use the fallback approximation.
    return null;
  }

  /**
   * Count tokens for Llama models using proper SentencePiece tokenization.
   * Falls back to the character-based approximation while the tokenizer stub
   * returns null, or if tokenization throws.
   */
  private static async countLlamaTokens(text: string): Promise<number> {
    try {
      const tokenizer = await this.getLlamaTokenizer();
      if (tokenizer) {
        const encoded = await tokenizer.encode(text);
        return encoded.length;
      }
    } catch (error) {
      console.warn('Llama tokenization failed, using fallback:', error);
    }
    // Fallback to approximation
    return this.fallbackTokenCount(text);
  }

  /**
   * Count tokens for Gemini models.
   *
   * Note: Google does not publish Gemini's SentencePiece tokenizer, so this
   * is always an approximation. Kept async so the signature matches
   * countLlamaTokens and a real implementation can drop in later.
   */
  private static async countGeminiTokens(text: string): Promise<number> {
    return this.fallbackTokenCount(text);
  }

  /**
   * Resolve (and cache) a tiktoken encoding for the given model.
   *
   * Returns null for models that require async tokenization (Llama, Gemini)
   * or when tiktoken initialization fails; callers must then use a fallback.
   *
   * NOTE(review): all non-OpenAI models are mapped to the gpt-4 (cl100k_base)
   * encoding as an approximation — their real tokenizers differ.
   */
  private static getEncodingForModel(model: Model): Tiktoken | null {
    const cacheKey = model;
    if (this.encodingCache.has(cacheKey)) {
      return this.encodingCache.get(cacheKey) || null;
    }
    try {
      let encoding: Tiktoken;
      switch (model) {
        case Model.GPT_5_mini:
        case Model.GPT_4o_mini:
        case Model.Grok_4:
        case Model.Claude_3_5_Sonnet:
        case Model.Claude_3_5_Haiku:
        case Model.DeepSeek_V3:
          encoding = encoding_for_model('gpt-4');
          break;
        case Model.Llama_4:
        case Model.Gemini_Pro:
          // Handled by the async count*Tokens methods; null triggers that path.
          return null;
        default:
          encoding = get_encoding('cl100k_base');
      }
      this.encodingCache.set(cacheKey, encoding);
      return encoding;
    } catch (error) {
      console.warn(`Failed to get encoding for model ${model}:`, error);
      return null;
    }
  }

  /**
   * Synchronous tiktoken token count, shared by countTokens and
   * countTokensSync. Returns null when no encoding is available or
   * encoding fails, signalling the caller to use the fallback.
   */
  private static tiktokenCount(text: string, model: Model): number | null {
    try {
      const encoding = this.getEncodingForModel(model);
      if (encoding) {
        return encoding.encode(text).length;
      }
    } catch (error) {
      console.warn(`Token counting failed for model ${model}:`, error);
    }
    return null;
  }

  /**
   * Count tokens for `text`, with proper async handling for Llama and Gemini.
   *
   * @param text  Input text; empty/falsy input counts as 0 tokens.
   * @param model Target model (defaults to GPT-4o-mini).
   * @returns Exact count where a tokenizer exists, otherwise an approximation.
   */
  static async countTokens(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    if (!text) return 0;
    // Models that require async tokenization
    if (model === Model.Llama_4) {
      return await this.countLlamaTokens(text);
    }
    if (model === Model.Gemini_Pro) {
      return await this.countGeminiTokens(text);
    }
    // Models covered by tiktoken (synchronous), else character-based estimate.
    return this.tiktokenCount(text, model) ?? this.fallbackTokenCount(text);
  }

  /**
   * Synchronous wrapper for countTokens (for backward compatibility).
   * Note: always returns the approximation for Llama and Gemini.
   */
  static countTokensSync(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): number {
    if (!text) return 0;
    // Async-only models get the approximation in the sync path.
    if (model === Model.Llama_4 || model === Model.Gemini_Pro) {
      return this.fallbackTokenCount(text);
    }
    return this.tiktokenCount(text, model) ?? this.fallbackTokenCount(text);
  }

  /** Sum of token counts over every string in `texts` (counted in parallel). */
  static async countTokensForArray(
    texts: string[],
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    const counts = await Promise.all(
      texts.map((text) => this.countTokens(text, model)),
    );
    return counts.reduce((sum, count) => sum + count, 0);
  }

  /** Token count of `obj` serialized with JSON.stringify. */
  static async countTokensForJSON(
    obj: any,
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    return await this.countTokens(JSON.stringify(obj), model);
  }

  /**
   * Truncate `text` so it fits within `maxTokens` for `model`, appending
   * '...' when truncation occurs. Returns `text` unchanged if it already fits.
   *
   * BUGFIX: tiktoken's decode() returns a Uint8Array of UTF-8 bytes, not a
   * string. The previous String(decodeResult) produced a comma-joined list of
   * byte values ("72,101,..."), so token-accurate truncation never worked and
   * always fell through to the crude character fallback. We now decode the
   * bytes with TextDecoder.
   */
  static async truncateToTokenLimit(
    text: string,
    maxTokens: number,
    model: Model = Model.GPT_4o_mini,
  ): Promise<string> {
    const currentTokens = await this.countTokens(text, model);
    if (currentTokens <= maxTokens) return text;
    // Async-only models: approximate, character-based truncation.
    if (model === Model.Llama_4 || model === Model.Gemini_Pro) {
      return this.fallbackTruncate(text, maxTokens);
    }
    // tiktoken-backed models: truncate at an exact token boundary.
    try {
      const encoding = this.getEncodingForModel(model);
      if (encoding) {
        const tokens = encoding.encode(text);
        if (tokens.length <= maxTokens) return text;
        // Reserve one token of headroom for the '...' suffix.
        const truncatedTokens = tokens.slice(0, Math.max(0, maxTokens - 1));
        // decode() yields UTF-8 bytes; convert them back into a string.
        const truncatedString = new TextDecoder().decode(
          encoding.decode(truncatedTokens),
        );
        return truncatedString + '...';
      }
    } catch (error) {
      console.warn(`Token truncation failed:`, error);
    }
    return this.fallbackTruncate(text, maxTokens);
  }

  /**
   * Character-based token estimate: ~4 chars/token plus a 10% safety margin
   * so the estimate errs high (safer when enforcing limits).
   */
  private static fallbackTokenCount(text: string): number {
    const baseTokens = Math.ceil(text.length / this.FALLBACK_CHARS_PER_TOKEN);
    return Math.ceil(baseTokens * 1.1);
  }

  /**
   * Character-based truncation using the same 4 chars/token ratio, scaled by
   * 0.9 so the result errs under the limit, with '...' appended.
   */
  private static fallbackTruncate(text: string, maxTokens: number): string {
    const targetChars = Math.floor(
      maxTokens * this.FALLBACK_CHARS_PER_TOKEN * 0.9,
    );
    return text.slice(0, targetChars) + '...';
  }

  /** Context-window size (in tokens) for each supported model. */
  static getTokenLimit(model: Model): number {
    switch (model) {
      case Model.Claude_3_5_Sonnet:
      case Model.Claude_3_5_Haiku:
        return 200000;
      case Model.Gemini_Pro:
        return 1000000;
      case Model.GPT_5_mini:
      case Model.GPT_4o_mini:
      case Model.Grok_4:
      case Model.DeepSeek_V3:
      case Model.Llama_4:
      default:
        return 128000;
    }
  }

  /** Whether `text` fits within the model's context window. */
  static async fitsWithinLimit(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): Promise<boolean> {
    const tokens = await this.countTokens(text, model);
    return tokens <= this.getTokenLimit(model);
  }

  /** Percentage (0-100+) of the model's context window that `text` consumes. */
  static async getTokenUsagePercentage(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    const tokens = await this.countTokens(text, model);
    const limit = this.getTokenLimit(model);
    return (tokens / limit) * 100;
  }

  /** Release cached tiktoken encoders (call on shutdown to free WASM memory). */
  static cleanup(): void {
    this.encodingCache.clear();
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment