Skip to content

Instantly share code, notes, and snippets.

@Fashad-Ahmed
Created October 27, 2025 18:48
Show Gist options
  • Select an option

  • Save Fashad-Ahmed/5161b30a76e13614754dd220df5dde16 to your computer and use it in GitHub Desktop.

Select an option

Save Fashad-Ahmed/5161b30a76e13614754dd220df5dde16 to your computer and use it in GitHub Desktop.
/**
* Example implementation of proper tokenization for Llama and Gemini
* This file shows how to implement Option 4 with full model-specific tokenization
*
* To use this:
* 1. Install required dependencies: npm install @huggingface/tokenizers
* 2. Replace the current token-counter.util.ts with this implementation
* 3. Update all countTokens calls to handle async properly
*/
import { encoding_for_model, get_encoding, type Tiktoken } from 'tiktoken';
import { Model } from '../../debate/enums/model.enum';
// import { Tokenizer } from '@huggingface/tokenizers'; // Uncomment when ready
export class TokenCounter {
  /** Rough characters-per-token ratio used when no real tokenizer is available. */
  private static readonly FALLBACK_CHARS_PER_TOKEN = 4;

  /** Cache of tiktoken encoders keyed by model so each encoder is built only once. */
  private static encodingCache = new Map<string, Tiktoken>();

  // private static llamaTokenizer: Tokenizer | null = null;
  // private static geminiTokenizer: any = null;

  /**
   * Load the Llama tokenizer from Hugging Face.
   *
   * Currently a stub: returns null so callers fall back to approximation.
   * Uncomment the @huggingface/tokenizers wiring below once the dependency
   * is installed (see the file header for setup steps).
   */
  private static async getLlamaTokenizer() {
    // const { Tokenizer } = await import('@huggingface/tokenizers');
    // if (!this.llamaTokenizer) {
    //   this.llamaTokenizer = await Tokenizer.from_pretrained("meta-llama/Llama-4");
    // }
    // return this.llamaTokenizer;
    // For now, return null to use the fallback approximation.
    return null;
  }

  /**
   * Count tokens for Llama models using proper SentencePiece tokenization.
   * Falls back to the character-based approximation while the tokenizer stub
   * returns null, or if tokenization throws.
   */
  private static async countLlamaTokens(text: string): Promise<number> {
    try {
      const tokenizer = await this.getLlamaTokenizer();
      if (tokenizer) {
        const encoded = await tokenizer.encode(text);
        return encoded.length;
      }
    } catch (error) {
      console.warn('Llama tokenization failed, using fallback:', error);
    }
    // Fallback to approximation
    return this.fallbackTokenCount(text);
  }

  /**
   * Count tokens for Gemini models.
   *
   * Note: Google does not publish Gemini's SentencePiece tokenizer, so this
   * is always an approximation. Kept async so the signature matches
   * countLlamaTokens and a real implementation can drop in later.
   */
  private static async countGeminiTokens(text: string): Promise<number> {
    return this.fallbackTokenCount(text);
  }

  /**
   * Resolve (and cache) a tiktoken encoding for the given model.
   *
   * Returns null for models that require async tokenization (Llama, Gemini)
   * or when tiktoken initialization fails; callers must then use a fallback.
   *
   * NOTE(review): all non-OpenAI models are mapped to the gpt-4 (cl100k_base)
   * encoding as an approximation — their real tokenizers differ.
   */
  private static getEncodingForModel(model: Model): Tiktoken | null {
    const cacheKey = model;
    if (this.encodingCache.has(cacheKey)) {
      return this.encodingCache.get(cacheKey) || null;
    }
    try {
      let encoding: Tiktoken;
      switch (model) {
        case Model.GPT_5_mini:
        case Model.GPT_4o_mini:
        case Model.Grok_4:
        case Model.Claude_3_5_Sonnet:
        case Model.Claude_3_5_Haiku:
        case Model.DeepSeek_V3:
          encoding = encoding_for_model('gpt-4');
          break;
        case Model.Llama_4:
        case Model.Gemini_Pro:
          // Handled by the async count*Tokens methods; null triggers that path.
          return null;
        default:
          encoding = get_encoding('cl100k_base');
      }
      this.encodingCache.set(cacheKey, encoding);
      return encoding;
    } catch (error) {
      console.warn(`Failed to get encoding for model ${model}:`, error);
      return null;
    }
  }

  /**
   * Synchronous tiktoken token count, shared by countTokens and
   * countTokensSync. Returns null when no encoding is available or
   * encoding fails, signalling the caller to use the fallback.
   */
  private static tiktokenCount(text: string, model: Model): number | null {
    try {
      const encoding = this.getEncodingForModel(model);
      if (encoding) {
        return encoding.encode(text).length;
      }
    } catch (error) {
      console.warn(`Token counting failed for model ${model}:`, error);
    }
    return null;
  }

  /**
   * Count tokens for `text`, with proper async handling for Llama and Gemini.
   *
   * @param text  Input text; empty/falsy input counts as 0 tokens.
   * @param model Target model (defaults to GPT-4o-mini).
   * @returns Exact count where a tokenizer exists, otherwise an approximation.
   */
  static async countTokens(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    if (!text) return 0;
    // Models that require async tokenization
    if (model === Model.Llama_4) {
      return await this.countLlamaTokens(text);
    }
    if (model === Model.Gemini_Pro) {
      return await this.countGeminiTokens(text);
    }
    // Models covered by tiktoken (synchronous), else character-based estimate.
    return this.tiktokenCount(text, model) ?? this.fallbackTokenCount(text);
  }

  /**
   * Synchronous wrapper for countTokens (for backward compatibility).
   * Note: always returns the approximation for Llama and Gemini.
   */
  static countTokensSync(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): number {
    if (!text) return 0;
    // Async-only models get the approximation in the sync path.
    if (model === Model.Llama_4 || model === Model.Gemini_Pro) {
      return this.fallbackTokenCount(text);
    }
    return this.tiktokenCount(text, model) ?? this.fallbackTokenCount(text);
  }

  /** Sum of token counts over every string in `texts` (counted in parallel). */
  static async countTokensForArray(
    texts: string[],
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    const counts = await Promise.all(
      texts.map((text) => this.countTokens(text, model)),
    );
    return counts.reduce((sum, count) => sum + count, 0);
  }

  /** Token count of `obj` serialized with JSON.stringify. */
  static async countTokensForJSON(
    obj: any,
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    return await this.countTokens(JSON.stringify(obj), model);
  }

  /**
   * Truncate `text` so it fits within `maxTokens` for `model`, appending
   * '...' when truncation occurs. Returns `text` unchanged if it already fits.
   *
   * BUGFIX: tiktoken's decode() returns a Uint8Array of UTF-8 bytes, not a
   * string. The previous String(decodeResult) produced a comma-joined list of
   * byte values ("72,101,..."), so token-accurate truncation never worked and
   * always fell through to the crude character fallback. We now decode the
   * bytes with TextDecoder.
   */
  static async truncateToTokenLimit(
    text: string,
    maxTokens: number,
    model: Model = Model.GPT_4o_mini,
  ): Promise<string> {
    const currentTokens = await this.countTokens(text, model);
    if (currentTokens <= maxTokens) return text;
    // Async-only models: approximate, character-based truncation.
    if (model === Model.Llama_4 || model === Model.Gemini_Pro) {
      return this.fallbackTruncate(text, maxTokens);
    }
    // tiktoken-backed models: truncate at an exact token boundary.
    try {
      const encoding = this.getEncodingForModel(model);
      if (encoding) {
        const tokens = encoding.encode(text);
        if (tokens.length <= maxTokens) return text;
        // Reserve one token of headroom for the '...' suffix.
        const truncatedTokens = tokens.slice(0, Math.max(0, maxTokens - 1));
        // decode() yields UTF-8 bytes; convert them back into a string.
        const truncatedString = new TextDecoder().decode(
          encoding.decode(truncatedTokens),
        );
        return truncatedString + '...';
      }
    } catch (error) {
      console.warn(`Token truncation failed:`, error);
    }
    return this.fallbackTruncate(text, maxTokens);
  }

  /**
   * Character-based token estimate: ~4 chars/token plus a 10% safety margin
   * so the estimate errs high (safer when enforcing limits).
   */
  private static fallbackTokenCount(text: string): number {
    const baseTokens = Math.ceil(text.length / this.FALLBACK_CHARS_PER_TOKEN);
    return Math.ceil(baseTokens * 1.1);
  }

  /**
   * Character-based truncation using the same 4 chars/token ratio, scaled by
   * 0.9 so the result errs under the limit, with '...' appended.
   */
  private static fallbackTruncate(text: string, maxTokens: number): string {
    const targetChars = Math.floor(
      maxTokens * this.FALLBACK_CHARS_PER_TOKEN * 0.9,
    );
    return text.slice(0, targetChars) + '...';
  }

  /** Context-window size (in tokens) for each supported model. */
  static getTokenLimit(model: Model): number {
    switch (model) {
      case Model.Claude_3_5_Sonnet:
      case Model.Claude_3_5_Haiku:
        return 200000;
      case Model.Gemini_Pro:
        return 1000000;
      case Model.GPT_5_mini:
      case Model.GPT_4o_mini:
      case Model.Grok_4:
      case Model.DeepSeek_V3:
      case Model.Llama_4:
      default:
        return 128000;
    }
  }

  /** Whether `text` fits within the model's context window. */
  static async fitsWithinLimit(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): Promise<boolean> {
    const tokens = await this.countTokens(text, model);
    return tokens <= this.getTokenLimit(model);
  }

  /** Percentage (0-100+) of the model's context window that `text` consumes. */
  static async getTokenUsagePercentage(
    text: string,
    model: Model = Model.GPT_4o_mini,
  ): Promise<number> {
    const tokens = await this.countTokens(text, model);
    const limit = this.getTokenLimit(model);
    return (tokens / limit) * 100;
  }

  /** Release cached tiktoken encoders (call on shutdown to free WASM memory). */
  static cleanup(): void {
    this.encodingCache.clear();
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment