gte-pure-c.nim
## GTE-Small Embedding Library - Nim Port
## A single-file, self-contained text embedding solution.
##
## Original C implementation by Antirez (Salvatore Sanfilippo)
## Nim port maintains the same algorithm and produces identical results.
##
## MIT License - Copyright (c) 2026 Salvatore Sanfilippo
## See LICENSE file for full terms.
##
## USAGE: Just compile and run - model downloads automatically on first use!
##
## Build for maximum performance (matches C speed):
##   nim c -d:release -d:danger --opt:speed --passC:"-march=native -ffast-math" gte.nim
##
## Build for debugging:
##   nim c gte.nim
##
## Requirements: Python 3.8+ with pip (for auto-download only, uv preferred)
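##
## Library usage (illustrative sketch; assumes this file is saved as gte.nim
## and imported from another module, variable names are just examples):
##
##   import gte
##   let ctx = gte_load("gte-small.gtemodel")
##   let a = gte_embed(ctx, "The weather is lovely today.")
##   let b = gte_embed(ctx, "It's so sunny outside!")
##   echo gte_cosine_similarity(a, b, gte_dim(ctx))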
import std/[strutils, tables, math, times, os, parseopt, osproc, tempfiles]

# ========================================================================
# Constants
# ========================================================================

const
  GTE_MAGIC = "GTE1"
  GTE_LAYER_NORM_EPS = 1e-12f32

  # Special token IDs
  TOKEN_PAD = 0
  TOKEN_UNK = 100
  TOKEN_CLS = 101
  TOKEN_SEP = 102
  TOKEN_MASK = 103

  # Hash table size for vocabulary
  VOCAB_HASH_SIZE = 40009

  DEFAULT_MODEL_PATH = "gte-small.gtemodel"
  MAX_SENTENCES = 64

  # Embedded Python script for downloading and converting the model
  DOWNLOAD_SCRIPT = """
# /// script
# requires-python = ">=3.8"
# dependencies = ["huggingface_hub", "safetensors", "numpy"]
# ///
import sys, struct, json, os
from pathlib import Path

def main():
    output_path = sys.argv[1] if len(sys.argv) > 1 else "gte-small.gtemodel"
    cache_dir = Path.home() / ".cache" / "gte-nim"
    cache_dir.mkdir(parents=True, exist_ok=True)

    print("Downloading GTE-small model from HuggingFace...")
    from huggingface_hub import hf_hub_download
    model_file = hf_hub_download("thenlper/gte-small", "model.safetensors", cache_dir=str(cache_dir))
    tokenizer_file = hf_hub_download("thenlper/gte-small", "tokenizer.json", cache_dir=str(cache_dir))
    config_file = hf_hub_download("thenlper/gte-small", "config.json", cache_dir=str(cache_dir))

    # Load config
    with open(config_file) as f:
        config = json.load(f)
    vocab_size = config["vocab_size"]
    hidden_size = config["hidden_size"]
    num_layers = config["num_hidden_layers"]
    num_heads = config["num_attention_heads"]
    intermediate_size = config["intermediate_size"]
    max_seq_length = config["max_position_embeddings"]
    print(f"Model: {vocab_size} vocab, {hidden_size} hidden, {num_layers} layers")

    # Extract vocab from tokenizer.json
    with open(tokenizer_file) as f:
        tokenizer = json.load(f)
    vocab_dict = tokenizer["model"]["vocab"]
    vocab = [word for word, _ in sorted(vocab_dict.items(), key=lambda x: x[1])]

    # Load safetensors
    from safetensors import safe_open
    tensors = safe_open(model_file, framework="numpy")

    print("Converting to .gtemodel format...")
    with open(output_path, 'wb') as f:
        f.write(b'GTE1')
        for val in [vocab_size, hidden_size, num_layers, num_heads, intermediate_size, max_seq_length]:
            f.write(struct.pack('<I', val))
        for word in vocab:
            word_bytes = word.encode('utf-8')
            f.write(struct.pack('<H', len(word_bytes)))
            f.write(word_bytes)

        def write_tensor(name):
            f.write(tensors.get_tensor(name).astype('float32').tobytes())

        write_tensor("embeddings.word_embeddings.weight")
        write_tensor("embeddings.position_embeddings.weight")
        write_tensor("embeddings.token_type_embeddings.weight")
        write_tensor("embeddings.LayerNorm.weight")
        write_tensor("embeddings.LayerNorm.bias")

        for l in range(num_layers):
            p = f"encoder.layer.{l}"
            write_tensor(f"{p}.attention.self.query.weight")
            write_tensor(f"{p}.attention.self.query.bias")
            write_tensor(f"{p}.attention.self.key.weight")
            write_tensor(f"{p}.attention.self.key.bias")
            write_tensor(f"{p}.attention.self.value.weight")
            write_tensor(f"{p}.attention.self.value.bias")
            write_tensor(f"{p}.attention.output.dense.weight")
            write_tensor(f"{p}.attention.output.dense.bias")
            write_tensor(f"{p}.attention.output.LayerNorm.weight")
            write_tensor(f"{p}.attention.output.LayerNorm.bias")
            write_tensor(f"{p}.intermediate.dense.weight")
            write_tensor(f"{p}.intermediate.dense.bias")
            write_tensor(f"{p}.output.dense.weight")
            write_tensor(f"{p}.output.dense.bias")
            write_tensor(f"{p}.output.LayerNorm.weight")
            write_tensor(f"{p}.output.LayerNorm.bias")

        write_tensor("pooler.dense.weight")
        write_tensor("pooler.dense.bias")

    size_mb = os.path.getsize(output_path) / 1024 / 1024
    print(f"Model saved to {output_path} ({size_mb:.1f} MB)")

if __name__ == "__main__":
    main()
"""
# ========================================================================
# Model Download
# ========================================================================

proc download_model*(output_path: string): bool =
  ## Download and convert the GTE-small model if not present.
  ## Uses 'uv' if available (PEP 723 inline script), otherwise falls back to pip.
  ## Returns true on success.

  # Create temp file for the Python script
  let (script_file, script_path) = createTempFile("gte_download_", ".py")
  script_file.write(DOWNLOAD_SCRIPT)
  script_file.close()
  defer: removeFile(script_path)

  # Try uv first (supports PEP 723 inline dependencies)
  let uv_check = execCmdEx("which uv")
  if uv_check.exitCode == 0:
    echo "Using uv to download model..."
    let res = execShellCmd("uv run " & script_path & " " & quoteShell(output_path))
    return res == 0

  # Fall back to pip + python
  echo "Using pip to install dependencies..."
  let pip_install = execShellCmd("pip install -q huggingface_hub safetensors 2>/dev/null || pip3 install -q huggingface_hub safetensors")
  if pip_install != 0:
    stderr.writeLine "Failed to install Python dependencies"
    return false

  # Try python3 first, then python
  let python_check = execCmdEx("which python3")
  let python_cmd = if python_check.exitCode == 0: "python3" else: "python"
  let res = execShellCmd(python_cmd & " " & script_path & " " & quoteShell(output_path))
  return res == 0
# ========================================================================
# Data Structures
# ========================================================================

type
  VocabEntry = object
    word: string
    id: int

  LayerWeights = object
    # Self-attention
    query_weight: seq[float32]        # [hidden_size, hidden_size]
    query_bias: seq[float32]          # [hidden_size]
    key_weight: seq[float32]          # [hidden_size, hidden_size]
    key_bias: seq[float32]            # [hidden_size]
    value_weight: seq[float32]        # [hidden_size, hidden_size]
    value_bias: seq[float32]          # [hidden_size]
    attn_output_weight: seq[float32]  # [hidden_size, hidden_size]
    attn_output_bias: seq[float32]    # [hidden_size]
    attn_ln_weight: seq[float32]      # [hidden_size]
    attn_ln_bias: seq[float32]        # [hidden_size]
    # Feed-forward network
    ffn_inter_weight: seq[float32]    # [intermediate_size, hidden_size]
    ffn_inter_bias: seq[float32]      # [intermediate_size]
    ffn_output_weight: seq[float32]   # [hidden_size, intermediate_size]
    ffn_output_bias: seq[float32]     # [hidden_size]
    ffn_ln_weight: seq[float32]       # [hidden_size]
    ffn_ln_bias: seq[float32]         # [hidden_size]

  GteCtx* = ref object
    # Config
    vocab_size: int
    hidden_size: int
    num_layers: int
    num_heads: int
    intermediate_size: int
    max_seq_len: int
    head_dim: int
    # Vocabulary
    vocab: seq[string]
    vocab_hash: Table[string, int]
    # Embeddings
    token_embeddings: seq[float32]      # [vocab_size, hidden_size]
    position_embeddings: seq[float32]   # [max_seq_len, hidden_size]
    token_type_embeddings: seq[float32] # [2, hidden_size]
    embed_ln_weight: seq[float32]       # [hidden_size]
    embed_ln_bias: seq[float32]         # [hidden_size]
    # Transformer layers
    layers: seq[LayerWeights]
    # Pooler (not used for embeddings but loaded)
    pooler_weight: seq[float32]         # [hidden_size, hidden_size]
    pooler_bias: seq[float32]           # [hidden_size]
    # Working memory for inference
    hidden_states: seq[float32]         # [max_seq_len, hidden_size]
    attn_scores: seq[float32]           # [num_heads, max_seq_len, max_seq_len]
    q_proj: seq[float32]                # [max_seq_len, hidden_size]
    k_proj: seq[float32]                # [max_seq_len, hidden_size]
    v_proj: seq[float32]                # [max_seq_len, hidden_size]
    attn_output: seq[float32]           # [max_seq_len, hidden_size]
    ffn_hidden: seq[float32]            # [max_seq_len, intermediate_size]
    temp_hidden: seq[float32]           # [max_seq_len, hidden_size]
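# Note: the working-memory buffers above are sized for max_seq_len and are
# allocated once in gte_load, so repeated gte_embed calls reuse the same
# scratch memory instead of reallocating it per call.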
# ========================================================================
# Matrix Operations
# ========================================================================

proc linear(y: var openArray[float32], x: openArray[float32],
            W: openArray[float32], b: openArray[float32],
            seq_len, in_dim, out_dim: int) =
  ## Matrix-vector multiplication with bias: y = x @ W^T + b
  for s in 0..<seq_len:
    for o in 0..<out_dim:
      var sum = if b.len > 0: b[o] else: 0.0f32
      for i in 0..<in_dim:
        sum += x[s * in_dim + i] * W[o * in_dim + i]
      y[s * out_dim + o] = sum
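# W is indexed as W[o * in_dim + i], i.e. stored row-major as [out_dim, in_dim],
# which matches the layout the download script writes for the dense weights, so
# y = x @ W^T + b needs no transpose at load time.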
proc layer_norm(out_buf: var openArray[float32], x: openArray[float32],
                gamma, beta: openArray[float32], seq_len, hidden_size: int) =
  ## Layer normalization
  for s in 0..<seq_len:
    let row_offset = s * hidden_size
    # Compute mean
    var mean = 0.0f32
    for i in 0..<hidden_size:
      mean += x[row_offset + i]
    mean /= hidden_size.float32
    # Compute variance
    var variance = 0.0f32
    for i in 0..<hidden_size:
      let diff = x[row_offset + i] - mean
      variance += diff * diff
    variance /= hidden_size.float32
    # Normalize and scale
    let std_inv = 1.0f32 / sqrt(variance + GTE_LAYER_NORM_EPS)
    for i in 0..<hidden_size:
      out_buf[row_offset + i] = gamma[i] * (x[row_offset + i] - mean) * std_inv + beta[i]

proc gelu(x: var openArray[float32], n: int) =
  ## GELU activation (approximate)
  for i in 0..<n:
    let val = x[i]
    x[i] = 0.5f32 * val * (1.0f32 + tanh(0.7978845608f32 * (val + 0.044715f32 * val * val * val)))
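# The constant 0.7978845608 is sqrt(2/pi); together with 0.044715 this is the
# standard tanh approximation of GELU(x) = x * Phi(x).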
proc softmax(x: var openArray[float32], start, n: int) =
  ## Softmax over slice [start..start+n)
  # Find max for numerical stability
  var max_val = x[start]
  for i in 1..<n:
    if x[start + i] > max_val:
      max_val = x[start + i]
  # Compute exp and sum
  var sum = 0.0f32
  for i in 0..<n:
    x[start + i] = exp(x[start + i] - max_val)
    sum += x[start + i]
  # Normalize
  let inv_sum = 1.0f32 / sum
  for i in 0..<n:
    x[start + i] *= inv_sum

proc l2_normalize(x: var openArray[float32], n: int) =
  ## L2 normalize in place
  var norm = 0.0f32
  for i in 0..<n:
    norm += x[i] * x[i]
  norm = sqrt(norm)
  if norm > 0.0f32:
    let inv_norm = 1.0f32 / norm
    for i in 0..<n:
      x[i] *= inv_norm

# ========================================================================
# Tokenizer
# ========================================================================

proc is_punctuation(c: char): bool =
  let code = ord(c)
  (code >= 33 and code <= 47) or (code >= 58 and code <= 64) or
    (code >= 91 and code <= 96) or (code >= 123 and code <= 126)

proc is_whitespace(c: char): bool =
  c == ' ' or c == '\t' or c == '\n' or c == '\r'

proc basic_tokenize(text: string): seq[string] =
  ## Basic tokenization: split on whitespace and punctuation, lowercase
  result = @[]
  var i = 0
  while i < text.len:
    # Skip whitespace
    while i < text.len and is_whitespace(text[i]):
      inc i
    if i >= text.len:
      break
    # Find end of token
    let start = i
    if is_punctuation(text[i]):
      inc i
    else:
      while i < text.len and not is_whitespace(text[i]) and not is_punctuation(text[i]):
        inc i
    # Create lowercase token
    result.add(text[start..<i].toLowerAscii())

proc wordpiece_tokenize(ctx: GteCtx, word: string): seq[int] =
  ## WordPiece tokenization of a single word
  if word.len == 0:
    return @[]
  result = @[]
  var start = 0
  while start < word.len:
    var end_pos = word.len
    var found_id = -1
    # Find longest matching subword
    while start < end_pos:
      var candidate: string
      if start > 0:
        candidate = "##" & word[start..<end_pos]
      else:
        candidate = word[start..<end_pos]
      if candidate in ctx.vocab_hash:
        found_id = ctx.vocab_hash[candidate]
        break
      dec end_pos
    if found_id < 0:
      result.add(TOKEN_UNK)
      inc start
    else:
      result.add(found_id)
      start = end_pos

proc tokenize(ctx: GteCtx, text: string, max_len: int): seq[int] =
  ## Full tokenization: text -> token IDs
  let basic_tokens = basic_tokenize(text)
  result = @[TOKEN_CLS]
  for token in basic_tokens:
    if result.len >= max_len - 1:
      break
    let subtokens = wordpiece_tokenize(ctx, token)
    for subtoken in subtokens:
      if result.len >= max_len - 1:
        break
      result.add(subtoken)
  result.add(TOKEN_SEP)
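# Tokenization sketch (illustrative only; the actual IDs depend on the model's
# vocabulary): "Hello, world!" is first split into the lowercased basic tokens
# ["hello", ",", "world", "!"]; each basic token is then greedily matched
# against the vocabulary (continuation pieces carry a "##" prefix, unmatched
# pieces become TOKEN_UNK), and the result is wrapped as [CLS] ... [SEP],
# truncated so the final sequence never exceeds max_len.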
# ========================================================================
# Transformer Forward Pass
# ========================================================================

proc self_attention(ctx: GteCtx, layer: LayerWeights, seq_len: int, attn_mask: seq[int]) =
  let hidden = ctx.hidden_size
  let heads = ctx.num_heads
  let head_dim = ctx.head_dim

  # Project Q, K, V
  linear(ctx.q_proj, ctx.hidden_states, layer.query_weight, layer.query_bias, seq_len, hidden, hidden)
  linear(ctx.k_proj, ctx.hidden_states, layer.key_weight, layer.key_bias, seq_len, hidden, hidden)
  linear(ctx.v_proj, ctx.hidden_states, layer.value_weight, layer.value_bias, seq_len, hidden, hidden)

  # Compute attention for each head
  let scale = 1.0f32 / sqrt(head_dim.float32)
  for h in 0..<heads:
    # Attention scores for this head: Q @ K^T / sqrt(d_k)
    for i in 0..<seq_len:
      for j in 0..<seq_len:
        var score = 0.0f32
        for d in 0..<head_dim:
          let q_idx = i * hidden + h * head_dim + d
          let k_idx = j * hidden + h * head_dim + d
          score += ctx.q_proj[q_idx] * ctx.k_proj[k_idx]
        score *= scale
        # Apply attention mask
        if attn_mask.len > 0 and attn_mask[j] == 0:
          score = -10000.0f32
        ctx.attn_scores[h * seq_len * seq_len + i * seq_len + j] = score
      # Softmax over keys
      softmax(ctx.attn_scores, h * seq_len * seq_len + i * seq_len, seq_len)
    # Weighted sum of values
    for i in 0..<seq_len:
      for d in 0..<head_dim:
        var sum = 0.0f32
        for j in 0..<seq_len:
          let attn = ctx.attn_scores[h * seq_len * seq_len + i * seq_len + j]
          let v_idx = j * hidden + h * head_dim + d
          sum += attn * ctx.v_proj[v_idx]
        ctx.attn_output[i * hidden + h * head_dim + d] = sum

  # Output projection
  linear(ctx.temp_hidden, ctx.attn_output, layer.attn_output_weight, layer.attn_output_bias, seq_len, hidden, hidden)

  # Residual connection and layer norm
  for i in 0..<seq_len * hidden:
    ctx.temp_hidden[i] += ctx.hidden_states[i]
  layer_norm(ctx.hidden_states, ctx.temp_hidden, layer.attn_ln_weight, layer.attn_ln_bias, seq_len, hidden)

proc feed_forward(ctx: GteCtx, layer: LayerWeights, seq_len: int) =
  let hidden = ctx.hidden_size
  let inter = ctx.intermediate_size

  # Intermediate layer
  linear(ctx.ffn_hidden, ctx.hidden_states, layer.ffn_inter_weight, layer.ffn_inter_bias, seq_len, hidden, inter)
  gelu(ctx.ffn_hidden, seq_len * inter)

  # Output layer
  linear(ctx.temp_hidden, ctx.ffn_hidden, layer.ffn_output_weight, layer.ffn_output_bias, seq_len, inter, hidden)

  # Residual connection and layer norm
  for i in 0..<seq_len * hidden:
    ctx.temp_hidden[i] += ctx.hidden_states[i]
  layer_norm(ctx.hidden_states, ctx.temp_hidden, layer.ffn_ln_weight, layer.ffn_ln_bias, seq_len, hidden)
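# Each encoder layer is the standard post-LayerNorm BERT block:
#   x = LayerNorm(x + SelfAttention(x)); x = LayerNorm(x + FFN(x))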
proc transformer_forward(ctx: GteCtx, token_ids: seq[int], seq_len: int, attn_mask: seq[int]) =
  let hidden = ctx.hidden_size

  # Compute embeddings
  for s in 0..<seq_len:
    let token_id = token_ids[s]
    for d in 0..<hidden:
      ctx.hidden_states[s * hidden + d] =
        ctx.token_embeddings[token_id * hidden + d] +
        ctx.position_embeddings[s * hidden + d] +
        ctx.token_type_embeddings[d]  # token_type = 0

  # Embedding layer norm
  layer_norm(ctx.hidden_states, ctx.hidden_states, ctx.embed_ln_weight, ctx.embed_ln_bias, seq_len, hidden)

  # Process each transformer layer
  for layer in ctx.layers:
    self_attention(ctx, layer, seq_len, attn_mask)
    feed_forward(ctx, layer, seq_len)

proc mean_pooling(output: var seq[float32], hidden_states: seq[float32],
                  attn_mask: seq[int], seq_len, hidden_size: int) =
  ## Mean pooling over non-padded tokens
  # Initialize output to zero
  for i in 0..<hidden_size:
    output[i] = 0.0f32
  # Sum up hidden states for non-padded tokens
  var count = 0
  for s in 0..<seq_len:
    if attn_mask[s] != 0:
      for d in 0..<hidden_size:
        output[d] += hidden_states[s * hidden_size + d]
      inc count
  # Average
  if count > 0:
    let inv_count = 1.0f32 / count.float32
    for d in 0..<hidden_size:
      output[d] *= inv_count
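# gte_embed (below) chains tokenize -> transformer_forward -> mean_pooling ->
# l2_normalize to turn raw text into a unit-length sentence vector.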
# ========================================================================
# Model Loading
# ========================================================================

proc read_uint32(f: File): int =
  var buf: array[4, uint8]
  if f.readBytes(buf, 0, 4) != 4:
    raise newException(IOError, "Failed to read uint32")
  result = int(buf[0]) or (int(buf[1]) shl 8) or (int(buf[2]) shl 16) or (int(buf[3]) shl 24)

proc read_uint16(f: File): int =
  var buf: array[2, uint8]
  if f.readBytes(buf, 0, 2) != 2:
    raise newException(IOError, "Failed to read uint16")
  result = int(buf[0]) or (int(buf[1]) shl 8)

proc read_floats(f: File, count: int): seq[float32] =
  result = newSeq[float32](count)
  let bytes = count * sizeof(float32)
  if f.readBuffer(addr result[0], bytes) != bytes:
    raise newException(IOError, "Failed to read floats")

proc gte_load*(model_path: string): GteCtx =
  ## Load model from .gtemodel file
  if not fileExists(model_path):
    stderr.writeLine "gte_load: cannot open ", model_path
    return nil
  var f = open(model_path, fmRead)
  defer: f.close()

  # Check magic
  var magic: array[4, char]
  if f.readChars(magic) != 4 or magic != ['G', 'T', 'E', '1']:
    stderr.writeLine "gte_load: invalid magic"
    return nil

  result = GteCtx()

  # Read config
  result.vocab_size = read_uint32(f)
  result.hidden_size = read_uint32(f)
  result.num_layers = read_uint32(f)
  result.num_heads = read_uint32(f)
  result.intermediate_size = read_uint32(f)
  result.max_seq_len = read_uint32(f)
  result.head_dim = result.hidden_size div result.num_heads

  # Read vocabulary
  result.vocab = newSeq[string](result.vocab_size)
  result.vocab_hash = initTable[string, int]()
  for i in 0..<result.vocab_size:
    let len = read_uint16(f)
    var word = newString(len)
    if f.readChars(word) != len:
      raise newException(IOError, "Failed to read vocab word")
    result.vocab[i] = word
    result.vocab_hash[word] = i

  # Read embeddings
  result.token_embeddings = read_floats(f, result.vocab_size * result.hidden_size)
  result.position_embeddings = read_floats(f, result.max_seq_len * result.hidden_size)
  result.token_type_embeddings = read_floats(f, 2 * result.hidden_size)
  result.embed_ln_weight = read_floats(f, result.hidden_size)
  result.embed_ln_bias = read_floats(f, result.hidden_size)

  # Read transformer layers
  result.layers = newSeq[LayerWeights](result.num_layers)
  for l in 0..<result.num_layers:
    result.layers[l].query_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].query_bias = read_floats(f, result.hidden_size)
    result.layers[l].key_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].key_bias = read_floats(f, result.hidden_size)
    result.layers[l].value_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].value_bias = read_floats(f, result.hidden_size)
    result.layers[l].attn_output_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].attn_output_bias = read_floats(f, result.hidden_size)
    result.layers[l].attn_ln_weight = read_floats(f, result.hidden_size)
    result.layers[l].attn_ln_bias = read_floats(f, result.hidden_size)
    result.layers[l].ffn_inter_weight = read_floats(f, result.intermediate_size * result.hidden_size)
    result.layers[l].ffn_inter_bias = read_floats(f, result.intermediate_size)
    result.layers[l].ffn_output_weight = read_floats(f, result.hidden_size * result.intermediate_size)
    result.layers[l].ffn_output_bias = read_floats(f, result.hidden_size)
    result.layers[l].ffn_ln_weight = read_floats(f, result.hidden_size)
    result.layers[l].ffn_ln_bias = read_floats(f, result.hidden_size)

  # Read pooler (not used for embeddings)
  result.pooler_weight = read_floats(f, result.hidden_size * result.hidden_size)
  result.pooler_bias = read_floats(f, result.hidden_size)

  # Allocate working memory
  let max_seq = result.max_seq_len
  let hidden = result.hidden_size
  let inter = result.intermediate_size
  let heads = result.num_heads
  result.hidden_states = newSeq[float32](max_seq * hidden)
  result.attn_scores = newSeq[float32](heads * max_seq * max_seq)
  result.q_proj = newSeq[float32](max_seq * hidden)
  result.k_proj = newSeq[float32](max_seq * hidden)
  result.v_proj = newSeq[float32](max_seq * hidden)
  result.attn_output = newSeq[float32](max_seq * hidden)
  result.ffn_hidden = newSeq[float32](max_seq * inter)
  result.temp_hidden = newSeq[float32](max_seq * hidden)

# ========================================================================
# Public API
# ========================================================================

proc gte_embed*(ctx: GteCtx, text: string): seq[float32] =
  ## Generate embedding for a single text
  if ctx == nil:
    return @[]

  # Tokenize
  let token_ids = tokenize(ctx, text, ctx.max_seq_len)
  let num_tokens = token_ids.len

  # Create attention mask
  var attn_mask = newSeq[int](num_tokens)
  for i in 0..<num_tokens:
    attn_mask[i] = 1

  # Run transformer
  transformer_forward(ctx, token_ids, num_tokens, attn_mask)

  # Mean pooling
  result = newSeq[float32](ctx.hidden_size)
  mean_pooling(result, ctx.hidden_states, attn_mask, num_tokens, ctx.hidden_size)

  # L2 normalize
  l2_normalize(result, ctx.hidden_size)

proc gte_dim*(ctx: GteCtx): int =
  ## Get the embedding dimension (384 for GTE-small)
  if ctx != nil: ctx.hidden_size else: 0

proc gte_max_seq_len*(ctx: GteCtx): int =
  ## Get the maximum sequence length (512 for GTE-small)
  if ctx != nil: ctx.max_seq_len else: 0

proc gte_cosine_similarity*(a, b: openArray[float32], dim: int): float32 =
  ## Compute cosine similarity between two embeddings
  ## Assumes normalized vectors, so dot product = cosine similarity
  result = 0.0f32
  for i in 0..<dim:
    result += a[i] * b[i]
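# Illustrative use (variable names are just examples): for two vectors produced
# by gte_embed, gte_cosine_similarity(embA, embB, gte_dim(ctx)) returns a value
# in [-1, 1]; since gte_embed L2-normalizes its output, the plain dot product
# above already equals the full cosine similarity.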
# ========================================================================
# Test Program
# ========================================================================

proc print_embedding(emb: openArray[float32], dim, n: int) =
  stdout.write "["
  for i in 0..<min(n, dim):
    stdout.write emb[i].formatFloat(ffDecimal, 6)
    if i < n - 1:
      stdout.write ", "
  if n < dim:
    stdout.write ", ..."
  echo "]"

proc print_usage(prog: string) =
  echo "Usage: ", prog, " [OPTIONS] [SENTENCES...]"
  echo ""
  echo "Test GTE-small embedding model by computing embeddings and similarity matrix."
  echo "The model is automatically downloaded on first run if not present."
  echo ""
  echo "Options:"
  echo "  --model-path PATH   Path to .gtemodel file (default: ", DEFAULT_MODEL_PATH, ")"
  echo "  --help              Show this help message"
  echo ""
  echo "Arguments:"
  echo "  SENTENCES           One or more sentences to embed (quote each sentence)"
  echo "                      If none provided, uses built-in example sentences"
  echo ""
  echo "Examples:"
  echo "  ", prog
  echo "  ", prog, " \"Hello world\" \"Goodbye world\""
  echo "  ", prog, " --model-path my-model.gtemodel \"Test sentence\""

when isMainModule:
  var model_path = DEFAULT_MODEL_PATH
  var user_sentences: seq[string] = @[]

  # Parse arguments
  var p = initOptParser()
  while true:
    p.next()
    case p.kind
    of cmdEnd: break
    of cmdShortOption, cmdLongOption:
      case p.key
      of "help", "h":
        print_usage(getAppFilename())
        quit(0)
      of "model-path":
        model_path = p.val
      else:
        stderr.writeLine "Error: Unknown option '", p.key, "'"
        quit(1)
    of cmdArgument:
      user_sentences.add(p.key)

  # Default sentences if none provided
  let default_sentences = @[
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
    "Machine learning is transforming industries.",
    "I love programming in C."
  ]
  let sentences = if user_sentences.len > 0: user_sentences else: default_sentences

  # Auto-download model if not present
  if not fileExists(model_path):
    echo "Model not found at ", model_path
    echo "Attempting to download GTE-small model..."
    if not download_model(model_path):
      stderr.writeLine "Failed to download model. Please download manually."
      quit(1)

  # Load model
  echo "Loading model from ", model_path, "..."
  let start_load = cpuTime()
  let ctx = gte_load(model_path)
  if ctx == nil:
    stderr.writeLine "Failed to load model"
    quit(1)
  let load_time = cpuTime() - start_load
  echo "Model loaded in ", load_time.formatFloat(ffDecimal, 2), " seconds"
  echo "Embedding dimension: ", gte_dim(ctx)
  echo "Max sequence length: ", gte_max_seq_len(ctx)
  echo ""

  # Generate embeddings
  echo "Generating embeddings..."
  echo ""
  var embeddings: seq[seq[float32]] = @[]
  for i, sentence in sentences:
    let start_embed = cpuTime()
    let embedding = gte_embed(ctx, sentence)
    let embed_time = cpuTime() - start_embed
    embeddings.add(embedding)
    echo "S", i + 1, ": \"", sentence, "\""
    echo "  Time: ", (embed_time * 1000).formatFloat(ffDecimal, 3), " ms"
    stdout.write "  Embedding: "
    print_embedding(embedding, gte_dim(ctx), 5)
    echo ""

  # Compute similarity matrix
  echo "Cosine similarity matrix:"
  stdout.write "     "
  for i in 0..<sentences.len:
    stdout.write "  S", i + 1, "   "
  echo ""
  for i in 0..<sentences.len:
    stdout.write "S", i + 1, ": "
    for j in 0..<sentences.len:
      let sim = gte_cosine_similarity(embeddings[i], embeddings[j], gte_dim(ctx))
      stdout.write " ", sim.formatFloat(ffDecimal, 3), " "
    echo ""
  echo ""
  echo "Done!"