gte-pure-c.nim
## GTE-Small Embedding Library - Nim Port
## A single-file, self-contained text embedding solution.
##
## Original C implementation by Antirez (Salvatore Sanfilippo)
## Nim port maintains the same algorithm and produces identical results.
##
## MIT License - Copyright (c) 2026 Salvatore Sanfilippo
## See LICENSE file for full terms.
##
## USAGE: Just compile and run - model downloads automatically on first use!
##
## Build for maximum performance (matches C speed):
##   nim c -d:release -d:danger --opt:speed --passC:"-march=native -ffast-math" gte.nim
##
## Build for debugging:
##   nim c gte.nim
##
## Requirements: Python 3.8+ with pip (for auto-download only, uv preferred)
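##
## Library usage (illustrative sketch; assumes this file is saved as gte.nim
## and imported from another module, variable names are just examples):
##
##   import gte
##   let ctx = gte_load("gte-small.gtemodel")
##   let a = gte_embed(ctx, "The weather is lovely today.")
##   let b = gte_embed(ctx, "It's so sunny outside!")
##   echo gte_cosine_similarity(a, b, gte_dim(ctx))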
import std/[strutils, tables, math, times, os, parseopt, osproc, tempfiles]

# ========================================================================
# Constants
# ========================================================================

const
  GTE_MAGIC = "GTE1"
  GTE_LAYER_NORM_EPS = 1e-12f32

  # Special token IDs
  TOKEN_PAD = 0
  TOKEN_UNK = 100
  TOKEN_CLS = 101
  TOKEN_SEP = 102
  TOKEN_MASK = 103

  # Hash table size for vocabulary
  VOCAB_HASH_SIZE = 40009

  DEFAULT_MODEL_PATH = "gte-small.gtemodel"
  MAX_SENTENCES = 64

  # Embedded Python script for downloading and converting the model
  DOWNLOAD_SCRIPT = """
# /// script
# requires-python = ">=3.8"
# dependencies = ["huggingface_hub", "safetensors", "numpy"]
# ///
import sys, struct, json, os
from pathlib import Path

def main():
    output_path = sys.argv[1] if len(sys.argv) > 1 else "gte-small.gtemodel"
    cache_dir = Path.home() / ".cache" / "gte-nim"
    cache_dir.mkdir(parents=True, exist_ok=True)

    print("Downloading GTE-small model from HuggingFace...")
    from huggingface_hub import hf_hub_download
    model_file = hf_hub_download("thenlper/gte-small", "model.safetensors", cache_dir=str(cache_dir))
    tokenizer_file = hf_hub_download("thenlper/gte-small", "tokenizer.json", cache_dir=str(cache_dir))
    config_file = hf_hub_download("thenlper/gte-small", "config.json", cache_dir=str(cache_dir))

    # Load config
    with open(config_file) as f:
        config = json.load(f)
    vocab_size = config["vocab_size"]
    hidden_size = config["hidden_size"]
    num_layers = config["num_hidden_layers"]
    num_heads = config["num_attention_heads"]
    intermediate_size = config["intermediate_size"]
    max_seq_length = config["max_position_embeddings"]
    print(f"Model: {vocab_size} vocab, {hidden_size} hidden, {num_layers} layers")

    # Extract vocab from tokenizer.json
    with open(tokenizer_file) as f:
        tokenizer = json.load(f)
    vocab_dict = tokenizer["model"]["vocab"]
    vocab = [word for word, _ in sorted(vocab_dict.items(), key=lambda x: x[1])]

    # Load safetensors
    from safetensors import safe_open
    tensors = safe_open(model_file, framework="numpy")

    print("Converting to .gtemodel format...")
    with open(output_path, 'wb') as f:
        f.write(b'GTE1')
        for val in [vocab_size, hidden_size, num_layers, num_heads, intermediate_size, max_seq_length]:
            f.write(struct.pack('<I', val))
        for word in vocab:
            word_bytes = word.encode('utf-8')
            f.write(struct.pack('<H', len(word_bytes)))
            f.write(word_bytes)

        def write_tensor(name):
            f.write(tensors.get_tensor(name).astype('float32').tobytes())

        write_tensor("embeddings.word_embeddings.weight")
        write_tensor("embeddings.position_embeddings.weight")
        write_tensor("embeddings.token_type_embeddings.weight")
        write_tensor("embeddings.LayerNorm.weight")
        write_tensor("embeddings.LayerNorm.bias")

        for l in range(num_layers):
            p = f"encoder.layer.{l}"
            write_tensor(f"{p}.attention.self.query.weight")
            write_tensor(f"{p}.attention.self.query.bias")
            write_tensor(f"{p}.attention.self.key.weight")
            write_tensor(f"{p}.attention.self.key.bias")
            write_tensor(f"{p}.attention.self.value.weight")
            write_tensor(f"{p}.attention.self.value.bias")
            write_tensor(f"{p}.attention.output.dense.weight")
            write_tensor(f"{p}.attention.output.dense.bias")
            write_tensor(f"{p}.attention.output.LayerNorm.weight")
            write_tensor(f"{p}.attention.output.LayerNorm.bias")
            write_tensor(f"{p}.intermediate.dense.weight")
            write_tensor(f"{p}.intermediate.dense.bias")
            write_tensor(f"{p}.output.dense.weight")
            write_tensor(f"{p}.output.dense.bias")
            write_tensor(f"{p}.output.LayerNorm.weight")
            write_tensor(f"{p}.output.LayerNorm.bias")

        write_tensor("pooler.dense.weight")
        write_tensor("pooler.dense.bias")

    size_mb = os.path.getsize(output_path) / 1024 / 1024
    print(f"Model saved to {output_path} ({size_mb:.1f} MB)")

if __name__ == "__main__":
    main()
"""
# ========================================================================
# Model Download
# ========================================================================

proc download_model*(output_path: string): bool =
  ## Download and convert the GTE-small model if not present.
  ## Uses 'uv' if available (PEP 723 inline script), otherwise falls back to pip.
  ## Returns true on success.

  # Create temp file for the Python script
  let (script_file, script_path) = createTempFile("gte_download_", ".py")
  script_file.write(DOWNLOAD_SCRIPT)
  script_file.close()
  defer: removeFile(script_path)

  # Try uv first (supports PEP 723 inline dependencies)
  let uv_check = execCmdEx("which uv")
  if uv_check.exitCode == 0:
    echo "Using uv to download model..."
    let res = execShellCmd("uv run " & script_path & " " & quoteShell(output_path))
    return res == 0

  # Fall back to pip + python
  echo "Using pip to install dependencies..."
  let pip_install = execShellCmd("pip install -q huggingface_hub safetensors 2>/dev/null || pip3 install -q huggingface_hub safetensors")
  if pip_install != 0:
    stderr.writeLine "Failed to install Python dependencies"
    return false

  # Try python3 first, then python
  let python_check = execCmdEx("which python3")
  let python_cmd = if python_check.exitCode == 0: "python3" else: "python"
  let res = execShellCmd(python_cmd & " " & script_path & " " & quoteShell(output_path))
  return res == 0
# ========================================================================
# Data Structures
# ========================================================================

type
  VocabEntry = object
    word: string
    id: int

  LayerWeights = object
    # Self-attention
    query_weight: seq[float32]        # [hidden_size, hidden_size]
    query_bias: seq[float32]          # [hidden_size]
    key_weight: seq[float32]          # [hidden_size, hidden_size]
    key_bias: seq[float32]            # [hidden_size]
    value_weight: seq[float32]        # [hidden_size, hidden_size]
    value_bias: seq[float32]          # [hidden_size]
    attn_output_weight: seq[float32]  # [hidden_size, hidden_size]
    attn_output_bias: seq[float32]    # [hidden_size]
    attn_ln_weight: seq[float32]      # [hidden_size]
    attn_ln_bias: seq[float32]        # [hidden_size]
    # Feed-forward network
    ffn_inter_weight: seq[float32]    # [intermediate_size, hidden_size]
    ffn_inter_bias: seq[float32]      # [intermediate_size]
    ffn_output_weight: seq[float32]   # [hidden_size, intermediate_size]
    ffn_output_bias: seq[float32]     # [hidden_size]
    ffn_ln_weight: seq[float32]       # [hidden_size]
    ffn_ln_bias: seq[float32]         # [hidden_size]

  GteCtx* = ref object
    # Config
    vocab_size: int
    hidden_size: int
    num_layers: int
    num_heads: int
    intermediate_size: int
    max_seq_len: int
    head_dim: int
    # Vocabulary
    vocab: seq[string]
    vocab_hash: Table[string, int]
    # Embeddings
    token_embeddings: seq[float32]      # [vocab_size, hidden_size]
    position_embeddings: seq[float32]   # [max_seq_len, hidden_size]
    token_type_embeddings: seq[float32] # [2, hidden_size]
    embed_ln_weight: seq[float32]       # [hidden_size]
    embed_ln_bias: seq[float32]         # [hidden_size]
    # Transformer layers
    layers: seq[LayerWeights]
    # Pooler (not used for embeddings but loaded)
    pooler_weight: seq[float32]         # [hidden_size, hidden_size]
    pooler_bias: seq[float32]           # [hidden_size]
    # Working memory for inference
    hidden_states: seq[float32]         # [max_seq_len, hidden_size]
    attn_scores: seq[float32]           # [num_heads, max_seq_len, max_seq_len]
    q_proj: seq[float32]                # [max_seq_len, hidden_size]
    k_proj: seq[float32]                # [max_seq_len, hidden_size]
    v_proj: seq[float32]                # [max_seq_len, hidden_size]
    attn_output: seq[float32]           # [max_seq_len, hidden_size]
    ffn_hidden: seq[float32]            # [max_seq_len, intermediate_size]
    temp_hidden: seq[float32]           # [max_seq_len, hidden_size]
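# Note: the working-memory buffers above are sized for max_seq_len and are
# allocated once in gte_load, so repeated gte_embed calls reuse the same
# scratch memory instead of reallocating it per call.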
# ========================================================================
# Matrix Operations
# ========================================================================

proc linear(y: var openArray[float32], x: openArray[float32],
            W: openArray[float32], b: openArray[float32],
            seq_len, in_dim, out_dim: int) =
  ## Matrix-vector multiplication with bias: y = x @ W^T + b
  for s in 0..<seq_len:
    for o in 0..<out_dim:
      var sum = if b.len > 0: b[o] else: 0.0f32
      for i in 0..<in_dim:
        sum += x[s * in_dim + i] * W[o * in_dim + i]
      y[s * out_dim + o] = sum
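# W is indexed as W[o * in_dim + i], i.e. stored row-major as [out_dim, in_dim],
# which matches the layout the download script writes for the dense weights, so
# y = x @ W^T + b needs no transpose at load time.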
proc layer_norm(out_buf: var openArray[float32], x: openArray[float32],
                gamma, beta: openArray[float32], seq_len, hidden_size: int) =
  ## Layer normalization
  for s in 0..<seq_len:
    let row_offset = s * hidden_size
    # Compute mean
    var mean = 0.0f32
    for i in 0..<hidden_size:
      mean += x[row_offset + i]
    mean /= hidden_size.float32
    # Compute variance
    var variance = 0.0f32
    for i in 0..<hidden_size:
      let diff = x[row_offset + i] - mean
      variance += diff * diff
    variance /= hidden_size.float32
    # Normalize and scale
    let std_inv = 1.0f32 / sqrt(variance + GTE_LAYER_NORM_EPS)
    for i in 0..<hidden_size:
      out_buf[row_offset + i] = gamma[i] * (x[row_offset + i] - mean) * std_inv + beta[i]

proc gelu(x: var openArray[float32], n: int) =
  ## GELU activation (approximate)
  for i in 0..<n:
    let val = x[i]
    x[i] = 0.5f32 * val * (1.0f32 + tanh(0.7978845608f32 * (val + 0.044715f32 * val * val * val)))
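# The constant 0.7978845608 is sqrt(2/pi); together with 0.044715 this is the
# standard tanh approximation of GELU(x) = x * Phi(x).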
proc softmax(x: var openArray[float32], start, n: int) =
  ## Softmax over slice [start..start+n)
  # Find max for numerical stability
  var max_val = x[start]
  for i in 1..<n:
    if x[start + i] > max_val:
      max_val = x[start + i]
  # Compute exp and sum
  var sum = 0.0f32
  for i in 0..<n:
    x[start + i] = exp(x[start + i] - max_val)
    sum += x[start + i]
  # Normalize
  let inv_sum = 1.0f32 / sum
  for i in 0..<n:
    x[start + i] *= inv_sum

proc l2_normalize(x: var openArray[float32], n: int) =
  ## L2 normalize in place
  var norm = 0.0f32
  for i in 0..<n:
    norm += x[i] * x[i]
  norm = sqrt(norm)
  if norm > 0.0f32:
    let inv_norm = 1.0f32 / norm
    for i in 0..<n:
      x[i] *= inv_norm

# ========================================================================
# Tokenizer
# ========================================================================

proc is_punctuation(c: char): bool =
  let code = ord(c)
  (code >= 33 and code <= 47) or (code >= 58 and code <= 64) or
    (code >= 91 and code <= 96) or (code >= 123 and code <= 126)

proc is_whitespace(c: char): bool =
  c == ' ' or c == '\t' or c == '\n' or c == '\r'

proc basic_tokenize(text: string): seq[string] =
  ## Basic tokenization: split on whitespace and punctuation, lowercase
  result = @[]
  var i = 0
  while i < text.len:
    # Skip whitespace
    while i < text.len and is_whitespace(text[i]):
      inc i
    if i >= text.len:
      break
    # Find end of token
    let start = i
    if is_punctuation(text[i]):
      inc i
    else:
      while i < text.len and not is_whitespace(text[i]) and not is_punctuation(text[i]):
        inc i
    # Create lowercase token
    result.add(text[start..<i].toLowerAscii())

proc wordpiece_tokenize(ctx: GteCtx, word: string): seq[int] =
  ## WordPiece tokenization of a single word
  if word.len == 0:
    return @[]
  result = @[]
  var start = 0
  while start < word.len:
    var end_pos = word.len
    var found_id = -1
    # Find longest matching subword
    while start < end_pos:
      var candidate: string
      if start > 0:
        candidate = "##" & word[start..<end_pos]
      else:
        candidate = word[start..<end_pos]
      if candidate in ctx.vocab_hash:
        found_id = ctx.vocab_hash[candidate]
        break
      dec end_pos
    if found_id < 0:
      result.add(TOKEN_UNK)
      inc start
    else:
      result.add(found_id)
      start = end_pos

proc tokenize(ctx: GteCtx, text: string, max_len: int): seq[int] =
  ## Full tokenization: text -> token IDs
  let basic_tokens = basic_tokenize(text)
  result = @[TOKEN_CLS]
  for token in basic_tokens:
    if result.len >= max_len - 1:
      break
    let subtokens = wordpiece_tokenize(ctx, token)
    for subtoken in subtokens:
      if result.len >= max_len - 1:
        break
      result.add(subtoken)
  result.add(TOKEN_SEP)
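# Tokenization sketch (illustrative only; the actual IDs depend on the model's
# vocabulary): "Hello, world!" is first split into the lowercased basic tokens
# ["hello", ",", "world", "!"]; each basic token is then greedily matched
# against the vocabulary (continuation pieces carry a "##" prefix, unmatched
# pieces become TOKEN_UNK), and the result is wrapped as [CLS] ... [SEP],
# truncated so the final sequence never exceeds max_len.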
# ========================================================================
# Transformer Forward Pass
# ========================================================================

proc self_attention(ctx: GteCtx, layer: LayerWeights, seq_len: int, attn_mask: seq[int]) =
  let hidden = ctx.hidden_size
  let heads = ctx.num_heads
  let head_dim = ctx.head_dim

  # Project Q, K, V
  linear(ctx.q_proj, ctx.hidden_states, layer.query_weight, layer.query_bias, seq_len, hidden, hidden)
  linear(ctx.k_proj, ctx.hidden_states, layer.key_weight, layer.key_bias, seq_len, hidden, hidden)
  linear(ctx.v_proj, ctx.hidden_states, layer.value_weight, layer.value_bias, seq_len, hidden, hidden)

  # Compute attention for each head
  let scale = 1.0f32 / sqrt(head_dim.float32)
  for h in 0..<heads:
    # Attention scores for this head: Q @ K^T / sqrt(d_k)
    for i in 0..<seq_len:
      for j in 0..<seq_len:
        var score = 0.0f32
        for d in 0..<head_dim:
          let q_idx = i * hidden + h * head_dim + d
          let k_idx = j * hidden + h * head_dim + d
          score += ctx.q_proj[q_idx] * ctx.k_proj[k_idx]
        score *= scale
        # Apply attention mask
        if attn_mask.len > 0 and attn_mask[j] == 0:
          score = -10000.0f32
        ctx.attn_scores[h * seq_len * seq_len + i * seq_len + j] = score
      # Softmax over keys
      softmax(ctx.attn_scores, h * seq_len * seq_len + i * seq_len, seq_len)
    # Weighted sum of values
    for i in 0..<seq_len:
      for d in 0..<head_dim:
        var sum = 0.0f32
        for j in 0..<seq_len:
          let attn = ctx.attn_scores[h * seq_len * seq_len + i * seq_len + j]
          let v_idx = j * hidden + h * head_dim + d
          sum += attn * ctx.v_proj[v_idx]
        ctx.attn_output[i * hidden + h * head_dim + d] = sum

  # Output projection
  linear(ctx.temp_hidden, ctx.attn_output, layer.attn_output_weight, layer.attn_output_bias, seq_len, hidden, hidden)

  # Residual connection and layer norm
  for i in 0..<seq_len * hidden:
    ctx.temp_hidden[i] += ctx.hidden_states[i]
  layer_norm(ctx.hidden_states, ctx.temp_hidden, layer.attn_ln_weight, layer.attn_ln_bias, seq_len, hidden)

proc feed_forward(ctx: GteCtx, layer: LayerWeights, seq_len: int) =
  let hidden = ctx.hidden_size
  let inter = ctx.intermediate_size

  # Intermediate layer
  linear(ctx.ffn_hidden, ctx.hidden_states, layer.ffn_inter_weight, layer.ffn_inter_bias, seq_len, hidden, inter)
  gelu(ctx.ffn_hidden, seq_len * inter)

  # Output layer
  linear(ctx.temp_hidden, ctx.ffn_hidden, layer.ffn_output_weight, layer.ffn_output_bias, seq_len, inter, hidden)

  # Residual connection and layer norm
  for i in 0..<seq_len * hidden:
    ctx.temp_hidden[i] += ctx.hidden_states[i]
  layer_norm(ctx.hidden_states, ctx.temp_hidden, layer.ffn_ln_weight, layer.ffn_ln_bias, seq_len, hidden)
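# Each encoder layer is the standard post-LayerNorm BERT block:
#   x = LayerNorm(x + SelfAttention(x)); x = LayerNorm(x + FFN(x))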
proc transformer_forward(ctx: GteCtx, token_ids: seq[int], seq_len: int, attn_mask: seq[int]) =
  let hidden = ctx.hidden_size

  # Compute embeddings
  for s in 0..<seq_len:
    let token_id = token_ids[s]
    for d in 0..<hidden:
      ctx.hidden_states[s * hidden + d] =
        ctx.token_embeddings[token_id * hidden + d] +
        ctx.position_embeddings[s * hidden + d] +
        ctx.token_type_embeddings[d]  # token_type = 0

  # Embedding layer norm
  layer_norm(ctx.hidden_states, ctx.hidden_states, ctx.embed_ln_weight, ctx.embed_ln_bias, seq_len, hidden)

  # Process each transformer layer
  for layer in ctx.layers:
    self_attention(ctx, layer, seq_len, attn_mask)
    feed_forward(ctx, layer, seq_len)

proc mean_pooling(output: var seq[float32], hidden_states: seq[float32],
                  attn_mask: seq[int], seq_len, hidden_size: int) =
  ## Mean pooling over non-padded tokens
  # Initialize output to zero
  for i in 0..<hidden_size:
    output[i] = 0.0f32
  # Sum up hidden states for non-padded tokens
  var count = 0
  for s in 0..<seq_len:
    if attn_mask[s] != 0:
      for d in 0..<hidden_size:
        output[d] += hidden_states[s * hidden_size + d]
      inc count
  # Average
  if count > 0:
    let inv_count = 1.0f32 / count.float32
    for d in 0..<hidden_size:
      output[d] *= inv_count
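# gte_embed (below) chains tokenize -> transformer_forward -> mean_pooling ->
# l2_normalize to turn raw text into a unit-length sentence vector.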
# ========================================================================
# Model Loading
# ========================================================================

proc read_uint32(f: File): int =
  var buf: array[4, uint8]
  if f.readBytes(buf, 0, 4) != 4:
    raise newException(IOError, "Failed to read uint32")
  result = int(buf[0]) or (int(buf[1]) shl 8) or (int(buf[2]) shl 16) or (int(buf[3]) shl 24)

proc read_uint16(f: File): int =
  var buf: array[2, uint8]
  if f.readBytes(buf, 0, 2) != 2:
    raise newException(IOError, "Failed to read uint16")
  result = int(buf[0]) or (int(buf[1]) shl 8)

proc read_floats(f: File, count: int): seq[float32] =
  result = newSeq[float32](count)
  let bytes = count * sizeof(float32)
  if f.readBuffer(addr result[0], bytes) != bytes:
    raise newException(IOError, "Failed to read floats")

proc gte_load*(model_path: string): GteCtx =
  ## Load model from .gtemodel file
  if not fileExists(model_path):
    stderr.writeLine "gte_load: cannot open ", model_path
    return nil
  var f = open(model_path, fmRead)
  defer: f.close()

  # Check magic
  var magic: array[4, char]
  if f.readChars(magic) != 4 or magic != ['G', 'T', 'E', '1']:
    stderr.writeLine "gte_load: invalid magic"
    return nil

  result = GteCtx()

  # Read config
  result.vocab_size = read_uint32(f)
  result.hidden_size = read_uint32(f)
  result.num_layers = read_uint32(f)
  result.num_heads = read_uint32(f)
  result.intermediate_size = read_uint32(f)
  result.max_seq_len = read_uint32(f)
  result.head_dim = result.hidden_size div result.num_heads

  # Read vocabulary
  result.vocab = newSeq[string](result.vocab_size)
  result.vocab_hash = initTable[string, int]()
  for i in 0..<result.vocab_size:
    let len = read_uint16(f)
    var word = newString(len)
    if f.readChars(word) != len:
      raise newException(IOError, "Failed to read vocab word")
    result.vocab[i] = word
    result.vocab_hash[word] = i

  # Read embeddings
  result.token_embeddings = read_floats(f, result.vocab_size * result.hidden_size)
  result.position_embeddings = read_floats(f, result.max_seq_len * result.hidden_size)
  result.token_type_embeddings = read_floats(f, 2 * result.hidden_size)
  result.embed_ln_weight = read_floats(f, result.hidden_size)
  result.embed_ln_bias = read_floats(f, result.hidden_size)

  # Read transformer layers
  result.layers = newSeq[LayerWeights](result.num_layers)
  for l in 0..<result.num_layers:
    result.layers[l].query_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].query_bias = read_floats(f, result.hidden_size)
    result.layers[l].key_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].key_bias = read_floats(f, result.hidden_size)
    result.layers[l].value_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].value_bias = read_floats(f, result.hidden_size)
    result.layers[l].attn_output_weight = read_floats(f, result.hidden_size * result.hidden_size)
    result.layers[l].attn_output_bias = read_floats(f, result.hidden_size)
    result.layers[l].attn_ln_weight = read_floats(f, result.hidden_size)
    result.layers[l].attn_ln_bias = read_floats(f, result.hidden_size)
    result.layers[l].ffn_inter_weight = read_floats(f, result.intermediate_size * result.hidden_size)
    result.layers[l].ffn_inter_bias = read_floats(f, result.intermediate_size)
    result.layers[l].ffn_output_weight = read_floats(f, result.hidden_size * result.intermediate_size)
    result.layers[l].ffn_output_bias = read_floats(f, result.hidden_size)
    result.layers[l].ffn_ln_weight = read_floats(f, result.hidden_size)
    result.layers[l].ffn_ln_bias = read_floats(f, result.hidden_size)

  # Read pooler (not used for embeddings)
  result.pooler_weight = read_floats(f, result.hidden_size * result.hidden_size)
  result.pooler_bias = read_floats(f, result.hidden_size)

  # Allocate working memory
  let max_seq = result.max_seq_len
  let hidden = result.hidden_size
  let inter = result.intermediate_size
  let heads = result.num_heads
  result.hidden_states = newSeq[float32](max_seq * hidden)
  result.attn_scores = newSeq[float32](heads * max_seq * max_seq)
  result.q_proj = newSeq[float32](max_seq * hidden)
  result.k_proj = newSeq[float32](max_seq * hidden)
  result.v_proj = newSeq[float32](max_seq * hidden)
  result.attn_output = newSeq[float32](max_seq * hidden)
  result.ffn_hidden = newSeq[float32](max_seq * inter)
  result.temp_hidden = newSeq[float32](max_seq * hidden)

# ========================================================================
# Public API
# ========================================================================

proc gte_embed*(ctx: GteCtx, text: string): seq[float32] =
  ## Generate embedding for a single text
  if ctx == nil:
    return @[]

  # Tokenize
  let token_ids = tokenize(ctx, text, ctx.max_seq_len)
  let num_tokens = token_ids.len

  # Create attention mask
  var attn_mask = newSeq[int](num_tokens)
  for i in 0..<num_tokens:
    attn_mask[i] = 1

  # Run transformer
  transformer_forward(ctx, token_ids, num_tokens, attn_mask)

  # Mean pooling
  result = newSeq[float32](ctx.hidden_size)
  mean_pooling(result, ctx.hidden_states, attn_mask, num_tokens, ctx.hidden_size)

  # L2 normalize
  l2_normalize(result, ctx.hidden_size)

proc gte_dim*(ctx: GteCtx): int =
  ## Get the embedding dimension (384 for GTE-small)
  if ctx != nil: ctx.hidden_size else: 0

proc gte_max_seq_len*(ctx: GteCtx): int =
  ## Get the maximum sequence length (512 for GTE-small)
  if ctx != nil: ctx.max_seq_len else: 0

proc gte_cosine_similarity*(a, b: openArray[float32], dim: int): float32 =
  ## Compute cosine similarity between two embeddings
  ## Assumes normalized vectors, so dot product = cosine similarity
  result = 0.0f32
  for i in 0..<dim:
    result += a[i] * b[i]
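# Illustrative use (variable names are just examples): for two vectors produced
# by gte_embed, gte_cosine_similarity(embA, embB, gte_dim(ctx)) returns a value
# in [-1, 1]; since gte_embed L2-normalizes its output, the plain dot product
# above already equals the full cosine similarity.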
# ========================================================================
# Test Program
# ========================================================================

proc print_embedding(emb: openArray[float32], dim, n: int) =
  stdout.write "["
  for i in 0..<min(n, dim):
    stdout.write emb[i].formatFloat(ffDecimal, 6)
    if i < n - 1:
      stdout.write ", "
  if n < dim:
    stdout.write ", ..."
  echo "]"

proc print_usage(prog: string) =
  echo "Usage: ", prog, " [OPTIONS] [SENTENCES...]"
  echo ""
  echo "Test GTE-small embedding model by computing embeddings and similarity matrix."
  echo "The model is automatically downloaded on first run if not present."
  echo ""
  echo "Options:"
  echo "  --model-path PATH   Path to .gtemodel file (default: ", DEFAULT_MODEL_PATH, ")"
  echo "  --help              Show this help message"
  echo ""
  echo "Arguments:"
  echo "  SENTENCES           One or more sentences to embed (quote each sentence)"
  echo "                      If none provided, uses built-in example sentences"
  echo ""
  echo "Examples:"
  echo "  ", prog
  echo "  ", prog, " \"Hello world\" \"Goodbye world\""
  echo "  ", prog, " --model-path my-model.gtemodel \"Test sentence\""

when isMainModule:
  var model_path = DEFAULT_MODEL_PATH
  var user_sentences: seq[string] = @[]

  # Parse arguments
  var p = initOptParser()
  while true:
    p.next()
    case p.kind
    of cmdEnd: break
    of cmdShortOption, cmdLongOption:
      case p.key
      of "help", "h":
        print_usage(getAppFilename())
        quit(0)
      of "model-path":
        model_path = p.val
      else:
        stderr.writeLine "Error: Unknown option '", p.key, "'"
        quit(1)
    of cmdArgument:
      user_sentences.add(p.key)

  # Default sentences if none provided
  let default_sentences = @[
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
    "Machine learning is transforming industries.",
    "I love programming in C."
  ]
  let sentences = if user_sentences.len > 0: user_sentences else: default_sentences

  # Auto-download model if not present
  if not fileExists(model_path):
    echo "Model not found at ", model_path
    echo "Attempting to download GTE-small model..."
    if not download_model(model_path):
      stderr.writeLine "Failed to download model. Please download manually."
      quit(1)

  # Load model
  echo "Loading model from ", model_path, "..."
  let start_load = cpuTime()
  let ctx = gte_load(model_path)
  if ctx == nil:
    stderr.writeLine "Failed to load model"
    quit(1)
  let load_time = cpuTime() - start_load
  echo "Model loaded in ", load_time.formatFloat(ffDecimal, 2), " seconds"
  echo "Embedding dimension: ", gte_dim(ctx)
  echo "Max sequence length: ", gte_max_seq_len(ctx)
  echo ""

  # Generate embeddings
  echo "Generating embeddings..."
  echo ""
  var embeddings: seq[seq[float32]] = @[]
  for i, sentence in sentences:
    let start_embed = cpuTime()
    let embedding = gte_embed(ctx, sentence)
    let embed_time = cpuTime() - start_embed
    embeddings.add(embedding)
    echo "S", i + 1, ": \"", sentence, "\""
    echo "  Time: ", (embed_time * 1000).formatFloat(ffDecimal, 3), " ms"
    stdout.write "  Embedding: "
    print_embedding(embedding, gte_dim(ctx), 5)
    echo ""

  # Compute similarity matrix
  echo "Cosine similarity matrix:"
  stdout.write "     "
  for i in 0..<sentences.len:
    stdout.write "  S", i + 1, "   "
  echo ""
  for i in 0..<sentences.len:
    stdout.write "S", i + 1, ": "
    for j in 0..<sentences.len:
      let sim = gte_cosine_similarity(embeddings[i], embeddings[j], gte_dim(ctx))
      stdout.write " ", sim.formatFloat(ffDecimal, 3), " "
    echo ""
  echo ""
  echo "Done!"