
GenAI Test File Summarisation In Chunks With Ollama Or Cloud

✨ Tiny toolset, big quality-of-life upgrade.

Sometimes you just want to throw a large file at a model and ask:

“Summarise this without losing the good bits.”

Then reality appears.

🫠 Local models often do not have giant context windows.

🌱 Smaller, cheaper, more eco-friendly cloud models also often do not have giant context windows.

So instead of pretending one huge file will fit cleanly, this little toolkit does the sensible thing:

  1. βœ‚οΈ split the file into overlapping chunks
  2. πŸ€– summarise each chunk with either Ollama or cloud models
  3. 🧡 stitch the chunk summaries back together

Simple. Reusable. No drama.
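For example, with a source file notes.md and the defaults, the whole loop is three commands (this assumes the chunks and summaries land in the current directory):

awk -v size=11000 -v step=10000 -f chunk_text.awk notes.md
uv run process_chunks_ollama.py . notes summary
cat summary_notes*.md > all_summary.md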

What’s in here? 📦

  • chunk_text.awk
  • process_chunks_cloud.py
  • process_chunks_ollama.py

Why chunk at all? 🧠

Because smaller models are not magic.

If your source is too big, they either:

  • miss details
  • flatten nuance
  • hallucinate structure
  • or just do a bad job

So chunk_text.awk creates overlapping chunks.

Default settings:

  • size: 11000 characters
  • step: 10000 characters
  • overlap: 1000 characters

That overlap is there on purpose so ideas near a chunk boundary do not get chopped in half and quietly vanish into the void. ☠️
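To make the overlap arithmetic concrete, here is a tiny standalone Python sketch (illustration only, not part of the toolkit) of the boundaries those defaults produce on a pretend 25,000-character file:

# Illustration only: reproduces the chunk boundaries for size=11000, step=10000.
# (0-indexed here; the awk script itself is 1-indexed.)
size, step = 11000, 10000
text_len = 25000  # pretend input length
pos = 0
while pos < text_len:
    end = min(pos + size, text_len)
    print(f"chunk covers characters {pos}..{end - 1}")
    pos += step
# chunk covers characters 0..10999
# chunk covers characters 10000..20999  <- 10000..10999 repeats: the 1000-char overlap
# chunk covers characters 20000..24999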

Chunk a file ✂️

awk -v size=11000 -v step=10000 -f chunk_text.awk notes.md

You’ll get files like:

  • notes00.md
  • notes01.md
  • notes02.md

Summarise with cloud models ☁️

uv run process_chunks_cloud.py example_chunks transcript summary

If you do not pass --model, the script checks for keys in this order:

  1. shell MISTRAL_API_KEY
  2. shell GROQ_API_KEY
  3. repo-root .env MISTRAL_API_KEY
  4. repo-root .env GROQ_API_KEY

Default provider/model choice:

  • if a Mistral key is available: magistral-small
  • otherwise, if a Groq key is available: llama-3.3-70b-versatile

Allowed cloud models in this version:

  • magistral-small
  • magistral-small-latest
  • llama-70b
  • llama-3.3-70b-versatile
  • llama-4-scout
  • meta-llama/llama-4-scout-17b-16e-instruct
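You can also pin a model explicitly with -m/--model, using either an alias or a full name:

uv run process_chunks_cloud.py example_chunks transcript summary -m llama-70b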

Summarise with Ollama 🦙

uv run process_chunks_ollama.py example_chunks transcript summary

Default local model:

  • gemma4:26b

It accepts chunk files named in either style:

  • transcript00.md
  • transcript00.log
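As with the cloud script, you can override the model with -m/--model (any model tag your local Ollama has pulled):

uv run process_chunks_ollama.py example_chunks transcript summary -m llama3.3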

Change the prompt 🛠️

Yes, obviously you can change the prompt.

That is the whole point.

Both summariser scripts support -p/--prompt.

Example:

uv run process_chunks_cloud.py example_chunks transcript summary -p "Summarise the argument, key evidence, and open questions."

Transcript-style prompt shape used in testing:

This is a transcript of an online video. The title is: ${title}. The transcript format is 'Speaker Name, timestamp, what they said'. Summarise the content in a terse, business-like, action-oriented way. Preserve substantive points, facts, figures, citations, and practical recommendations. Do not be chatty.
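The ${title} there is plain shell interpolation, so (assuming a POSIX shell) you can inject the title like this:

title="The Video Title"
uv run process_chunks_cloud.py example_chunks transcript summary -p "This is a transcript of an online video. The title is: ${title}. The transcript format is 'Speaker Name, timestamp, what they said'. Summarise the content in a terse, business-like, action-oriented way. Preserve substantive points, facts, figures, citations, and practical recommendations. Do not be chatty."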

Outputs 🧾

For each input chunk, the scripts write:

  • summary_transcript00.md
  • thinking_summary_transcript00.md

The thinking_ file keeps the raw model output.

The clean summary file strips thinking blocks where possible.

Stitch the summaries back together 🧡

cat summary_transcript*.md > all_summary.md

If there is a bit of overlap repetition, that is normal and harmless.

Environment variables 🔐

Shell env vars work directly:

export MISTRAL_API_KEY=...
uv run process_chunks_cloud.py example_chunks transcript summary

Or use a repo-root .env:

MISTRAL_API_KEY=...
GROQ_API_KEY=...

Want to add another provider later? 🔌

Update process_chunks_cloud.py in these places:

  • add the key lookup and its priority order
  • add the approved model names / aliases
  • add the provider request function
  • route the models to that provider in call_model

Happy chunking. Happy summarising. May your small models punch above their weight. 🚀

Transcript Summary Benchmarks

These results are anonymised. The source was one transcript split into overlapping chunks.

Input chunk set

| Chunk | Characters | Tokens |
| --- | ---: | ---: |
| video_transcript00.log | 11001 | 2053 |
| video_transcript01.log | 11001 | 2089 |
| video_transcript02.log | 11001 | 2030 |
| video_transcript03.log | 11001 | 2038 |
| video_transcript04.log | 11001 | 2005 |
| video_transcript05.log | 11001 | 2044 |
| video_transcript06.log | 11001 | 2059 |
| video_transcript07.log | 8524 | 1636 |

Token counts above are simple whitespace-token counts used for rough comparison.
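A quick way to reproduce that style of count (illustration only; the exact counting script is not included here):

# Rough whitespace-token count, in the style used for the table above.
from pathlib import Path
text = Path("video_transcript00.log").read_text(encoding="utf-8")
print(len(text), "characters,", len(text.split()), "whitespace tokens")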

Prompt shape:

This is a transcript of an online video. The title is: ${title}. The transcript format is 'Speaker Name, timestamp, what they said'. Summarise the content in a terse, business-like, action-oriented way. Preserve substantive points, facts, figures, citations, and practical recommendations. Do not be chatty.

Approved models

These were the models that graded A- or better in the transcript-summary tests.

| Model | Grade | Total time | Notes |
| --- | --- | --- | --- |
| magistral-small | A | 61.67s | most faithful, slower |
| llama-3.3-70b-versatile | A- | 14.16s | best speed/quality balance |
| llama-4-scout-17b-16e-instruct | A- | 8.31s | fastest, slightly thinner summaries |

Rejected models

These were tested and dropped for this use case.

| Model | Grade | Total time | Reason dropped |
| --- | --- | --- | --- |
| open-mistral-nemo | B- | 20.61s | more inference drift than desired |
| openai/gpt-oss-20b | B- | 11.08s | too willing to invent structure |
| openai/gpt-oss-120b | B- | 20.69s | rich but too hallucination-prone |
| qwen/qwen3-32b | C | 24.49s | leakage / drift / too verbose |

Practical takeaway

  • use magistral-small if fidelity matters most
  • use llama-3.3-70b-versatile if you want the best tradeoff
  • use llama-4-scout-17b-16e-instruct if raw speed matters most
#!/usr/bin/env awk -f
# Chunk a text file into overlapping blocks for RAG processing.
# Usage:   awk -v size=CHUNK_SIZE -v step=STEP_SIZE -f chunk_text.awk input_file.md
# Example: awk -v size=11000 -v step=10000 -f chunk_text.awk sources.md
# Purpose: when doing RAG or LLM summarisation, small local models cannot handle huge files;
# a small overlap avoids data loss and will not harm research or learning, e.g. creating flash cards.
BEGIN {
    # Set defaults if not provided via -v.
    # (FILENAME is not yet set in BEGIN, so the input file is reported in END.)
    if (size == "") size = 11000
    if (step == "") step = 10000
    print "📄 Starting text chunking..."
    print "   Chunk size: " size " characters"
    print "   Step size:  " step " characters"
    print "   Overlap:    " (size - step) " characters"
    print "========================================"
}
{
    text = text $0 "\n"
}
END {
    len = length(text)
    base = FILENAME
    # Strip only the extension - preserve any extension (.log, .md, .txt, etc)
    if (match(base, /\.[^.]+$/)) {
        ext = substr(base, RSTART)
        base = substr(base, 1, RSTART - 1)
    } else {
        ext = ""
    }
    print "📊 Processing " FILENAME " (" len " characters)"
    i = 0
    chunks_created = 0
    for (pos = 1; pos <= len; pos += step) {
        chunk = substr(text, pos, size)
        fname = sprintf("%s%02d%s", base, i, ext)
        print "   Creating chunk: " fname
        print chunk > fname
        close(fname)
        i++
        chunks_created++
    }
    print "✅ Completed"
    print "   Created " chunks_created " chunks"
    print "   Files: " base "00" ext " to " sprintf("%s%02d%s", base, i - 1, ext)
    print "   Total characters processed: " len
}
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = []
# ///
# Cloud-hosted transcript summarization via OpenAI-compatible chat completions.
# Only the transcript-summary models that graded A- or better are allowed.
"""
Process text chunks and generate summaries via cloud APIs.

Provider selection order:
  1. Shell environment variables already present in the process
  2. `.env` at repo root as a fallback

Current provider priority for the default model:
  1. Mistral (`MISTRAL_API_KEY`) -> `magistral-small`
  2. Groq (`GROQ_API_KEY`) -> `llama-3.3-70b-versatile`

To add another provider later:
  - add its key name to `SHELL_FIRST_PROVIDER_KEYS`
  - add a default model for that provider
  - update `ensure_provider_key` and `call_model`
  - insert it into the priority order where you want it checked

Usage:
  uv run scripts/process_chunks_cloud.py <folder> <input_prefix> <output_prefix>

Example:
  uv run scripts/process_chunks_cloud.py LiveSessionsPlayback/Course1_Module6_Class transcript summary
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path

GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
MISTRAL_API_URL = "https://api.mistral.ai/v1/chat/completions"
DEFAULT_MODEL = "magistral-small"
DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile"

MODEL_ALIASES = {
    # Approved Mistral option
    "magistral-small": "magistral-small-latest",
    # Approved Groq options
    "llama-70b": "llama-3.3-70b-versatile",
    "llama-4-scout": "meta-llama/llama-4-scout-17b-16e-instruct",
}
APPROVED_MISTRAL_MODELS = {
    "magistral-small-latest",
}
APPROVED_GROQ_MODELS = {
    "meta-llama/llama-4-scout-17b-16e-instruct",
    "llama-3.3-70b-versatile",
}
APPROVED_MODELS = APPROVED_MISTRAL_MODELS | APPROVED_GROQ_MODELS
SHELL_FIRST_PROVIDER_KEYS = ("MISTRAL_API_KEY", "GROQ_API_KEY")


def strip_ansi(text: str) -> str:
    """Remove ANSI escape sequences from text."""
    ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
    return ansi_escape.sub("", text)


def read_dotenv() -> dict[str, str]:
    """Read repo-root .env without overwriting existing shell environment."""
    env_path = Path(__file__).resolve().parent.parent / ".env"
    if not env_path.exists():
        return {}
    try:
        values: dict[str, str] = {}
        for raw in env_path.read_text(encoding="utf-8").splitlines():
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            if "=" not in line:
                continue
            key, value = line.split("=", 1)
            key = key.strip()
            value = value.strip().strip('"').strip("'")
            if key:
                values[key] = value
        return values
    except Exception:
        # Don't fail hard on malformed .env; the call path will emit a useful error.
        return {}


def select_default_model(dotenv_values: dict[str, str]) -> str:
    """Choose the default model from available provider keys.

    Order is shell env first, then `.env`, and Mistral before Groq.
    """
    if os.environ.get("MISTRAL_API_KEY"):
        return DEFAULT_MODEL
    if os.environ.get("GROQ_API_KEY"):
        return DEFAULT_GROQ_MODEL
    if dotenv_values.get("MISTRAL_API_KEY"):
        return DEFAULT_MODEL
    if dotenv_values.get("GROQ_API_KEY"):
        return DEFAULT_GROQ_MODEL
    raise RuntimeError(
        "No supported cloud API key found. Set MISTRAL_API_KEY or GROQ_API_KEY "
        "in the shell environment or repo-root .env."
    )


def ensure_provider_key(model: str, dotenv_values: dict[str, str]) -> None:
    """Ensure the provider key for the chosen model is available in the env."""
    if model in APPROVED_MISTRAL_MODELS:
        if os.environ.get("MISTRAL_API_KEY"):
            return
        if dotenv_values.get("MISTRAL_API_KEY"):
            os.environ["MISTRAL_API_KEY"] = dotenv_values["MISTRAL_API_KEY"]
            return
        raise RuntimeError(
            "MISTRAL_API_KEY is required for this model. Set it in the shell or .env."
        )
    if model in APPROVED_GROQ_MODELS:
        if os.environ.get("GROQ_API_KEY"):
            return
        if dotenv_values.get("GROQ_API_KEY"):
            os.environ["GROQ_API_KEY"] = dotenv_values["GROQ_API_KEY"]
            return
        raise RuntimeError(
            "GROQ_API_KEY is required for this model. Set it in the shell or .env."
        )


def strip_thinking_blocks(text: str) -> str:
    """Remove Thinking blocks if present."""
    lines = text.split("\n")
    result = []
    skip_until_done = False
    for line in lines:
        if "Thinking..." in line:
            skip_until_done = True
        elif "...done thinking." in line:
            skip_until_done = False
        elif not skip_until_done:
            result.append(line)
    return "\n".join(result)


def call_groq(prompt: str, model: str) -> str:
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("GROQ_API_KEY environment variable is not set")
    body = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "stream": False,
    }
    req = urllib.request.Request(
        GROQ_API_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
            "User-Agent": "process_chunks_cloud/1.0",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        detail = e.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Groq API error {e.code}: {detail}") from e
    except urllib.error.URLError as e:
        raise RuntimeError(f"Could not reach Groq API: {e}") from e
    try:
        return payload["choices"][0]["message"]["content"]
    except Exception as e:
        raise RuntimeError(f"Unexpected Groq API response format: {payload}") from e


def call_mistral(prompt: str, model: str) -> str:
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY environment variable is not set")
    body = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "stream": False,
    }
    req = urllib.request.Request(
        MISTRAL_API_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
            "User-Agent": "process_chunks_cloud/1.0",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        detail = e.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Mistral API error {e.code}: {detail}") from e
    except urllib.error.URLError as e:
        raise RuntimeError(f"Could not reach Mistral API: {e}") from e
    try:
        content = payload["choices"][0]["message"]["content"]
    except Exception as e:
        raise RuntimeError(f"Unexpected Mistral API response format: {payload}") from e
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Magistral can return a list of text/thinking parts; flatten them.
        parts = []
        for item in content:
            if not isinstance(item, dict):
                continue
            if item.get("type") == "text":
                text = item.get("text", "")
                if text:
                    parts.append(text)
            elif item.get("type") == "thinking":
                thinking_parts = []
                for block in item.get("thinking", []):
                    if isinstance(block, dict) and block.get("type") == "text":
                        text = block.get("text", "")
                        if text:
                            thinking_parts.append(text)
                if thinking_parts:
                    parts.append("Thinking...\n" + "\n".join(thinking_parts) + "\n...done thinking.")
        return "\n\n".join(parts)
    raise RuntimeError(f"Unexpected Mistral content format: {content!r}")


def resolve_model(model: str) -> str:
    resolved = MODEL_ALIASES.get(model, model)
    if resolved not in APPROVED_MODELS:
        allowed = ", ".join(sorted([*MODEL_ALIASES.keys(), *APPROVED_MODELS]))
        raise RuntimeError(
            "Unsupported model for transcript summarisation. "
            f"Allowed models: {allowed}"
        )
    return resolved


def call_model(prompt: str, model: str) -> str:
    resolved = resolve_model(model)
    if resolved in APPROVED_MISTRAL_MODELS:
        return call_mistral(prompt, resolved)
    return call_groq(prompt, resolved)


def process_chunk_file(chunk_file: Path, output_file: Path, thinking_file: Path, model: str, prompt_prefix: str) -> None:
    print(f"📄 Processing: {chunk_file.name} → {output_file.name}")
    started_at = time.perf_counter()
    file_size = chunk_file.stat().st_size
    file_lines = sum(1 for _ in open(chunk_file, "r", encoding="utf-8"))
    print(f"   Input: {file_size} bytes ({file_lines} lines)")
    chunk_text = chunk_file.read_text(encoding="utf-8")
    prompt = f"{prompt_prefix}\n\n{chunk_text}"
    resolved_model = resolve_model(model)
    print(f"🤖 Running cloud model: {resolved_model}")
    output = call_model(prompt, model)
    raw_output = output
    clean_output = strip_thinking_blocks(strip_ansi(raw_output))
    with open(thinking_file, "w", encoding="utf-8") as f:
        f.write(raw_output)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(clean_output)
    clean_size = output_file.stat().st_size
    clean_lines = len(clean_output.split("\n"))
    elapsed = time.perf_counter() - started_at
    print(f"✅ Completed in {elapsed:.2f} seconds")
    print(f"   Output: {clean_size} bytes ({clean_lines} lines)")
    print(f"   Clean: {output_file}")
    print(f"   Raw: {thinking_file}")


def main() -> None:
    dotenv_values = read_dotenv()
    parser = argparse.ArgumentParser(
        description="Process text chunks with cloud APIs and create clean outputs",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("directory", help="Directory containing chunk files")
    parser.add_argument(
        "input_prefix",
        help='Prefix of input chunk files (e.g. "transcript" for transcript00.md)',
    )
    parser.add_argument(
        "output_prefix",
        help='Prefix for output files (e.g. "summary" creates summary_transcript00.md)',
    )
    parser.add_argument(
        "-m",
        "--model",
        default=None,
        help=(
            "Cloud model to use. If omitted, the script checks shell env first, then .env, "
            "preferring Mistral over Groq. "
            f"Default Mistral model: {DEFAULT_MODEL}. Default Groq model: {DEFAULT_GROQ_MODEL}. "
            "Allowed models only: magistral-small, magistral-small-latest, "
            "llama-70b, llama-3.3-70b-versatile, llama-4-scout, "
            "meta-llama/llama-4-scout-17b-16e-instruct"
        ),
    )
    parser.add_argument(
        "-p",
        "--prompt",
        default="Summarise this file:",
        help='Prompt prefix for the model (default: "Summarise this file:")',
    )
    args = parser.parse_args()
    selected_model = args.model or select_default_model(dotenv_values)
    resolved_model = resolve_model(selected_model)
    ensure_provider_key(resolved_model, dotenv_values)
    folder = Path(args.directory)
    if not folder.is_dir():
        print(f"❌ Error: Directory '{folder}' does not exist")
        sys.exit(1)
    print("🚀 Starting chunk processing")
    print(f"   Directory: {folder}")
    print(f"   Input prefix: {args.input_prefix}")
    print(f"   Output prefix: {args.output_prefix}")
    print(f"   Model: {selected_model}")
    print(f"   Resolved model: {resolved_model}")
    print(f"   Prompt: '{args.prompt}'")
    print("========================================")
    # Find all chunk files, auto-detecting the extension
    chunk_files = []
    for ext in (".md", ".log", ".txt"):
        chunk_files = sorted(folder.glob(f"{args.input_prefix}[0-9][0-9]{ext}"))
        if chunk_files:
            break
    if not chunk_files:
        # Last resort: any extension
        chunk_files = sorted(folder.glob(f"{args.input_prefix}[0-9][0-9].*"))
    total_files = len(chunk_files)
    if total_files == 0:
        print(f"⚠️ No chunk files found with prefix '{args.input_prefix}'")
        return
    print(f"📊 Found {total_files} chunk files to process")
    for i, chunk_file in enumerate(chunk_files, 1):
        print(f"\nProcessing file {i}/{total_files}")
        chunk_num = chunk_file.name[len(args.input_prefix) : len(args.input_prefix) + 2]
        output_file = folder / f"{args.output_prefix}_{args.input_prefix}{chunk_num}.md"
        thinking_file = folder / f"thinking_{args.output_prefix}_{args.input_prefix}{chunk_num}.md"
        process_chunk_file(
            chunk_file,
            output_file,
            thinking_file,
            selected_model,
            args.prompt,
        )
    print("\n🎉 Processing complete!")
    print(f"   Clean files: {args.output_prefix}_{args.input_prefix}00.md to {args.output_prefix}_{args.input_prefix}{(i - 1):02d}.md")
    print(f"   Raw files: thinking_{args.output_prefix}_{args.input_prefix}00.md to thinking_{args.output_prefix}_{args.input_prefix}{(i - 1):02d}.md")


if __name__ == "__main__":
    main()
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "ollama>=0.1.0",
# ]
# ///
# As at 2026-04-19 this script works well with this Ollama model: gemma4:26b
# The idea is that you take some massive text and split it up into blocks named "something00.md",
# where the 00 are block numbers, e.g. 'source00.md source01.md source02.md'. This script needs the
# directory, the prefix of the numbered chunks, and the file prefix of the summary files to create:
#   ./scripts/process_chunks_ollama.py some/folder source summary
# or: uv run scripts/process_chunks_ollama.py some/folder source summary
# It will then output files with the raw thinking in them, plus a filtered version without the thinking:
#   summary00.md summary01.md summary02.md thinking_summary00.md thinking_summary01.md thinking_summary02.md
# If your chunking breaks up a sentence then the idea in that sentence may not appear in either chunk, so you
# should create overlapping chunks, say 11k chars in each chunk with a 1k overlap.
# You can join the files back into one file with "cat summary*.md > all.md", which may include overlaps. Yet
# this script is not meant to create files to publish but files to ask an AI about, e.g. "what in the course
# material addresses the question xxxx?", which should handle the max ~10% repetition due to chunk overlaps
# without any issues.
import os
import sys
import re
import argparse
import time
from pathlib import Path

import ollama

try:
    # When running inside the repo
    REPO_ROOT = Path(__file__).resolve().parent.parent
    if str(REPO_ROOT) not in sys.path:
        sys.path.insert(0, str(REPO_ROOT))
    from scripts.common import DEFAULT_MODEL
except (ImportError, ModuleNotFoundError):
    # Standalone / gist usage
    DEFAULT_MODEL = "gemma4:26b"


def strip_ansi(text):
    """Remove ANSI escape sequences from text"""
    ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
    return ansi_escape.sub("", text)


def strip_thinking_blocks(text):
    """Remove 'Thinking...' blocks from text"""
    lines = text.split("\n")
    result = []
    skip_until_done = False
    for line in lines:
        if "Thinking..." in line:
            skip_until_done = True
        elif "...done thinking." in line:
            skip_until_done = False
        elif not skip_until_done:
            result.append(line)
    return "\n".join(result)


def process_chunk_file(chunk_file, output_file, thinking_file, model, prompt_prefix):
    """Process a single chunk file with Ollama and create clean output"""
    print(f"📄 Processing: {chunk_file.name} → {output_file.name}")
    started_at = time.perf_counter()
    # Get file stats
    file_size = chunk_file.stat().st_size
    file_lines = sum(1 for _ in open(chunk_file, "r"))
    print(f"   Input: {file_size} bytes ({file_lines} lines)")
    # Build the prompt directly and use the Ollama API rather than the CLI.
    # `ollama run` is an interactive terminal UI and pollutes captured output
    # with cursor movement, spinner frames, and other control characters.
    chunk_text = chunk_file.read_text()
    prompt = f"{prompt_prefix}\n\n{chunk_text}"
    print(f"🤖 Running Ollama API: {model}")
    # Process output from the structured API stream instead of terminal stdout.
    raw_output = []
    clean_output_lines = []
    stream = ollama.generate(
        model=model,
        prompt=prompt,
        stream=True,
        keep_alive="5m",
    )
    for chunk in stream:
        if isinstance(chunk, dict):
            text = chunk.get("response", "")
        else:
            text = getattr(chunk, "response", "")
        if text:
            raw_output.append(text)
            clean_output_lines.append(strip_ansi(text))
    # Join all lines and strip thinking blocks
    clean_output_text = "".join(clean_output_lines)
    clean_output_text = strip_thinking_blocks(clean_output_text)
    # Write outputs
    with open(thinking_file, "w") as f:
        f.write("".join(raw_output))
    with open(output_file, "w") as f:
        f.write(clean_output_text)
    # Get output stats
    clean_size = output_file.stat().st_size
    clean_lines = len(clean_output_text.split("\n"))
    elapsed = time.perf_counter() - started_at
    print(f"✅ Completed in {elapsed:.2f} seconds")
    print(f"   Output: {clean_size} bytes ({clean_lines} lines)")
    print(f"   Clean: {output_file}")
    print(f"   Raw: {thinking_file}")


def main():
    parser = argparse.ArgumentParser(
        description="Process text chunks with Ollama and create clean outputs",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("directory", help="Directory containing chunk files")
    parser.add_argument(
        "input_prefix",
        help='Prefix of input chunk files (e.g., "sources" for sources00.md)',
    )
    parser.add_argument(
        "output_prefix",
        help='Prefix for output files (e.g., "summary" creates summary_sources00.md)',
    )
    parser.add_argument(
        "-m",
        "--model",
        default=DEFAULT_MODEL,
        help=f"Ollama model to use (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "-p",
        "--prompt",
        default="Summarise this file:",
        help='Prompt prefix for Ollama (default: "Summarise this file:")',
    )
    args = parser.parse_args()
    # Validate directory
    folder = Path(args.directory)
    if not folder.is_dir():
        print(f"❌ Error: Directory '{folder}' does not exist")
        sys.exit(1)
    print("🚀 Starting chunk processing...")
    print(f"   Directory: {folder}")
    print(f"   Input prefix: {args.input_prefix}")
    print(f"   Output prefix: {args.output_prefix}")
    print(f"   Model: {args.model}")
    print(f"   Prompt: '{args.prompt}'")
    print("========================================")
    # Find all chunk files, auto-detecting the extension
    chunk_files = []
    for ext in (".md", ".log", ".txt"):
        chunk_files = sorted(folder.glob(f"{args.input_prefix}[0-9][0-9]{ext}"))
        if chunk_files:
            break
    if not chunk_files:
        # Last resort: any extension
        chunk_files = sorted(folder.glob(f"{args.input_prefix}[0-9][0-9].*"))
    total_files = len(chunk_files)
    if total_files == 0:
        print(f"⚠️ No chunk files found with prefix '{args.input_prefix}'")
        return
    print(f"📊 Found {total_files} chunk files to process")
    # Process each chunk
    for i, chunk_file in enumerate(chunk_files, 1):
        print(f"\nProcessing file {i}/{total_files}")
        # Extract chunk number from filename
        chunk_num = chunk_file.name[len(args.input_prefix) : len(args.input_prefix) + 2]
        # Create output filenames: output_prefix_input_prefixXX.md
        output_filename = f"{args.output_prefix}_{args.input_prefix}{chunk_num}.md"
        thinking_filename = (
            f"thinking_{args.output_prefix}_{args.input_prefix}{chunk_num}.md"
        )
        output_file = folder / output_filename
        thinking_file = folder / thinking_filename
        process_chunk_file(
            chunk_file, output_file, thinking_file, args.model, args.prompt
        )
    print("\n🎉 Processing complete!")
    print(f"   Processed {total_files} files")
    print(
        f"   Clean files: {args.output_prefix}_{args.input_prefix}00.md to {args.output_prefix}_{args.input_prefix}{(i - 1):02d}.md"
    )
    print(
        f"   Raw files: thinking_{args.output_prefix}_{args.input_prefix}00.md to thinking_{args.output_prefix}_{args.input_prefix}{(i - 1):02d}.md"
    )


if __name__ == "__main__":
    main()