@synap5e
Created October 4, 2025 01:35
image caption
uv run python .\caption.py --api-key ... --model gemini-2.5-pro --api google-genai --prompt-file .\prompt.md --workers 32 . .
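
The script expects each image to sit next to a same-named .tags file in the input directory; captions are written to the output directory. A sketch of the layout, with hypothetical names:

    input/img001.png
    input/img001.tags    (tagger output; appended to the prompt)
    output/img001.txt    (written caption)
    output/all.txt       (running concatenation of all captions)

caption.py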
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import base64
import mimetypes
import os
import signal
import sys
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
from pathlib import Path
from typing import Optional, Tuple

import httpx
from google import genai
from google.genai import types
from loguru import logger
from tqdm import tqdm

# Global cost tracking
total_cost = 0.0
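# NOTE: with ProcessPoolExecutor each worker process gets its own copy of this
# module, so the running total only reflects costs seen by the current process;
# the parallel path does not aggregate costs across workers.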


# Signal handler for graceful shutdown
def signal_handler(signum, frame):
    logger.info("Received interrupt signal. Shutting down gracefully...")
    sys.exit(0)


# Set up signal handlers
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)


def encode_image_to_data_url(path: Path, mime: str | None = None) -> str:
    """Read a file and return a data:<mime>;base64,<data> URL for OpenAI-compatible APIs.

    Uses mimetypes to guess the type and falls back to application/octet-stream.
    """
    content = path.read_bytes()
    b64 = base64.b64encode(content).decode("utf-8")
    if not mime:
        mime, _ = mimetypes.guess_type(str(path))
    if not mime:
        mime = "application/octet-stream"
    return f"data:{mime};base64,{b64}"


def build_payload(model: str, prompt: str, image_data_url: str) -> dict:
    # Per OpenRouter multimodal image docs: send the text prompt first, then the image.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_data_url}},
            ],
        }
    ]
    return {
        "model": model,
        "messages": messages,
        # "stream": False,
        # "temperature": 0.7,
        # "max_tokens": 8000,
        # "top_p": 1,
        # "frequency_penalty": 0,
        # "presence_penalty": 0,
        # "cache_control": {
        #     "enabled": True,
        #     "ttl": "5m"
        # },
    }
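
# For reference, a request produced by build_payload() looks roughly like this
# (illustrative values only):
# {
#   "model": "gemini-2.5-pro",
#   "messages": [
#     {"role": "user", "content": [
#       {"type": "text", "text": "<prompt + tags>"},
#       {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
#     ]}
#   ]
# }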


def call_api(url: str, api_key: str, payload: dict, timeout: int = 300) -> httpx.Response:
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    response = httpx.post(url, headers=headers, json=payload, timeout=timeout)
    return response


def transcribe_image_api(
    url: str,
    api_key: str,
    model: str,
    prompt: str,
    image_path: Path,
    max_retries: int = 5,
    backoff_base: float = 1.5,
    backoff_max: float = 60.0,
):
    # Let encode_image_to_data_url guess the real mime type from the filename.
    data_url = encode_image_to_data_url(image_path)
    payload = build_payload(model=model, prompt=prompt, image_data_url=data_url)
    attempt = 0
    while True:
        attempt += 1
        try:
            resp = call_api(url, api_key=api_key, payload=payload, timeout=300)
        except httpx.RequestError as exc:
            logger.warning(f"Request failed for {image_path.name} (attempt {attempt}): {exc}")
            if attempt >= max_retries:
                raise
            sleep = min(backoff_max, backoff_base ** attempt)
            time.sleep(sleep)
            continue
        # Retry on 429/5xx
        if resp.status_code in (429, 500, 502, 503, 504):
            logger.warning(f"Transient status {resp.status_code} for {image_path.name} (attempt {attempt})")
            if attempt >= max_retries:
                logger.error(f"Max retries reached for {image_path.name}: {resp.text[:200]}")
                resp.raise_for_status()
            sleep = min(backoff_max, backoff_base ** attempt)
            time.sleep(sleep)
            continue
        # For other non-success codes, raise to let the caller handle or log.
        # httpx.Response does not have an `ok` attribute (unlike requests),
        # so treat any non-2xx status as an error here.
        if resp.status_code < 200 or resp.status_code >= 300:
            logger.error(f"Unexpected HTTP {resp.status_code} for {image_path.name}: {resp.text[:500]}")
            resp.raise_for_status()
        data = resp.json()
        # Log usage and cost information
        try:
            if 'usage' in data:
                usage = data['usage']
                prompt_tokens = usage.get('prompt_tokens', 0)
                completion_tokens = usage.get('completion_tokens', 0)
                total_tokens = usage.get('total_tokens', 0)
                # Check for cached tokens
                prompt_details = usage.get('prompt_tokens_details', {})
                cached_tokens = prompt_details.get('cached_tokens', 0)
                non_cached_tokens = prompt_tokens - cached_tokens
                logger.info(f"Usage for {image_path.name}: {total_tokens} total tokens ({prompt_tokens} prompt + {completion_tokens} completion)")
                logger.info(f"Cache status: {cached_tokens} cached, {non_cached_tokens} non-cached tokens")
                # Log cost if available
                if 'nanoGPT' in data and 'cost' in data['nanoGPT']:
                    cost = data['nanoGPT']['cost']
                    global total_cost
                    total_cost += cost
                    logger.info(f"Cost for {image_path.name}: ${cost:.6f} (Total: ${total_cost:.6f})")
        except (KeyError, TypeError) as e:
            logger.debug(f"No usage information available for {image_path.name}: {e}")
        # Log reasoning context if available
        try:
            if 'choices' in data and len(data['choices']) > 0:
                choice = data['choices'][0]
                if 'message' in choice and 'reasoning_context' in choice['message']:
                    reasoning_context = choice['message']['reasoning_context']
                    logger.info(f"Reasoning context for {image_path.name}: {reasoning_context}")
        except (KeyError, IndexError, TypeError) as e:
            logger.debug(f"No reasoning context available for {image_path.name}: {e}")
        # finish_reason lives on the choice object, not on the message.
        finish_reason = data['choices'][0].get('finish_reason')
        if finish_reason != 'stop':
            raise Exception(f"Unexpected finish reason {finish_reason} for {image_path.name}: {data['choices'][0]['message']['content'][:500]}")
        content = data['choices'][0]['message']['content']
        if len(content.split(' ')) < 10:
            raise Exception(f"Unexpectedly short response for {image_path.name}: {content[:500]}")
        logger.info(f"Response for {image_path.name}: {content[:500]}")
        return content


def transcribe_image_gemini(
    api_key: str,
    model: str,
    prompt: str,
    image_path: Path,
    max_retries: int = 5,
    backoff_base: float = 1.5,
    backoff_max: float = 60.0,
    include_thoughts: bool = False,
):
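    # NOTE: max_retries/backoff_* are accepted for interface parity with
    # transcribe_image_api, but retries are not implemented on this path;
    # a failed Gemini call simply raises.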
    with open(image_path, 'rb') as f:
        image_bytes = f.read()
    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model=model,
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                # Guess the real mime type from the filename; the script feeds
                # .png images, so default to image/png rather than image/jpeg.
                mime_type=mimetypes.guess_type(str(image_path))[0] or 'image/png',
            ),
            prompt
        ],
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                include_thoughts=include_thoughts
            )
        )
    )
    if response.candidates[0].finish_reason != types.FinishReason.STOP:
        raise Exception(f"Unexpected finish reason {response.candidates[0].finish_reason} for {image_path.name}: {response.candidates[0].content.parts[0].text[:500]}")
    text = thoughts = None
    for part in response.candidates[0].content.parts:
        is_thought = getattr(part, 'thought', False)
        if is_thought:
            thoughts = part.text
        else:
            text = part.text
    if not text:
        raise Exception(f"No text found for {image_path.name} in {response.candidates[0].content.parts}")
    if len(text.split(' ')) < 10:
        raise Exception(f"Unexpectedly short response for {image_path.name}: {text[:500]}")
    if thoughts:
        logger.info(f"Thoughts for {image_path.name}: {thoughts}")
    logger.info(f"Response for {image_path.name}: {text}")
    return text


def process_single_image(
    api: str,
    tags_file: Path,
    output_dir: Path,
    api_key: str,
    model: str,
    prompt: str,
    max_retries: int = 5,
    include_thoughts: bool = False,
) -> Tuple[Path, Path, str]:
    """
    Process a single image file and return the result.
    This function is designed to be called by a process pool.

    Returns:
        Tuple of (tags_file, txt_path, out_text)
    """
    base = output_dir / tags_file.with_suffix("").name
    txt_path = base.with_suffix(".txt")
    tags = tags_file.read_text()
    image_file = tags_file.with_suffix(".png")
    if api == 'google-genai':
        result = transcribe_image_gemini(
            api_key=api_key,
            model=model,
            prompt=prompt + tags,
            image_path=image_file,
            max_retries=max_retries,
            include_thoughts=include_thoughts,
        )
    else:
        result = transcribe_image_api(
            url=api,
            api_key=api_key,
            model=model,
            prompt=prompt + tags,
            image_path=image_file,
            max_retries=max_retries,
        )
    return tags_file, txt_path, result


def check_path(path: str) -> Path:
    path = Path(path)
    if not path.exists():
        raise argparse.ArgumentTypeError(f"File does not exist: {path}")
    return path


def check_dir(path: str) -> Path:
    path = Path(path)
    if not path.exists():
        raise argparse.ArgumentTypeError(f"Directory does not exist: {path}")
    if not path.is_dir():
        raise argparse.ArgumentTypeError(f"Path is not a directory: {path}")
    return path


def main(argv: Optional[list[str]] = None) -> int:
    parser = argparse.ArgumentParser(description="Caption images using the image and a .tags file of image tags.")
    parser.add_argument("input_dir", help="Directory with images to transcribe", type=check_dir)
    parser.add_argument("output_dir", help="Directory to write transcripts and metadata", type=check_dir)
    parser.add_argument("--api", default="https://nano-gpt.com/api/v1/chat/completions",
                        help="Chat-completions endpoint URL, or 'google-genai' to use the Gemini SDK")
    parser.add_argument("-k", "--api-key", help="API key (or set API_KEY env var)", default=os.getenv("API_KEY"))
    parser.add_argument("-m", "--model", default=os.getenv("MODEL", "gemini-2.5-flash-image-preview:free"), help="Multimodal model to use")
    parser.add_argument("-p", "--prompt-file", help="File with prompt to use", type=check_path)
    parser.add_argument("-r", "--max-retries", type=int, default=5, help="Maximum retry attempts for transient errors")
    parser.add_argument("-w", "--workers", type=int, default=cpu_count(), help="Number of parallel workers (default: CPU count)")
    parser.add_argument("-d", "--delay", type=int, default=10, help="Delay in seconds between sequential calls (to stay under free-tier rate limits)")
    parser.add_argument(
        "-o", "--overwrite-existing",
        action="store_true",
        help="Re-process images that already have a .txt transcript in the output directory (default is to skip them)",
    )
    parser.add_argument("-T", "--include-thoughts", action="store_true", help="Include thoughts in the response")
    args = parser.parse_args(argv)

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    tags = list(input_dir.glob('*.tags'))
    prompt = args.prompt_file.read_text()
    if args.overwrite_existing:
        queue = tags
    else:
        # Compare transcript names in the output directory; t.with_suffix(".txt")
        # keeps the input path's directory components, so use .name here.
        queue = [t for t in tags if not (output_dir / t.with_suffix(".txt").name).exists()]
    if queue and len(queue) == len(tags) and not args.overwrite_existing:
        # Nothing was skipped: either this is a fresh run or the skip filter is broken.
        logger.warning("No existing transcripts were skipped - fresh run, or the skip filter is broken")
    logger.info(f"{len(queue)} of {len(tags)} image(s) in {input_dir} remain to transcribe")
    if args.workers == 1:
        # Sequential processing for single worker
        logger.info("Using sequential processing (1 worker)")
        for tags_file in tqdm(queue, desc="Transcribing images", unit="img"):
            tags_file, txt_path, out_text = process_single_image(
                args.api,
                tags_file,
                output_dir,
                args.api_key,
                args.model,
                prompt,
                args.max_retries,
                args.include_thoughts,
            )
            # Write the per-image transcript
            txt_path.write_text(out_text, encoding="utf-8")
            # Also append this transcript to a running `all.txt` in the output directory
            all_path = output_dir / "all.txt"
            with all_path.open("a", encoding="utf-8") as allf:
                allf.write(out_text)
                allf.write("\n\n")
            logger.info(f"Wrote transcript to {txt_path}")
            time.sleep(args.delay)
    else:
        # Parallel processing for multiple workers
        logger.info(f"Using {args.workers} parallel workers")
        try:
            with ProcessPoolExecutor(max_workers=args.workers) as executor:
                # Submit all tasks
                future_to_tags_file = {
                    executor.submit(
                        process_single_image,
                        args.api,
                        tags_file,
                        output_dir,
                        args.api_key,
                        args.model,
                        prompt,
                        args.max_retries,
                        args.include_thoughts,
                    ): tags_file
                    for tags_file in queue
                }
                # Process completed tasks with a progress bar
                with tqdm(total=len(queue), desc="Transcribing images", unit="img") as pbar:
                    try:
                        for future in as_completed(future_to_tags_file):
                            tags_file, txt_path, out_text = future.result()
                            # Write the per-image transcript
                            txt_path.write_text(out_text, encoding="utf-8")
                            # Also append this transcript to a running `all.txt` in the output directory
                            all_path = output_dir / "all.txt"
                            with all_path.open("a", encoding="utf-8") as allf:
                                allf.write(out_text)
                                allf.write("\n\n")
                            logger.info(f"Wrote transcript to {txt_path}")
                            pbar.update(1)
                    except KeyboardInterrupt:
                        logger.info("Interrupt received. Cancelling remaining tasks...")
                        # Cancel all pending futures (already-running tasks cannot be cancelled)
                        for future in future_to_tags_file:
                            future.cancel()
                        # Wait for already running tasks to complete
                        for future in as_completed(future_to_tags_file):
                            try:
                                future.result()
                            except Exception:
                                pass
                        raise
        except KeyboardInterrupt:
            logger.info("Processing interrupted by user. Shutting down gracefully...")
            return 1
    logger.info(f"All done. Outputs in {output_dir}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
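
# Example (hypothetical paths): the same run can also be driven programmatically,
# since main() accepts an argv list - handy for testing:
#   main(["./images", "./captions", "--api", "google-genai",
#         "--api-key", "sk-...", "--model", "gemini-2.5-pro",
#         "--prompt-file", "./prompt.md", "--workers", "4"])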

prompt.md

You are an expert visual description engine.

You are captioning images to train a LoRA that learns both character identity and art style. Focus on complete, precise visual details of subjects, clothing, expressions, composition, and lighting, in present continuous tense, without speculation or extraneous commentary.

You never mention “the image,” “the picture,” or any instructions. You provide richly detailed narration of everything visible.

Rules:

  • Always describe in order of importance for clarity:
      1. Primary subject(s): who/what they are, what they are doing (e.g., "a man in a dark coat is walking briskly down the street").
      2. Subject details: gender (best guess), build, face, expression, hair, clothing (each item), notable features.
      3. Composition and shot type: orientation (landscape, portrait), framing (close-up, mid shot, cowboy shot, wide shot, etc.). Do not be over-precise here or describe ratios, letterboxing, etc., since the LoRA training will perform its own cropping and potentially rotation.
      4. Mood, style, and lighting: tone, atmosphere, color scheme, light quality (e.g., harsh sunlight, soft candlelight).
      5. Interactions and secondary objects: what subjects are using, holding, or engaging with.
      6. Background and environment: scenery, setting, surrounding details (if possible to determine; otherwise just describe colors, sharpness, etc.).
  • For people: always include gender, facial features, expression, hair, build, each item of clothing, and other distinguishing details.
  • For scenery, animals, and objects: describe with precision; never use vague terms like "item" or "thing", and avoid "appears to be", "likely", etc.
  • Use precise, simple language without purple prose: "the cat is orange, with a ..." not "the cat appears to be basking in...".
  • Always describe colors, textures, lighting, mood, and shapes.
  • Use only present continuous tense (e.g., “a tree is swaying gently in the wind in a field of grass,” “a woman in a red beanie is wearing a bright purple flowing dress and is smiling”).
  • Never hedge with uncertainty ("possibly" "maybe" "suggest" "likely"), just describe what you see.
  • Never omit key visible details. Do not refer to a subject with "the" unless you have already established them, e.g. "A cat ... the cat ..." or (if there are multiple) "An orange cat ... the orange cat ..." rather than leading with "The cat is".
  • DO NOT INCLUDE MARKDOWN, HEADINGS, BULLET POINTS, etc. Write only back-to-back sentences.
  • If no image is provided, or you cannot see it, output ERROR: NO IMAGE.

Instructions

Describe this image in natural language, in present continuous tense, with full detail. Include all visible people (gender, face, expression, hair, build, clothing, distinctive features), environment (colors, lighting, mood, textures, shapes), and composition (orientation and shot type). For backgrounds, describe only the major colors, shapes, lighting, and setting cues. Do not invent fine detail if blurred or indistinct.

Avoid vague terms and uncertainty. Provide exact, continuous narration. Use simple precise language.

Write at least 2 sentences and no more than 8, depending on the detail of the image and the number of subjects/actions, ensuring all details are described fully. Eight sentences should be reserved for complex multi-character scenes; on average 3-5 should be sufficient.

An image tagger has provided the following tags. Use them with caution, but in general they should be helpful for identifying characters and elements to write about. Note that WD1.4 is trained on SFW and NSFW anime art; you may need to ignore some of its "hornier" output. Ignore any tags that are not clearly observable in the image - the image tagger can make mistakes. The image tagger is decent at identifying characters; if it does not ID a character, be very careful naming them yourself - better to avoid names if you are not sure.

Image tagger output
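
(At runtime the script appends the contents of the image's .tags file after this line - typically booru-style comma-separated tags, e.g. a hypothetical "1girl, solo, smile, orange hair".)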

pyproject.toml

[project]
name = "crop"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "google-genai>=1.41.0",
    "httpx>=0.28.1",
    "ipdb>=0.13.13",
    "loguru>=0.7.3",
    "tqdm>=4.67.1",
]
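
With this pyproject.toml next to caption.py, `uv run` (as in the command at the top) should resolve and install the dependencies automatically, so no separate install step is needed. Note that ipdb is only a debugging convenience; the script itself does not import it.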