uv run python .\caption.py --api-key ... --model gemini-2.5-pro --api google-genai --prompt-file .\prompt.md --workers 32 . .
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import base64
import json
import logging
import mimetypes
import os
import signal
import sys
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
from pathlib import Path
from typing import Optional, Tuple

import httpx
from google import genai
from google.genai import types
from loguru import logger
from tqdm import tqdm

# Global cost tracking
total_cost = 0.0

# Signal handler for graceful shutdown
def signal_handler(signum, frame):
    logger.info("Received interrupt signal. Shutting down gracefully...")
    sys.exit(0)

# Set up signal handlers
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

def encode_image_to_data_url(path: Path, mime: str | None = None) -> str:
    """Read file and return a data:<mime>;base64,<data> url used by OpenRouter.

    Uses mimetypes to guess the type and falls back to application/octet-stream.
    """
    content = path.read_bytes()
    b64 = base64.b64encode(content).decode("utf-8")
    if not mime:
        mime, _ = mimetypes.guess_type(str(path))
    if not mime:
        mime = "application/octet-stream"
    return f"data:{mime};base64,{b64}"

def build_payload(model: str, prompt: str, image_data_url: str) -> dict:
    # Per OpenRouter multimodal images docs: send text prompt first, then image
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_data_url}},
            ],
        }
    ]
    return {
        "model": model,
        "messages": messages,
        # "stream": False,
        # "temperature": 0.7,
        # "max_tokens": 8000,
        # "top_p": 1,
        # "frequency_penalty": 0,
        # "presence_penalty": 0,
        # "cache_control": {
        #     "enabled": True,
        #     "ttl": "5m"
        # }
    }
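# Sketch of the resulting request body (an illustration, not from the original;
# the model name shown is only an example value). It follows the
# OpenAI-compatible chat-completions shape that OpenRouter-style endpoints expect:
#   {
#     "model": "gemini-2.5-pro",
#     "messages": [{"role": "user", "content": [
#         {"type": "text", "text": "<prompt + tags>"},
#         {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
#     ]}]
#   }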

def call_api(url: str, api_key: str, payload: dict, timeout: int = 300) -> httpx.Response:
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    response = httpx.post(url, headers=headers, json=payload, timeout=timeout)
    return response

def transcribe_image_api(
    url: str,
    api_key: str,
    model: str,
    prompt: str,
    image_path: Path,
    max_retries: int = 5,
    backoff_base: float = 1.5,
    backoff_max: float = 60.0,
):
    # Let encode_image_to_data_url guess the MIME type from the file extension.
    data_url = encode_image_to_data_url(image_path)
    payload = build_payload(model=model, prompt=prompt, image_data_url=data_url)
    attempt = 0
    while True:
        attempt += 1
        try:
            resp = call_api(url, api_key=api_key, payload=payload, timeout=300)
        except httpx.RequestError as exc:
            logger.warning(f"Request failed for {image_path.name} (attempt {attempt}): {exc}")
            if attempt >= max_retries:
                raise
            sleep = min(backoff_max, backoff_base ** attempt)
            time.sleep(sleep)
            continue
        # Retry on 429/5xx
        if resp.status_code in (429, 500, 502, 503, 504):
            logger.warning(f"Transient status {resp.status_code} for {image_path.name} (attempt {attempt})")
            if attempt >= max_retries:
                logger.error(f"Max retries reached for {image_path.name}: {resp.text[:200]}")
                resp.raise_for_status()
            sleep = min(backoff_max, backoff_base ** attempt)
            time.sleep(sleep)
            continue
        # For other non-success codes, raise to let the caller handle or log.
        # httpx.Response does not have an `ok` attribute (unlike requests),
        # so treat any non-2xx status as an error here.
        if resp.status_code < 200 or resp.status_code >= 300:
            logger.error(f"Unexpected HTTP {resp.status_code} for {image_path.name}: {resp.text[:500]}")
            resp.raise_for_status()
        data = resp.json()
        # Log usage and cost information
        try:
            if 'usage' in data:
                usage = data['usage']
                prompt_tokens = usage.get('prompt_tokens', 0)
                completion_tokens = usage.get('completion_tokens', 0)
                total_tokens = usage.get('total_tokens', 0)
                # Check for cached tokens
                prompt_details = usage.get('prompt_tokens_details', {})
                cached_tokens = prompt_details.get('cached_tokens', 0)
                non_cached_tokens = prompt_tokens - cached_tokens
                logger.info(f"Usage for {image_path.name}: {total_tokens} total tokens ({prompt_tokens} prompt + {completion_tokens} completion)")
                logger.info(f"Cache status: {cached_tokens} cached, {non_cached_tokens} non-cached tokens")
                # Log cost if available
                if 'nanoGPT' in data and 'cost' in data['nanoGPT']:
                    cost = data['nanoGPT']['cost']
                    global total_cost
                    total_cost += cost
                    logger.info(f"Cost for {image_path.name}: ${cost:.6f} (Total: ${total_cost:.6f})")
        except (KeyError, TypeError) as e:
            logger.debug(f"No usage information available for {image_path.name}: {e}")
        # Log reasoning context if available
        try:
            if 'choices' in data and len(data['choices']) > 0:
                choice = data['choices'][0]
                if 'message' in choice and 'reasoning_context' in choice['message']:
                    reasoning_context = choice['message']['reasoning_context']
                    logger.info(f"Reasoning context for {image_path.name}: {reasoning_context}")
        except (KeyError, IndexError, TypeError) as e:
            logger.debug(f"No reasoning context available for {image_path.name}: {e}")
        # finish_reason lives on the choice object, not inside the message.
        finish_reason = data['choices'][0].get('finish_reason')
        if finish_reason != 'stop':
            raise Exception(f"Unexpected finish reason {finish_reason} for {image_path.name}: {data['choices'][0]['message']['content'][:500]}")
        content = data['choices'][0]['message']['content']
        if len(content.split(' ')) < 10:
            raise Exception(f"Unexpectedly short response for {image_path.name}: {content[:500]}")
        logger.info(f"Response for {image_path.name}: {content[:500]}")
        return content
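# Worked example of the retry schedule above (descriptive note, not original code):
# with backoff_base=1.5 and backoff_max=60.0, the sleep between attempts grows as
# min(60, 1.5**attempt):
#   attempt 1 -> 1.5s, attempt 2 -> 2.25s, attempt 3 -> ~3.4s,
#   attempt 4 -> ~5.1s, attempt 5 -> ~7.6s,
# so a full run of five attempts waits well under a minute unless the cap is raised.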

def transcribe_image_gemini(
    api_key: str,
    model: str,
    prompt: str,
    image_path: Path,
    max_retries: int = 5,
    backoff_base: float = 1.5,
    backoff_max: float = 60.0,
    include_thoughts: bool = False,
):
    with open(image_path, 'rb') as f:
        image_bytes = f.read()
    client = genai.Client(api_key=api_key)
    # Guess the MIME type from the file extension instead of hard-coding JPEG,
    # since process_single_image feeds in .png files.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    # import ipdb
    # ipdb.set_trace()
    response = client.models.generate_content(
        model=model,
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type=mime_type or 'image/png',
            ),
            prompt
        ],
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                include_thoughts=include_thoughts
            )
        )
    )
    if response.candidates[0].finish_reason != types.FinishReason.STOP:
        raise Exception(f"Unexpected finish reason {response.candidates[0].finish_reason} for {image_path.name}: {response.candidates[0].content.parts[0].text[:500]}")
    text = thoughts = None
    for part in response.candidates[0].content.parts:
        is_thought = getattr(part, 'thought', False)
        if is_thought:
            thoughts = part.text
        else:
            text = part.text
    if not text:
        raise Exception(f"No text found for {image_path.name} in {response.candidates[0].content.parts}")
    if len(text.split(' ')) < 10:
        raise Exception(f"Unexpectedly short response for {image_path.name}: {text[:500]}")
    if thoughts:
        logger.info(f"Thoughts for {image_path.name}: {thoughts}")
    logger.info(f"Response for {image_path.name}: {text}")
    return text
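# Descriptive note (an assumption about the SDK's response shape, not original
# code): when include_thoughts is enabled, candidates[0].content.parts may contain
# both a thought-summary part (part.thought is True) and the final caption text;
# the loop above logs the thought part and keeps the last non-thought part as the
# caption.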

def process_single_image(
    api: str,
    tags_file: Path,
    output_dir: Path,
    api_key: str,
    model: str,
    prompt: str,
    max_retries: int = 5,
    include_thoughts: bool = False,
) -> Tuple[Path, Path, str]:
    """
    Process a single image file and return the result.
    This function is designed to be called by a process pool.

    Returns:
        Tuple of (tags_file, txt_path, out_text)
    """
    base = output_dir / tags_file.with_suffix("").name
    txt_path = base.with_suffix(".txt")
    tags = tags_file.read_text()
    image_file = tags_file.with_suffix(".png")
    if api == 'google-genai':
        result = transcribe_image_gemini(
            api_key=api_key,
            model=model,
            prompt=prompt + tags,
            image_path=image_file,
            max_retries=max_retries,
            include_thoughts=include_thoughts,
        )
    else:
        result = transcribe_image_api(
            url=api,
            api_key=api_key,
            model=model,
            prompt=prompt + tags,
            image_path=image_file,
            max_retries=max_retries,
        )
    return tags_file, txt_path, result
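# Layout sketch (hypothetical file names, matching the logic above): a tags file
# "input_dir/0001.tags" is paired with the image "input_dir/0001.png", and the
# caption is written to "output_dir/0001.txt" (plus appended to
# "output_dir/all.txt" by the caller in main()).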

def check_path(path: str) -> Path:
    path = Path(path)
    if not path.exists():
        raise argparse.ArgumentTypeError(f"File does not exist: {path}")
    return path

def check_dir(path: str) -> Path:
    path = Path(path)
    if not path.exists():
        raise argparse.ArgumentTypeError(f"Directory does not exist: {path}")
    if not path.is_dir():
        raise argparse.ArgumentTypeError(f"Path is not a directory: {path}")
    return path

def main(argv: Optional[list[str]] = None) -> int:
    parser = argparse.ArgumentParser(description="Caption images using the image and a .tags file of image tags.")
    parser.add_argument("input_dir", help="Directory with images to transcribe", type=check_dir)
    parser.add_argument("output_dir", help="Directory to write transcripts and metadata", type=check_dir)
    parser.add_argument("--api", default="https://nano-gpt.com/api/v1/chat/completions")
    parser.add_argument("-k", "--api-key", help="API key (or set API_KEY env var)", default=os.getenv("API_KEY"))
    parser.add_argument("-m", "--model", default=os.getenv("MODEL", "gemini-2.5-flash-image-preview:free"), help="Multimodal model to use")
    parser.add_argument("-p", "--prompt-file", help="File with prompt to use", type=check_path)
    parser.add_argument("-r", "--max-retries", type=int, default=5, help="Maximum retry attempts for transient errors")
    parser.add_argument("-w", "--workers", type=int, default=cpu_count(), help="Number of parallel workers (default: CPU count)")
    parser.add_argument("-d", "--delay", type=int, default=10, help="Delay in seconds between sequential calls (keep under the free-tier rate limit)")
    parser.add_argument(
        "-o", "--overwrite-existing",
        action="store_true",
        help="Re-caption images that already have a .txt transcript in the output directory (default: skip them)",
    )
    parser.add_argument("-T", "--include-thoughts", action="store_true", help="Include thoughts in the response")
    args = parser.parse_args(argv)
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    tags = list(input_dir.glob('*.tags'))
    prompt = args.prompt_file.read_text()
    queue = []
    if args.overwrite_existing:
        queue = tags
    else:
        # Compare against the transcript file name only, so input and output
        # directories can differ.
        queue = [t for t in tags if not (output_dir / t.with_suffix(".txt").name).exists()]
    logger.info(f'Remaining {len(queue)} images to transcribe from {len(tags)} tags')
    # Sanity check when resuming: if nothing was filtered out, either no images
    # have been transcribed yet or the filtering is broken.
    if not args.overwrite_existing and len(queue) == len(tags):
        raise Exception("Either no images have been transcribed yet or filtering is broken")
    logger.info(f"Found {len(queue)} image(s) in {input_dir}")
    if args.workers == 1:
        # Sequential processing for single worker
        logger.info("Using sequential processing (1 worker)")
        for tags_file in tqdm(queue, desc="Transcribing images", unit="img"):
            tags_file, txt_path, out_text = process_single_image(
                args.api,
                tags_file,
                output_dir,
                args.api_key,
                args.model,
                prompt,
                args.max_retries,
                args.include_thoughts,
            )
            # Write per-image transcript and metadata, and append to all.txt
            txt_path.write_text(out_text, encoding="utf-8")
            # Also append this transcript to a running `all.txt` in the output directory
            all_path = output_dir / "all.txt"
            with all_path.open("a", encoding="utf-8") as allf:
                allf.write(out_text)
                allf.write("\n\n")
            logger.info(f"Wrote transcript to {txt_path}")
            time.sleep(args.delay)
    else:
        # Parallel processing for multiple workers
        logger.info(f"Using {args.workers} parallel workers")
        try:
            with ProcessPoolExecutor(max_workers=args.workers) as executor:
                # Submit all tasks
                future_to_tags_file = {
                    executor.submit(
                        process_single_image,
                        args.api,
                        tags_file,
                        output_dir,
                        args.api_key,
                        args.model,
                        prompt,
                        args.max_retries,
                        args.include_thoughts,
                    ): tags_file
                    for tags_file in queue
                }
                # Process completed tasks with progress bar
                with tqdm(total=len(queue), desc="Transcribing images", unit="img") as pbar:
                    try:
                        for future in as_completed(future_to_tags_file):
                            tags_file, txt_path, out_text = future.result()
                            # Write per-image transcript and metadata, and append to all.txt
                            txt_path.write_text(out_text, encoding="utf-8")
                            # Also append this transcript to a running `all.txt` in the output directory
                            all_path = output_dir / "all.txt"
                            with all_path.open("a", encoding="utf-8") as allf:
                                allf.write(out_text)
                                allf.write("\n\n")
                            logger.info(f"Wrote transcript to {txt_path}")
                            pbar.update(1)
                    except KeyboardInterrupt:
                        logger.info("Interrupt received. Cancelling remaining tasks...")
                        # Cancel all pending futures
                        for future in future_to_tags_file:
                            future.cancel()
                        # Wait for already running tasks to complete
                        for future in as_completed(future_to_tags_file):
                            try:
                                future.result()
                            except BaseException:
                                # Swallow errors and cancellations from in-flight tasks during shutdown.
                                pass
                        raise
        except KeyboardInterrupt:
            logger.info("Processing interrupted by user. Shutting down gracefully...")
            return 1
    logger.info(f"All done. Outputs in {output_dir}")
    return 0

if __name__ == "__main__":
    raise SystemExit(main())
You are an expert visual description engine.
You are captioning images to train a LoRA that learns both character identity and art style. Focus on complete, precise visual details of subjects, clothing, expressions, composition, and lighting, in continuous tense, without speculation or extraneous commentary.
You never mention “the image,” “the picture,” or any instructions. You provide richly detailed narration of everything visible.
- Always describe in order of importance for clarity:
  - Primary subject(s): who/what they are, what they are doing (e.g., “a man in a dark coat is walking briskly down the street”).
  - Subject details: gender (best guess), build, face, expression, hair, clothing (each item), notable features.
  - Composition and shot type: orientation (landscape, portrait), framing (close-up, mid shot, cowboy shot, wide shot, etc.). Do not be over-precise with this or describe ratios, letterboxing, etc., since the LoRA training will perform its own cropping and potentially rotation.
  - Mood, style, and lighting: tone, atmosphere, color scheme, light quality (e.g., harsh sunlight, soft candlelight).
  - Interactions and secondary objects: what subjects are using, holding, or engaging with.
  - Background and environment: scenery, setting, surrounding details (if possible to determine, otherwise just describe colors, sharpness, etc.).
- For people: always include gender, facial features, expression, hair, build, each item of clothing, and other distinguishing details.
- For scenery, animals, and objects: describe with precision; never use vague terms like “item” or “thing”, and avoid "appears to be", "likely", etc.
- Use precise, simple language without purple prose: "the cat is orange, with a ..." not "the cat appears to be basking in..."
- Always describe colors, textures, lighting, mood, and shapes.
- Use only present continuous tense (e.g., “a tree is swaying gently in the wind in a field of grass,” “a woman in a red beanie is wearing a bright purple flowing dress and is smiling”).
- Never hedge with uncertainty ("possibly", "maybe", "suggest", "likely"); just describe what you see.
- Never omit key visible details. Do not refer to a subject with "the" unless you have already established them, e.g. "A cat ... the cat ..." or (if there are multiple) "An orange cat ... the orange cat ..." rather than leading with "The cat is".
- DO NOT INCLUDE MARKDOWN, HEADINGS, BULLET POINTS, etc. Write only back-to-back sentences.
- If no image is provided, or you cannot see it, output exactly:
ERROR: NO IMAGE
Describe this image in natural language, present continuous tense with full detail. Include all visible people (gender, face, expression, hair, build, clothing, distinctive features), environment (colors, lighting, mood, textures, shapes), and composition (orientation and shot type). For backgrounds, describe only the major colors, shapes, lighting, and setting cues. Do not invent fine detail if blurred or indistinct.
Avoid vague terms and uncertainty. Provide exact, continuous narration. Use simple precise language.
Write at least 2 sentences and no more than 8, depending on the detail of the image and the number of subjects/actions, ensuring all details are described fully. 8 sentences should only be used for complex multi-character scenes. On average, 3-5 should be sufficient.
An image tagger has provided the following tags - use these with caution, but in general they should be helpful for identifying characters and elements to write about. Note that WD1.4 is trained on SFW and NSFW anime art, so you may need to ignore some of its "hornier" output. Ignore any tags that are not clearly observable in the image - the image tagger can make mistakes. The image tagger is decent at identifying characters - if it does not ID a character, be very careful naming them yourself - better to avoid names if you are not sure.
[project]
name = "crop"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "google-genai>=1.41.0",
    "httpx>=0.28.1",
    "ipdb>=0.13.13",
    "loguru>=0.7.3",
    "tqdm>=4.67.1",
]