uv run python .\caption.py --api-key ... --model gemini-2.5-pro --api google-genai --prompt-file .\prompt.md --workers 32 . .
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import base64
import json
import logging
import mimetypes
import os
import signal
import sys
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
from pathlib import Path
from typing import Optional, Tuple

import httpx
from google import genai
from google.genai import types
from loguru import logger
from tqdm import tqdm

# Global cost tracking
total_cost = 0.0

# Signal handler for graceful shutdown
def signal_handler(signum, frame):
    logger.info("Received interrupt signal. Shutting down gracefully...")
    sys.exit(0)

# Set up signal handlers
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

def encode_image_to_data_url(path: Path, mime: str | None = None) -> str:
    """Read file and return a data:<mime>;base64,<data> url used by OpenRouter.

    Uses mimetypes to guess the type and falls back to application/octet-stream.
    """
    content = path.read_bytes()
    b64 = base64.b64encode(content).decode("utf-8")
    if not mime:
        mime, _ = mimetypes.guess_type(str(path))
    if not mime:
        mime = "application/octet-stream"
    return f"data:{mime};base64,{b64}"

def build_payload(model: str, prompt: str, image_data_url: str) -> dict:
    # Per OpenRouter multimodal images docs: send text prompt first, then image
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_data_url}},
            ],
        }
    ]
    return {
        "model": model,
        "messages": messages,
        # "stream": False,
        # "temperature": 0.7,
        # "max_tokens": 8000,
        # "top_p": 1,
        # "frequency_penalty": 0,
        # "presence_penalty": 0,
        # "cache_control": {
        #     "enabled": True,
        #     "ttl": "5m"
        # }
    }
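# Sketch of the resulting request body (an illustration, not from the original;
# the model name shown is only an example value). It follows the
# OpenAI-compatible chat-completions shape that OpenRouter-style endpoints expect:
#   {
#     "model": "gemini-2.5-pro",
#     "messages": [{"role": "user", "content": [
#         {"type": "text", "text": "<prompt + tags>"},
#         {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
#     ]}]
#   }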

def call_api(url: str, api_key: str, payload: dict, timeout: int = 300) -> httpx.Response:
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    response = httpx.post(url, headers=headers, json=payload, timeout=timeout)
    return response

def transcribe_image_api(
    url: str,
    api_key: str,
    model: str,
    prompt: str,
    image_path: Path,
    max_retries: int = 5,
    backoff_base: float = 1.5,
    backoff_max: float = 60.0,
):
    # Let encode_image_to_data_url guess the MIME type from the file extension.
    data_url = encode_image_to_data_url(image_path)
    payload = build_payload(model=model, prompt=prompt, image_data_url=data_url)
    attempt = 0
    while True:
        attempt += 1
        try:
            resp = call_api(url, api_key=api_key, payload=payload, timeout=300)
        except httpx.RequestError as exc:
            logger.warning(f"Request failed for {image_path.name} (attempt {attempt}): {exc}")
            if attempt >= max_retries:
                raise
            sleep = min(backoff_max, backoff_base ** attempt)
            time.sleep(sleep)
            continue
        # Retry on 429/5xx
        if resp.status_code in (429, 500, 502, 503, 504):
            logger.warning(f"Transient status {resp.status_code} for {image_path.name} (attempt {attempt})")
            if attempt >= max_retries:
                logger.error(f"Max retries reached for {image_path.name}: {resp.text[:200]}")
                resp.raise_for_status()
            sleep = min(backoff_max, backoff_base ** attempt)
            time.sleep(sleep)
            continue
        # For other non-success codes, raise to let the caller handle or log.
        # httpx.Response does not have an `ok` attribute (unlike requests),
        # so treat any non-2xx status as an error here.
        if resp.status_code < 200 or resp.status_code >= 300:
            logger.error(f"Unexpected HTTP {resp.status_code} for {image_path.name}: {resp.text[:500]}")
            resp.raise_for_status()
        data = resp.json()
        # Log usage and cost information
        try:
            if 'usage' in data:
                usage = data['usage']
                prompt_tokens = usage.get('prompt_tokens', 0)
                completion_tokens = usage.get('completion_tokens', 0)
                total_tokens = usage.get('total_tokens', 0)
                # Check for cached tokens
                prompt_details = usage.get('prompt_tokens_details', {})
                cached_tokens = prompt_details.get('cached_tokens', 0)
                non_cached_tokens = prompt_tokens - cached_tokens
                logger.info(f"Usage for {image_path.name}: {total_tokens} total tokens ({prompt_tokens} prompt + {completion_tokens} completion)")
                logger.info(f"Cache status: {cached_tokens} cached, {non_cached_tokens} non-cached tokens")
                # Log cost if available
                if 'nanoGPT' in data and 'cost' in data['nanoGPT']:
                    cost = data['nanoGPT']['cost']
                    global total_cost
                    total_cost += cost
                    logger.info(f"Cost for {image_path.name}: ${cost:.6f} (Total: ${total_cost:.6f})")
        except (KeyError, TypeError) as e:
            logger.debug(f"No usage information available for {image_path.name}: {e}")
        # Log reasoning context if available
        try:
            if 'choices' in data and len(data['choices']) > 0:
                choice = data['choices'][0]
                if 'message' in choice and 'reasoning_context' in choice['message']:
                    reasoning_context = choice['message']['reasoning_context']
                    logger.info(f"Reasoning context for {image_path.name}: {reasoning_context}")
        except (KeyError, IndexError, TypeError) as e:
            logger.debug(f"No reasoning context available for {image_path.name}: {e}")
        # finish_reason lives on the choice object, not inside the message.
        finish_reason = data['choices'][0].get('finish_reason')
        if finish_reason != 'stop':
            raise Exception(f"Unexpected finish reason {finish_reason} for {image_path.name}: {data['choices'][0]['message']['content'][:500]}")
        content = data['choices'][0]['message']['content']
        if len(content.split(' ')) < 10:
            raise Exception(f"Unexpectedly short response for {image_path.name}: {content[:500]}")
        logger.info(f"Response for {image_path.name}: {content[:500]}")
        return content
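# Worked example of the retry schedule above (descriptive note, not original code):
# with backoff_base=1.5 and backoff_max=60.0, the sleep between attempts grows as
# min(60, 1.5**attempt):
#   attempt 1 -> 1.5s, attempt 2 -> 2.25s, attempt 3 -> ~3.4s,
#   attempt 4 -> ~5.1s, attempt 5 -> ~7.6s,
# so a full run of five attempts waits well under a minute unless the cap is raised.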

def transcribe_image_gemini(
    api_key: str,
    model: str,
    prompt: str,
    image_path: Path,
    max_retries: int = 5,
    backoff_base: float = 1.5,
    backoff_max: float = 60.0,
    include_thoughts: bool = False,
):
    with open(image_path, 'rb') as f:
        image_bytes = f.read()
    client = genai.Client(api_key=api_key)
    # Guess the MIME type from the file extension instead of hard-coding JPEG,
    # since process_single_image feeds in .png files.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    # import ipdb
    # ipdb.set_trace()
    response = client.models.generate_content(
        model=model,
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type=mime_type or 'image/png',
            ),
            prompt
        ],
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                include_thoughts=include_thoughts
            )
        )
    )
    if response.candidates[0].finish_reason != types.FinishReason.STOP:
        raise Exception(f"Unexpected finish reason {response.candidates[0].finish_reason} for {image_path.name}: {response.candidates[0].content.parts[0].text[:500]}")
    text = thoughts = None
    for part in response.candidates[0].content.parts:
        is_thought = getattr(part, 'thought', False)
        if is_thought:
            thoughts = part.text
        else:
            text = part.text
    if not text:
        raise Exception(f"No text found for {image_path.name} in {response.candidates[0].content.parts}")
    if len(text.split(' ')) < 10:
        raise Exception(f"Unexpectedly short response for {image_path.name}: {text[:500]}")
    if thoughts:
        logger.info(f"Thoughts for {image_path.name}: {thoughts}")
    logger.info(f"Response for {image_path.name}: {text}")
    return text
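# Descriptive note (an assumption about the SDK's response shape, not original
# code): when include_thoughts is enabled, candidates[0].content.parts may contain
# both a thought-summary part (part.thought is True) and the final caption text;
# the loop above logs the thought part and keeps the last non-thought part as the
# caption.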

def process_single_image(
    api: str,
    tags_file: Path,
    output_dir: Path,
    api_key: str,
    model: str,
    prompt: str,
    max_retries: int = 5,
    include_thoughts: bool = False,
) -> Tuple[Path, Path, str]:
    """
    Process a single image file and return the result.
    This function is designed to be called by a process pool.

    Returns:
        Tuple of (tags_file, txt_path, out_text)
    """
    base = output_dir / tags_file.with_suffix("").name
    txt_path = base.with_suffix(".txt")
    tags = tags_file.read_text()
    image_file = tags_file.with_suffix(".png")
    if api == 'google-genai':
        result = transcribe_image_gemini(
            api_key=api_key,
            model=model,
            prompt=prompt + tags,
            image_path=image_file,
            max_retries=max_retries,
            include_thoughts=include_thoughts,
        )
    else:
        result = transcribe_image_api(
            url=api,
            api_key=api_key,
            model=model,
            prompt=prompt + tags,
            image_path=image_file,
            max_retries=max_retries,
        )
    return tags_file, txt_path, result
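# Layout sketch (hypothetical file names, matching the logic above): a tags file
# "input_dir/0001.tags" is paired with the image "input_dir/0001.png", and the
# caption is written to "output_dir/0001.txt" (plus appended to
# "output_dir/all.txt" by the caller in main()).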

def check_path(path: str) -> Path:
    path = Path(path)
    if not path.exists():
        raise argparse.ArgumentTypeError(f"File does not exist: {path}")
    return path

def check_dir(path: str) -> Path:
    path = Path(path)
    if not path.exists():
        raise argparse.ArgumentTypeError(f"Directory does not exist: {path}")
    if not path.is_dir():
        raise argparse.ArgumentTypeError(f"Path is not a directory: {path}")
    return path

def main(argv: Optional[list[str]] = None) -> int:
    parser = argparse.ArgumentParser(description="Caption images using the image and a .tags file of image tags.")
    parser.add_argument("input_dir", help="Directory with images to transcribe", type=check_dir)
    parser.add_argument("output_dir", help="Directory to write transcripts and metadata", type=check_dir)
    parser.add_argument("--api", default="https://nano-gpt.com/api/v1/chat/completions")
    parser.add_argument("-k", "--api-key", help="API key (or set API_KEY env var)", default=os.getenv("API_KEY"))
    parser.add_argument("-m", "--model", default=os.getenv("MODEL", "gemini-2.5-flash-image-preview:free"), help="Multimodal model to use")
    parser.add_argument("-p", "--prompt-file", help="File with prompt to use", type=check_path)
    parser.add_argument("-r", "--max-retries", type=int, default=5, help="Maximum retry attempts for transient errors")
    parser.add_argument("-w", "--workers", type=int, default=cpu_count(), help="Number of parallel workers (default: CPU count)")
    parser.add_argument("-d", "--delay", type=int, default=10, help="Delay in seconds between sequential calls (keep under the free-tier rate limit)")
    parser.add_argument(
        "-o", "--overwrite-existing",
        action="store_true",
        help="Re-caption images that already have a .txt transcript in the output directory (default: skip them)",
    )
    parser.add_argument("-T", "--include-thoughts", action="store_true", help="Include thoughts in the response")
    args = parser.parse_args(argv)
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    tags = list(input_dir.glob('*.tags'))
    prompt = args.prompt_file.read_text()
    queue = []
    if args.overwrite_existing:
        queue = tags
    else:
        # Compare against the transcript file name only, so input and output
        # directories can differ.
        queue = [t for t in tags if not (output_dir / t.with_suffix(".txt").name).exists()]
    logger.info(f'Remaining {len(queue)} images to transcribe from {len(tags)} tags')
    # Sanity check when resuming: if nothing was filtered out, either no images
    # have been transcribed yet or the filtering is broken.
    if not args.overwrite_existing and len(queue) == len(tags):
        raise Exception("Either no images have been transcribed yet or filtering is broken")
    logger.info(f"Found {len(queue)} image(s) in {input_dir}")
    if args.workers == 1:
        # Sequential processing for single worker
        logger.info("Using sequential processing (1 worker)")
        for tags_file in tqdm(queue, desc="Transcribing images", unit="img"):
            tags_file, txt_path, out_text = process_single_image(
                args.api,
                tags_file,
                output_dir,
                args.api_key,
                args.model,
                prompt,
                args.max_retries,
                args.include_thoughts,
            )
            # Write per-image transcript and metadata, and append to all.txt
            txt_path.write_text(out_text, encoding="utf-8")
            # Also append this transcript to a running `all.txt` in the output directory
            all_path = output_dir / "all.txt"
            with all_path.open("a", encoding="utf-8") as allf:
                allf.write(out_text)
                allf.write("\n\n")
            logger.info(f"Wrote transcript to {txt_path}")
            time.sleep(args.delay)
    else:
        # Parallel processing for multiple workers
        logger.info(f"Using {args.workers} parallel workers")
        try:
            with ProcessPoolExecutor(max_workers=args.workers) as executor:
                # Submit all tasks
                future_to_tags_file = {
                    executor.submit(
                        process_single_image,
                        args.api,
                        tags_file,
                        output_dir,
                        args.api_key,
                        args.model,
                        prompt,
                        args.max_retries,
                        args.include_thoughts,
                    ): tags_file
                    for tags_file in queue
                }
                # Process completed tasks with progress bar
                with tqdm(total=len(queue), desc="Transcribing images", unit="img") as pbar:
                    try:
                        for future in as_completed(future_to_tags_file):
                            tags_file, txt_path, out_text = future.result()
                            # Write per-image transcript and metadata, and append to all.txt
                            txt_path.write_text(out_text, encoding="utf-8")
                            # Also append this transcript to a running `all.txt` in the output directory
                            all_path = output_dir / "all.txt"
                            with all_path.open("a", encoding="utf-8") as allf:
                                allf.write(out_text)
                                allf.write("\n\n")
                            logger.info(f"Wrote transcript to {txt_path}")
                            pbar.update(1)
                    except KeyboardInterrupt:
                        logger.info("Interrupt received. Cancelling remaining tasks...")
                        # Cancel all pending futures
                        for future in future_to_tags_file:
                            future.cancel()
                        # Wait for already running tasks to complete
                        for future in as_completed(future_to_tags_file):
                            try:
                                future.result()
                            except BaseException:
                                # Swallow errors and cancellations from in-flight tasks during shutdown.
                                pass
                        raise
        except KeyboardInterrupt:
            logger.info("Processing interrupted by user. Shutting down gracefully...")
            return 1
    logger.info(f"All done. Outputs in {output_dir}")
    return 0

if __name__ == "__main__":
    raise SystemExit(main())
You are an expert visual description engine.
You are captioning images to train a LoRA that learns both character identity and art style. Focus on complete, precise visual details of subjects, clothing, expressions, composition, and lighting, in continuous tense, without speculation or extraneous commentary.
You never mention “the image,” “the picture,” or any instructions. You provide richly detailed narration of everything visible.
- Always describe in order of importance for clarity:
  - Primary subject(s): who/what they are, what they are doing (e.g., “a man in a dark coat is walking briskly down the street”).
  - Subject details: gender (best guess), build, face, expression, hair, clothing (each item), notable features.
  - Composition and shot type: orientation (landscape, portrait), framing (close-up, mid shot, cowboy shot, wide shot, etc.). Do not be over-precise with this or describe ratios, letterboxing, etc., since the LoRA training will perform its own cropping and potentially rotation.
  - Mood, style, and lighting: tone, atmosphere, color scheme, light quality (e.g., harsh sunlight, soft candlelight).
  - Interactions and secondary objects: what subjects are using, holding, or engaging with.
  - Background and environment: scenery, setting, surrounding details (if possible to determine, otherwise just describe colors, sharpness, etc.).
- For people: always include gender, facial features, expression, hair, build, each item of clothing, and other distinguishing details.
- For scenery, animals, and objects: describe with precision; never use vague terms like “item” or “thing”, and avoid "appears to be", "likely", etc.
- Use precise, simple language without purple prose: "the cat is orange, with a ..." not "the cat appears to be basking in..."
- Always describe colors, textures, lighting, mood, and shapes.
- Use only present continuous tense (e.g., “a tree is swaying gently in the wind in a field of grass,” “a woman in a red beanie is wearing a bright purple flowing dress and is smiling”).
- Never hedge with uncertainty ("possibly", "maybe", "suggest", "likely"); just describe what you see.
- Never omit key visible details. Do not refer to a subject with "the" unless you have already established them, e.g. "A cat ... the cat ..." or (if there are multiple) "An orange cat ... the orange cat ..." rather than leading with "The cat is".
- DO NOT INCLUDE MARKDOWN, HEADINGS, BULLET POINTS, etc. Write only back-to-back sentences.
- If no image is provided, or you cannot see it, output exactly:
ERROR: NO IMAGE
Describe this image in natural language, present continuous tense with full detail. Include all visible people (gender, face, expression, hair, build, clothing, distinctive features), environment (colors, lighting, mood, textures, shapes), and composition (orientation and shot type). For backgrounds, describe only the major colors, shapes, lighting, and setting cues. Do not invent fine detail if blurred or indistinct.
Avoid vague terms and uncertainty. Provide exact, continuous narration. Use simple precise language.
Write at least 2 sentences and no more than 8, depending on the detail of the image and the number of subjects/actions, ensuring all details are described fully. 8 sentences should only be used for complex multi-character scenes. On average, 3-5 should be sufficient.
An image tagger has provided the following tags - use these with caution, but in general they should be helpful for identifying characters and elements to write about. Note that WD1.4 is trained on SFW and NSFW anime art, so you may need to ignore some of its "hornier" output. Ignore any tags that are not clearly observable in the image - the image tagger can make mistakes. The image tagger is decent at identifying characters - if it does not ID a character, be very careful naming them yourself - better to avoid names if you are not sure.
[project]
name = "crop"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "google-genai>=1.41.0",
    "httpx>=0.28.1",
    "ipdb>=0.13.13",
    "loguru>=0.7.3",
    "tqdm>=4.67.1",
]