- Llama-3.2-3B-Instruct-4bit is used for generating the story based on the instructions in PROMPT.md
- Llama 3.2 seems to work best with smaller stories: keeping the stories around 1000 words results in ~4 min of audio, with a slightly slower narration cadence.
- Kokoro-82M-bf16 is used as the text-to-speech model (even the "premium" built-in macOS voices produced weaker results with the `say` command).
- Add your own sound file (e.g. BACKGROUND_SOUND = "rain.m4a") for a background soundscape.
- There were some weird type annotation bugs in some of the underlying packages, hence the
`# type: ignore` comments for the linter.
Last active
April 6, 2026 20:48
-
-
Save greg76/1bed6c15d31dbd7a9ae9484c9ebd594a to your computer and use it in GitHub Desktop.
Quick experiment to use MLX framework for generating cyberpunk audio sleep stories.
- You are a sleep podcasts writer! Your style is "Cyberpunk ASMR" — heavily atmospheric & sensory.
- Use markdown formatting. Start with the title right ahead.
- Write a long, immersive cyberpunk sleep story of approximately 1000 words.
- Break the story into many atmospheric paragraphs. Maintain a steady, rhythmic pace.
- Crucial: Ensure the story has a clear, atmospheric conclusion and ends with 'The end.'
- Do not wander; keep the pacing tight so you finish within the word count.
Write a "sleep story" set in a rain-slicked, neon-lit cyberpunk city.
Inspired by Neuromancer, Snow Crash and Ready Player One. Focus on the "low-life, high-tech" aesthetic, but through a lens of calm, late-night solitude.
- No Conflict: The "story arc" should be a flat plateau of calm activity.
- Sensory Details: Focus on the hum of cooling fans, the rhythmic drip & smell of rain, the soft glow of emerald terminal text.
- Pacing: Use long, flowing sentences, use hypnotic descriptions. Use commas and ellipses to create a slow, rhythmic pace for the narrator.
- The "Decking" Segment: Describe a slow, peaceful transition into a "private server" virtual reality that looks like a calm, digital Zen garden or a low-poly ocean.
This is just the overall arc for the actual theme; take some creative liberty.
- 0-300 words: Setting the scene in a small, cozy apartment or hideout at night. The sound of the city outside.
- 300-700 words: The process of "booting up" / "jacking in". The tactile feel of the cyberdeck, the soft click of switches & cables, the slow crawl of data on the screen.
- 700-1000 words: A drift into a virtual void. Ending with pulsing neon lines that slowly fade to black.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import contextlib | |
| import io | |
| import os | |
| import re | |
| import subprocess | |
| import time | |
| from pathlib import Path | |
| from typing import Any | |
| import espeakng_loader | |
| import numpy as np | |
| from mlx_audio.tts.utils import load_model as load_tts | |
| from mlx_lm import generate, load | |
| from mlx_lm.sample_utils import make_logits_processors, make_sampler | |
| from phonemizer.backend.espeak.wrapper import EspeakWrapper | |
# --- Configuration ---
# Background soundscape mixed under the narration; set to "" to disable mixing.
BACKGROUND_SOUND = "rain.m4a"
# Relative volume of the background track in the final mix (narration stays at 1.0).
BACKGROUND_VOLUME = 0.1
# Kokoro voice id used for narration.
VOICE = "bm_george"  # bm_lewis
# All generated transcripts (.md) and audio files (.m4b) land here.
OUTPUT_FOLDER = Path("stories")
# Monkey-patch phonemizer to fix AttributeError in misaki/espeak.py
if not hasattr(EspeakWrapper, "set_data_path"):
    setattr(EspeakWrapper, "set_data_path", lambda _: None)
# Set ESPEAK_DATA_PATH so espeak-ng can find its data
os.environ["ESPEAK_DATA_PATH"] = str(Path(espeakng_loader.get_data_path()).parent)
# Markdown file holding the system + user prompt sections (see parse_prompt).
PROMPT = "PROMPT.md"
# Models
LLM_PATH = "mlx-community/Llama-3.2-3B-Instruct-4bit"
TTS_PATH = "mlx-community/Kokoro-82M-bf16"
print("--- Loading Engines ---")
# This block swallows all the "Fetching 12 files..." and "Loading weights..." noise
with (
    contextlib.redirect_stdout(io.StringIO()),
    contextlib.redirect_stderr(io.StringIO()),
):
    model, tokenizer, *_ = load(LLM_PATH)
    # Re-annotate as Any: the wrapper types lack some attributes used below.
    tokenizer: Any
    tts_model: Any = load_tts(TTS_PATH)  # type: ignore[arg-type]
print("✅ Engines Ready.")
| def parse_prompt(file_path: str | Path) -> list[dict[str, str]]: | |
| with open(file_path, "r") as f: | |
| content = f.read() | |
| # Split by headers starting with # | |
| sections = re.split(r"^#\s+", content, flags=re.MULTILINE) | |
| system_content = "" | |
| user_content = "" | |
| for section in sections: | |
| if not section.strip(): | |
| continue | |
| lines = section.split("\n") | |
| header = lines[0].lower() | |
| body = "\n".join(lines[1:]).strip() | |
| if "system prompt" in header: | |
| system_content = body | |
| else: | |
| # Re-add the header for user sections to maintain context | |
| user_content += f"# {section.strip()}\n\n" | |
| messages = [] | |
| if system_content: | |
| messages.append({"role": "system", "content": system_content}) | |
| if user_content: | |
| messages.append({"role": "user", "content": user_content.strip()}) | |
| return messages | |
def _extract_title(story_raw: str) -> tuple[str, str]:
    """Split the raw LLM output into ``(title, body)``.

    The first markdown heading (``# ...``) or bold line (``**...``) becomes
    the title; everything after it is the body. Falls back to a default
    title (keeping the full text as body) when no heading is found.
    """
    lines = [line.strip() for line in story_raw.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        if m := re.match(r"^(#+\s+|\*{2,})(.+)", line):
            return m.group(2).strip(), "\n".join(lines[i + 1 :]).strip()
    return "Untitled Cyberpunk Story", "\n".join(lines).strip()


def _build_ffmpeg_command(title: str, output_path: Path) -> list[str | Path]:
    """Build the ffmpeg command that encodes raw PCM from stdin to .m4b.

    Input 0 is 24 kHz mono s16le PCM piped on stdin. When BACKGROUND_SOUND
    is set, it is looped as input 1 and mixed under the narration at
    BACKGROUND_VOLUME; the mix stops when the narration ends.
    """
    # fmt: off
    command: list[str | Path] = [
        "ffmpeg", "-y",
        # Raw narration PCM from the TTS model, piped on stdin.
        "-f", "s16le", "-ar", "24000", "-ac", "1",
        "-i", "pipe:0",
    ]
    if BACKGROUND_SOUND:
        filter_str = (
            f"[0:a]volume=1.0[v]; "
            f"[1:a]volume={BACKGROUND_VOLUME}[r]; "
            # duration=first: the mix ends when the narration (input 0) ends.
            f"[v][r]amix=inputs=2:duration=first:dropout_transition=2"
        )
        command += [
            "-stream_loop", "-1", "-i", BACKGROUND_SOUND,
            "-filter_complex", filter_str,
            "-ac", "1",
        ]
    command += [
        "-c:a", "aac", "-b:a", "64k",
        "-metadata", f"title={title}",
        "-metadata", "genre=Audiobook",
        # media_type=10 tags the stream as an audiobook for Apple players.
        "-metadata:s:a:0", "media_type=10",
        output_path,
    ]
    # fmt: on
    return command


def generate_story() -> None:
    """Generate a story with the LLM, narrate it with the TTS model, and
    encode the narration (plus optional background soundscape) to .m4b.

    Uses the module-level ``model``, ``tokenizer`` and ``tts_model`` engines;
    writes a markdown transcript and the audio file into OUTPUT_FOLDER, then
    prints generation telemetry.
    """
    # 1. Generate the content.
    messages = parse_prompt(PROMPT)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Mild repetition penalty over a long window: discourages the model from
    # looping on atmospheric phrases without flattening its style.
    sampler = make_sampler(temp=0.7, top_p=0.9)
    logits_processors = make_logits_processors(
        repetition_penalty=1.03,
        repetition_context_size=1024,  # Larger context to avoid long-term loops
    )
    print("--- Generating Story (this may take a minute) ---")
    start_text = time.perf_counter()
    story_raw = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=2000,
        sampler=sampler,
        logits_processors=logits_processors,
    )
    end_text = time.perf_counter()

    # 2. Extract title and derive filenames safe for the filesystem.
    extracted_title, story_body = _extract_title(story_raw)
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
    safe_base = re.sub(r"[^\w\s-]", "", extracted_title).replace(" ", "_").lower()
    # Guard against a title with no filename-safe characters at all,
    # which would otherwise yield bare ".md" / ".m4b" filenames.
    safe_base = safe_base or "story"
    text_filename = safe_base + ".md"
    audio_filename = safe_base + ".m4b"
    # Explicit UTF-8: the story may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(OUTPUT_FOLDER / text_filename, "w", encoding="utf-8") as f:
        f.write(story_raw)

    # 3. Audio pipeline: stream TTS chunks straight into ffmpeg's stdin.
    command = _build_ffmpeg_command(extracted_title, OUTPUT_FOLDER / audio_filename)
    process = subprocess.Popen(
        command, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL
    )
    assert process.stdin is not None
    start_audio = time.perf_counter()
    total_samples = 0  # count of s16 samples written (for realtime telemetry)
    print(f"--- Generating Audio: {extracted_title} ---")
    for result in tts_model.generate(story_body, voice=VOICE, lang_code="b", speed=0.8):
        audio_np = np.array(result.audio)
        # Convert the float waveform in [-1, 1] to 16-bit signed PCM bytes.
        samples = (audio_np * 32767).astype(np.int16).tobytes()
        total_samples += len(samples) // 2  # 2 bytes per s16 sample
        process.stdin.write(samples)
    process.stdin.close()
    process.wait()
    end_audio = time.perf_counter()

    # 4. Telemetry.
    text_time = end_text - start_text
    tokens = len(tokenizer.encode(story_raw))
    print("\n" + "=" * 40)
    print(f"📖 STORY: {extracted_title}")
    print("=" * 40)
    print(f"LLM: {tokens / text_time:.2f} tokens/sec")
    print(f"Audio: {(total_samples / 24000) / (end_audio - start_audio):.2f}x realtime")
    print(f"Text: {text_filename}")
    print(f"Audio: {audio_filename}")
    print("=" * 40 + "\n")
if __name__ == "__main__":
    # Validate the configured background sound path before the long
    # generation run, so a typo fails fast with a clear message.
    if BACKGROUND_SOUND and not os.path.isfile(BACKGROUND_SOUND):
        print(f"{BACKGROUND_SOUND=} but the file doesn't exist.")
        raise SystemExit(1)
    generate_story()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment