Skip to content

Instantly share code, notes, and snippets.

@greg76
Last active April 6, 2026 20:48
Show Gist options
  • Select an option

  • Save greg76/1bed6c15d31dbd7a9ae9484c9ebd594a to your computer and use it in GitHub Desktop.

Select an option

Save greg76/1bed6c15d31dbd7a9ae9484c9ebd594a to your computer and use it in GitHub Desktop.
Quick experiment to use MLX framework for generating cyberpunk audio sleep stories.

Audio sleep stories

  • Llama-3.2-3B-Instruct-4bit is used for generating the story based on the instructions in PROMPT.md
    • Llama 3.2 seems to work best with smaller stories: keeping the stories around 1000 words results in ~4 minutes of audio, at a slightly slower speaking cadence.
  • Kokoro-82M-bf16 is used as the text to speech model (using even the "premium" built-in macOS voices produced weaker results with the say command)
  • Add your own sound file (eg. BACKGROUND_SOUND = "rain.m4a") for background soundscape.
  • There were some weird type annotation bugs in some of the underlying packages, hence the # type: ignore comments for the linter.

System prompt

  • You are a sleep podcasts writer! Your style is "Cyberpunk ASMR" — heavily atmospheric & sensory.
  • Use markdown formatting. Start with the title right ahead.
  • Write a long, immersive cyberpunk sleep story of approximately 1000 words.
  • Break the story into many atmospheric paragraphs. Maintain a steady, rhythmic pace.
  • Crucial: Ensure the story has a clear, atmospheric conclusion and ends with 'The end.'
  • Do not wander; keep the pacing tight so you finish within the word count.

Task

Write a "sleep story" set in a rain-slicked, neon-lit cyberpunk city.

The Vibe

Inspired by Neuromancer, Snow Crash and Ready Player One. Focus on the "low-life, high-tech" aesthetic but through a lens of calm, late-night solitude.

Story constraints

  1. No Conflict: The "story arc" should be a flat plateau of calm activity.
  2. Sensory Details: Focus on the hum of cooling fans, the rhythmic drip & smell of rain, the soft glow of emerald terminal text.
  3. Pacing: Use long, flowing sentences, use hypnotic descriptions. Use commas and ellipses to create a slow, rhythmic pace for the narrator.
  4. The "Decking" Segment: Describe a slow, peaceful transition into a "private server" virtual reality that looks like a calm, digital Zen garden or a low-poly ocean.

Structure

This is just the overall arc for the theme; take some creative liberty.

  • 0-300 words: Setting the scene in a small, cozy apartment or hideout at night. The sound of the city outside.
  • 300-700 words: The process of "booting up" / "jacking in". The tactile feel of the cyberdeck, the soft click of switches & cables, the slow crawl of data on the screen.
  • 700-1000 words: A drift into a virtual void. Ending with pulsing neon lines that slowly fade to black.
import contextlib
import io
import os
import re
import subprocess
import time
from pathlib import Path
from typing import Any
import espeakng_loader
import numpy as np
from mlx_audio.tts.utils import load_model as load_tts
from mlx_lm import generate, load
from mlx_lm.sample_utils import make_logits_processors, make_sampler
from phonemizer.backend.espeak.wrapper import EspeakWrapper
# --- Configuration ---
BACKGROUND_SOUND = "rain.m4a"  # background soundscape file; mixed in only if set
BACKGROUND_VOLUME = 0.1  # relative volume of the background track in the final mix
VOICE = "bm_george"  # Kokoro voice id (alternative: bm_lewis)
OUTPUT_FOLDER = Path("stories")  # where the .md and .m4b outputs are written
# Monkey-patch phonemizer to fix AttributeError in misaki/espeak.py
# (misaki calls EspeakWrapper.set_data_path, which newer phonemizer lacks)
if not hasattr(EspeakWrapper, "set_data_path"):
    setattr(EspeakWrapper, "set_data_path", lambda _: None)
# Set ESPEAK_DATA_PATH so espeak-ng can find its data
os.environ["ESPEAK_DATA_PATH"] = str(Path(espeakng_loader.get_data_path()).parent)
PROMPT = "PROMPT.md"  # markdown file parsed into chat messages by parse_prompt()
# Models (downloaded from the Hugging Face hub on first run)
LLM_PATH = "mlx-community/Llama-3.2-3B-Instruct-4bit"
TTS_PATH = "mlx-community/Kokoro-82M-bf16"
print("--- Loading Engines ---")
# This block swallows all the "Fetching 12 files..." and "Loading weights..." noise
with (
    contextlib.redirect_stdout(io.StringIO()),
    contextlib.redirect_stderr(io.StringIO()),
):
    model, tokenizer, *_ = load(LLM_PATH)
    tokenizer: Any  # silence strict typing on the loader's return value
    tts_model: Any = load_tts(TTS_PATH)  # type: ignore[arg-type]
print("✅ Engines Ready.")
def parse_prompt(file_path: str | Path) -> list[dict[str, str]]:
with open(file_path, "r") as f:
content = f.read()
# Split by headers starting with #
sections = re.split(r"^#\s+", content, flags=re.MULTILINE)
system_content = ""
user_content = ""
for section in sections:
if not section.strip():
continue
lines = section.split("\n")
header = lines[0].lower()
body = "\n".join(lines[1:]).strip()
if "system prompt" in header:
system_content = body
else:
# Re-add the header for user sections to maintain context
user_content += f"# {section.strip()}\n\n"
messages = []
if system_content:
messages.append({"role": "system", "content": system_content})
if user_content:
messages.append({"role": "user", "content": user_content.strip()})
return messages
def _build_ffmpeg_command(title: str, output_path: Path) -> list[str]:
    """Build the ffmpeg argv that encodes raw 24 kHz mono s16le PCM from
    stdin into an AAC audiobook file, optionally mixing in a looped
    background soundscape (controlled by ``BACKGROUND_SOUND``)."""
    # fmt: off
    command = [
        "ffmpeg", "-y",
        # Raw PCM piped from the TTS model: 16-bit little-endian, 24 kHz, mono
        "-f", "s16le", "-ar", "24000", "-ac", "1",
        "-i", "pipe:0",
    ]
    if BACKGROUND_SOUND:
        # Loop the background track forever; amix with duration=first stops
        # the mix when the narration (input 0) ends.
        filter_str = (
            f"[0:a]volume=1.0[v]; "
            f"[1:a]volume={BACKGROUND_VOLUME}[r]; "
            f"[v][r]amix=inputs=2:duration=first:dropout_transition=2"
        )
        command += [
            "-stream_loop", "-1", "-i", BACKGROUND_SOUND,
            "-filter_complex", filter_str,
        ]
    command += [
        "-ac", "1", "-c:a", "aac", "-b:a", "64k",
        "-metadata", f"title={title}",
        "-metadata", "genre=Audiobook",
        "-metadata:s:a:0", "media_type=10",  # tag the stream as an audiobook
        str(output_path),
    ]
    # fmt: on
    return command


def generate_story() -> None:
    """Generate one story with the LLM, narrate it with the TTS model, and
    encode the narration (plus optional background sound) via ffmpeg.

    Writes ``<title>.md`` and ``<title>.m4b`` into ``OUTPUT_FOLDER`` and
    prints throughput telemetry.  Relies on the module-level ``model``,
    ``tokenizer`` and ``tts_model`` loaded at import time.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    # 1. Generate the content
    messages = parse_prompt(PROMPT)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Configure sampler and logits processors
    sampler = make_sampler(temp=0.7, top_p=0.9)
    logits_processors = make_logits_processors(
        repetition_penalty=1.03,
        repetition_context_size=1024,  # Larger context to avoid long-term loops
    )
    print("--- Generating Story (this may take a minute) ---")
    start_text = time.perf_counter()
    story_raw = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=2000,
        sampler=sampler,
        logits_processors=logits_processors,
    )
    end_text = time.perf_counter()

    # 2. Extract the title: first markdown header ("# ...") or bold ("**...") line
    lines = [line.strip() for line in story_raw.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        if m := re.match(r"^(#+\s+|\*{2,})(.+)", line):
            extracted_title = m.group(2).strip()
            story_body = "\n".join(lines[i + 1 :]).strip()
            break
    else:
        extracted_title = "Untitled Cyberpunk Story"
        story_body = "\n".join(lines).strip()

    # 3. Save the raw markdown
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
    safe_base = re.sub(r"[^\w\s-]", "", extracted_title).replace(" ", "_").lower()
    safe_base = safe_base or "untitled"  # title could be all punctuation
    text_filename = safe_base + ".md"
    audio_filename = safe_base + ".m4b"
    (OUTPUT_FOLDER / text_filename).write_text(story_raw, encoding="utf-8")

    # 4. Audio pipeline: stream raw PCM into ffmpeg's stdin
    command = _build_ffmpeg_command(extracted_title, OUTPUT_FOLDER / audio_filename)
    process = subprocess.Popen(
        command, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL
    )
    if process.stdin is None:
        # Defensive: Popen with stdin=PIPE always sets this, but don't rely
        # on an assert (stripped under -O) for the None-narrowing.
        raise RuntimeError("ffmpeg stdin pipe was not created")
    start_audio = time.perf_counter()
    total_samples = 0.0
    print(f"--- Generating Audio: {extracted_title} ---")
    for result in tts_model.generate(story_body, voice=VOICE, lang_code="b", speed=0.8):
        audio_np = np.array(result.audio)
        # Convert float audio in [-1, 1] to 16-bit PCM bytes
        samples = (audio_np * 32767).astype(np.int16).tobytes()
        total_samples += len(samples) / 2  # 2 bytes per int16 sample
        process.stdin.write(samples)
    process.stdin.close()
    # Surface encoder failures instead of silently leaving a broken file.
    if process.wait() != 0:
        raise RuntimeError(f"ffmpeg exited with code {process.returncode}")
    end_audio = time.perf_counter()

    # 5. Telemetry
    text_time = end_text - start_text
    tokens = len(tokenizer.encode(story_raw))
    print("\n" + "=" * 40)
    print(f"📖 STORY: {extracted_title}")
    print("=" * 40)
    print(f"LLM: {tokens / text_time:.2f} tokens/sec")
    print(f"Audio: {(total_samples / 24000) / (end_audio - start_audio):.2f}x realtime")
    print(f"Text: {text_filename}")
    print(f"Audio: {audio_filename}")
    print("=" * 40 + "\n")
if __name__ == "__main__":
    # Fail fast if a background soundscape is configured but the file is
    # missing — ffmpeg would otherwise fail mid-run after story generation.
    if BACKGROUND_SOUND and not os.path.isfile(BACKGROUND_SOUND):
        print(f"{BACKGROUND_SOUND=} but the file doesn't exist.")
        # raise SystemExit instead of exit(): works even without the site module
        raise SystemExit(1)
    generate_story()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment