- Llama-3.2-3B-Instruct-4bit is used for generating the story based on the instructions in PROMPT.md
- Llama 3.2 seems to work best with smaller stories: keeping the stories around 1000 words results in ~4 min of audio, with a slightly slower narration cadence.
- Kokoro-82M-bf16 is used as the text-to-speech model (even the "premium" built-in macOS voices produced weaker results with the `say` command).
- Add your own sound file (e.g. BACKGROUND_SOUND = "rain.m4a") for a background soundscape.
- There were some weird type annotation bugs in some of the underlying packages, hence the
`# type: ignore` comments for the linter.
Last active
April 6, 2026 20:48
-
-
Save greg76/1bed6c15d31dbd7a9ae9484c9ebd594a to your computer and use it in GitHub Desktop.
Quick experiment to use MLX framework for generating cyberpunk audio sleep stories.
- You are a sleep podcasts writer! Your style is "Cyberpunk ASMR" — heavily atmospheric & sensory.
- Use markdown formatting. Start with the title right ahead.
- Write a long, immersive cyberpunk sleep story of approximately 1000 words.
- Break the story into many atmospheric paragraphs. Maintain a steady, rhythmic pace.
- Crucial: Ensure the story has a clear, atmospheric conclusion and ends with 'The end.'
- Do not wander; keep the pacing tight so you finish within the word count.
Write a "sleep story" set in a rain-slicked, neon-lit cyberpunk city.
Inspired by Neuromancer, Snow Crash and Ready Player One. Focus on the "low-life, high-tech" aesthetic, but through a lens of calm, late-night solitude.
- No Conflict: The "story arc" should be a flat plateau of calm activity.
- Sensory Details: Focus on the hum of cooling fans, the rhythmic drip & smell of rain, the soft glow of emerald terminal text.
- Pacing: Use long, flowing sentences, use hypnotic descriptions. Use commas and ellipses to create a slow, rhythmic pace for the narrator.
- The "Decking" Segment: Describe a slow, peaceful transition into a "private server" virtual reality that looks like a calm, digital Zen garden or a low-poly ocean.
This is just the overall arc for the actual theme; take some creative liberty.
- 0-300 words: Setting the scene in a small, cozy apartment or hideout at night. The sound of the city outside.
- 300-700 words: The process of "booting up" / "jacking in". The tactile feel of the cyberdeck, the soft click of switches & cables, the slow crawl of data on the screen.
- 700-1000 words: A drift into a virtual void. Ending with pulsing neon lines that slowly fade to black.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import contextlib | |
| import io | |
| import os | |
| import re | |
| import subprocess | |
| import time | |
| from pathlib import Path | |
| from typing import Any | |
| import espeakng_loader | |
| import numpy as np | |
| from mlx_audio.tts.utils import load_model as load_tts | |
| from mlx_lm import generate, load | |
| from mlx_lm.sample_utils import make_logits_processors, make_sampler | |
| from phonemizer.backend.espeak.wrapper import EspeakWrapper | |
# --- Configuration ---
# Background soundscape mixed under the narration; set to "" to disable mixing.
BACKGROUND_SOUND = "rain.m4a"
# Relative volume of the background track in the final mix (narration stays at 1.0).
BACKGROUND_VOLUME = 0.1
# Kokoro voice id used for narration.
VOICE = "bm_george"  # bm_lewis
# All generated transcripts (.md) and audio files (.m4b) land here.
OUTPUT_FOLDER = Path("stories")
# Monkey-patch phonemizer to fix AttributeError in misaki/espeak.py
if not hasattr(EspeakWrapper, "set_data_path"):
    setattr(EspeakWrapper, "set_data_path", lambda _: None)
# Set ESPEAK_DATA_PATH so espeak-ng can find its data
os.environ["ESPEAK_DATA_PATH"] = str(Path(espeakng_loader.get_data_path()).parent)
# Markdown file holding the system + user prompt sections (see parse_prompt).
PROMPT = "PROMPT.md"
# Models
LLM_PATH = "mlx-community/Llama-3.2-3B-Instruct-4bit"
TTS_PATH = "mlx-community/Kokoro-82M-bf16"
print("--- Loading Engines ---")
# This block swallows all the "Fetching 12 files..." and "Loading weights..." noise
with (
    contextlib.redirect_stdout(io.StringIO()),
    contextlib.redirect_stderr(io.StringIO()),
):
    model, tokenizer, *_ = load(LLM_PATH)
    # Re-annotate as Any: the wrapper types lack some attributes used below.
    tokenizer: Any
    tts_model: Any = load_tts(TTS_PATH)  # type: ignore[arg-type]
print("✅ Engines Ready.")
| def parse_prompt(file_path: str | Path) -> list[dict[str, str]]: | |
| with open(file_path, "r") as f: | |
| content = f.read() | |
| # Split by headers starting with # | |
| sections = re.split(r"^#\s+", content, flags=re.MULTILINE) | |
| system_content = "" | |
| user_content = "" | |
| for section in sections: | |
| if not section.strip(): | |
| continue | |
| lines = section.split("\n") | |
| header = lines[0].lower() | |
| body = "\n".join(lines[1:]).strip() | |
| if "system prompt" in header: | |
| system_content = body | |
| else: | |
| # Re-add the header for user sections to maintain context | |
| user_content += f"# {section.strip()}\n\n" | |
| messages = [] | |
| if system_content: | |
| messages.append({"role": "system", "content": system_content}) | |
| if user_content: | |
| messages.append({"role": "user", "content": user_content.strip()}) | |
| return messages | |
def _extract_title(story_raw: str) -> tuple[str, str]:
    """Split the raw LLM output into ``(title, body)``.

    The first markdown heading (``# ...``) or bold line (``**...``) becomes
    the title; everything after it is the body. Falls back to a default
    title (keeping the full text as body) when no heading is found.
    """
    lines = [line.strip() for line in story_raw.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        if m := re.match(r"^(#+\s+|\*{2,})(.+)", line):
            return m.group(2).strip(), "\n".join(lines[i + 1 :]).strip()
    return "Untitled Cyberpunk Story", "\n".join(lines).strip()


def _build_ffmpeg_command(title: str, output_path: Path) -> list[str | Path]:
    """Build the ffmpeg command that encodes raw PCM from stdin to .m4b.

    Input 0 is 24 kHz mono s16le PCM piped on stdin. When BACKGROUND_SOUND
    is set, it is looped as input 1 and mixed under the narration at
    BACKGROUND_VOLUME; the mix stops when the narration ends.
    """
    # fmt: off
    command: list[str | Path] = [
        "ffmpeg", "-y",
        # Raw narration PCM from the TTS model, piped on stdin.
        "-f", "s16le", "-ar", "24000", "-ac", "1",
        "-i", "pipe:0",
    ]
    if BACKGROUND_SOUND:
        filter_str = (
            f"[0:a]volume=1.0[v]; "
            f"[1:a]volume={BACKGROUND_VOLUME}[r]; "
            # duration=first: the mix ends when the narration (input 0) ends.
            f"[v][r]amix=inputs=2:duration=first:dropout_transition=2"
        )
        command += [
            "-stream_loop", "-1", "-i", BACKGROUND_SOUND,
            "-filter_complex", filter_str,
            "-ac", "1",
        ]
    command += [
        "-c:a", "aac", "-b:a", "64k",
        "-metadata", f"title={title}",
        "-metadata", "genre=Audiobook",
        # media_type=10 tags the stream as an audiobook for Apple players.
        "-metadata:s:a:0", "media_type=10",
        output_path,
    ]
    # fmt: on
    return command


def generate_story() -> None:
    """Generate a story with the LLM, narrate it with the TTS model, and
    encode the narration (plus optional background soundscape) to .m4b.

    Uses the module-level ``model``, ``tokenizer`` and ``tts_model`` engines;
    writes a markdown transcript and the audio file into OUTPUT_FOLDER, then
    prints generation telemetry.
    """
    # 1. Generate the content.
    messages = parse_prompt(PROMPT)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Mild repetition penalty over a long window: discourages the model from
    # looping on atmospheric phrases without flattening its style.
    sampler = make_sampler(temp=0.7, top_p=0.9)
    logits_processors = make_logits_processors(
        repetition_penalty=1.03,
        repetition_context_size=1024,  # Larger context to avoid long-term loops
    )
    print("--- Generating Story (this may take a minute) ---")
    start_text = time.perf_counter()
    story_raw = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=2000,
        sampler=sampler,
        logits_processors=logits_processors,
    )
    end_text = time.perf_counter()

    # 2. Extract title and derive filenames safe for the filesystem.
    extracted_title, story_body = _extract_title(story_raw)
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
    safe_base = re.sub(r"[^\w\s-]", "", extracted_title).replace(" ", "_").lower()
    # Guard against a title with no filename-safe characters at all,
    # which would otherwise yield bare ".md" / ".m4b" filenames.
    safe_base = safe_base or "story"
    text_filename = safe_base + ".md"
    audio_filename = safe_base + ".m4b"
    # Explicit UTF-8: the story may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(OUTPUT_FOLDER / text_filename, "w", encoding="utf-8") as f:
        f.write(story_raw)

    # 3. Audio pipeline: stream TTS chunks straight into ffmpeg's stdin.
    command = _build_ffmpeg_command(extracted_title, OUTPUT_FOLDER / audio_filename)
    process = subprocess.Popen(
        command, stdin=subprocess.PIPE, stderr=subprocess.DEVNULL
    )
    assert process.stdin is not None
    start_audio = time.perf_counter()
    total_samples = 0  # count of s16 samples written (for realtime telemetry)
    print(f"--- Generating Audio: {extracted_title} ---")
    for result in tts_model.generate(story_body, voice=VOICE, lang_code="b", speed=0.8):
        audio_np = np.array(result.audio)
        # Convert the float waveform in [-1, 1] to 16-bit signed PCM bytes.
        samples = (audio_np * 32767).astype(np.int16).tobytes()
        total_samples += len(samples) // 2  # 2 bytes per s16 sample
        process.stdin.write(samples)
    process.stdin.close()
    process.wait()
    end_audio = time.perf_counter()

    # 4. Telemetry.
    text_time = end_text - start_text
    tokens = len(tokenizer.encode(story_raw))
    print("\n" + "=" * 40)
    print(f"📖 STORY: {extracted_title}")
    print("=" * 40)
    print(f"LLM: {tokens / text_time:.2f} tokens/sec")
    print(f"Audio: {(total_samples / 24000) / (end_audio - start_audio):.2f}x realtime")
    print(f"Text: {text_filename}")
    print(f"Audio: {audio_filename}")
    print("=" * 40 + "\n")
if __name__ == "__main__":
    # Validate the configured background sound path before the long
    # generation run, so a typo fails fast with a clear message.
    if BACKGROUND_SOUND and not os.path.isfile(BACKGROUND_SOUND):
        print(f"{BACKGROUND_SOUND=} but the file doesn't exist.")
        raise SystemExit(1)
    generate_story()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment