@thomasjpfan
Last active May 30, 2025 14:51
parse_it.py
from llama_index.core import (
    VectorStoreIndex,
    Settings,
    StorageContext,
    SimpleDirectoryReader,
)
from llama_index.core import Document
from llama_index.readers.web import SimpleWebPageReader
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Configure the embedding model and LLM that LlamaIndex will use.
embedding = HuggingFaceEmbedding(model_name="nomic-ai/modernbert-embed-base")
llm = HuggingFaceLLM(
    model_name="Qwen/Qwen2.5-Coder-7B-Instruct",
    tokenizer_name="Qwen/Qwen2.5-Coder-7B-Instruct",
)
# llm = OpenAILike(
#     # api_base=f"{vllm_chat_app.endpoint}/v1",
#     api_base="http://127.0.0.1:1234/v1",
#     api_version="v1",
#     model="gemma-3-4b-it",
#     api_key="XYZ",
#     # is_chat_model=True,
#     # is_function_calling_model=True,
#     max_tokens=2056,
# )
Settings.llm = llm
Settings.embed_model = embedding

# source_url = "https://en.wikipedia.org/wiki/Star_Wars"
# documents = SimpleWebPageReader(html_to_text=True).load_data([source_url])

# Load the scraped Modal documentation as a single document.
with open("modal-docs/modal-text-docs.md") as f:
    document = Document(text=f.read())
# documents = SimpleDirectoryReader("modal-docs").load_data()

# Build a vector index backed by a local LanceDB database.
vector_store = LanceDBVectorStore(uri="lancedb-serving")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents([document], storage_context=storage_context)
# vector_store._table
# breakpoint()

# from llama_index.core import load_index_from_storage
# storage_context = StorageContext.from_defaults(persist_dir="storage")
# cur_index = load_index_from_storage(storage_context)
# storage_context.persist(persist_dir="storage")

# Query the index.
# query_engine = index.as_chat_engine()
# models["query_engine"] = query_engine
query_engine = index.as_query_engine()
response = query_engine.query("How to add pandas to a modal image?")
print(response)
from llama_index.core import (
    VectorStoreIndex,
    Settings,
    StorageContext,
    load_index_from_storage,
)
from llama_index.readers.web import SimpleWebPageReader
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.llms.huggingface import HuggingFaceLLM
# from llama_index import load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import lancedb

# Configure the embedding model and LLM that LlamaIndex will use.
embedding = HuggingFaceEmbedding(model_name="nomic-ai/modernbert-embed-base")
llm = HuggingFaceLLM(
    model_name="Qwen/Qwen2.5-Coder-7B-Instruct",
    tokenizer_name="Qwen/Qwen2.5-Coder-7B-Instruct",
)
Settings.llm = llm
Settings.embed_model = embedding

# Reload the index from the LanceDB table created by the script above.
db = lancedb.connect("lancedb-serving")
vector_store = LanceDBVectorStore.from_table(table=db["vectors"])
index = VectorStoreIndex.from_vector_store(vector_store)

# Query the index.
query_engine = index.as_query_engine()
# response = query_engine.query("How to add pandas to a modal image?")
response = query_engine.query(
    "What environment variables are available in a modal function?"
)
print(response)
The following file, the scraped Modal documentation, has been truncated; the full file is available in the Gist.
Introduction
============
Modal is a cloud function platform that lets you:
* Run any code remotely within seconds.
* Define [container environments](guide/images.html) in code (or use one of our pre-built backends).
* [Scale out horizontally](guide/scale.html) to thousands of containers.
* Attach [GPUs](guide/gpu.html) with a single line of code.
* Serve your functions as [web endpoints](guide/webhooks.html).
* Deploy and monitor [persistent scheduled jobs](guide/cron.html).
* Use powerful primitives like [distributed dictionaries and queues](guide/dicts-and-queues.html).
You get [full serverless execution and pricing](../pricing.html), because we host everything and charge per second of usage. Notably, there’s zero configuration in Modal - everything is code. Take a breath of fresh air and feel how good it tastes with no YAML in it.
Getting started
---------------
The nicest thing about all of this is that **you don’t have to set up any
infrastructure.** Just:
1. Create an account at [modal.com](https://modal.com)
2. Run `pip install modal` to install the `modal` Python package
3. Run `modal setup` to authenticate (if this doesn’t work, try `python -m modal setup`)
…and you can start running jobs right away. Check out some of our simple getting started examples:
* [Hello, world!](examples/hello_world.html)
* [A simple web scraper](examples/web-scraper.html)
You can also learn Modal interactively without installing anything through our [code playground](../playground/get_started.html).
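To make the three setup steps above concrete, here is a minimal sketch of a first Modal app. The file and function names are hypothetical, but the `modal.App`, `@app.function`, and `@app.local_entrypoint` APIs are the same ones used throughout this guide:
```
# hello_modal.py (hypothetical file name)
import modal

app = modal.App("hello-world")

@app.function()
def hello(name: str) -> str:
    # This body runs remotely in a Modal container.
    return f"Hello, {name}!"

@app.local_entrypoint()
def main():
    # Invoke locally with `modal run hello_modal.py`.
    print(hello.remote("world"))
```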
How does it work?
-----------------
Modal takes your code, puts it in a container, and executes it in the cloud.
Where does it run? Modal runs it in its own cloud environment. The benefit is
that we solve all the hard infrastructure problems for you, so you don’t have to
do anything. You don’t need to mess with Kubernetes, Docker or even an AWS
account.
Modal is currently Python-only, but we may support other languages in the
future.
API Reference
=============
This is the API reference for the [`modal`](https://pypi.org/project/modal/) Python package, which allows you to run distributed applications on Modal.
The reference is intended to be limited to low-level descriptions of various
programmatic functionality. If you’re just getting started with Modal, we would
instead recommend looking at the [guide](guide.html) first
or to get started quickly with an [example](examples.html).
Application construction
------------------------
| | |
| --- | --- |
| [`App`](reference/modal.App.html) | The main unit of deployment for code on Modal |
| [`App.function`](reference/modal.App.html#function) | Decorator for registering a function with an App |
| [`App.cls`](reference/modal.App.html#cls) | Decorator for registering a class with an App |
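As a rough sketch of how these three pieces fit together (the names below are hypothetical, not taken from the reference pages):
```
import modal

app = modal.App("reference-sketch")  # the main unit of deployment

@app.function()  # register a plain function with the App
def square(x: int) -> int:
    return x * x

@app.cls()  # register a class with the App
class Counter:
    @modal.method()
    def bump(self, n: int) -> int:
        return n + 1
```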
Serverless execution
--------------------
| | |
| --- | --- |
| [`Function`](reference/modal.Function.html) | A serverless function backed by an autoscaling container pool |
| [`Cls`](reference/modal.Cls.html) | A serverless class supporting parametrization and lifecycle hooks |
Extended Function configuration
-------------------------------
### Class parametrization
| | |
| --- | --- |
| [`parameter`](reference/modal.parameter.html) | Used to define class parameters, akin to a Dataclass field |
### Lifecycle hooks
| | |
| --- | --- |
| [`enter`](reference/modal.enter.html) | Decorator for a method that will be executed during container startup |
| [`exit`](reference/modal.exit.html) | Decorator for a method that will be executed during container shutdown |
| [`method`](reference/modal.method.html) | Decorator for exposing a method as an invokable function |
### Web integrations
| | |
| --- | --- |
| [`fastapi_endpoint`](reference/modal.fastapi_endpoint.html) | Decorator for exposing a simple FastAPI-based endpoint |
| [`asgi_app`](reference/modal.asgi_app.html) | Decorator for functions that construct an ASGI web application |
| [`wsgi_app`](reference/modal.wsgi_app.html) | Decorator for functions that construct a WSGI web application |
| [`web_server`](reference/modal.web_server.html) | Decorator for functions that construct an HTTP web server |
### Function semantics
| | |
| --- | --- |
| [`batched`](reference/modal.batched.html) | Decorator that enables [dynamic input batching](guide/dynamic-batching.html) |
| [`concurrent`](reference/modal.concurrent.html) | Decorator that enables [input concurrency](guide/concurrent-inputs.html) |
### Scheduling
| | |
| --- | --- |
| [`Cron`](reference/modal.Cron.html) | A schedule that runs based on cron syntax |
| [`Period`](reference/modal.Period.html) | A schedule that runs at a fixed interval |
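For illustration, a hedged sketch of how the two schedule types are attached to Functions (the function bodies and schedules are placeholders):
```
import modal

app = modal.App()

@app.function(schedule=modal.Cron("0 9 * * 1"))  # every Monday at 09:00 UTC
def weekly_report():
    ...

@app.function(schedule=modal.Period(hours=4))  # every four hours
def refresh_cache():
    ...
```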
### Exception handling
| | |
| --- | --- |
| [`Retries`](reference/modal.Retries.html) | Function retry policy for input failures |
Sandboxed execution
-------------------
| | |
| --- | --- |
| [`Sandbox`](reference/modal.Sandbox.html) | An interface for restricted code execution |
| [`ContainerProcess`](reference/modal.container_process.html#modalcontainer_processcontainerprocess) | An object representing a sandboxed process |
| [`FileIO`](reference/modal.file_io.html#modalfile_iofileio) | A handle for a file in the Sandbox filesystem |
Container configuration
-----------------------
| | |
| --- | --- |
| [`Image`](reference/modal.Image.html) | An API for specifying container images |
| [`Secret`](reference/modal.Secret.html) | A pointer to secrets that will be exposed as environment variables |
Data primitives
---------------
### Persistent storage
| | |
| --- | --- |
| [`Volume`](reference/modal.Volume.html) | Distributed storage supporting highly performant parallel reads |
| [`CloudBucketMount`](reference/modal.CloudBucketMount.html) | Storage backed by a third-party cloud bucket (S3, etc.) |
| [`NetworkFileSystem`](reference/modal.NetworkFileSystem.html) | Shared, writeable cloud storage (superseded by `modal.Volume`) |
### In-memory storage
| | |
| --- | --- |
| [`Dict`](reference/modal.Dict.html) | A distributed key-value store |
| [`Queue`](reference/modal.Queue.html) | A distributed FIFO queue |
Networking
----------
| | |
| --- | --- |
| [`Proxy`](reference/modal.Proxy.html) | An object that provides a static outbound IP address for containers |
| [`forward`](reference/modal.forward.html) | A context manager for publicly exposing a port from a container |
### Featured Examples
* [Deploy an OpenAI-compatible LLM service](examples/vllm_inference.html): Run large language models with a drop-in replacement for the OpenAI API.
* [Custom pet art from Flux with Hugging Face and Gradio](examples/dreambooth_app.html): Fine-tune an image generation model on pictures of your pet.
* [Run llama.cpp](examples/llama_cpp.html): Run DeepSeek-R1 and Phi-4 on llama.cpp.
* [Voice chat with LLMs](examples/llm-voice-chat.html): Build an interactive voice chat app.
* [Serve diffusion models](examples/flux.html): Serve Flux on Modal with a number of optimizations for blazingly fast inference.
* [Fold proteins with Chai-1](examples/chai1.html): Predict molecular structures from sequences with SotA open source models.
* [Serverless TensorRT-LLM (LLaMA 3 8B)](examples/trtllm_latency.html): Run interactive language model applications.
* [Star in custom music videos](examples/music-video-gen.html): Fine-tune a Wan2.1 video model on your face and run it in parallel.
* [Create music](examples/musicgen.html): Turn prompts into music with MusicGen.
* [Sandbox a LangGraph agent's code](examples/agent.html): Run an LLM coding agent that runs its own language models.
* [RAG Chat with PDFs](examples/chat_with_pdf_vision.html): Use ColBERT-style, multimodal embeddings with a Vision-Language Model to answer questions about documents.
* [Bring images to life](examples/image_to_video.html): Prompt a generative video model to animate an image.
* [Fast podcast transcriptions](examples/whisper-transcriber.html): Build an end-to-end podcast transcription app that leverages dozens of containers for super-fast processing.
* [Build a protein folding dashboard](examples/esm3.html): Serve a web UI for a protein model with ESM3, Molstar, and Gradio.
* [Deploy a Hacker News Slackbot](examples/hackernews_alerts.html): Periodically post new Hacker News posts to Slack.
* [Retrieval-Augmented Generation (RAG) for Q&A](examples/potus_speech_qanda.html): Build a question-answering web endpoint that can cite its sources.
* [Document OCR job queue](examples/doc_ocr_jobs.html): Use Modal as an infinitely scalable job queue that can service async tasks from a web app.
* [Parallel processing of Parquet files on S3](examples/s3_bucket_mount.html): Analyze data from the Taxi and Limousine Commission of NYC in parallel.
Connecting Modal to your Datadog account
========================================
You can use the [Modal + Datadog Integration](https://docs.datadoghq.com/integrations/modal/) to export Modal function logs to Datadog. You’ll find the Modal Datadog
Integration available for install in the Datadog marketplace.
What this integration does
--------------------------
This integration allows you to:
1. Export Modal audit logs in Datadog
2. Export Modal function logs to Datadog
3. Export container metrics to Datadog
Installing the integration
--------------------------
1. Open the [Modal Tile](https://app.datadoghq.com/integrations?integrationId=modal) (or the EU tile [here](https://app.datadoghq.eu/integrations?integrationId=modal))
in the Datadog integrations page
2. Click “Install Integration”
3. Click Connect Accounts to begin authorization of this integration.
You will be redirected to log into Modal, and once logged in, you’ll
be redirected to the Datadog authorization page.
4. Click “Authorize” to complete the integration setup
Metrics
-------
The Modal Datadog Integration will forward the following metrics to Datadog:
* `modal.cpu.utilization`
* `modal.memory.utilization`
* `modal.gpu.memory.utilization`
* `modal.gpu.compute.utilization`
These metrics come free of charge and are tagged with `container_id`, `environment_name`, and `workspace_name`.
Structured logging
------------------
Logs from Modal are sent to Datadog in plaintext without any structured
parsing. This means that if you have custom log formats, you’ll need to
set up a [log processing pipeline](https://docs.datadoghq.com/logs/log_configuration/pipelines/?tab=source) in Datadog to parse them.
Modal passes log messages in the `.message` field of the log record. To
parse logs, you should operate over this field. Note that the Modal Integration
does set up some basic pipelines. In order for your pipelines to work, ensure
that your pipelines come before Modal’s pipelines in your log settings.
Cost Savings
------------
The Modal Datadog Integration will forward all logs to Datadog, which could be costly for verbose apps. We recommend using either [Log Pipelines](https://docs.datadoghq.com/logs/log_configuration/pipelines/?tab=source) or [Index Exclusion Filters](https://docs.datadoghq.com/logs/indexes/?tab=ui#exclusion-filters) to filter logs before they are sent to Datadog.
The Modal Integration tags all logs with the `environment` attribute. The
simplest way to filter logs is to create a pipeline that filters on this
attribute and to isolate verbose apps in a separate environment.
Uninstalling the integration
----------------------------
Once the integration is uninstalled, all logs will stop being sent to
Datadog, and authorization will be revoked.
1. Navigate to the [Modal metrics settings page](http://modal.com/settings/metrics) and select “Delete Datadog Integration”.
2. On the Configure tab in the Modal integration tile in Datadog,
click Uninstall Integration.
3. Confirm that you want to uninstall the integration.
4. Ensure that all API keys associated with this integration have been
disabled by searching for the integration name on the [API Keys](https://app.datadoghq.com/organization-settings/api-keys?filter=Modal) page.
Asynchronous API usage
======================
All of the functions in Modal are available in both standard (blocking) and
asynchronous variants. The async interface can be accessed by appending `.aio` to any function in the Modal API.
For example, instead of `my_modal_function.remote("hello")` in a blocking
context, you can use `await my_modal_function.remote.aio("hello")` to get an
asynchronous coroutine response, for use with Python’s `asyncio` library.
```
import asyncio
import modal

app = modal.App()

@app.function()
async def myfunc():
    ...

@app.local_entrypoint()
async def main():
    # execute 100 remote calls to myfunc in parallel
    await asyncio.gather(*[myfunc.remote.aio() for i in range(100)])
```
This is an advanced feature. If you are comfortable with asynchronous
programming, you can use this to create arbitrary parallel execution patterns,
with the added benefit that any Modal functions will be executed remotely.
Async functions
---------------
Regardless of whether you use an async runtime (like `asyncio`) in your usage of *Modal
itself*, you are free to define your `app.function`-decorated function bodies
as either async or blocking. Both kinds of definitions will work for remote
Modal function calls from any context.
An async function can call a blocking function, and vice versa.
```
@app.function()
def blocking_function():
    return 42

@app.function()
async def async_function():
    x = await blocking_function.remote.aio()
    return x * 10

@app.local_entrypoint()
def blocking_main():
    print(async_function.remote())  # => 420
```
If a function is configured to support multiple concurrent inputs per container,
the behavior varies slightly between blocking and async contexts:
* In a blocking context, concurrent inputs will run on separate Python threads.
These are subject to the GIL, but they can still lead to race conditions if
used with non-threadsafe objects.
* In an async context, concurrent inputs are simply scheduled as coroutines on
  the executor thread, so everything remains single-threaded (see the sketch below).
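As a small sketch of the async case, assuming input concurrency is enabled with the `modal.concurrent` decorator listed in the API reference above (the limit of 10 inputs is an arbitrary illustrative value):
```
import asyncio
import modal

app = modal.App()

@app.function()
@modal.concurrent(max_inputs=10)  # allow up to 10 concurrent inputs per container
async def fetch(i: int) -> int:
    # Concurrent inputs are scheduled as coroutines on one thread,
    # so awaiting here lets other inputs make progress.
    await asyncio.sleep(1)
    return i
```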
Filesystem Access
=================
If you want to pass data in and out of the Sandbox during execution, you can use our
filesystem API to easily read and write files. The API supports reading files up to 100 MiB
and writes up to 1 GiB in size.
```
import modal

app = modal.App.lookup("sandbox-fs-demo", create_if_missing=True)

sb = modal.Sandbox.create(app=app)

with sb.open("test.txt", "w") as f:
    f.write("Hello World\n")

f = sb.open("test.txt", "rb")
print(f.read())
f.close()
```
The filesystem API is similar to Python’s built-in [io.FileIO](https://docs.python.org/3/library/io.html#io.FileIO) and supports many of the same methods, including `read`, `readline`, `readlines`, `write`, `flush`, `seek`, and `close`.
We also provide the special methods `replace_bytes` and `delete_bytes`, which may be useful for LLM-generated code.
```
from modal.file_io import delete_bytes, replace_bytes

with sb.open("example.txt", "w") as f:
    f.write("The quick brown fox jumps over the lazy dog")

with sb.open("example.txt", "r+") as f:
    # The quick brown fox jumps over the lazy dog
    print(f.read())

    # The slow brown fox jumps over the lazy dog
    replace_bytes(f, b"slow", start=4, end=9)

    # The slow red fox jumps over the lazy dog
    replace_bytes(f, b"red", start=9, end=14)

    # The slow red fox jumps over the dog
    delete_bytes(f, start=32, end=37)

    f.seek(0)
    print(f.read())

sb.terminate()
```
We additionally provide commands [`mkdir`](../reference/modal.Sandbox.html#mkdir), [`rm`](../reference/modal.Sandbox.html#rm), and [`ls`](../reference/modal.Sandbox.html#ls) to make interacting with the filesystem more ergonomic.
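A brief sketch of those helpers, reusing the Sandbox `sb` from the snippets above; the paths are hypothetical, and the exact signatures are documented on the reference pages linked above:
```
# Assumes `sb` is an existing modal.Sandbox, as created earlier on this page.
sb.mkdir("/data")                 # create a directory in the Sandbox filesystem
with sb.open("/data/notes.txt", "w") as f:
    f.write("hello\n")
print(sb.ls("/data"))             # list the directory's contents
sb.rm("/data/notes.txt")          # remove the file
```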
Syncing files outside the Sandbox
---------------------------------
Modal [Volume](../reference/modal.Volume.html)s or [CloudBucketMount](cloud-bucket-mounts.html)s can also be attached to
Sandboxes for file syncing outside the Sandbox. If you want to give the caller
access to files written by the Sandbox, you could create an ephemeral `Volume` that will be garbage collected when the App finishes:
```
with modal.Volume.ephemeral() as vol:
    sb = modal.Sandbox.create(
        volumes={"/cache": vol},
        app=my_app,
    )
    p = sb.exec("bash", "-c", "echo foo > /cache/a.txt")
    p.wait()
    sb.terminate()
    for data in vol.read_file("a.txt"):
        print(data)
```
Alternatively, if you want to persist files between Sandbox invocations (useful
if you’re building a stateful code interpreter, for example), you can create
a persisted `Volume` with a dynamically assigned label:
```
session_id = "example-session-id-123abc"
vol = modal.Volume.from_name(f"vol-{session_id}", create_if_missing=True)

sb = modal.Sandbox.create(
    volumes={"/cache": vol},
    app=my_app,
)
p = sb.exec("bash", "-c", "echo foo > /cache/a.txt")
p.wait()
sb.terminate()

for data in vol.read_file("a.txt"):
    print(data)
```
File syncing behavior differs between Volumes and CloudBucketMounts. For Volumes, files are only synced back to the Volume when the Sandbox terminates. For CloudBucketMounts, files are synced automatically.
Cold start performance
======================
Modal Functions are run in [containers](images.html).
If a container is already ready to run your Function, it will be reused.
If not, Modal spins up a new container.
This is known as a *cold start*,
and it is often associated with higher latency.
There are two sources of increased latency during cold starts:
1. inputs may **spend more time waiting** in a queue for a container
to become ready or “warm”.
2. when an input is handled by the container that just started,
there may be **extra work that only needs to be done on the first invocation** (“initialization”).
This guide presents techniques and Modal features for reducing the impact of both queueing
and initialization on observed latencies.
If you are invoking Functions with no warm containers
or if you otherwise see inputs spending too much time in the “pending” state,
you should [target queueing time for optimization](#reduce-time-spent-queueing-for-warm-containers).
If you see some Function invocations taking much longer than others,
and those invocations are the first handled by a new container,
you should [target initialization for optimization](#reduce-latency-from-initialization).
Reduce time spent queueing for warm containers
----------------------------------------------
New containers are booted when there are not enough warm containers to
handle the current number of inputs.
For example, the first time you send an input to a Function,
there are zero warm containers and there is one input,
so a single container must be booted up.
The total latency for the input will include
the time it takes to boot a container.
If you send another input right after the first one finishes,
there will be one warm container and one pending input,
and no new container will be booted.
Generalizing, there are two factors that affect the time inputs spend queueing:
the time it takes for a container to boot and become warm (which we solve by booting faster)
and the time until a warm container is available to handle an input (which we solve by having more warm containers).
### Warm up containers faster
The time taken for a container to become warm
and ready for inputs can range from seconds to minutes.
Modal’s custom container stack has been heavily optimized to reduce this time.
Containers boot in about one second.
But before a container is considered warm and ready to handle inputs,
we need to execute any logic in your code’s global scope (such as imports)
or in any [`modal.enter` methods](lifecycle-functions.html).
So if your boots are slow, these are the first places to work on optimization.
For example, you might be downloading a large model from a model server
during the boot process.
You can instead [download the model ahead of time](model-weights.html),
so that it only needs to be downloaded once.
For models in the tens of gigabytes,
this can reduce boot times from minutes to seconds.
### Run more warm containers
It is not always possible to speed up boots sufficiently.
For example, seconds of added latency to load a model may not
be acceptable in an interactive setting.
In this case, the only option is to have more warm containers running.
This increases the chance that an input will be handled by a warm container,
for example one that finishes an input while another container is booting.
Modal currently exposes three parameters to control how many containers will be warm: `scaledown_window`, `min_containers`, and `buffer_containers`.
All of these strategies can increase the resources consumed by your Function
and so introduce a trade-off between cold start latencies and cost.
#### Keep containers warm for longer with `scaledown_window`
Modal containers will remain idle for a short period before shutting down. By
default, the maximum idle time is 60 seconds. You can configure this by setting
the `scaledown_window` on the [`@function`](../reference/modal.App.html#function) decorator. The value is measured in seconds, and it can be set anywhere between
two seconds and twenty minutes.
```
import modal

app = modal.App()

@app.function(scaledown_window=300)
def my_idle_greeting():
    return {"hello": "world"}
```
Increasing the `scaledown_window` reduces the chance that subsequent requests
will require a cold start, although you will be billed for any resources used
while the container is idle (e.g., GPU reservation or residual memory
occupancy). Note that containers will not necessarily remain alive for the
entire window, as the autoscaler will scale down more aggressively when the
Function is substantially over-provisioned.
#### Overprovision resources with `min_containers` and `buffer_containers`
Keeping already warm containers around longer doesn’t help if there are no warm
containers to begin with, as when Functions scale from zero.
To keep some containers warm and running at all times, set the `min_containers` value on the [`@function`](../reference/modal.App.html#function) decorator. This
puts a floor on the number of containers so that the Function doesn’t scale
to zero. Modal will still scale up and spin down more containers as the
demand for your Function fluctuates above the `min_containers` value, as usual.
While `min_containers` overprovisions containers while the Function is idle, `buffer_containers` provisions extra containers while the Function is active.
This “buffer” of extra containers will be idle and ready to handle inputs if
the rate of requests increases. This parameter is particularly useful for
bursty request patterns, where the arrival of one input predicts the arrival of more inputs,
like when a new user or client starts hitting the Function.
```
import modal

app = modal.App(image=modal.Image.debian_slim().pip_install("fastapi"))

@app.function(min_containers=3, buffer_containers=3)
def my_warm_greeting():
    return "Hello, world!"
```
#### Adjust warm pools dynamically
You can also set the warm pool size for a deployed function dynamically with [`Function.update_autoscaler`](../reference/modal.Function.html#update_autoscaler).
This can be used with a Modal [scheduled function](cron.html) to update the number of warm containers based on the time of day, for example:
```
import modal

app = modal.App()

@app.function()
def square(x):
    return x**2

@app.function(schedule=modal.Cron("0 * * * *"))  # run at the start of the hour
def update_autoscaler_settings():
    from datetime import datetime, timezone

    peak_hours_start, peak_hours_end = 6, 18
    if peak_hours_start <= datetime.now(timezone.utc).hour < peak_hours_end:
        square.update_autoscaler(min_containers=3)
    else:
        square.update_autoscaler(min_containers=0)
```
Reduce latency from initialization
----------------------------------
Some work is done the first time that a function is invoked
but can be used on every subsequent invocation.
This is [*amortized work*](https://www.cs.cornell.edu/courses/cs312/2006sp/lectures/lec18.html) done at initialization.
For example, you may be using a large pre-trained model
whose weights need to be loaded from disk to memory the first time it is used.
This results in longer latencies for the first invocation of a warm container,
which shows up in the application as occasional slow calls: high tail latency or elevated p9Xs.
### Move initialization work out of the first invocation
Some work done on the first invocation can be moved up and completed ahead of time.
Any work that can be saved to disk, like [downloading model weights](model-weights.html),
should be done as early as possible. The results can be included in the [container’s Image](images.html) or saved to a [Modal Volume](volumes.html).
Some work is tricky to serialize, like spinning up a network connection or an inference server.
If you can move this initialization logic out of the function body and into the global scope or a [container `enter` method](lifecycle-functions.html#enter),
you can move this work into the warm up period.
Containers will not be considered warm until all `enter` methods have completed,
so no inputs will be routed to containers that have yet to complete this initialization.
For more on how to use `enter` with machine learning model weights, see [this guide](model-weights.html).
Note that `enter` doesn’t get rid of the latency —
it just moves the latency to the warm up period,
where it can be handled by [running more warm containers](#run-more-warm-containers).
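As an illustration of this pattern, here is a hedged sketch that moves one-time setup into an `enter` method; the class, image, and tiny model are placeholders rather than a recommended configuration:
```
import modal

image = modal.Image.debian_slim().pip_install("torch")
app = modal.App("enter-sketch", image=image)

@app.cls()
class Model:
    @modal.enter()
    def load(self):
        # One-time initialization runs during the warm-up period,
        # before any inputs are routed to this container.
        import torch
        self.model = torch.nn.Linear(8, 1)

    @modal.method()
    def predict(self, x: list[float]) -> float:
        import torch
        with torch.no_grad():
            return float(self.model(torch.tensor(x)))
```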
### Share initialization work across cold starts with memory snapshots
Cold starts can also be made faster by using memory snapshots.
Invocations of a Function after the first
are faster in part because the memory is already populated
with values that otherwise need to be computed or read from disk,
like the contents of imported libraries.
Memory snapshotting captures the state of a container’s memory
at user-controlled points after it has been warmed up
and reuses that state in future boots, which can substantially
reduce cold start latency penalties and warm up period duration.
Refer to the [memory snapshot](memory-snapshot.html) guide for details.
### Optimize initialization code
Sometimes, there is nothing to be done but to speed this work up.
Here, we share specific patterns that show up in optimizing initialization
in Modal Functions.
#### Load multiple large files concurrently
Often Modal applications need to read large files into memory (e.g., model
weights) before they can process inputs. Where feasible, these large file
reads should happen concurrently rather than sequentially. Concurrent IO takes
full advantage of our platform’s high disk and network bandwidth
to reduce latency.
One common example of slow sequential IO is loading multiple independent
Hugging Face `transformers` models in series.
```
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration

model_a = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor_a = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model_b = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
processor_b = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
```
The above snippet does four `.from_pretrained` loads sequentially.
None of the components depend on another being already loaded in memory, so they
can be loaded concurrently instead, using a function like this:
```
from concurrent.futures import ThreadPoolExecutor, as_completed

from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration

def load_models_concurrently(load_functions_map: dict) -> dict:
    model_id_to_model = {}
    with ThreadPoolExecutor(max_workers=len(load_functions_map)) as executor:
        future_to_model_id = {
            executor.submit(load_fn): model_id
            for model_id, load_fn in load_functions_map.items()
        }
        for future in as_completed(future_to_model_id.keys()):
            model_id_to_model[future_to_model_id[future]] = future.result()
    return model_id_to_model

components = load_models_concurrently({
    "clip_model": lambda: CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    "clip_processor": lambda: CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
    "blip_model": lambda: BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large"),
    "blip_processor": lambda: BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large"),
})
```
If performing concurrent IO on large file reads does *not* speed up your cold
starts, it’s possible that some part of your function’s code is holding the
Python [GIL](https://wiki.python.org/moin/GlobalInterpreterLock) and reducing
the efficacy of the multi-threaded executor.
Private registries
==================
Modal provides the [`Image.from_registry`](images.html#use-an-existing-container-image-with-from_registry) function, which can pull public images available from registries such as Docker
Hub and GitHub Container Registry, as well as private images from registries
such as [AWS Elastic Container Registry (ECR)](https://aws.amazon.com/ecr/), [GCP Artifact Registry](https://cloud.google.com/artifact-registry), and Docker
Hub.
Docker Hub (Private)
--------------------
To pull container images from private Docker Hub repositories, [create an access token](https://docs.docker.com/security/for-developers/access-tokens/) with “Read-Only” permissions and use this token value and your Docker Hub
username to create a Modal [Secret](secrets.html).
```
REGISTRY_USERNAME=my-dockerhub-username
REGISTRY_PASSWORD=dckr_pat_TS012345aaa67890bbbb1234ccc
```
Use this Secret with the [`modal.Image.from_registry`](../reference/modal.Image.html#from_registry) method.
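For instance, a hedged sketch of pulling a private Docker Hub image with that Secret; the Secret name `my-dockerhub-secret` and the image tag are hypothetical:
```
import modal

# Assumes a Modal Secret holding REGISTRY_USERNAME / REGISTRY_PASSWORD,
# created as described above and named "my-dockerhub-secret" (hypothetical name).
dockerhub_secret = modal.Secret.from_name("my-dockerhub-secret")

image = modal.Image.from_registry(
    "my-dockerhub-username/my-private-image:latest",
    secret=dockerhub_secret,
)

app = modal.App(image=image)
```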
Elastic Container Registry (ECR)
--------------------------------
You can pull images from your AWS ECR account by specifying the full image URI
as follows:
```
import modal

aws_secret = modal.Secret.from_name("my-aws-secret")

image = (
    modal.Image.from_aws_ecr(
        "000000000000.dkr.ecr.us-east-1.amazonaws.com/my-private-registry:latest",
        secret=aws_secret,
    )
    .pip_install("torch", "huggingface")
)

app = modal.App(image=image)
```
As shown above, you also need to use a [Modal Secret](secrets.html) containing the environment variables `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_REGION`. The AWS IAM user account associated
with those keys must have access to the private registry you want to access.
The user needs to have the following read-only policies:
```
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": ["ecr:GetAuthorizationToken"],
      "Effect": "Allow",
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": [
        "ecr:BatchCheckLayerAvailability",
        "ecr:GetDownloadUrlForLayer",
        "ecr:GetRepositoryPolicy",
        "ecr:DescribeRepositories",
        "ecr:ListImages",
        "ecr:DescribeImages",
        "ecr:BatchGetImage",
        "ecr:GetLifecyclePolicy",
        "ecr:GetLifecyclePolicyPreview",
        "ecr:ListTagsForResource",
        "ecr:DescribeImageScanFindings"
      ],
      "Resource": "<MY-REGISTRY-ARN>"
    }
  ]
}
```
You can use the IAM configuration above as a template for creating an IAM user.
You can then [generate an access key](https://aws.amazon.com/premiumsupport/knowledge-center/create-access-key/) and create a Modal Secret using the AWS integration option. Modal will use your
access keys to generate an ephemeral ECR token. That token is only used to pull
image layers at the time a new image is built. We don’t store this token but
will cache the image once it has been pulled.
Images on ECR must be private and follow [image configuration requirements](../reference/modal.Image.html#from_aws_ecr).
Google Artifact Registry and Google Container Registry
------------------------------------------------------
For further detail on how to pull images from Google’s image registries, see [`modal.Image.from_gcp_artifact_registry`](../reference/modal.Image.html#from_gcp_artifact_registry).
Cloud bucket mounts
===================
The [`modal.CloudBucketMount`](../reference/modal.CloudBucketMount.html) is a
mutable volume that allows for both reading and writing files from a cloud
bucket. It supports AWS S3, Cloudflare R2, and Google Cloud Storage buckets.
Cloud bucket mounts are built on top of AWS’s [`mountpoint`](https://github.com/awslabs/mountpoint-s3) technology and inherit
its limitations. Notably, mode changes are disabled, so commands like `chmod` and [`shutil.copymode()`](https://docs.python.org/3/library/shutil.html#shutil.copymode) will fail.
Mounting Cloudflare R2 buckets
------------------------------
`CloudBucketMount` enables Cloudflare R2 buckets to be mounted as file system
volumes. Because Cloudflare R2 is [S3-Compatible](https://developers.cloudflare.com/r2/api/s3/api/) the setup is
very similar between R2 and S3. See [modal.CloudBucketMount](../reference/modal.CloudBucketMount.html#modalcloudbucketmount) for usage instructions.
When creating the R2 API token for use with the mount, you need to have the
ability to read, write, and list objects in the specific buckets you will mount.
You do *not* need admin permissions, and you should *not* use “Client IP Address
Filtering”.
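As a hedged sketch of an R2 mount, assuming the `bucket_endpoint_url` parameter documented on the `modal.CloudBucketMount` reference page and a hypothetical Secret holding the R2 token credentials under the AWS-style variable names:
```
import modal
import subprocess

app = modal.App()

# Hypothetical Secret with AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# set to the Cloudflare R2 API token's credentials.
r2_credentials = modal.Secret.from_name("my-r2-secret")

@app.function(
    volumes={
        "/my-r2-mount": modal.CloudBucketMount(
            bucket_name="my-r2-bucket",
            bucket_endpoint_url="https://<ACCOUNT_ID>.r2.cloudflarestorage.com",
            secret=r2_credentials,
        )
    }
)
def f():
    subprocess.run(["ls", "/my-r2-mount"])
```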
Mounting Google Cloud Storage buckets
-------------------------------------
`CloudBucketMount` enables Google Cloud Storage (GCS) buckets to be mounted as file system
volumes. See [modal.CloudBucketMount](../reference/modal.CloudBucketMount.html#modalcloudbucketmount) for GCS setup instructions.
Mounting S3 buckets
-------------------
`CloudBucketMount` enables S3 buckets to be mounted as file system volumes. To
interact with a bucket, you must have the appropriate IAM permissions configured
(refer to the section on [IAM Permissions](#iam-permissions)).
```
import modal
import subprocess

app = modal.App()

s3_bucket_name = "s3-bucket-name"  # Bucket name not ARN.
s3_access_credentials = modal.Secret.from_dict({
    "AWS_ACCESS_KEY_ID": "...",
    "AWS_SECRET_ACCESS_KEY": "...",
    "AWS_REGION": "...",
})

@app.function(
    volumes={
        "/my-mount": modal.CloudBucketMount(s3_bucket_name, secret=s3_access_credentials)
    }
)
def f():
    subprocess.run(["ls", "/my-mount"])
```
### Specifying S3 bucket region
Amazon S3 buckets are associated with a single AWS Region. [`Mountpoint`](https://github.com/awslabs/mountpoint-s3) attempts to automatically detect the region for your S3 bucket at startup time and directs all S3 requests to that region. However, in certain scenarios, such as when your container is running on an AWS worker in one region while your bucket is in a different region, this automatic detection may fail.
To avoid this issue, you can specify the region of your S3 bucket by adding an `AWS_REGION` key to your Modal secrets, as in the code example above.
### Using AWS temporary security credentials
`CloudBucketMount`s also support AWS temporary security credentials by passing
the additional environment variable `AWS_SESSION_TOKEN`. Temporary credentials
will expire and will not get renewed automatically. You will need to update
the corresponding Modal Secret in order to prevent failures.
You can get temporary credentials with the [AWS CLI](https://aws.amazon.com/cli/) with:
```
$ aws configure export-credentials --format env
export AWS_ACCESS_KEY_ID=XXX
export AWS_SECRET_ACCESS_KEY=XXX
export AWS_SESSION_TOKEN=XXX...
```
All these values are required.
### Using OIDC identity tokens
Modal provides [OIDC integration](oidc-integration.html) and will automatically generate identity tokens to authenticate to AWS.
OIDC eliminates the need for manual token passing through Modal secrets and is based on short-lived tokens, which limits the window of exposure if a token is compromised.
To use this feature, you must [configure AWS to trust Modal’s OIDC provider](oidc-integration.html#step-1-configure-aws-to-trust-modals-oidc-provider) and [create an IAM role that can be assumed by Modal Functions](oidc-integration.html#step-2-create-an-iam-role-that-can-be-assumed-by-modal-functions).
Then, you specify the IAM role that your Modal Function should assume to access the S3 bucket.
```
import modal
import subprocess

app = modal.App()

s3_bucket_name = "s3-bucket-name"
role_arn = "arn:aws:iam::123456789abcd:role/s3mount-role"

@app.function(
    volumes={
        "/my-mount": modal.CloudBucketMount(
            bucket_name=s3_bucket_name,
            oidc_auth_role_arn=role_arn,
        )
    }
)
def f():
    subprocess.run(["ls", "/my-mount"])
```
### Mounting a path within a bucket
To mount only the files under a specific subdirectory, you can specify a path prefix using `key_prefix`.
Since this prefix specifies a directory, it must end in a `/`.
The entire bucket is mounted when no prefix is supplied.
```
import modal
import subprocess

app = modal.App()

s3_bucket_name = "s3-bucket-name"
prefix = "path/to/dir/"
s3_access_credentials = modal.Secret.from_dict({
    "AWS_ACCESS_KEY_ID": "...",
    "AWS_SECRET_ACCESS_KEY": "...",
})

@app.function(
    volumes={
        "/my-mount": modal.CloudBucketMount(
            bucket_name=s3_bucket_name,
            key_prefix=prefix,
            secret=s3_access_credentials,
        )
    }
)
def f():
    subprocess.run(["ls", "/my-mount"])
```
This will only mount the files in the bucket `s3-bucket-name` that are prefixed by `path/to/dir/`.
### Read-only mode
To mount a bucket in read-only mode, set `read_only=True` as an argument.
```
import modal
import subprocess

app = modal.App()

s3_bucket_name = "s3-bucket-name"  # Bucket name not ARN.
s3_access_credentials = modal.Secret.from_dict({
    "AWS_ACCESS_KEY_ID": "...",
    "AWS_SECRET_ACCESS_KEY": "...",
})

@app.function(
    volumes={
        "/my-mount": modal.CloudBucketMount(s3_bucket_name, secret=s3_access_credentials, read_only=True)
    }
)
def f():
    subprocess.run(["ls", "/my-mount"])
```
While S3 mounts support both write and read operations, they are optimized for
reading large files sequentially. Certain file operations, such as renaming
files, are not supported. For a comprehensive list of supported operations,
consult the [Mountpoint documentation](https://github.com/awslabs/mountpoint-s3/blob/main/doc/SEMANTICS.md).
### IAM permissions
To utilize `CloudBucketMount` for reading and writing files from S3 buckets,
your IAM policy must include permissions for `s3:PutObject`, `s3:AbortMultipartUpload`, and `s3:DeleteObject`. These permissions are not
required for mounts configured with `read_only=True`.
```
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "ModalListBucketAccess",
      "Effect": "Allow",
      "Action": ["s3:ListBucket"],
      "Resource": ["arn:aws:s3:::<MY-S3-BUCKET>"]
    },
    {
      "Sid": "ModalBucketAccess",
      "Effect": "Allow",
      "Action": [
        "s3:GetObject",
        "s3:PutObject",
        "s3:AbortMultipartUpload",
        "s3:DeleteObject"
      ],
      "Resource": ["arn:aws:s3:::<MY-S3-BUCKET>/*"]
    }
  ]
}
```
Managing deployments
====================
Once you’ve finished using `modal run` or `modal serve` to iterate on your Modal
code, it’s time to deploy. A Modal deployment creates and then persists an
application and its objects, providing the following benefits:
* Repeated application function executions will be grouped under the deployment,
aiding observability and usage tracking. Programmatically triggering lots of
ephemeral App runs can clutter your web and CLI interfaces.
* Function calls are much faster because deployed functions are persistent and
reused, not created on-demand by calls. Learn how to trigger deployed
functions in [Invoking deployed functions](trigger-deployed-functions.html).
* [Scheduled functions](cron.html) will continue scheduling separate from
any local iteration you do, and will notify you on failure.
* [Web endpoints](webhooks.html) keep running when you close your laptop,
and their URL address matches the deployment name.
Creating deployments
--------------------
Deployments are created using the [`modal deploy` command](../reference/cli/app.html#modal-app-list).
```
% modal deploy -m whisper_pod_transcriber.main
✓ Initialized. View app page at https://modal.com/apps/ap-PYc2Tb7JrkskFUI8U5w0KG.
✓ Created objects.
├── 🔨 Created populate_podcast_metadata.
├── 🔨 Mounted /home/ubuntu/whisper_pod_transcriber at /root/whisper_pod_transcriber
├── 🔨 Created fastapi_app => https://modal-labs-whisper-pod-transcriber-fastapi-app.modal.run
├── 🔨 Mounted /home/ubuntu/whisper_pod_transcriber/whisper_frontend/dist at /assets
├── 🔨 Created search_podcast.
├── 🔨 Created refresh_index.
├── 🔨 Created transcribe_segment.
├── 🔨 Created transcribe_episode.
└── 🔨 Created fetch_episodes.
✓ App deployed! 🎉
View Deployment: https://modal.com/apps/modal-labs/whisper-pod-transcriber
```
Running this command on an existing deployment will redeploy the App,
incrementing its version. For detail on how live deployed apps transition
between versions, see the [Updating deployments](#updating-deployments) section.
Deployments can also be created programmatically using Modal’s [Python API](../reference/modal.App.html#deploy).
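For example, a minimal sketch of a programmatic deployment using `App.deploy` (the app name and function are placeholders):
```
import modal

app = modal.App("my-deployed-app")

@app.function()
def ping() -> str:
    return "pong"

if __name__ == "__main__":
    # Deploy this App from a script instead of the `modal deploy` CLI.
    app.deploy()
```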
Viewing deployments
-------------------
Deployments can be viewed either on the [apps](https://modal.com/apps) web page or by using the [`modal app list` command](../reference/cli/app.html#modal-app-list).
Updating deployments
--------------------
A deployment can deploy a new App or redeploy a new version of an existing
deployed App. It’s useful to understand how Modal handles the transition between
versions when an App is redeployed. In general, Modal aims to support
zero-downtime deployments by gradually transitioning traffic to the new version.
If the deployment involves building new versions of the Images used by the App,
the build process will need to complete successfully. The existing version of
the App will continue to handle requests during this time. Errors during the
build will abort the deployment with no change to the status of the App.
After the build completes, Modal will start to bring up new containers running
the latest version of the App. The existing containers will continue handling
requests (using the previous version of the App) until the new containers have
completed their cold start.
Once the new containers are ready, old containers will stop accepting new
requests. However, the old containers will continue running any requests they
had previously accepted. The old containers will not terminate until they have
finished processing all ongoing requests.
Any warm pool containers will also be cycled during a deployment, as the
previous version’s warm pool is now outdated.
Deployment rollbacks
--------------------
To quickly reset an App back to a previous version, you can perform a deployment *rollback*. Rollbacks can be triggered from either the App dashboard or the CLI.
Rollback deployments look like new deployments: they increment the version number
and are attributed to the user who triggered the rollback. But the App’s functions
and metadata will be reset to their previous state independently of your current
App codebase.
Note that deployment rollbacks are supported only on the Team and Enterprise plans.
Stopping deployments
--------------------
Deployed apps can be stopped in the web UI by clicking the red “Stop app” button on
the App’s “Overview” page, or alternatively from the command line using the [`modal app stop` command](../reference/cli/app.html#modal-app-stop).
Stopping an App is a destructive action. Apps cannot be restarted from this state;
a new App will need to be deployed from the same source files. Objects associated
with stopped deployments will eventually be garbage collected.
Memory Snapshot (beta)
======================
Memory snapshots can dramatically improve cold start performance for compatible Modal Functions.
During startup, your Python function typically reads many files from the file system, which
is expensive. For example, the `torch` package is [hundreds of MiB](https://pypi.org/project/torch/#files) and requires over 20,000 file operations to load! With memory snapshots, Modal
will produce restorable saves of your Function’s container right after startup initialization, and use these when available to lower startup latency. Functions with memory snapshots enabled **typically start 1.5-3x faster**.
Modal produces snapshots for deployed Functions on demand, creating and maintaining several snapshots to ensure coverage across our diverse worker fleet. Modal will also automatically expire snapshots and create new ones as we make runtime and security updates.
You don’t need to modify CPU Functions to take advantage of snapshotting in most
cases. GPU-enabled Functions typically require refactoring to move GPU
initialization into post-restore lifecycle functions (see below).
This is a *beta* feature. Let us know in [Modal Slack](../../slack.html) if you find any issues. To use memory snapshots, we recommend using Modal client version [`0.64.99`](../reference/changelog.html#06499-2024-09-11) or later.
Enabling snapshots
------------------
You can enable memory snapshots for your Function with the `enable_memory_snapshot=True` parameter:
```
import modal

app = modal.App("example-memory-snapshot")

@app.function(enable_memory_snapshot=True)
def my_func():
    print("hello")
```
Then deploy the App with `modal deploy`. Memory snapshots are created only when an App is in a deployed state and aren’t enabled for ephemeral Apps.
Keep the following in mind when using memory snapshots:
* Every time a snapshot is created, Modal logs `Creating memory snapshot for Function.`.
* Modal creates several snapshots for a given version of your Function (see [Snapshot compatibility](#snapshot-compatibility) section).
* Redeploying your Function may cause Modal to create new snapshots, as existing snapshots
might not be compatible with your updated Function.
* Creating memory snapshots adds latency to a Function’s startup time, so expect
your Function to be slower to start during the first invocations.
Updating snapshots
------------------
Redeploying your Function with new configuration (e.g. a new GPU type) or new code will cause previous snapshots to become obsolete. Subsequent invocations to the new Function version will automatically create new snapshots from the new configuration and code.
Modal also automatically recreates your snapshots to keep up with the platform’s latest runtime and security changes.
Snapshot compatibility
----------------------
Modal will create memory snapshots for every new version of your Function.
Changing your Function or updating its dependencies will trigger a new
snapshotting operation when you run your Function anew.
Additionally, you may observe in your application logs that your Function is memory
snapshotted multiple times during its first few invocations. This happens because
memory snapshots are only compatible with the underlying worker type that created them,
and Modal Functions run across a handful of worker types.
CPU-only Functions need around 6 snapshots for coverage, and Functions targeting a specific
GPU (e.g. A100) need 2-3. The cold boot benefits should greatly outweigh the penalty of creating multiple
snapshots.
Using snapshots with lifecycle functions
----------------------------------------
It’s currently not possible to snapshot GPU memory. We avoid exposing GPU
devices to your Function during the snapshotting stage (i.e. in methods decorated with `@enter(snap=True)`): NVIDIA drivers are available, but no GPU devices are.
To work around this limitation, we suggest refactoring your initialization code
to run across two separate `@modal.enter` functions: one that runs before
creating the snapshot (`snap=True`), and one that runs after restoring from the
snapshot (`snap=False`). Load model weights onto CPU memory in the `snap=True` method, and then move the weights onto GPU memory in the `snap=False` method.
Here’s an example using the `sentence-transformers` package:
```
import modal

image = modal.Image.debian_slim().pip_install("sentence-transformers")
app = modal.App("sentence-transformers", image=image)

with image.imports():
    from sentence_transformers import SentenceTransformer

model_vol = modal.Volume.from_name("sentence-transformers-models", create_if_missing=True)

@app.cls(gpu="a10g", volumes={"/models": model_vol}, enable_memory_snapshot=True)
class Embedder:
    model_id = "BAAI/bge-small-en-v1.5"

    @modal.enter(snap=True)
    def load(self):
        # Create a memory snapshot with the model loaded in CPU memory.
        self.model = SentenceTransformer(f"/models/{self.model_id}", device="cpu")

    @modal.enter(snap=False)
    def setup(self):
        self.model.to("cuda")  # Move the model to a GPU!

    @modal.method()
    def run(self, sentences: list[str]):
        embeddings = self.model.encode(sentences, normalize_embeddings=True)
        print(embeddings)

@app.local_entrypoint()
def main():
    Embedder().run.remote(sentences=["what is the meaning of life?"])

if __name__ == "__main__":
    cls = modal.Cls.from_name("sentence-transformers", "Embedder")
    cls().run.remote(sentences=["what is the meaning of life?"])
```
Snapshotting reduces the time it takes for this App’s Function to startup by about 3x, from ~6 seconds down to just ~2 seconds.
Known limitations
-----------------
Memory Snapshot is still in *beta*. Please report any issues on our [community Slack server](../../slack.html).
Client versions prior to [`0.64.99`](../reference/changelog.html#06499-2024-09-11) contain bugs that may cause snapshot restoration to fail.
### Caching GPU information
If your program calls functions that check if GPUs are available during snapshotting,
they will get a misleading report.
In the following example, GPUs are not available when `no_gpus_available_during_snapshots()` is called, but they are when the app
is restored and `gpus_available_following_restore()` is called:
```
import modal

app = modal.App(image=modal.Image.debian_slim().pip_install("torch"))

@app.cls(enable_memory_snapshot=True, gpu="any")
class GPUAvailability:
    @modal.enter(snap=True)
    def no_gpus_available_during_snapshots(self):
        import torch
        print(f"GPUs available: {torch.cuda.is_available()}")  # False

    @modal.enter(snap=False)
    def gpus_available_following_restore(self):
        import torch
        print(f"GPUs available: {torch.cuda.is_available()}")  # True

    @modal.method()
    def demo(self):
        print("Done!")
```
The `torch.cuda` module has multiple functions which, if called during
snapshotting, will initialize CUDA as having zero GPU devices. Such functions
include `torch.cuda.is_available` and `torch.cuda.get_device_capability`.
If you’re using a framework that calls these methods during its import phase,
it may not be compatible with memory snapshots. The problem can manifest as
confusing “cuda not available” or “no CUDA-capable device is detected” errors.
In particular, `xformers` is known to call `torch.cuda.get_device_capability` on
import, so if it is imported during snapshotting it can unhelpfully initialize
CUDA with zero GPUs. The [workaround](https://github.com/facebookresearch/xformers/issues/1030) for this
is to be on version `>=0.0.28` and set the `XFORMERS_ENABLE_TRITON` environment
variable to `1` in your `modal.Image`.
```
image = modal.Image.debian_slim().env({"XFORMERS_ENABLE_TRITON": "1"})
```
Setting this variable causes an early return from the `xformers` code path that would otherwise unhelpfully initialize CUDA.
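Putting both parts of the workaround together, here is a minimal sketch of such an image (the exact version pin is an assumption based on the guidance above):
```
import modal

# Pin xformers to a release that includes the early-return fix and set the
# variable that skips the CUDA-initializing import-time check.
image = (
    modal.Image.debian_slim()
    .pip_install("xformers>=0.0.28")
    .env({"XFORMERS_ENABLE_TRITON": "1"})
)
```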
### Randomness and uniqueness
If your application depends on uniqueness of state, you must evaluate your
Function code and verify that it is resilient to snapshotting operations. For
example, if a variable is randomly initialized and snapshotted, that variable
will be identical after every restore, possibly breaking the uniqueness expectations
of the Function code that runs after the restore.
Sandbox memory snapshots
========================
> 🌱 Sandbox memory snapshots are in **early preview, pre-beta.**
Sandbox memory snapshots are copies of a Sandbox’s entire state, both in memory and on the filesystem. These Snapshots can be restored later to create a new Sandbox, which is an exact clone of the original Sandbox.
To snapshot a Sandbox, create it with `_experimental_enable_snapshot` set to `True`, and use the `_experimental_snapshot` method, which returns a `SandboxSnapshot` object:
```
image = modal.Image.debian_slim().apt_install("curl", "procps")
app = modal.App.lookup("sandbox-snapshot", create_if_missing=True)

with modal.enable_output():
    sb = modal.Sandbox.create(
        "python3", "-m", "http.server", "8000",
        app=app, image=image, _experimental_enable_snapshot=True,
    )

print(f"Performing snapshot of {sb.object_id} ...")
snapshot = sb._experimental_snapshot()
```
Create a new Sandbox from the returned SandboxSnapshot with `Sandbox._experimental_from_snapshot`:
```
print(f"Restoring from snapshot {sb.object_id} ...")
sb2 = modal.Sandbox._experimental_from_snapshot(snapshot)
print("Let's see that the http.server is still running...")
p = sb2.exec("ps", "aux")
print(p.stdout.read())
# Talk to snapshotted sandbox http.server
p = sb2.exec("curl", "http://localhost:8000/")
reply = p.stdout.read()
print(reply) # <!DOCTYPE HTML><html lang...
```
The new Sandbox will be a duplicate of your original Sandbox. All running processes will still be running, in the same state as when they were snapshotted, and any changes made to the filesystem will be visible.
You can retrieve the ID of any Sandbox Snapshot with `snapshot.object_id`. To restore from a snapshot by ID, first rehydrate the Snapshot with `SandboxSnapshot.from_id` and then restore from it:
```
snapshot_id = snapshot.object_id
# ... save the snapshot ID for later
# sometime in the future...
snapshot = modal.SandboxSnapshot.from_id(snapshot_id)
sandbox = modal.Sandbox._experimental_from_snapshot(snapshot)
```
Note that these methods are *experimental*, and we may change them in the future.
### Re-snapshotting
Modal supports creating a new snapshot from a restored Sandbox snapshot. To maintain the snapshot’s expiration window, the new snapshot inherits the expiration of its parent.
Continuing from the example code above, we demonstrate re-snapshotting:
```
# Add a file to the snapshotted sandbox
p = sb2.exec("touch", "/foo")
p.wait()
snapshot2 = sb2._experimental_snapshot()
print(f"Restoring from new snapshot {sb.object_id} ...")
sb3 = modal.Sandbox._experimental_from_snapshot(snapshot2)
# Talk to re-snapshotted sandbox http.server
p = sb3.exec("curl", "http://localhost:8000/")
reply = p.stdout.read()
print(reply) # Shows the new 'foo' file in the HTML listing.
```
### Limitations
Currently, Sandbox Snapshots will expire 7 days after creation.
Open TCP connections will be closed automatically when a Snapshot is taken, and will need to be reopened when the Snapshot is restored.
Snapshotting a sandbox will currently cause it to terminate. We intend to remove this limitation soon.
Sandboxes created with `_experimental_enable_snapshot=True` or restored from Snapshots cannot run with GPUs.
It is not possible to snapshot a sandbox while a `Sandbox.exec` command is still running. Furthermore, any background processes launched by a call to `Sandbox.exec` will not be properly restored after a snapshot.
Region selection
================
Modal allows you to specify which cloud region you would like to run a Function in. This may be useful if:
* you are required (for regulatory reasons or by your customers) to process data within certain regions.
* you want to reduce egress fees that result from reading data from a dependency like S3.
* you have a latency-sensitive app where app endpoints need to run near an external DB.
Note that regardless of what region your Function runs in, all Function inputs and outputs go through Modal’s control plane in us-east-1.
Pricing
-------
A multiplier on top of our [base usage pricing](../../pricing.html) will be applied to any function that has a cloud region defined.
| **Region** | **Multiplier** |
| --- | --- |
| Any region in US/EU/AP | 1.25x |
| All other regions | 2.5x |
Here’s an example: let’s say you have a function that uses 1 T4, 1 CPU core, and 1GB memory. You’ve specified that the function should run in `us-east-2`. The cost to run this function for 1 hour would be `((T4 hourly cost) + (CPU hourly cost for one core) + (Memory hourly cost for one GB)) * 1.25`.
If you specify multiple regions and they span the two categories above, we will apply the smaller of the two multipliers.
Specifying a region
-------------------
To run your Modal Function in a specific region, pass a `region=` argument to the `function` decorator.
```
import os

import modal

app = modal.App("...")

@app.function(region="us-east")  # also supports a list of options, for example region=["us-central", "us-east"]
def f():
    print(f"running in {os.environ['MODAL_REGION']}")  # us-east-1, us-east-2, us-ashburn-1, etc.
```
You can specify a region in addition to the underlying cloud. For example, `@app.function(cloud="aws", region="us-east")` would run your Function only in `"us-east-1"` or `"us-east-2"`.
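For instance, a minimal sketch combining both constraints (the App name is hypothetical):
```
import os

import modal

app = modal.App("example-region-and-cloud")  # hypothetical App name

# Only run on AWS capacity within the us-east region group.
@app.function(cloud="aws", region="us-east")
def f():
    print(f"running in {os.environ['MODAL_REGION']}")  # "us-east-1" or "us-east-2"
```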
Region options
--------------
Modal offers varying levels of granularity for regions. Use broader regions when possible, as this increases the pool of available resources your Function can be assigned to, which improves cold-start time and availability.
### United States (“us”)
Use `region="us"` to select any region in the United States.
```
Broad             Specific            Description
==============================================================
"us-east"         "us-east-1"         AWS Virginia
                  "us-east-2"         AWS Ohio
                  "us-east1"          GCP South Carolina
                  "us-east4"          GCP Virginia
                  "us-east5"          GCP Ohio
                  "us-ashburn-1"      OCI Virginia
--------------------------------------------------------------
"us-central"      "us-central1"       GCP Iowa
                  "us-chicago-1"      OCI Chicago
                  "us-phoenix-1"      OCI Phoenix
--------------------------------------------------------------
"us-west"         "us-west-1"         AWS California
                  "us-west-2"         AWS Oregon
                  "us-west1"          GCP Oregon
                  "us-west3"          GCP Utah
                  "us-west4"          GCP Nevada
                  "us-sanjose-1"      OCI San Jose
```
### Europe (“eu”)
Use `region="eu"` to select any region in Europe.
```
Broad             Specific            Description
==============================================================
"eu-west"         "eu-central-1"      AWS Frankfurt
                  "eu-west-1"         AWS Ireland
                  "eu-west-3"         AWS Paris
                  "europe-west1"      GCP Belgium
                  "europe-west3"      GCP Frankfurt
                  "europe-west4"      GCP Netherlands
                  "eu-frankfurt-1"    OCI Frankfurt
                  "eu-paris-1"        OCI Paris
--------------------------------------------------------------
"eu-north"        "eu-north-1"        AWS Stockholm
```
### Asia–Pacific (“ap”)
Use `region="ap"` to select any region in Asia–Pacific.
```
Broad             Specific            Description
==============================================================
"ap-northeast"    "asia-northeast3"   GCP Seoul
                  "asia-northeast1"   GCP Tokyo
                  "ap-northeast-1"    AWS Tokyo
                  "ap-northeast-3"    AWS Osaka
--------------------------------------------------------------
"ap-southeast"    "asia-southeast1"   GCP Singapore
                  "ap-southeast-3"    AWS Jakarta
--------------------------------------------------------------
"ap-south"        "ap-south-1"        AWS Mumbai
```
### Other regions
```
Broad             Specific            Description
==============================================================
"ca"              "ca-central-1"      AWS Montreal
                  "ca-toronto-1"      OCI Toronto
--------------------------------------------------------------
"uk"              "uk-london-1"       OCI London
                  "europe-west2"      GCP London
                  "eu-west-2"         AWS London
--------------------------------------------------------------
"jp"              "ap-northeast-1"    AWS Tokyo
                  "ap-northeast-3"    AWS Osaka
                  "asia-northeast1"   GCP Tokyo
--------------------------------------------------------------
"me"              "me-west1"          GCP Tel Aviv
--------------------------------------------------------------
"sa"              "sa-east-1"         AWS São Paulo
```
Region selection and GPU availability
-------------------------------------
Region selection limits the pool of instances we can run your Functions on. As a result, you may observe higher wait times between when your Function is called and when it gets executed. Generally, we have higher availability in US/EU versus other regions. Whenever possible, select the broadest possible regions so you get the best resource availability.
Streaming endpoints
===================
Modal web endpoints support streaming responses using FastAPI’s [`StreamingResponse`](https://fastapi.tiangolo.com/advanced/custom-response/#streamingresponse) class. This class accepts asynchronous generators, synchronous generators, or
any Python object that implements the [*iterator protocol*](https://docs.python.org/3/library/stdtypes.html#typeiter),
and can be used with Modal Functions!
Simple example
--------------
This simple example combines Modal’s `@modal.fastapi_endpoint` decorator with a `StreamingResponse` object to produce a real-time SSE response.
```
import time

import modal

app = modal.App("example-streaming")

def fake_event_streamer():
    for i in range(10):
        yield f"data: some data {i}\n\n".encode()
        time.sleep(0.5)

@app.function(image=modal.Image.debian_slim().pip_install("fastapi[standard]"))
@modal.fastapi_endpoint()
def stream_me():
    from fastapi.responses import StreamingResponse
    return StreamingResponse(
        fake_event_streamer(), media_type="text/event-stream"
    )
```
If you serve this web endpoint and hit it with `curl`, you will see the ten SSE
events progressively appear in your terminal over a ~5 second period.
```
curl --no-buffer https://modal-labs--example-streaming-stream-me.modal.run
```
The MIME type of `text/event-stream` is important in this example, as it tells
the downstream web server to return responses immediately, rather than buffering
them in byte chunks (which is more efficient for compression).
You can still return other content types like large files in streams, but they
are not guaranteed to arrive as real-time events.
Streaming responses with `.remote`
----------------------------------
A Modal Function wrapping a generator function body can have its response passed
directly into a `StreamingResponse`. This is particularly useful if you want to
do some GPU processing in one Modal Function that is called by a CPU-based web
endpoint Modal Function.
```
@app.function(gpu="any")
def fake_video_render():
for i in range(10):
yield f"data: finished processing some data from GPU {i}\n\n".encode()
time.sleep(1)
@app.function(image=modal.Image.debian_slim().pip_install("fastapi[standard]"))
@modal.fastapi_endpoint()
def hook():
from fastapi.responses import StreamingResponse
return StreamingResponse(
fake_video_render.remote_gen(), media_type="text/event-stream"
)
```
Streaming responses with `.map` and `.starmap`
----------------------------------------------
You can also combine Modal Function parallelization with streaming responses,
enabling applications to service a request by farming out to dozens of
containers and iteratively returning result chunks to the client.
```
@app.function()
def map_me(i):
    return f"segment {i}\n"

@app.function(image=modal.Image.debian_slim().pip_install("fastapi[standard]"))
@modal.fastapi_endpoint()
def mapped():
    from fastapi.responses import StreamingResponse
    return StreamingResponse(map_me.map(range(10)), media_type="text/plain")
```
This snippet will spread the ten `map_me(i)` executions across containers, and
return each string response part as it completes. By default the results will be
ordered, but if this isn’t necessary pass `order_outputs=False` as keyword
argument to the `.map` call.
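For instance, a minimal sketch of the same endpoint with unordered output, reusing `map_me` from above (the function name `mapped_unordered` is hypothetical):
```
@app.function(image=modal.Image.debian_slim().pip_install("fastapi[standard]"))
@modal.fastapi_endpoint()
def mapped_unordered():
    from fastapi.responses import StreamingResponse
    # Chunks are streamed back as soon as each container finishes,
    # not in input order.
    return StreamingResponse(
        map_me.map(range(10), order_outputs=False), media_type="text/plain"
    )
```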
### Asynchronous streaming
The example above uses a synchronous generator, which automatically runs on its
own thread. In asynchronous applications, however, a loop over a `.map` or `.starmap` call can block the event loop, which stops the `StreamingResponse` from
returning response parts to the client iteratively.
To avoid this, you can use the `.aio()` method to convert a synchronous `.map` into its async version. Also, other blocking calls should be offloaded to a
separate thread with `asyncio.to_thread()`. For example:
```
@app.function(gpu="any", image=modal.Image.debian_slim().pip_install("fastapi[standard]"))
@modal.fastapi_endpoint()
async def transcribe_video(request):
from fastapi.responses import StreamingResponse
segments = await asyncio.to_thread(split_video, request)
return StreamingResponse(wrapper(segments), media_type="text/event-stream")
# Notice that this is an async generator.
async def wrapper(segments):
async for partial_result in transcribe_video.map.aio(segments):
yield "data: " + partial_result + "\n\n"
```
Further examples
----------------
* Complete code for the simple examples given above is available [in our modal-examples Github repository](https://github.com/modal-labs/modal-examples/blob/main/07_web_endpoints/streaming.py).
* [An end-to-end example of streaming Youtube video transcriptions with OpenAI’s whisper model.](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/openai_whisper/streaming/main.py)
Apps, Functions, and entrypoints
================================
An `App` is the object that represents an application running on Modal.
All functions and classes are associated with an [`App`](../reference/modal.App.html#modalapp).
When you [`run`](../reference/cli/run.html) or [`deploy`](../reference/cli/deploy.html) an `App`, it creates an ephemeral or a
deployed `App`, respectively.
You can view a list of all currently running Apps on the [`apps`](../../login%EF%B9%96next=%EA%A4%B7apps.html) page.
Ephemeral Apps
--------------
An ephemeral App is created when you use the [`modal run`](../reference/cli/run.html) CLI command, or the [`app.run`](../reference/modal.App.html#run) method. This creates a temporary
App that only exists for the duration of your script.
Ephemeral Apps are stopped automatically when the calling program exits, or when
the server detects that the client is no longer connected.
You can use [`--detach`](../reference/cli/run.html) in order to keep an ephemeral App running even
after the client exits.
By using `app.run` you can run your Modal apps from within your Python scripts:
```
def main():
    ...
    with app.run():
        some_modal_function.remote()
```
By default, running your app in this way won’t propagate Modal logs and progress bar messages. To enable output, use the [`modal.enable_output`](../reference/modal.enable_output.html) context manager:
```
def main():
    ...
    with modal.enable_output():
        with app.run():
            some_modal_function.remote()
```
Deployed Apps
-------------
A deployed App is created using the [`modal deploy`](../reference/cli/deploy.html) CLI command. The App is persisted indefinitely until you delete it via the [web UI](../../login%EF%B9%96next=%EA%A4%B7apps.html). Functions in a deployed App that have an attached [schedule](cron.html) will be run on a schedule. Otherwise, you can
invoke them manually using [web endpoints or Python](trigger-deployed-functions.html).
Deployed Apps are named via the [`App`](../reference/modal.App.html#modalapp) constructor. Re-deploying an existing `App` (based on the name) will update it
in place.
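For example, a minimal sketch of a named, deployable App (the App name, file name, and schedule are hypothetical):
```
# deploy_me.py
import modal

app = modal.App("my-deployed-app")  # hypothetical name; redeploying under this name updates the App in place

@app.function(schedule=modal.Period(days=1))
def refresh():
    print("Runs on a daily schedule once deployed.")
```
Deploying it with `modal deploy deploy_me.py` creates the App if it doesn’t exist, or updates it if it does.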
Entrypoints for ephemeral Apps
------------------------------
The code that runs first when you `modal run` an App is called the “entrypoint”.
You can register a local entrypoint using the [`@app.local_entrypoint()`](../reference/modal.App.html#local_entrypoint) decorator. You can also use a regular Modal function as an entrypoint, in which
case only the code in global scope is executed locally.
### Argument parsing
If your entrypoint function takes arguments with primitive types, `modal run` automatically parses them as CLI options. For example, the following function
can be called with `modal run script.py --foo 1 --bar "hello"`:
```
# script.py
@app.local_entrypoint()
def main(foo: int, bar: str):
    some_modal_function.remote(foo, bar)
```
If you wish to use your own argument parsing library, such as `argparse`, you can instead accept a variable-length argument list for your entrypoint or your function. In this case, Modal skips CLI parsing and forwards CLI arguments as a tuple of strings. For example, the following function can be invoked with `modal run my_file.py --foo=42 --bar="baz"`:
```
import argparse

@app.function()
def train(*arglist):
    parser = argparse.ArgumentParser()
    parser.add_argument("--foo", type=int)
    parser.add_argument("--bar", type=str)
    args = parser.parse_args(args=arglist)
```
### Manually specifying an entrypoint
If there is only one `local_entrypoint` registered, [`modal run script.py`](../reference/cli/run.html) will automatically use it. If
you have no entrypoint specified, and just one decorated Modal function, that
will be used as a remote entrypoint instead. Otherwise, you can direct `modal run` to use a specific entrypoint.
For example, if you have a function decorated with [`@app.function()`](../reference/modal.App.html#function) in your file:
```
# script.py
@app.function()
def f():
    print("Hello world!")

@app.function()
def g():
    print("Goodbye world!")

@app.local_entrypoint()
def main():
    f.remote()
```
Running [`modal run script.py`](../reference/cli/run.html) will execute the `main` function locally, which in turn calls the `f` function remotely. However, you can
instead run `modal run script.py::app.f` or `modal run script.py::app.g` to
execute `f` or `g` directly.
Apps were once Stubs
--------------------
The `modal.App` class in the client was previously called `modal.Stub`. The
old name was kept as an alias for some time, but from Modal 1.0.0 onwards,
using `modal.Stub` will result in an error.
Request timeouts
================
Web endpoint (a.k.a. webhook) requests should complete quickly, ideally within a
few seconds. All web endpoint function types
([`web_endpoint`, `asgi_app`, `wsgi_app`](../reference/modal.web_endpoint.html))
have a maximum HTTP request timeout of 150 seconds enforced. However, the
underlying Modal function can have a longer [timeout](timeouts.html).
If the function takes more than 150 seconds to complete, an HTTP status 303
redirect response is returned, pointing at the original URL with a special query
parameter that links it to that request. This is the *result URL* for your function.
Most web browsers allow for up to 20 such redirects, effectively allowing up to
50 minutes (20 \* 150 s) for web endpoints before the request times out.
(**Note:** This does not work with requests that require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS), since the
response will not have been returned from your code in time for the server to
populate CORS headers.)
Some libraries and tools might require you to add a flag or option in order to
follow redirects automatically, e.g. `curl -L ...` or `http --follow ...`.
The *result URL* can be reloaded without triggering a new request. It will block
until the request completes.
(**Note:** As of March 2025, the Python standard library’s `urllib` module has the
maximum number of redirects to any single URL set to 4 by default ([source](https://github.com/python/cpython/blob/main/Lib/urllib/request.py)), which would limit the total timeout to 12.5 minutes (5 \* 150 s = 750 s) unless this setting is overridden.)
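As an illustrative client-side sketch (the endpoint URL is hypothetical; this assumes the `requests` library, which follows up to 30 redirects by default):
```
import requests

url = "https://my-workspace--my-endpoint.modal.run"  # hypothetical endpoint URL

# requests follows the 303 redirects to the result URL automatically, so the
# call blocks until the underlying Modal Function finishes or the redirect
# limit is reached.
response = requests.get(url, timeout=60 * 10)
print(response.status_code, response.text)
```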
Polling solutions
-----------------
Sometimes it can be useful to be able to poll for results rather than wait for a
long running HTTP request. The easiest way to do this is to have your web
endpoint spawn a `modal.Function` call and return the function call id that
another endpoint can use to poll the submitted function’s status. Here is an
example:
```
import fastapi
import modal

image = modal.Image.debian_slim().pip_install("fastapi[standard]")
app = modal.App(image=image)

web_app = fastapi.FastAPI()

@app.function()
@modal.asgi_app()
def fastapi_app():
    return web_app

@app.function()
def slow_operation():
    ...

@web_app.post("/accept")
async def accept_job(request: fastapi.Request):
    call = slow_operation.spawn()
    return {"call_id": call.object_id}

@web_app.get("/result/{call_id}")
async def poll_results(call_id: str):
    function_call = modal.FunctionCall.from_id(call_id)
    try:
        return function_call.get(timeout=0)
    except TimeoutError:
        http_accepted_code = 202
        return fastapi.responses.JSONResponse({}, status_code=http_accepted_code)
```
[*Document OCR Web App*](../examples/doc_ocr_webapp.html) is an example that uses
this pattern.
Tunnels (beta)
==============
Modal allows you to expose live TCP ports on a Modal container. This is done by
creating a *tunnel* that forwards the port to the public Internet.
```
import modal

app = modal.App()

@app.function()
def start_app():
    # Inside this `with` block, port 8000 on the container can be accessed by
    # the address at `tunnel.url`, which is randomly assigned.
    with modal.forward(8000) as tunnel:
        print(f"tunnel.url = {tunnel.url}")
        print(f"tunnel.tls_socket = {tunnel.tls_socket}")

        # ... start some web server at port 8000, using any framework
```
Tunnels are direct connections and terminate TLS automatically. Within a few
milliseconds of container startup, this function prints a message such as:
```
tunnel.url = https://wtqcahqwhd4tu0.r5.modal.host
tunnel.tls_socket = ('wtqcahqwhd4tu0.r5.modal.host', 443)
```
You can also create tunnels on a [Sandbox](sandbox-networking.html#forwarding-ports) to directly expose the container’s ports.
Build with tunnels
------------------
Tunnels are the fastest way to get a low-latency, direct connection to a running
container. You can use them to run live browser applications with **interactive
terminals**, **Jupyter notebooks**, **VS Code servers**, and more.
As a quick example, here is how you would expose a Jupyter notebook:
```
import os
import secrets
import subprocess

import modal

app = modal.App()
app.image = modal.Image.debian_slim().pip_install("jupyterlab")

@app.function()
def run_jupyter():
    token = secrets.token_urlsafe(13)
    with modal.forward(8888) as tunnel:
        url = tunnel.url + "/?token=" + token
        print(f"Starting Jupyter at {url}")
        subprocess.run(
            [
                "jupyter",
                "lab",
                "--no-browser",
                "--allow-root",
                "--ip=0.0.0.0",
                "--port=8888",
                "--LabApp.allow_origin='*'",
                "--LabApp.allow_remote_access=1",
            ],
            env={**os.environ, "JUPYTER_TOKEN": token, "SHELL": "/bin/bash"},
            stderr=subprocess.DEVNULL,
        )
```
When you run the function, it starts Jupyter and gives you the public URL. It’s
as simple as that.
All Modal features are supported. If you [need GPUs](gpu.html), pass `gpu=` to the `@app.function()` decorator. If you [need more CPUs, RAM](resources.html), or to attach [volumes](volumes.html), those
also just work.
### Programmable startup
The tunnel API is completely on-demand, so you can start tunnels as the result of a
web request.
For example, you could make something like Jupyter Hub without leaving Modal,
giving your users their own Jupyter notebooks when they visit a URL:
```
import modal

image = modal.Image.debian_slim().pip_install("fastapi[standard]")
app = modal.App(image=image)

@app.function(timeout=900)  # 15 minutes
def run_jupyter(q):
    ...  # as before, but put the tunnel URL on the queue `q`

@app.function()
@modal.fastapi_endpoint(method="POST")
def jupyter_hub():
    from fastapi import HTTPException
    from fastapi.responses import RedirectResponse

    ...  # do some validation on the secret or bearer token
    if is_valid:
        with modal.Queue.ephemeral() as q:
            run_jupyter.spawn(q)
            url = q.get()
            return RedirectResponse(url, status_code=303)
    else:
        raise HTTPException(401, "Not authenticated")
```
This gives every user who sends a POST request to the web endpoint their own
Jupyter notebook server, on a fully isolated Modal container.
You could do the same with VS Code and get some basic version of an instant,
serverless IDE!
### Advanced: Unencrypted TCP tunnels
By default, tunnels are only exposed to the Internet at a secure random URL, and
connections have automatic TLS (the “S” in HTTPS). However, sometimes you might
need to expose a protocol like an SSH server that goes directly over TCP. In
this case, we have support for *unencrypted* tunnels:
```
with modal.forward(8000, unencrypted=True) as tunnel:
    print(f"tunnel.tcp_socket = {tunnel.tcp_socket}")
```
This might produce output like:
```
tunnel.tcp_socket = ('r3.modal.host', 23447)
```
You can then connect over TCP, for example with `nc r3.modal.host 23447`. Unlike
encrypted TLS sockets, these cannot be given a non-guessable, cryptographically
random URL due to how the TCP protocol works, so they are assigned a random port
number instead.
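As a client-side sketch using only the standard library, with the host and port copied from the example output above:
```
import socket

# Host and port as printed by the Function, e.g. ('r3.modal.host', 23447).
host, port = "r3.modal.host", 23447

with socket.create_connection((host, port)) as sock:
    sock.sendall(b"hello\n")
    print(sock.recv(1024))
```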
Pricing
-------
Modal only charges for containers based on [the resources you use](../../pricing.html). There is no additional
charge for having an active tunnel.
For example, if you start a Jupyter notebook on port 8888 and access it via
tunnel, you can use it for an hour for development (with 0.01 CPUs) and then
actually run an intensive job with 16 CPUs for one minute. The amount you would
be billed for in that hour is 0.01 + 16 \* (1/60) = **0.28 CPUs**, even though
you had access to 16 CPUs without needing to restart your notebook.
Security
--------
Tunnels are run on Modal’s private global network of Internet relays. On
startup, your container connects to the nearest relay so you get the minimum
latency, with performance very similar to a direct connection to the machine.
This makes them ideal for live debugging sessions, using web-based terminals
like [ttyd](https://github.com/tsl0922/ttyd).
The generated URLs are cryptographically random, but they are also public on the
Internet, so anyone can access your application if they are given the URL.
We do not currently do any detection of requests above L4, so if you are running
a web server, we will not add special proxy HTTP headers or translate HTTP/2.
You’re just getting the TLS-encrypted TCP stream directly!
Global variables
================
There are cases where you might want objects or data available in **global** scope. For example:
* You need to use the data in a scheduled function (scheduled functions don’t
accept arguments)
* You need to construct objects (e.g. Secrets) in global scope to use as
function annotations
* You don’t want to clutter many function signatures with some common arguments
they all use, and pass the same arguments through many layers of function
calls.
For these cases, you can use the `modal.is_local` function, which returns `True` if the app is running locally (initializing) or `False` if the app is executing
in the cloud.
For instance, to create a [`modal.Secret`](secrets.html) that you can pass
to your function decorators to create environment variables, you can run:
```
import os

if modal.is_local():
    pg_password = modal.Secret.from_dict({"PGPASS": os.environ["MY_LOCAL_PASSWORD"]})
else:
    pg_password = modal.Secret.from_dict({})

@app.function(secrets=[pg_password])
def get_secret_data():
    connection = psycopg2.connect(password=os.environ["PGPASS"])
    ...
```
Warning about regular module globals
------------------------------------
If you try to construct a global in module scope using some local data *without* using something like `modal.is_local`, it might have unexpected effects, since
your Python modules will not only be loaded on your local machine, but also
on the remote worker.
E.g., this will typically not work:
```
# blob.json doesn't exist on the remote worker, so this will cause an error there
data_blob = open("blob.json", "r").read()

@app.function()
def foo():
    print(data_blob)
```
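One way to make this pattern work, sketched under the assumption that `blob.json` only exists on your local machine, is to guard the read with `modal.is_local` and pass the data to the Function explicitly (the App name is hypothetical):
```
import modal

app = modal.App("example-globals")  # hypothetical App name

if modal.is_local():
    data_blob = open("blob.json", "r").read()
else:
    data_blob = None  # the file doesn't exist on the remote worker

@app.function()
def foo(blob: str):
    print(blob)

@app.local_entrypoint()
def main():
    foo.remote(data_blob)
```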
Slack notifications (beta)
==========================
You can integrate your Modal Workspace with Slack to receive timely essential notifications.
Prerequisites
-------------
* You are a [Workspace Manager](workspaces.html#administrating-workspace-members) in the Modal Workspace you’re installing the Slack integration in.
* You have permissions to install apps in your Slack workspace.
Supported notifications
-----------------------
* Alerts for failed scheduled function runs.
* Alerts when any of your apps have client versions that are out of date.
* Alerts when you hit your GPU resource limits.
Configuration
-------------
### Step 1: Install the Slack integration
Visit the *Slack Integration* section on your [settings](../../login%EF%B9%96next=%EA%A4%B7settings.html) page in your Modal Workspace and click the **Add to Slack** button.
### Step 2: Add the Modal app to your Slack channel
Navigate to the Slack channel you want to add the Modal app to and click on the channel header. On the integrations tab you can add the Modal app.
![Add Modal app to Slack channel](../../_app/immutable/assets/slack-add-modal-app.Cy4hnVNV.jpg)
### Step 3: Use `/modal link` to link the Slack channel to your Modal Workspace
You’ll be prompted to select the Workspace you want to link to the Slack channel. You can always unlink the Slack channel by visiting the *Slack Integration* section on your [settings](../../login%EF%B9%96next=%EA%A4%B7settings.html) page in your Modal Workspace.
Preemption
==========
All Modal Functions are subject to preemption. If a preemption event interrupts
a running Function, Modal will gracefully terminate the Function and restart it
on the same input.
Preemptions are rare, but it is always possible that your Function is
interrupted. Long-running Functions such as model training Functions should take
particular care to tolerate interruptions, as the likelihood of interruption increases
with Function run duration.
Preparing for interruptions
---------------------------
Design your applications to be fault and preemption tolerant. Modal will send an
interrupt signal to your container when preemption occurs. This will cause the
Function’s [exit handler](lifecycle-functions.html#exit) to run, which
can perform any cleanup within its grace period.
Other best practices for handling preemptions include:
* Divide long-running operations into small tasks or use checkpoints so that you
can save your work frequently.
* Ensure preemptible operations are safely retryable (i.e. idempotent).
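As a minimal sketch of these practices (the App, class, and Volume names are hypothetical), checkpoints are written to a Volume and flushed from the exit handler:
```
import modal

app = modal.App("example-preemptible-training")  # hypothetical name
volume = modal.Volume.from_name("training-checkpoints", create_if_missing=True)

@app.cls(volumes={"/ckpt": volume}, timeout=4 * 60 * 60)
class Trainer:
    @modal.method()
    def train(self):
        # ... run training, periodically writing checkpoint files under /ckpt
        # so a restarted container can resume from the last saved state.
        ...

    @modal.exit()
    def on_shutdown(self):
        # Runs within the grace period after Modal sends the interrupt signal.
        volume.commit()  # flush any checkpoints written since the last commit
```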
Running uninterruptible Functions
---------------------------------
We currently don’t have a way for Functions to avoid the possibility of
interruption, but it’s a planned feature. If you require Functions guaranteed to
run without interruption, please reach out!
Batch Processing
================
Modal is optimized for large-scale batch processing, allowing functions to scale to thousands of parallel containers with zero additional configuration. Function calls can be submitted asynchronously for background execution, eliminating the need to wait for jobs to finish or tune resource allocation.
This guide covers Modal’s batch processing capabilities, from basic invocation to integration with existing pipelines.
Background Execution with `.spawn_map`
--------------------------------------
The fastest way to submit multiple jobs for asynchronous processing is by invoking a function with `.spawn_map`. When combined with the [`--detach`](../reference/cli/run.html) flag, your App continues running until all jobs are completed.
Here’s an example of submitting 100,000 videos for parallel embedding. You can disconnect after submission, and the processing will continue to completion in the background:
```
# Kick off asynchronous jobs with `modal run --detach batch_processing.py`
import modal

app = modal.App("batch-processing-example")
volume = modal.Volume.from_name("video-embeddings", create_if_missing=True)

@app.function(volumes={"/data": volume})
def embed_video(video_id: int):
    # Business logic:
    # - Load the video from the volume
    # - Embed the video
    # - Save the embedding to the volume
    ...

@app.local_entrypoint()
def main():
    embed_video.spawn_map(range(100_000))
```
This pattern works best for jobs that store results externally—for example, in a [Modal Volume](volumes.html), [Cloud Bucket Mount](cloud-bucket-mounts.html), or your own database\*.
*\* For database connections, consider using [Modal Proxy](proxy-ips.html) to maintain a static IP across thousands of containers.*
Parallel Processing with `.map`
-------------------------------
Using `.map` allows you to offload expensive computations to powerful machines while gathering results. This is particularly useful for pipeline steps with bursty resource demands. Modal handles all infrastructure provisioning and de-provisioning automatically.
Here’s how to implement parallel video similarity queries as a single Modal function call:
```
# Run jobs and collect results with `modal run gather.py`
import modal

app = modal.App("gather-results-example")

@app.function(gpu="L40S")
def compute_video_similarity(query: str, video_id: int) -> tuple[int, int]:
    # Embed video with GPU acceleration & compute similarity with query
    return video_id, score

@app.local_entrypoint()
def main():
    import itertools

    queries = itertools.repeat("Modal for batch processing")
    video_ids = range(100_000)
    for video_id, score in compute_video_similarity.map(queries, video_ids):
        # Process results (e.g., extract top 5 most similar videos)
        pass
```
This example runs `compute_video_similarity` on an autoscaling pool of L40S GPUs, returning scores to a local process for further processing.
Integration with Existing Systems
---------------------------------
The recommended way to use Modal Functions within your existing data pipeline is through [deployed function invocation](trigger-deployed-functions.html). After deployment, you can call Modal functions from external systems:
```
def external_function(inputs):
    compute_similarity = modal.Function.from_name(
        "gather-results-example",
        "compute_video_similarity",
    )
    for result in compute_similarity.map(inputs):
        # Process results
        pass
```
You can invoke Modal Functions from any Python context, gaining access to built-in observability, resource management, and GPU acceleration.
S3 Gateway endpoints
====================
When running workloads in AWS, our system automatically uses a corresponding [S3 Gateway endpoint](https://docs.aws.amazon.com/vpc/latest/privatelink/vpc-endpoints-s3.html) to ensure low costs, optimal performance, and network reliability between Modal and S3.
Workloads running on Modal should not incur egress or ingress fees associated
with S3 operations. No configuration is needed in order for your app to use S3 Gateway endpoints.
S3 Gateway endpoints are automatically used when your app runs on AWS.
Endpoint configuration
----------------------
Only use the region-specific endpoint (`s3.<region>.amazonaws.com`) or the
global AWS endpoint (`s3.amazonaws.com`). Using an S3 endpoint from one region
while running in another **will not use the S3 Gateway endpoint and will incur networking costs**.
Avoid specifying regional endpoints manually, as this can lead to unexpected cost
or performance degradation.
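As an illustrative sketch (assuming `boto3` is installed in your image; the bucket and key are hypothetical), rely on the default endpoint rather than passing a custom `endpoint_url`:
```
import boto3

# No custom endpoint_url: the default S3 endpoint is used, so traffic from
# Modal Functions running on AWS can flow through the S3 Gateway endpoint.
s3 = boto3.client("s3")
s3.download_file("my-bucket", "data/input.csv", "/tmp/input.csv")  # hypothetical bucket and key
```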
Inter-region costs
------------------
S3 Gateway endpoints guarantee no costs for network traffic within the same AWS region.
However, if your Modal Function runs in one region but your bucket resides in a
different region, you will be billed for inter-region traffic.
You can prevent this by scheduling your Modal App in the same region as your
S3 bucket with [Region selection](region-selection.html#region-selection).
Volumes
=======
Modal Volumes provide a high-performance distributed file system for your Modal applications.
They are designed for write-once, read-many I/O workloads, like creating machine learning model
weights and distributing them for inference.
Creating a Volume
-----------------
The easiest way to create a Volume and use it as a part of your app is to use
the `modal volume create` CLI command. This will create the Volume and output
some sample code:
```
% modal volume create my-volume
Created volume 'my-volume' in environment 'main'.
```
Using a Volume on Modal
-----------------------
To attach an existing Volume to a Modal Function, use `Volume.from_name`:
```
vol = modal.Volume.from_name("my-volume")
@app.function(volumes={"/data": vol})
def run():
with open("/data/xyz.txt", "w") as f:
f.write("hello")
vol.commit() # Needed to make sure all changes are persisted before exit
```
You can also browse and manipulate Volumes from an ad hoc Modal Shell:
```
% modal shell --volume my-volume --volume another-volume
```
Volumes will be mounted under `/mnt`.
### Creating Volumes lazily from code
You can also create Volumes lazily from code using:
```
vol = modal.Volume.from_name("my-volume", create_if_missing=True)
```
This will create the Volume if it doesn’t exist.
Using a Volume from outside of Modal
------------------------------------
Volumes can also be used outside Modal via the Python SDK or our CLI.
### Using a Volume from local code
You can interact with Volumes from anywhere you like using the `modal` Python client library.
```
import io

import modal

vol = modal.Volume.from_name("my-volume")

with vol.batch_upload() as batch:
    batch.put_file("local-path.txt", "/remote-path.txt")
    batch.put_directory("/local/directory/", "/remote/directory")
    batch.put_file(io.BytesIO(b"some data"), "/foobar")
```
For more details, see the [reference documentation](../reference/modal.Volume.html).
### Using a Volume via the command line
You can also interact with Volumes using the command line interface. You can run `modal volume` to get a full list of its subcommands:
```
% modal volume
Usage: modal volume [OPTIONS] COMMAND [ARGS]...
Read and edit modal.Volume volumes.
Note: users of modal.NetworkFileSystem should use the modal nfs command instead.
╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --help Show this message and exit. │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ File operations ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ cp Copy within a modal.Volume. Copy source file to destination file or multiple source files to destination directory. │
│ get Download files from a modal.Volume object. │
│ ls List files and directories in a modal.Volume volume. │
│ put Upload a file or directory to a modal.Volume. │
│ rm Delete a file or directory from a modal.Volume. │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Management ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ create Create a named, persistent modal.Volume. │
│ delete Delete a named, persistent modal.Volume. │
│ list List the details of all modal.Volume volumes in an Environment. │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
For more details, see the [reference documentation](../reference/cli/volume.html).
Volume commits and reloads
--------------------------
Unlike a normal filesystem, you need to explicitly reload the Volume to see
changes made since it was first mounted. This reload is handled by invoking the [`.reload()`](../reference/modal.Volume.html#reload) method on a Volume object.
Similarly, any Volume changes made within a container need to be committed for
those changes to become visible outside the current container. This is handled
periodically by [background commits](#background-commits) and directly by invoking
the [`.commit()`](../reference/modal.Volume.html#commit) method on a `modal.Volume` object.
At container creation time the latest state of an attached Volume is mounted. If
the Volume is then subsequently modified by a commit operation in another
running container, that Volume modification won’t become available until the
original container does a [`.reload()`](../reference/modal.Volume.html#reload).
Consider this example which demonstrates the effect of a reload:
```
import pathlib

import modal

app = modal.App()
volume = modal.Volume.from_name("my-volume")

p = pathlib.Path("/root/foo/bar.txt")

@app.function(volumes={"/root/foo": volume})
def f():
    p.write_text("hello")
    print(f"Created {p=}")
    volume.commit()  # Persist changes
    print(f"Committed {p=}")

@app.function(volumes={"/root/foo": volume})
def g(reload: bool = False):
    if reload:
        volume.reload()  # Fetch latest changes
    if p.exists():
        print(f"{p=} contains '{p.read_text()}'")
    else:
        print(f"{p=} does not exist!")

@app.local_entrypoint()
def main():
    g.remote()  # 1. container for `g` starts
    f.remote()  # 2. container for `f` starts, commits file
    g.remote(reload=False)  # 3. reuses container for `g`, no reload
    g.remote(reload=True)  # 4. reuses container, but reloads to see file.
```
The output for this example is this:
```
p=PosixPath('/root/foo/bar.txt') does not exist!
Created p=PosixPath('/root/foo/bar.txt')
Committed p=PosixPath('/root/foo/bar.txt')
p=PosixPath('/root/foo/bar.txt') does not exist!
p=PosixPath('/root/foo/bar.txt') contains 'hello'
```
This code runs two containers, one for `f` and one for `g`. Only the last
function invocation reads the file created and committed by `f` because it was
configured to reload.
### Background commits
Modal Volumes run background commits:
every few seconds while your Function executes,
the contents of attached Volumes will be committed
without your application code calling `.commit`.
A final snapshot and commit is also automatically performed on container shutdown.
Being able to persist changes to Volumes without changing your application code
is especially useful when [training or fine-tuning models using frameworks](#model-checkpointing).
Model serving
-------------
A single ML model can be served by simply baking it into a `modal.Image` at
build time using [`run_function`](../reference/modal.Image.html#run_function). But
if you have dozens of models to serve, or otherwise need to decouple image
builds from model storage and serving, use a `modal.Volume`.
Volumes can be used to save a large number of ML models and later serve any one
of them at runtime with much better performance than can be achieved with a [`modal.NetworkFileSystem`](../reference/modal.NetworkFileSystem.html).
This snippet below shows the basic structure of the solution.
```
import modal

app = modal.App()
volume = modal.Volume.from_name("model-store")
model_store_path = "/vol/models"

@app.function(volumes={model_store_path: volume}, gpu="any")
def run_training():
    model = train(...)
    save(model_store_path, model)
    volume.commit()  # Persist changes

@app.function(volumes={model_store_path: volume})
def inference(model_id: str, request):
    try:
        model = load_model(model_store_path, model_id)
    except NotFound:
        volume.reload()  # Fetch latest changes
        model = load_model(model_store_path, model_id)
    return model.run(request)
```
For more details, see our [guide to storing model weights on Modal](model-weights.html).
Model checkpointing
-------------------
Checkpoints are snapshots of an ML model and can be configured by the callback
functions of ML frameworks. You can use saved checkpoints to restart a training
job from the last saved checkpoint. This is particularly helpful in managing [preemption](preemption.html).
For more, see our [example code for long-running training](../examples/long-training.html).
### Hugging Face `transformers`
To periodically checkpoint into a `modal.Volume`, just set the `Trainer`’s [`output_dir`](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.output_dir) to a directory in the Volume.
```
import pathlib

volume = modal.Volume.from_name("my-volume")
VOL_MOUNT_PATH = pathlib.Path("/vol")

@app.function(
    gpu="A10G",
    timeout=2 * 60 * 60,  # run for at most two hours
    volumes={VOL_MOUNT_PATH: volume},
)
def finetune():
    from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

    ...

    training_args = Seq2SeqTrainingArguments(
        output_dir=str(VOL_MOUNT_PATH / "model"),
        # ... more args here
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_xsum_train,
        eval_dataset=tokenized_xsum_test,
    )
```
Volumes versus Network File Systems
-----------------------------------
Like the [`modal.NetworkFileSystem`](../reference/modal.NetworkFileSystem.html),
Volumes can be simultaneously attached to multiple Modal Functions, supporting
concurrent reading and writing. But unlike the `modal.NetworkFileSystem`, the `modal.Volume` has been designed for fast reads and does not automatically
synchronize writes between mounted Volumes.
Volume performance
------------------
Volumes work best when they contain less than 50,000 files and directories. The
latency to attach or modify a Volume scales linearly with the number of files in
the Volume, and past a few tens of thousands of files the linear component
starts to dominate the fixed overhead.
There is currently a hard limit of 500,000 inodes (files, directories and
symbolic links) per Volume. If you reach this limit, any further attempts to
create new files or directories will error with [`ENOSPC` (No space left on device)](https://pubs.opengroup.org/onlinepubs/9799919799/).
Filesystem consistency
----------------------
### Concurrent modification
Concurrent modification from multiple containers is supported, but concurrent
modifications of the same files should be avoided. Last write wins in case of
concurrent modification of the same file — any data the last writer didn’t have
when committing changes will be lost!
The number of commits you can run concurrently is limited. If you run too many
concurrent commits, each commit will take longer due to contention. If you are
committing small changes, avoid doing more than 5 concurrent commits (the number
of concurrent commits you can make is proportional to the size of the changes
being committed).
As a result, Volumes are typically not a good fit for use cases where you need
to make concurrent modifications to the same file (nor is distributed file
locking supported).
While a reload is in progress the Volume will appear empty to the container that
initiated the reload. That means you cannot read from or write to a Volume in a
container where a reload is ongoing (note that this only applies to the
container where the reload was issued, other containers remain unaffected).
### Busy Volume errors
You can only reload a Volume when there are no open files on the Volume. If you have
open files on the Volume the [`.reload()`](../reference/modal.Volume.html#reload) operation will fail with “volume busy”. The following is a simple example of how
a “volume busy” error can occur:
```
volume = modal.Volume.from_name("my-volume")

@app.function(volumes={"/vol": volume})
def reload_with_open_files():
    f = open("/vol/data.txt", "r")
    volume.reload()  # Cannot reload when files in the Volume are open.
```
### Can’t find file on Volume errors
When accessing files in your Volume, don’t forget to prepend the path where your
Volume is mounted in the container.
In the example below, where the Volume has been mounted at `/data`, “hello” is
being written to `/data/xyz.txt`.
```
import modal

app = modal.App()
vol = modal.Volume.from_name("my-volume")

@app.function(volumes={"/data": vol})
def run():
    with open("/data/xyz.txt", "w") as f:
        f.write("hello")
    vol.commit()
```
If you instead write to `/xyz.txt`, the file will be saved to the local disk of the Modal Function.
When you dump the contents of the Volume, you will not see the `xyz.txt` file.
Further examples
----------------
* [Character LoRA fine-tuning](../examples/diffusers_lora_finetune.html) with model storage on a Volume
* [Protein folding](../examples/chai1.html) with model weights and output files stored on Volumes
* [Dataset visualization with Datasette](https://modal.com/docs/example/covid_datasette) using a SQLite database on a Volume
Container lifecycle hooks
=========================
Since Modal will reuse the same container for multiple inputs, sometimes you
might want to run some code exactly once when the container starts or exits.
To accomplish this, you need to use Modal’s class syntax and the [`@app.cls`](../reference/modal.App.html#cls) decorator. Specifically, you’ll
need to:
1. Convert your function to a method by making it a member of a class.
2. Decorate the class with `@app.cls(...)` with the same arguments you previously
had for `@app.function(...)`.
3. Instead of the `@app.function` decorator on the original method, use `@method` or the appropriate decorator for a [web endpoint](#lifecycle-hooks-for-web-endpoints).
4. Add the correct method “hooks” to your class based on your need:
* `@enter` for one-time initialization (remote)
* `@exit` for one-time cleanup (remote)
`@enter`
--------
The container entry handler is called when a new container is started. This is
useful for doing one-time initialization, such as loading model weights or
importing packages that are only present in that image.
To use, make your function a member of a class, and apply the `@enter()` decorator to one or more class methods:
```
import modal

app = modal.App()

@app.cls(cpu=8)
class Model:
    @modal.enter()
    def run_this_on_container_startup(self):
        import pickle

        self.model = pickle.load(open("model.pickle", "rb"))

    @modal.method()
    def predict(self, x):
        return self.model.predict(x)

@app.local_entrypoint()
def main():
    Model().predict.remote(x=123)
```
When working with an [asynchronous Modal](async.html) app, you may use an
async method instead:
```
import modal

app = modal.App()

@app.cls(memory=1024)
class Processor:
    @modal.enter()
    async def my_enter_method(self):
        self.cache = await load_cache()

    @modal.method()
    async def run(self, x):
        return await do_some_async_stuff(x, self.cache)

@app.local_entrypoint()
async def main():
    await Processor().run.remote.aio(x=123)
```
Note: The `@enter()` decorator replaces the earlier `__enter__` syntax, which
has been deprecated.
`@exit`
-------
The container exit handler is called when a container is about to exit. It is
useful for doing one-time cleanup, such as closing a database connection or
saving intermediate results. To use, make your function a member of a class, and
apply the `@exit()` decorator:
```
import os

import modal

app = modal.App()

@app.cls()
class ETLPipeline:
    @modal.enter()
    def open_connection(self):
        import psycopg2

        self.connection = psycopg2.connect(os.environ["DATABASE_URI"])

    @modal.method()
    def run(self):
        # Run some queries
        pass

    @modal.exit()
    def close_connection(self):
        self.connection.close()

@app.local_entrypoint()
def main():
    ETLPipeline().run.remote()
```
Exit handlers are also called when a container is [preempted](preemption.html).
The exit handler is given a grace period of 30 seconds to finish, and it will be
killed if it takes longer than that to complete.
Lifecycle hooks for web endpoints
---------------------------------
Modal `@function`s that are [web endpoints](webhooks.html) can be
converted to the class syntax as well. Instead of `@modal.method`, simply use
whichever of the web endpoint decorators (`@modal.fastapi_endpoint`, `@modal.asgi_app` or `@modal.wsgi_app`) you were using before.
```
import pickle

from fastapi import Request

import modal

image = modal.Image.debian_slim().pip_install("fastapi")
app = modal.App("web-endpoint-cls", image=image)

@app.cls()
class Model:
    @modal.enter()
    def run_this_on_container_startup(self):
        self.model = pickle.load(open("model.pickle", "rb"))

    @modal.fastapi_endpoint()
    def predict(self, request: Request):
        ...
```
GPU acceleration
================
Modal makes it easy to run any code on GPUs.
Quickstart
----------
Here’s a simple example of a function running on an A100 in Modal:
```
import modal

app = modal.App()
image = modal.Image.debian_slim().pip_install("torch")

@app.function(gpu="A100", image=image)
def run():
    import torch

    print(torch.cuda.is_available())
```
This installs PyTorch on top of a base image, so the function can use the GPU
through PyTorch.
Specifying GPU type
-------------------
You can pick a specific GPU type for your function via the `gpu` argument.
Modal supports the following values for this parameter:
* `T4`
* `L4`
* `A10G`
* `A100-40GB`
* `A100-80GB`
* `L40S`
* `H100`
For instance, to use an H100, you can use `@app.function(gpu="H100")`.
Refer to our [pricing page](../../pricing.html) for the latest pricing on each GPU type.
Specifying GPU count
--------------------
You can specify more than one GPU per container by appending `:n` to the GPU
argument. For instance, to run a function with eight H100s:
```
@app.function(gpu="H100:8")
def run_llama_405b_fp8():
    ...
```
Currently H100, A100, L4, T4 and L40S instances support up to 8 GPUs (up to 640 GB GPU RAM),
and A10G instances support up to 4 GPUs (up to 96 GB GPU RAM). Note that requesting
more than 2 GPUs per container will usually result in larger wait times. These
GPUs are always attached to the same physical machine.
Picking a GPU
-------------
For running, rather than training, neural networks, we recommend starting off
with the [L40S](https://resources.nvidia.com/en-us-l40s/l40s-datasheet-28413),
which offers an excellent trade-off of cost and performance and 48 GB of GPU
RAM for storing model weights.
For more on how to pick a GPU for use with neural networks like LLaMA or Stable
Diffusion, and for tips on how to make that GPU go brrr, check out [Tim Dettmers’ blog post](https://timdettmers.com/2023/01/30/which-gpu-for-deep-learning/) or the [Full Stack Deep Learning page on Cloud GPUs](https://fullstackdeeplearning.com/cloud-gpus/).
GPU fallbacks
-------------
Modal allows specifying a list of possible GPU types, suitable for functions that are
compatible with multiple options. Modal respects the ordering of this list and
will try to allocate the most preferred GPU type before falling back to less
preferred ones.
```
@app.function(gpu=["H100", "A100-40GB:2"])
def run_on_80gb():
    ...
```
See [this example](../examples/gpu_fallbacks.html) for more detail.
H100 GPUs
---------
Modal’s fastest GPUs are the [H100s](https://www.nvidia.com/en-us/data-center/h100/), NVIDIA’s
flagship data center chip for the Hopper/Lovelace [architecture](../../gpu-glossary/device-hardware/streaming-multiprocessor-architecture.html).
To request an H100, set the `gpu` argument to `"H100"`:
```
@app.function(gpu="H100")
def run_text_to_video():
    ...
```
Check out [this example](../examples/flux.html) to see how you can generate images
from the Flux.schnell model in under a second using an H100.
Before you jump for the most powerful (and so most expensive) GPU, make sure you
understand where the bottlenecks are in your computations. For example, running
language models with small batch sizes (e.g. one prompt at a time) results in a [bottleneck on memory, not arithmetic](https://kipp.ly/transformer-inference-arithmetic/).
Since arithmetic throughput has risen faster than memory throughput in recent
hardware generations, speedups for memory-bound GPU jobs are not as extreme and
may not be worth the extra cost.
**H200 GPUs**
Modal may automatically upgrade an H100 request to an [H200](https://www.nvidia.com/en-us/data-center/h200/), NVIDIA’s evolution of the H100 chip
for the Hopper/Lovelace [architecture](../../gpu-glossary/device-hardware/streaming-multiprocessor-architecture.html).
This automatic upgrade *does not* change the cost of the GPU.
H200s are software compatible with H100s, so your code always works for both, but an upgrade
to an H200 brings higher memory bandwidth! NVIDIA H200’s HBM3e memory bandwidth of 4.8TB/s is 1.4x faster than NVIDIA H100 with HBM3.
A100 GPUs
---------
[A100s](https://www.nvidia.com/en-us/data-center/a100/) are the previous
generation of top-of-the-line data center chip from NVIDIA, based on the Ampere [architecture](../../gpu-glossary/device-hardware/streaming-multiprocessor-architecture.html).
Modal offers two versions of the A100: one with 40 GB of RAM and another with 80 GB of RAM.
To request an A100 with 40 GB of [GPU memory](../../gpu-glossary/device-hardware/gpu-ram.html), use `gpu="A100"`:
```
@app.function(gpu="A100")
def llama_7b():
    ...
```
To request an 80 GB A100, use the string `A100-80GB`:
```
@app.function(gpu="A100-80GB")
def llama_70b_fp8():
    ...
```
Multi GPU training
------------------
Modal currently supports multi-GPU training on a single machine, with multi-node training in closed beta ([contact us](../../slack.html) for access). Depending on which framework you are using, you may need to use different techniques to train on multiple GPUs.
If the framework re-executes the entrypoint of the Python process (like [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/index.html)) you need to either set the strategy to `ddp_spawn` or `ddp_notebook` if you wish to invoke the training directly. Another option is to run the training script as a subprocess instead.
```
@app.function(gpu="A100:2")
def run():
    import subprocess
    import sys

    subprocess.run(
        ["python", "train.py"],
        stdout=sys.stdout,
        stderr=sys.stderr,
        check=True,
    )
```
Examples and more resources
----------------------------
For more information about GPUs in general, check out our [GPU Glossary](../../gpu-glossary/readme.html).
Or take a look at some examples of Modal apps using GPUs:
* [Fine-tune a character LoRA for your pet](../examples/dreambooth_app.html)
* [Fast LLM inference with vLLM](../examples/vllm_inference.html)
* [Stable Diffusion with a CLI, API, and web UI](../examples/stable_diffusion_cli.html)
* [Rendering Blender videos](../examples/blender_video.html)
Connecting Modal to your OpenTelemetry Provider
===============================================
You can export Modal logs to your [OpenTelemetry](https://opentelemetry.io/docs/what-is-opentelemetry/) provider using the Modal OpenTelemetry integration. This integration is compatible with
any observability provider that supports the OpenTelemetry HTTP APIs.
What this integration does
--------------------------
This integration allows you to:
1. Export Modal audit logs to your provider
2. Export Modal function logs to your provider
3. Export container metrics to your provider
Metrics
-------
The Modal OpenTelemetry Integration will forward the following metrics to your provider:
* `modal.cpu.utilization`
* `modal.memory.utilization`
* `modal.gpu.memory.utilization`
* `modal.gpu.compute.utilization`
These metrics are tagged with `container_id`, `environment_name`, and `workspace_name`.
Installing the integration
--------------------------
1. Find out the endpoint URL for your OpenTelemetry provider. This is the URL that
the Modal integration will send logs to. Note that this should be the base URL
of the OpenTelemetry provider, and not a specific endpoint. For example, for the [US New Relic instance](https://docs.newrelic.com/docs/opentelemetry/best-practices/opentelemetry-otlp/#configure-endpoint-port-protocol),
the endpoint URL is `https://otlp.nr-data.net`, not `https://otlp.nr-data.net/v1/logs`.
2. Find out the API key or other authentication method required to send logs to your
OpenTelemetry provider. This is the key that the Modal integration will use to authenticate
with your provider. Modal can provide any key/value HTTP header pairs. For example, for [New Relic](https://docs.newrelic.com/docs/opentelemetry/best-practices/opentelemetry-otlp/#api-key),
the header is `api-key`.
3. Create a new OpenTelemetry Secret in Modal with one key per header. These keys should be
prefixed with `OTEL_HEADER_`, followed by the name of the header. The value of this
key should be the value of the header. For example, for New Relic, an example Secret
might look like `OTEL_HEADER_api-key: YOUR_API_KEY`. If you use the OpenTelemetry Secret
template, this will be pre-filled for you.
4. Navigate to the [Modal metrics settings page](http://modal.com/settings/metrics) and configure
the OpenTelemetry push URL from step 1 and the Secret from step 3.
5. Save your changes and use the test button to confirm that logs are being sent to your provider.
If it’s all working, you should see a `Hello from Modal! 🚀` log from the `modal.test_logs` service.
Uninstalling the integration
----------------------------
Once the integration is uninstalled, all logs will stop being sent to
your provider.
1. Navigate to the [Modal metrics settings page](http://modal.com/settings/metrics) and disable the OpenTelemetry integration.
Proxy auth tokens (beta)
========================
To prevent users outside of your workspace from discovering and triggering web endpoints that you create, Modal will check for two headers: `Modal-Key` and `Modal-Secret` on HTTP requests to the endpoint. You can populate these headers with tokens created under [Settings > Proxy Auth Tokens](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7proxy-auth-tokens.html).
By default, [web endpoints](webhooks.html) created by the [fastapi\_endpoint](../reference/modal.fastapi_endpoint.html), [asgi\_app](../reference/modal.asgi_app.html), [wsgi\_app](../reference/modal.wsgi_app.html), or [web\_server](../reference/modal.web_server.html) decorators
are publicly available. The optional field `requires_proxy_auth` protects your web endpoint by verifying a key and a token are passed in the `Modal-Key` and `Modal-Secret` headers. Requests without those headers will receive the HTTP error [401 Unauthorized](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/401) unless valid credentials are supplied.
```
import modal

app = modal.App()

@app.function()
@modal.fastapi_endpoint(requires_proxy_auth=True)
def hello():
    return "hello world"
```
To trigger the endpoint, create a [Proxy Auth Token](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7proxy-auth-tokens.html), which will generate a token ID and token secret that you use to prove the authorization of your request. In requests to the web endpoint, add the `Modal-Key` and `Modal-Secret` HTTP headers and supply your token in the header value.
```
export TOKEN_ID=wk-1234abcd
export TOKEN_SECRET=ws-1234abcd

curl -H "Modal-Key: $TOKEN_ID" \
     -H "Modal-Secret: $TOKEN_SECRET" \
     https://my-secure-endpoint.modal.run
```
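The same request can be made from Python. Here is a minimal sketch using the third-party `requests` library, reusing the placeholder endpoint URL and the exported token variables from above:
```
import os

import requests  # third-party HTTP client, used here for illustration

headers = {
    "Modal-Key": os.environ["TOKEN_ID"],
    "Modal-Secret": os.environ["TOKEN_SECRET"],
}
response = requests.get("https://my-secure-endpoint.modal.run", headers=headers)
response.raise_for_status()
print(response.text)
```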
Everyone within the workspace of the web endpoint can manage the tokens that will be accepted as valid authentication.
Proxy-Authorization header
--------------------------
Previously, Modal Proxy Auth tokens were verified via the [`Proxy-Authorization`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Proxy-Authorization) header, returning a [407 Proxy Unauthorized](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/407) HTTP error if the token wasn’t valid. We have since switched to `Modal-Key` and `Modal-Secret`. `Proxy-Authorization` is deprecated, and users are advised to stop using it.
The `Proxy-Authorization` header uses the `Basic` authentication scheme and expects base64 encoding of `[TOKEN_ID]:[TOKEN_SECRET]` for the credentials. For example:
```
export TOKEN_ID=wk-1234abcd
export TOKEN_SECRET=ws-1234abcd
curl https://my-secure-endpoint.modal.run -H "Proxy-Authorization: Basic $(echo -n $TOKEN_ID:$TOKEN_SECRET | base64)"
```
Geographic Latency
==================
Modal’s worker cluster is multi-cloud and multi-region. The vast majority of workers are located
in the continental USA, but we do run workers in Europe and Asia.
Modal’s control plane is hosted in Virginia, USA (`us-east-1`).
Any time data needs to travel between the Modal client, our control plane servers, and our workers,
latency will be incurred. [Cloudping.com](https://www.cloudping.co/grid) provides good estimates of the
latency between regions. For example, the roundtrip latency between AWS `us-east-1` (Virginia, USA) and `us-west-1` (California, USA) is around 60ms.
You can observe the location identifier of a container [via an environment variable](environment_variables.html).
Logging this environment variable alongside latency information can reveal when geography is impacting your application
performance.
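For example, here is a minimal sketch of logging that identifier next to a timing measurement. The `MODAL_REGION` variable name is an assumption here; see the [environment variables guide](environment_variables.html) for the authoritative name.
```
import os
import time

import modal

app = modal.App()

@app.function()
def timed_call():
    region = os.environ.get("MODAL_REGION", "unknown")  # assumed variable name
    start = time.monotonic()
    # ... call your database or other network dependency here ...
    elapsed_ms = (time.monotonic() - start) * 1000
    print(f"region={region} latency_ms={elapsed_ms:.1f}")
```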
Region selection
----------------
In cases where low-latency communication is required between your container and a network dependency (e.g. a database),
it is useful to ensure that Modal schedules your container in only regions geographically proximate to that dependency.
For example, if you have an AWS RDS database in Virginia, USA (`us-east-1`), ensuring your Modal containers are also scheduled in Virginia
means that network latency between the container and the database will be less than 5 milliseconds.
For more information, please see [Region selection](region-selection.html).
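As a sketch of what that looks like in code, assuming the `region` parameter and a region identifier like `"us-east"` from the Region selection guide:
```
import modal

app = modal.App()

# Only schedule this Function near the (hypothetical) us-east database.
@app.function(region="us-east")
def query_database():
    ...
```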
Running commands in Sandboxes
=============================
Once you have created a Sandbox, you can run commands inside it using the [`Sandbox.exec`](../reference/modal.Sandbox.html#exec) method.
```
sb = modal.Sandbox.create(app=my_app)

process = sb.exec("echo", "hello")
print(process.stdout.read())

process = sb.exec("python", "-c", "print(1 + 1)")
print(process.stdout.read())

process = sb.exec(
    "bash",
    "-c",
    "for i in $(seq 1 10); do echo foo $i; sleep 0.1; done",
)
for line in process.stdout:
    print(line, end="")

sb.terminate()
```
`Sandbox.exec` returns a [`ContainerProcess`](../reference/modal.container_process.html#modalcontainer_processcontainerprocess) object, which allows access to the process’s `stdout`, `stderr`, and `stdin`.
Input
-----
The Sandbox and ContainerProcess `stdin` handles are [`StreamWriter`](../reference/modal.io_streams.html#modalio_streamsstreamwriter) objects. This object supports flushing writes with both synchronous and asynchronous APIs:
```
import asyncio

sb = modal.Sandbox.create(app=my_app)

p = sb.exec("bash", "-c", "while read line; do echo $line; done")
p.stdin.write(b"foo bar\n")
p.stdin.write_eof()
p.stdin.drain()
p.wait()
sb.terminate()

async def run_async():
    sb = await modal.Sandbox.create.aio(app=my_app)
    p = await sb.exec.aio("bash", "-c", "while read line; do echo $line; done")
    p.stdin.write(b"foo bar\n")
    p.stdin.write_eof()
    await p.stdin.drain.aio()
    await p.wait.aio()
    await sb.terminate.aio()

asyncio.run(run_async())
```
Output
------
The Sandbox and ContainerProcess `stdout` and `stderr` handles are [`StreamReader`](../reference/modal.io_streams.html#modalio_streamsstreamreader) objects. These objects support reading from the stream in both synchronous and asynchronous manners.
To read from a stream after the underlying process has finished, you can use the `read` method, which blocks until the process finishes and returns the entire output stream.
```
sb = modal.Sandbox.create(app=my_app)
p = sb.exec("echo", "hello")
print(p.stdout.read())
sb.terminate()
```
To stream output, take advantage of the fact that `stdout` and `stderr` are
iterable:
```
import asyncio

sb = modal.Sandbox.create(app=my_app)

p = sb.exec("bash", "-c", "for i in $(seq 1 10); do echo foo $i; sleep 0.1; done")
for line in p.stdout:
    # Lines preserve the trailing newline character, so use end="" to avoid double newlines.
    print(line, end="")
p.wait()
sb.terminate()

async def run_async():
    sb = await modal.Sandbox.create.aio(app=my_app)
    p = await sb.exec.aio("bash", "-c", "for i in $(seq 1 10); do echo foo $i; sleep 0.1; done")
    async for line in p.stdout:
        # Avoid double newlines by using end="".
        print(line, end="")
    await p.wait.aio()
    await sb.terminate.aio()

asyncio.run(run_async())
```
### Stream types
By default, all streams are buffered in memory, waiting to be consumed by the
client. You can control this behavior with the `stdout` and `stderr` parameters.
These parameters are conceptually similar to the `stdout` and `stderr` parameters of the [`subprocess`](https://docs.python.org/3/library/subprocess.html#subprocess.DEVNULL) module.
```
from modal.stream_type import StreamType

sb = modal.Sandbox.create(app=my_app)

# Default behavior: buffered in memory.
p = sb.exec(
    "bash",
    "-c",
    "echo foo; echo bar >&2",
    stdout=StreamType.PIPE,
    stderr=StreamType.PIPE,
)
print(p.stdout.read())
print(p.stderr.read())

# Print the stream to STDOUT as it comes in.
p = sb.exec(
    "bash",
    "-c",
    "echo foo; echo bar >&2",
    stdout=StreamType.STDOUT,
    stderr=StreamType.STDOUT,
)
p.wait()

# Discard all output.
p = sb.exec(
    "bash",
    "-c",
    "echo foo; echo bar >&2",
    stdout=StreamType.DEVNULL,
    stderr=StreamType.DEVNULL,
)
p.wait()

sb.terminate()
```
Dicts and Queues
================
Modal provides a variety of distributed objects to enable seamless interactivity
and data transfer across different components of a distributed system. Two key
objects are dicts and queues, both of which serve specific roles in facilitating
communication and data management in your applications.
Modal Dicts
-----------
A [Dict](../reference/modal.Dict.html) in Modal provides distributed key-value
storage. Much like a standard Python dictionary, it lets you store and retrieve
values using keys. However, unlike a regular dictionary, a Dict in Modal is
shared across all containers of an application and can be accessed and
manipulated concurrently from any of them.
```
import modal

app = modal.App()

# Create a persisted dict - the data gets retained between app runs
my_dict = modal.Dict.from_name("my-persisted-dict", create_if_missing=True)

@app.local_entrypoint()
def main():
    my_dict["key"] = "value"  # setting a value
    value = my_dict["key"]  # getting a value
```
Dicts in Modal are persisted, which means that the data in the dictionary is
stored and can be retrieved later, even after the application is redeployed.
They can also be accessed from other Modal functions.
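For example, here is a minimal sketch (reusing the `app` and the `my-persisted-dict` name from the snippet above) of reading and writing the same Dict from inside a Modal Function rather than the local entrypoint:
```
@app.function()
def record_result(key: str, value: int) -> int:
    # The same named Dict can be opened from any container in the workspace.
    d = modal.Dict.from_name("my-persisted-dict", create_if_missing=True)
    d[key] = value
    return d[key]
```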
You can store Python values of any type within Dicts, since they’re serialized
using [`cloudpickle`](https://github.com/cloudpipe/cloudpickle). Note that you
will need to have the library defining the type installed in the environment
where you retrieve the object from the Dict, otherwise a `DeserializationError` will be raised.
Unlike with normal Python dictionaries, updates to mutable value types will not
be reflected in other containers unless the updated object is explicitly put
back into the Dict. As a consequence, patterns like chained updates
(`my_dict["outer_key"]["inner_key"] = value`) cannot be used the same way as
they would with a local dictionary.
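Instead, read the value, mutate the local copy, and write the whole object back, as in this sketch (`my_dict` and the keys are illustrative):
```
# my_dict["outer_key"]["inner_key"] = "value" would only change a local copy.
inner = my_dict["outer_key"]   # fetch the current value
inner["inner_key"] = "value"   # mutate the local copy
my_dict["outer_key"] = inner   # put it back so other containers see the update
```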
Currently, the per-object size limit is 100 MiB and the maximum number of entries
per update is 10,000. It’s recommended to use Dicts for smaller objects (under 5 MiB).
Each object in the Dict will expire after 7 days of inactivity (no reads or writes).
Modal Queues
------------
A [Queue](../reference/modal.Queue.html) in Modal is a distributed queue-like
object. It allows you to add and retrieve items in a first-in-first-out (FIFO)
manner. Queues are particularly useful when you want to handle tasks or process
data asynchronously, or when you need to pass messages between different
components of your distributed system.
```
import modal

app = modal.App()

my_queue = modal.Queue.from_name("my-persisted-queue", create_if_missing=True)

@app.local_entrypoint()
def main():
    my_queue.put("some object")  # adding a value
    value = my_queue.get()  # retrieving a value
```
Similar to Dicts, Queues are also persisted and support values of any type.
### Queue partitions
Queues are split into separate FIFO partitions via a string key. By default, one
partition (corresponding to an empty key) is used.
A single `Queue` can contain up to 100,000 partitions, each with up to 5,000
items. Each item can be up to 1 MiB. These limits also apply to the default
partition.
```
import modal

app = modal.App()

my_queue = modal.Queue.from_name("my-persisted-queue", create_if_missing=True)

@app.local_entrypoint()
def main():
    my_queue.put("some value")
    my_queue.put(123)

    assert my_queue.get() == "some value"
    assert my_queue.get() == 123

    my_queue.put(0)
    my_queue.put(1, partition="foo")
    my_queue.put(2, partition="bar")

    # Default and "foo" partition are ignored by the get operation.
    assert my_queue.get(partition="bar") == 2

    # Set custom 10s expiration time on "foo" partition.
    my_queue.put(3, partition="foo", partition_ttl=10)

    # (beta feature) Iterate through items in place (read immutably)
    my_queue.put(1)
    assert [v for v in my_queue.iterate()] == [0, 1]
```
By default, each partition is cleared 24 hours after the last `put` operation. A
lower TTL can be specified by the `partition_ttl` argument in the `put` or `put_many` methods. Each partition’s expiry is handled independently.
As such, `Queue`s are best used for communication between active functions and
not relied on for persistent storage.
Asynchronous calls
------------------
Both Dicts and Queues are synchronous by default, but they support asynchronous
interaction with the `.aio` function suffix.
```
@app.local_entrypoint()
async def main():
    await my_queue.put.aio(100)
    assert await my_queue.get.aio() == 100

    await my_dict.put.aio("hello", 400)
    assert await my_dict.get.aio("hello") == 400
```
Note that `.put` and `.get` are aliases for the overloaded indexing operators on
Dicts, but you need to invoke them by name for asynchronous calls.
Please see the docs on [asynchronous functions](async.html) for more
information.
Example: Dict and Queue Interaction
-----------------------------------
To illustrate how dicts and queues can interact together in a simple distributed
system, consider the following example program that crawls the web, starting
from [wikipedia.org](https://www.wikipedia.org) and traversing links to many
sites in breadth-first order. The Queue stores pages to crawl, while the Dict is
used as a kill switch to stop execution of tasks immediately upon completion.
```
import queue
import sys
from datetime import datetime

import modal

app = modal.App(image=modal.Image.debian_slim().pip_install("requests", "beautifulsoup4"))

def extract_links(url: str) -> list[str]:
    """Extract links from a given URL."""
    import requests
    import urllib.parse
    from bs4 import BeautifulSoup

    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    links = []
    for link in soup.find_all("a"):
        links.append(urllib.parse.urljoin(url, link.get("href")))
    return links

@app.function()
def crawl_pages(q: modal.Queue, d: modal.Dict, urls: set[str]) -> None:
    for url in urls:
        if "stop" in d:
            return
        try:
            s = datetime.now()
            links = extract_links(url)
            print(f"Crawled: {url} in {datetime.now() - s}, with {len(links)} links")
            q.put_many(links)
        except Exception as exc:
            print(f"Failed to crawl: {url} with error {exc}, skipping...", file=sys.stderr)

@app.function()
def scrape(url: str):
    start_time = datetime.now()

    # Create ephemeral dicts and queues
    with modal.Dict.ephemeral() as d, modal.Queue.ephemeral() as q:
        # The dict is used to signal the scraping to stop
        # The queue contains the URLs that have been crawled

        # Initialize queue with a starting URL
        q.put(url)

        # Crawl until the queue is empty, or reaching some number of URLs
        visited = set()
        max_urls = 50000
        while True:
            try:
                next_urls = q.get_many(2000, timeout=5)
            except queue.Empty:
                break
            new_urls = set(next_urls) - visited
            visited |= new_urls
            if len(visited) < max_urls:
                crawl_pages.spawn(q, d, new_urls)
            else:
                d["stop"] = True

    elapsed = (datetime.now() - start_time).total_seconds()
    print(f"Crawled {len(visited)} URLs in {elapsed:.2f} seconds")

@app.local_entrypoint()
def main():
    scrape.remote("https://www.wikipedia.org/")
```
Starting from Wikipedia, this spawns several dozen containers (auto-scaled on
demand) to crawl over 200,000 URLs in 40 seconds.
Data durability
---------------
Dicts are backed by durable storage. Queues are backed by a replicated in-memory
database, so data could potentially be lost, but it is unlikely.
Queues and Dicts are also subject to expiration, as described by the [modal.Dict](../reference/modal.Dict.html) and [modal.Queue](../reference/modal.Queue.html) reference pages.
[Please get in touch](mailto:[email protected]) if you need durability for Queue objects.
Using CUDA on Modal
===================
Modal makes it easy to accelerate your workloads with datacenter-grade NVIDIA GPUs.
To take advantage of the hardware, you need to use matching software: the CUDA stack.
This guide explains the components of that stack and how to install them on Modal.
For more on which GPUs are available on Modal and how to choose a GPU for your use case,
see [this guide](gpu.html). For a deep dive on both the [GPU hardware](../../gpu-glossary/device-hardware.html) and [software](../../gpu-glossary/device-software.html) and for even more detail on [the CUDA stack](../../gpu-glossary/host-software.html),
see our [GPU Glossary](../../gpu-glossary/readme.html).
Here’s the tl;dr:
* The [NVIDIA Accelerated Graphics Driver for Linux-x86\_64](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#driver-installation), version 570.86.15,
and [CUDA Driver API](https://docs.nvidia.com/cuda/archive/12.8.0/cuda-driver-api/index.html), version 12.8, are already installed.
You can call `nvidia-smi` or run compiled CUDA programs from any Modal Function with access to a GPU.
* That means you can install many popular libraries like `torch` that bundle their other CUDA dependencies [with a simple `pip_install`](#install-gpu-accelerated-torch-and-transformers-with-pip_install).
* For bleeding-edge libraries like `flash-attn`, you may need to install CUDA dependencies manually.
To make your life easier, [use an existing image](#for-more-complex-setups-use-an-officially-supported-cuda-image).
What is CUDA?
-------------
When someone refers to “installing CUDA” or “using CUDA”,
they are referring not to a library, but to a [stack](../../gpu-glossary/host-software/cuda-software-platform.html) with multiple layers.
Your application code (and its dependencies) can interact
with the stack at different levels.
![The CUDA stack](../../_app/immutable/assets/cuda-stack-diagram.BdEpPviG.png)
This leads to a lot of confusion. To help clear that up, the following sections explain each component in detail.
### Level 0: Kernel-mode driver components
At the lowest level are the [*kernel-mode driver components*](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#nvidia-open-gpu-kernel-modules).
The Linux kernel is essentially a single program operating the entire machine and all of its hardware.
To add hardware to the machine, this program is extended by loading new modules into it.
These components communicate directly with hardware — in this case the GPU.
Because they are kernel modules, these driver components are tightly integrated with the host operating system
that runs your containerized Modal Functions and are not something you can inspect or change yourself.
### Level 1: User-mode driver API
All action in Linux that doesn’t occur in the kernel occurs in [user space](https://en.wikipedia.org/wiki/User_space).
To talk to the kernel drivers from our user space programs, we need *user-mode driver components*.
Most prominently, that includes:
* the [CUDA Driver API](../../gpu-glossary/host-software/cuda-driver-api.html),
a [shared object](https://en.wikipedia.org/wiki/Shared_library) called `libcuda.so`.
This object exposes functions like [`cuMemAlloc`](https://docs.nvidia.com/cuda/archive/12.8.0/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467),
for allocating GPU memory.
* the [NVIDIA management library](https://developer.nvidia.com/management-library-nvml), `libnvidia-ml.so`, and its command line interface [`nvidia-smi`](https://developer.nvidia.com/system-management-interface).
You can use these tools to check the status of the system’s GPU(s).
These components are installed on all Modal machines with access to GPUs.
Because they are user-level components, you can use them directly:
```
import modal

app = modal.App()

@app.function(gpu="any")
def check_nvidia_smi():
    import subprocess

    output = subprocess.check_output(["nvidia-smi"], text=True)
    assert "Driver Version:" in output
    assert "CUDA Version:" in output
    print(output)
    return output
```
### Level 2: CUDA Toolkit
Wrapping the CUDA Driver API is the [CUDA Runtime API](../../gpu-glossary/host-software/cuda-runtime-api.html), the `libcudart.so` shared library.
This API includes functions like [`cudaLaunchKernel`](https://docs.nvidia.com/cuda/archive/12.8.0/cuda-runtime-api/group__CUDART__HIGHLEVEL.html#group__CUDART__HIGHLEVEL_1g7656391f2e52f569214adbfc19689eb3) and is more commonly used in CUDA programs (see [this HackerNews comment](https://news.ycombinator.com/item?id=20616385) for color commentary on why).
This shared library is *not* installed by default on Modal.
The CUDA Runtime API is generally installed as part of the larger [NVIDIA CUDA Toolkit](https://docs.nvidia.com/cuda/index.html),
which includes the [NVIDIA CUDA compiler driver](../../gpu-glossary/host-software/nvcc.html) (`nvcc`) and its toolchain
and a number of [useful goodies](../../gpu-glossary/host-software/cuda-binary-utilities.html) for writing and debugging CUDA programs (`cuobjdump`, `cudnn`, profilers, etc.).
Contemporary GPU-accelerated machine learning workloads like LLM inference frequently make use of many components of the CUDA Toolkit,
such as the run-time compilation library [`nvrtc`](https://docs.nvidia.com/cuda/archive/12.8.0/nvrtc/index.html).
So why aren’t these components installed along with the drivers?
A compiled CUDA program can run without the CUDA Runtime API installed on the system,
by [statically linking](https://en.wikipedia.org/wiki/Static_library) the CUDA Runtime API into the program binary,
though this is fairly uncommon for CUDA-accelerated Python programs.
Additionally, older versions of these components are needed for some applications
and some application deployments even use several versions at once.
Both patterns are compatible with the host machine driver provided on Modal.
Install GPU-accelerated `torch` and `transformers` with `pip_install`
---------------------------------------------------------------------
The components of the CUDA Toolkit can be installed via `pip`,
via PyPI packages like [`nvidia-cuda-runtime-cu12`](https://pypi.org/project/nvidia-cuda-runtime-cu12/) and [`nvidia-cuda-nvrtc-cu12`](https://pypi.org/project/nvidia-cuda-nvrtc-cu12/).
These components are listed as dependencies of some popular GPU-accelerated Python libraries, like `torch`.
Because Modal already includes the lower parts of the CUDA stack, you can install these libraries
with [the `pip_install` method of `modal.Image`](images.html#add-python-packages-with-pip_install), just like any other Python library:
```
image = modal.Image.debian_slim().pip_install("torch")

@app.function(gpu="any", image=image)
def run_torch():
    import torch

    has_cuda = torch.cuda.is_available()
    print(f"It is {has_cuda} that torch can access CUDA")
    return has_cuda
```
Many libraries for running open-weights models, like `transformers` and `vllm`,
use `torch` under the hood and so can be installed in the same way:
```
image = modal.Image.debian_slim().pip_install("transformers[torch]")
image = image.apt_install("ffmpeg")  # for audio processing

@app.function(gpu="any", image=image)
def run_transformers():
    from transformers import pipeline

    transcriber = pipeline(model="openai/whisper-tiny.en", device="cuda")
    result = transcriber("https://modal-cdn.com/mlk.flac")
    print(result["text"])  # I have a dream that one day this nation will rise up live out the true meaning of its creed
```
For more complex setups, use an officially-supported CUDA image
---------------------------------------------------------------
The disadvantage of installing the CUDA stack via `pip` is that many other
libraries expect its components to be installed as normal system packages and so cannot find them.
For these cases, we recommend you use an image that already has the full CUDA stack installed as system packages
and all environment variables set correctly, like the [`nvidia/cuda:*-devel-*` images on Docker Hub](https://hub.docker.com/r/nvidia/cuda).
One popular library that requires the whole toolkit is [`flash-attn`](https://github.com/Dao-AILab/flash-attention),
which was, for a time, by far the fastest implementation of Transformer multi-head attention:
```
cuda_version = "12.8.0"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
    .apt_install("git")
    .pip_install(  # required to build flash-attn
        "ninja",
        "packaging",
        "wheel",
        "torch",
    )
    .pip_install(  # add flash-attn
        "flash-attn==2.7.4.post1", extra_options="--no-build-isolation"
    )
)

@app.function(gpu="a10g", image=image)
def run_flash_attn():
    import torch
    from flash_attn import flash_attn_func

    batch_size, seqlen, nheads, headdim, nheads_k = 2, 4, 3, 16, 3

    q = torch.randn(batch_size, seqlen, nheads, headdim, dtype=torch.float16).to("cuda")
    k = torch.randn(batch_size, seqlen, nheads_k, headdim, dtype=torch.float16).to("cuda")
    v = torch.randn(batch_size, seqlen, nheads_k, headdim, dtype=torch.float16).to("cuda")

    out = flash_attn_func(q, k, v)
    assert out.shape == (batch_size, seqlen, nheads, headdim)
```
Make sure to choose a version of CUDA that is no greater than the version provided by the host machine.
Older minor (`12.*`) versions are guaranteed to be compatible with the host machine’s driver,
but older major (`11.*`, `10.*`, etc.) versions may not be.
What next?
----------
For more on accessing and choosing GPUs on Modal, check out [this guide](gpu.html).
To dive deep on GPU internals, check out our [GPU Glossary](../../gpu-glossary/readme.html).
To see these installation patterns in action, check out these examples:
* [Fast LLM inference with vLLM](../examples/vllm_inference.html)
* [Finetune a character LoRA for your pet](../examples/diffusers_lora_finetune.html)
* [Optimized Flux inference](../examples/flux.html)
Modal user account setup
========================
To run and deploy applications on Modal you’ll need to sign up and create a user
account.
You can visit the [signup](../../signup.html) page to begin the process or execute [`modal setup`](../reference/cli/setup.html#modal-setup) on the command line.
Users can also be provisioned through [Okta SSO](okta-sso.html), which is
an enterprise feature that you can request. For the typical user, you’ll sign up
using an existing GitHub account. If you’re interested in authenticating with
other identity providers let us know at [[email protected]](mailto:[email protected]).
What GitHub permissions does signing up require?
------------------------------------------------
* `user:email` — gives us the emails associated with the GitHub account.
* `read:org` (invites only) — needed for Modal workspace invites. Note: this
only allows us to see what organization memberships you have
([GitHub docs](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/scopes-for-oauth-apps)).
We won’t be able to access any code repositories or other details.
How can I change my email?
--------------------------
You can change your email on the [settings](../../login%EF%B9%96next=%EA%A4%B7settings.html) page.
Invoking deployed functions
===========================
Modal lets you take a function created by a [deployment](managing-deployments.html) and call it from other contexts.
There are two ways of invoking deployed functions. If the invoking client is
running Python, then the same [Modal client library](https://pypi.org/project/modal/) used to write Modal code
can be used. HTTPS is used if the invoking client is not running Python and
therefore cannot import the Modal client library.
Invoking with Python
--------------------
Some use cases for Python invocation include:
* An existing Python web server (e.g. Django, Flask) wants to invoke Modal
functions.
* You have split your product or system into multiple Modal applications that
deploy independently and call each other.
### Function lookup and invocation basics
Let’s say you have a script `shared_app.py` and this script defines a Modal
app with a function that computes the square of a number:
```
import modal

app = modal.App("my-shared-app")

@app.function()
def square(x: int):
    return x ** 2
```
You can deploy this app to create a persistent deployment:
```
% modal deploy shared_app.py
✓ Initialized.
✓ Created objects.
├── 🔨 Created square.
├── 🔨 Mounted /Users/erikbern/modal/shared_app.py.
✓ App deployed! 🎉
View Deployment: https://modal.com/apps/erikbern/my-shared-app
```
Let’s try to run this function from a different context. For instance, let’s
fire up the Python interactive interpreter:
```
% python
Python 3.9.5 (default, May 4 2021, 03:29:30)
[Clang 12.0.0 (clang-1200.0.32.27)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import modal
>>> f = modal.Function.from_name("my-shared-app", "square")
>>> f.remote(42)
1764
>>>
```
This works exactly the same as a regular Modal `Function` object. For example,
you can `.map()` over functions invoked this way too:
```
>>> f = modal.Function.from_name("my-shared-app", "square")
>>> list(f.map([1, 2, 3, 4, 5]))
[1, 4, 9, 16, 25]
```
#### Authentication
The Modal Python SDK will read the token from `~/.modal.toml`, which is typically
created using `modal token new`.
Another method of providing the credentials is to set the environment variables `MODAL_TOKEN_ID` and `MODAL_TOKEN_SECRET`. If you want to call a Modal function
from a context such as a web server, you can expose these environment variables
to the process.
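For example, here is a minimal sketch of supplying credentials from such a process. The token values are placeholders, and setting the variables before the first Modal call is an assumption about when the client reads them; exporting them in the process environment works just as well.
```
import os

import modal

# Placeholder values; real tokens come from `modal token new` or the dashboard.
os.environ.setdefault("MODAL_TOKEN_ID", "<token-id>")
os.environ.setdefault("MODAL_TOKEN_SECRET", "<token-secret>")

square = modal.Function.from_name("my-shared-app", "square")
print(square.remote(7))  # 49
```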
#### Lookup of lifecycle functions
[Lifecycle functions](lifecycle-functions.html) are defined on classes,
which you can look up in a different way. Consider this code:
```
import modal

app = modal.App("my-shared-app")

@app.cls()
class MyLifecycleClass:
    @modal.enter()
    def enter(self):
        self.var = "hello world"

    @modal.method()
    def foo(self):
        return self.var
```
Let’s say you deploy this app. You can then call the function by doing this:
```
>>> cls = modal.Cls.from_name("my-shared-app", "MyLifecycleClass")
>>> obj = cls() # You can pass any constructor arguments here
>>> obj.foo.remote()
'hello world'
```
### Asynchronous invocation
In certain contexts, a Modal client will need to trigger Modal functions without
waiting on the result. This is done by spawning functions and receiving a [`FunctionCall`](../reference/modal.FunctionCall.html) as a
handle to the triggered execution.
The following is an example of a Flask web server (running outside Modal) which
accepts model training jobs to be executed within Modal. Instead of the HTTP
POST request waiting on a training job to complete, which would be infeasible,
the relevant Modal function is spawned and the [`FunctionCall`](../reference/modal.FunctionCall.html) object is stored for later polling of execution status.
```
from uuid import uuid4

import modal
from flask import Flask, jsonify, request

app = Flask(__name__)
pending_jobs = {}

...

@app.route("/jobs", methods=["POST"])
def create_job():
    predict_fn = modal.Function.from_name("example", "train_model")
    job_id = str(uuid4())
    function_call = predict_fn.spawn(
        job_id=job_id,
        params=request.json,
    )
    pending_jobs[job_id] = function_call
    return {
        "job_id": job_id,
        "status": "pending",
    }
```
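A companion route for polling those jobs might look like the following sketch, which assumes [`FunctionCall.get`](../reference/modal.FunctionCall.html) with `timeout=0` to check for completion without blocking:
```
@app.route("/jobs/<job_id>", methods=["GET"])
def get_job(job_id: str):
    function_call = pending_jobs[job_id]
    try:
        # A zero timeout returns immediately, raising TimeoutError
        # if the training job has not finished yet.
        result = function_call.get(timeout=0)
    except TimeoutError:
        return {"job_id": job_id, "status": "pending"}
    return {"job_id": job_id, "status": "done", "result": result}
```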
### Importing a Modal function between Modal apps
You can also import one function defined in an app from another app:
```
import modal

app = modal.App("another-app")
square = modal.Function.from_name("my-shared-app", "square")

@app.function()
def cube(x):
    return x * square.remote(x)

@app.local_entrypoint()
def main():
    assert cube.remote(42) == 74088
```
### Comparison with HTTPS
Compared with HTTPS invocation, Python invocation has the following benefits:
* Avoids the need to create web endpoint functions.
* Avoids handling serialization of request and response data between Modal and
your client.
* Uses the Modal client library’s built-in authentication.
+ Web endpoints are public to the entire internet, whereas function `lookup` only exposes your code to you (and your org).
* You can work with shared Modal functions as if they are normal Python
functions, which might be more convenient.
Invoking with HTTPS
-------------------
Any non-Python application client can interact with deployed Modal applications
via [web endpoint functions](webhooks.html).
Anything able to make HTTPS requests can trigger a Modal web endpoint function.
Note that all deployed web endpoint functions have [a stable HTTPS URL](webhook-urls.html).
Some use cases for HTTPS invocation include:
* Calling Modal functions from a web browser client running Javascript
* Calling Modal functions from non-Python backend services (Java, Go, Ruby,
NodeJS, etc)
* Calling Modal functions using UNIX tools (`curl`, `wget`)
However, if the client of your Modal deployment is running Python, it’s better
to use the [Modal client library](https://pypi.org/project/modal/) to invoke
your Modal code.
For more detail on setting up functions for invocation over HTTP see the [web endpoints guide](webhooks.html).
Jupyter notebooks
=================
You can use the Modal client library in notebook environments like Jupyter! Just `import modal` and use as normal. You will likely need to use [`app.run`](apps.html#ephemeral-apps) to create an ephemeral app to run your functions:
```
# Cell 1
import modal

app = modal.App()

@app.function()
def my_function(x):
    ...

# Cell 2
with modal.enable_output():
    with app.run():
        my_function.remote(42)
```
Known issues
------------
* **Interactive shell and interactive functions are not supported.**
These can only be run within a live terminal session, so they are not
supported in notebooks.
* **Local and remote Python versions must match.**
When defining Modal Functions in a Jupyter notebook, the Function automatically
has `serialized=True` set. This implies that the versions of Python and any
third-party libraries used in your Modal container must match the versions you have locally,
so that the function can be deserialized remotely without errors.
If you encounter issues not documented above, try restarting the notebook kernel, as it may be
in a broken state, which is common in notebook development.
If the issue persists, contact us [in our Slack](../../slack.html).
We are working on removing these known issues so that writing Modal applications
in a notebook feels just like developing in regular Python modules and scripts.
Jupyter inside Modal
--------------------
You can run Jupyter in Modal using the `modal launch` command. For example:
```
$ modal launch jupyter --gpu a10g
```
That will start a Jupyter instance with an A10G GPU attached. You’ll be able to
access the app via a [Modal Tunnel URL](tunnels.html#tunnels-beta). Jupyter
will stop running whenever you stop the `modal` call in your terminal.
See `--help` for additional options.
Further examples
----------------
* [Basic demonstration of running Modal in a notebook](https://github.com/modal-labs/modal-examples/blob/main/11_notebooks/basic.ipynb)
* [Running Jupyter server within a Modal function](https://github.com/modal-labs/modal-examples/blob/main/11_notebooks/jupyter_inside_modal.py)
Failures and retries
====================
When you call a function over a sequence of inputs with [Function.map()](scale.html#parallel-execution-of-inputs), sometimes
errors can happen during function execution. Exceptions from within the remote
function are propagated to the caller, so you can handle them with a `try-except` statement (refer to [section on custom types](troubleshooting.html#custom-types-defined-in-__main__) for more on how to catch user-defined exceptions):
```
@app.function()
def f(i):
raise ValueError()
@app.local_entrypoint()
def main():
try:
for _ in f.map([1, 2, 3]):
pass
except ValueError:
print("Exception handled")
```
Function retries
----------------
You can configure Modal to automatically retry function failures if you set the `retries` option when declaring your function:
```
@app.function(retries=3)
def my_flaky_function():
pass
```
When used with `Function.map()`, each input is retried up to the max number of
retries specified.
The basic configuration shown provides a fixed 1s delay between retry attempts.
For fine-grained control over retry delays, including exponential backoff
configuration, use [`modal.Retries`](../reference/modal.Retries.html).
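For example, one possible configuration with exponential backoff might look like the sketch below (the parameter values are illustrative; see the `modal.Retries` reference for the full set of options):
```
import modal

app = modal.App()

@app.function(
    retries=modal.Retries(
        max_retries=4,           # retry each failed input up to 4 times
        initial_delay=1.0,       # wait 1s before the first retry
        backoff_coefficient=2.0, # then roughly 2s, 4s, 8s between retries
    )
)
def my_flaky_function():
    pass
```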
To treat exceptions as successful results and aggregate them in the results list instead, pass in [`return_exceptions=True`](scale.html#exceptions).
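As a rough sketch, reusing the failing function `f` from above:
```
@app.local_entrypoint()
def main():
    # With return_exceptions=True, failed inputs yield their exception objects
    # in the results list instead of raising in the caller.
    for result in f.map([1, 2, 3], return_exceptions=True):
        if isinstance(result, Exception):
            print(f"input failed: {result!r}")
        else:
            print(f"input succeeded: {result}")
```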
Container crashes
-----------------
If a `modal.Function` container crashes (either on start-up, e.g. while handling imports in global scope, or during execution, e.g. an out-of-memory error), Modal will reschedule the container and any work it was currently assigned.
For [ephemeral apps](apps.html#ephemeral-apps), container crashes will be retried until a failure rate is exceeded, after which all pending inputs will be failed and the exception will be propagated to the caller.
For [deployed apps](apps.html#deployed-apps), container crashes will be retried indefinitely, so as to not disrupt service. Modal will instead apply a crash-loop backoff and the rate of new container creation for the function will be slowed down. Crash-looping containers are displayed in the app dashboard.
Large dataset ingestion
=======================
This guide provides best practices for downloading, transforming, and storing large datasets within
Modal. A dataset is considered large if it contains hundreds of thousands of files and/or is over
100 GiB in size.
These guidelines ensure that large datasets can be ingested fully and reliably.
Configure your Function for heavy disk usage
--------------------------------------------
Large datasets should be downloaded and transformed using a `modal.Function` and stored
into a `modal.CloudBucketMount`. We recommend backing the latter with a Cloudflare R2 bucket,
because Cloudflare does not charge network egress fees and has lower GiB/month storage costs than AWS S3.
This `modal.Function` should specify a large `timeout` because large dataset processing can take hours,
and it should request a larger ephemeral disk in cases where the dataset being downloaded and processed
is hundreds of GiBs.
```
@app.function(
volumes={
"/mnt": modal.CloudBucketMount(
"datasets",
bucket_endpoint_url="https://abc123example.r2.cloudflarestorage.com",
secret=modal.Secret.from_name("cloudflare-r2-datasets"),
)
},
ephemeral_disk=1000 * 1000, # 1 TiB
timeout=60 * 60 * 12, # 12 hours
)
def download_and_transform() -> None:
...
```
### Use compressed archives on Modal Volumes
`modal.Volume`s are designed for storing tens of thousands of individual files,
but not for hundreds of thousands or millions of files.
However, they can still be used for storing large datasets if files are first combined and compressed
in a dataset transformation step before saving them into the Volume.
See the [transforming](#transforming) section below for more details.
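As a rough sketch of that combine-and-compress step (the paths and names here are purely illustrative, and assume the Volume is mounted at `/vol`):
```
import pathlib
import tarfile

src_dir = pathlib.Path("/tmp/dataset/")             # many small files, staged locally
archive_path = pathlib.Path("/vol/dataset.tar.gz")  # single compressed archive on the Volume

# Bundle the files into one gzip-compressed tar archive before writing to the Volume.
with tarfile.open(archive_path, mode="w:gz") as archive:
    archive.add(src_dir, arcname="dataset")
```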
Experimentation
---------------
Downloading and transforming large datasets can be fiddly. While iterating on a reliable ingestion program
it is recommended to start a long-running `modal.Function` serving a JupyterHub server so that you can maintain
disk state in the face of application errors.
See the [running Jupyter server within a Modal function](https://github.com/modal-labs/modal-examples/blob/main/11_notebooks/jupyter_inside_modal.py) example as base code.
Downloading
-----------
The raw dataset data should first be downloaded into the container at `/tmp/` and not placed
directly into the mounted volume. This serves a couple of purposes.
1. Certain download libraries and tools (e.g. `wget`) perform filesystem operations not supported properly by `CloudBucketMount`.
2. The raw dataset data may need to be transformed before use, in which case it is wasteful to store it permanently.
This snippet shows the basic download-and-copy procedure:
```
import pathlib
import shutil
import subprocess
tmp_path = pathlib.Path("/tmp/imagenet/")
vol_path = pathlib.Path("/mnt/imagenet/")
filename = "imagenet-object-localization-challenge.zip"
# 1. Download into /tmp/
subprocess.run(
f"kaggle competitions download -c imagenet-object-localization-challenge --path {tmp_path}",
shell=True,
check=True
)
vol_path.mkdir(exist_ok=True)
# 2. Copy (without transform) into mounted volume.
shutil.copyfile(tmp_path / filename, vol_path / filename)
```
Transforming
------------
When ingesting a large dataset it is sometimes necessary to transform it before storage, so that it is in
an optimal format for loading at runtime. A common kind of necessary transform is gzip decompression. Very large
datasets are often gzipped for storage and network transmission efficiency, but gzip decompression (80 MiB/s)
is hundreds of times slower than reading from a solid state drive (SSD)
and should be done once before storage to avoid decompressing on every read against the dataset.
Transformations should be performed after storing the raw dataset in `/tmp/`. Performing transformations almost always increases container disk usage, which is where the [`ephemeral_disk` parameter](../reference/modal.App.html#function) becomes important. For example, a
100 GiB raw, compressed dataset may decompress into 500 GiB, occupying 600 GiB of container disk space.
Transformations should also typically be performed against `/tmp/`. This is because
1. transforms can be IO intensive and IO latency is lower against local SSD.
2. transforms can create temporary data which is wasteful to store permanently.
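Here is a minimal sketch of that pattern, continuing the ImageNet example from the downloading section above: decompression happens against `/tmp/`, and only the transformed data is copied into the mounted bucket.
```
import pathlib
import shutil
import zipfile

tmp_path = pathlib.Path("/tmp/imagenet/")
vol_path = pathlib.Path("/mnt/imagenet/")
filename = "imagenet-object-localization-challenge.zip"

# 1. Decompress against local SSD in /tmp/, where IO latency is lowest.
extracted_path = tmp_path / "extracted"
with zipfile.ZipFile(tmp_path / filename) as zf:
    zf.extractall(extracted_path)

# 2. Copy only the decompressed data into the mounted bucket for permanent storage.
shutil.copytree(extracted_path, vol_path / "extracted", dirs_exist_ok=True)
```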
Examples
--------
The best practices offered in this guide are demonstrated in the [`modal-examples` repository](https://github.com/modal-labs/modal-examples/tree/main/12_datasets).
The examples include these popular large datasets:
* [ImageNet](https://www.image-net.org/), the image labeling dataset that kicked off the deep learning revolution
* [COCO](https://cocodataset.org/#download), the Common Objects in COntext dataset of densely-labeled images
* [LAION-400M](https://laion.ai/blog/laion-400-open-dataset/), the Stable Diffusion training dataset
* Data derived from the [Big “Fantastic” Database](https://bfd.mmseqs.com/), [Protein Data Bank](https://www.wwpdb.org/), and [UniProt Database](https://www.uniprot.org/) used in training the [RoseTTAFold](https://github.com/RosettaCommons/RoseTTAFold) protein structure model
Proxies (beta)
==============
You can securely connect with resources in your private network
using a Modal Proxy. Proxies are a secure tunnel between
Apps and exit nodes with static IPs. You can allow-list those static IPs
in your network firewall, making sure that only traffic originating from these
IP addresses is allowed into your network.
Proxies are unique and not shared between workspaces. All traffic
between your Apps and the Proxy server is encrypted using [Wireguard](https://www.wireguard.com/).
Modal Proxies are built on top of [vprox](https://github.com/modal-labs/vprox),
a Modal open-source project used to create highly available proxy servers
using Wireguard.
*Modal Proxies are in beta. Please let us know if you run into issues.*
Creating a Proxy
----------------
Proxies are available for [Team Plan](../../pricing.html) or [Enterprise](../../pricing.html) users.
You can create Proxies in your workspace [Settings](../../login%EF%B9%96next=%EA%A4%B7settings.html) page.
Team Plan users can create one Proxy and Enterprise users three Proxies. Each Proxy
can have a maximum of five static IP addresses.
Please reach out to [[email protected]](mailto:[email protected]) if you need greater limits.
Using a Proxy
-------------
After a Proxy is online, add it to a Modal Function with the argument `proxy=Proxy.from_name("<your-proxy>")`. For example:
```
import modal
import subprocess
app = modal.App(image=modal.Image.debian_slim().apt_install("curl"))
@app.function(proxy=modal.Proxy.from_name("<your-proxy>"))
def my_ip():
subprocess.run(["curl", "-s", "ifconfig.me"])
@app.local_entrypoint()
def main():
my_ip.remote()
```
All network traffic from your Function will now use the Proxy as a tunnel.
The program above will always print the same IP address independent
of where it runs in Modal’s infrastructure. If that same program
were to run without a Proxy, it would print a different IP
address depending on where it runs.
Proxy performance
-----------------
All traffic that goes through a Proxy is encrypted by Wireguard. This adds
latency to your Function’s networking. If you are experiencing performance-related
networking issues with Proxies, first add more IP addresses to your
Proxy (see [Adding more IP addresses to a Proxy](#adding-more-ip-addresses-to-a-proxy)).
Adding more IP addresses to a Proxy
-----------------------------------
Proxies support up to five static IP addresses. Adding IP addresses improves
throughput linearly.
You can add an IP address to your workspace in [Settings](../../login%EF%B9%96next=%EA%A4%B7settings.html) > Proxies.
Select the desired Proxy and add a new IP.
If a Proxy has multiple IPs, Modal will randomly pick one when running your Function.
Proxies and Sandboxes
---------------------
Proxies can also be used with [Sandboxes](sandbox.html). For example:
```
import modal
app = modal.App.lookup("sandbox-proxy", create_if_missing=True)
sb = modal.Sandbox.create(
app=app,
image=modal.Image.debian_slim().apt_install("curl"),
proxy=modal.Proxy.from_name("<your-proxy>"))
process = sb.exec("curl", "-s", "https://ifconfig.me")
stdout = process.stdout.read()
print(stdout)
sb.terminate()
```
Similarly to our Function implementation, this Sandbox program will
always print the same IP address.
Reserving CPU and memory
========================
Each Modal container has a default reservation of 0.125 CPU cores and 128 MiB of memory.
Containers can exceed this minimum if the worker has available CPU or memory.
You can also guarantee access to more resources by requesting a higher reservation.
CPU cores
---------
If you have code that must run on a larger number of cores, you can
request that using the `cpu` argument. This allows you to specify a
floating-point number of CPU cores:
```
import modal
app = modal.App()
@app.function(cpu=8.0)
def my_function():
# code here will have access to at least 8.0 cores
...
```
Memory
------
If you have code that needs more guaranteed memory, you can request it using the `memory` argument. This expects an integer number of megabytes:
```
import modal
app = modal.App()
@app.function(memory=32768)
def my_function():
# code here will have access to at least 32 GiB of RAM
...
```
How much can I request?
-----------------------
For both CPU and memory, a maximum is enforced at function creation time to
ensure your application can be scheduled for execution. Requests exceeding the
maximum will be rejected with an [`InvalidError`](../reference/modal.exception.html#modalexceptioninvaliderror).
As the platform grows, we plan to support larger CPU and memory reservations.
Billing
-------
For CPU and memory, you’ll be charged based on whichever is higher: your reservation or actual usage.
Disk requests are billed by increasing the memory request at a 20:1 ratio. For example, requesting 500 GiB of disk will increase the memory request to 25 GiB, if it is not already set higher.
Resource limits
---------------
### CPU limits
Modal containers have a default soft CPU limit that is set at 16 physical cores above the CPU request.
Given that the default CPU request is 0.125 cores the default soft CPU limit is 16.125 cores.
Above this limit the host will begin to throttle the CPU usage of the container.
You can alternatively set the CPU limit explicitly.
```
cpu_request = 1.0
cpu_limit = 4.0
@app.function(cpu=(cpu_request, cpu_limit))
def f():
...
```
### Memory limits
Modal containers can have a hard memory limit, which will ‘Out of Memory’ (OOM) kill
containers that attempt to exceed it. This functionality is useful when a container
has a serious memory leak. You can set the limit and have the container killed to avoid paying
for the leaked GBs of memory.
```
mem_request = 1024
mem_limit = 2048
@app.function(
memory=(mem_request, mem_limit),
)
def f():
...
```
Specify this limit using the [`memory` parameter](../reference/modal.App.html#function) on Modal Functions.
### Disk limits
Running Modal containers have access to many GBs of SSD disk, but the amount
of writes is limited by:
1. The size of the underlying worker’s SSD disk capacity
2. A per-container disk quota that is set in the 100s of GBs.
Hitting either limit will cause the container’s disk writes to be rejected, which
typically manifests as an `OSError`.
Increased disk sizes can be requested with the [`ephemeral_disk` parameter](../reference/modal.App.html#function). The maximum
disk size is 3.0 TiB (3,145,728 MiB). Larger disks are intended to be used for [dataset processing](dataset-ingestion.html).
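As a minimal sketch, matching the usage shown in the dataset ingestion example above where the value is given in MiB:
```
import modal

app = modal.App()

@app.function(ephemeral_disk=2 * 1024 * 1024)  # request ~2 TiB of ephemeral disk (value in MiB)
def process_large_dataset():
    ...
```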
Workspaces
==========
A **workspace** is an area where a user can deploy Modal apps and other
resources. There are two types of workspaces: personal and shared. After a new
user has signed up to Modal, a personal workspace is automatically created for
them. The name of the personal workspace is based on your GitHub username, but
it might be randomly generated if already taken or invalid.
To collaborate with others, a new shared workspace needs to be created.
Create a Workspace
------------------
All additional workspaces are shared workspaces, meaning you can invite others
by email to collaborate with you. There are two ways to create a Modal workspace
on the [settings](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7workspaces.html) page.
![view of workspaces creation interface](https://modal-cdn.com/cdnbot/create-new-workspace-viewk0ka46_7_800f2053.webp)
1. Create from a [GitHub organization](https://docs.github.com/en/organizations). This allows members of the GitHub organization to auto-join the workspace.
2. Create from scratch. You can invite anyone to your workspace.
If you’re interested in having a workspace associated with your Okta
organization, then check out our [Okta SSO docs](okta-sso.html).
If you’re interested in using SSO through Google or other providers, then please reach out to us at [[email protected]](mailto:[email protected]).
Auto-joining a Workspace associated with a GitHub organization
--------------------------------------------------------------
Note: This is only relevant for Workspaces created from a GitHub organization.
Users can automatically join a Workspace on their [Workspace settings page](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7workspaces.html) if they are a member of the GitHub organization associated with the Workspace.
To turn off this functionality a Workspace Manager can disable it on the **Workspace Management** tab of their Workspace’s settings page.
Inviting new Workspace members
------------------------------
To invite a new Workspace member, you can visit the [settings](../../login%EF%B9%96next=%EA%A4%B7settings.html) page
and navigate to the members tab for the appropriate workspace.
You can either send an email invite or share an invite link. Both existing Modal
users and non-existing users can use the links to join your workspace. If they
are a new user a Modal account will be created for them.
![invite member section](../../_app/immutable/assets/invite-member.CHnml0eT.png)
Create a token for a Workspace
------------------------------
To interact with a Workspace’s resources programmatically, you need to add an
API token for that Workspace. Your existing API tokens are displayed on [the settings page](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7tokens.html) and new API tokens can be added for a
particular Workspace.
After adding a token for a Workspace to your Modal config file you can activate
that Workspace’s profile using the CLI (see below).
As a manager or workspace owner, you can manage active tokens for a workspace on [the member tokens page](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7member-tokens.html). For more information on API
token management, see the [documentation about configuration](../reference/modal.config.html).
Switching active Workspace
--------------------------
When on the dashboard or using the CLI, the active profile determines which
personal or organizational Workspace is associated with your actions.
### Dashboard
You can switch between organization Workspaces and your Personal Workspace by
using the workspace selector at the top of [the dashboard](../../login%EF%B9%96next=%EA%A4%B7apps.html).
### CLI
To switch the Workspace associated with CLI commands, use `modal profile activate`.
Administrating workspace members
--------------------------------
Workspaces have three different levels of access privileges:
* Owner
* Manager
* User
The user that creates a workspace is automatically set as the **Owner** for that
workspace. The owner can assign any other roles within the workspace, as well as
remove other members of the workspace.
A **Manager** within a workspace can assign all roles except **Owner** and can
also remove other members of the workspace.
A **User** of a workspace cannot assign any access privileges within the
workspace but can otherwise perform any action, like running and deploying apps
and modifying Secrets.
As an Owner or Manager you can administrate the access privileges of other
members on the members tab in [settings](../../login%EF%B9%96next=%EA%A4%B7settings.html).
Leaving a Workspace
-------------------
To leave a workspace, navigate to [the settings page](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7workspaces.html) and
click “Leave” on a listed Workspace. There must be at least one owner assigned
to a workspace.
Snapshots
=========
Sandboxes support snapshotting, allowing you to save your Sandbox’s state
and restore it later. This is useful for:
* Creating custom environments for your Sandboxes to run in
* Backing up your Sandbox’s state for debugging
* Running large-scale experiments with the same initial state
* Branching your Sandbox’s state to test different code changes independently
Filesystem Snapshots
--------------------
Filesystem Snapshots are copies of the Sandbox’s filesystem at a given point in time.
These Snapshots are [Images](../reference/modal.Image.html) and can be used to create
new Sandboxes.
To create a Filesystem Snapshot, you can use the [`Sandbox.snapshot_filesystem()`](../reference/modal.Sandbox.html#snapshot_filesystem) method:
```
import modal
app = modal.App.lookup("sandbox-fs-snapshot-test", create_if_missing=True)
sb = modal.Sandbox.create(app=app)
p = sb.exec("bash", "-c", "echo 'test' > /test")
p.wait()
assert p.returncode == 0, "failed to write to file"
image = sb.snapshot_filesystem()
sb.terminate()
sb2 = modal.Sandbox.create(image=image, app=app)
p2 = sb2.exec("bash", "-c", "cat /test")
assert p2.stdout.read().strip() == "test"
sb2.terminate()
```
Filesystem Snapshots are optimized for performance: they are calculated as the difference
from your base image, so only modified files are stored. Restoring a Filesystem Snapshot
utilizes the same infrastructure we use to get fast cold starts for your Sandboxes.
Memory Snapshots
----------------
[Sandboxes memory snapshots](sandbox-memory-snapshots.html) are in early preview.
Contact us if this is something you’re interested in!
Images
======
This guide walks you through how to define the environment your Modal Functions run in.
These environments are called *containers*. Containers are like light-weight
virtual machines — container engines use [operating system tricks](https://earthly.dev/blog/chroot/) to isolate programs
from each other (“containing” them), making them work as though they were
running on their own hardware with their own filesystem. This makes execution
environments more reproducible, for example by preventing accidental
cross-contamination of environments on the same machine. For added security,
Modal runs containers using the sandboxed [gVisor container runtime](https://cloud.google.com/blog/products/identity-security/open-sourcing-gvisor-a-sandboxed-container-runtime).
Containers are started up from a stored “snapshot” of their filesystem state
called an *image*. Producing the image for a container is called *building* the
image.
By default, Modal Functions are executed in a [Debian Linux](https://en.wikipedia.org/wiki/Debian) container with a basic
Python installation of the same minor version `v3.x` as your local Python
interpreter.
To make your Apps and Functions useful, you will probably need some third party system packages
or Python libraries. Modal provides a number of options to customize your container images at
different levels of abstraction and granularity, from high-level convenience
methods like `pip_install` through wrappers of core container image build
features like `RUN` and `ENV` to full on “bring-your-own-Dockerfile”. We’ll
cover each of these in this guide, along with tips and tricks for building
Images effectively when using each tool.
The typical flow for defining an image in Modal is [method chaining](https://jugad2.blogspot.com/2016/02/examples-of-method-chaining-in-python.html) starting from a base image, like this:
```
import modal
image = (
modal.Image.debian_slim(python_version="3.10")
.apt_install("git")
.pip_install("torch==2.6.0")
.env({"HALT_AND_CATCH_FIRE": "0"})
.run_commands("git clone https://github.com/modal-labs/agi && echo 'ready to go!'")
)
```
In addition to being Pythonic and clean, this also matches the onion-like [layerwise build process](https://docs.docker.com/build/guide/layers/) of
container images.
Add Python packages with `pip_install`
--------------------------------------
The simplest and most common container modification is to add some third party
Python package, like [`pandas`](https://pandas.pydata.org/).
You can add Python packages to the environment by passing all the packages you
need to the [`pip_install`](../reference/modal.Image.html#pip_install) method of
an image.
You can include [typical Python dependency version specifiers](https://peps.python.org/pep-0508/),
like `"torch <= 2.0"`, in the arguments. But we recommend pinning dependencies
tightly, like `"torch == 1.9.1"`, to improve the reproducibility and robustness
of your builds.
```
import modal
datascience_image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install("pandas==2.2.0", "numpy")
)
@app.function(image=datascience_image)
def my_function():
import pandas as pd
import numpy as np
df = pd.DataFrame()
...
```
Note that because you can define a different environment for each and every
Modal Function if you so choose, you don’t need to worry about virtual
environment management. Containers make for much better separation of concerns!
If you want to run a specific version of Python remotely rather than just
matching the one you’re running locally, provide the `python_version` as a
string when constructing the base image, like we did above.
Add local files with `add_local_dir` and `add_local_file`
---------------------------------------------------------
If you want to forward files from your local system, you can do that using the `image.add_local_dir` and `image.add_local_file` image builder methods.
```
image = modal.Image.debian_slim().add_local_dir("/user/erikbern/.aws", remote_path="/root/.aws")
```
By default, these files are added to your container as it starts up rather than introducing
a new image layer. This means that redeployment after making changes is very fast, but it
also means you can’t run additional build steps on top of these files. You can pass a `copy=True` argument
to the `add_local_` methods to instead force the files to be included in a built image.
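For example, a sketch of forcing local files into a built layer so that later build steps can read them (the paths are illustrative):
```
import modal

image = (
    modal.Image.debian_slim()
    # copy=True bakes the files into an image layer at build time...
    .add_local_dir("assets", remote_path="/assets", copy=True)
    # ...so subsequent build steps can see them.
    .run_commands("ls /assets")
)
```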
### Adding local Python modules
There is a convenience method for the special case of adding local Python modules to
the container: [`Image.add_local_python_source`](../reference/modal.Image.html#add_local_python_source)
The difference from `add_local_dir` is that `add_local_python_source` takes module names as arguments
instead of a file system path and looks up the local package’s or module’s location via Python’s importing
mechanism. The files are then added to directories that make them importable in containers in the
same way as they are locally.
This is mostly intended for pure Python auxiliary modules that are part of your project and that your code imports,
whereas third party packages should be installed via [`Image.pip_install()`](../reference/modal.Image.html#pip_install) or similar.
```
import modal
app = modal.App()
image_with_module = modal.Image.debian_slim().add_local_python_source("my_local_module")
@app.function(image=image_with_module)
def f():
import my_local_module # this will now work in containers
my_local_module.do_stuff()
```
### What if I have different Python packages locally and remotely?
You might want to use packages inside your Modal code that you don’t have on
your local computer. In the example above, we build a container that uses `pandas`. But if we don’t have `pandas` locally, on the computer launching the
Modal job, we can’t put `import pandas` at the top of the script, since it would
cause an `ImportError`.
The easiest solution to this is to put `import pandas` in the function body
instead, as you can see above. This means that `pandas` is only imported when
running inside the remote Modal container, which has `pandas` installed.
Be careful about what you return from Modal Functions that have different
packages installed than the ones you have locally! Modal Functions return Python
objects, like `pandas.DataFrame`s, and if your local machine doesn’t have `pandas` installed, it won’t be able to handle a `pandas` object (the error
message you see will mention [serialization](https://hazelcast.com/glossary/serialization/)/[deserialization](https://hazelcast.com/glossary/deserialization/)).
If you have a lot of functions and a lot of Python packages, you might want to
keep the imports in the global scope so that every function can use the same
imports. In that case, you can use the [`imports()`](../reference/modal.Image.html#imports) context manager:
```
import modal
pandas_image = modal.Image.debian_slim().pip_install("pandas", "numpy")
with pandas_image.imports():
import pandas as pd
import numpy as np
@app.function(image=pandas_image)
def my_function():
df = pd.DataFrame()
```
Run shell commands with `.run_commands`
---------------------------------------
You can also supply shell commands that should be executed when building the
container image.
You might use this to preload custom assets, like model parameters, so that they
don’t need to be retrieved when Functions start up:
```
import modal
image_with_model = (
modal.Image.debian_slim().apt_install("curl").run_commands(
"curl -O https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalcatface.xml",
)
)
@app.function(image=image_with_model)
def find_cats():
content = open("/haarcascade_frontalcatface.xml").read()
...
```
You can also use this command to install Python packages. For example,
you can use it to install packages with [`uv`](https://github.com/astral-sh/uv),
which can be substantially faster than `pip`:
```
import modal
image = (
modal.Image.debian_slim()
.pip_install("uv")
.run_commands("uv pip install --system --compile-bytecode torch")
)
```
Note that it is important to pass `--compile-bytecode` when using `uv` on Modal.
Unlike `pip`, `uv` does not produce [Python bytecode](https://realpython.com/ref/glossary/bytecode/) (the contents of the `.pyc` files in those `__pycache__` folders you may have noticed in your Python projects)
by default when packages are installed. On a serverless platform like Modal, skipping that work at installation time
means it instead has to be done every time a container starts.
Run a Python function during your build with `.run_function`
------------------------------------------------------------
Instead of using shell commands, you can also run a Python function as an image
build step using the [`Image.run_function`](../reference/modal.Image.html#run_function) method. For
example, you can use this to download model parameters from Hugging Face into
your Image:
```
import os
import modal
def download_models() -> None:
import diffusers
model_name = "segmind/small-sd"
pipe = diffusers.StableDiffusionPipeline.from_pretrained(
model_name, use_auth_token=os.environ["HF_TOKEN"]
)
pipe.save_pretrained("/model")
image = (
modal.Image.debian_slim()
.pip_install("diffusers[torch]", "transformers", "ftfy", "accelerate")
.run_function(download_models, secrets=[modal.Secret.from_name("huggingface-secret")])
)
```
Any kwargs accepted by [`@app.function`](../reference/modal.App.html#function) ([`Volume`s](volumes.html), and specifications of
resources like [GPUs](gpu.html)) can be supplied here.
Essentially, this is equivalent to running a Modal Function and snapshotting the
resulting filesystem as an image.
Whenever you change other features of your image, like the base image or the
version of a Python package, the image will automatically be rebuilt the next
time it is used. This is a bit more complicated when changing the contents of
functions. See the [reference documentation](../reference/modal.Image.html#run_function) for details.
Attach GPUs during setup
------------------------
If a step in the setup of your container image should be run on an instance with
a GPU (e.g., so that a package can query the GPU to set compilation flags), pass a
desired GPU type when defining that step:
```
import modal
image = (
modal.Image.debian_slim()
.pip_install("bitsandbytes", gpu="H100")
)
```
Use `mamba` instead of `pip` with `micromamba_install`
------------------------------------------------------
`pip` installs Python packages, but some Python workloads require the
coordinated installation of system packages as well. The `mamba` package manager
can install both. Modal provides a pre-built [Micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html) base image that makes it easy to work with `micromamba`:
```
import modal
app = modal.App("bayes-pgm")
numpyro_pymc_image = (
modal.Image.micromamba()
.micromamba_install("pymc==5.10.4", "numpyro==0.13.2", channels=["conda-forge"])
)
@app.function(image=numpyro_pymc_image)
def sample():
import pymc as pm
import numpyro as np
print(f"Running on PyMC v{pm.__version__} with JAX/numpyro v{np.__version__} backend")
...
```
Use an existing container image with `.from_registry`
-----------------------------------------------------
You don’t always need to start from scratch! Public registries like [Docker Hub](https://hub.docker.com/) have many pre-built container images for
common software packages.
You can use any public image in your function using [`Image.from_registry`](../reference/modal.Image.html#from_registry), so long as:
* Python 3.9 or later is installed on the `$PATH` as `python`
* `pip` is installed correctly
* The image is built for the [`linux/amd64` platform](https://unix.stackexchange.com/questions/53415/why-are-64-bit-distros-often-called-amd64)
* The image has a [valid `ENTRYPOINT`](#entrypoint)
```
import modal
sklearn_image = modal.Image.from_registry("huanjason/scikit-learn")
@app.function(image=sklearn_image)
def fit_knn():
from sklearn.neighbors import KNeighborsClassifier
...
```
If an existing image does not have either `python` or `pip` set up properly, you
can still use it. Just provide a version number as the `add_python` argument to
install a reproducible [standalone build](https://github.com/indygreg/python-build-standalone) of Python:
```
import modal
image1 = modal.Image.from_registry("ubuntu:22.04", add_python="3.11")
image2 = modal.Image.from_registry("gisops/valhalla:latest", add_python="3.11")
```
The `from_registry` method can load images from all public registries, such as [Nvidia’s `nvcr.io`](https://catalog.ngc.nvidia.com/containers), [AWS ECR](https://aws.amazon.com/ecr/), and [GitHub’s `ghcr.io`](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry).
We also support access to [private AWS ECR and GCP Artifact Registry images](private-registries.html).
Bring your own image definition with `.from_dockerfile`
-------------------------------------------------------
Sometimes, you might already have a container image defined in a Dockerfile.
You can define an Image with a Dockerfile using [`Image.from_dockerfile`](../reference/modal.Image.html#from_dockerfile).
It takes a path to an existing Dockerfile.
For instance, we might write a Dockerfile that adds scikit-learn to the official Python image:
```
FROM python:3.9
RUN pip install scikit-learn
```
and then define a Modal Image with it:
```
import modal
dockerfile_image = modal.Image.from_dockerfile("Dockerfile")
@app.function(image=dockerfile_image)
def fit():
import sklearn
...
```
Note that you can still do method chaining to extend this image!
### Dockerfile command compatibility
Since Modal doesn’t use Docker to build containers, we have our own
implementation of the [Dockerfile specification](https://docs.docker.com/engine/reference/builder/).
Most Dockerfiles should work out of the box, but there are some differences to
be aware of.
First, a few minor Dockerfile commands and flags have not been implemented yet.
Please reach out to us if your use case requires any of these.
Next, there are some command-specific things that may be useful when porting a
Dockerfile to Modal.
#### `ENTRYPOINT`
While the [`ENTRYPOINT`](https://docs.docker.com/engine/reference/builder/#entrypoint) command is supported, there is an additional constraint to the entrypoint script
provided: it must also `exec` the arguments passed to it at some point. This is
so that Modal’s own Python entrypoint can run after your own. Most entrypoint
scripts in Docker containers are wrappers over other scripts, so this is likely
already the case.
If you wish to write your own entrypoint script, you can use the following as a
template:
```
#!/usr/bin/env bash
# Your custom startup commands here.
exec "$@" # Runs the command passed to the entrypoint script.
```
If the above file is saved as `/usr/bin/my_entrypoint.sh` in your container,
then you can register it as an entrypoint with `ENTRYPOINT ["/usr/bin/my_entrypoint.sh"]` in your Dockerfile, or with [`entrypoint`](../reference/modal.Image.html#entrypoint) as an
Image build step.
```
import modal
image = (
modal.Image.debian_slim()
.pip_install("foo")
.entrypoint(["/usr/bin/my_entrypoint.sh"])
)
```
#### `ENV`
We currently don’t support default values in [interpolations](https://docs.docker.com/compose/compose-file/12-interpolation/),
such as `${VAR:-default}`.
Image caching and rebuilds
--------------------------
Modal uses the definition of an Image to determine whether it needs to be
rebuilt. If the definition hasn’t changed since the last time you ran or
deployed your App, the previous version will be pulled from the cache.
Images are cached per layer (i.e., per `Image` method call), and breaking
the cache on a single layer will cause cascading rebuilds for all subsequent
layers. You can shorten iteration cycles by defining frequently-changing
layers last so that the cached version of all other layers can be used.
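For example, a sketch of ordering layers so that the slow, stable steps stay cached while the frequently-edited step comes last (the final package name is hypothetical):
```
import modal

image = (
    modal.Image.debian_slim()
    .apt_install("git")                      # stable system dependency: rarely changes
    .pip_install("torch==2.6.0")             # slow to install but pinned, so it stays cached
    .pip_install("my-rapidly-changing-lib")  # hypothetical frequently-updated dependency: keep it last
)
```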
In some cases, you may want to force an Image to rebuild, even if the
definition hasn’t changed. You can do this by adding the `force_build=True` argument to any of the Image building methods.
```
import modal
image = (
modal.Image.debian_slim()
.apt_install("git")
.pip_install("slack-sdk", force_build=True)
.run_commands("echo hi")
)
```
As in other cases where a layer’s definition changes, both the `pip_install` and `run_commands` layers will rebuild, but the `apt_install` will not. Remember to
remove `force_build=True` after you’ve rebuilt the Image, or it will
rebuild every time you run your code.
Alternatively, you can set the `MODAL_FORCE_BUILD` environment variable (e.g. `MODAL_FORCE_BUILD=1 modal run ...`) to rebuild all images attached to your App.
But note that when you rebuild a base layer, the cache will be invalidated for *all* Images that depend on it, and they will rebuild the next time you run or deploy
any App that uses that base.
Image builder updates
---------------------
Because changes to base images will cause cascading rebuilds, Modal is
conservative about updating the base definitions that we provide. But many
things are baked into these definitions, like the specific versions of the Image
OS, the included Python, and the Modal client dependencies.
We provide a separate mechanism for keeping base images up-to-date without
causing unpredictable rebuilds: the “Image Builder Version”. This is a workspace
level-configuration that will be used for every Image built in your workspace.
We release a new Image Builder Version every few months but allow you to update
your workspace’s configuration when convenient. After updating, your next
deployment will take longer, because your Images will rebuild. You may also
encounter problems, especially if your Image definition does not pin the version
of the third-party libraries that it installs (as your new Image will get the
latest version of these libraries, which may contain breaking changes).
You can set the Image Builder Version for your workspace by going to your [workspace settings](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7image-config.html). This page also documents the
important updates in each version.
See it in action
[Registry image for Algolia indexing](../examples/algolia_indexer.html)
Fast pull from registry
=======================
The performance of pulling public and private images from registries into Modal
can be significantly improved by adopting the [eStargz](https://github.com/containerd/stargz-snapshotter/blob/main/docs/estargz.md) compression format.
By applying eStargz compression during your image build and push, Modal will be much
more efficient at pulling down your image from the registry.
How to use estargz
------------------
If you have [Buildkit](https://docs.docker.com/build/buildkit/) version greater than `0.10.0`, adopting `estargz` is as simple as
adding some flags to your `docker buildx build` command:
* The `type=registry` flag will instruct BuildKit to push the image after building.
+ If you do not push the image immediately after the build and instead attempt to push it later with `docker push`, the image will be converted to a standard gzip image.
* `compression=estargz` specifies that we are using the [eStargz](https://github.com/containerd/stargz-snapshotter/blob/main/docs/estargz.md) compression format.
* `oci-mediatypes=true` specifies that we are using the OCI media types, which is required for eStargz.
* `force-compression=true` will recompress the entire image and convert the base image to eStargz if it is not already.
```
docker buildx build --tag "<registry>/<namespace>/<repo>:<version>" \
--output type=registry,compression=estargz,force-compression=true,oci-mediatypes=true \
.
```
Then reference the container image as normal in your Modal code.
```
app = modal.App(
"example-estargz-pull",
image=modal.Image.from_registry(
"public.ecr.aws/modal/estargz-example-images:text-generation-v1-esgz"
)
)
```
At build time you should see the eStargz-enabled puller activate:
```
Building image im-TinABCTIf12345ydEwTXYZ
=> Step 0: FROM public.ecr.aws/modal/estargz-example-images:text-generation-v1-esgz
Using estargz to speed up image pull (index loaded in 1.86s)...
Progress: 10% complete... (1.11s elapsed)
Progress: 20% complete... (3.10s elapsed)
Progress: 30% complete... (4.18s elapsed)
Progress: 40% complete... (4.76s elapsed)
Progress: 50% complete... (5.51s elapsed)
Progress: 62% complete... (6.17s elapsed)
Progress: 74% complete... (6.99s elapsed)
Progress: 81% complete... (7.23s elapsed)
Progress: 99% complete... (8.90s elapsed)
Progress: 100% complete... (8.90s elapsed)
Copying image...
Copied image in 5.81s
```
Supported registries
--------------------
Currently, Modal supports fast eStargz image pulls from the following registries:
* AWS Elastic Container Registry (ECR)
* Docker Hub (docker.io)
* Google Artifact Registry (gcr.io, pkg.dev)
We are working on adding support for GitHub Container Registry (ghcr.io).
Secrets
=======
Securely provide credentials and other sensitive information to your Modal Functions with Secrets.
You can create and edit Secrets via
the [dashboard](../../login%EF%B9%96next=%EA%A4%B7secrets.html),
the command line interface ([`modal secret`](../reference/cli/secret.html)), and
programmatically from Python code ([`modal.Secret`](../reference/modal.Secret.html)).
To inject Secrets into the container running your Function, add the `secrets=[...]` argument to your `app.function` or `app.cls` decoration.
Deploy Secrets from the Modal Dashboard
---------------------------------------
The most common way to create a Modal Secret is to use the [Secrets panel of the Modal dashboard](../../login%EF%B9%96next=%EA%A4%B7secrets.html),
which also shows any existing Secrets.
When you create a new Secret, you’ll be prompted with a number of templates to help you get started.
These templates demonstrate standard formats for credentials for everything from Postgres and MongoDB
to Weights & Biases and Hugging Face.
Use Secrets in your Modal Apps
------------------------------
You can then use your Secret by constructing it `from_name` when defining a Modal App
and then accessing its contents as environment variables.
For example, if you have a Secret called `secret-keys` containing the key `MY_PASSWORD`:
```
@app.function(secrets=[modal.Secret.from_name("secret-keys")])
def some_function():
import os
secret_key = os.environ["MY_PASSWORD"]
...
```
Each Secret can contain multiple keys and values but you can also inject
multiple Secrets, allowing you to separate Secrets into smaller reusable units:
```
@app.function(secrets=[
modal.Secret.from_name("my-secret-name"),
modal.Secret.from_name("other-secret"),
])
def other_function():
...
```
The Secrets are applied in order, so key-values from later `modal.Secret` objects in the list will overwrite earlier key-values in the case of a clash.
For example, if both `modal.Secret` objects above contained the key `FOO`, then
the value from `"other-secret"` would always be present in `os.environ["FOO"]`.
Create Secrets programmatically
-------------------------------
In addition to defining Secrets on the web dashboard, you can
programmatically create a Secret directly in your script and send it along to
your Function using `Secret.from_dict(...)`. This can be useful if you want to
send Secrets from your local development machine to the remote Modal App.
```
import os
if modal.is_local():
local_secret = modal.Secret.from_dict({"FOO": os.environ["LOCAL_FOO"]})
else:
local_secret = modal.Secret.from_dict({})
@app.function(secrets=[local_secret])
def some_function():
import os
print(os.environ["FOO"])
```
If you have [`python-dotenv`](https://pypi.org/project/python-dotenv/) installed,
you can also use `Secret.from_dotenv()` to create a Secret from the variables in a `.env` file:
```
@app.function(secrets=[modal.Secret.from_dotenv()])
def some_other_function():
print(os.environ["USERNAME"])
```
Interact with Secrets from the command line
-------------------------------------------
You can create, list, and delete your Modal Secrets with the `modal secret` command line interface.
View your Secrets and their timestamps with
```
modal secret list
```
Create a new Secret by passing `{KEY}={VALUE}` pairs to `modal secret create`:
```
modal secret create database-secret PGHOST=uri PGPORT=5432 PGUSER=admin PGPASSWORD=hunter2
```
Remove Secrets by passing their name to `modal secret delete`:
```
modal secret delete database-secret
```
See it in action
[OpenAI Secret for LangChain RAG](../examples/potus_speech_qanda.html)
[Write to Google Sheets](../examples/db_to_sheet.html)
Security and privacy at Modal
=============================
This document outlines Modal’s security and privacy commitments.
Application security (AppSec)
-----------------------------
AppSec is the practice of building software that is secure by design, secured
during development, secured with testing and review, and deployed securely.
* We build our software using memory-safe programming languages, including Rust
(for our worker runtime and storage infrastructure) and Python (for our API
servers and Modal client).
* Software dependencies are audited by Github’s Dependabot.
* We make decisions that minimize our attack surface. Most interactions with
Modal are well-described in a gRPC API, and occur through [`modal`](https://pypi.org/project/modal), our open-source command-line tool
and Python client library.
* We have automated synthetic monitoring test applications that continuously
check for network and application isolation within our runtime.
* We use HTTPS for secure connections. Modal forces HTTPS for all services using
TLS (SSL), including our public website and the Dashboard. Modal’s [client library](https://pypi.org/project/modal) connects
to Modal’s servers over TLS and verifies TLS certificates on each connection.
* All user data is encrypted in transit and at rest.
* All public Modal APIs use [TLS 1.3](https://datatracker.ietf.org/doc/html/rfc8446), the latest and
safest version of the TLS protocol.
* Internal code reviews are performed using a modern, PR-based development
workflow (GitHub), and we engage external penetration testing firms to assess our
software security.
Corporate security (CorpSec)
----------------------------
CorpSec is the practice of making sure Modal employees have secure access to
Modal company infrastructure, and also that exposed channels to Modal are
secured. CorpSec controls are the primary concern of standards such as SOC2.
* Access to our services and applications is gated on a SSO Identity Provider
(IdP).
* We mandate phishing-resistant multi-factor authentication (MFA) in all
enrolled IdP accounts.
* We regularly audit access to internal systems.
* Employee laptops are protected by full disk encryption using FileVault2, and
managed by Secureframe MDM.
Network and infrastructure security (InfraSec)
----------------------------------------------
InfraSec is the practice of ensuring a hardened, minimal attack surface for
components we deploy on our network.
* Modal uses logging and metrics observability providers, including Datadog and
Sentry.io.
* Compute jobs at Modal are containerized and virtualized using [gVisor](https://github.com/google/gvisor), the sandboxing technology
developed at Google and used in their *Google Cloud Run* and *Google
Kubernetes Engine* cloud services.
* We conduct annual business continuity and security incident exercises.
Vulnerability remediation
-------------------------
Security vulnerabilities directly affecting Modal’s systems and services will be
patched or otherwise remediated within a timeframe appropriate for the severity
of the vulnerability, subject to the public availability of a patch or other
remediation mechanisms.
If there is a CVSS severity rating accompanying a vulnerability disclosure, we
rely on that as a starting point, but may upgrade or downgrade the severity
using our best judgement.
### Severity timeframes
* **Critical:** 24 hours
* **High:** 1 week
* **Medium:** 1 month
* **Low:** 3 months
* **Informational:** 3 months or longer
Shared responsibility model
---------------------------
Modal prioritizes the integrity, security, and availability of customer data. Under our shared responsibility model, customers also have certain responsibilities regarding data backup, recovery, and availability.
1. **Data backup**: Customers are responsible for maintaining backups of their data. Performing daily backups is recommended. Customers must routinely verify the integrity of their backups.
2. **Data recovery**: Customers should maintain a comprehensive data recovery plan that includes detailed procedures for data restoration in the event of data loss, corruption, or system failure. Customers must routinely test their recovery process.
3. **Availability**: While Modal is committed to high service availability, customers must implement contingency measures to maintain business continuity during service interruptions. Customers are also responsible for the reliability of their own IT infrastructure.
4. **Security measures**: Customers must implement appropriate security measures, such as encryption and access controls, to protect their data throughout the backup, storage, and recovery processes. These processes must comply with all relevant laws and regulations.
SOC 2
-----
We have successfully completed a [System and Organization Controls (SOC) 2 Type 2
audit](../../blog/soc2type2.html). Contact us at [[email protected]](mailto:[email protected]) for more details or access to the
report.
HIPAA
-----
HIPAA, which stands for the Health Insurance Portability and Accountability Act, establishes a set of standards that protect health information, including individuals’ medical records and other individually identifiable health information. HIPAA guidelines apply to both covered entities and business associates—of which Modal is the latter if you are processing PHI on Modal.
Modal’s services can be used in a HIPAA compliant manner. It is important to note that unlike other security standards, there is no officially recognized certification process for HIPAA compliance. Instead, we demonstrate our compliance with regulations such as HIPAA via the practices outlined in this doc, our technical and operational security measures, and through official audits for standards compliance such as SOC 2 certification.
To use Modal services for HIPAA-compliant workloads, a Business Associate Agreement (BAA) should be established with us prior to submission of any PHI. This is available on our Enterprise plan. Contact us at [[email protected]](mailto:[email protected]) to get started. At the moment, [Volumes](volumes.html), [Images](images.html) (persistent storage), and user code are out of scope of the commitments within our BAA, so PHI should not be used in those areas of the product.
PCI
---
*Payment Card Industry Data Security Standard* (PCI) is a standard that defines
the security and privacy requirements for payment card processing.
Modal uses [Stripe](https://stripe.com) to securely process transactions and
trusts their commitment to best-in-class security. We do not store personal
credit card information for any of our customers. Stripe is certified as “PCI
Service Provider Level 1”, which is the highest level of certification in the
payments industry.
Bug bounty program
------------------
Keeping user data secure is a top priority at Modal. We welcome contributions
from the security community to identify vulnerabilities in our product and
disclose them to us in a responsible manner. We offer rewards ranging from $100
to $1000+ depending on the severity of the issue discovered. To participate,
please send a report of the vulnerability to [[email protected]](mailto:[email protected]).
Data privacy
------------
Modal will never access or use:
* your source code.
* the inputs (function arguments) or outputs (function return values) to your Modal Functions.
* any data you store in Modal, such as in Images or Volumes.
Inputs (function arguments) and outputs (function return values) are deleted from our system after a max TTL of 7 days.
App logs and metadata are stored on Modal. Modal will not access this data
unless permission is granted by the user to help with troubleshooting.
Questions?
----------
[Email us!](mailto:[email protected])
Developing and debugging
========================
Modal makes it easy to run apps in the cloud, try code changes in the cloud, and
debug remotely executing code as if it were right there on your laptop. To speed
up your inner dev loop, this guide provides a rundown of tools and techniques
for developing and debugging software in Modal.
Interactivity
-------------
You can launch a Modal App interactively and have it drop you right into the
middle of the action, at an interesting callsite or the site of a runtime
detonation.
### Interactive functions
It is possible to start the interactive Python debugger or start an `IPython` REPL right in the middle of your Modal App.
To do so, you first need to run your App in “interactive” mode by using the `--interactive` / `-i` flag. In interactive mode, you can establish a connection
to the calling terminal by calling `interact()` from within your function.
For a simple example, you can accept user input with the built-in Python `input` function:
```
@app.function()
def my_fn(x):
modal.interact()
print("Enter a number:", end=" ")
x = input()
print(f"Your number is {x}")
```
Now when you run your app with the `--interactive` flag, you’re able to send
inputs to your app, even though it’s running in a remote container!
```
modal run -i guess_number.py
Enter a number: 5
Your number is 5
```
For a more interesting example, you can [`pip_install("ipython")`](../reference/modal.Image.html#pip_install) and start an `IPython` REPL dynamically anywhere in your code:
```
@app.function()
def f():
model = expensive_function()
# play around with model
modal.interact()
import IPython
IPython.embed()
```
The built-in Python debugger can be initiated with the language’s `breakpoint()` function. For convenience, breakpoints call `interact` automatically.
```
@app.function()
def f():
x = "10point3"
breakpoint()
answer = float(x)
```
### Debugging Running Containers
#### Debug Shells
Modal also lets you run interactive commands on your running Containers from the
terminal — much like `ssh`-ing into a traditional machine or cloud VM.
To run a command inside a running Container, you first need to get the Container
ID. You can view all running Containers and their Container IDs with [`modal container list`](../reference/cli/container.html).
After you obtain the Container ID, you can connect to the Container with `modal shell [container-id]`. This launches a “Debug Shell” that comes with some preinstalled tools:
* `vim`
* `nano`
* `ps`
* `strace`
* `curl`
* `py-spy`
* and more!
You can use a debug shell to examine or terminate running processes, modify the Container filesystem, run commands, and more. You can also install additional packages using your Container’s package manager (ex. `apt`).
Note that debug shells will terminate immediately once your Container has finished running.
#### `modal container exec`
You can also execute a specific command in a running Container with `modal container exec [container-id] [command...]`. For example, to see what files are in `/root`, you can run `modal container exec [container-id] ls /root`.
```
❯ modal container list
Active Containers in environment: nathan-dev
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
┃ Container ID ┃ App ID ┃ App Name ┃ Start Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
│ ta-01JK47GVDMWMGPH8MQ0EW30Y25 │ ap-FSuhQ4LpvNAt5b6mKi1CDw │ my-app │ 2025-02-02 16:02 EST │
└───────────────────────────────┴───────────────────────────┴──────────┴──────────────────────┘
❯ modal container exec ta-01JK47GVDMWMGPH8MQ0EW30Y25 ls /root
__pycache__ test00.py
```
Note that your executed command will terminate immediately once your Container
has finished running.
By default, commands will be run within a [pseudoterminal (PTY)](https://en.wikipedia.org/wiki/Pseudoterminal), but this
can be disabled with the `--no-pty` flag.
#### Live container profiling
When a container or input is seemingly stuck or not making progress,
you can use the Modal web dashboard to find out what code is executing in the
container in real time. To do so, look for **Live Profiling** in the **Containers** tab in your
function dashboard.
![Live container profiling](https://modal-public-assets.s3.us-east-1.amazonaws.com/live-profiling-bigger.gif)
### Debugging Container Images
You can also launch an interactive shell in a new Container with the same
environment as your Function. This is handy for debugging issues with your
Image, interactively refining build commands, and exploring the contents of [`Volume`](../reference/modal.Volume.html)s and [`NetworkFileSystem`](../reference/modal.NetworkFileSystem.html)s.
The primary interface for accessing this feature is the [`modal shell`](../reference/cli/shell.html) CLI command, which accepts a Function
name in your App (or prompts you to select one, if none is provided), and runs
an interactive command on the same image as the Function, with the same [`Secret`](../reference/modal.Secret.html)s and [`NetworkFileSystem`](../reference/modal.NetworkFileSystem.html)s attached as the selected Function.
The default command is `/bin/bash`, but you can override this with any other
command of your choice using the `--cmd` flag.
Note that `modal shell [filename].py` does not attach a shell to a running Container of the
Function, but instead creates a fresh instance of the underlying Image. To attach a shell to a running Container, use `modal shell [container-id]` instead.
Live updating
-------------
### Hot reloading with `modal serve`
Modal has the command `modal serve <filename.py>`, which creates a loop that
live updates an App when any of the supporting files change.
Live updating works with web endpoints, syncing your changes as you make them,
and it also works well with cron schedules and job queues.
```
import modal
app = modal.App(image=modal.Image.debian_slim().pip_install("fastapi"))
@app.function()
@modal.fastapi_endpoint()
def f():
return "I update on file edit!"
@app.function(schedule=modal.Period(seconds=5))
def run_me():
print("I also update on file edit!")
```
If you edit this file, the `modal serve` command will detect the change and
update the code, without having to restart the command.
Observability
-------------
Each running Modal App, including all ephemeral Apps, streams logs and resource
metrics back to you for viewing.
On start, an App will log a dashboard link that will take you to its App page.
```
$ python3 main.py
✓ Initialized. View app page at https://modal.com/apps/ap-XYZ1234.
...
```
From this page you can access the following:
* logs, both from your application and system-level logs from Modal
* compute resource metrics (CPU, RAM, GPU)
* function call history, including historical success/failure counts
### Debug logs
You can enable Modal’s client debug logs by setting the `MODAL_LOGLEVEL` environment variable to `DEBUG`.
Running the following will show debug logging from the Modal client running locally.
```
MODAL_LOGLEVEL=DEBUG modal run hello.py
```
To enable debug logs in the Modal client running in the remote container, you can set `MODAL_LOGLEVEL` using
a Modal [`Secret`](../reference/modal.Secret.html).
```
@app.function(secrets=[modal.Secret.from_dict({"MODAL_LOGLEVEL": "DEBUG"})])
def f():
print("Hello, world!")
```
### Client tracebacks
To see a traceback (a.k.a [stack trace](https://en.wikipedia.org/wiki/Stack_trace)) for a client-side exception, you can set the `MODAL_TRACEBACK` environment variable to `1`.
```
MODAL_TRACEBACK=1 modal run my_app.py
```
We encourage you to report cases where you need to enable this functionality, as it’s an indication of an issue in Modal.
Modal 1.0 migration guide
=========================
We are planning to release version 1.0 of the `modal` Python SDK in Q2 of 2025.
This release will signify an increased commitment to API stability and will
imply some changes to our development workflow.
Preceding the 1.0 release, we are making a number of breaking changes based on
feedback that we have received from early users. These changes are intended to
address pain points and reduce confusion about some aspects of the Modal API.
While they will require some changes to existing user code, we believe that
they’ll make it easier to use Modal going forward.
Our plan is to gradually roll out changes — with deprecation warnings — across
the final sequence of 0.X releases. Once we release 1.0, code that does not
issue deprecation warnings can be considered stable API. There will be a buffer
of at least six months before we expire the deprecations introduced prior to 1.0
and remove support for the old APIs.
This page outlines the major changes that we’re making as part of the v1.0
releases.
Deprecating `Image.copy_*` methods
----------------------------------
*Introduced in: v0.72.11*
We recently introduced new `Image` methods — `Image.add_local_dir` and `Image.add_local_file` — to replace the existing `Image.copy_local_dir` and `Image.copy_local_file`.
The new methods subsume the functionality of the old ones, but their default
behavior is different and more performant. By default, files will be mounted to
the container at runtime rather than copied into a new `Image` layer. This can
speed up development substantially when iterating on the contents of the files.
Building a new `Image` layer should be necessary only when subsequent build
steps will use the added files. In that case, you can pass `copy=True` in `Image.add_local_file` or `Image.add_local_dir`.
The `Image.add_local_dir` method also has an `ignore=` parameter, which you can
use to pass file-matching patterns (using dockerignore rules) or predicate
functions to exclude files.
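As an illustrative sketch of the migration (the `data` directory and `config.yaml` file are placeholders):
```
# Old way (deprecated): bakes the files into a new Image layer
image = modal.Image.debian_slim().copy_local_dir("data", "/root/data")

# New way: files are mounted into the container at runtime by default
image = modal.Image.debian_slim().add_local_dir("data", "/root/data")

# Pass copy=True only when a later build step needs the files
image = (
    modal.Image.debian_slim()
    .add_local_file("config.yaml", "/root/config.yaml", copy=True)
    .run_commands("cat /root/config.yaml")
)

# ignore= accepts dockerignore-style patterns (or a predicate function)
image = modal.Image.debian_slim().add_local_dir(
    "data", "/root/data", ignore=["**/__pycache__", "*.tmp"]
)
```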
Deprecating `Mount` as part of the public API
---------------------------------------------
*Introduced in: v0.72.4*
Currently, local files can be mounted to the container filesystem either by
including them in the `Image` definition or by passing a `modal.Mount` object
directly to the `App.function` or `App.cls` decorators. As part of the 1.0
release, we are simplifying the container filesystem configuration to be defined
only by the `Image` used for each Function. This implies deprecation of the
following:
* The `mount=` parameter of `App.function` and `App.cls`
* The `context_mount=` parameter of several `modal.Image` methods
* The `Image.copy_mount` method
* The `Mount` object
Code that uses the `mount=` parameter of `App.function` and `App.cls` should be
migrated to pass those files / directories to the `Image` used by that Function
or Cls, i.e. using the `Image.add_local_file`, `Image.add_local_dir`, or `Image.add_local_python_source` methods:
```
# Mounting local files
# Old way (deprecated)
mount = modal.Mount.from_local_dir("data").add_local_file("config.yaml")
@app.function(image=image, mount=mount)
def f():
...
# New way
image = image.add_local_dir("data", "/root/data").add_local_file("config.yaml", "/root/config.yaml")
@app.function(image=image)
def f():
...
## Mounting local Python source code
# Old way (deprecated)
mount = modal.Mount.from_local_python_packages("my-lib")
@app.function(image=image, mount=mount)
def f():
...
# New way
image = image.add_local_python_source("my-lib")
@app.function(image=image)
def f(...):
...
## Using Image.copy_mount
# Old way (deprecated)
mount = modal.Mount.from_local_dir("data").add_local_file("config.yaml")
image.copy_mount(mount)
# New way
image.add_local_dir("data", "root/data").add_local_file("config.yaml", "/root/config.yaml")
```
Code that uses the `context_mount=` parameter of `Image.from_dockerfile` and `Image.dockerfile_commands` methods can delete that parameter; we now
automatically infer the files that need to be included in the context.
Deprecating the `@modal.build` decorator
----------------------------------------
*Introduced in: v0.72.17*
As part of consolidating the filesystem configuration API, we are also
deprecating the `modal.build` decorator.
For use cases where `modal.build` would previously have been the suggested
approach (e.g., downloading model weights or other large assets to the
container filesystem), we now recommend using a `modal.Volume` instead. The
main advantage of storing weights in a `Volume` instead of an `Image` is that
the weights do not need to be re-downloaded every time you change something else
about the `Image` definition.
Many frameworks, such as Hugging Face, automatically cache downloaded model
weights. When using these frameworks, you just need to ensure that you mount a `modal.Volume` to the expected location of the framework’s cache:
```
cache_vol = modal.Volume.from_name("hf-hub-cache")
@app.cls(
image=image.env({"HF_HUB_CACHE": "/cache"}),
volumes={"/cache": cache_vol},
...
)
class Model:
@modal.enter()
def load_model(self):
self.model = ModelClass.from_pretrained(...)
```
For frameworks that don’t support automatic caching, you could write a separate
function to download the weights and write them directly to the Volume, then `modal run` against this function before you deploy.
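As a sketch of that pattern (the Volume name, mount path, and download URL below are placeholders):
```
weights_vol = modal.Volume.from_name("model-weights", create_if_missing=True)

@app.function(volumes={"/weights": weights_vol}, timeout=60 * 60)
def download_weights():
    import urllib.request

    # Placeholder download logic; substitute your framework's own downloader.
    urllib.request.urlretrieve(
        "https://example.com/my-model/weights.bin", "/weights/weights.bin"
    )
    weights_vol.commit()  # persist the written files to the Volume
```
Run it once with `modal run` against this function, and your deployed app can then read the weights from the mounted Volume.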
In some cases (e.g., if the step runs very quickly), you may wish for the logic
currently decorated with `@modal.build` to continue modifying the Image
filesystem. In that case, you can extract the method as a standalone function
and pass it to `Image.run_function`:
```
def download_weights():
...
image = image.run_function(download_weights)
```
Requiring explicit inclusion of local Python dependencies
---------------------------------------------------------
*Introduced in: 0.73.11*
Prior to 1.0, Modal will inspect the modules that are imported when running
your App code and automatically include any “local” modules in the remote
container environment. This behavior is referred to as “automounting”.
While convenient, this approach has a number of edge cases and surprising
behaviors, such as ignoring modules with imports that are deferred using `Image.imports`. Additionally, it is difficult to configure the automounting
behavior to, e.g., ignore large data files that are stored within your local
Python project directories.
Going forward, it will be necessary to explicitly include the local dependencies
of your Modal App. The easiest way to do this is with [`Image.add_local_python_source`](../reference/modal.Image.html#add_local_python_source):
```
import modal
import helpers
image = modal.Image.debian_slim().add_local_python_source("helpers")
```
In the period leading up to the change in default behavior, the Modal client
will issue deprecation warnings when automounted modules are not included
in the Image. Updating the Image definition will remove these warnings.
Note that Modal will continue to automatically include the source module or
package defining the App itself. We’re introducing a new App or Function-level
parameter, `include_source`, which can be set to `False` in cases where this is
not desired (i.e., because your Image definition already includes the App
source).
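For example, a minimal sketch where the Image already includes the App source explicitly (`my_app_module` is a placeholder for your own module):
```
import modal

image = modal.Image.debian_slim().add_local_python_source("my_app_module")

# include_source=False skips the automatic inclusion of the App's own source,
# since the Image definition above already adds it.
app = modal.App("my-app", image=image, include_source=False)
```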
Renaming autoscaler parameters
------------------------------
*Introduced in: v0.73.76*
We’re renaming several parameters that configure autoscaling behavior:
* `keep_warm` is now `min_containers`
* `concurrency_limit` is now `max_containers`
* `container_idle_timeout` is now `scaledown_window`
The renaming is intended to address some persistent confusion about
the meaning of these parameters. The migration path is a simple
find-and-replace operation.
Additionally, we’re promoting a fourth parameter, `buffer_containers`,
from experimental status (previously `_experimental_buffer_containers`).
Like `min_containers`, `buffer_containers` can help mitigate cold-start
penalties by overprovisioning containers while the Function is active.
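A before/after sketch of the rename (the parameter values here are arbitrary):
```
# Old way (deprecated)
@app.function(keep_warm=2, concurrency_limit=10, container_idle_timeout=300)
def f():
    ...

# New way
@app.function(min_containers=2, max_containers=10, scaledown_window=300, buffer_containers=1)
def f():
    ...
```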
Renaming `modal.web_endpoint` to `modal.fastapi_endpoint`
---------------------------------------------------------
*Introduced in: v0.73.89*
We’re renaming the `modal.web_endpoint` decorator to `modal.fastapi_endpoint` so that the implicit dependency on FastAPI is more clear. This can be a
simple name substitution in your code as the semantics are otherwise identical.
We may reintroduce a lightweight `modal.web_endpoint` without external
dependencies in the future.
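A before/after sketch of the rename:
```
# Old way (deprecated)
@app.function()
@modal.web_endpoint()
def f():
    ...

# New way
@app.function()
@modal.fastapi_endpoint()
def f():
    ...
```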
Replacing `allow_concurrent_inputs` with `@modal.concurrent`
------------------------------------------------------------
*Introduced in: v0.73.148*
The `allow_concurrent_inputs` parameter is being replaced with a new decorator, `@modal.concurrent`. The decorator can be applied either to a Function or a Cls.
We’re moving the input concurrency feature out of “Beta” status as part of this
change.
The new decorator exposes two distinct parameters: `max_inputs` (the limit
on the number of inputs the Function will concurrently accept) and `target_inputs` (the level of concurrency targeted by the Modal autoscaler).
The simplest migration path is to replace `allow_concurrent_inputs=N` with `@modal.concurrent(max_inputs=N)`:
```
# Old way, with a function (deprecated)
@app.function(allow_concurrent_inputs=1000)
def f(...):
...
# New way, with a function
@app.function()
@modal.concurrent(max_inputs=1000)
def f(...):
...
# Old way, with a class (deprecated)
@app.cls(allow_concurrent_inputs=1000)
class MyCls:
...
# New way, with a class
@app.cls()
@modal.concurrent(max_inputs=1000)
class MyCls:
...
```
Setting `target_inputs` along with `max_inputs` may benefit performance by
reducing latency during periods where the container pool is scaling up. See the [input concurrency guide](concurrent-inputs.html) for more information.
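For example (the values here are illustrative, not recommendations):
```
@app.function()
@modal.concurrent(max_inputs=120, target_inputs=100)
def f():
    ...
```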
Deprecating the `.lookup` method on Modal objects
-------------------------------------------------
*Introduced in: v0.72.56*
Most Modal objects can be instantiated through two distinct methods: `.from_name` and `.lookup`. The redundancy between these methods is a persistent
source of confusion.
The `.from_name` method is lazy: it operates entirely locally and instantiates
only a shell for the object. The local object won’t be associated with its
identity on the Modal server until you interact with it. In contrast, the `.lookup` method is eager: it triggers a remote call to the Modal server, and it
returns a fully-hydrated object.
Because Modal objects can now be hydrated on-demand, when they are first
used, there is rarely any need to eagerly hydrate. Therefore, we’re deprecating `.lookup` so that there’s only one obvious way to instantiate objects.
In most cases, the migration is a simple find-and-replace of `.lookup` → `.from_name`.
One exception is when your code needs to access object metadata, such as its ID,
or a web endpoint’s URL. In that case, you can explicitly force hydration of the
object by calling its `.hydrate()` method. There may be other subtle consequences,
such as errors being raised at a different location if no object exists with the
given name.
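A sketch of the migration for code that needs object metadata (the app and function names are placeholders):
```
# Old way (deprecated)
f = modal.Function.lookup("my-app", "f")
print(f.object_id)

# New way
f = modal.Function.from_name("my-app", "f")
f.hydrate()  # only needed when you access metadata like the ID or web URL
print(f.object_id)
```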
Removing support for custom Cls constructors
--------------------------------------------
*Introduced in: v0.74.0*
Classes decorated with `App.cls` are no longer allowed to have a custom constructor
(`__init__` method). Instead, class parameterization should be exposed using
dataclass-style [`modal.parameter`](../reference/modal.parameter.html) annotations:
```
# Old way (deprecated)
@app.cls()
class MyCls:
def __init__(self, name: str = "Bert"):
self.name = name
# New way
@app.cls()
class MyCls:
name: str = modal.parameter(default="Bert")
```
Modal will provide a synthetic constructor for classes that use `modal.parameter`.
Arguments to the synthetic constructor must be passed using keywords, so you may
need to update your calling code as well:
```
obj = MyCls(name="Bert") # name= is now required
```
We’re making this change to address some persistent confusion about when
constructors execute for remote calls and what operations are allowed to run in
them. If your custom constructor performs any setup logic beyond storing the
parameter values, you should move it to a method decorated with `@modal.enter()`.
Additionally, we’re reducing the types that we support as class parameters to
a small number of primitives (`str`, `int`, `bool`, and `bytes`).
Limiting class parameterization to primitive types will also allow us to provide
better observability over parameterized class instances in the web dashboard,
CLI, and other contexts where it is not possible to represent arbitrary Python
objects.
If you need to parameterize classes across more complex types, you can implement
your own serialization logic, e.g. using strings as the wire format:
```
@app.cls()
class MyCls:
param_str: str = modal.parameter()
@modal.enter()
def deserialize_parameters(self):
self.param_obj = SomeComplexType.from_str(self.param_str)
```
We recommend adopting interpretable constructor arguments (i.e., prefer
meaningful strings over pickled bytes) so that you will be able to get the most
benefit from future improvements to parameterized class observability.
Simplifying Cls lookup patterns
-------------------------------
*Introduced in: v0.73.26*
Modal previously supported several different patterns for looking up a `modal.Cls` and remotely invoking one of its methods:
```
# Documented pattern
MyCls = modal.Cls.from_name("my-app", "MyCls")
obj = MyCls()
obj.some_method.remote(...)
# Alternate pattern: skipping the object instantiation
MyCls = modal.Cls.from_name("my-app", "MyCls")
MyCls.some_method.remote(...)
# Alternate pattern: looking up the method as a Function
f = modal.Function.lookup("my-app", "MyCls.some_method")
f.remote(...)
```
While each pattern could successfully trigger a remote function call, there were
a number of subtle differences in behavior between them.
Going forward, we will only support the first pattern. Making remote calls to a
method on a deployed Cls will require you to (a) look up the object using `modal.Cls` and (b) instantiate the object before calling its methods.
Deprecating `modal.gpu` objects
-------------------------------
*Introduced in: v0.73.31*
The `modal.gpu` objects are being deprecated; going forward, all GPU resource
configuration should be accomplished using strings.
This should be an easy code substitution, e.g. `gpu=modal.gpu.H100()` can be
replaced with `gpu="H100"`. When using the `count=` parameter of the GPU class,
simply append it to the name with a colon (e.g. `gpu="H100:8"`). In the case of
the `modal.gpu.A100(size="80GB")` variant, the name of the corresponding gpu is `"A100-80GB"`.
Note that string arguments are case-insensitive, so `"H100"` and `"h100"` are
both accepted.
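Concretely, the substitutions described above look like this:
```
# Old way (deprecated)
@app.function(gpu=modal.gpu.H100(count=8))
def f():
    ...

# New way
@app.function(gpu="H100:8")
def f():
    ...

# A100 80GB variant
@app.function(gpu="A100-80GB")
def g():
    ...
```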
The main rationale for this change is that it will allow us to introduce new
GPU models in the future without requiring users to upgrade their SDK.
Requiring explicit invocation for module mode
---------------------------------------------
*Introduced in: 0.73.58*
The Modal CLI allows you to reference the source code for your App as either
a file path (e.g. `src/my_app.py`) or as a module name (e.g. `src.my_app`).
As in Python, the choice has some implications for how relative imports are
resolved. To make this more salient, Modal will mirror Python going forward
and require that you explicitly invoke module mode by passing `-m` on your
command line (e.g., `modal deploy -m src.my_app`).
Sandboxes
=========
In addition to the Function interface, Modal has a direct
interface for defining containers *at runtime* and securely running arbitrary code
inside them.
This can be useful if, for example, you want to:
* Execute code generated by a language model.
* Create isolated environments for running untrusted code.
* Check out a git repository and run a command against it, like a test suite, or `npm lint`.
* Run containers with arbitrary dependencies and setup scripts.
Each individual job is called a **Sandbox** and can be created using the [`Sandbox.create`](../reference/modal.Sandbox.html#create) constructor:
```
import modal
app = modal.App.lookup("my-app", create_if_missing=True)
sb = modal.Sandbox.create(app=app)
p = sb.exec("python", "-c", "print('hello')")
print(p.stdout.read())
p = sb.exec("bash", "-c", "for i in {1..10}; do date +%T; sleep 0.5; done")
for line in p.stdout:
# Avoid double newlines by using end="".
print(line, end="")
sb.terminate()
```
**Note:** you can run the above example as a script directly with `python my_script.py`. `modal run` is not needed here since there is no [entrypoint](apps.html#entrypoints-for-ephemeral-apps).
Sandboxes require an [`App`](apps.html) to be passed when spawned from outside
of a Modal container. You may pass in a regular `App` object or look one up by name with [`App.lookup`](../reference/modal.App.html#lookup). The `create_if_missing` flag on `App.lookup` will create an `App` with the given name if it doesn’t exist.
Running a Sandbox with an entrypoint
------------------------------------
In most cases, Sandboxes are treated as a generic container that can run arbitrary
commands. However, in some cases, you may want to run a single command or script
as the entrypoint of the Sandbox. You can do this by passing string arguments to the
Sandbox constructor:
```
sb = modal.Sandbox.create("python", "-m", "http.server", "8080", app=my_app, timeout=10)
for line in sb.stdout:
print(line, end="")
```
This functionality is most useful for running long-lived services that you want
to keep running in the background. See our [Jupyter notebook example](../examples/jupyter_sandbox.html) for a more concrete example of this.
Referencing Sandboxes from other code
-------------------------------------
If you have a running Sandbox, you can retrieve it using the [`Sandbox.from_id`](../reference/modal.Sandbox.html#from_id) method.
```
sb = modal.Sandbox.create(app=my_app)
sb_id = sb.object_id
# ... later in the program ...
sb2 = modal.Sandbox.from_id(sb_id)
p = sb2.exec("echo", "hello")
print(p.stdout.read())
sb2.terminate()
```
A common use case for this is keeping a pool of Sandboxes available for executing tasks
as they come in. You can keep a list of `object_id`s of Sandboxes that are “open” and
reuse them, closing over the `object_id` in whatever function is using them.
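A minimal sketch of such a pool (no error handling; assumes `my_app` from the examples above):
```
# Pre-create a few Sandboxes and remember their IDs.
pool: list[str] = [modal.Sandbox.create(app=my_app).object_id for _ in range(3)]

def run_task(command: list[str]) -> str:
    sb_id = pool.pop()                     # take an idle Sandbox from the pool
    sb = modal.Sandbox.from_id(sb_id)
    output = sb.exec(*command).stdout.read()
    pool.append(sb_id)                     # return it to the pool for reuse
    return output

print(run_task(["echo", "hello from the pool"]))
```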
Parameters
----------
Sandboxes support nearly all configuration options found in regular `modal.Function`s.
Refer to [`Sandbox.create`](../reference/modal.Sandbox.html#create) for further documentation
on Sandbox parametrization.
For example, Images and Mounts can be used just as with functions:
```
sb = modal.Sandbox.create(
image=modal.Image.debian_slim().pip_install("pandas"),
mounts=[modal.Mount.from_local_dir("./my_repo", remote_path="/repo")],
workdir="/repo",
app=my_app,
)
```
### Using custom images
Sandboxes support custom images just as Functions do. However, while you’ll typically
invoke a Modal Function with the `modal run` CLI, you typically spawn a Sandbox
with a simple `python` call. As such, you need to manually enable output streaming
to see your image build logs:
```
image = modal.Image.debian_slim().pip_install("pandas", "numpy")
with modal.enable_output():
sb = modal.Sandbox.create(image=image, app=my_app)
```
### Dynamically defined environments
Note that any valid `Image` or `Mount` can be used with a Sandbox, even if those
images or mounts have not previously been defined. This also means that Images and
Mounts can be built from requirements at **runtime**. For example, you could
use a language model to write some code and define your image, and then spawn a
Sandbox with it. Check out [devlooper](https://github.com/modal-labs/devlooper) for a concrete example of this.
### Environment variables
You can set environment variables using inline secrets:
```
secret = modal.Secret.from_dict({"MY_SECRET": "hello"})
sb = modal.Sandbox.create(
secrets=[secret],
app=my_app,
)
p = sb.exec("bash", "-c", "echo $MY_SECRET")
print(p.stdout.read())
```
Tagging
-------
Sandboxes can be tagged with arbitrary key-value pairs. These tags can be used
to filter results in [`Sandbox.list`](../reference/modal.Sandbox.html#list).
```
sandbox_v1_1 = modal.Sandbox.create("sleep", "10", app=my_app)
sandbox_v1_2 = modal.Sandbox.create("sleep", "20", app=my_app)
sandbox_v1_1.set_tags({"major_version": "1", "minor_version": "1"})
sandbox_v1_2.set_tags({"major_version": "1", "minor_version": "2"})
for sandbox in modal.Sandbox.list(app_id=my_app.app_id): # All sandboxes.
print(sandbox.object_id)
for sandbox in modal.Sandbox.list(
app_id=my_app.app_id,
tags={"major_version": "1"},
): # Also all sandboxes.
print(sandbox.object_id)
for sandbox in modal.Sandbox.list(
app_id=my_app.app_id,
tags={"major_version": "1", "minor_version": "2"},
): # Just the latest sandbox.
print(sandbox.object_id)
```
See it in action
[Building a coding agent with Sandboxes](../examples/agent.html)
[Building a code interpreter](../examples/simple_code_interpreter.html)
[Running a Jupyter notebook](../examples/jupyter_sandbox.html)
[Safe code execution](../examples/safe_code_execution.html)
Troubleshooting
===============
“Command not found” errors
--------------------------
If you installed Modal but you’re seeing an error like `modal: command not found` when trying to run the CLI, this means that the
installation location of Python package executables (“binaries”) are not present
on your system path. This is a common problem; you need to reconfigure your
system’s environment variables to fix it.
One workaround is to use `python -m modal.cli` instead of `modal`. However, this
is just a patch. There’s no single solution for the problem because Python
installs dependencies in different locations depending on your environment. See
this [popular StackOverflow question](https://stackoverflow.com/q/35898734) for
pointers on how to resolve your system path issue.
Custom types defined in `__main__`
----------------------------------
Modal currently uses [cloudpickle](https://github.com/cloudpipe/cloudpickle) to
transfer objects returned or exceptions raised by functions that are executed in
Modal. This gives a lot of flexibility and support for custom data types.
However, any types that are declared in your Python entrypoint file (the one you
call on the command line) will currently be *redeclared* if they are returned
from Modal functions, and will therefore have the same structure and type name
but not maintain class object identity with your local types. This means that
you *can’t* catch specific custom exception classes:
```
import modal
app = modal.App()
class MyException(Exception):
pass
@app.function()
def raise_custom():
raise MyException()
@app.local_entrypoint()
def main():
try:
raise_custom.remote()
except MyException: # this will not catch the remote exception
pass
except Exception: # this will catch it instead, as it's still a subclass of Exception
pass
```
Nor can you do object equality checks on `dataclasses`, or `isinstance` checks:
```
import modal
import dataclasses
@dataclasses.dataclass
class MyType:
foo: int
app = modal.App()
@app.function()
def return_custom():
return MyType(foo=10)
@app.local_entrypoint()
def main():
data = return_custom.remote()
assert data == MyType(foo=10) # false!
assert data.foo == 10 # true!, the type still has the same fields etc.
assert isinstance(data, MyType) # false!
```
If this is a problem for you, you can easily solve it by moving your custom type
definitions to a separate Python file from the one you trigger to run your Modal
code, and import that file instead.
```
# File: my_types.py
import dataclasses
@dataclasses.dataclass
class MyType:
foo: int
```
```
# File: modal_script.py
import modal
from my_types import MyType
app = modal.App()
@app.function()
def return_custom():
return MyType(foo=10)
@app.local_entrypoint()
def main():
data = return_custom.remote()
assert data == MyType(foo=10) # true!
assert isinstance(data, MyType) # true!
```
Function side effects
---------------------
The same container *can* be reused for multiple invocations of the same function
within an app. This means that if your function has side effects like modifying
files on disk, they may or may not be present for subsequent calls to that
function. You should not rely on the side effects to be present, but you might
have to be careful so they don’t cause problems.
For example, if you create a disk-backed database using sqlite3:
```
import modal
import sqlite3
app = modal.App()
@app.function()
def db_op():
db = sqlite3.connect("db_file.sqlite3")
db.execute("CREATE TABLE example (col_1 TEXT)")
...
```
This function *can* (but will not necessarily) fail on the second invocation
with an
`OperationalError: table example already exists`.
To get around this, take care to either clean up your side effects (e.g.
deleting the db file at the end of your function call above) or make your functions
take them into consideration (e.g. adding an `if os.path.exists("db_file.sqlite3")` check or randomizing the filename
above).
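A sketch of both mitigations (checking for an existing file, or randomizing the filename):
```
import os
import sqlite3
import uuid

@app.function()
def db_op():
    # Option 1: clean up leftovers from a previous invocation in this container
    if os.path.exists("db_file.sqlite3"):
        os.remove("db_file.sqlite3")
    db = sqlite3.connect("db_file.sqlite3")
    db.execute("CREATE TABLE example (col_1 TEXT)")

    # Option 2: randomize the filename so invocations never collide
    db = sqlite3.connect(f"/tmp/db_{uuid.uuid4().hex}.sqlite3")
    db.execute("CREATE TABLE example (col_1 TEXT)")
```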
Heartbeat timeout
-----------------
The Modal client in `modal.Function` containers runs a heartbeat loop that the host uses to healthcheck the container’s main process.
If the container stops heartbeating for a long period (minutes) the container will be terminated due to a `heartbeat timeout`, which is displayed in logs.
Container heartbeat timeouts are rare, and typically caused by one of two application-level sources:
* [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock) is held for a long time, stopping the heartbeat thread from making progress. [py-spy](https://github.com/benfred/py-spy?tab=readme-ov-file#how-does-gil-detection-work) can detect GIL holding. We include `py-spy` [automatically in `modal shell`](developing-debugging.html#debug-shells) for convenience. A quick fix for GIL holding is to run the code which holds the GIL [in a subprocess](https://docs.python.org/3/library/multiprocessing.html#the-process-class).
* Container process initiates shutdown, intentionally stopping the heartbeats, but it does not complete shutdown.
In both cases [turning on debug logging](developing-debugging.html#debug-logs) will help diagnose the issue.
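For the GIL case, a minimal sketch of the subprocess workaround (`gil_heavy_work` is a placeholder for your GIL-holding code):
```
import multiprocessing

def gil_heavy_work():
    # CPU-bound, GIL-holding work goes here (placeholder)
    ...

@app.function()
def f():
    # Running the work in a separate process keeps the main process's
    # heartbeat thread responsive.
    p = multiprocessing.Process(target=gil_heavy_work)
    p.start()
    p.join()
```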
`413 Content Too Large` errors
------------------------------
If you receive a `413 Content Too Large` error, this might be because you are
hitting our gRPC payload size limits.
The size limit is currently 100MB.
`403` errors when connecting to GCP services.
---------------------------------------------
GCP will sometimes return 403 errors to Modal when connecting directly to GCP
cloud services like Google Cloud Storage. This is a known issue.
The workaround is to pin the `cloud` parameter in the [`@app.function`](../reference/modal.App.html#function) or [`@app.cls`](../reference/modal.App.html#cls).
For example:
```
@app.function(cloud="gcp")
def f():
...
```
```
@app.cls(cloud="gcp")
class MyClass:
...
```
Outdated kernel version (4.4.0)
-------------------------------
Our secure runtime [reports a misleadingly old](https://github.com/google/gvisor/issues/11117) kernel version, 4.4.0.
Certain software libraries will detect this and report a warning. These warnings can be ignored because the runtime
actually implements Linux kernel features from versions 5.15+.
If the outdated kernel version reporting creates errors in your application please contact us [in our Slack](../../slack.html).
Okta SSO
========
Prerequisites
-------------
* A Workspace that’s on an [Enterprise](../../pricing.html) plan
* Admin access to the Workspace you want to configure with Okta Single-Sign-On (SSO)
* Admin privileges for your Okta Organization
Supported features
------------------
* IdP-initiated SSO
* SP-initiated SSO
* Just-In-Time account provisioning
For more information on the listed features, visit the [Okta Glossary](https://help.okta.com/okta_help.htm?type=oie&id=ext_glossary).
Configuration
-------------
### Read this before you enable “Require SSO”
Enabling “Require SSO” will force all users to sign in via Okta. Ensure that you
have admin access to your Modal Workspace through an Okta account before
enabling.
### Configuration steps
#### Step 1: Add Modal app to Okta Applications
1. Sign in to your Okta admin dashboard
2. Navigate to the Applications tab and click “Browse App Catalog”. ![Okta browse application](../../_app/immutable/assets/okta-browse-applications.BiqGsdcd.png)
3. Select “Modal” and click “Done”.
4. Select the “Sign On” tab and click “Edit”. ![Okta sign on edit](../../_app/immutable/assets/okta-sign-on-edit.DHny2cIB.png)
5. Fill out Workspace field to configure for your specific Modal workspace. See [Step 2](#step-2-link-your-workspace-to-okta-modal-application) if you’re unsure what this is. ![Okta add workspace](../../_app/immutable/assets/okta-add-workspace-username.DoM8qewy.png)
#### Step 2: Link your Workspace to Okta Modal application
1. Navigate to your application on the Okta Admin page.
2. Copy the Metadata URL from the Okta Admin Console (It’s under the “Sign On”
tab). ![Okta metadata url](../../_app/immutable/assets/okta-metadata-url.BLDzMpWn.png)
3. Sign in to <https://modal.com> and visit your Workspace Management page
(e.g. `https://modal.com/settings/[workspace name]/workspace-management`)
4. Paste the Metadata URL in the input and click “Save Changes”
#### Step 3: Assign users / groups and test the integration
1. Navigate back to your Okta application on the Okta Admin dashboard.
2. Click on the “Assignments” tab and add the appropriate people or groups.
![Okta Assign Users](../../_app/immutable/assets/okta-assign-people.BhAmcJ0m.png)
3. To test the integration, sign in as one of the users you assigned in the previous step.
4. Click on the Modal application on the Okta Dashboard to initiate Single Sign-On.
#### Notes
The following SAML attributes are used by the integration:
| Name | Value |
| --- | --- |
| email | user.email |
| firstName | user.firstName |
| lastName | user.lastName |
SP-initiated SSO
----------------
The sign-in process is initiated from <https://modal.com/login/sso>
1. Enter your workspace name in the input
2. Click “continue with SSO” to authenticate with Okta
Web endpoint URLs
=================
This guide documents the behavior of URLs for [web endpoints](webhooks.html) on Modal: automatic generation, configuration, programmatic retrieval, and more.
Determine the URL of a web endpoint from code
---------------------------------------------
Modal Functions with the [`fastapi_endpoint`](../reference/modal.fastapi_endpoint.html), [`asgi_app`](../reference/modal.asgi_app.html), [`wsgi_app`](../reference/modal.wsgi_app.html),
or [`web_server`](../reference/modal.web_server.html) decorator
are made available over the Internet when they are [`serve`d](../reference/cli/serve.html) or [`deploy`ed](../reference/cli/deploy.html) and so they have a URL.
This URL is displayed in the `modal` CLI output
and is available in the Modal [dashboard](../../login%EF%B9%96next=%EA%A4%B7apps.html) for the Function.
To determine a Function’s URL programmatically,
check its [`get_web_url()`](../reference/modal.Function.html#get_web_url) property:
```
@app.function(image=modal.Image.debian_slim().pip_install("fastapi[standard]"))
@modal.fastapi_endpoint(docs=True)
def show_url() -> str:
return show_url.get_web_url()
```
For deployed Functions, this also works from other Python code!
You just need to do a [`from_name`](../reference/modal.Function.html#from_name) based on the name of the Function and its [App](apps.html):
```
import requests
remote_function = modal.Function.from_name("app", "show_url")
assert remote_function.get_web_url() == requests.get(remote_function.get_web_url()).json()
```
Auto-generated URLs
-------------------
By default, Modal Functions
will be served from the `modal.run` domain.
The full URL will be constructed from a number of pieces of information
to uniquely identify the endpoint.
At a high-level, web endpoint URLs for deployed applications have the
following structure: `https://<source>--<label>.modal.run`.
The `source` component represents the workspace and environment where the App is
deployed. If your workspace has only a single environment, the `source` will
just be the workspace name. Multiple environments are disambiguated by an [“environment suffix”](environments.html#environment-web-suffixes), so
the full source would be `<workspace>-<suffix>`. However, one environment per
workspace is allowed to have a null suffix, in which case the source would just
be `<workspace>`.
The `label` component represents the specific App and Function that the endpoint
routes to. By default, these are concatenated with a hyphen, so the label would
be `<app>-<function>`.
These components are normalized to contain only lowercase letters, numerals, and dashes.
To put this all together, consider the following example. If a member of the `ECorp` workspace uses the `main` environment (which has `prod` as its web
suffix) to deploy the `text_to_speech` app with a webhook for the `flask_app` function, the URL will have the following components:
* *Source*:
+ *Workspace name slug*: `ECorp` → `ecorp`
+ *Environment web suffix slug*: `main` → `prod`
* *Label*:
+ *App name slug*: `text_to_speech` → `text-to-speech`
+ *Function name slug*: `flask_app` → `flask-app`
The full URL will be `https://ecorp-prod--text-to-speech-flask-app.modal.run`.
User-specified labels
---------------------
It’s also possible to customize the `label` used for each Function
by passing a parameter to the relevant endpoint decorator:
```
import modal
image = modal.Image.debian_slim().pip_install("fastapi")
app = modal.App(name="text_to_speech", image=image)
@app.function()
@modal.fastapi_endpoint(label="speechify")
def web_endpoint_handler():
...
```
Building on the example above, this code would produce the following URL: `https://ecorp-prod--speechify.modal.run`.
User-specified labels are not automatically normalized, but labels with
invalid characters will be rejected.
Ephemeral apps
--------------
To support development workflows, webhooks for ephemeral apps (i.e., apps
created with `modal serve`) will have a `-dev` suffix appended to their URL
label (regardless of whether the label is auto-generated or user-specified).
This prevents development work from interfering with deployed versions of the
same app.
If an ephemeral app is serving a webhook while another ephemeral webhook is
created seeking the same web endpoint label, the new function will *steal* the
running webhook’s label.
This ensures that the latest iteration of the ephemeral function is
serving requests and that older ones stop receiving web traffic.
Truncation
----------
If a generated subdomain label is longer than 63 characters, it will be
truncated.
For example, the following subdomain label is too long, 67 characters: `ecorp--text-to-speech-really-really-realllly-long-function-name-dev`.
The truncation happens by calculating a SHA-256 hash of the overlong label, then
taking the first 6 characters of this hash. The overlong subdomain label is
truncated to 56 characters, and then joined by a dash to the hash prefix. In
the above example, the resulting URL would be `ecorp--text-to-speech-really-really-rea-1b964b-dev.modal.run`.
The combination of label hashing and truncation yields a unique subdomain label of at
most 63 characters, complying with both DNS length limits and uniqueness requirements.
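A rough sketch of the idea (not Modal’s exact implementation; the exact truncation length and the handling of suffixes like `-dev` are assumptions):
```
import hashlib

def truncated_subdomain_label(label: str, max_len: int = 63) -> str:
    # Labels that fit within the DNS limit are used as-is.
    if len(label) <= max_len:
        return label
    # Otherwise: hash the full label, keep a 6-character prefix of the hash,
    # and join it to the first 56 characters of the label with a dash.
    digest = hashlib.sha256(label.encode()).hexdigest()[:6]
    return f"{label[:56]}-{digest}"
```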
Custom domains
--------------
**Custom domains are available on our [Team and Enterprise plans](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7plans.html).**
For more customization, you can use your own domain names with Modal web
endpoints. If your [plan](../../pricing.html) supports custom domains, visit the [Domains
tab](../../login%EF%B9%96next=%EA%A4%B7settings%EA%A4%B7domains.html) in your workspace settings to add a domain name to your
workspace.
You can use three kinds of domains with Modal:
* **Apex:** root domain names like `example.com`
* **Subdomain:** single subdomain entries such as `my-app.example.com`, `api.example.com`, etc.
* **Wildcard domain:** either in a subdomain like `*.example.com`, or in a
deeper level like `*.modal.example.com`
You’ll be asked to update your domain DNS records with your domain name
registrar and then validate the configuration in Modal. Once the records have
been properly updated and propagated, your custom domain will be ready to use.
You can assign any Modal web endpoint to any registered domain in your workspace
with the `custom_domains` argument.
```
import modal
app = modal.App("custom-domains-example")
@app.function()
@modal.fastapi_endpoint(custom_domains=["api.example.com"])
def hello(message: str):
return {"message": f"hello {message}"}
```
You can then run `modal deploy` to put your web endpoint online, live.
```
$ curl -s https://api.example.com?message=world
{"message": "hello world"}
```
Note that Modal automatically generates and renews TLS certificates for your
custom domains. Since we do this when your domain is first accessed, there may
be an additional 1-2s latency on the first request. Additional requests use a
cached certificate.
You can also register multiple domain names and associate them with the same web
endpoint.
```
import modal
app = modal.App("custom-domains-example-2")
@app.function()
@modal.fastapi_endpoint(custom_domains=["api.example.com", "api.example.net"])
def hello(message: str):
return {"message": f"hello {message}"}
```
For **Wildcard** domains, Modal will automatically resolve arbitrary custom
endpoints (and issue TLS certificates). For example, if you add the wildcard
domain `*.example.com`, then you can create any custom domains under `example.com`:
```
import random
import modal
app = modal.App("custom-domains-example-2")
random_domain_name = random.choice(range(10))
@app.function()
@modal.fastapi_endpoint(custom_domains=[f"{random_domain_name}.example.com"])
def hello(message: str):
return {"message": f"hello {message}"}
```
Custom domains can also be used with [ASGI](../reference/modal.asgi_app.html#modalasgi_app) or [WSGI](../reference/modal.wsgi_app.html) apps using the same `custom_domains` argument.
Environments
============
Environments are sub-divisions of workspaces, allowing you to deploy the same app
(or set of apps) in multiple instances for different purposes without changing
your code. Typical use cases for environments include having one `dev` environment and one `prod` environment, preventing overwriting production apps
when developing new features, while still being able to deploy changes to a
“live” and potentially complex structure of apps.
Each environment has its own set of [Secrets](secrets.html) and any
object lookups performed from an app in an environment will by default look for
objects in the same environment.
By default, every workspace has a single Environment called “main”. New
Environments can be created on the CLI:
```
modal environment create dev
```
(You can run `modal environment --help` for more info)
Once created, Environments show up as a dropdown menu in the navbar of the [Modal dashboard](../../login%EF%B9%96next=%EA%A4%B7apps.html), letting you browse all Modal Apps and Secrets
filtered by which Environment they were deployed to.
Most CLI commands also support an `--env` flag letting you specify which
Environment you intend to interact with, e.g.:
```
modal run --env=dev app.py
modal volume create --env=dev storage
```
To set a default Environment for your current CLI profile you can use `modal config set-environment`, e.g.:
```
modal config set-environment dev
```
Alternatively, you can set the `MODAL_ENVIRONMENT` environment variable.
Environment web suffixes
------------------------
Environments have a ‘web suffix’ which is used to make [web endpoint URLs](webhook-urls.html) unique across your workspace. One
Environment is allowed to have no suffix (`""`).
Cross environment lookups
-------------------------
It’s possible to explicitly look up objects in Environments other than the Environment
your app runs within:
```
production_secret = modal.Secret.from_name(
"my-secret",
environment_name="main"
)
```
```
modal.Function.from_name(
"my_app",
"some_function",
environment_name="dev"
)
```
However, the `environment_name` argument is optional and omitting it will use
the Environment from the object’s associated App or calling context.
Scheduling remote cron jobs
===========================
A common requirement is to perform some task at a given time every day or week
automatically. Modal facilitates this through function schedules.
Basic scheduling
----------------
Let’s say we have a Python module `heavy.py` with a function, `perform_heavy_computation()`.
```
# heavy.py
def perform_heavy_computation():
    ...

if __name__ == "__main__":
    perform_heavy_computation()
```
To schedule this function to run once per day, we create a Modal App and attach
our function to it with the `@app.function` decorator and a schedule parameter:
```
# heavy.py
import modal

app = modal.App()

@app.function(schedule=modal.Period(days=1))
def perform_heavy_computation():
    ...
```
To activate the schedule, deploy your app, either through the CLI:
```
modal deploy --name daily_heavy heavy.py
```
Or programmatically:
```
if __name__ == "__main__":
    app.deploy()
```
Now the function will run every day, at the time of the initial deployment,
without any further interaction on your part.
When you make changes to your function, just rerun the deploy command to
overwrite the old deployment.
Note that when you redeploy your function, the `modal.Period` schedule resets, and the
next run will happen one full period after the most recent deployment.
If you want your function to run on a regular schedule that is not disturbed by deploys, `modal.Cron` (see below) is a better option.
Monitoring your scheduled runs
------------------------------
To see past execution logs for the scheduled function, go to the [Apps](https://modal.com/apps) section on the Modal web site.
Schedules currently cannot be paused; instead, remove the schedule and redeploy the app.
Scheduled functions can be triggered manually on the app’s dashboard page, using the
“run now” button.
Schedule types
--------------
There are two kinds of base schedule values - [`modal.Period`](../reference/modal.Period.html) and [`modal.Cron`](../reference/modal.Cron.html).
[`modal.Period`](../reference/modal.Period.html) lets you specify an interval
between function calls, e.g. `Period(days=1)` or `Period(hours=5)`:
```
# runs once every 5 hours
@app.function(schedule=modal.Period(hours=5))
def perform_heavy_computation():
    ...
```
[`modal.Cron`](../reference/modal.Cron.html) gives you finer control using [cron](https://en.wikipedia.org/wiki/Cron) syntax:
```
# runs at 8 am (UTC) every Monday
@app.function(schedule=modal.Cron("0 8 * * 1"))
def perform_heavy_computation():
    ...
```
For more details, see the API reference for [Period](../reference/modal.Period.html), [Cron](../reference/modal.Cron.html), and [Function](../reference/modal.Function.html).
See it in action
[Hacker News Slackbot](../examples/hackernews_alerts.html)
Passing local data
==================
If you have a function that needs access to some data not present in your Python
files themselves you have a few options for bundling that data with your Modal
app.
Passing function arguments
--------------------------
The simplest and most straightforward way is to read the data from your local
script and pass it to the outermost Modal function call:
```
import json

@app.function()
def foo(a):
    print(sum(a["numbers"]))

@app.local_entrypoint()
def main():
    data_structure = json.load(open("blob.json"))
    foo.remote(data_structure)
```
Any data of reasonable size that is serializable through [cloudpickle](https://github.com/cloudpipe/cloudpickle) is passable as an
argument to Modal functions.
Refer to the section on [global variables](global-variables.html) for how
to work with objects in global scope that can only be initialized locally.
Including local files
---------------------
For including local files for your Modal Functions to access, see [Defining Images](images.html).
Using OIDC to authenticate with external services
=================================================
Your Functions in Modal may need to access external resources like S3 buckets.
Traditionally, you would need to store long-lived credentials in Modal Secrets
and reference those Secrets in your function code. With the Modal OIDC
integration, you can instead use automatically-generated identity
tokens to authenticate to external services.
How it works
------------
[OIDC](https://auth0.com/docs/authenticate/protocols/openid-connect-protocol) is
a standard protocol for authenticating users between systems. In Modal, we use
OIDC to generate short-lived tokens that external services can use to verify
that your function is authenticated.
The OIDC integration has two components: the discovery document and the generated
tokens.
The [OIDC discovery document](https://swagger.io/docs/specification/v3_0/authentication/openid-connect-discovery/) describes how our OIDC server is configured. It primarily includes the supported [claims](https://developer.okta.com/blog/2017/07/25/oidc-primer-part-1) and the [keys](https://auth0.com/docs/secure/tokens/json-web-tokens/json-web-key-sets) we use to sign tokens. Discovery documents are always hosted at `/.well-known/openid-configuration`, and
you can view ours at <https://oidc.modal.com/.well-known/openid-configuration>.
The generated tokens are [JWTs](https://jwt.io/) signed by Modal using the keys described in the
discovery document. These tokens contain the full identity of the Function
in the `sub` claim, and they use custom claims to make this information more
easily accessible. See our [discovery document](https://oidc.modal.com/.well-known/openid-configuration) for a full list of claims.
Generated tokens are injected into your Function’s containers via the `MODAL_IDENTITY_TOKEN` environment variable. Below is an example of what claims might be included in a token:
```
{
  "sub": "modal:workspace_id:ac-12345abcd:environment_name:modal-examples:app_name:oidc-token-test:function_name:jwt_return_func:container_id:ta-12345abcd",
  "aud": "oidc.modal.com",
  "exp": 1732137751,
  "iat": 1731964951,
  "iss": "https://oidc.modal.com",
  "jti": "31f92dca-e847-4bc9-8d15-9f234567a123",
  "workspace_id": "ac-12345abcd",
  "environment_id": "en-12345abcd",
  "environment_name": "modal-examples",
  "app_id": "ap-12345abcd",
  "app_name": "oidc-token-test",
  "function_id": "fu-12345abcd",
  "function_name": "jwt_return_func",
  "container_id": "ta-12345abcd"
}
```
### Key thumbprints
RSA keys have [thumbprints](https://connect2id.com/products/nimbus-jose-jwt/examples/jwk-thumbprints). You
can use these thumbprints to verify that the keys in our discovery document are
genuine. This protects against potential Man in the Middle (MitM) attacks, although
our required use of HTTPS mitigates this risk.
If you’d like to have the extra security of verifying the thumbprints, you can
use the following command to print the thumbprints for the keys in our
discovery document:
```
$ openssl s_client -connect oidc.modal.com:443 < /dev/null 2>/dev/null | openssl x509 -fingerprint -noout | awk -F= '{print $2}' | tr -d ':'
F062F2151EDE30D1620B48B7AC91D66047D769D3
```
Note that these thumbprints may change over time as we rotate keys. We recommend
periodically checking for and updating your scripts with the new thumbprints.
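As a related check, here is a minimal sketch (using the `requests` library) that fetches the discovery document and prints the key IDs in the JWKS it points to; the `jwks_uri` field is part of the standard OIDC discovery format:
```
import requests

# Fetch Modal's OIDC discovery document ...
discovery = requests.get(
    "https://oidc.modal.com/.well-known/openid-configuration", timeout=10
).json()

# ... and the JSON Web Key Set it points to.
jwks = requests.get(discovery["jwks_uri"], timeout=10).json()

for key in jwks["keys"]:
    print(key["kid"])
```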
### App name format
By default, Modal Apps can be created with arbitrary names. However, when using
OIDC, the App name has a stricter character set. Specifically, it must be 64
characters or less and can only include alphanumeric characters, dashes, periods,
and underscores. If these constraints are violated, the OIDC token will not be
injected into the container.
Note that these are the same constraints that are applied to [Deployed Apps](managing-deployments.html).
This means that if an App is deployable, it will also be compatible with OIDC.
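If you want to check names programmatically, a hypothetical helper based on the constraints above might look like this:
```
import re

# Matches the constraints described above: at most 64 characters, limited to
# alphanumerics, dashes, periods, and underscores.
OIDC_APP_NAME_RE = re.compile(r"[A-Za-z0-9._-]{1,64}")

def is_oidc_compatible(app_name: str) -> bool:
    return OIDC_APP_NAME_RE.fullmatch(app_name) is not None

assert is_oidc_compatible("oidc-token-test")
assert not is_oidc_compatible("not/allowed")
```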
Demo usage with AWS
-------------------
To see how OIDC tokens can be used, we’ll demo a simple Function that lists
objects in an S3 bucket.
### Step 0: Understand your OIDC claims
Before we can configure OIDC policies, we need to know what claims we can match
against. We can run a Function and inspect its claims to find out.
```
app = modal.App("oidc-token-test")
jwt_image = modal.Image.debian_slim().pip_install("pyjwt")
@app.function(image=jwt_image)
def jwt_return_func():
import jwt
token = os.environ["MODAL_IDENTITY_TOKEN"]
claims = jwt.decode(token, options={"verify_signature": False})
print(json.dumps(claims, indent=2))
@app.local_entrypoint()
def main():
jwt_return_func.remote()
```
Run the function locally to see its claims:
```
$ modal run oidc-token-test.py
{
  "sub": "modal:workspace_id:ac-12345abcd:environment_name:modal-examples:app_name:oidc-token-test:function_name:jwt_return_func:container_id:ta-12345abcd",
  "aud": "oidc.modal.com",
  "exp": 1732137751,
  "iat": 1731964951,
  "iss": "https://oidc.modal.com",
  "jti": "31f92dca-e847-4bc9-8d15-9f234567a123",
  "workspace_id": "ac-12345abcd",
  "environment_id": "en-12345abcd",
  "environment_name": "modal-examples",
  "app_id": "ap-12345abcd",
  "app_name": "oidc-token-test",
  "function_id": "fu-12345abcd",
  "function_name": "jwt_return_func",
  "container_id": "ta-12345abcd"
}
```
Now we can match on these claims to configure our OIDC policies.
### Step 1: Configure AWS to trust Modal’s OIDC provider
We need to make AWS accept Modal identity tokens. To do this, we need to add
Modal’s OIDC provider as a trusted entity in our AWS account.
```
# Optionally include the thumbprint from the discovery document. Note that it
# may change over time as we rotate keys, and the argument can be omitted if
# you'd prefer to rely on the HTTPS verification instead.
aws iam create-open-id-connect-provider \
    --url https://oidc.modal.com \
    --client-id-list oidc.modal.com \
    --thumbprint-list "<thumbprint>"
```
This will trigger AWS to pull down our [JSON Web Key Set (JWKS)](https://auth0.com/docs/secure/tokens/json-web-tokens/json-web-key-sets) and use it to verify the signatures of any tokens signed by Modal.
### Step 2: Create an IAM role that can be assumed by Modal Functions
Let’s create a simple IAM policy that allows listing objects in an S3 bucket.
Take the policy below and replace the bucket name with your own.
```
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": ["s3:PutObject", "s3:GetObject", "s3:ListBucket"],
      "Resource": ["arn:aws:s3:::fun-bucket", "arn:aws:s3:::fun-bucket/*"]
    }
  ]
}
```
Now, we can create an IAM role that uses this policy. Visit the IAM console
to create this role. Be sure to replace the account ID and workspace ID placeholders
with your own.
```
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Federated": "arn:aws:iam::123456789abcd:oidc-provider/oidc.modal.com"
      },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringEquals": {
          "oidc.modal.com:aud": "oidc.modal.com"
        },
        "StringLike": {
          "oidc.modal.com:sub": "modal:workspace_id:ac-12345abcd:*"
        }
      }
    }
  ]
}
```
Note how we use `workspace_id` to limit the scope of the role. This means that
the IAM role can only be assumed by Functions in your Workspace. You can further
limit this by specifying an Environment, App, or Function name.
Ideally, we would use the custom claims for role limiting. Unfortunately, AWS
does not support [matching on custom claims](https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_iam-condition-keys.html#condition-keys-wif),
so we use the `sub` claim instead.
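For example, since the `sub` claim encodes the Environment and App name (see the token above), a narrower condition on the trust policy might look like the following sketch, reusing the placeholder workspace ID, Environment, and App name from the example token:
```
"Condition": {
  "StringEquals": {
    "oidc.modal.com:aud": "oidc.modal.com"
  },
  "StringLike": {
    "oidc.modal.com:sub": "modal:workspace_id:ac-12345abcd:environment_name:modal-examples:app_name:oidc-token-test:*"
  }
}
```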
### Step 3: Use the OIDC token in your Function
The AWS SDKs have built-in support for OIDC tokens, so you can use them as
follows:
```
import os

import boto3
import modal

app = modal.App("oidc-token-test")

boto3_image = modal.Image.debian_slim().pip_install("boto3")

# Trade a Modal OIDC token for AWS credentials
def get_s3_client(role_arn):
    sts_client = boto3.client("sts")
    # Assume role with Web Identity
    credential_response = sts_client.assume_role_with_web_identity(
        RoleArn=role_arn,
        RoleSessionName="OIDCSession",
        WebIdentityToken=os.environ["MODAL_IDENTITY_TOKEN"],
    )
    # Extract credentials
    credentials = credential_response["Credentials"]
    return boto3.client(
        "s3",
        aws_access_key_id=credentials["AccessKeyId"],
        aws_secret_access_key=credentials["SecretAccessKey"],
        aws_session_token=credentials["SessionToken"],
    )

# List the contents of an S3 bucket
@app.function(image=boto3_image)
def list_bucket_contents(bucket_name, role_arn):
    s3_client = get_s3_client(role_arn)
    response = s3_client.list_objects_v2(Bucket=bucket_name)
    for obj in response["Contents"]:
        print(f"- {obj['Key']} (Size: {obj['Size']} bytes)")

@app.local_entrypoint()
def main():
    # Replace with the role ARN and bucket name from step 2
    list_bucket_contents.remote("fun-bucket", "arn:aws:iam::123456789abcd:role/oidc_test_role")
```
Run the function locally to see the contents of the bucket:
```
$ modal run oidc-token-test.py
- test-file.txt (Size: 10 bytes)
```
Next steps
----------
The OIDC integration can be used for much more than just AWS. With this same pattern,
you can configure automatic access to [Vault](https://developer.hashicorp.com/vault/docs/auth/jwt), [GCP](https://cloud.google.com/identity-platform/docs/web/oidc), [Azure](https://learn.microsoft.com/en-us/entra/identity-platform/v2-protocols-oidc), and more.
Storing model weights on Modal
==============================
Efficiently managing the weights of large models is crucial for optimizing the
build times and startup latency of many ML and AI applications.
Our recommended method for working with model weights is to store them in a Modal [Volume](volumes.html),
which acts as a distributed file system, a “shared disk” all of your Modal Functions can access.
Storing weights in a Modal Volume
---------------------------------
To store your model weights in a Volume, you need to either
make the Volume available to a Modal Function that saves the model weights
or upload the model weights into the Volume from a client.
### Saving model weights into a Modal Volume from a Modal Function
If you’re already generating the weights on Modal, you just need to
attach the Volume to your Modal Function, making it available for reading and writing:
```
from pathlib import Path

import modal

volume = modal.Volume.from_name("model-weights-vol", create_if_missing=True)
MODEL_DIR = Path("/models")

@app.function(gpu="any", volumes={MODEL_DIR: volume})  # attach the Volume
def train_model(data, config):
    import run_training

    model = run_training(config, data)
    model.save(config, MODEL_DIR)
```
Volumes are attached by including them in a dictionary that maps
a path on the remote machine to a `modal.Volume` object.
They look just like a normal file system, so model weights can be saved to them
without adding any special code.
If the model weights are generated outside of Modal and made available
over the Internet, for example by an open-weights model provider
or your own training job on a dedicated cluster,
you can also download them into a Volume from a Modal Function:
```
@app.function(volumes={MODEL_DIR: volume})
def download_model(model_id):
    import model_hub

    model_hub.download(model_id, local_dir=MODEL_DIR / model_id)
```
Add [Modal Secrets](secrets.html) to access weights that require authentication.
See [below](#storing-weights-from-the-hugging-face-hub-on-modal) for
more on downloading from the popular Hugging Face Hub.
### Uploading model weights into a Modal Volume
Instead of pulling weights into a Modal Volume from inside a Modal Function,
you might wish to push weights into Modal from a client,
like your laptop or a dedicated training cluster.
For that, you can use the `batch_upload` method of [`modal.Volume`](../reference/modal.Volume.html)s
via the Modal Python client library:
```
volume = modal.Volume.from_name("model-weights-vol", create_if_missing=True)

@app.local_entrypoint()
def main(local_path: str, remote_path: str):
    with volume.batch_upload() as upload:
        upload.put_directory(local_path, remote_path)
```
Alternatively, you can upload model weights using the [`modal volume`](../reference/cli/volume.html) CLI command:
```
modal volume put model-weights-vol path/to/model path/on/volume
```
### Mounting cloud buckets as Modal Volumes
If your model weights are already in cloud storage,
for example in an S3 bucket, you can connect them
to Modal Functions with a `CloudBucketMount`.
See [the guide](cloud-bucket-mounts.html) for details.
Reading model weights from a Modal Volume
-----------------------------------------
You can read weights from a Volume as you would normally read them
from disk, so long as you attach the Volume to your Function.
```
@app.function(gpu="any", volumes={MODEL_DIR: volume})
def inference(prompt, model_id):
import load_model
model = load_model(MODEL_DIR / model_id)
model.run(prompt)
```
Storing weights in the Modal Image
----------------------------------
It is also possible to store weights in your Function’s Modal [Image](images.html),
the private file system state that a Function sees when it starts up.
The weights might be downloaded via shell commands with [`Image.run_commands`](images.html) or downloaded using a Python function with [`Image.run_function`](images.html).
We recommend storing model weights in a Modal [Volume](volumes.html),
as described [above](#storing-weights-in-a-modal-volume). Performance is similar
for the two methods, but Volumes are more flexible.
Images are rebuilt when their definition changes, starting from the changed layer,
which increases reproducibility for some builds but leads to unnecessary extra downloads
in most cases.
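If you do want to bake weights into the Image, a minimal sketch using `Image.run_function` might look like the following; it reuses the tiny test model ID from the Hugging Face snippet below, and the download function runs once at image build time:
```
import modal

MODEL_ID = "hf-internal-testing/tiny-random-GPTNeoXForCausalLM"

def download_weights():
    # Runs at image build time; the downloaded files become part of the Image.
    from huggingface_hub import snapshot_download

    snapshot_download(repo_id=MODEL_ID, local_dir=f"/models/{MODEL_ID}")

image = (
    modal.Image.debian_slim()
    .pip_install("huggingface_hub[hf_transfer]")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(download_weights)
)
```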
Optimizing model weight reads with `@enter`
-------------------------------------------
In the above code samples, weights are loaded from disk into memory each time
the `inference` function is run. This isn’t so bad if inference is much
slower than model loading (e.g. it is run on very large datasets)
or if the model loading logic is smart enough to skip reloading.
To guarantee a particular model’s weights are only loaded once, you can use the `@enter` [container lifecycle hook](lifecycle-functions.html) to load the weights only when a new container starts.
```
MODEL_ID = "some-model-id"
@app.cls(gpu="any", volumes={MODEL_DIR: volume})
class Model:
@modal.enter()
def setup(self, model_id=MODEL_ID):
import load_model
self.model = load_model(MODEL_DIR, model_id)
@modal.method()
def inference(self, prompt):
return self.model.run(prompt)
```
Note that methods decorated with `@enter` can’t be passed dynamic arguments.
If you need to load a single but possibly different model on each container start, you can [parametrize](parametrized-functions.html) your Modal Cls.
Below, we use the `modal.parameter` syntax.
```
@app.cls(gpu="any", volumes={MODEL_DIR: volume})
class ParametrizedModel:
model_id: str = modal.parameter()
@modal.enter()
def setup(self):
import load_model
self.model = load_model(MODEL_DIR, self.model_id)
@modal.method()
def inference(self, prompt):
return self.model.run(prompt)
```
Storing weights from the Hugging Face Hub on Modal
--------------------------------------------------
The [Hugging Face Hub](https://huggingface.co/models) has over 1,000,000 models
with weights available for download.
The snippet below shows some additional tricks for downloading models
from the Hugging Face Hub on Modal.
```
from pathlib import Path

import modal

app = modal.App()

# create a Volume, or retrieve it if it exists
volume = modal.Volume.from_name("model-weights-vol", create_if_missing=True)
MODEL_DIR = Path("/models")

# define dependencies for downloading model
download_image = (
    modal.Image.debian_slim()
    .pip_install("huggingface_hub[hf_transfer]")  # install fast Rust download client
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # and enable it
)

# define dependencies for running model
inference_image = modal.Image.debian_slim().pip_install("transformers")

@app.function(
    volumes={MODEL_DIR: volume},  # "mount" the Volume, sharing it with your function
    image=download_image,  # only download dependencies needed here
)
def download_model(
    repo_id: str = "hf-internal-testing/tiny-random-GPTNeoXForCausalLM",
    revision: str = None,  # include a revision to prevent surprises!
):
    from huggingface_hub import snapshot_download

    snapshot_download(repo_id=repo_id, revision=revision, local_dir=MODEL_DIR / repo_id)
    print(f"Model downloaded to {MODEL_DIR / repo_id}")
```
Continuous deployment
=====================
It’s a common pattern to auto-deploy your Modal App as part of a CI/CD pipeline.
To get you started, below is a guide to doing continuous deployment of a Modal
App in GitHub.
GitHub Actions
--------------
Here’s a sample GitHub Actions workflow that deploys your App on every push to
the `main` branch.
This requires you to create a [Modal token](https://modal.com/settings/tokens) and add it as a [secret for your GitHub Actions workflow](https://github.com/Azure/actions-workflow-samples/blob/master/assets/create-secrets-for-GitHub-workflows.md).
After setting up secrets, create a new workflow file in your repository at `.github/workflows/ci-cd.yml` with the following contents:
```
name: CI/CD

on:
  push:
    branches:
      - main

jobs:
  deploy:
    name: Deploy
    runs-on: ubuntu-latest
    env:
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal

      - name: Deploy job
        run: |
          modal deploy -m my_package.my_file
```
Be sure to replace `my_package.my_file` with your actual entrypoint.
If you use multiple Modal [Environments](environments.html), you can
additionally specify the target environment in the YAML using `MODAL_ENVIRONMENT=xyz`.
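For example, a sketch of the job-level `env` block above extended with a target Environment (the `dev` name is just an example):
```
env:
  MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
  MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
  MODAL_ENVIRONMENT: dev
```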
Running untrusted code in Functions
===================================
Modal provides two primitives for running untrusted code: Restricted Functions and [Sandboxes](sandbox.html). While both can be used for running untrusted code, they serve different purposes: Sandboxes provide a container-like interface while Restricted Functions provide an interface similar to a traditional Function.
Restricted Functions are useful for executing:
* Code generated by language models (LLMs)
* User-submitted code in interactive environments
* Third-party plugins or extensions
Using `restrict_modal_access`
-----------------------------
To restrict a Function’s access to Modal resources, set `restrict_modal_access=True` on the Function definition:
```
import modal

app = modal.App()

@app.function(restrict_modal_access=True)
def run_untrusted_code(code_input: str):
    # This function cannot access Modal resources
    return eval(code_input)
```
When `restrict_modal_access` is enabled:
* The Function cannot access Modal resources (Queues, Dicts, etc.)
* The Function cannot call other Functions
* The Function cannot access Modal’s internal APIs
Comparison with Sandboxes
-------------------------
While both `restrict_modal_access` and [Sandboxes](sandbox.html) can be used for running untrusted code, they serve different purposes:
| Feature | Restricted Function | Sandbox |
| --- | --- | --- |
| State | Stateless | Stateful |
| Interface | Function-like | Container-like |
| Setup | Simple decorator | Requires explicit creation/termination |
| Use case | Quick, isolated code execution | Interactive development, long-running sessions |
Best Practices
--------------
When running untrusted code, consider these additional security measures:
1. Use `max_inputs=1` to ensure each container only handles one request. Containers that get reused could cause information leakage between users.
```
@app.function(restrict_modal_access=True, max_inputs=1)
def isolated_function(input_data):
    # Each input gets a fresh container
    return process(input_data)
```
2. Set appropriate timeouts to prevent long-running operations:
```
@app.function(
    restrict_modal_access=True,
    timeout=30,  # 30 second timeout
    max_inputs=1,
)
def time_limited_function(input_data):
    return process(input_data)
```
3. Consider using `block_network=True` to prevent the container from making outbound network requests:
```
@app.function(
    restrict_modal_access=True,
    block_network=True,
    max_inputs=1,
)
def network_isolated_function(input_data):
    return process(input_data)
```
Example: Running LLM-generated Code
-----------------------------------
Below is a complete example of running code generated by a language model:
```
import modal

app = modal.App("restricted-access-example")

@app.function(restrict_modal_access=True, max_inputs=1, timeout=30, block_network=True)
def run_llm_code(generated_code: str):
    try:
        # Create a restricted environment
        execution_scope = {}
        # Execute the generated code
        exec(generated_code, execution_scope)
        # Return the result if it exists
        return execution_scope.get("result", None)
    except Exception as e:
        return f"Error executing code: {str(e)}"

@app.local_entrypoint()
def main():
    # Example LLM-generated code
    code = """
def calculate_fibonacci(n):
    if n <= 1:
        return n
    return calculate_fibonacci(n-1) + calculate_fibonacci(n-2)

result = calculate_fibonacci(10)
"""
    result = run_llm_code.remote(code)
    print(f"Result: {result}")
```
This example locks down the container to ensure that the code is safe to execute by:
* Restricting Modal access
* Using a fresh container for each execution
* Setting a timeout
* Blocking network access
* Catching and handling potential errors
Error Handling
--------------
When a restricted Function attempts to access Modal resources, it will raise an `AuthError`:
```
@app.function(restrict_modal_access=True)
def restricted_function(q: modal.Queue):
    try:
        # This will fail because the Function is restricted
        return q.get()
    except modal.exception.AuthError as e:
        return f"Access denied: {e}"
```
The error message will indicate that the operation is not permitted due to restricted Modal access.
Networking and security
=======================
Sandboxes are built to be secure-by-default, meaning that a default Sandbox has
no ability to accept incoming network connections or access your Modal resources.
Networking
----------
Since Sandboxes may run untrusted code, they have options to restrict their network access.
To block all network access, set `block_network=True` on [`Sandbox.create`](../reference/modal.Sandbox.html#create).
For more fine-grained networking control, a Sandbox’s outbound network access
can be restricted using the `cidr_allowlist` parameter. This parameter takes a
list of CIDR ranges that the Sandbox is allowed to access, blocking all other
outbound traffic.
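For example, a minimal sketch (assuming an App object `my_app`, as in the tunnels example below, and a placeholder CIDR range):
```
import modal

# Allow outbound connections only to the placeholder 10.0.0.0/8 range;
# all other outbound traffic from the Sandbox is blocked.
sb = modal.Sandbox.create(
    "echo", "hello",
    cidr_allowlist=["10.0.0.0/8"],
    app=my_app,
)
```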
### Forwarding ports
Sandboxes can also expose TCP ports to the internet. This is useful if,
for example, you want to connect to a web server running inside a Sandbox.
Use the `encrypted_ports` and `unencrypted_ports` parameters of `Sandbox.create` to specify which ports to forward. You can then access the public URL of a tunnel
using the [`Sandbox.tunnels`](../reference/modal.Sandbox.html#tunnels) method:
```
import time

import modal
import requests

sb = modal.Sandbox.create(
    "python",
    "-m",
    "http.server",
    "12345",
    encrypted_ports=[12345],
    app=my_app,
)

tunnel = sb.tunnels()[12345]

time.sleep(1)  # Wait for server to start.

print(f"Connecting to {tunnel.url}...")
print(requests.get(tunnel.url, timeout=5).text)
```
For more details on how tunnels work, see the [tunnels guide](tunnels.html).
Security model
--------------
In a typical Modal Function, the Function code can call other Modal APIs allowing
it to spawn containers, create and destroy Volumes, read from Dicts and Queues, etc.
Sandboxes, by contrast, are isolated from the main Modal workspace. They have no API
access, meaning the blast radius of any malicious code is limited to the Sandbox
environment.
Sandboxes are built on top of [gVisor](https://gvisor.dev/), a container runtime
by Google that provides strong isolation properties. gVisor has custom logic to
prevent Sandboxes from making malicious system calls, giving you stronger isolation
than standard [runc](https://github.com/opencontainers/runc) containers.
See it in action
[Running a Jupyter notebook](../examples/jupyter_sandbox.html)
[Safe code execution](../examples/safe_code_execution.html)
Environment variables
=====================
Runtime environment variables
-----------------------------
The Modal runtime sets several environment variables during initialization. The
keys for these environment variables are reserved and cannot be overridden by
your Function configuration.
The following variables provide information about the function’s runtime
environment:
* **`MODAL_CLOUD_PROVIDER`** — Modal executes functions across a number of cloud
providers ([AWS](https://aws.amazon.com/), [GCP](https://cloud.google.com/), [OCI](https://www.oracle.com/cloud/)). This variable specifies which cloud
provider the Modal Function is running within.
* **`MODAL_ENVIRONMENT`** — The name of the [Modal Environment](environments.html) the function is running within.
* **`MODAL_IMAGE_ID`** — The ID of the [`modal.Image`](../reference/modal.Image.html) used by the Modal Function.
* **`MODAL_IS_REMOTE`** - Set to ‘1’ to indicate that the function code is running in
a remote container.
* **`MODAL_REGION`** — This will correspond to a geographic area identifier from
the cloud provider associated with the Function (see above). For AWS, the
identifier is a “region”. For GCP it is a “zone”, and for OCI it is an
“availability domain”. Example values are `us-east-1` (AWS), `us-central1` (GCP), `us-ashburn-1` (OCI).
* **`MODAL_TASK_ID`** — The ID of the container running the Modal Function.
* **`MODAL_IDENTITY_TOKEN`** — An [OIDC token](oidc-integration.html) encoding the identity of the Modal Function.
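For instance, here is a minimal sketch of a Function that prints a few of these variables at runtime (the App name is a placeholder):
```
import os

import modal

app = modal.App("runtime-env-example")

@app.function()
def show_runtime_info():
    # These variables are set by the Modal runtime inside the container.
    for name in ("MODAL_CLOUD_PROVIDER", "MODAL_REGION", "MODAL_ENVIRONMENT", "MODAL_TASK_ID"):
        print(name, "=", os.environ.get(name))
```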
Container image environment variables
-------------------------------------
The container image layers used by a Modal Function’s `modal.Image` may set
environment variables. These variables will be present within your Function’s runtime
environment. For example, the [`debian_slim`](../reference/modal.Image.html#debian_slim) image sets the `GPG_KEY` variable.
To override image variables or set new ones, use the [`.env`](../reference/modal.Image.html#env) method provided by `modal.Image`.
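For example, a minimal sketch (the App name and the variable `MY_SETTING` are hypothetical):
```
import modal

app = modal.App("image-env-example")
image = modal.Image.debian_slim().env({"MY_SETTING": "enabled"})

@app.function(image=image)
def read_setting():
    import os

    print(os.environ["MY_SETTING"])
```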
Parametrized functions
======================
A single Modal Function can be parametrized by a set of arguments, so that each unique combination of arguments will behave like an individual
Modal Function with its own auto-scaling and lifecycle logic.
For example, you might want to have a separate pool of containers for each unique user that invokes your Function. In this scenario, you would
parametrize your Function by a user ID.
To parametrize a Modal Function, you need to use Modal’s [class syntax](lifecycle-functions.html) and the [`@app.cls`](../reference/modal.App.html#cls) decorator. Specifically, you’ll need to:
1. Convert your function to a method by making it a member of a class.
2. Decorate the class with `@app.cls(...)` with the same arguments you previously
had for `@app.function(...)` or your [web endpoint decorator](webhooks.html).
3. If you previously used the `@app.function()` decorator on your function, replace it with `@modal.method()`.
4. Define dataclass-style, type-annotated instance attributes with `modal.parameter()` and optionally set default values:
```
import modal

app = modal.App()

@app.cls()
class MyClass:
    foo: str = modal.parameter()
    bar: int = modal.parameter(default=10)

    @modal.method()
    def baz(self, qux: str = "default") -> str:
        return f"This code is running in container pool ({self.foo}, {self.bar}), with input qux={qux}"
```
The parameters create a keyword-only constructor for your class, and the methods can be called as follows:
```
@app.local_entrypoint()
def main():
    m1 = MyClass(foo="hedgehog", bar=7)
    m1.baz.remote()

    m2 = MyClass(foo="fox")
    m2.baz.remote(qux="override")
```
Function calls for each unique combination of values for `foo` and `bar` will run in their own separate container pools.
If you re-constructed a `MyClass` with the same arguments in a different context, the calls to `baz` would be routed to the same set of containers as before.
Some things to note:
* The total size of the arguments is limited to 16 KiB.
* Modal classes can still annotate types of regular class attributes, which are independent of parametrization, by either omitting `= modal.parameter()` or using `= modal.parameter(init=False)` to satisfy type checkers.
* The supported parameter types are these primitives: `str`, `int`, `bool`, and `bytes`.
* The legacy `__init__` constructor method is being removed; see [the 1.0 migration guide](modal-1-0-migration.html#removing-support-for-custom-cls-constructors) for details.
Looking up a parametrized function
----------------------------------
If you want to call your parametrized function from a Python script running
anywhere, you can use `modal.Cls.from_name`:
```
import modal
MyClass = modal.Cls.from_name("parametrized-function-app", "MyClass") # returns a class-like object
m = MyClass(foo="snake", bar=12)
m.baz.remote()
```
Parametrized web endpoints
--------------------------
Modal [web endpoints](webhooks.html) can also be parametrized:
```
app = modal.App("parametrized-endpoint")
@app.cls()
class MyClass():
foo: str = modal.parameter()
bar: int = modal.parameter(default=10)
@modal.fastapi_endpoint()
def baz(self, qux: str = "default") -> str:
...
```
Parameters are specified in the URL as query parameter values.
```
curl "https://parametrized-endpoint.modal.run?foo=hedgehog&bar=7&qux=override"
curl "https://parametrized-endpoint.modal.run?foo=hedgehog&qux=override"
curl "https://parametrized-endpoint.modal.run?foo=hedgehog&bar=7"
curl "https://parametrized-endpoint.modal.run?foo=hedgehog"
```
Using parametrized functions with lifecycle functions
-----------------------------------------------------
Parametrized functions can be used with [lifecycle functions](lifecycle-functions.html).
For example, here is how you might parametrize the [`@enter`](lifecycle-functions.html#enter) lifecycle function to load a specific model:
```
@app.cls()
class Model:
    name: str = modal.parameter()
    size: int = modal.parameter(default=100)

    @modal.enter()
    def load_model(self):
        print(f"Loading model {self.name} with size {self.size}")
        self.model = load_model_util(self.name, self.size)

    @modal.method()
    def generate(self, prompt: str) -> str:
        return self.model.generate(prompt)
```
Web endpoints
=============
This guide explains how to set up web endpoints with Modal.
All deployed Modal Functions can be [invoked from any other Python application](trigger-deployed-functions.html) using the Modal client library. We additionally provide multiple ways to expose
your Functions over the web for non-Python clients.
You can [turn any Python function into a web endpoint](#simple-endpoints) with a single line
of code, you can [serve a full app](#serving-asgi-and-wsgi-apps) using
frameworks like FastAPI, Django, or Flask, or you can [serve anything that speaks HTTP and listens on a port](#non-asgi-web-servers).
Below we walk through each method, assuming you’re familiar with web applications outside of Modal.
For a detailed walkthrough of basic web endpoints on Modal aimed at developers new to web applications,
see [this tutorial](../examples/basic_web.html).
Simple endpoints
----------------
The easiest way to create a web endpoint from an existing Python function is to use the [`@modal.fastapi_endpoint` decorator](../reference/modal.fastapi_endpoint.html).
```
image = modal.Image.debian_slim().pip_install("fastapi[standard]")

@app.function(image=image)
@modal.fastapi_endpoint()
def f():
    return "Hello world!"
```
This decorator wraps the Modal Function in a [FastAPI application](#how-do-web-endpoints-run-in-the-cloud).
*Note: Prior to v0.73.82, this function was named `@modal.web_endpoint`*.
### Developing with `modal serve`
You can run this code as an ephemeral app, by running the command
```
modal serve server_script.py
```
Where `server_script.py` is the file name of your code. This will create an
ephemeral app for the duration of your script (until you hit Ctrl-C to stop it).
It creates a temporary URL that you can use like any other REST endpoint. This
URL is on the public internet.
The `modal serve` command will live-update an app when any of its supporting
files change.
Live updating is particularly useful when working with apps containing web
endpoints, as any changes made to web endpoint handlers will show up almost
immediately, without requiring a manual restart of the app.
### Deploying with `modal deploy`
You can also deploy your app and create a persistent web endpoint in the cloud
by running `modal deploy`.
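For example, assuming your code lives in `server_script.py` as in the `modal serve` example above:
```
modal deploy server_script.py
```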
### Passing arguments to an endpoint
When using `@modal.fastapi_endpoint`, you can add [query parameters](https://fastapi.tiangolo.com/tutorial/query-params/) which
will be passed to your Function as arguments. For instance
```
image = modal.Image.debian_slim().pip_install("fastapi[standard]")

@app.function(image=image)
@modal.fastapi_endpoint()
def square(x: int):
    return {"square": x**2}
```
If you hit this with a URL-encoded query string with the `x` parameter present,
the Function will receive the value as an argument:
```
$ curl https://modal-labs--web-endpoint-square-dev.modal.run?x=42
{"square":1764}
```
If you want to use a `POST` request, you can use the `method` argument to `@modal.fastapi_endpoint` to set the HTTP verb. To accept any valid JSON object, [use `dict` as your type annotation](https://fastapi.tiangolo.com/tutorial/body-nested-models/?h=dict#bodies-of-arbitrary-dicts) and FastAPI will handle the rest.
```
image = modal.Image.debian_slim().pip_install("fastapi[standard]")

@app.function(image=image)
@modal.fastapi_endpoint(method="POST")
def square(item: dict):
    return {"square": item["x"] ** 2}
```
This now creates an endpoint that takes a JSON body:
```
$ curl -X POST -H 'Content-Type: application/json' --data-binary '{"x": 42}' https://modal-labs--web-endpoint-square-dev.modal.run
{"square":1764}
```
This is often the easiest way to get started, but note that FastAPI recommends
that you use [typed Pydantic models](https://fastapi.tiangolo.com/tutorial/body/) in order to
get automatic validation and documentation. FastAPI also lets you pass data to
web endpoints in other ways, for instance as [form data](https://fastapi.tiangolo.com/tutorial/request-forms/) and [file uploads](https://fastapi.tiangolo.com/tutorial/request-files/).
How do web endpoints run in the cloud?
--------------------------------------
Note that web endpoints, like everything else on Modal, only run when they need
to. When you hit the web endpoint the first time, it will boot up the container,
which might take a few seconds. Modal keeps the container alive for a short
period in case there are subsequent requests. If there are a lot of requests,
Modal might create more containers running in parallel.
For the shortcut `@modal.fastapi_endpoint` decorator, Modal wraps your function in a [FastAPI](https://fastapi.tiangolo.com/) application. This means that the [Image](images.html) your Function uses must have FastAPI installed, and the Functions that you write
need to follow its request and response [semantics](https://fastapi.tiangolo.com/tutorial). Web endpoint Functions can use
all of FastAPI’s powerful features, such as Pydantic models for automatic validation,
typed query and path parameters, and response types.
Here’s everything together, combining Modal’s abilities to run functions in
user-defined containers with the expressivity of FastAPI:
```
import modal
from fastapi.responses import HTMLResponse
from pydantic import BaseModel

image = modal.Image.debian_slim().pip_install("fastapi[standard]", "boto3")
app = modal.App(image=image)

class Item(BaseModel):
    name: str
    qty: int = 42

@app.function()
@modal.fastapi_endpoint(method="POST")
def f(item: Item):
    import boto3

    # do things with boto3...
    return HTMLResponse(f"<html>Hello, {item.name}!</html>")
```
This endpoint definition would be called like so:
```
curl -d '{"name": "Erik", "qty": 10}' \
-H "Content-Type: application/json" \
-X POST https://ecorp--web-demo-f-dev.modal.run
```
Or in Python with the [`requests`](https://pypi.org/project/requests/) library:
```
import requests
data = {"name": "Erik", "qty": 10}
requests.post("https://ecorp--web-demo-f-dev.modal.run", json=data, timeout=10.0)
```
Serving ASGI and WSGI apps
--------------------------
You can also serve any app written in an [ASGI](https://asgi.readthedocs.io/en/latest/) or [WSGI](https://en.wikipedia.org/wiki/Web_Server_Gateway_Interface)-compatible
web framework on Modal.
ASGI provides support for async web frameworks. WSGI provides support for
synchronous web frameworks.
### ASGI apps - FastAPI, FastHTML, Starlette
For ASGI apps, you can create a function decorated with [`@modal.asgi_app`](../reference/modal.asgi_app.html) that returns a reference to
your web app:
```
image = modal.Image.debian_slim().pip_install("fastapi[standard]")

@app.function(image=image)
@modal.concurrent(max_inputs=100)
@modal.asgi_app()
def fastapi_app():
    from fastapi import FastAPI, Request

    web_app = FastAPI()

    @web_app.post("/echo")
    async def echo(request: Request):
        body = await request.json()
        return body

    return web_app
```
Now, as before, when you deploy this script as a Modal App, you get a URL for
your app that you can hit.
The `@modal.concurrent` decorator enables a single container
to process multiple inputs at once, taking advantage of the asynchronous
event loops in ASGI applications. See [this guide](concurrent-inputs.html) for details.
#### ASGI Lifespan
While we recommend using [`@modal.enter`](lifecycle-functions.html#enter) for defining container lifecycle hooks, we also support the [ASGI lifespan protocol](https://asgi.readthedocs.io/en/latest/specs/lifespan.html). Lifespans begin when containers start, typically at the time of the first request. Here’s an example using [FastAPI](https://fastapi.tiangolo.com/advanced/events/#lifespan):
```
import modal

app = modal.App("fastapi-lifespan-app")

image = modal.Image.debian_slim().pip_install("fastapi[standard]")

@app.function(image=image)
@modal.asgi_app()
def fastapi_app_with_lifespan():
    from contextlib import asynccontextmanager

    from fastapi import FastAPI, Request

    @asynccontextmanager
    async def lifespan(wapp: FastAPI):
        print("Starting")
        yield
        print("Shutting down")

    web_app = FastAPI(lifespan=lifespan)

    @web_app.get("/")
    async def hello(request: Request):
        return "hello"

    return web_app
```
### WSGI apps - Django, Flask
You can serve WSGI apps using the [`@modal.wsgi_app`](../reference/modal.wsgi_app.html) decorator:
```
image = modal.Image.debian_slim().pip_install("flask")
@app.function(image=image)
@modal.concurrent(max_inputs=100)
@modal.wsgi_app()
def flask_app():
from flask import Flask, request
web_app = Flask(__name__)
@web_app.post("/echo")
def echo():
return request.json
return web_app
```
See [Flask’s docs](https://flask.palletsprojects.com/en/2.1.x/deploying/asgi/) for more information on using Flask as a WSGI app.
Because WSGI apps are synchronous, concurrent inputs will be run on separate
threads. See [this guide](concurrent-inputs.html) for details.
Non-ASGI web servers
--------------------
Not all web frameworks offer an ASGI or WSGI interface. For example, [`aiohttp`](https://docs.aiohttp.org/) and [`tornado`](https://www.tornadoweb.org/) use their own asynchronous network binding, while others like [`text-generation-inference`](https://github.com/huggingface/text-generation-inference) actually expose a Rust-based HTTP server running as a subprocess.
For these cases, you can use the [`@modal.web_server`](../reference/modal.web_server.html) decorator to “expose” a
port on the container:
```
@app.function()
@modal.concurrent(max_inputs=100)
@modal.web_server(8000)
def my_file_server():
    import subprocess

    subprocess.Popen("python -m http.server -d / 8000", shell=True)
```
Just like all web endpoints on Modal, this is only run on-demand. The function
is executed on container startup, creating a file server at the root directory.
When you hit the web endpoint URL, your request will be routed to the file
server listening on port `8000`.
For `@web_server` endpoints, you need to make sure that the application binds to
the external network interface, not just localhost. This usually means binding
to `0.0.0.0` instead of `127.0.0.1`.
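For example, a minimal sketch of the file server above with an explicit bind address:
```
@app.function()
@modal.concurrent(max_inputs=100)
@modal.web_server(8000)
def my_file_server():
    import subprocess

    # Bind to all interfaces so Modal can route incoming requests to the process.
    subprocess.Popen("python -m http.server --bind 0.0.0.0 -d / 8000", shell=True)
```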
See our examples of how to serve [Streamlit](../examples/serve_streamlit.html) and [ComfyUI](../examples/comfyapp.html) on Modal.
Serve many configurations with parametrized functions
-----------------------------------------------------
Python functions that launch ASGI/WSGI apps or web servers on Modal
cannot take arguments.
One simple pattern for allowing client-side configuration of these web endpoints
is to use [parametrized functions](parametrized-functions.html).
Each different choice for the values of the parameters will create a distinct
auto-scaling container pool.
```
@app.cls()
@modal.concurrent(max_inputs=100)
class Server:
    root: str = modal.parameter(default=".")

    @modal.web_server(8000)
    def files(self):
        import subprocess

        subprocess.Popen(f"python -m http.server -d {self.root} 8000", shell=True)
```
The values are provided in URLs as query parameters:
```
curl https://ecorp--server-files.modal.run # use the default value
curl https://ecorp--server-files.modal.run?root=.cache # use a different value
curl https://ecorp--server-files.modal.run?root=%2F # don't forget to URL encode!
```
For details, see [this guide to parametrized functions](parametrized-functions.html).
WebSockets
----------
Functions annotated with `@web_server`, `@asgi_app`, or `@wsgi_app` also support
the WebSocket protocol. Consult your web framework for appropriate documentation
on how to use WebSockets with that library.
WebSockets on Modal maintain a single function call per connection, which can be
useful for keeping state around. Most of the time, you will want to set your
handler function to [allow concurrent inputs](concurrent-inputs.html),
which allows multiple simultaneous WebSocket connections to be handled by the
same container.
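For example, here is a minimal echo-server sketch using FastAPI’s WebSocket support behind `@modal.asgi_app`, assuming the same App and `image` setup as the ASGI example above:
```
@app.function(image=image)
@modal.concurrent(max_inputs=100)
@modal.asgi_app()
def websocket_app():
    from fastapi import FastAPI, WebSocket, WebSocketDisconnect

    web_app = FastAPI()

    @web_app.websocket("/ws")
    async def echo(ws: WebSocket):
        await ws.accept()
        try:
            while True:
                # Echo each text message back over the same connection.
                await ws.send_text(await ws.receive_text())
        except WebSocketDisconnect:
            pass

    return web_app
```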
We support the full WebSocket protocol as per [RFC 6455](https://www.rfc-editor.org/rfc/rfc6455), but we do not yet have
support for [RFC 8441](https://www.rfc-editor.org/rfc/rfc8441) (WebSockets over
HTTP/2) or [RFC 7692](https://datatracker.ietf.org/doc/html/rfc7692) (`permessage-deflate` extension). WebSocket messages can be up to 2 MiB each.
Performance and scaling
-----------------------
If you have no active containers when the web endpoint receives a request, it will
experience a “cold start”. Consult the guide page on [cold start performance](cold-start.html) for more information on when
Functions will cold start and advice on how to mitigate the impact.
If your Function uses `@modal.concurrent`, multiple requests to the same
endpoint may be handled by the same container, up to its configured concurrency limit.
Beyond that limit, additional containers will start up to scale your App horizontally.
When you reach the Function’s limit on containers, requests will queue for handling.
Each workspace on Modal has a rate limit on total operations. For a new account,
this is set to 200 function inputs or web endpoint requests per second, with a
burst multiplier of 5 seconds. If you reach the rate limit, excess requests to
web endpoints will return a [429 status code](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429),
and you’ll need to [get in touch](mailto:[email protected]) with us about
raising the limit.
Web endpoint request bodies can be up to 4 GiB, and their response bodies are
unlimited in size.
Authentication
--------------
Modal offers first-class web endpoint protection via [proxy auth tokens](webhook-proxy-auth.html).
Proxy auth tokens protect web endpoints by requiring a key and token combination to be passed
in the `Modal-Key` and `Modal-Secret` headers.
Modal works as a proxy, rejecting requests that aren’t authorized to access
your endpoint.
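With proxy auth enabled, a request might look like this (the token values and endpoint URL are placeholders):
```
curl -H "Modal-Key: <proxy-token-id>" \
     -H "Modal-Secret: <proxy-token-secret>" \
     https://my-workspace--my-endpoint.modal.run
```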
We also support standard techniques for securing web servers.
### Token-based authentication
This is easy to implement in whichever framework you’re using. For example, if
you’re using `@modal.fastapi_endpoint` or `@modal.asgi_app` with FastAPI, you
can validate a Bearer token like this:
```
from fastapi import Depends, HTTPException, status, Request
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import modal

image = modal.Image.debian_slim().pip_install("fastapi[standard]")
app = modal.App("auth-example", image=image)

auth_scheme = HTTPBearer()

@app.function(secrets=[modal.Secret.from_name("my-web-auth-token")])
@modal.fastapi_endpoint()
async def f(request: Request, token: HTTPAuthorizationCredentials = Depends(auth_scheme)):
    import os

    print(os.environ["AUTH_TOKEN"])
    if token.credentials != os.environ["AUTH_TOKEN"]:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect bearer token",
            headers={"WWW-Authenticate": "Bearer"},
        )
    # Function body
    return "success!"
```
This assumes you have a [Modal Secret](https://modal.com/secrets) named `my-web-auth-token` created, with contents `{AUTH_TOKEN: secret-random-token}`.
Now, your endpoint will return a 401 status code except when you hit it with the
correct `Authorization` header set (note that you have to prefix the token with `Bearer`):
```
curl --header "Authorization: Bearer secret-random-token" https://modal-labs--auth-example-f.modal.run
```
### Client IP address
You can access the IP address of the client making the request. This can be used
for geolocation, whitelists, blacklists, and rate limits.
```
from fastapi import Request
import modal
image = modal.Image.debian_slim().pip_install("fastapi[standard]")
app = modal.App(image=image)
@app.function()
@modal.fastapi_endpoint()
def get_ip_address(request: Request):
    return f"Your IP address is {request.client.host}"
```
Fully featured web apps
[LLM Voice Chat (React)](../examples/llm-voice-chat.html)
[Stable Diffusion (Alpine)](../examples/stable_diffusion_cli.html)
[Whisper Podcast Transcriber (React)](../examples/whisper-transcriber.html)
GPU Metrics
===========
Modal exposes a number of GPU metrics that help monitor the health and utilization of the GPUs you’re using.
* **GPU utilization %** is the percentage of time that the GPU was executing at least one CUDA kernel. This is the same metric reported as utilization by [`nvidia-smi`](../../gpu-glossary/host-software/nvidia-smi.html). GPU utilization is helpful for determining the amount of time GPU work is blocked on CPU work, like PyTorch compute graph construction or input processing. However, it is far from indicating what fraction of the GPU’s computing firepower (FLOPS or memory throughput, [CUDA Cores](../../gpu-glossary/device-hardware/cuda-core.html), [SMs](../../gpu-glossary/device-hardware/streaming-multiprocessor.html)) is being used. See [this blog post](https://arthurchiao.art/blog/understanding-gpu-performance) for details.
* **GPU power utilization %** is the percentage of the maximum power draw that the device is currently drawing. When aggregating across containers, we also report **Total GPU power usage** in Watts. Because high-performance GPUs are [fundamentally limited by power draw](https://www.thonking.ai/p/strangely-matrix-multiplications), both for computation and memory access, the power usage can be used as a proxy of how much work the GPU is doing. A fully-saturated GPU should draw at or near its entire power budget (which can also be found by running `nvidia-smi`).
* **GPU temperature** is the temperature measured on the die of the GPU. Like power draw, which is the source of the thermal energy, the ability to efflux heat is a fundamental limit on GPU performance: continuing to draw full power without removing the waste heat would damage the system. At the highest temperatures readily observed in proper GPU deployments (i.e. mid-70s Celsius for an H100), increased error correction from thermal noise can already reduce performance. Generally, power utilization is a better proxy for performance, but we report temperature for completeness.
* **GPU memory used** is the amount of memory allocated on the GPU, in bytes.
In general, these metrics are useful signals or correlates of performance, but can’t be used to directly debug performance issues. Instead, we (and [the manufacturers!](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#assess-parallelize-optimize-deploy)) recommend tracing and profiling workloads. See [this example](../examples/torch_profiling.html) of profiling PyTorch applications on Modal.
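If you want to sample the same raw measurements from inside one of your own containers, you can query `nvidia-smi` directly. Below is a minimal sketch; the App name and GPU type are illustrative, and it assumes an NVIDIA GPU is attached to the Function:
```
import subprocess
import modal

app = modal.App("gpu-metrics-sketch")  # illustrative App name

@app.function(gpu="t4")  # illustrative GPU choice
def sample_gpu_metrics() -> str:
    # Ask nvidia-smi for the same quantities Modal charts:
    # utilization, power draw, temperature, and memory used.
    out = subprocess.run(
        [
            "nvidia-smi",
            "--query-gpu=utilization.gpu,power.draw,temperature.gpu,memory.used",
            "--format=csv,noheader",
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    return out.stdout.strip()
```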
Input concurrency
=================
As traffic to your application increases, Modal will automatically scale up the
number of containers running your Function:
By default, each container will be assigned one input at a time. Autoscaling
across containers allows your Function to process inputs in parallel. This is
ideal when the operations performed by your Function are CPU-bound.
For some workloads, though, it is inefficient for containers to process inputs
one-by-one. Modal supports these workloads with its *input concurrency* feature,
which allows individual containers to process multiple inputs at the same time:
When used effectively, input concurrency can reduce latency and lower costs.
Use cases
---------
Input concurrency can be especially effective for workloads that are primarily
I/O-bound, e.g.:
* Querying a database
* Making external API requests
* Making remote calls to other Modal Functions
For such workloads, individual containers may be able to concurrently process
large numbers of inputs with minimal additional latency. This means that your
Modal application will be more efficient overall, as it won’t need to scale
containers up and down as traffic ebbs and flows.
Another use case is to leverage *continuous batching* on GPU-accelerated
containers. Frameworks such as [vLLM](../examples/vllm_inference.html) can
achieve the benefits of batching across multiple inputs even when those
inputs do not arrive simultaneously (because new batches are formed for each
forward pass of the model).
Note that for CPU-bound workloads, input concurrency will likely not be as
effective (or will even be counterproductive), and you may want to use
Modal’s [*dynamic batching* feature](dynamic-batching.html) instead.
Enabling input concurrency
--------------------------
To enable input concurrency, add the `@modal.concurrent` decorator:
```
@app.function()
@modal.concurrent(max_inputs=100)
def my_function(input: str):
...
```
When using the class pattern, the decorator should be applied at the level of
the *class*, not on individual methods:
```
@app.cls()
@modal.concurrent(max_inputs=100)
class MyCls:
@modal.method()
def my_method(self, input: str):
...
```
Because all methods on a class will be served by the same containers, a class
with input concurrency enabled will concurrently run distinct methods in
addition to multiple inputs for the same method.
**Note:** The `@modal.concurrent` decorator was added in v0.73.148 of the Modal
Python SDK. Input concurrency could previously be enabled by setting the `allow_concurrent_inputs` parameter on the `@app.function` decorator.
Setting a concurrency target
----------------------------
When using the `@modal.concurrent` decorator, you must always configure the
maximum number of inputs that each container will concurrently process. If
demand exceeds this limit, Modal will automatically scale up more containers.
Additional inputs may need to queue up while these additional containers cold
start. To help avoid degraded latency during scaleup, the `@modal.concurrent` decorator has a separate `target_inputs` parameter. When set, Modal’s autoscaler
will aim for this target as it provisions resources. If demand increases faster
than new containers can spin up, the active containers will be allowed to burst
above the target up to the `max_inputs` limit:
```
@app.function()
@modal.concurrent(max_inputs=120, target_inputs=100) # Allow a 20% burst
def my_function(input: str):
...
```
It may take some experimentation to find the right settings for these parameters
in your particular application. Our suggestion is to set the `target_inputs` based on your desired latency and the `max_inputs` based on resource constraints
(i.e., to avoid GPU OOM). You may also consider the relative latency cost of
scaling up a new container versus overloading the existing containers.
Concurrency mechanisms
----------------------
Modal uses different concurrency mechanisms to execute your Function depending
on whether it is defined as synchronous or asynchronous. Each mechanism imposes
certain requirements on the Function implementation. Input concurrency is an
advanced feature, and it’s important to make sure that your implementation
complies with these requirements to avoid unexpected behavior.
For synchronous Functions, Modal will execute concurrent inputs on separate
threads. *This means that the Function implementation must be thread-safe.*
```
# Each container can execute up to 10 inputs in separate threads
@app.function()
@modal.concurrent(max_inputs=10)
def sleep_sync():
# Function must be thread-safe
time.sleep(1)
```
For asynchronous Functions, Modal will execute concurrent inputs using
separate `asyncio` tasks on a single thread. This does not require thread
safety, but it does mean that the Function needs to participate in
collaborative multitasking (i.e., it should not block the event loop).
```
# Each container can execute up to 10 inputs with separate async tasks
@app.function()
@modal.concurrent(max_inputs=10)
async def sleep_async():
# Function must not block the event loop
await asyncio.sleep(1)
```
Gotchas
-------
Input concurrency is a powerful feature, but there are a few caveats that can
be useful to be aware of before adopting it.
### Input cancellations
Synchronous and asynchronous Functions handle input cancellations differently.
Modal will raise a `modal.exception.InputCancellation` exception in synchronous
Functions and an `asyncio.CancelledError` in asynchronous Functions.
When using input concurrency with a synchronous Function, a single input
cancellation will terminate the entire container. If your workflow depends on
graceful input cancellations, we recommend using an asynchronous
implementation.
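For example, an asynchronous Function can catch the cancellation for a single input and clean up without affecting the other inputs running in the same container. A minimal sketch (the App name and workload are illustrative):
```
import asyncio
import modal

app = modal.App("cancellation-sketch")  # illustrative App name

@app.function()
@modal.concurrent(max_inputs=10)
async def cancellable_work(x: int):
    try:
        await asyncio.sleep(60)  # stand-in for long-running awaitable work
    except asyncio.CancelledError:
        # Clean up state for this input only; other concurrent inputs
        # in the same container keep running.
        print(f"input {x} was cancelled")
        raise
```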
### Concurrent logging
The separate threads or tasks that are executing the concurrent inputs will
write any logs to the same stream. This makes it difficult to associate logs
with a specific input, and filtering for a specific function call in Modal’s web
dashboard will show logs for all inputs running at the same time.
To work around this, we recommend including a unique identifier in the messages
you log (either your own identifier or the `modal.current_input_id()`) so that
you can use the search functionality to surface logs for a specific input:
```
@app.function()
@modal.concurrent(max_inputs=10)
async def better_concurrent_logging(x: int):
logger.info(f"{modal.current_input_id()}: Starting work with {x}")
```
Job processing
==============
Modal can be used as a scalable job queue to handle asynchronous tasks submitted
from a web app or any other Python application. This allows you to offload up to 1 million
long-running or resource-intensive tasks to Modal, while your main application
remains responsive.
Creating jobs with .spawn()
---------------------------
The basic pattern for using Modal as a job queue involves three key steps:
1. Defining and deploying the job processing function using `modal deploy`.
2. Submitting a job using [`modal.Function.spawn()`](../reference/modal.Function.html#spawn)
3. Polling for the job’s result using [`modal.FunctionCall.get()`](../reference/modal.FunctionCall.html#get)
Here’s a simple example that you can run with `modal run my_job_queue.py`:
```
# my_job_queue.py
import modal
app = modal.App("my-job-queue")
@app.function()
def process_job(data):
# Perform the job processing here
return {"result": data}
def submit_job(data):
# Since the `process_job` function is deployed, need to first look it up
process_job = modal.Function.from_name("my-job-queue", "process_job")
call = process_job.spawn(data)
return call.object_id
def get_job_result(call_id):
function_call = modal.FunctionCall.from_id(call_id)
try:
result = function_call.get(timeout=5)
except modal.exception.OutputExpiredError:
result = {"result": "expired"}
except TimeoutError:
result = {"result": "pending"}
return result
@app.local_entrypoint()
def main():
data = "my-data"
# Submit the job to Modal
call_id = submit_job(data)
print(get_job_result(call_id))
```
In this example:
* `process_job` is the Modal function that performs the actual job processing.
To deploy the `process_job` function on Modal, run `modal deploy my_job_queue.py`.
* `submit_job` submits a new job by first looking up the deployed `process_job` function, then calling `.spawn()` with the job data. It returns the unique ID
of the spawned function call.
* `get_job_result` attempts to retrieve the result of a previously submitted job
using [`FunctionCall.from_id()`](../reference/modal.FunctionCall.html#from_id) and [`FunctionCall.get()`](../reference/modal.FunctionCall.html#get). [`FunctionCall.get()`](../reference/modal.FunctionCall.html#get) waits indefinitely
by default. It takes an optional timeout argument that specifies the maximum
number of seconds to wait, which can be set to 0 to poll for an output
immediately. Here, if the job hasn’t completed yet, we return a pending
response.
* The results of a `.spawn()` are accessible via `FunctionCall.get()` for up to
7 days after completion. After this period, we return an expired response.
[Document OCR Web App](../examples/doc_ocr_webapp.html) is an example that uses
this pattern.
Integration with web frameworks
-------------------------------
You can easily integrate the job queue pattern with web frameworks like FastAPI.
Here’s an example, assuming that you have already deployed `process_job` on
Modal with `modal deploy` as above. This example won’t work if you haven’t
deployed your app yet.
```
# my_job_queue_endpoint.py
import fastapi
import modal
image = modal.Image.debian_slim().pip_install("fastapi[standard]")
app = modal.App("fastapi-modal", image=image)
web_app = fastapi.FastAPI()
@app.function()
@modal.asgi_app()
def fastapi_app():
return web_app
@web_app.post("/submit")
async def submit_job_endpoint(data):
process_job = modal.Function.from_name("my-job-queue", "process_job")
call = process_job.spawn(data)
return {"call_id": call.object_id}
@web_app.get("/result/{call_id}")
async def get_job_result_endpoint(call_id: str):
function_call = modal.FunctionCall.from_id(call_id)
try:
result = function_call.get(timeout=0)
except modal.exception.OutputExpiredError:
return fastapi.responses.JSONResponse(content="", status_code=404)
except TimeoutError:
return fastapi.responses.JSONResponse(content="", status_code=202)
return result
```
In this example:
* The `/submit` endpoint accepts job data, submits a new job using `process_job.spawn()`, and returns the job’s ID to the client.
* The `/result/{call_id}` endpoint allows the client to poll for the job’s
result using the job ID. If the job hasn’t completed yet, it returns a 202
status code to indicate that the job is still being processed. If the job
has expired, it returns a 404 status code to indicate that the job is not found.
You can try this app by serving it with `modal serve`:
```
modal serve my_job_queue_endpoint.py
```
Then interact with its endpoints with `curl`:
```
# Make a POST request to your app endpoint with your data.
$ curl -X POST $YOUR_APP_ENDPOINT/submit?data=data
{"call_id":"fc-XXX"}
# Use the call_id value from above.
$ curl -X GET $YOUR_APP_ENDPOINT/result/fc-XXX
```
Scaling and reliability
-----------------------
Modal automatically scales the job queue based on the workload, spinning up new
instances as needed to process jobs concurrently. It also provides built-in
reliability features like automatic retries and timeout handling.
You can customize the behavior of the job queue by configuring the `@app.function()` decorator with options like [`retries`](retries.html#function-retries), [`timeout`](timeouts.html#timeouts), and [`max_containers`](scale.html#configuring-autoscaling-behavior).
Scaling out
===========
Modal makes it trivially easy to scale compute across thousands of containers.
You won’t have to worry about your App crashing if it goes viral or need to wait
a long time for your batch jobs to complete.
For the most part, scaling out will happen automatically, and you won’t need
to think about it. But it can be helpful to understand how Modal’s autoscaler
works and how to adjust its behavior when you need finer control.
How does autoscaling work on Modal?
-----------------------------------
Every Modal Function corresponds to an autoscaling pool of containers. The size
of the pool is managed by Modal’s autoscaler. The autoscaler will spin up new
containers when there is no capacity available for new inputs, and it will spin
down containers when resources are idling. By default, Modal Functions will
scale to zero when there are no inputs to process.
Autoscaling decisions are made quickly and frequently so that your batch jobs
can ramp up fast and your deployed Apps can respond to any sudden changes in
traffic.
Configuring autoscaling behavior
--------------------------------
Modal exposes a few settings that allow you to configure the autoscaler’s
behavior. These settings can be passed to the `@app.function` or `@app.cls` decorators:
* `max_containers`: The upper limit on containers for the specific Function.
* `min_containers`: The minimum number of containers that should be kept warm,
even when the Function is inactive.
* `buffer_containers`: The size of the buffer to maintain while the Function is
active, so that additional inputs will not need to queue for a new container.
* `scaledown_window`: The maximum duration (in seconds) that individual
containers can remain idle when scaling down.
In general, these settings allow you to trade off cost and latency. Maintaining
a larger warm pool or idle buffer will increase costs but reduce the chance that
inputs will need to wait for a new container to start.
Similarly, a longer scaledown window will let containers idle for longer, which
might help avoid unnecessary churn for Apps that receive regular but infrequent
inputs. Note that containers may not wait for the entire scaledown window before
shutting down if the App is substantially overprovisioned.
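For example, these settings can be combined on a single Function. A minimal sketch (the specific values are illustrative, not recommendations):
```
import modal

app = modal.App("autoscaling-sketch")  # illustrative App name

@app.function(
    max_containers=50,     # never run more than 50 containers
    min_containers=2,      # keep 2 containers warm even when idle
    buffer_containers=2,   # keep 2 extra containers ready while active
    scaledown_window=300,  # let idle containers linger up to 5 minutes
)
def handler(x):
    ...
```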
Parallel execution of inputs
----------------------------
If your code is running the same function repeatedly with different independent
inputs (e.g., a grid search), the easiest way to increase performance is to run
those function calls in parallel using Modal’s [`Function.map()`](../reference/modal.Function.html#map) method.
Here is an example, supposing we have a function `evaluate_model` that takes a single
argument:
```
import modal
app = modal.App()
@app.function()
def evaluate_model(x):
...
@app.local_entrypoint()
def main():
inputs = list(range(100))
for result in evaluate_model.map(inputs): # runs many inputs in parallel
...
```
In this example, `evaluate_model` will be called with each of the 100 inputs
(the numbers 0 - 99 in this case) roughly in parallel and the results are
returned as an iterable with the results ordered in the same way as the inputs.
### Exceptions
By default, if any of the function calls raises an exception, the exception will
be propagated. To treat exceptions as successful results and aggregate them in
the results list, pass in [`return_exceptions=True`](../reference/modal.Function.html#map).
```
@app.function()
def my_func(a):
if a == 2:
raise Exception("ohno")
return a ** 2
@app.local_entrypoint()
def main():
print(list(my_func.map(range(3), return_exceptions=True)))
# [0, 1, UserCodeException(Exception('ohno'))]
```
### Starmap
If your function takes multiple variable arguments, you can either use [`Function.map()`](../reference/modal.Function.html#map) with one input iterator
per argument, or [`Function.starmap()`](../reference/modal.Function.html#starmap) with a single input iterator containing sequences (like tuples) that can be
spread over the arguments. This works similarly to Python’s built in `map` and `itertools.starmap`.
```
@app.function()
def my_func(a, b):
return a + b
@app.local_entrypoint()
def main():
assert list(my_func.starmap([(1, 2), (3, 4)])) == [3, 7]
```
### Gotchas
Note that `.map()` is a method on the modal function object itself, so you don’t
explicitly *call* the function.
Incorrect usage:
```
results = evaluate_model(inputs).map()
```
Modal’s map is also not the same as using Python’s builtin `map()`. While the
following will technically work, it will execute all inputs in sequence rather
than in parallel.
Incorrect usage:
```
results = map(evaluate_model, inputs)
```
Asynchronous usage
------------------
All Modal APIs are available in both blocking and asynchronous variants. If you
are comfortable with asynchronous programming, you can use it to create
arbitrary parallel execution patterns, with the added benefit that any Modal
functions will be executed remotely. See the [async guide](async.html) or
the examples for more information about asynchronous usage.
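As a minimal sketch, the `evaluate_model` Function from above can be called through the `.aio` variants of its methods inside an async local entrypoint:
```
@app.local_entrypoint()
async def async_main():
    # Await a single remote call without blocking the event loop...
    result = await evaluate_model.remote.aio(42)
    # ...or map over many inputs and iterate the results asynchronously.
    async for result in evaluate_model.map.aio(range(100)):
        ...
```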
GPU acceleration
----------------
Sometimes you can speed up your applications by utilizing GPU acceleration. See
the [gpu section](gpu.html) for more information.
Scaling Limits
--------------
Modal enforces the following limits for every function:
* 2,000 pending inputs (inputs that haven’t been assigned to a container yet)
* 25,000 total inputs (which include both running and pending inputs)
For inputs created with `.spawn()` for async jobs, Modal allows up to 1 million pending inputs instead of 2,000.
If you try to create more inputs and exceed these limits, you’ll receive a `Resource Exhausted` error, and you should retry your request later. If you need higher limits, please reach out!
Additionally, each `.map()` invocation can process at most 1000 inputs concurrently.
Cluster networking
==================
i6pn (IPv6 private networking) is Modal’s private container-to-container networking solution. It allows users to create clusters of Modal containers which can send network traffic to each other with low latency and high bandwidth (≥ 50Gbps).
Normally, `modal.Function` containers can initiate outbound network connections to the internet but they are not directly addressable by other containers. i6pn-enabled containers, on the other hand, can be directly connected to by other i6pn-enabled containers and this is a key enabler of Modal’s preview `@modal.experimental.clustered` functionality.
You can enable i6pn on any `modal.Function`:
```
@app.function(i6pn=True)
def hello_private_network():
import socket
i6pn_addr = socket.getaddrinfo("i6pn.modal.local", None, socket.AF_INET6)[0][4][0]
print(i6pn_addr) # fdaa:5137:3ebf:a70:1b9d:3a11:71f2:5f0f
```
In this snippet we see that the i6pn-enabled container is able to retrieve its own IPv6 address by
resolving `i6pn.modal.local`. For this Function container to discover the addresses of *other* containers,
address sharing must be implemented using an auxiliary data structure, such as a shared `modal.Dict` or `modal.Queue`.
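For example, each container could publish its own address to a shared `modal.Dict` that its peers read from. A minimal sketch (the App name, Dict name, and worker IDs are illustrative):
```
import socket
import modal

app = modal.App("i6pn-discovery-sketch")  # illustrative App name
addresses = modal.Dict.from_name("i6pn-addresses", create_if_missing=True)  # illustrative Dict name

@app.function(i6pn=True)
def register(worker_id: str):
    # Resolve this container's own i6pn address and publish it for peers.
    addr = socket.getaddrinfo("i6pn.modal.local", None, socket.AF_INET6)[0][4][0]
    addresses.put(worker_id, addr)

@app.function(i6pn=True)
def connect_to(worker_id: str):
    # Another i6pn-enabled container (same workspace and region) can look
    # up the address and connect to it directly.
    peer_addr = addresses.get(worker_id)
    ...
```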
Private networking
------------------
All i6pn network traffic is *Workspace private*.
![i6pn-diagram](https://modal-cdn.com/cdnbot/i6pn-1eksk4vuy_c4c4a0df.webp)
In the image above, Workspace A has subnet `fdaa:1::/48`, while Workspace B has subnet `fdaa:2::/48`.
You’ll notice they share the first 16 bits. This is because the `fdaa::/16` prefix contains all of our private network IPv6 addresses, while each workspace is assigned a random 32-bit identifier when it is created. Together, these form the 48-bit subnet.
The upshot of this is that only containers in the same workspace can see each other and send each other network packets. i6pn networking is secure by default.
Region boundaries
-----------------
Modal operates a [global fleet](region-selection.html) and allows containers to run on multiple cloud providers and in many regions. i6pn networking, however, is region-scoped: only i6pn-enabled containers in the same region can communicate with each other.
Modal’s i6pn-enabled primitives such as `@modal.experimental.clustered` automatically restrict container geographic placement and cloud placement to ensure inter-container connectivity.
Public network access to cluster networking
-------------------------------------------
For cluster networked containers that need to be publicly accessible, you need to expose ports with [modal.Tunnel](tunnels.html) because i6pn addresses are not publicly exposed.
Consider having one container set up a Tunnel and act as the gateway to the private cluster network.
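Here is a minimal sketch of such a gateway, exposing a public port with `modal.forward` while its peers stay on the private i6pn network; the App name, port, and proxy logic are illustrative:
```
import modal

app = modal.App("i6pn-gateway-sketch")  # illustrative App name

@app.function(i6pn=True)
def gateway():
    # Expose port 8000 on this container to the public internet via a
    # Modal Tunnel; traffic can then be relayed to peers over i6pn.
    with modal.forward(8000) as tunnel:
        print("public URL:", tunnel.url)
        serve_proxy(port=8000)  # illustrative helper that relays traffic to i6pn peers
```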
Dynamic batching (beta)
=======================
Modal’s `@batched` feature allows you to accumulate requests
and process them in dynamically-sized batches, rather than one-by-one.
Batching increases throughput at a potential cost to latency.
Batched requests can share resources and reuse work, reducing the time and cost per request.
Batching is particularly useful for GPU-accelerated machine learning workloads,
as GPUs are designed to maximize throughput and are frequently bottlenecked on shareable resources,
like weights stored in memory.
Static batching can lead to unbounded latency, as the function waits for a fixed number of requests to arrive.
Modal’s dynamic batching waits for the lesser of a fixed time *or* a fixed number of requests before executing,
maximizing the throughput benefit of batching while minimizing the latency penalty.
Enable dynamic batching with `@batched`
---------------------------------------
To enable dynamic batching, apply the [`@modal.batched` decorator](../reference/modal.batched.html) to the target
Python function. Then, wrap it in `@app.function()` and run it on Modal,
and the inputs will be accumulated and processed in batches.
Here’s what that looks like:
```
import modal
app = modal.App()
@app.function()
@modal.batched(max_batch_size=2, wait_ms=1000)
async def batch_add(xs: list[int], ys: list[int]) -> list[int]:
return [x + y for x, y in zip(xs, ys)]
```
When you invoke a function decorated with `@batched`, you invoke it asynchronously on individual inputs, and each output is returned to the call that submitted the corresponding input.
For instance, the code below invokes the decorated `batch_add` function above three times, but `batch_add` only executes twice:
```
@app.local_entrypoint()
async def main():
inputs = [(1, 300), (2, 200), (3, 100)]
async for result in batch_add.starmap.aio(inputs):
print(f"Sum: {result}")
# Sum: 301
# Sum: 202
# Sum: 103
```
The first time, it executes with `xs` batched to `[1, 2]` and `ys` batched to `[300, 200]`. After about a one-second delay, it executes with `xs` batched to `[3]` and `ys` batched to `[100]`.
The result is an iterator that yields `301`, `202`, and `103`.
Use `@batched` with functions that take and return lists
--------------------------------------------------------
For a Python function to be compatible with `@modal.batched`, it must adhere to
the following rules:
* **The inputs to the function must be lists.** In the example above, we pass `xs` and `ys`, which are both lists of `int`s.
* **The function must return a list**. In the example above, the function returns
a list of sums.
* **The lengths of all the input lists and the output list must be the same.** In the example above, if `L == len(xs) == len(ys)`, then `L == len(batch_add(xs, ys))`.
Modal `Cls` methods are compatible with dynamic batching
--------------------------------------------------------
Methods on Modal [`Cls`](lifecycle-functions.html)es also support dynamic batching.
```
import modal
app = modal.App()
@app.cls()
class BatchedClass:
@modal.batched(max_batch_size=2, wait_ms=1000)
async def batch_add(self, xs: list[int], ys: list[int]) -> list[int]:
return [x + y for x, y in zip(xs, ys)]
```
One additional rule applies to classes with Batched Methods:
* If a class has a Batched Method, it **cannot have other Batched Methods or [Methods](../reference/modal.method.html#modalmethod)**.
Configure the wait time and batch size of dynamic batches
---------------------------------------------------------
The `@batched` decorator takes in two required configuration parameters:
* `max_batch_size` limits the number of inputs combined into a single batch.
* `wait_ms` limits the amount of time the Function waits for more inputs after
the first input is received.
The first invocation of the Batched Function initiates a new batch, and subsequent
calls add requests to this ongoing batch. If `max_batch_size` is reached,
the batch immediately executes. If the `max_batch_size` is not met but `wait_ms` has passed since the first request was added to the batch, the unfilled batch is
executed.
### Selecting a batch configuration
To optimize the batching configurations for your application, consider the following heuristics:
* Set `max_batch_size` to the largest value your function can handle, so you
can amortize and parallelize as much work as possible.
* Set `wait_ms` to the difference between your targeted latency and the execution time. Most applications
have a targeted latency, and this allows the latency of any request to stay
within that limit.
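For instance, if your application targets a 500 ms end-to-end latency and a full batch executes in roughly 150 ms, that leaves about 350 ms for accumulating requests. A minimal sketch (the numbers and the embedding workload are illustrative):
```
@app.function()
@modal.batched(max_batch_size=64, wait_ms=350)  # ~500 ms target minus ~150 ms execution
async def embed(texts: list[str]) -> list[list[float]]:
    # Illustrative batched workload: embed up to 64 texts per call.
    return [embed_one(text) for text in texts]  # embed_one is an illustrative helper
```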
Serve web endpoints with dynamic batching
-----------------------------------------
Here’s a simple example of serving a Function that batches requests dynamically
with a [`@modal.fastapi_endpoint`](webhooks.html). Run [`modal serve`](../reference/cli/serve.html), submit requests to the endpoint,
and the Function will batch your requests on the fly.
```
import modal
app = modal.App(image=modal.Image.debian_slim().pip_install("fastapi"))
@app.function()
@modal.batched(max_batch_size=2, wait_ms=1000)
async def batch_add(xs: list[int], ys: list[int]) -> list[int]:
return [x + y for x, y in zip(xs, ys)]
@app.function()
@modal.fastapi_endpoint(method="POST", docs=True)
async def add(body: dict[str, int]) -> dict[str, int]:
result = await batch_add.remote.aio(body["x"], body["y"])
return {"result": result}
```
Now, you can submit requests to the web endpoint and process them in batches. For instance, the three requests
in the following example, which might be requests from concurrent clients in a real deployment,
will be batched into two executions:
```
import asyncio
import aiohttp
async def send_post_request(session, url, data):
async with session.post(url, json=data) as response:
return await response.json()
async def main():
# Enter the URL of your web endpoint here
url = "https://workspace--app-name-endpoint-name.modal.run"
async with aiohttp.ClientSession() as session:
# Submit three requests asynchronously
tasks = [
send_post_request(session, url, {"x": 1, "y": 300}),
send_post_request(session, url, {"x": 2, "y": 200}),
send_post_request(session, url, {"x": 3, "y": 100}),
]
results = await asyncio.gather(*tasks)
for result in results:
print(f"Sum: {result['result']}")
asyncio.run(main())
```
Project structure
=================
Apps spanning multiple files
----------------------------
When your project spans multiple files, more care is required to package the
full structure for running or deploying on Modal.
There are two main considerations: (1) ensuring that all of your Functions get
registered to the App, and (2) ensuring that any local dependencies get included
in the Modal container.
Say that you have a simple project that’s distributed across three files:
```
src/
├── app.py # Defines the `modal.App` as a variable named `app`
├── llm.py # Imports `app` and decorates some functions
└── web.py # Imports `app` and decorates other functions
```
With this structure, if you deploy using `modal deploy src/app.py`, Modal won’t
discover the Functions defined in the other two modules, because they never get
imported.
If you instead run `modal deploy src/llm.py`, Modal will deploy the App with
just the Functions defined in that module.
One option would be to ensure that one module in the project transitively
imports all of the other modules and to point the `modal deploy` CLI at it, but
this approach can lead to an awkward project structure.
### Defining your project as a Python package
A better approach would be to define your project as a Python *package* and to
use the Modal CLI’s “module mode” invocation pattern.
In Python, a package is a directory containing an `__init__.py` file (and
usually some other Python modules). If you have a `src/__init__.py` that
imports all of the member modules, it will ensure that any decorated Functions
contained within them get registered to the App:
```
# Contents of __init__.py
from . import app
from . import llm
from . import web
```
*Important: use relative imports (`from . import app`) between member modules.*
Unfortunately, it’s not enough just to set this up and make your deploy command `modal deploy src/app.py`. Instead, you need to invoke Modal in *module mode*: `modal deploy -m src.app`. Note the use of the `-m` flag and the module path
(`src.app` instead of `src/app.py`). Akin to `python -m ...`, this incantation
treats the target as a package rather than just a single script.
### App composition
As your project grows in scope, it may become helpful to organize it into
multiple component Apps, rather than having the project defined as one large
monolith. That way, as you iterate during development, you can target a specific
component, which will build faster and avoid any conflicts with concurrent work
on other parts of the project.
Projects set up this way can still be deployed as one unit by using `App.include`.
Say our project from above defines separate Apps in `llm.py` and `web.py` and then
adds a new `deploy.py` file:
```
# Contents of deploy.py
import modal
from .llm import llm_app
from .web import web_app
app = modal.App("full-app").include(llm_app).include(web_app)
```
This lets you run `modal deploy -m src.deploy` to package everything in one
step.
**Note:** Since the multi-file app still has a single namespace for all
functions, it’s important to name your Modal functions uniquely across the
project even when splitting it up across files: otherwise you risk some
functions “shadowing” others with the same name.
Including local dependencies
----------------------------
Another factor to consider is whether Modal will package all of the local
dependencies that your App requires.
Even if your Modal App itself can be contained to a single file, any local
modules that file imports (like, say, a `helpers.py`) also need to be available
in the Modal container.
By default, Modal will automatically include the module or package where a
Function is defined in all containers that run that Function. So if the project
is set up as a package and the helper modules are part of that package, you
should be all set. If you’re not using a package setup, or if the local
dependencies are external to your project’s package, you’ll need to explicitly
include them in the Image, i.e. with `modal.Image.add_local_python_source`.
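For example, a standalone `helpers.py` that sits next to your App file can be bundled explicitly. A minimal sketch (the module and helper function are illustrative):
```
import modal

# Explicitly bundle the local helpers.py module into the container image.
image = modal.Image.debian_slim().add_local_python_source("helpers")

app = modal.App("local-deps-sketch", image=image)  # illustrative App name

@app.function()
def uses_helpers():
    import helpers  # importable inside the container because it was added above
    return helpers.do_work()  # illustrative helper function
```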
**Note:** This behavior changed in Modal 1.0. Previously, Modal would
“automount” any local dependencies that were imported by your App source into a
container. This was changed to be more selective to avoid unnecessary inclusion
of large local packages.
Timeouts
========
All Modal [Function](../reference/modal.Function.html) executions have a default
execution timeout of 300 seconds (5 minutes), but users may specify timeout
durations between 10 seconds and 24 hours.
```
import time
@app.function()
def f():
time.sleep(599) # Timeout!
@app.function(timeout=600)
def g():
time.sleep(599)
print("*Just* made it!")
```
The timeout duration is a measure of a Function’s *execution* time. It does not
include scheduling time or any other period besides the time your code is
executing in Modal. This duration is also per execution attempt, meaning
Functions configured with [`modal.Retries`](../reference/modal.Retries.html) will
start new execution timeouts on each retry. For example, an infinite-looping
Function with a 100 second timeout and 3 allowed retries will run for at least 400
seconds within Modal.
Handling timeouts
-----------------
After exhausting any specified retries, a timeout in a Function will produce a `modal.exception.FunctionTimeoutError` which you may catch in your code.
```
import modal.exception
@app.function(timeout=100)
def f():
time.sleep(200) # Timeout!
@app.local_entrypoint()
def main():
try:
f.remote()
except modal.exception.FunctionTimeoutError:
... # Handle the timeout.
```
Timeout accuracy
----------------
Functions will run for *at least* as long as their timeout allows, but they may
run a handful of seconds longer. If you require accurate and precise timeout
durations on your Function executions, it is recommended that you implement
timeout logic in your user code.
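For example, you can enforce your own stricter deadline inside the Function body. A minimal sketch (the budget and per-item work are illustrative):
```
import time

@app.function(timeout=120)
def process_with_deadline(items: list):
    deadline = time.monotonic() + 60  # our own, stricter 60-second budget
    results = []
    for item in items:
        if time.monotonic() > deadline:
            break  # stop cleanly before Modal's coarser timeout fires
        results.append(process(item))  # illustrative per-item work
    return results
```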
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/09_job_queues/doc_ocr_webapp.py)
Serve a document OCR web app
============================
This tutorial shows you how to use Modal to deploy a fully serverless [React](https://reactjs.org/) + [FastAPI](https://fastapi.tiangolo.com/) application.
We’re going to build a simple “Receipt Parser” web app that submits OCR transcription
tasks to a separate Modal app defined in [another example](doc_ocr_jobs.html),
polls until the task is completed, and displays
the results. Try it out for yourself [here](https://modal-labs-examples--example-doc-ocr-webapp-wrapper.modal.run/).
[![Webapp frontend](https://modal-cdn.com/doc_ocr_frontend.jpg)](https://modal-labs-examples--example-doc-ocr-webapp-wrapper.modal.run/)
Basic setup
-----------
Let’s get the imports out of the way and define an [`App`](../reference/modal.App.html).
```
from pathlib import Path
import fastapi
import fastapi.staticfiles
import modal
app = modal.App("example-doc-ocr-webapp")
```
Modal works with any [ASGI](../guide/webhooks.html#serving-asgi-and-wsgi-apps) or [WSGI](../guide/webhooks.html#wsgi) web framework. Here, we choose to use [FastAPI](https://fastapi.tiangolo.com/).
```
web_app = fastapi.FastAPI()
```
Define endpoints
----------------
We need two endpoints: one to accept an image and submit it to the Modal job queue,
and another to poll for the results of the job.
In `parse`, we’re going to submit tasks to the function defined in the [Job Queue tutorial](doc_ocr_jobs.html), so we first look it up using `modal.Function.from_name`.
We call [`.spawn()`](../reference/modal.Function.html#spawn) on the function handle
we imported above to kick off our function without blocking on the results. `spawn` returns
a unique ID for the function call, which we then use
to poll for its result.
```
@web_app.post("/parse")
async def parse(request: fastapi.Request):
parse_receipt = modal.Function.from_name("example-doc-ocr-jobs", "parse_receipt")
form = await request.form()
receipt = await form["receipt"].read() # type: ignore
call = parse_receipt.spawn(receipt)
return {"call_id": call.object_id}
```
`/result` uses the provided `call_id` to instantiate a `modal.FunctionCall` object, and attempt
to get its result. If the call hasn’t finished yet, we return a `202` status code, which indicates
that the server is still working on the job.
```
@web_app.get("/result/{call_id}")
async def poll_results(call_id: str):
function_call = modal.functions.FunctionCall.from_id(call_id)
try:
result = function_call.get(timeout=0)
except TimeoutError:
return fastapi.responses.JSONResponse(content="", status_code=202)
return result
```
Now that we’ve defined our endpoints, we’re ready to host them on Modal.
First, we specify our dependencies — here, a basic Debian Linux
environment with FastAPI installed.
```
image = modal.Image.debian_slim(python_version="3.12").pip_install(
"fastapi[standard]==0.115.4"
)
```
Then, we add the static files for our front-end. We’ve made [a simple React
app](https://github.com/modal-labs/modal-examples/tree/main/09_job_queues/doc_ocr_frontend) that hits the two endpoints defined above. To package these files with our app, we use `add_local_dir` with the local directory of the assets, and specify that we want them
in the `/assets` directory inside our container (the `remote_path`). Then, we instruct FastAPI to [serve
this static file directory](https://fastapi.tiangolo.com/tutorial/static-files/) at our root path.
```
local_assets_path = Path(__file__).parent / "doc_ocr_frontend"
image = image.add_local_dir(local_assets_path, remote_path="/assets")
@app.function(image=image)
@modal.asgi_app()
def wrapper():
web_app.mount("/", fastapi.staticfiles.StaticFiles(directory="/assets", html=True))
return web_app
```
Running
-------
While developing, you can run this as an ephemeral app by executing the command
```
modal serve doc_ocr_webapp.py
```
Modal watches all the mounted files and updates the app if anything changes.
See [these docs](../guide/webhooks.html#developing-with-modal-serve) for more details.
Deploy
------
To deploy your application, run
```
modal deploy doc_ocr_webapp.py
```
That’s all!
If successful, this will print a URL for your app that you can navigate to in
your browser 🎉 .
[![Webapp frontend](https://modal-cdn.com/doc_ocr_frontend.jpg)](https://modal-labs-examples--example-doc-ocr-webapp-wrapper.modal.run/)
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/llm-serving/sgl_vlm.py)
Run Qwen2-VL on SGLang for Visual QA
====================================
Vision-Language Models (VLMs) are like LLMs with eyes:
they can generate text based not just on other text,
but on images as well.
This example shows how to run a VLM on Modal using the [SGLang](https://github.com/sgl-project/sglang) library.
Here’s a sample inference, with the image rendered directly (and at low resolution) in the terminal:
![Sample output answering a question about a photo of the Statue of Liberty](https://modal-public-assets.s3.amazonaws.com/sgl_vlm_qa_sol.png)
Setup
-----
First, we’ll import the libraries we need locally
and define some constants.
```
import os
import time
import warnings
from typing import Optional
from uuid import uuid4
import modal
```
VLMs are generally larger than LLMs with the same cognitive capability.
LLMs are already hard to run effectively on CPUs, so we’ll use a GPU here.
We find that inference for a single input takes about 3-4 seconds on an A10G.
You can customize the GPU type and count using the `GPU_TYPE` and `GPU_COUNT` environment variables.
If you want to see the model really rip, try an `"a100-80gb"` or an `"h100"` on a large batch.
```
GPU_TYPE = os.environ.get("GPU_TYPE", "l40s")
GPU_COUNT = os.environ.get("GPU_COUNT", 1)
GPU_CONFIG = f"{GPU_TYPE}:{GPU_COUNT}"
SGL_LOG_LEVEL = "error" # try "debug" or "info" if you have issues
MINUTES = 60 # seconds
```
We use the [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) model by Alibaba.
```
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
MODEL_REVISION = "a7a06a1cc11b4514ce9edcde0e3ca1d16e5ff2fc"
TOKENIZER_PATH = "Qwen/Qwen2-VL-7B-Instruct"
MODEL_CHAT_TEMPLATE = "qwen2-vl"
```
We download it from the Hugging Face Hub using the Python function below.
```
def download_model_to_image():
import transformers
from huggingface_hub import snapshot_download
snapshot_download(
MODEL_PATH,
revision=MODEL_REVISION,
ignore_patterns=["*.pt", "*.bin"],
)
# otherwise, this happens on first inference
transformers.utils.move_cache()
```
Modal runs Python functions on containers in the cloud.
The environment those functions run in is defined by the container’s `Image`.
The block of code below defines our example’s `Image`.
```
cuda_version = "12.8.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
vlm_image = (
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11")
.pip_install( # add sglang and some Python dependencies
"transformers==4.47.1",
"numpy<2",
"fastapi[standard]==0.115.4",
"pydantic==2.9.2",
"requests==2.32.3",
"starlette==0.41.2",
"torch==2.4.0",
"sglang[all]==0.4.1",
"sgl-kernel==0.1.0",
# as per sglang website: https://sgl-project.github.io/start/install.html
extra_options="--find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/",
)
.run_function( # download the model by running a Python function
download_model_to_image
)
.pip_install( # add an optional extra that renders images in the terminal
"term-image==0.7.1"
)
)
```
Defining a Visual QA service
----------------------------
Running an inference service on Modal is as easy as writing inference in Python.
The code below adds a modal `Cls` to an `App` that runs the VLM.
We define a method `generate` that takes a URL for an image and a question
about the image as inputs and returns the VLM’s answer.
By decorating it with `@modal.fastapi_endpoint`, we expose it as an HTTP endpoint,
so it can be accessed over the public Internet from any client.
```
app = modal.App("example-sgl-vlm")
@app.cls(
gpu=GPU_CONFIG,
timeout=20 * MINUTES,
scaledown_window=20 * MINUTES,
image=vlm_image,
)
@modal.concurrent(max_inputs=100)
class Model:
@modal.enter() # what should a container do after it starts but before it gets input?
def start_runtime(self):
"""Starts an SGL runtime to execute inference."""
import sglang as sgl
self.runtime = sgl.Runtime(
model_path=MODEL_PATH,
tokenizer_path=TOKENIZER_PATH,
tp_size=GPU_COUNT, # tensor parallel size: number of GPUs to split the model over
log_level=SGL_LOG_LEVEL,
)
self.runtime.endpoint.chat_template = sgl.lang.chat_template.get_chat_template(
MODEL_CHAT_TEMPLATE
)
sgl.set_default_backend(self.runtime)
@modal.fastapi_endpoint(method="POST", docs=True)
def generate(self, request: dict) -> str:
from pathlib import Path
import requests
import sglang as sgl
from term_image.image import from_file
start = time.monotonic_ns()
request_id = uuid4()
print(f"Generating response to request {request_id}")
image_url = request.get("image_url")
if image_url is None:
image_url = (
"https://modal-public-assets.s3.amazonaws.com/golden-gate-bridge.jpg"
)
response = requests.get(image_url)
response.raise_for_status()
image_filename = image_url.split("/")[-1]
image_path = Path(f"/tmp/{uuid4()}-{image_filename}")
image_path.write_bytes(response.content)
@sgl.function
def image_qa(s, image_path, question):
s += sgl.user(sgl.image(str(image_path)) + question)
s += sgl.assistant(sgl.gen("answer"))
question = request.get("question")
if question is None:
question = "What is this?"
state = image_qa.run(
image_path=image_path, question=question, max_new_tokens=128
)
# show the question and image in the terminal for demonstration purposes
print(Colors.BOLD, Colors.GRAY, "Question: ", question, Colors.END, sep="")
terminal_image = from_file(image_path)
terminal_image.draw()
print(
f"request {request_id} completed in {round((time.monotonic_ns() - start) / 1e9, 2)} seconds"
)
return state["answer"]
@modal.exit() # what should a container do before it shuts down?
def shutdown_runtime(self):
self.runtime.shutdown()
```
Asking questions about images via POST
--------------------------------------
Now, we can send this Modal Function a POST request with an image and a question
and get back an answer.
The code below will start up the inference service
so that it can be run from the terminal as a one-off,
like a local script would be, using `modal run`:
```
modal run sgl_vlm.py
```
By default, we hit the endpoint twice to demonstrate how much faster
the inference is once the server is running.
```
@app.local_entrypoint()
def main(
image_url: Optional[str] = None, question: Optional[str] = None, twice: bool = True
):
import json
import urllib.request
model = Model()
payload = json.dumps(
{
"image_url": image_url,
"question": question,
},
)
req = urllib.request.Request(
model.generate.get_web_url(),
data=payload.encode("utf-8"),
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req) as response:
assert response.getcode() == 200, response.getcode()
print(json.loads(response.read().decode()))
if twice:
# second response is faster, because the Function is already running
with urllib.request.urlopen(req) as response:
assert response.getcode() == 200, response.getcode()
print(json.loads(response.read().decode()))
```
Deployment
----------
To set this up as a long-running, but serverless, service, we can deploy it to Modal:
```
modal deploy sgl_vlm.py
```
And then send requests from anywhere. See the [docs](../guide/webhook-urls.html) for details on the `web_url` of the function, which also appears in the terminal output
when running `modal deploy`.
You can also find interactive documentation for the endpoint at the `/docs` route of the web endpoint URL.
Addenda
-------
The rest of the code in this example is just utility code.
```
warnings.filterwarnings( # filter warning from the terminal image library
"ignore",
message="It seems this process is not running within a terminal. Hence, some features will behave differently or be disabled.",
category=UserWarning,
)
class Colors:
"""ANSI color codes"""
GREEN = "\033[0;32m"
BLUE = "\033[0;34m"
GRAY = "\033[0;90m"
BOLD = "\033[1m"
END = "\033[0m"
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/07_web_endpoints/webrtc/webrtc_yolo.py)
Real-time object detection with WebRTC and YOLO
===============================================
This example demonstrates how to architect a serverless real-time streaming application with Modal and WebRTC.
The sample application detects objects in webcam video with YOLO.
See the clip below from a live demo of this example in a course by [Kwindla Kramer](https://machine-theory.com/), WebRTC OG and co-founder of [Daily](https://www.daily.co/).
[Demo clip](https://modal-cdn.com/example-webrtc_yolo.mp4)
You can also try our deployment [here](https://modal-labs-examples--example-webrtc-yolo-webcamobjdet-web.modal.run).
What is WebRTC?
---------------
WebRTC (Web Real-Time Communication) is an [IETF Internet protocol](https://www.rfc-editor.org/rfc/rfc8825) and a [W3C API specification](https://www.w3.org/TR/webrtc/) for real-time media streaming between peers
over internets or the World Wide Web.
What makes it so effective and different from other bidirectional web-based communication protocols (e.g. WebSockets) is that it’s purpose-built for media streaming in real time.
It’s primarily designed for browser applications using the JavaScript API, but [APIs exist for other languages](https://www.webrtc-developers.com/did-i-choose-the-right-webrtc-stack/).
We’ll build our app using Python’s [`aiortc`](https://aiortc.readthedocs.io/en/latest/) package.
### What makes up a WebRTC application?
A simple WebRTC app generally consists of three players:
1. a peer that initiates the connection,
2. a peer that responds to the connection, and
3. a server that passes some initial messages between the two peers.
First, one peer initiates the connection by offering up a description of itself - its media sources, codec capabilities, Internet Protocol (IP) addressing info, etc - which is relayed to another peer through the server.
The other peer then either accepts the offer by providing a compatible description of its own capabilities or rejects it if no compatible configuration is possible.
This process is called “signaling” or sometimes the “negotiation” in the WebRTC world, and the server that mediates it is usually called the “signaling server”.
Once the peers have agreed on a configuration there’s a brief pause to establish communication… and then you’re live.
![Basic WebRTC architecture](https://modal-cdn.com/cdnbot/just_webrtc-1oic3iems_a4a8e77c.webp)
A basic WebRTC app architecture
Obviously there’s more going on under the hood.
If you want to get into the details, we recommend checking out the [RFCs](https://www.rfc-editor.org/rfc/rfc8825) or a [more-thorough explainer](https://webrtcforthecurious.com/).
In this document, we’ll focus on how to architect a WebRTC application where one or more peer is running on Modal’s serverless cloud infrastructure.
If you just want to quickly get started with WebRTC for a small internal service or a hack project, check out [our FastRTC example](fastrtc_flip_webcam.html) instead.
How do I run a WebRTC app on Modal?
-----------------------------------
Modal turns Python code into scalable cloud services.
When you call a Modal Function, you get one replica.
If you call it 999 more times before it returns, you have 1000 replicas.
When your Functions all return, you spin down to 0 replicas.
The core constraints of the Modal programming model that make this possible are that Function Calls are stateless and self-contained.
In other words, correctly-written Modal Functions don’t store information in memory between runs (though they might cache data to the ephemeral local disk for efficiency) and they don’t create processes or tasks which must continue to run after the Function Call returns in order for the application to be correct.
WebRTC apps, on the other hand, require passing messages back and forth in a multi-step protocol, and APIs spawn several “agents” (no, AI is not involved, just processes) which do work behind the scenes - including managing the peer-to-peer (P2P) connection itself.
This means that streaming may have only just begun when the application logic in our Function has finished.
![Modal programming model and WebRTC signaling](https://modal-cdn.com/cdnbot/flow_comparisong6iibzq3_638bdd84.webp)
Modal's stateless programming model (left) and WebRTC's stateful signaling (right)
To ensure we properly leverage Modal’s autoscaling and concurrency features, we need to align the signaling and streaming lifetimes with Modal Function Call lifetimes.
The architecture we recommend for this appears below.
![WebRTC on Modal](https://modal-cdn.com/cdnbot/webrtc_with_modal-2horb680q_eab69b28.webp)
A clean architecture for WebRTC on Modal
It handles passing messages between the client peer and the signaling server using a [WebSocket](../guide/webhooks.html#websockets) for persistent, bidirectional communication over the Web within a single Function Call.
(Modal’s Web layer maps HTTP and WS onto Function Calls, details [here](https://modal.com/blog/serverless-http)).
We [`.spawn`](../reference/modal.Function.html#spawn) the cloud peer inside the WebSocket endpoint
and communicate with it using a [`modal.Queue`](../reference/modal.Queue.html).
We can then use the state of the P2P connection to determine when to return from the calls to both the signaling server and the cloud peer.
When the P2P connection has been *established*, we’ll close the WebSocket which in turn ends the call to the signaling server.
And when the P2P connection has been *closed*, we’ll return from the call to the cloud peer.
That way, our WebRTC application benefits from all the autoscaling and concurrency logic built into Modal
that enables users to deliver efficient cloud applications.
We wrote two classes, `ModalWebRtcPeer` and `ModalWebRtcSignalingServer`, to abstract away that boilerplate as well as a lot of the `aiortc` implementation details.
They’re also decorated with Modal [lifetime hooks](../guide/lifecycle-functions.html).
Add the [`app.cls`](../reference/modal.App.html#cls) decorator and some custom logic, and you’re ready to deploy on Modal.
You can find them in the [`modal_webrtc.py` file](https://github.com/modal-labs/modal-examples/blob/main/07_web_endpoints/webrtc/modal_webrtc.py) provided alongside this example in the [GitHub repo](https://github.com/modal-labs/modal-examples/tree/main/07_web_endpoints/webrtc/modal_webrtc.py).
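To make this pattern concrete, here is a minimal, hedged sketch of the architecture described above: a WebSocket signaling endpoint that spawns a cloud peer and relays the offer/answer exchange over a `modal.Queue`. The names, message shapes, and queue partitions are illustrative, the aiortc logic is elided, and this is not the actual `modal_webrtc` implementation.
```
# Illustrative sketch only; the real abstraction lives in modal_webrtc.py.
import modal

sketch_app = modal.App("webrtc-signaling-sketch")
sketch_image = modal.Image.debian_slim().pip_install("fastapi[standard]")


@sketch_app.function(image=sketch_image)
async def run_cloud_peer(peer_id: str, q: modal.Queue):
    # Consume the client's offer and produce an answer (aiortc logic elided).
    # A real peer would then stay alive until the P2P connection closes.
    offer = await q.get.aio(partition=f"{peer_id}-to-peer")  # noqa: F841
    answer = {"type": "answer", "sdp": "..."}  # placeholder
    await q.put.aio(answer, partition=f"{peer_id}-from-peer")


@sketch_app.function(image=sketch_image)
@modal.asgi_app()
def signaling_server():
    from fastapi import FastAPI, WebSocket

    web_app = FastAPI()

    @web_app.websocket("/ws/{peer_id}")
    async def ws(websocket: WebSocket, peer_id: str):
        await websocket.accept()
        with modal.Queue.ephemeral() as q:
            # spawn the cloud peer; it keeps running after this call returns
            await run_cloud_peer.spawn.aio(peer_id, q)
            # relay the browser's offer to the peer and the peer's answer back
            offer = await websocket.receive_json()
            await q.put.aio(offer, partition=f"{peer_id}-to-peer")
            answer = await q.get.aio(partition=f"{peer_id}-from-peer")
            await websocket.send_json(answer)
        # once signaling is done, closing the WebSocket ends this Function Call;
        # media then flows peer-to-peer (or via TURN), not through Modal
        await websocket.close()

    return web_app
```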
Using `modal_webrtc` to detect objects in webcam footage
--------------------------------------------------------
For our WebRTC app, we’ll take a client’s video stream, run a [YOLO](https://docs.ultralytics.com/tasks/detect/) object detector on it with an A100 GPU on Modal, and then stream the annotated video back to the client.
With this setup, we can achieve inference times of 2-4 milliseconds per frame and round-trip times below the video frame interval (usually around 30 milliseconds per frame).
Let’s get started!
### Setup
We’ll start with a simple container [Image](../guide/images.html) and then
* set it up to properly use TensorRT and the ONNX Runtime, which keep latency minimal,
* install the necessary libs for processing video, `opencv` and `ffmpeg`, and
* install the necessary Python packages.
```
import os
from pathlib import Path
import modal
from .modal_webrtc import ModalWebRtcPeer, ModalWebRtcSignalingServer
py_version = "3.12"
tensorrt_ld_path = f"/usr/local/lib/python{py_version}/site-packages/tensorrt_libs"
video_processing_image = (
modal.Image.debian_slim(python_version=py_version) # matching ld path
# update locale as required by onnx
.apt_install("locales")
.run_commands(
"sed -i '/^#\\s*en_US.UTF-8 UTF-8/ s/^#//' /etc/locale.gen", # use sed to uncomment
"locale-gen en_US.UTF-8", # set locale
"update-locale LANG=en_US.UTF-8",
)
.env({"LD_LIBRARY_PATH": tensorrt_ld_path, "LANG": "en_US.UTF-8"})
# install system dependencies
.apt_install("python3-opencv", "ffmpeg")
# install Python dependencies
.pip_install(
"aiortc==1.11.0",
"fastapi==0.115.12",
"huggingface-hub[hf_xet]==0.30.2",
"onnxruntime-gpu==1.21.0",
"opencv-python==4.11.0.86",
"tensorrt==10.9.0.34",
"torch==2.7.0",
"shortuuid==1.0.13",
)
)
```
### Cache weights and compute graphs on a Volume
We also need to create a Modal [Volume](../guide/volumes.html) to store things we need across replicas —
primarily the model weights and ONNX inference graph, but also a few other artifacts like a video file where
we’ll write out the processed video stream for testing.
The very first time we run the app, downloading the model and building the ONNX inference graph will take a few minutes.
After that, we can load the cached weights and graph from the Volume, which reduces the startup time to about 15 seconds per container.
```
CACHE_VOLUME = modal.Volume.from_name("webrtc-yolo-cache", create_if_missing=True)
CACHE_PATH = Path("/cache")
cache = {CACHE_PATH: CACHE_VOLUME}
app = modal.App("example-webrtc-yolo")
```
### Implement YOLO object detection as a `ModalWebRtcPeer`
Our application needs to process an incoming video track with YOLO and return an annotated video track to the source peer.
To implement a `ModalWebRtcPeer`, we need to:
* Decorate our subclass with `@app.cls`. We provision it with an A100 GPU and a [Secret](../guide/secrets.html) credential, described below.
* Implement the method `setup_streams`. This is where we’ll use `aiortc` to add the logic for processing the incoming video track with YOLO
and returning an annotated video track to the source peer.
`ModalWebRtcPeer` has a few other methods that users can optionally implement:
* `initialize()`: This contains any custom initialization logic, called when `@modal.enter()` is called.
* `run_streams()`: Logic for starting streams. This is necessary when the peer is the source of the stream.
This is where you’d ensure a webcam was running, start playing a video file, or spin up a [video generative model](image_to_video.html).
* `get_turn_servers()`: We haven’t talked about [TURN servers](https://datatracker.ietf.org/doc/html/rfc5766),
but just know that they’re necessary if you want to use WebRTC across complex (e.g. carrier-grade) NAT or firewall configurations.
Free services have tight limits because TURN servers are expensive to run (lots of bandwidth and state management required). [STUN](https://datatracker.ietf.org/doc/html/rfc5389) servers, on the other hand, are essentially just echo servers, and so there are many free services available.
If you don’t provide TURN servers you can still serve your app on many networks using any of a number of free STUN servers for NAT traversal.
* `exit()`: This contains any custom cleanup logic, called when `@modal.exit()` is called.
In our case, we load the YOLO model in `initialize` and provide server information for the free [Open Relay TURN server](https://www.metered.ca/tools/openrelay/).
If you want to use it, you’ll need to create an account [here](https://dashboard.metered.ca/login?tool=turnserver) and then create a Modal [Secret](../guide/secrets.html) called `turn-credentials` [here](https://modal.com/secrets).
We also use the `@modal.concurrent` decorator to allow multiple instances of our peer to run on one GPU.
**Setting the Region**
Much of the latency in Internet applications comes from distance between communicating parties —
the Internet operates within a factor of two of the speed of light, but that’s just not that fast.
To minimize latency under this constraint, the physical distance of the P2P connection
between the webcam-using peer and the GPU container needs to be kept as short as possible.
We’ll use the `region` parameter of the `cls` decorator to set the region of the GPU container.
You should set this to the closest region to your users.
See the [region selection](../guide/region-selection.html) guide for more information.
```
@app.cls(
image=video_processing_image,
gpu="A100-40GB",
volumes=cache,
secrets=[modal.Secret.from_name("turn-credentials")],
region="us-east", # set to your region
)
@modal.concurrent(
target_inputs=2, # try to stick to just two peers per GPU container
max_inputs=3, # but allow up to three
)
class ObjDet(ModalWebRtcPeer):
async def initialize(self):
self.yolo_model = get_yolo_model(CACHE_PATH)
async def setup_streams(self, peer_id: str):
from aiortc import MediaStreamTrack
# keep us notified on connection state changes
@self.pcs[peer_id].on("connectionstatechange")
async def on_connectionstatechange() -> None:
if self.pcs[peer_id]:
print(
f"Video Processor, {self.id}, connection state to {peer_id}: {self.pcs[peer_id].connectionState}"
)
# when we receive a track from the source peer
# we create a processed track and add it to our stream
# back to the source peer
@self.pcs[peer_id].on("track")
def on_track(track: MediaStreamTrack) -> None:
print(
f"Video Processor, {self.id}, received {track.kind} track from {peer_id}"
)
output_track = get_yolo_track(track, self.yolo_model) # see Addenda
self.pcs[peer_id].addTrack(output_track)
# keep us notified when the incoming track ends
@track.on("ended")
async def on_ended() -> None:
print(
f"Video Processor, {self.id}, incoming video track from {peer_id} ended"
)
async def get_turn_servers(self, peer_id=None, msg=None) -> dict:
creds = {
"username": os.environ["TURN_USERNAME"],
"credential": os.environ["TURN_CREDENTIAL"],
}
turn_servers = [
{"urls": "stun:stun.relay.metered.ca:80"}, # STUN is free, no creds neeeded
# for TURN, sign up for the free service here: https://www.metered.ca/tools/openrelay/
{"urls": "turn:standard.relay.metered.ca:80"} | creds,
{"urls": "turn:standard.relay.metered.ca:80?transport=tcp"} | creds,
{"urls": "turn:standard.relay.metered.ca:443"} | creds,
{"urls": "turns:standard.relay.metered.ca:443?transport=tcp"} | creds,
]
return {"type": "turn_servers", "ice_servers": turn_servers}
```
### Implement a `SignalingServer`
The `ModalWebRtcSignalingServer` class is much simpler to implement.
The main thing we need to do is implement the `get_modal_peer_class` method which will return our implementation of the `ModalWebRtcPeer` class, `ObjDet`.
It also has an `initialize()` method we can optionally override (called at the beginning of the [container lifecycle](https://modal.com/docs/guides/lifecycle-functions))
as well as a `web_app` property which will be [served by Modal](../guide/webhooks.html#asgi-apps---fastapi-fasthtml-starlette).
We’ll use these to add a frontend which uses the WebRTC JavaScript API to stream a peer’s webcam from the browser.
The JavaScript and HTML files are alongside this example in the [GitHub repo](https://github.com/modal-labs/modal-examples/tree/main/07_web_endpoints/webrtc/frontend).
```
base_image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("python3-opencv", "ffmpeg")
.pip_install(
"fastapi[standard]==0.115.4",
"aiortc==1.11.0",
"opencv-python==4.11.0.86",
"shortuuid==1.0.13",
)
)
this_directory = Path(__file__).parent.resolve()
server_image = base_image.add_local_dir(
this_directory / "frontend", remote_path="/frontend"
)
@app.cls(image=server_image)
class WebcamObjDet(ModalWebRtcSignalingServer):
def get_modal_peer_class(self):
return ObjDet
def initialize(self):
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
self.web_app.mount("/static", StaticFiles(directory="/frontend"))
@self.web_app.get("/")
async def root():
html = open("/frontend/index.html").read()
return HTMLResponse(content=html)
```
Addenda
-------
The remainder of this page is not central to running a WebRTC application on Modal,
but is included for completeness.
### YOLO helper functions
The two functions below are used to set up the YOLO model and create our custom [`MediaStreamTrack`](https://aiortc.readthedocs.io/en/latest/api.html#aiortc.MediaStreamTrack).
The first, `get_yolo_model`, sets up the ONNXRuntime and loads the model weights.
We call this in the `initialize` method of the `ModalWebRtcPeer` class
so that it only happens once per container.
```
def get_yolo_model(cache_path):
import onnxruntime
from .yolo import YOLOv10
onnxruntime.preload_dlls()
return YOLOv10(cache_path)
```
The second, `get_yolo_track`, creates a custom `MediaStreamTrack` that performs object detection on the video stream.
We call this in the `setup_streams` method of the `ModalWebRtcPeer` class
so it happens once per peer connection.
```
def get_yolo_track(track, yolo_model=None):
import numpy as np
import onnxruntime
from aiortc import MediaStreamTrack
from aiortc.contrib.media import VideoFrame
from .yolo import YOLOv10
class YOLOTrack(MediaStreamTrack):
"""
Custom media stream track performs object detection
on the video stream and passes it back to the source peer
"""
kind: str = "video"
conf_threshold: float = 0.15
def __init__(self, track: MediaStreamTrack, yolo_model=None) -> None:
super().__init__()
self.track = track
if yolo_model is None:
onnxruntime.preload_dlls()
self.yolo_model = YOLOv10(CACHE_PATH)
else:
self.yolo_model = yolo_model
def detection(self, image: np.ndarray) -> np.ndarray:
import cv2
orig_shape = image.shape[:-1]
image = cv2.resize(
image,
(self.yolo_model.input_width, self.yolo_model.input_height),
)
image = self.yolo_model.detect_objects(image, self.conf_threshold)
image = cv2.resize(image, (orig_shape[1], orig_shape[0]))
return image
# this is the essential method we need to implement
# to create a custom MediaStreamTrack
async def recv(self) -> VideoFrame:
frame = await self.track.recv()
img = frame.to_ndarray(format="bgr24")
processed_img = self.detection(img)
# VideoFrames are from a really nice package called av
# which is a pythonic wrapper around ffmpeg
# and a dependency of aiortc
new_frame = VideoFrame.from_ndarray(processed_img, format="bgr24")
new_frame.pts = frame.pts
new_frame.time_base = frame.time_base
return new_frame
return YOLOTrack(track, yolo_model)
```
### Testing a WebRTC application on Modal
As any seasoned developer of real-time applications on the Web will tell you,
testing and ensuring correctness is quite difficult. We spent nearly as much time
designing and troubleshooting an appropriate testing process for this application as we did writing
the application itself!
You can find the testing code in the GitHub repository [here](https://github.com/modal-labs/modal-examples/tree/main/07_web_endpoints/webrtc/webrtc_yolo_test.py).
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds. After creating a free account, install the Modal Python package and create an API token:
```
pip install modal
modal setup
```
Then clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
```
git clone https://github.com/modal-labs/modal-examples
cd modal-examples
modal serve -m 07_web_endpoints.webrtc.webrtc_yolo
```
Miscellaneous examples
======================
Looking for how to make a popular model or library work with Modal?
There’s a guide for that:
* [Fine-tune Flan-T5 and monitor with TensorBoard](flan_t5_finetune.html)
* [Profile PyTorch code](torch_profiling.html)
* [Real-time object detection with webcam input](webcam.html)
* [Run batched Whisper transcription](batched_whisper.html)
* [Run continuous integration (CI) tests on Modal](ci-on-modal.html)
* [Run multilingual chat rooms with SeamlessM4T-V2](seamless-chat.html)
* [Run OpenCV to detect faces](count_faces.html)
* [Run SAM 2 video segmentation model](segment_anything.html)
* [Run Stable Video Diffusion image-to-video model](stable_video_diffusion.html)
* [Run Text Embedding Inference (TEI)](text_embeddings_inference.html)
You can find even more examples on the [`modal-examples` GitHub repository](https://github.com/modal-labs/modal-examples) or find larger projects built by Modal users at the [`awesome-modal` GitHub repository](https://github.com/modal-labs/awesome-modal).
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/yolo/finetune_yolo.py)
Fine-tune open source YOLO models for object detection
======================================================
Example by [@Erik-Dunteman](https://github.com/erik-dunteman) and [@AnirudhRahul](https://github.com/AnirudhRahul/).
The popular “You Only Look Once” (YOLO) model line provides high-quality object detection in an economical package.
In this example, we use the [YOLOv10](https://docs.ultralytics.com/models/yolov10/) model, released on May 23, 2024.
We will:
* Download two custom datasets from the [Roboflow](https://roboflow.com/) computer vision platform: a dataset of birds and a dataset of bees
* Fine-tune the model on those datasets, in parallel, using the [Ultralytics package](https://docs.ultralytics.com/)
* Run inference with the fine-tuned models on single images and on streaming frames
For commercial use, be sure to consult the [Ultralytics software license options](https://docs.ultralytics.com/#yolo-licenses-how-is-ultralytics-yolo-licensed),
which include AGPL-3.0.
Set up the environment
----------------------
```
import warnings
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import modal
```
Modal runs your code in the cloud inside containers. So to use it, we have to define the dependencies
of our code as part of the container’s [image](https://modal.com/docs/guide/custom-container).
```
image = (
modal.Image.debian_slim(python_version="3.10")
.apt_install( # install system libraries for graphics handling
["libgl1-mesa-glx", "libglib2.0-0"]
)
.pip_install( # install python libraries for computer vision
["ultralytics~=8.2.68", "roboflow~=1.1.37", "opencv-python~=4.10.0"]
)
.pip_install( # add an optional extra that renders images in the terminal
"term-image==0.7.1"
)
)
```
We also create a persistent [Volume](../guide/volumes.html) for storing datasets, trained weights, and inference outputs.
```
volume = modal.Volume.from_name("yolo-finetune", create_if_missing=True)
volume_path = ( # the path to the volume from within the container
Path("/root") / "data"
)
```
We attach both of these to a Modal [App](../guide/apps.html).
```
app = modal.App("yolo-finetune", image=image, volumes={volume_path: volume})
```
Download a dataset
------------------
We’ll be downloading our data from the [Roboflow](https://roboflow.com/) computer vision platform, so to follow along you’ll need to:
* Create a free account on [Roboflow](https://app.roboflow.com/)
* [Generate a Private API key](https://app.roboflow.com/settings/api)
* Set up a Modal [Secret](../guide/secrets.html) called `roboflow-api-key` in the Modal UI [here](https://modal.com/secrets),
setting the `ROBOFLOW_API_KEY` to the value of your API key.
You’re also free to bring your own dataset with a config in YOLOv10-compatible yaml format.
We’ll be training on the medium size model, but you’re free to experiment with [other model sizes](https://docs.ultralytics.com/models/yolov10/#model-variants).
```
@dataclass
class DatasetConfig:
"""Information required to download a dataset from Roboflow."""
workspace_id: str
project_id: str
version: int
format: str
target_class: str
@property
def id(self) -> str:
return f"{self.workspace_id}/{self.project_id}/{self.version}"
@app.function(
secrets=[
modal.Secret.from_name("roboflow-api-key", required_keys=["ROBOFLOW_API_KEY"])
]
)
def download_dataset(config: DatasetConfig):
import os
from roboflow import Roboflow
rf = Roboflow(api_key=os.getenv("ROBOFLOW_API_KEY"))
project = (
rf.workspace(config.workspace_id)
.project(config.project_id)
.version(config.version)
)
dataset_dir = volume_path / "dataset" / config.id
project.download(config.format, location=str(dataset_dir))
```
Train a model
-------------
We train the model on a single A100 GPU. Training usually takes only a few minutes.
```
MINUTES = 60
TRAIN_GPU_COUNT = 1
TRAIN_GPU = f"A100:{TRAIN_GPU_COUNT}"
TRAIN_CPU_COUNT = 4
@app.function(
gpu=TRAIN_GPU,
cpu=TRAIN_CPU_COUNT,
timeout=60 * MINUTES,
)
def train(
model_id: str,
dataset: DatasetConfig,
model_size="yolov10m.pt",
quick_check=False,
):
from ultralytics import YOLO
volume.reload() # make sure volume is synced
model_path = volume_path / "runs" / model_id
model_path.mkdir(parents=True, exist_ok=True)
data_path = volume_path / "dataset" / dataset.id / "data.yaml"
model = YOLO(model_size)
model.train(
# dataset config
data=data_path,
fraction=0.4
if not quick_check
else 0.04, # fraction of dataset to use for training/validation
# optimization config
device=list(range(TRAIN_GPU_COUNT)), # use the GPU(s)
epochs=8 if not quick_check else 1, # pass over entire dataset this many times
batch=0.95, # automatic batch size to target fraction of GPU util
seed=117, # set seed for reproducibility
# data processing config
workers=max(
TRAIN_CPU_COUNT // TRAIN_GPU_COUNT, 1
), # split CPUs evenly across GPUs
cache=False, # cache preprocessed images in RAM?
# model saving config
project=f"{volume_path}/runs",
name=model_id,
exist_ok=True, # overwrite previous model if it exists
verbose=True, # detailed logs
)
```
Run inference on single inputs and on streams
---------------------------------------------
We demonstrate two different ways to run inference — on single images and on a stream of images.
The images we use for inference are loaded from the test set, which was added to our Volume when we downloaded the dataset.
Each image read takes ~50ms, and inference can take ~5ms, so the disk read would be our biggest bottleneck if we just looped over the image paths.
To avoid it, we parallelize the disk reads across many workers using Modal’s [`.map`](../guide/scale.html),
streaming the images to the model. This roughly mimics the behavior of an interactive object detection pipeline.
This can increase throughput up to ~60 images/s, or ~17 milliseconds/image, depending on image size.
```
@app.function()
def read_image(image_path: str):
import cv2
source = cv2.imread(image_path)
return source
```
We use the `@enter` feature of [`modal.Cls`](../guide/lifecycle-functions.html) to load the model only once on container start and reuse it for future inferences.
We use a generator to stream images to the model.
```
@app.cls(gpu="a10g")
class Inference:
weights_path: str = modal.parameter()
@modal.enter()
def load_model(self):
from ultralytics import YOLO
self.model = YOLO(self.weights_path)
@modal.method()
def predict(self, model_id: str, image_path: str, display: bool = False):
"""A simple method for running inference on one image at a time."""
results = self.model.predict(
image_path,
half=True, # use fp16
save=True,
exist_ok=True,
project=f"{volume_path}/predictions/{model_id}",
)
if display:
from term_image.image import from_file
terminal_image = from_file(results[0].path)
terminal_image.draw()
# you can view the output file via the Volumes UI in the Modal dashboard -- https://modal.com/storage
@modal.method()
def streaming_count(self, batch_dir: str, threshold: float | None = None):
"""Counts the number of objects in a directory of images.
Intended as a demonstration of high-throughput streaming inference."""
import os
import time
image_files = [os.path.join(batch_dir, f) for f in os.listdir(batch_dir)]
completed, start = 0, time.monotonic_ns()
for image in read_image.map(image_files):
# note that we run predict on a single input at a time.
# each individual inference is usually done before the next image arrives, so there's no throughput benefit to batching.
results = self.model.predict(
image,
half=True, # use fp16
save=False, # don't save to disk, as it slows down the pipeline significantly
verbose=False,
)
completed += 1
for res in results:
for conf in res.boxes.conf:
if threshold is None:
yield 1
continue
if conf.item() >= threshold:
yield 1
yield 0
elapsed_seconds = (time.monotonic_ns() - start) / 1e9
print(
"Inferences per second:",
round(completed / elapsed_seconds, 2),
)
```
Running the example
-------------------
We’ll kick off our parallel training jobs and run inference from the command line.
```
modal run finetune_yolo.py
```
This runs the training in `quick_check` mode, useful for debugging the pipeline and getting a feel for it.
To do a longer run that actually meaningfully improves performance, use:
```
modal run finetune_yolo.py --no-quick-check
```
```
@app.local_entrypoint()
def main(quick_check: bool = True, inference_only: bool = False):
"""Run fine-tuning and inference on two datasets.
Args:
quick_check: fine-tune on a small subset. Lower quality results, but faster iteration.
inference_only: skip fine-tuning and only run inference
"""
birds = DatasetConfig(
workspace_id="birds-s35xe",
project_id="birds-u8mti",
version=2,
format="yolov9",
target_class="🐥",
)
bees = DatasetConfig(
workspace_id="bees-tbdsg",
project_id="bee-counting",
version=11,
format="yolov9",
target_class="🐝",
)
datasets = [birds, bees]
# .for_each runs a function once on each element of the input iterators
# here, that means download each dataset, in parallel
if not inference_only:
download_dataset.for_each(datasets)
today = datetime.now().strftime("%Y-%m-%d")
model_ids = [dataset.id + f"/{today}" for dataset in datasets]
if not inference_only:
train.for_each(model_ids, datasets, kwargs={"quick_check": quick_check})
# let's run inference!
for model_id, dataset in zip(model_ids, datasets):
inference = Inference(
weights_path=str(volume_path / "runs" / model_id / "weights" / "best.pt")
)
# predict on a single image and save output to the volume
test_images = volume.listdir(
str(Path("dataset") / dataset.id / "test" / "images")
)
# run inference on the first 5 images
for ii, image in enumerate(test_images):
print(f"{model_id}: Single image prediction on image", image.path)
inference.predict.remote(
model_id=model_id,
image_path=f"{volume_path}/{image.path}",
display=(
ii == 0 # display inference results only on first image
),
)
if ii >= 4:
break
# streaming inference on images from the test set
print(f"{model_id}: Streaming inferences on all images in the test set...")
count = 0
for detection in inference.streaming_count.remote_gen(
batch_dir=f"{volume_path}/dataset/{dataset.id}/test/images"
):
if detection:
print(f"{dataset.target_class}", end="")
count += 1
else:
print("🎞️", end="", flush=True)
print(f"\n{model_id}: Counted {count} {dataset.target_class}s!")
```
Addenda
-------
The rest of the code in this example is utility code.
```
warnings.filterwarnings( # filter warning from the terminal image library
"ignore",
message="It seems this process is not running within a terminal. Hence, some features will behave differently or be disabled.",
category=UserWarning,
)
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds. After creating a free account, install the Modal Python package and create an API token:
```
pip install modal
modal setup
```
Then clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
```
git clone https://github.com/modal-labs/modal-examples
cd modal-examples
modal run 06_gpu_and_ml/yolo/finetune_yolo.py --no-quick-check
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/torch_profiling.py)
Tracing and profiling GPU-accelerated PyTorch programs on Modal
===============================================================
![A PyTorch trace loaded into ui.perfetto.dev](https://modal-public-assets.s3.amazonaws.com/tmpx_2c9bl5_c5aa7ab0.webp)
GPUs are high-performance computing devices. For high-performance computing,
tools for measuring and investigating performance are as critical
as tools for testing and confirming correctness in typical software.
In this example, we demonstrate how to wrap a Modal Function with PyTorch’s
built-in profiler, which captures events on both CPUs & GPUs. We also show
how to host TensorBoard, which includes useful visualizations and
performance improvement suggestions.
For a live walkthrough, check out [this video on our YouTube channel](https://www.youtube.com/watch?v=4cesQJLyHA8).
Saving traces to a Modal Volume
-------------------------------
Most tracing tools, including PyTorch’s profiler, produce results as files on disk.
Modal Functions run in ephemeral containers in Modal’s cloud infrastructure,
so by default these files disappear as soon as the Function finishes running.
We can ensure these files persist by saving them to a [Modal Volume](https://modal.com/docs/guide/volume).
Volumes are a distributed file system: files can be read or written from
by many machines across a network, in this case from inside any Modal Function.
To start, we just create a Volume with a specific name.
We’ll also set a particular directory that we’ll use for it
in our Functions below, for convenience.
```
from pathlib import Path
from typing import Optional
import modal
traces = modal.Volume.from_name("example-traces", create_if_missing=True)
TRACE_DIR = Path("/traces")
```
Setting up a Modal App with a GPU-accelerated PyTorch Function
--------------------------------------------------------------
We next set up the Modal Function that we wish to profile.
In general, we want to attach profiling tools to code that’s already in place
and measure or debug its performance, and then detach it as easily as possible
so that we can be confident that the same performance characteristics pertain in production.
In keeping with that workflow, in this example we first define the Modal Function we want to profile,
without including any of the profiling logic.
That starts with the Function’s environment: the Modal [App](../guide/apps.html) the Function is attached to, the container [Image](https://modal.com/docs/guide/custom-container) with the Function’s dependencies, and the hardware requirements of the Function, like a [GPU](../guide/cuda.html).
```
app = modal.App("example-torch-profiling") # create an App
image = modal.Image.debian_slim( # define dependencies
python_version="3.11"
).pip_install("torch==2.5.1", "numpy==2.1.3")
with image.imports(): # set up common imports
import torch
```
Here, we define the config as a dictionary so that we can re-use it here
and later, when we attach the profiler. We want to make sure the profiler is in the same environment!
```
config = {"gpu": "a10g", "image": image}
```
The Function we target for profiling appears below. It’s just some simple PyTorch logic
that repeatedly multiplies a random matrix with itself.
The logic is simple, but it demonstrates two common issues with
GPU-accelerated Python code that are relatively easily fixed:
1. Slowing down the issuance of work to the GPU
2. Providing insufficient work for the GPU to complete
We’ll cover these in more detail once we have the profiler set up.
```
@app.function(**config)
def underutilize(scale=1):
records = []
x = torch.randn( # 🐌 2: not enough work to keep the GPU busy
scale * 100, scale * 100, device="cuda"
)
for ii in range(10):
x = x @ x
class Record: # 🐌 1: heavy Python work in the hot loop
def __init__(self, value):
self.value = value
records.append(Record(ii))
x[0][0].cpu() # force a host sync for accurate timing
```
Wrapping a Modal Function with a profiler
-----------------------------------------
Now, let’s wrap our `underutilize` Function with another Modal Function
that runs PyTorch’s profiler while executing it.
This Function has the same environment `config` as `underutilize`,
but it also attaches a remote Modal Volume to save profiler outputs.
To increase the flexibility of this approach, we allow it to take the target Function’s name
as an argument. That’s not much use here where there’s only one Function,
but it makes it easier to copy-paste this code into your projects to add profiling.
```
@app.function(volumes={TRACE_DIR: traces}, **config)
def profile(
function,
label: Optional[str] = None,
steps: int = 3,
schedule=None,
record_shapes: bool = False,
profile_memory: bool = False,
with_stack: bool = False,
print_rows: int = 0,
**kwargs,
):
from uuid import uuid4
if isinstance(function, str):
try:
function = app.registered_functions[function]
except KeyError:
raise ValueError(f"Function {function} not found")
function_name = function.tag
output_dir = (
TRACE_DIR / (function_name + (f"_{label}" if label else "")) / str(uuid4())
)
output_dir.mkdir(parents=True, exist_ok=True)
if schedule is None:
if steps < 3:
raise ValueError("Steps must be at least 3 when using default schedule")
schedule = {"wait": 1, "warmup": 1, "active": steps - 2, "repeat": 0}
schedule = torch.profiler.schedule(**schedule)
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=schedule,
record_shapes=record_shapes,
profile_memory=profile_memory,
with_stack=with_stack,
on_trace_ready=torch.profiler.tensorboard_trace_handler(output_dir),
) as prof:
for _ in range(steps):
function.local(**kwargs) # <-- here we wrap the target Function
prof.step()
if print_rows:
print(
prof.key_averages().table(sort_by="cuda_time_total", row_limit=print_rows)
)
trace_path = sorted(
output_dir.glob("**/*.pt.trace.json"),
key=lambda pth: pth.stat().st_mtime,
reverse=True,
)[0]
print(f"trace saved to {trace_path.relative_to(TRACE_DIR)}")
return trace_path.read_text(), trace_path.relative_to(TRACE_DIR)
```
Triggering profiled execution from the command line and viewing in Perfetto
---------------------------------------------------------------------------
We wrap one more layer to make this executable from the command line:
a `local_entrypoint` that runs
```
modal run torch_profiling.py --function underutilize --print-rows 10
```
```
@app.local_entrypoint()
def main(
function: str = "underutilize",
label: Optional[str] = None,
steps: int = 3,
schedule=None,
record_shapes: bool = False,
profile_memory: bool = False,
with_stack: bool = False,
print_rows: int = 10,
kwargs_json_path: Optional[str] = None,
):
if kwargs_json_path is not None: # use to pass arguments to function
import json
kwargs = json.loads(Path(kwargs_json_path).read_text())
else:
kwargs = {}
results, remote_path = profile.remote(
function,
label=label,
steps=steps,
schedule=schedule,
record_shapes=record_shapes,
profile_memory=profile_memory,
with_stack=with_stack,
print_rows=print_rows,
**kwargs,
)
output_path = Path("/tmp") / remote_path.name
output_path.write_text(results)
print(f"trace saved locally at {output_path}")
```
Underneath the profile results, you’ll also see the path at which the trace was saved on the Volume
and the path at which it was saved locally.
You can view the trace in the free online [Perfetto UI](https://ui.perfetto.dev).
### Improving the performance of our dummy test code
The `underutilize` Function demonstrates two common patterns that lead to unnecessarily low GPU utilization:
1. Slowing down the issuance of work to the GPU
2. Providing insufficient work for the GPU to complete
We simulated 1 in `underutilize` by defining a Python class in the middle of the matrix multiplication loop.
This takes on the order of 10 microseconds, roughly the same time it takes our A10 GPU to do the matrix multiplication.
Move it out of the loop to observe a small improvement in utilization. In a real setting,
this code might be useful logging or data processing logic, which we must carefully keep
out of the way of the code driving work on the GPU.
We simulated 2 in `underutilize` by providing a matrix that is too small to occupy the GPU for long.
Increase the size of the matrix by a factor of 4 in each dimension (a factor of 16 total),
to increase the utilization without increasing the execution time.
This is an unintuitive feature of GPU programming in general: much work is done concurrently
and bottlenecks are non-obvious, so sometimes more work can be done for free or on the cheap.
In a server for large generative models, this might mean that producing multiple outputs per user
or handling multiple users at the same time is more economical than it at first seems!
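For concreteness, here is a hedged sketch of what those two fixes might look like applied to `underutilize`. The name `better_utilize` and the default `scale` are illustrative and not part of the example repository.
```
@app.function(**config)
def better_utilize(scale=4):  # 4x larger per dimension -> 16x more elements per matmul
    class Record:  # fix 1: define the Python class once, outside the hot loop
        def __init__(self, value):
            self.value = value

    records = []
    x = torch.randn(  # fix 2: a larger matrix gives the GPU enough work per step
        scale * 100, scale * 100, device="cuda"
    )
    for ii in range(10):
        x = x @ x
        records.append(Record(ii))  # only a cheap append remains in the loop
    x[0][0].cpu()  # force a host sync for accurate timing
```
If you add a Function like this to the file, you can profile it the same way, e.g. `modal run torch_profiling.py --function better_utilize`, and compare the resulting traces.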
Serving TensorBoard on Modal to view PyTorch profiles and traces
----------------------------------------------------------------
The TensorBoard experiment monitoring server also includes a plugin
for viewing and interpreting the results of PyTorch profiler runs:
the `torch_tb_profiler` plugin.
```
tb_image = modal.Image.debian_slim(python_version="3.11").pip_install(
"tensorboard==2.18.0", "torch_tb_profiler==0.4.3"
)
```
Because TensorBoard is a WSGI app, we can [host it on Modal](../guide/webhooks.html) with the `modal.wsgi_app` decorator.
Making this work with Modal requires one extra step:
we add some [WSGI Middleware](https://peps.python.org/pep-3333/) that checks the Modal Volume for updates
whenever the whole page is reloaded.
```
class VolumeMiddleware:
def __init__(self, app):
self.app = app
def __call__(self, environ, start_response):
if (route := environ.get("PATH_INFO")) in ["/", "/modal-volume-reload"]:
try:
traces.reload()
except Exception as e:
print("Exception while re-loading traces: ", e)
if route == "/modal-volume-reload":
environ["PATH_INFO"] = "/" # redirect
return self.app(environ, start_response)
```
You can deploy the TensorBoard server defined below with the following command:
```
modal deploy torch_profiling
```
and you can find your server at the URL printed to the terminal.
```
@app.function(
volumes={TRACE_DIR: traces},
image=tb_image,
max_containers=1, # single replica
scaledown_window=5 * 60, # five minute idle time
)
@modal.concurrent(max_inputs=100) # 100 concurrent request threads
@modal.wsgi_app()
def tensorboard():
import tensorboard
board = tensorboard.program.TensorBoard()
board.configure(logdir=str(TRACE_DIR))
(data_provider, deprecated_multiplexer) = board._make_data_provider()
wsgi_app = tensorboard.backend.application.TensorBoardWSGIApp(
board.flags,
board.plugin_loaders,
data_provider,
board.assets_zip_provider,
deprecated_multiplexer,
experimental_middlewares=[VolumeMiddleware],
)
return wsgi_app._create_wsgi_app()
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds. After creating a free account, install the Modal Python package and create an API token:
```
pip install modal
modal setup
```
Then clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
```
git clone https://github.com/modal-labs/modal-examples
cd modal-examples
modal run 06_gpu_and_ml/torch_profiling.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/stable_diffusion/text_to_image.py)
Run Stable Diffusion 3.5 Large Turbo as a CLI, API, and web UI
==============================================================
This example shows how to run [Stable Diffusion 3.5 Large Turbo](https://huggingface.co/stabilityai/stable-diffusion-3.5-large-turbo) on Modal
to generate images from your local command line, via an API, and as a web UI.
Inference takes about one minute to cold start,
at which point images are generated at a rate of one image every 1-2 seconds
for batch sizes between one and 16.
Below are four images produced by the prompt
“A princess riding on a pony”.
![stable diffusion montage](https://modal-cdn.com/cdnbot/sd-montage-princess-yxu2vnbl_e896a9c0.webp)
Basic setup
-----------
```
import io
import random
import time
from pathlib import Path
from typing import Optional
import modal
MINUTES = 60
```
All Modal programs need an [`App`](../reference/modal.App.html) — an object that acts as a recipe for
the application. Let’s give it a friendly name.
```
app = modal.App("example-text-to-image")
```
Configuring dependencies
------------------------
The model runs remotely inside a [container](https://modal.com/docs/guide/custom-container).
That means we need to install the necessary dependencies in that container’s image.
Below, we start from a lightweight base Linux image
and then install our Python dependencies, like Hugging Face’s `diffusers` library and `torch`.
```
CACHE_DIR = "/cache"
image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install(
"accelerate==0.33.0",
"diffusers==0.31.0",
"fastapi[standard]==0.115.4",
"huggingface-hub[hf_transfer]==0.25.2",
"sentencepiece==0.2.0",
"torch==2.5.1",
"torchvision==0.20.1",
"transformers~=4.44.0",
)
.env(
{
"HF_HUB_ENABLE_HF_TRANSFER": "1", # faster downloads
"HF_HUB_CACHE": CACHE_DIR,
}
)
)
with image.imports():
import diffusers
import torch
from fastapi import Response
```
Implementing SD3.5 Large Turbo inference on Modal
-------------------------------------------------
We wrap inference in a Modal [Cls](https://modal.com/docs/guide/lifecycle-methods) that ensures models are loaded and then moved to the GPU once when a new container
starts, before the container picks up any work.
The `run` function just wraps a `diffusers` pipeline.
It sends the output image back to the client as bytes.
We also include a `web` wrapper that makes it possible
to trigger inference via an API call.
See the `/docs` route of the URL ending in `inference-web.modal.run` that appears when you deploy the app for details.
```
MODEL_ID = "adamo1139/stable-diffusion-3.5-large-turbo-ungated"
MODEL_REVISION_ID = "9ad870ac0b0e5e48ced156bb02f85d324b7275d2"
cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
@app.cls(
image=image,
gpu="H100",
timeout=10 * MINUTES,
volumes={CACHE_DIR: cache_volume},
)
class Inference:
@modal.enter()
def load_pipeline(self):
self.pipe = diffusers.StableDiffusion3Pipeline.from_pretrained(
MODEL_ID,
revision=MODEL_REVISION_ID,
torch_dtype=torch.bfloat16,
).to("cuda")
@modal.method()
def run(
self, prompt: str, batch_size: int = 4, seed: Optional[int] = None
) -> list[bytes]:
seed = seed if seed is not None else random.randint(0, 2**32 - 1)
print("seeding RNG with", seed)
torch.manual_seed(seed)
images = self.pipe(
prompt,
num_images_per_prompt=batch_size, # outputting multiple images per prompt is much cheaper than separate calls
num_inference_steps=4, # turbo is tuned to run in four steps
guidance_scale=0.0, # turbo doesn't use CFG
max_sequence_length=512, # T5-XXL text encoder supports longer sequences, more complex prompts
).images
image_output = []
for image in images:
with io.BytesIO() as buf:
image.save(buf, format="PNG")
image_output.append(buf.getvalue())
torch.cuda.empty_cache() # reduce fragmentation
return image_output
@modal.fastapi_endpoint(docs=True)
def web(self, prompt: str, seed: Optional[int] = None):
return Response(
content=self.run.local( # run in the same container
prompt, batch_size=1, seed=seed
)[0],
media_type="image/png",
)
```
Generating Stable Diffusion images from the command line
--------------------------------------------------------
This is the command we’ll use to generate images. It takes a text `prompt`,
a `batch_size` that determines the number of images to generate per prompt,
and the number of times to run image generation (`samples`).
You can also provide a `seed` to make sampling more deterministic.
Run it with
```
modal run text_to_image.py
```
and pass `--help` to see more options.
```
@app.local_entrypoint()
def entrypoint(
samples: int = 4,
prompt: str = "A princess riding on a pony",
batch_size: int = 4,
seed: Optional[int] = None,
):
print(
f"prompt => {prompt}",
f"samples => {samples}",
f"batch_size => {batch_size}",
f"seed => {seed}",
sep="\n",
)
output_dir = Path("/tmp/stable-diffusion")
output_dir.mkdir(exist_ok=True, parents=True)
inference_service = Inference()
for sample_idx in range(samples):
start = time.time()
images = inference_service.run.remote(prompt, batch_size, seed)
duration = time.time() - start
print(f"Run {sample_idx + 1} took {duration:.3f}s")
if sample_idx:
print(
f"\tGenerated {len(images)} image(s) at {(duration) / len(images):.3f}s / image."
)
for batch_idx, image_bytes in enumerate(images):
output_path = (
output_dir
/ f"output_{slugify(prompt)[:64]}_{str(sample_idx).zfill(2)}_{str(batch_idx).zfill(2)}.png"
)
if not batch_idx:
print("Saving outputs", end="\n\t")
print(
output_path,
end="\n" + ("\t" if batch_idx < len(images) - 1 else ""),
)
output_path.write_bytes(image_bytes)
```
Generating Stable Diffusion images via an API
---------------------------------------------
The Modal `Cls` above also included a [`fastapi_endpoint`](basic_web.html),
which adds a simple web API to the inference method.
To try it out, run
```
modal deploy text_to_image.py
```
copy the printed URL ending in `inference-web.modal.run`,
and add `/docs` to the end. This will bring up the interactive
Swagger/OpenAPI docs for the endpoint.
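If you'd rather call the endpoint programmatically than through the docs page, here is a hedged sketch using `requests`. The URL is a placeholder; substitute the deployment URL printed by `modal deploy` (it ends in `inference-web.modal.run`).
```
from pathlib import Path

import requests

# placeholder URL: use the one printed when you deploy the app
url = "https://<your-workspace>--example-text-to-image-inference-web.modal.run"
response = requests.get(
    url, params={"prompt": "A princess riding on a pony", "seed": 117}
)
response.raise_for_status()
Path("princess.png").write_bytes(response.content)  # the response body is PNG bytes
```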
Generating Stable Diffusion images in a web UI
----------------------------------------------
Lastly, we add a simple front-end web UI (written in Alpine.js) for
our image generation backend.
This is also deployed by running
```
modal deploy text_to_image.py
```
The `Inference` class will serve multiple users from its own auto-scaling pool of warm GPU containers automatically.
```
frontend_path = Path(__file__).parent / "frontend"
web_image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install("jinja2==3.1.4", "fastapi[standard]==0.115.4")
.add_local_dir(frontend_path, remote_path="/assets")
)
@app.function(image=web_image)
@modal.concurrent(max_inputs=1000)
@modal.asgi_app()
def ui():
import fastapi.staticfiles
from fastapi import FastAPI, Request
from fastapi.templating import Jinja2Templates
web_app = FastAPI()
templates = Jinja2Templates(directory="/assets")
@web_app.get("/")
async def read_root(request: Request):
return templates.TemplateResponse(
"index.html",
{
"request": request,
"inference_url": Inference.web.get_web_url(),
"model_name": "Stable Diffusion 3.5 Large Turbo",
"default_prompt": "A cinematic shot of a baby raccoon wearing an intricate italian priest robe.",
},
)
web_app.mount(
"/static",
fastapi.staticfiles.StaticFiles(directory="/assets"),
name="static",
)
return web_app
def slugify(s: str) -> str:
return "".join(c if c.isalnum() else "-" for c in s).strip("-")
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds. After creating a free account, install the Modal Python package and create an API token:
```
pip install modal
modal setup
```
Then clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
```
git clone https://github.com/modal-labs/modal-examples
cd modal-examples
modal run 06_gpu_and_ml/stable_diffusion/text_to_image.py --prompt 'A 1600s oil painting of the New York City skyline'
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/07_web_endpoints/fasthtml-checkboxes/fasthtml_checkboxes.py)
Deploy 100,000 multiplayer checkboxes on Modal with FastHTML
============================================================
[![Screenshot of FastHTML Checkboxes UI](../../_app/immutable/assets/ui.BaSTrcQW.png)](https://modal-labs-examples--example-checkboxes-web.modal.run)
This example shows how you can deploy a multiplayer checkbox game with FastHTML on Modal.
[FastHTML](https://www.fastht.ml/) is a Python library built on top of [HTMX](https://htmx.org/) which allows you to create entire web applications using only Python.
For a simpler template for using FastHTML with Modal, check out [this example](https://modal.com/docs/examples/fasthtml_app).
Our example is inspired by [1 Million Checkboxes](https://onemillioncheckboxes.com/).
```
import time
from asyncio import Lock
from pathlib import Path
from uuid import uuid4
import modal
from .constants import N_CHECKBOXES
app = modal.App("example-checkboxes")
db = modal.Dict.from_name("example-checkboxes-db", create_if_missing=True)
css_path_local = Path(__file__).parent / "styles.css"
css_path_remote = "/assets/styles.css"
@app.function(
image=modal.Image.debian_slim(python_version="3.12")
.pip_install("python-fasthtml==0.6.9", "inflect~=7.4.0")
.add_local_file(css_path_local, remote_path=css_path_remote),
max_containers=1, # we currently maintain state in memory, so we restrict the server to one worker
)
@modal.concurrent(max_inputs=1000)
@modal.asgi_app()
def web():
import fasthtml.common as fh
import inflect
# Connected clients are tracked in-memory
clients = {}
clients_mutex = Lock()
# We keep all checkbox fasthtml elements in memory during operation, and persist to modal dict across restarts
checkboxes = db.get("checkboxes", [])
checkbox_mutex = Lock()
if len(checkboxes) == N_CHECKBOXES:
print("Restored checkbox state from previous session.")
else:
print("Initializing checkbox state.")
checkboxes = []
for i in range(N_CHECKBOXES):
checkboxes.append(
fh.Input(
id=f"cb-{i}",
type="checkbox",
checked=False,
# when clicked, that checkbox will send a POST request to the server with its index
hx_post=f"/checkbox/toggle/{i}",
hx_swap_oob="true", # allows us to later push diffs to arbitrary checkboxes by id
)
)
async def on_shutdown():
# Handle the shutdown event by persisting current state to modal dict
async with checkbox_mutex:
db["checkboxes"] = checkboxes
print("Checkbox state persisted.")
style = open(css_path_remote, "r").read()
app, _ = fh.fast_app(
# FastHTML uses the ASGI spec, which allows handling of shutdown events
on_shutdown=[on_shutdown],
hdrs=[fh.Style(style)],
)
# handler run on initial page load
@app.get("/")
async def get():
# register a new client
client = Client()
async with clients_mutex:
clients[client.id] = client
return (
fh.Title(f"{N_CHECKBOXES // 1000}k Checkboxes"),
fh.Main(
fh.H1(
f"{inflect.engine().number_to_words(N_CHECKBOXES).title()} Checkboxes"
),
fh.Div(
*checkboxes,
id="checkbox-array",
),
cls="container",
# use HTMX to poll for diffs to apply
hx_trigger="every 1s", # poll every second
hx_get=f"/diffs/{client.id}", # call the diffs endpoint
hx_swap="none", # don't replace the entire page
),
)
# users submitting checkbox toggles
@app.post("/checkbox/toggle/{i}")
async def toggle(i: int):
async with checkbox_mutex:
cb = checkboxes[i]
cb.checked = not cb.checked
checkboxes[i] = cb
async with clients_mutex:
expired = []
for client in clients.values():
# clean up old clients
if not client.is_active():
expired.append(client.id)
# add diff to client for when they next poll
client.add_diff(i)
for client_id in expired:
del clients[client_id]
return
# clients polling for any outstanding diffs
@app.get("/diffs/{client_id}")
async def diffs(client_id: str):
# we use the `hx_swap_oob='true'` feature to
# push updates only for the checkboxes that changed
async with clients_mutex:
client = clients.get(client_id, None)
if client is None or len(client.diffs) == 0:
return
client.heartbeat()
diffs = client.pull_diffs()
async with checkbox_mutex:
diff_array = [checkboxes[i] for i in diffs]
return diff_array
return app
```
Class for tracking state to push out to connected clients
```
class Client:
def __init__(self):
self.id = str(uuid4())
self.diffs = []
self.inactive_deadline = time.time() + 30
def is_active(self):
return time.time() < self.inactive_deadline
def heartbeat(self):
self.inactive_deadline = time.time() + 30
def add_diff(self, i):
if i not in self.diffs:
self.diffs.append(i)
def pull_diffs(self):
# return a copy of the diffs and clear them
diffs = self.diffs
self.diffs = []
return diffs
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal serve -m 07_web_endpoints.fasthtml-checkboxes.fasthtml_checkboxes
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/llm-structured/outlines_generate.py)
Enforcing JSON outputs on LLMs
==============================
[Outlines](https://github.com/outlines-dev/outlines) is a tool that lets you control the generation of language models to make their output more predictable.
This includes things like:
* Reducing the completion to a choice between multiple possibilities
* Type constraints
* Efficient regex-structured generation
* Efficient JSON generation following a Pydantic model
* Efficient JSON generation following a JSON schema
Outlines is considered an alternative to tools like [JSONFormer](https://github.com/1rgs/jsonformer), and can be used on top of a variety of LLMs, including:
* OpenAI models
* LLaMA
* Mamba
In this guide, we will show how you can use Outlines to enforce a JSON schema on the output of Mistral-7B.
Build image
-----------
First, you’ll want to build an image and install the relevant Python dependencies: `outlines` and a Hugging Face inference stack.
```
import modal
app = modal.App(name="outlines-app")
outlines_image = modal.Image.debian_slim(python_version="3.11").pip_install(
"outlines==0.0.44",
"transformers==4.41.2",
"sentencepiece==0.2.0",
"datasets==2.18.0",
"accelerate==0.27.2",
"numpy<2",
)
```
Download the model
------------------
Next, we download the Mistral 7B model from Hugging Face.
We do this as part of the definition of our Modal Image so that
we don’t need to download it every time our inference function is run.
```
MODEL_NAME = "mistral-community/Mistral-7B-v0.2"
def import_model(model_name):
import outlines
outlines.models.transformers(model_name)
outlines_image = outlines_image.run_function(
import_model, kwargs={"model_name": MODEL_NAME}
)
```
Define the schema
-----------------
Next, we define the schema that we want to enforce on the output of Mistral-7B. This schema is for a character description, and includes a name, age, armor, weapon, and strength.
```
schema = """{
"title": "Character",
"type": "object",
"properties": {
"name": {
"title": "Name",
"maxLength": 10,
"type": "string"
},
"age": {
"title": "Age",
"type": "integer"
},
"armor": {"$ref": "#/definitions/Armor"},
"weapon": {"$ref": "#/definitions/Weapon"},
"strength": {
"title": "Strength",
"type": "integer"
}
},
"required": ["name", "age", "armor", "weapon", "strength"],
"definitions": {
"Armor": {
"title": "Armor",
"description": "An enumeration.",
"enum": ["leather", "chainmail", "plate"],
"type": "string"
},
"Weapon": {
"title": "Weapon",
"description": "An enumeration.",
"enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"],
"type": "string"
}
}
}"""
```
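If you prefer working with Python objects, the same structure can be expressed as a Pydantic model; Outlines accepts these in place of a raw JSON schema string. The sketch below is illustrative and is not part of the original example (it assumes `pydantic` is added to the image’s `pip_install` list):
```
# Illustrative Pydantic equivalent of the JSON schema above (not part of the
# original example). Outlines also accepts Pydantic models directly.
from enum import Enum

from pydantic import BaseModel, Field


class Armor(str, Enum):
    leather = "leather"
    chainmail = "chainmail"
    plate = "plate"


class Weapon(str, Enum):
    sword = "sword"
    axe = "axe"
    mace = "mace"
    spear = "spear"
    bow = "bow"
    crossbow = "crossbow"


class Character(BaseModel):
    name: str = Field(max_length=10)
    age: int
    armor: Armor
    weapon: Weapon
    strength: int


# e.g. generator = outlines.generate.json(model, Character)
```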
Define the function
-------------------
Next, we define the generation function.
We use the `@app.function` decorator to tell Modal to run this function on the app we defined above.
Note that we import `outlines` from inside the Modal function. This is because the `outlines` package exists in the container, but not necessarily locally.
We specify that we want to use the Mistral-7B model, and then ask for a character, and we’ll receive structured data with the right schema.
```
@app.function(image=outlines_image, gpu="A100-40GB")
def generate(
prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
import outlines
model = outlines.models.transformers(MODEL_NAME, device="cuda")
generator = outlines.generate.json(model, schema)
character = generator(f"Give me a character description. Describe {prompt}.")
return character
```
Define the entrypoint
---------------------
Finally, we define the entrypoint that connects our local computer
to the functions above, which run on Modal, and we are done!
When you run this script with `modal run`, you should see something like this printed out:
`{'name': 'Amiri', 'age': 53, 'armor': 'leather', 'weapon': 'sword', 'strength': 10}`
```
@app.local_entrypoint()
def main(
prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
print(generate.remote(prompt))
```
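Because `local_entrypoint` arguments become command-line flags, you can also supply your own prompt when running the example, for instance (the prompt text here is just an illustration):
```
modal run 06_gpu_and_ml/llm-structured/outlines_generate.py --prompt "Zara, a 25 year old archer with a bow and chainmail armor."
```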
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/llm-structured/outlines_generate.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/protein-folding/boltz1.py)
Fold proteins with Boltz-1
==========================
Boltz-1 is an open source molecular structure prediction model that matches the performance of closed source models like AlphaFold 3.
It was created by the [MIT Jameel Clinic](https://jclinic.mit.edu/boltz-1/).
For details, see [their technical report](https://gcorso.github.io/assets/boltz1.pdf).
Here, we demonstrate how to run Boltz-1 on Modal.
Setup
-----
```
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import modal
here = Path(__file__).parent # the directory of this file
MINUTES = 60 # seconds
app = modal.App(name="example-boltz1-inference")
```
Fold a protein from the command line
------------------------------------
The logic for running Boltz-1 is encapsulated in the function below,
which you can trigger from the command line by running
```
modal run boltz1
```
This will set up the environment for running Boltz-1 inference in Modal’s cloud,
run it, and then save the results locally as a [tarball](https://computing.help.inf.ed.ac.uk/FAQ/whats-tarball-or-how-do-i-unpack-or-create-tgz-or-targz-file).
That tarball archive contains, among other things, the predicted structure as a [Crystallographic Information File](https://en.wikipedia.org/wiki/Crystallographic_Information_File),
which you can render with the online [Molstar Viewer](https://molstar.org/viewer).
You can pass any options for the [`boltz predict` command line tool](https://github.com/jwohlwend/boltz/blob/2355c62c957e95305527290112e9742d0565c458/docs/prediction.md) as a string, like
```
modal run boltz1 --args "--sampling_steps 10"
```
To see more options, run the command with the `--help` flag.
To learn how it works, read on!
```
@app.local_entrypoint()
def main(
force_download: bool = False, input_yaml_path: Optional[str] = None, args: str = ""
):
print("🧬 loading model remotely")
download_model.remote(force_download)
    if input_yaml_path is None:
        input_yaml_path = here / "data" / "boltz1_ligand.yaml"
    else:
        input_yaml_path = Path(input_yaml_path)  # accept a path passed as a string from the CLI
    input_yaml = input_yaml_path.read_text()
    msas = find_msas(input_yaml_path)
    print(f"🧬 running boltz with input from {input_yaml_path}")
    output = boltz1_inference.remote(input_yaml, msas, args=args)
output_path = Path("/tmp") / "boltz1" / "boltz1_result.tar.gz"
output_path.parent.mkdir(exist_ok=True, parents=True)
print(f"🧬 writing output to {output_path}")
output_path.write_bytes(output)
```
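After the run completes, you can inspect the archive locally, for example:
```
tar -tzf /tmp/boltz1/boltz1_result.tar.gz                 # list the contents
tar -xzf /tmp/boltz1/boltz1_result.tar.gz -C /tmp/boltz1  # extract alongside the archive
```
The extracted directory contains, among other things, the predicted structure as a `.cif` file that you can drop into the [Molstar Viewer](https://molstar.org/viewer) mentioned above.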
Installing Boltz-1 Python dependencies on Modal
-----------------------------------------------
Code running on Modal runs inside containers built from [container images](../guide/images.html) that include that code’s dependencies.
Because Modal images include [GPU drivers](../guide/cuda.html) by default,
installation of higher-level packages like `boltz` that require GPUs is painless.
Here, we do it in a few lines, using the `uv` package manager for extra speed.
```
image = modal.Image.debian_slim(python_version="3.12").run_commands(
"uv pip install --system --compile-bytecode boltz==0.3.2"
)
```
Storing Boltz-1 model weights on Modal with Volumes
---------------------------------------------------
Not all “dependencies” belong in a container image. Boltz-1, for example, depends on
the weights of the model and a [Chemical Component Dictionary](https://www.wwpdb.org/data/ccd) (CCD) file.
Rather than loading them dynamically at run-time (which would add several minutes of GPU time to each inference),
or installing them into the image (which would require they be re-downloaded any time the other dependencies changed),
we load them onto a [Modal Volume](../guide/volumes.html).
A Modal Volume is a file system that all of your code running on Modal (or elsewhere!) can access.
For more on storing model weights on Modal, see [this guide](../guide/model-weights.html).
For details on how we download the weights in this case, see the [Addenda](#addenda).
```
boltz_model_volume = modal.Volume.from_name("boltz1-models", create_if_missing=True)
models_dir = Path("/models/boltz1")
```
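Once the weights have been downloaded (see the [Addenda](#addenda) below), you can browse the Volume’s contents from your local machine with the Modal CLI:
```
modal volume ls boltz1-models
```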
Running Boltz-1 on Modal
------------------------
To run inference on Modal we wrap our function in a decorator, `@app.function`.
We provide that decorator with some arguments that describe the infrastructure our code needs to run:
the Volume we created, the Image we defined, and of course a fast GPU!
Note that the `boltz` command-line tool we use takes the path to a [specially-formatted YAML file](https://github.com/jwohlwend/boltz/blob/2355c62c957e95305527290112e9742d0565c458/docs/prediction.md) that includes definitions of molecules to predict the structures of and optionally paths to [Multiple Sequence Alignment](https://en.wikipedia.org/wiki/Multiple_sequence_alignment) (MSA) files
for any protein molecules. See the [Addenda](#addenda) for details.
```
@app.function(
image=image,
volumes={models_dir: boltz_model_volume},
timeout=10 * MINUTES,
gpu="H100",
)
def boltz1_inference(boltz_input_yaml: str, msas: list["MSA"], args="") -> bytes:
import shlex
import subprocess
input_path = Path("input.yaml")
input_path.write_text(boltz_input_yaml)
for msa in msas:
msa.path.write_text(msa.data)
args = shlex.split(args)
print(f"🧬 predicting structure using boltz model from {models_dir}")
subprocess.run(
["boltz", "predict", input_path, "--cache", str(models_dir)] + args,
check=True,
)
print("🧬 packaging up outputs")
output_bytes = package_outputs(f"boltz_results_{input_path.with_suffix('').name}")
return output_bytes
```
Addenda
-------
Above, we glossed over just how we got hold of the model weights —
the `local_entrypoint` just called a function named `download_model`.
Here’s the implementation of that function. For details, see our [guide to storing model weights on Modal](../guide/model-weights.html).
```
download_image = (
modal.Image.debian_slim()
.pip_install("huggingface_hub[hf_transfer]==0.26.3")
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # and enable it
)
@app.function(
volumes={models_dir: boltz_model_volume},
timeout=20 * MINUTES,
image=download_image,
)
def download_model(
force_download: bool = False,
revision: str = "7c1d83b779e4c65ecc37dfdf0c6b2788076f31e1",
):
from huggingface_hub import snapshot_download
snapshot_download(
repo_id="boltz-community/boltz-1",
revision=revision,
local_dir=models_dir,
force_download=force_download,
)
boltz_model_volume.commit()
print(f"🧬 model downloaded to {models_dir}")
```
Additionally, the YAML format accepted by the `boltz predict` command
includes the option to specify the sequence alignments for any input `protein` via a path to an MSA file (in the “aligned-FASTA” format, [`.a3m`](https://yanglab.qd.sdu.edu.cn/trRosetta/msa_format.html)).
To ensure these files are available to the Modal Function running remotely,
we parse the YAML file and extract the paths to and data from the MSA files.
```
@dataclass
class MSA:
data: str
path: Path
def find_msas(boltz_yaml_path: Path) -> list[MSA]:
"""Finds the MSA data in a YAML file in the Boltz input format.
See https://github.com/jwohlwend/boltz/blob/2355c62c957e95305527290112e9742d0565c458/docs/prediction.md for details."""
import yaml
data = yaml.safe_load(boltz_yaml_path.read_text())
data_dir = boltz_yaml_path.parent
sequences = data["sequences"]
msas = []
for sequence in sequences:
if protein := sequence.get("protein"):
if msa_path := protein.get("msa"):
if msa_path == "empty": # special value
continue
if not msa_path.startswith("."):
raise ValueError(
f"Must specify MSA paths relative to the input yaml path, but got {msa_path}"
)
msa_data = (data_dir / Path(msa_path).name).read_text()
msas.append(MSA(msa_data, Path(msa_path)))
return msas
def package_outputs(output_dir: str) -> bytes:
import io
import tarfile
tar_buffer = io.BytesIO()
with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar:
tar.add(output_dir, arcname=output_dir)
return tar_buffer.getvalue()
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/protein-folding/boltz1.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/long-training.py)
Run long, resumable training jobs on Modal
==========================================
Individual Modal Function calls have a [maximum timeout of 24 hours](../guide/timeouts.html).
You can still run long training jobs on Modal by making them interruptible and resumable
(aka [*reentrant*](https://en.wikipedia.org/wiki/Reentrancy_%28computing%29)).
This is usually done via checkpointing: saving the model state to disk at regular intervals.
We recommend implementing checkpointing logic regardless of the duration of your training jobs.
This prevents loss of progress in case of interruptions or [preemptions](../guide/preemption.html).
In this example, we’ll walk through how to implement this pattern in [PyTorch Lightning](https://lightning.ai/docs/pytorch/2.4.0/).
But the fundamental pattern is simple and can be applied to any training framework:
1. Periodically save checkpoints to a Modal [Volume](../guide/volumes.html)
2. When your training function starts, check the Volume for the latest checkpoint
3. Add [retries](../guide/retries.html) to your training function
Resuming from checkpoints in a training loop
--------------------------------------------
The `train` function below shows some very simple training logic
using the built-in checkpointing features of PyTorch Lightning.
Lightning uses a special filename, `last.ckpt`,
to indicate which checkpoint is the most recent.
We check for this file and resume training from it if it exists.
```
from pathlib import Path
from typing import Optional
import modal
def train(experiment):
experiment_dir = CHECKPOINTS_PATH / experiment
last_checkpoint = experiment_dir / "last.ckpt"
if last_checkpoint.exists():
print(f"⚡️ resuming training from the latest checkpoint: {last_checkpoint}")
train_model(
DATA_PATH,
experiment_dir,
resume_from_checkpoint=last_checkpoint,
)
print("⚡️ training finished successfully")
else:
print("⚡️ starting training from scratch")
train_model(DATA_PATH, experiment_dir)
```
This implementation works fine in a local environment.
Running it serverlessly and durably on Modal — with access to auto-scaling cloud GPU infrastructure
— does not require any adjustments to the code.
We just need to ensure that data and checkpoints are saved in Modal *Volumes*.
Modal Volumes are distributed file systems
------------------------------------------
Modal [Volumes](../guide/volumes.html) are distributed file systems —
you can read and write files from them just like local disks,
but they are accessible to all of your Modal Functions.
Their performance is tuned for [Write-Once, Read-Many](https://en.wikipedia.org/wiki/Write_once_read_many) workloads
with small numbers of large files.
You can attach them to any Modal Function that needs access.
But first, you need to create them:
```
volume = modal.Volume.from_name("example-long-training", create_if_missing=True)
```
Porting training to Modal
-------------------------
To attach a Modal Volume to our training function, we need to port it over to run on Modal.
That means we need to define our training function’s dependencies
(as a [container image](https://modal.com/docs/guide/custom-container))
and attach it to an application (a [`modal.App`](../guide/apps.html)).
Modal Functions that run on GPUs [already have CUDA drivers installed](../guide/cuda.html),
so dependency specification is straightforward.
We just `pip_install` PyTorch and PyTorch Lightning.
```
image = modal.Image.debian_slim(python_version="3.12").pip_install(
"lightning~=2.4.0", "torch~=2.4.0", "torchvision==0.19.0"
)
app = modal.App("example-long-training-lightning", image=image)
```
Next, we attach our training function to this app with `app.function`.
We define all of the serverless infrastructure-specific details of our training at this point.
For resumable training, there are three key pieces: attaching volumes, adding retries, and setting the timeout.
We want to attach the Volume to our Function so that the data and checkpoints are saved into it.
In this sample code, we set these paths via global variables, but in another setting,
these might be set via environment variables or other configuration mechanisms.
```
volume_path = Path("/experiments")
DATA_PATH = volume_path / "data"
CHECKPOINTS_PATH = volume_path / "checkpoints"
volumes = {volume_path: volume}
```
Then, we define how we want to restart our training in case of interruption.
We can use `modal.Retries` to add automatic retries to our Function.
We set the delay time to `0.0` seconds, because on pre-emption or timeout we want to restart immediately.
We set `max_retries` to the current maximum, which is `10`.
```
retries = modal.Retries(initial_delay=0.0, max_retries=10)
```
Timeouts on Modal are set in seconds, with a minimum of 10 seconds and a maximum of 24 hours.
When running training jobs that last up to a week, we’d set that timeout to 24 hours,
which would give our training job a maximum of 10 days to complete before we’d need to manually restart.
For this example, we’ll set it to 30 seconds. When running the example, you should observe a few interruptions.
```
timeout = 30 # seconds
```
Now, we put all of this together by wrapping `train` and decorating it
with `app.function` to add all the infrastructure.
```
@app.function(volumes=volumes, gpu="a10g", timeout=timeout, retries=retries)
def train_interruptible(*args, **kwargs):
train(*args, **kwargs)
```
Kicking off interruptible training
----------------------------------
We define a [`local_entrypoint`](../guide/apps.html#entrypoints-for-ephemeral-apps) to kick off the training job from the local Python environment.
```
@app.local_entrypoint()
def main(experiment: Optional[str] = None):
if experiment is None:
from uuid import uuid4
experiment = uuid4().hex[:8]
print(f"⚡️ starting interruptible training experiment {experiment}")
train_interruptible.remote(experiment)
```
You can run this with
```
modal run --detach 06_gpu_and_ml/long-training.py
```
You should see the training job start and then be interrupted,
producing a large stack trace in the terminal in red font.
The job will restart within a few seconds.
The `--detach` flag ensures training will continue even if you close your terminal or turn off your computer.
Try detaching and then watch the logs in the [Modal dashboard](../../login%EF%B9%96next=%EA%A4%B7apps.html).
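You can also check on your checkpoints as they accumulate by listing the Volume with the Modal CLI (assuming the directory layout defined above):
```
modal volume ls example-long-training checkpoints
```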
Details of PyTorch Lightning implementation
-------------------------------------------
This basic pattern works for any training framework or for custom training jobs —
or for any reentrant work that can save state to disk.
But to make the example complete, we include all the details of the PyTorch Lightning implementation below.
PyTorch Lightning offers [built-in checkpointing](https://pytorch-lightning.readthedocs.io/en/1.2.10/common/weights_loading.html).
You can specify the checkpoint file path that you want to resume from using the `ckpt_path` parameter of [`trainer.fit`](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html). Additionally, you can specify the checkpointing interval with the `every_n_epochs` parameter of [`ModelCheckpoint`](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html).
```
def get_checkpoint(checkpoint_dir):
from lightning.pytorch.callbacks import ModelCheckpoint
return ModelCheckpoint(
dirpath=checkpoint_dir,
save_last=True,
every_n_epochs=10,
filename="{epoch:02d}",
)
def train_model(data_dir, checkpoint_dir, resume_from_checkpoint=None):
import lightning as L
autoencoder = get_autoencoder()
train_loader = get_train_loader(data_dir=data_dir)
checkpoint_callback = get_checkpoint(checkpoint_dir)
trainer = L.Trainer(
limit_train_batches=100, max_epochs=100, callbacks=[checkpoint_callback]
)
if resume_from_checkpoint is not None:
trainer.fit(
model=autoencoder,
train_dataloaders=train_loader,
ckpt_path=resume_from_checkpoint,
)
else:
trainer.fit(autoencoder, train_loader)
def get_autoencoder(checkpoint_path=None):
import lightning as L
from torch import nn, optim
class LitAutoEncoder(L.LightningModule):
def __init__(self):
super().__init__()
self.encoder = nn.Sequential(
nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3)
)
self.decoder = nn.Sequential(
nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28)
)
def training_step(self, batch, batch_idx):
x, _ = batch
x = x.view(x.size(0), -1)
z = self.encoder(x)
x_hat = self.decoder(z)
loss = nn.functional.mse_loss(x_hat, x)
self.log("train_loss", loss)
return loss
def configure_optimizers(self):
optimizer = optim.Adam(self.parameters(), lr=1e-3)
return optimizer
return LitAutoEncoder()
def get_train_loader(data_dir):
from torch import utils
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
print("⚡ setting up data")
dataset = MNIST(data_dir, download=True, transform=ToTensor())
train_loader = utils.data.DataLoader(dataset, num_workers=4)
return train_loader
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run --detach 06_gpu_and_ml/long-training.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/controlnet/controlnet_gradio_demos.py)
Play with the ControlNet demos
==============================
This example allows you to play with all 10 demonstration Gradio apps from the new and amazing ControlNet project.
ControlNet provides a minimal interface allowing users to use images to constrain StableDiffusion’s generation process.
With ControlNet, users can easily condition the StableDiffusion image generation with different spatial contexts,
including depth maps, segmentation maps, scribble drawings, and keypoints!
[Demo video](https://user-images.githubusercontent.com/12058921/222927911-3ab52dd1-f2ee-4fb8-97e8-dafbf96ed5c5.mp4)
Imports and config preamble
---------------------------
```
import importlib
import os
import pathlib
from dataclasses import dataclass, field
import modal
from fastapi import FastAPI
```
Below are the configuration objects for all **10** demos provided in the original [lllyasviel/ControlNet](https://github.com/lllyasviel/ControlNet) repo.
The demos each depend on their own custom pretrained StableDiffusion model, and these models are 5-6GB each.
We can only run one demo at a time, so this module avoids downloading the model and ‘detector’ dependencies for
all 10 demos and instead uses the demo configuration object to download only what’s necessary for the chosen demo.
Even when limiting our dependency setup to what’s required for one demo, the resulting container image is *huge*.
```
@dataclass(frozen=True)
class DemoApp:
"""Config object defining a ControlNet demo app's specific dependencies."""
name: str
model_files: list[str]
detector_files: list[str] = field(default_factory=list)
demos = [
DemoApp(
name="canny2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_canny.pth"
],
),
DemoApp(
name="depth2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_depth.pth"
],
detector_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
],
),
DemoApp(
name="fake_scribble2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_scribble.pth"
],
detector_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth"
],
),
DemoApp(
name="hed2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_hed.pth"
],
detector_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth"
],
),
DemoApp(
name="hough2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_mlsd.pth"
],
detector_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_large_512_fp32.pth",
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_tiny_512_fp32.pth",
],
),
DemoApp(
name="normal2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_normal.pth"
],
),
DemoApp(
name="pose2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_openpose.pth"
],
detector_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth",
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth",
],
),
DemoApp(
name="scribble2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_scribble.pth"
],
),
DemoApp(
name="scribble2image_interactive",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_scribble.pth"
],
),
DemoApp(
name="seg2image",
model_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_seg.pth"
],
detector_files=[
"https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth"
],
),
]
demos_map: dict[str, DemoApp] = {d.name: d for d in demos}
```
Pick a demo, any demo
---------------------
Simply by changing the `DEMO_NAME` below, you can change which ControlNet demo app is setup
and run by this Modal script.
```
DEMO_NAME = "scribble2image" # Change this value to change the active demo app.
selected_demo = demos_map[DEMO_NAME]
```
Setting up the dependencies
---------------------------
ControlNet requires *a lot* of dependencies, which could be fiddly to set up manually, but Modal’s programmatic
container image building Python APIs handle this complexity straightforwardly and automatically.
To run any of the 10 demo apps, we need the following:
1. a base Python 3 Linux image (we use Debian Slim)
2. a bunch of third party PyPi packages
3. `git`, so that we can download the ControlNet source code (there’s no `controlnet` PyPi package)
4. some image-processing Linux system packages, including `ffmpeg`
5. and demo specific pre-trained model and detector `.pth` files
That’s a lot! Fortunately, the code below is already written for you: it stitches together a working container image
ready to produce remarkable ControlNet images.
**Note:** a ControlNet model pipeline is [now available in Huggingface’s `diffusers` package](https://huggingface.co/blog/controlnet). But this does not contain the demo apps.
```
def download_file(url: str, output_path: pathlib.Path):
import httpx
from tqdm import tqdm
with open(output_path, "wb") as download_file:
with httpx.stream("GET", url, follow_redirects=True) as response:
total = int(response.headers["Content-Length"])
with tqdm(
total=total, unit_scale=True, unit_divisor=1024, unit="B"
) as progress:
num_bytes_downloaded = response.num_bytes_downloaded
for chunk in response.iter_bytes():
download_file.write(chunk)
progress.update(
response.num_bytes_downloaded - num_bytes_downloaded
)
num_bytes_downloaded = response.num_bytes_downloaded
def download_demo_files() -> None:
"""
The ControlNet repo instructs: 'Make sure that SD models are put in "ControlNet/models".'
'ControlNet' is just the repo root, so we place in /root/models.
The ControlNet repo also instructs: 'Make sure that... detectors are put in "ControlNet/annotator/ckpts".'
'ControlNet' is just the repo root, so we place in /root/annotator/ckpts.
"""
demo = demos_map[os.environ["DEMO_NAME"]]
models_dir = pathlib.Path("/root/models")
for url in demo.model_files:
filepath = pathlib.Path(url).name
download_file(url=url, output_path=models_dir / filepath)
print(f"download complete for {filepath}")
detectors_dir = pathlib.Path("/root/annotator/ckpts")
for url in demo.detector_files:
filepath = pathlib.Path(url).name
download_file(url=url, output_path=detectors_dir / filepath)
print(f"download complete for {filepath}")
print("🎉 finished baking demo file(s) into image.")
image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install(
"fastapi[standard]==0.115.4",
"pydantic==2.9.1",
"starlette==0.41.2",
"gradio==3.16.2",
"albumentations==1.3.0",
"opencv-contrib-python",
"imageio==2.9.0",
"imageio-ffmpeg==0.4.2",
"pytorch-lightning==1.5.0",
"omegaconf==2.1.1",
"test-tube>=0.7.5",
"streamlit==1.12.1",
"einops==0.3.0",
"transformers==4.19.2",
"webdataset==0.2.5",
"kornia==0.6",
"open_clip_torch==2.0.2",
"invisible-watermark>=0.1.5",
"streamlit-drawable-canvas==0.8.0",
"torchmetrics==0.6.0",
"timm==0.6.12",
"addict==2.4.0",
"yapf==0.32.0",
"prettytable==3.6.0",
"safetensors==0.2.7",
"basicsr==1.4.2",
"tqdm~=4.64.1",
)
# xformers library offers performance improvement.
.pip_install("xformers", pre=True)
.apt_install("git")
# Here we place the latest ControlNet repository code into /root.
# Because /root is almost empty, but not entirely empty, `git clone` won't work,
# so this `init` then `checkout` workaround is used.
.run_commands(
"cd /root && git init .",
"cd /root && git remote add --fetch origin https://github.com/lllyasviel/ControlNet.git",
"cd /root && git checkout main",
)
.apt_install("ffmpeg", "libsm6", "libxext6")
.run_function(
download_demo_files,
secrets=[modal.Secret.from_dict({"DEMO_NAME": DEMO_NAME})],
)
)
app = modal.App(name="example-controlnet", image=image)
web_app = FastAPI()
```
Serving the Gradio web UI
-------------------------
Each ControlNet gradio demo module exposes a `block` Gradio interface running in queue-mode,
which is initialized in module scope on import and served on `0.0.0.0`. We want the block interface object,
but the queueing and launched webserver aren’t compatible with Modal’s serverless web endpoint interface,
so in the `import_gradio_app_blocks` function we patch out these behaviors.
```
def import_gradio_app_blocks(demo: DemoApp):
from gradio import blocks
# The ControlNet repo demo scripts are written to be run as
# standalone scripts, and have a lot of code that executes
# in global scope on import, including the launch of a Gradio web server.
# We want Modal to control the Gradio web app serving, so we
# monkeypatch the .launch() function to be a no-op.
blocks.Blocks.launch = lambda self, server_name: print(
"launch() has been monkeypatched to do nothing."
)
# each demo app module is a file like gradio_{name}.py
module_name = f"gradio_{demo.name}"
mod = importlib.import_module(module_name)
blocks = mod.block
# disable queueing mode, which is incompatible with our Modal web app setup.
blocks.enable_queue = False
return blocks
```
Because the ControlNet gradio apps are so time and compute intensive to cold-start,
the web app function is limited to running just 1 warm container (`max_containers=1`).
This way, while playing with the demos we can pay the cold-start cost once and have
all web requests hit the same warm container.
Spinning up extra containers to handle additional requests would not be efficient
given the cold-start time.
We set the `scaledown_window` to 600 seconds so the container will be kept
running for 10 minutes after the last request, to keep the app responsive in case
of continued experimentation.
```
@app.function(
gpu="A10G",
max_containers=1,
scaledown_window=600,
)
@modal.asgi_app()
def run():
from gradio.routes import mount_gradio_app
# mount for execution on Modal
return mount_gradio_app(
app=web_app,
blocks=import_gradio_app_blocks(demo=selected_demo),
path="/",
)
```
Have fun!
---------
Serve your chosen demo app with `modal serve controlnet_gradio_demos.py`. If you don’t have any images ready at hand,
try one that’s in the `06_gpu_and_ml/controlnet/demo_images/` folder.
StableDiffusion was already impressive enough, but ControlNet’s ability to so accurately and intuitively constrain
the image generation process is sure to put a big, dumb grin on your face.
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal serve 06_gpu_and_ml/controlnet/controlnet_gradio_demos.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/multion_news_agent.py)
MultiOn: Twitter News Agent
===========================
In this example, we use Modal to deploy a cron job that checks for AI news every day and tweets it on Twitter using the MultiOn Agent API.
Import and define the app
-------------------------
Let’s start off with imports, and defining a Modal app.
```
import os
import modal
app = modal.App("multion-news-tweet-agent")
```
Searching for AI News
---------------------
Let’s also define an image that has the `multion` package installed, so we can query the API.
```
multion_image = modal.Image.debian_slim().pip_install("multion")
```
We can now define our main entrypoint, which uses [MultiOn](https://www.multion.ai/) to scrape AI news every day and post it on our Twitter account.
We specify a [schedule](../guide/cron.html) in the function decorator, which
means that our function will run automatically at the given interval.
Set up MultiOn
--------------
[MultiOn](https://multion.ai/) is a Web Action Agent that can take actions on behalf of the user.
You can watch it in action [here](https://www.youtube.com/watch?v=Rm67ry6bogw).
The MultiOn API enables building the next level of web automation & custom AI agents capable of performing complex actions on the internet with just a few lines of code.
To get started, first create an account with [MultiOn](https://app.multion.ai/),
install the [MultiOn chrome extension](https://chrome.google.com/webstore/detail/ddmjhdbknfidiopmbaceghhhbgbpenmm) and login to your Twitter account in your browser.
To use the API, create a [MultiOn API Key](https://app.multion.ai/api-keys) and store it as a Modal Secret on [the dashboard](../../login%EF%B9%96next=%EA%A4%B7secrets.html).
```
@app.function(image=multion_image, secrets=[modal.Secret.from_name("MULTION_API_KEY")])
def news_tweet_agent():
# Import MultiOn
import multion
# Login to MultiOn using the API key
multion.login(use_api=True, multion_api_key=os.environ["MULTION_API_KEY"])
# Enable the Agent to run locally
multion.set_remote(False)
params = {
"url": "https://www.multion.ai",
"cmd": "Go to twitter (im already signed in). Search for the last tweets i made (check the last 10 tweets). Remember them so then you can go a search for super interesting AI news. Search the news on up to 3 different sources. If you see that the source has not really interesting AI news or i already made a tweet about that, then go to a different one. When you finish the research, go and make a few small and interesting AI tweets with the info you gathered. Make sure the tweet is small but informative and interesting for AI enthusiasts. Don't do more than 5 tweets",
"maxSteps": 100,
}
response = multion.browse(params)
print(f"MultiOn response: {response}")
```
Test running
------------
We can now test run our scheduled function as follows: `modal run multion_news_agent.py::app.news_tweet_agent`
Defining the schedule and deploying
-----------------------------------
Let’s define a function that Modal will call every day at 9:00 UTC, per the cron schedule below.
```
@app.function(schedule=modal.Cron("0 9 * * *"))
def run_daily():
news_tweet_agent.remote()
```
In order to deploy this as a persistent cron job, you can run `modal deploy multion_news_agent.py`.
Once the job is deployed, visit the [apps page](../../login%EF%B9%96next=%EA%A4%B7apps.html) to see
its execution history, logs, and other stats.
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 10_integrations/multion_news_agent.py
```
Parallel podcast transcription using Whisper
============================================
This example shows how to build a massively parallel application on Modal:
the [Modal Podcast Transcriber](https://modal-labs-examples--whisper-pod-transcriber-fastapi-app.modal.run/).
[![homepage of modal whisper transcriber app](../../_app/immutable/assets/modal-podcast-transcriber-frontpage.CDX3OEI-.png)](https://modal-labs-examples--whisper-pod-transcriber-fastapi-app.modal.run/)
This example application is more feature-packed than others, and it doesn’t fit in
a single page of code and commentary. So instead of progressing through the
example’s code linearly, this document provides a higher-level walkthrough of how
Modal is used to do fast, on-demand podcast episode transcription for whichever
podcast you’d like.
You can find the code [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/openai_whisper/pod_transcriber).
Hour-long episodes transcribed in just 1 minute
-----------------------------------------------
The focal point of this demonstration app is that it does serverless CPU
transcription across dozens of containers at the click of a button, completing
hour-long audio files in just 1 minute.
We use a podcast metadata API to allow users to transcribe an arbitrary episode
from whatever niche podcast they desire — [how about *The Pen Addict*, a podcast dedicated to stationery](https://modal-labs-examples--whisper-pod-transcriber-fastapi-app.modal.run/#/episode/157765/)?
The video below shows the 45-minute-long first episode of [*Serial* season 2](https://serialpodcast.org/season-two/1/dustwun) being
transcribed in 62 seconds.
[Demo video](https://user-images.githubusercontent.com/12058921/199637855-d98bcabe-bff4-433b-a58f-1e309d69e14d.mp4)
Each transcription segment includes links back to the original audio.
[Demo video](https://user-images.githubusercontent.com/12058921/199637370-1cb1e070-8f60-4cc6-8c51-dc42bebcf29d.mp4)
### Try it yourself
If you’re itching to see this in action, here are links to begin transcribing
three popular podcasts:
1. [*Case 63* by Gimlet Media](https://modal-labs-examples--whisper-pod-transcriber-fastapi-app.modal.run/#/podcast/4951910)
2. [*The Joe Rogan Experience*](https://modal-labs-examples--whisper-pod-transcriber-fastapi-app.modal.run/#/podcast/10829)
3. [*The Psychology of your 20s*](https://modal-labs-examples--whisper-pod-transcriber-fastapi-app.modal.run/#/podcast/4295070)
Tech-stack overview
-------------------
The entire application is hosted serverlessly on Modal and consists of these
main components:
* A React + [Vite](https://vitejs.dev/) single page application (SPA) deployed
as static files into a Modal web endpoint.
* A Python backend running [FastAPI](https://fastapi.tiangolo.com/) in a Modal web endpoint.
* The [Podchaser API](https://api-docs.podchaser.com/docs/overview) provides
podcast search and episode metadata retrieval. It’s hooked into our code with
a [Modal Secret](../guide/secrets.html).
* A Modal async job queue, described in more detail below.
All of this is deployed with one command and costs `$0.00` when it’s not
transcribing podcasts or serving HTTP requests.
### Speed-boosting Whisper with parallelism
Modal’s dead-simple parallelism primitives are the key to doing the
transcription so quickly. Even with a GPU, transcribing a full episode serially
was taking around 10 minutes.
But by pulling in `ffmpeg` with a simple `.pip_install("ffmpeg-python")` addition to our Modal Image, we could exploit the natural silences of the
podcast medium to partition episodes into hundreds of short segments. Each
segment is transcribed by Whisper in its own container task,
and when all are done we stitch the segments back together with only a
minimal loss in transcription quality. This approach actually accords quite well
with Whisper’s model architecture:
> “The Whisper architecture is a simple end-to-end approach, implemented as an
> encoder-decoder Transformer. Input audio is split into 30-second chunks,
> converted into a log-Mel spectrogram, and then passed into an encoder.”
>
> ―[*Introducing Whisper*](https://openai.com/blog/whisper/)
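Concretely, the fan-out looks something like the sketch below. This is a simplified illustration of the pattern rather than the app’s actual code: the function names and hard-coded segment boundaries are stand-ins, and the real app derives segments from `ffmpeg` silence detection before stitching the transcripts back together.
```
# Simplified sketch of the parallel fan-out described above; the names and the
# hard-coded segment boundaries are illustrative, not the app's actual code.
import modal

app = modal.App("example-whisper-parallel-sketch")
image = modal.Image.debian_slim().pip_install("openai-whisper", "ffmpeg-python")


@app.function(image=image)
def transcribe_segment(segment: tuple[float, float]) -> str:
    start, end = segment
    # in the real app: slice the audio to [start, end) with ffmpeg and run Whisper on it
    return f"[{start:.0f}s-{end:.0f}s] ..."


@app.local_entrypoint()
def main():
    # in the real app, these boundaries come from silence detection
    segments = [(0.0, 30.0), (30.0, 60.0), (60.0, 90.0)]
    # .map fans the segments out across many containers and yields results in input order
    transcript = " ".join(transcribe_segment.map(segments))
    print(transcript)
```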
Run this app on Modal
---------------------
All source code for this example can be [found on GitHub](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/openai_whisper/pod_transcriber).
The `README.md` includes instructions on setting up the frontend build and
getting authenticated with the Podchaser API. Happy transcribing!
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/langchains/potus_speech_qanda.py)
Retrieval-augmented generation (RAG) for question-answering with LangChain
==========================================================================
In this example we create a large-language-model (LLM) powered question answering
web endpoint and CLI. Only a single document is used as the knowledge-base of the application,
the 2022 USA State of the Union address by President Joe Biden. However, this same application structure
could be extended to do question-answering over all State of the Union speeches, or other large text corpuses.
It’s the [LangChain](https://github.com/hwchase17/langchain) library that makes this all so easy.
This demo is only around 100 lines of code!
Defining dependencies
---------------------
The example uses packages to implement scraping, the document parsing & LLM API interaction, and web serving.
These are installed into a Debian Slim base image using the `pip_install` method.
Because OpenAI’s API is used, we also specify the `openai-secret` Modal Secret, which contains an OpenAI API key.
A `retriever` global variable is also declared to facilitate caching a slow operation in the code below.
```
from pathlib import Path
import modal
image = modal.Image.debian_slim(python_version="3.11").pip_install(
# scraping pkgs
"beautifulsoup4~=4.11.1",
"httpx==0.23.3",
"lxml~=4.9.2",
# llm pkgs
"faiss-cpu~=1.7.3",
"langchain==0.3.7",
"langchain-community==0.3.7",
"langchain-openai==0.2.9",
"openai~=1.54.0",
"tiktoken==0.8.0",
# web app packages
"fastapi[standard]==0.115.4",
"pydantic==2.9.2",
"starlette==0.41.2",
)
app = modal.App(
name="example-langchain-qanda",
image=image,
secrets=[modal.Secret.from_name("openai-secret", required_keys=["OPENAI_API_KEY"])],
)
retriever = None # embedding index that's relatively expensive to compute, so caching with global var.
```
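If you haven’t created the `openai-secret` Secret yet, you can add it on the Secrets page of the Modal dashboard or from the CLI, for example (substituting your own API key):
```
modal secret create openai-secret OPENAI_API_KEY=sk-...
```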
Scraping the speech
-------------------
It’s super easy to scrape the transcript of Biden’s speech using `httpx` and `BeautifulSoup`.
This speech is just one document and it’s relatively short, but it’s enough to demonstrate
the question-answering capability of the LLM chain.
```
def scrape_state_of_the_union() -> str:
import httpx
from bs4 import BeautifulSoup
url = "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-the-state-the-union-28"
# fetch article; simulate desktop browser
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9"
}
response = httpx.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")
# locate the div containing the speech
speech_div = soup.find("div", class_="field-docs-content")
if speech_div:
speech_text = speech_div.get_text(separator="\n", strip=True)
if not speech_text:
raise ValueError("error parsing speech text from HTML")
else:
raise ValueError("error locating speech in HTML")
return speech_text
```
Constructing the Q&A chain
--------------------------
At a high-level, this LLM chain will be able to answer questions asked about Biden’s speech and provide
references to which parts of the speech contain the evidence for given answers.
The chain combines a text-embedding index over parts of Biden’s speech with an OpenAI LLM.
The index is used to select the most likely relevant parts of the speech given the question, and these
are used to build a specialized prompt for the OpenAI language model.
```
def qanda_langchain(query: str) -> tuple[str, list[str]]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Support caching speech text on disk.
speech_file_path = Path("state-of-the-union.txt")
if speech_file_path.exists():
state_of_the_union = speech_file_path.read_text()
else:
print("scraping the 2022 State of the Union speech")
state_of_the_union = scrape_state_of_the_union()
speech_file_path.write_text(state_of_the_union)
# Questions about a document can often be answered
# using a much smaller subset, and passing more tokens
# to an LLM increases cost and latency.
# So here, we split up the speech into smaller chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
print("splitting speech into text chunks")
texts = text_splitter.split_text(state_of_the_union)
# Embedding-based query<->text similarity comparison is used to select
# a small subset of the speech text chunks.
# Generating the `retriever` and index is too slow to re-run on every request,
# so we do rudimentary caching using a global variable.
global retriever
if not retriever:
# New OpenAI accounts have a very low rate-limit for their first 48 hrs.
# It's too low to embed even just this single Biden speech.
# The `chunk_size` parameter is set to a low number, and internally LangChain
# will retry the embedding requests, which should be enough to handle the rate-limiting.
# Ref: https://platform.openai.com/docs/guides/rate-limits/overview.
print("generating docsearch indexer")
retriever = FAISS.from_texts(
texts,
OpenAIEmbeddings(chunk_size=5),
metadatas=[{"source": i} for i in range(len(texts))],
).as_retriever()
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
system_prompt = (
"You are an assistant for question-answering tasks. "
"Use the following pieces of retrieved context to answer "
"the question. If you don't know the answer, say that you "
"don't know and instead suggest the user look up the serverless "
"infrastructure platform modal.com. Keep your answer concise."
"\n\n"
"{context}"
)
prompt = ChatPromptTemplate.from_messages(
[
("system", system_prompt),
("human", "{input}"),
]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
print("running query against Q&A chain.\n")
result = rag_chain.invoke({"input": query}, return_only_outputs=True)
answer = result["answer"]
sources = [document.page_content for document in result["context"]]
return answer.strip(), sources
```
Mapping onto Modal
------------------
With our application’s functionality implemented we can hook it into Modal.
As said above, we’re implementing a web endpoint, `web`, and a CLI command, `cli`.
```
@app.function()
@modal.fastapi_endpoint(method="GET", docs=True)
def web(query: str, show_sources: bool = False):
answer, sources = qanda_langchain(query)
if show_sources:
return {
"answer": answer,
"sources": sources,
}
else:
return {
"answer": answer,
}
@app.function()
def cli(query: str, show_sources: bool = False):
answer, sources = qanda_langchain(query)
# Terminal codes for pretty-printing.
bold, end = "\033[1m", "\033[0m"
if show_sources:
print(f"🔗 {bold}SOURCES:{end}")
print(*reversed(sources), sep="\n----\n")
print(f"🦜 {bold}ANSWER:{end}")
print(answer)
```
Test run the CLI
----------------
```
modal run potus_speech_qanda.py --query "What did the president say about Justice Breyer"
🦜 ANSWER:
The president thanked Justice Breyer for his service and mentioned his legacy of excellence. He also nominated Ketanji Brown Jackson to continue in Justice Breyer's legacy.
```
To see the text of the sources the model chain used to provide the answer, set the `--show-sources` flag.
```
modal run potus_speech_qanda.py \
--query "How many oil barrels were released from reserves?" \
--show-sources
```
Test run the web endpoint
-------------------------
Modal makes it trivially easy to ship LangChain chains to the web. We can test drive this app’s web endpoint
by running `modal serve potus_speech_qanda.py` and then hitting the endpoint with `curl`:
```
curl --get \
--data-urlencode "query=What did the president say about Justice Breyer" \
https://modal-labs--example-langchain-qanda-web.modal.run # your URL here
```
```
{
"answer": "The president thanked Justice Breyer for his service and mentioned his legacy of excellence. He also nominated Ketanji Brown Jackson to continue in Justice Breyer's legacy."
}
```
You can also find interactive docs for the endpoint at the `/docs` route of the web endpoint URL.
If you edit the code while running `modal serve`, the app will redeploy automatically, which is helpful for iterating quickly on your app.
Once you’re ready to deploy to production, use `modal deploy`.
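For this example, that would be:
```
modal deploy potus_speech_qanda.py
```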
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/langchains/potus_speech_qanda.py --query 'How many oil barrels were released from reserves?'
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/text-to-video/mochi.py)
Text-to-video generation with Mochi
===================================
This example demonstrates how to run the [Mochi 1](https://github.com/genmoai/models) video generation model by [Genmo](https://www.genmo.ai/) on Modal.
Here’s one that we generated, inspired by our logo:
[Sample video](https://modal-cdn.com/modal-logo-splat.mp4)
Note that the Mochi model, at time of writing,
requires several minutes on one H100 to produce
a high-quality clip of even a few seconds.
A single video generation therefore costs about $0.33
at our ~$5/hr rate for H100s.
Keep your eyes peeled for improved efficiency
as the open source community works on this new model.
We welcome PRs to improve the performance of this example!
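As a rough sanity check on that figure, here is a back-of-the-envelope sketch (assuming about four minutes of H100 time per clip, which is not a measured number):
```
# back-of-the-envelope cost estimate using the rate quoted above
h100_dollars_per_hour = 5.00  # ~$5/hr for an H100 on Modal
minutes_per_clip = 4          # assumed; "several minutes" per short clip
print(f"~${h100_dollars_per_hour * minutes_per_clip / 60:.2f} per clip")  # ~$0.33
```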
Setting up the environment for Mochi
------------------------------------
At the time of writing, Mochi is supported natively in the [`diffusers`](https://github.com/huggingface/diffusers) library,
but only in a pre-release version.
So we’ll need to install `diffusers` and `transformers` from GitHub.
```
import string
import time
from pathlib import Path
import modal
app = modal.App()
image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("git")
.pip_install(
"torch==2.5.1",
"accelerate==1.1.1",
"hf_transfer==0.1.8",
"sentencepiece==0.2.0",
"imageio==2.36.0",
"imageio-ffmpeg==0.5.1",
"git+https://github.com/huggingface/transformers@30335093276212ce74938bdfd85bfd5df31a668a",
"git+https://github.com/huggingface/diffusers@99c0483b67427de467f11aa35d54678fd36a7ea2",
)
.env(
{
"HF_HUB_ENABLE_HF_TRANSFER": "1",
"HF_HOME": "/models",
}
)
)
```
Saving outputs
--------------
On Modal, we save large or expensive-to-compute data to [distributed Volumes](../guide/volumes.html).
We’ll use them for saving our Mochi weights as well as our video outputs.
```
VOLUME_NAME = "mochi-outputs"
outputs = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
OUTPUTS_PATH = Path("/outputs") # remote path for saving video outputs
MODEL_VOLUME_NAME = "mochi-model"
model = modal.Volume.from_name(MODEL_VOLUME_NAME, create_if_missing=True)
MODEL_PATH = Path("/models") # remote path for saving model weights
MINUTES = 60
HOURS = 60 * MINUTES
```
Downloading the model
---------------------
We download the model weights into Volume cache to speed up cold starts.
This download takes five minutes or more, depending on traffic
and network speed.
If you want to launch the download first,
before running the rest of the code,
use the following command from the folder containing this file:
```
modal run --detach mochi::download_model
```
The `--detach` flag ensures the download will continue
even if you close your terminal or shut down your computer
while it’s running.
```
with image.imports():
import torch
from diffusers import MochiPipeline
from diffusers.utils import export_to_video
@app.function(
image=image,
volumes={
MODEL_PATH: model,
},
timeout=20 * MINUTES,
)
def download_model(revision="83359d26a7e2bbe200ecbfda8ebff850fd03b545"):
# uses HF_HOME to point download to the model volume
MochiPipeline.from_pretrained(
"genmo/mochi-1-preview",
torch_dtype=torch.bfloat16,
revision=revision,
)
```
Setting up our Mochi class
--------------------------
We’ll use the `@app.cls` decorator to define a [Modal Class](../guide/lifecycle-functions.html), which we use to control the lifecycle of our cloud container.
We configure it to use our image, the distributed volume, and a single H100 GPU.
```
@app.cls(
image=image,
volumes={
OUTPUTS_PATH: outputs, # videos will be saved to a distributed volume
MODEL_PATH: model,
},
gpu="H100",
timeout=1 * HOURS,
)
class Mochi:
@modal.enter()
def load_model(self):
# our HF_HOME env var points to the model volume as the cache
self.pipe = MochiPipeline.from_pretrained(
"genmo/mochi-1-preview",
torch_dtype=torch.bfloat16,
)
self.pipe.enable_model_cpu_offload()
self.pipe.enable_vae_tiling()
@modal.method()
def generate(
self,
prompt,
negative_prompt="",
num_inference_steps=200,
guidance_scale=4.5,
num_frames=19,
):
frames = self.pipe(
prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale,
num_frames=num_frames,
).frames[0]
# save to disk using prompt as filename
mp4_name = slugify(prompt)
export_to_video(frames, Path(OUTPUTS_PATH) / mp4_name)
outputs.commit()
return mp4_name
```
Running Mochi inference
-----------------------
We can trigger Mochi inference from our local machine by running the code in
the local entrypoint below.
It ensures the model is downloaded to a remote volume,
spins up a new replica to generate a video (also saved remotely),
and then downloads the video to the local machine.
You can trigger it with:
```
modal run --detach mochi
```
Optional command line flags can be viewed with:
```
modal run mochi --help
```
Using these flags, you can tweak your generation from the command line:
```
modal run --detach mochi --prompt="a cat playing drums in a jazz ensemble" --num-inference-steps=64
```
```
@app.local_entrypoint()
def main(
prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.",
negative_prompt="",
num_inference_steps=200,
guidance_scale=4.5,
num_frames=19, # produces ~1s of video
):
mochi = Mochi()
mp4_name = mochi.generate.remote(
prompt=str(prompt),
negative_prompt=str(negative_prompt),
num_inference_steps=int(num_inference_steps),
guidance_scale=float(guidance_scale),
num_frames=int(num_frames),
)
print(f"🍡 video saved to volume at {mp4_name}")
local_dir = Path("/tmp/mochi")
local_dir.mkdir(exist_ok=True, parents=True)
local_path = local_dir / mp4_name
local_path.write_bytes(b"".join(outputs.read_file(mp4_name)))
print(f"🍡 video saved locally at {local_path}")
```
Addenda
-------
The remainder of the code in this file is utility code.
```
def slugify(prompt):
for char in string.punctuation:
prompt = prompt.replace(char, "")
prompt = prompt.replace(" ", "_")
prompt = prompt[:230] # since filenames can't be longer than 255 characters
mp4_name = str(int(time.time())) + "_" + prompt + ".mp4"
return mp4_name
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run --detach 06_gpu_and_ml/text-to-video/mochi.py --num-inference-steps 64
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/covid_datasette.py)
Publish interactive datasets with Datasette
===========================================
![Datasette user interface](../../_app/immutable/assets/covid_datasette_ui.DSyuiw8l.png)
This example shows how to serve a Datasette application on Modal. The published dataset
is COVID-19 case data from Johns Hopkins University, which is refreshed daily.
Try it out for yourself [here](https://modal-labs-examples--example-covid-datasette-ui.modal.run).
Some Modal features it uses:
* Volumes: a persisted volume lets us store and grow the published dataset over time.
* Scheduled functions: the underlying dataset is refreshed daily, so we schedule a function to run daily.
* Web endpoints: exposes the Datasette application for web browser interaction and API requests.
Basic setup
-----------
Let’s get started writing code.
For the Modal container image we need a few Python packages,
including `datasette` itself and `sqlite-utils`, plus the `unzip` system utility for unpacking the dataset archive.
```
import asyncio
import multiprocessing
import pathlib
import shutil
import subprocess
import tempfile
from datetime import datetime
from urllib.request import urlretrieve
import modal
app = modal.App("example-covid-datasette")
datasette_image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install("datasette~=0.63.2", "sqlite-utils")
.apt_install("unzip")
)
```
Persistent dataset storage
--------------------------
To separate database creation and maintenance from serving, we’ll need the underlying
database file to be stored persistently. To achieve this we use a [`Volume`](../guide/volumes.html).
```
volume = modal.Volume.from_name(
"example-covid-datasette-cache-vol", create_if_missing=True
)
DB_FILENAME = "covid-19.db"
VOLUME_DIR = "/cache-vol"
REPORTS_DIR = pathlib.Path(VOLUME_DIR, "COVID-19")
DB_PATH = pathlib.Path(VOLUME_DIR, DB_FILENAME)
```
Getting a dataset
-----------------
Johns Hopkins has been publishing up-to-date COVID-19 pandemic data on GitHub since early February 2020, and
as of late September 2022 daily reporting is still rolling in. Their dataset is what this example will use to
show off Modal and Datasette’s capabilities.
The full git repository for the dataset is over 6GB, but we only need around 300MB of it: the daily report CSVs, which we extract from a zip archive of the repository.
```
@app.function(
image=datasette_image,
volumes={VOLUME_DIR: volume},
retries=2,
)
def download_dataset(cache=True):
if REPORTS_DIR.exists() and cache:
print(f"Dataset already present and {cache=}. Skipping download.")
return
elif REPORTS_DIR.exists():
print("Cleaning dataset before re-downloading...")
shutil.rmtree(REPORTS_DIR)
print("Downloading dataset...")
urlretrieve(
"https://github.com/CSSEGISandData/COVID-19/archive/refs/heads/master.zip",
"/tmp/covid-19.zip",
)
print("Unpacking archive...")
prefix = "COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports"
with tempfile.TemporaryDirectory() as tmpdir:
subprocess.run(f"unzip /tmp/covid-19.zip {prefix}/* -d {tmpdir}", shell=True)
REPORTS_DIR.mkdir(parents=True)
tmpdir_path = pathlib.Path(tmpdir)
subprocess.run(f"mv {tmpdir_path / prefix}/* {REPORTS_DIR}", shell=True)
print("Committing the volume...")
volume.commit()
print("Finished downloading dataset.")
```
Data munging
------------
This dataset is no swamp, but a bit of data cleaning is still in order. The following two
functions read a handful of `.csv` files and clean the data, before inserting it into
SQLite.
```
def load_daily_reports():
daily_reports = list(REPORTS_DIR.glob("*.csv"))
if not daily_reports:
raise RuntimeError(f"Could not find any daily reports in {REPORTS_DIR}.")
# Preload report files to speed up sequential loading
pool = multiprocessing.Pool(128)
pool.map(preload_report, daily_reports)
for filepath in daily_reports:
yield from load_report(filepath)
def preload_report(filepath):
filepath.read_bytes()
def load_report(filepath):
import csv
mm, dd, yyyy = filepath.stem.split("-")
with filepath.open() as fp:
for row in csv.DictReader(fp):
province_or_state = (
row.get("\ufeffProvince/State")
or row.get("Province/State")
or row.get("Province_State")
or None
)
country_or_region = row.get("Country_Region") or row.get("Country/Region")
yield {
"day": f"{yyyy}-{mm}-{dd}",
"country_or_region": (
country_or_region.strip() if country_or_region else None
),
"province_or_state": (
province_or_state.strip() if province_or_state else None
),
"confirmed": int(float(row["Confirmed"] or 0)),
"deaths": int(float(row["Deaths"] or 0)),
"recovered": int(float(row["Recovered"] or 0)),
"active": int(row["Active"]) if row.get("Active") else None,
"last_update": row.get("Last Update") or row.get("Last_Update") or None,
}
```
Inserting into SQLite
---------------------
With the CSV processing out of the way, we’re ready to create an SQLite DB and feed data into it.
Importantly, the `prep_db` function mounts the same volume used by `download_dataset()`, and
rows are batch inserted with progress logged after each batch, as the full COVID-19 dataset has millions
of rows and takes some time to fully insert.
A more sophisticated implementation would only load new data instead of performing a full refresh,
but we’re keeping things simple for this example!
```
def chunks(it, size):
import itertools
return iter(lambda: tuple(itertools.islice(it, size)), ())
@app.function(
image=datasette_image,
volumes={VOLUME_DIR: volume},
timeout=900,
)
def prep_db():
import sqlite_utils
volume.reload()
print("Loading daily reports...")
records = load_daily_reports()
# Update database in a local temp dir
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir_path = pathlib.Path(tmpdir)
tmp_db_path = tmpdir_path / DB_FILENAME
if DB_PATH.exists():
shutil.copyfile(DB_PATH, tmp_db_path)
db = sqlite_utils.Database(tmp_db_path)
table = db["johns_hopkins_csse_daily_reports"]
batch_size = 100_000
for i, batch in enumerate(chunks(records, size=batch_size)):
truncate = True if i == 0 else False
table.insert_all(batch, batch_size=batch_size, truncate=truncate)
print(f"Inserted {len(batch)} rows into DB.")
table.create_index(["day"], if_not_exists=True)
table.create_index(["province_or_state"], if_not_exists=True)
table.create_index(["country_or_region"], if_not_exists=True)
db.close()
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(tmp_db_path, DB_PATH)
print("Syncing DB with volume.")
volume.commit()
print("Volume changes committed.")
```
Keep it fresh
-------------
Johns Hopkins commits new data to the dataset repository every day, so we set up
a [scheduled](../guide/cron.html) function to automatically refresh the database
every 24 hours.
```
@app.function(schedule=modal.Period(hours=24), timeout=1000)
def refresh_db():
print(f"Running scheduled refresh at {datetime.now()}")
download_dataset.remote(cache=False)
prep_db.remote()
```
Web endpoint
------------
Hooking up the SQLite database to a Modal webhook is as simple as it gets.
The Modal `@asgi_app` decorator wraps a few lines of code: one `import` and a few
lines to instantiate the `Datasette` instance and return its app server.
```
@app.function(
image=datasette_image,
volumes={VOLUME_DIR: volume},
)
@modal.concurrent(max_inputs=16)
@modal.asgi_app()
def ui():
from datasette.app import Datasette
ds = Datasette(files=[DB_PATH], settings={"sql_time_limit_ms": 10000})
asyncio.run(ds.invoke_startup())
return ds.app()
```
Publishing to the web
---------------------
Run this script using `modal run covid_datasette.py` and it will create the database.
You can then use `modal serve covid_datasette.py` to create a short-lived web URL
that exists until you terminate the script.
When publishing the interactive Datasette app you’ll want to create a persistent URL.
Just run `modal deploy covid_datasette.py`.
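In summary, the three ways to run this script are:
```
modal run covid_datasette.py     # build or refresh the database
modal serve covid_datasette.py   # short-lived URL for development
modal deploy covid_datasette.py  # persistent, deployed URL
```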
```
@app.local_entrypoint()
def run():
print("Downloading COVID-19 dataset...")
download_dataset.remote()
print("Prepping SQLite DB...")
prep_db.remote()
```
You can explore the data at the [deployed web endpoint](https://modal-labs-examples--example-covid-datasette-app.modal.run/covid-19).
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 10_integrations/covid_datasette.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/13_sandboxes/simple_code_interpreter.py)
Build a stateful, sandboxed code interpreter
============================================
This example demonstrates how to build a stateful code interpreter using a Modal [Sandbox](../guide/sandbox.html).
We’ll create a Modal Sandbox that listens for code to execute and then
executes the code in a Python interpreter. Because we’re running in a sandboxed
environment, we can safely use the “unsafe” `exec()` to execute the code.
Setting up a code interpreter in a Modal Sandbox
------------------------------------------------
Our code interpreter uses a Python “driver program” to listen for code
sent in JSON format to its standard input (`stdin`), execute the code,
and then return the results in JSON format on standard output (`stdout`).
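Concretely, each exchange with the driver is one JSON line in and one JSON line out. A minimal sketch of the wire format (illustrative values only):
```
import json

# what we will write to the driver's stdin, as a single line
request = json.dumps({"code": "print('hi')"})
# what the driver writes back on its stdout, also as a single line
response = json.dumps({"stdout": "hi\n", "stderr": ""})
```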
```
import inspect
import json
from typing import Any
import modal
import modal.container_process
def driver_program():
import json
import sys
from contextlib import redirect_stderr, redirect_stdout
from io import StringIO
# When you `exec` code in Python, you can pass in a dictionary
# that defines the global variables the code has access to.
# We'll use that to store state.
globals: dict[str, Any] = {}
while True:
command = json.loads(input()) # read a line of JSON from stdin
if (code := command.get("code")) is None:
print(json.dumps({"error": "No code to execute"}))
continue
# Capture the executed code's outputs
stdout_io, stderr_io = StringIO(), StringIO()
with redirect_stdout(stdout_io), redirect_stderr(stderr_io):
try:
exec(code, globals)
except Exception as e:
print(f"Execution Error: {e}", file=sys.stderr)
print(
json.dumps(
{
"stdout": stdout_io.getvalue(),
"stderr": stderr_io.getvalue(),
}
),
flush=True,
)
```
Now that we have the driver program, we can write a function to take a `ContainerProcess` that is running the driver program and execute code in it.
```
def run_code(p: modal.container_process.ContainerProcess, code: str):
p.stdin.write(json.dumps({"code": code}))
p.stdin.write("\n")
p.stdin.drain()
next_line = next(iter(p.stdout))
result = json.loads(next_line)
print(result["stdout"], end="")
print("\033[91m" + result["stderr"] + "\033[0m", end="")
```
We’ve got our driver program and our code runner. Now we can create a Sandbox
and run the driver program in it.
We have to convert the driver program to a string to pass it to the Sandbox.
Here we use `inspect.getsource` to get the source code as a string,
but you could also keep the driver program in a separate file and read it in.
```
driver_program_text = inspect.getsource(driver_program)
driver_program_command = f"""{driver_program_text}\n\ndriver_program()"""
app = modal.App.lookup("code-interpreter", create_if_missing=True)
sb = modal.Sandbox.create(app=app)
p = sb.exec("python", "-c", driver_program_command)
```
Running code in a Modal Sandbox
-------------------------------
Now we can execute some code in the Sandbox!
```
run_code(p, "print('hello, world!')") # hello, world!
```
The Sandbox and our code interpreter are stateful,
so we can define variables and use them in subsequent code.
```
run_code(p, "x = 10")
run_code(p, "y = 5")
run_code(p, "result = x + y")
run_code(p, "print(f'The result is: {result}')") # The result is: 15
```
We can also see errors when code fails.
```
run_code(p, "print('Attempting to divide by zero...')")
run_code(p, "1 / 0") # Execution Error: division by zero
```
Finally, let’s clean up after ourselves and terminate the Sandbox.
```
sb.terminate()
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
python 13_sandboxes/simple_code_interpreter.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/text-to-audio/musicgen.py)
Create your own music samples with MusicGen
===========================================
MusicGen is a popular open-source music-generation model family from Meta.
In this example, we show you how you can run MusicGen models on Modal GPUs,
along with a Gradio UI for playing around with the model.
We use [Audiocraft](https://github.com/facebookresearch/audiocraft),
the inference library released by Meta
for MusicGen and its kin, like AudioGen.
Setting up dependencies
-----------------------
```
from pathlib import Path
from typing import Optional
from uuid import uuid4
import modal
```
We start by defining the environment our generation runs in.
This takes some explaining since, like most cutting-edge ML environments, it is a bit fiddly.
This environment is captured by a [container image](https://modal.com/docs/guide/custom-container),
which we build step-by-step by calling methods to add dependencies,
like `apt_install` to add system packages and `pip_install` to add
Python packages.
Note that we don’t have to install anything with “CUDA”
in the name — the drivers come for free with the Modal environment
and the rest gets installed by `pip`. That makes our life a lot easier!
If you want to see the details, check out [this guide](../guide/gpu.html) in our docs.
```
image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("git", "ffmpeg")
.pip_install(
"huggingface_hub[hf_transfer]==0.27.1", # speed up model downloads
"torch==2.1.0", # version pinned by audiocraft
"numpy<2", # defensively cap the numpy version
"git+https://github.com/facebookresearch/[email protected]", # we can install directly from GitHub!
)
)
```
In addition to source code, we’ll also need the model weights.
Audiocraft integrates with the Hugging Face ecosystem, so setting up the models
is straightforward — the same `get_pretrained` method we use to load the weights for execution
will also download them if they aren’t present.
```
def load_model(and_return=False):
from audiocraft.models import MusicGen
model_large = MusicGen.get_pretrained("facebook/musicgen-large")
if and_return:
return model_large
```
But Modal Functions are serverless: instances spin down when they aren’t being used.
If we want to avoid downloading the weights every time we start a new instance,
we need to store the weights somewhere besides our local filesystem.
So we add a Modal [Volume](../guide/volumes.html) to store the weights in the cloud.
```
cache_dir = "/cache"
model_cache = modal.Volume.from_name("audiocraft-model-cache", create_if_missing=True)
```
We don’t need to change any of the model loading code —
we just need to make sure the model gets stored in the right directory.
To do that, we set an environment variable that Hugging Face expects
(and another one that speeds up downloads, for good measure)
and then run the `load_model` Python function.
```
image = image.env(
{"HF_HUB_CACHE": cache_dir, "HF_HUB_ENABLE_HF_TRANSER": "1"}
).run_function(load_model, volumes={cache_dir: model_cache})
```
While we’re at it, let’s also define the environment for our UI.
We’ll stick with Python and so use FastAPI and Gradio.
```
web_image = modal.Image.debian_slim(python_version="3.11").pip_install(
"fastapi[standard]==0.115.4", "gradio==4.44.1"
)
```
This is a totally different environment from the one we run our model in.
Say goodbye to Python dependency conflict hell!
Running music generation on Modal
---------------------------------
Now, we write our music generation logic.
This is a bit complicated because we want to support generating long samples,
but the model has a maximum context length of thirty seconds.
We can get longer clips by feeding the model’s output back as input,
auto-regressively, but we have to write that ourselves.
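To make that arithmetic concrete, here is a small sketch (not part of the example) that simulates the segment schedule the `generate` method below follows, for hypothetical `duration` and `overlap` values:
```
# simulate the (segment length, new audio) schedule used by generate() below,
# assuming overlap < MAX_SEGMENT_DURATION
MAX_SEGMENT_DURATION = 30  # seconds of model context

def segment_schedule(duration, overlap):
    remaining, have_context, segments = duration, False, []
    while remaining > 0:
        segment = min(remaining + (overlap if have_context else 0), MAX_SEGMENT_DURATION)
        new_audio = segment - (overlap if have_context else 0)
        segments.append((segment, new_audio))
        remaining -= new_audio
        have_context = True
    return segments

print(segment_schedule(45, 10))  # [(30, 30), (25, 15)]: 45 seconds of new audio total
```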
There are also a few bits to make this work well with Modal:
* We make an [App](../guide/apps.html) to organize our deployment.
* We load the model at start, instead of during inference, with `modal.enter`,
which requires that we use a Modal [`Cls`](../guide/lifecycle-functions.html).
* In the `app.cls` decorator, we specify the Image we built and attach the Volume.
We also pick a GPU to run on — here, an NVIDIA L40S.
```
app = modal.App("example-musicgen")
MAX_SEGMENT_DURATION = 30 # maximum context window size
@app.cls(gpu="l40s", image=image, volumes={cache_dir: model_cache})
class MusicGen:
@modal.enter()
def init(self):
self.model = load_model(and_return=True)
@modal.method()
def generate(
self,
prompt: str,
duration: int = 10,
overlap: int = 10,
format: str = "wav", # or mp3
) -> bytes:
f"""Generate a music clip based on the prompt.
Clips longer than the MAX_SEGMENT_DURATION of {MAX_SEGMENT_DURATION}s
are generated by clipping all but `overlap` seconds and running inference again."""
context = None
overlap = min(overlap, MAX_SEGMENT_DURATION - 1)
remaining_duration = duration
if remaining_duration < 0:
return bytes()
while remaining_duration > 0:
# calculate duration of the next segment
segment_duration = remaining_duration
if context is not None:
segment_duration += overlap
segment_duration = min(segment_duration, MAX_SEGMENT_DURATION)
# generate next segment
generated_duration = (
segment_duration if context is None else (segment_duration - overlap)
)
print(f"🎼 generating {generated_duration} seconds of music")
self.model.set_generation_params(duration=segment_duration)
next_segment = self._generate_next_segment(prompt, context, overlap)
# update remaining duration
remaining_duration -= generated_duration
# combine with previous segments
context = self._combine_segments(context, next_segment, overlap)
output = context.detach().cpu().float()[0]
return to_audio_bytes(
output,
self.model.sample_rate,
format=format,
# for more on audio encoding parameters, see the docs for audiocraft
strategy="loudness",
loudness_compressor=True,
)
def _generate_next_segment(self, prompt, context, overlap):
"""Generate the next audio segment, either fresh or as continuation of a context."""
if context is None:
return self.model.generate(descriptions=[prompt])
else:
overlap_samples = overlap * self.model.sample_rate
last_chunk = context[:, :, -overlap_samples:] # B, C, T
return self.model.generate_continuation(
last_chunk, self.model.sample_rate, descriptions=[prompt]
)
def _combine_segments(self, context, next_segment, overlap: int):
"""Combine context with next segment, handling overlap."""
import torch
if context is None:
return next_segment
# Calculate where to trim the context (removing overlap)
overlap_samples = overlap * self.model.sample_rate
context_trimmed = context[:, :, :-overlap_samples] # B, C, T
return torch.cat([context_trimmed, next_segment], dim=2)
```
We can then generate music from anywhere by running code like what we have in the `local_entrypoint` below.
```
@app.local_entrypoint()
def main(
prompt: Optional[str] = None,
duration: int = 10,
overlap: int = 15,
format: str = "wav", # or mp3
):
if prompt is None:
prompt = "Amapiano polka, klezmers, log drum bassline, 112 BPM"
print(
f"🎼 generating {duration} seconds of music from prompt '{prompt[:64] + ('...' if len(prompt) > 64 else '')}'"
)
audiocraft = MusicGen()
clip = audiocraft.generate.remote(prompt, duration=duration, format=format)
dir = Path("/tmp/audiocraft")
dir.mkdir(exist_ok=True, parents=True)
output_path = dir / f"{slugify(prompt)[:64]}.{format}"
print(f"🎼 Saving to {output_path}")
output_path.write_bytes(clip)
```
You can execute it with a command like:
```
modal run musicgen.py --prompt="Baroque boy band, Bachstreet Boys, basso continuo, Top 40 pop music" --duration=60
```
Hosting a web UI for the music generator
----------------------------------------
With the Gradio library, we can create a simple web UI in Python
that calls out to our music generator,
then host it on Modal for anyone to try out.
To deploy both the music generator and the UI, run
```
modal deploy musicgen.py
```
Share the URL with your friends and they can generate their own songs!
```
@app.function(
image=web_image,
# Gradio requires sticky sessions
# so we limit the number of concurrent containers to 1
# and allow it to scale to 1000 concurrent inputs
max_containers=1,
)
@modal.concurrent(max_inputs=1000)
@modal.asgi_app()
def ui():
import gradio as gr
from fastapi import FastAPI
from gradio.routes import mount_gradio_app
api = FastAPI()
# Since this Gradio app is running from its own container,
# we make a `.remote` call to the music generator
model = MusicGen()
generate = model.generate.remote
temp_dir = Path("/dev/shm")
async def generate_music(prompt: str, duration: int = 10, format: str = "wav"):
audio_bytes = await generate.aio(prompt, duration=duration, format=format)
audio_path = temp_dir / f"{uuid4()}.{format}"
audio_path.write_bytes(audio_bytes)
return audio_path
with gr.Blocks(theme="soft") as demo:
gr.Markdown("# MusicGen")
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
duration = gr.Number(
label="Duration (seconds)", value=10, minimum=1, maximum=300
)
format = gr.Radio(["wav", "mp3"], label="Format", value="wav")
btn = gr.Button("Generate")
with gr.Column():
clip_output = gr.Audio(label="Generated Music", autoplay=True)
btn.click(
generate_music,
inputs=[prompt, duration, format],
outputs=[clip_output],
)
return mount_gradio_app(app=api, blocks=demo, path="/")
```
Addenda
-------
The remainder of the code here is not directly related to Modal
or to music generation, but is used in the example above.
```
def to_audio_bytes(wav, sample_rate: int, **kwargs) -> bytes:
from audiocraft.data.audio import audio_write
# audiocraft provides a nice utility for converting waveform tensors to audio,
# but it saves to a file path. here, we create a file path that is actually
# just backed by memory, instead of disk, to save on some latency
shm = Path("/dev/shm") # /dev/shm is a memory-backed filesystem
stem_name = shm / str(uuid4())
output_path = audio_write(stem_name, wav, sample_rate, **kwargs)
return output_path.read_bytes()
def slugify(string):
return (
string.lower()
.replace(" ", "-")
.replace("/", "-")
.replace("\\", "-")
.replace(":", "-")
)
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/text-to-audio/musicgen.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/llm-serving/chat_with_pdf_vision.py)
Chat with PDF: RAG with ColQwen2
================================
In this example, we demonstrate how to use the [ColQwen2](https://huggingface.co/vidore/colqwen2-v0.1) model to build a simple
“Chat with PDF” retrieval-augmented generation (RAG) app.
The ColQwen2 model is based on [ColPali](https://huggingface.co/blog/manu/colpali) but uses the [Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) vision-language model.
ColPali is in turn based on the late-interaction embedding approach pioneered in [ColBERT](https://dl.acm.org/doi/pdf/10.1145/3397271.3401075).
Vision-language models with high-quality embeddings obviate the need for complex pre-processing pipelines.
See [this blog post from Jo Bergum of Vespa](https://blog.vespa.ai/announcing-colbert-embedder-in-vespa/) for more.
Setup
-----
First, we’ll import the libraries we need locally and define some constants.
```
from pathlib import Path
from typing import Optional
from urllib.request import urlopen
from uuid import uuid4
import modal
MINUTES = 60 # seconds
app = modal.App("chat-with-pdf")
```
Setting up dependencies
-----------------------
In Modal, we define [container images](https://modal.com/docs/guide/custom-container) that run our serverless workloads.
We install the packages required for our application in those images.
```
CACHE_DIR = "/hf-cache"
model_image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("git")
.pip_install(
[
"git+https://github.com/illuin-tech/colpali.git@782edcd50108d1842d154730ad3ce72476a2d17d", # we pin the commit id
"hf_transfer==0.1.8",
"qwen-vl-utils==0.0.8",
"torchvision==0.19.1",
]
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": CACHE_DIR})
)
```
These dependencies are only installed remotely, so we can’t import them locally.
Use the `.imports` context manager to import them only on Modal instead.
```
with model_image.imports():
import torch
from colpali_engine.models import ColQwen2, ColQwen2Processor
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
```
Specifying the ColQwen2 model
-----------------------------
Vision-language models (VLMs) for embedding and generation add another layer of simplification
to RAG apps based on vector search: we only need one model.
```
MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
MODEL_REVISION = "aca78372505e6cb469c4fa6a35c60265b00ff5a4"
```
Managing state with Modal Volumes and Dicts
-------------------------------------------
Chat services are stateful:
the response to an incoming user message depends on past user messages in a session.
RAG apps add even more state:
the documents being retrieved from and the index over those documents,
e.g. the embeddings.
Modal Functions are stateless in and of themselves.
They don’t retain information from input to input.
That’s what enables Modal Functions to automatically scale up and down [based on the number of incoming requests](../guide/cold-start.html).
### Managing chat sessions with Modal Dicts
In this example, we use a [`modal.Dict`](../guide/dicts-and-queues.html) to store state information between Function calls.
Modal Dicts behave similarly to Python dictionaries,
but they are backed by remote storage and accessible to all of your Modal Functions.
They can contain any Python object
that can be serialized using [`cloudpickle`](https://github.com/cloudpipe/cloudpickle).
A Dict can hold a few gigabytes across keys of size up to 100 MiB,
so it works well for our chat session state, which is a few KiB per session,
and for our embeddings, which are a few hundred KiB per PDF page,
up to about 100,000 pages of PDFs.
At a larger scale, we’d need to replace this with a database, like Postgres,
or push more state to the client.
```
sessions = modal.Dict.from_name("colqwen-chat-sessions", create_if_missing=True)
class Session:
def __init__(self):
self.images = None
self.messages = []
self.pdf_embeddings = None
```
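The rest of the app reads and writes this Dict just like a local dictionary. A minimal sketch of the semantics we rely on below (the key here is hypothetical):
```
# illustrative only: store and fetch a session by key
sessions["demo-session"] = Session()     # values can be any cloudpickle-serializable object
restored = sessions.get("demo-session")  # .get returns None if the key is missing
```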
### Storing PDFs on a Modal Volume
Images extracted from PDFs are larger than our session state or embeddings
— low tens of MiB per page.
So we store them on a [Modal Volume](../guide/volumes.html),
which can store terabytes (or more!) of data across tens of thousands of files.
Volumes behave like a remote file system:
we read and write from them much like a local file system.
```
pdf_volume = modal.Volume.from_name("colqwen-chat-pdfs", create_if_missing=True)
PDF_ROOT = Path("/vol/pdfs/")
```
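As a quick illustration (not part of the app itself): inside a Function that attaches this Volume at `/vol/pdfs/`, as the class below does, reads and writes are ordinary file I/O, with `commit` and `reload` syncing changes across containers. The file name here is hypothetical:
```
# sketch only: run inside a Modal Function with {"/vol/pdfs/": pdf_volume} attached
(PDF_ROOT / "example.txt").write_text("hello")  # hypothetical file
pdf_volume.commit()  # persist local writes so other containers can see them
pdf_volume.reload()  # pick up writes committed elsewhere
```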
### Caching the model weights
We’ll also use a Volume to cache the model weights.
```
cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
```
Running this function will download the model weights to the cache volume.
Otherwise, the model weights will be downloaded on the first query.
```
@app.function(
image=model_image, volumes={CACHE_DIR: cache_volume}, timeout=20 * MINUTES
)
def download_model():
from huggingface_hub import snapshot_download
result = snapshot_download(
MODEL_NAME,
revision=MODEL_REVISION,
ignore_patterns=["*.pt", "*.bin"], # using safetensors
)
print(f"Downloaded model weights to {result}")
```
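To kick off the download ahead of time from the command line (using the file name from the run instructions further down):
```
modal run chat_with_pdf_vision.py::download_model
```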
Defining a Chat with PDF service
--------------------------------
To deploy an autoscaling “Chat with PDF” vision-language model service on Modal,
we just need to wrap our Python logic in a [Modal App](../guide/apps.html):
It uses [Modal `@app.cls`](../guide/lifecycle-functions.html) decorators
to organize the “lifecycle” of the app:
loading the model on container start (`@modal.enter`) and running inference on request (`@modal.method`).
We include in the arguments to the `@app.cls` decorator
all the information about this service’s infrastructure:
the container image, the remote storage, and the GPU requirements.
```
@app.cls(
image=model_image,
gpu="A100-80GB",
scaledown_window=10 * MINUTES, # spin down when inactive
volumes={"/vol/pdfs/": pdf_volume, CACHE_DIR: cache_volume},
)
class Model:
@modal.enter()
def load_models(self):
self.colqwen2_model = ColQwen2.from_pretrained(
"vidore/colqwen2-v0.1",
torch_dtype=torch.bfloat16,
device_map="cuda:0",
)
self.colqwen2_processor = ColQwen2Processor.from_pretrained(
"vidore/colqwen2-v0.1"
)
self.qwen2_vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
MODEL_NAME,
revision=MODEL_REVISION,
torch_dtype=torch.bfloat16,
)
self.qwen2_vl_model.to("cuda:0")
self.qwen2_vl_processor = AutoProcessor.from_pretrained(
"Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True
)
@modal.method()
def index_pdf(self, session_id, target: bytes | list):
# We store concurrent user chat sessions in a modal.Dict
# For simplicity, we assume that each user only runs one session at a time
session = sessions.get(session_id)
if session is None:
session = Session()
if isinstance(target, bytes):
images = convert_pdf_to_images.remote(target)
else:
images = target
# Store images on a Volume for later retrieval
session_dir = PDF_ROOT / f"{session_id}"
session_dir.mkdir(exist_ok=True, parents=True)
for ii, image in enumerate(images):
filename = session_dir / f"{str(ii).zfill(3)}.jpg"
image.save(filename)
        # Generate embeddings from the image(s)
BATCH_SZ = 4
pdf_embeddings = []
batches = [images[i : i + BATCH_SZ] for i in range(0, len(images), BATCH_SZ)]
for batch in batches:
batch_images = self.colqwen2_processor.process_images(batch).to(
self.colqwen2_model.device
)
pdf_embeddings += list(self.colqwen2_model(**batch_images).to("cpu"))
# Store the image embeddings in the session, for later retrieval
session.pdf_embeddings = pdf_embeddings
# Write embeddings back to the modal.Dict
sessions[session_id] = session
@modal.method()
def respond_to_message(self, session_id, message):
session = sessions.get(session_id)
if session is None:
session = Session()
pdf_volume.reload() # make sure we have the latest data
images = (PDF_ROOT / str(session_id)).glob("*.jpg")
images = list(sorted(images, key=lambda p: int(p.stem)))
# Nothing to chat about without a PDF!
if not images:
return "Please upload a PDF first"
elif session.pdf_embeddings is None:
return "Indexing PDF..."
# RAG, Retrieval-Augmented Generation, is two steps:
# _Retrieval_ of the most relevant data to answer the user's query
relevant_image = self.get_relevant_image(message, session, images)
# _Generation_ based on the retrieved data
output_text = self.generate_response(message, session, relevant_image)
# Update session state for future chats
append_to_messages(message, session, user_type="user")
append_to_messages(output_text, session, user_type="assistant")
sessions[session_id] = session
return output_text
# Retrieve the most relevant image from the PDF for the input query
def get_relevant_image(self, message, session, images):
import PIL
batch_queries = self.colqwen2_processor.process_queries([message]).to(
self.colqwen2_model.device
)
query_embeddings = self.colqwen2_model(**batch_queries)
# This scores our query embedding against the image embeddings from index_pdf
scores = self.colqwen2_processor.score_multi_vector(
query_embeddings, session.pdf_embeddings
)[0]
# Select the best matching image
max_index = max(range(len(scores)), key=lambda index: scores[index])
return PIL.Image.open(images[max_index])
# Pass the query and retrieved image along with conversation history into the VLM for a response
def generate_response(self, message, session, image):
chatbot_message = get_chatbot_message_with_image(message, image)
query = self.qwen2_vl_processor.apply_chat_template(
[*session.messages, chatbot_message],
tokenize=False,
add_generation_prompt=True,
)
image_inputs, _ = process_vision_info([chatbot_message])
inputs = self.qwen2_vl_processor(
text=[query],
images=image_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = self.qwen2_vl_model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [
out_ids[len(in_ids) :]
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = self.qwen2_vl_processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0]
return output_text
```
Loading PDFs as images
----------------------
Vision-Language Models operate on images, not PDFs directly,
so we need to convert our PDFs into images first.
We separate this from our indexing and chatting logic;
it runs in a different container with different dependencies.
```
pdf_image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("poppler-utils")
.pip_install("pdf2image==1.17.0", "pillow==10.4.0")
)
@app.function(image=pdf_image)
def convert_pdf_to_images(pdf_bytes):
from pdf2image import convert_from_bytes
images = convert_from_bytes(pdf_bytes, fmt="jpeg")
return images
```
Chatting with a PDF from the terminal
-------------------------------------
Before deploying in a UI, we can test our service from the terminal.
Just run
```
modal run chat_with_pdf_vision.py
```
and optionally pass in a path to or URL of a PDF with the `--pdf-path` argument
and specify a question with the `--question` argument.
Continue a previous chat by passing the session ID printed to the terminal at start
with the `--session-id` argument.
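For example, combining these flags (the values here mirror the defaults in the code below):
```
modal run chat_with_pdf_vision.py \
  --pdf-path "https://arxiv.org/pdf/1706.03762" \
  --question "What is this document about?"
```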
```
@app.local_entrypoint()
def main(
question: Optional[str] = None,
pdf_path: Optional[str] = None,
session_id: Optional[str] = None,
):
model = Model()
if session_id is None:
session_id = str(uuid4())
print("Starting a new session with id", session_id)
if pdf_path is None:
pdf_path = "https://arxiv.org/pdf/1706.03762" # all you need
if pdf_path.startswith("http"):
pdf_bytes = urlopen(pdf_path).read()
else:
pdf_bytes = Path(pdf_path).read_bytes()
print("Indexing PDF from", pdf_path)
model.index_pdf.remote(session_id, pdf_bytes)
else:
if pdf_path is not None:
raise ValueError("Start a new session to chat with a new PDF")
print("Resuming session with id", session_id)
if question is None:
question = "What is this document about?"
print("QUESTION:", question)
print(model.respond_to_message.remote(session_id, question))
```
A hosted Gradio interface
-------------------------
With the [Gradio](https://gradio.app) library, we can create a simple web interface around our class in Python,
then use Modal to host it for anyone to try out.
To deploy your own, run
```
modal deploy chat_with_pdf_vision.py
```
and navigate to the URL that appears in your terminal.
If you’re editing the code, use `modal serve` instead to see changes hot-reload.
```
web_image = pdf_image.pip_install(
"fastapi[standard]==0.115.4",
"pydantic==2.9.2",
"starlette==0.41.2",
"gradio==4.44.1",
"pillow==10.4.0",
"gradio-pdf==0.0.15",
"pdf2image==1.17.0",
)
@app.function(
image=web_image,
# gradio requires sticky sessions
# so we limit the number of concurrent containers to 1
# and allow it to scale to 1000 concurrent inputs
max_containers=1,
)
@modal.concurrent(max_inputs=1000)
@modal.asgi_app()
def ui():
import uuid
import gradio as gr
from fastapi import FastAPI
from gradio.routes import mount_gradio_app
from gradio_pdf import PDF
from pdf2image import convert_from_path
web_app = FastAPI()
    # Since this Gradio app runs in its own container,
    # we call the inference service via .remote() methods.
model = Model()
def upload_pdf(path, session_id):
if session_id == "" or session_id is None:
# Generate session id if new client
session_id = str(uuid.uuid4())
images = convert_from_path(path)
# Call to our remote inference service to index the PDF
model.index_pdf.remote(session_id, images)
return session_id
def respond_to_message(message, _, session_id):
# Call to our remote inference service to run RAG
return model.respond_to_message.remote(session_id, message)
with gr.Blocks(theme="soft") as demo:
session_id = gr.State("")
gr.Markdown("# Chat with PDF")
with gr.Row():
with gr.Column(scale=1):
gr.ChatInterface(
fn=respond_to_message,
additional_inputs=[session_id],
retry_btn=None,
undo_btn=None,
clear_btn=None,
)
with gr.Column(scale=1):
pdf = PDF(
label="Upload a PDF",
)
pdf.upload(upload_pdf, [pdf, session_id], session_id)
return mount_gradio_app(app=web_app, blocks=demo, path="/")
```
Addenda
-------
The remainder of this code consists of utility functions and boilerplate used in the
main code above.
```
def get_chatbot_message_with_image(message, image):
return {
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": message},
],
}
def append_to_messages(message, session, user_type="user"):
session.messages.append(
{
"role": user_type,
"content": {"type": "text", "text": message},
}
)
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/llm-serving/chat_with_pdf_vision.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/13_sandboxes/safe_code_execution.py)
Run arbitrary code in a sandboxed environment
=============================================
This example demonstrates how to run arbitrary code
in multiple languages in a Modal [Sandbox](../guide/sandbox.html).
Setting up a multi-language environment
---------------------------------------
Sandboxes allow us to run any kind of code in a safe environment.
We’ll use an image with a few different language runtimes to demonstrate this.
```
import modal
image = modal.Image.debian_slim(python_version="3.11").apt_install(
"nodejs", "ruby", "php"
)
app = modal.App.lookup("safe-code-execution", create_if_missing=True)
```
We’ll now create a Sandbox with this image. We’ll also enable output so we can see the image build
logs. Note that we don’t pass any commands to the Sandbox, so it will stay alive, waiting for us
to send it commands.
```
with modal.enable_output():
sandbox = modal.Sandbox.create(app=app, image=image)
print(f"Sandbox ID: {sandbox.object_id}")
```
Running bash, Python, Node.js, Ruby, and PHP in a Sandbox
---------------------------------------------------------
We can now use [`Sandbox.exec`](../reference/modal.Sandbox.html#exec) to run a few different
commands in the Sandbox.
```
bash_ps = sandbox.exec("echo", "hello from bash")
python_ps = sandbox.exec("python", "-c", "print('hello from python')")
nodejs_ps = sandbox.exec("node", "-e", 'console.log("hello from nodejs")')
ruby_ps = sandbox.exec("ruby", "-e", "puts 'hello from ruby'")
php_ps = sandbox.exec("php", "-r", "echo 'hello from php';")
print(bash_ps.stdout.read(), end="")
print(python_ps.stdout.read(), end="")
print(nodejs_ps.stdout.read(), end="")
print(ruby_ps.stdout.read(), end="")
print(php_ps.stdout.read(), end="")
print()
```
The output should look something like
```
hello from bash
hello from python
hello from nodejs
hello from ruby
hello from php
```
We can use multiple languages in tandem to build complex applications.
Let’s demonstrate this by piping data between Python and Node.js using bash. Here
we generate some random numbers with Python and sum them with Node.js.
```
combined_process = sandbox.exec(
"bash",
"-c",
"""python -c 'import random; print(\" \".join(str(random.randint(1, 100)) for _ in range(10)))' |
node -e 'const readline = require(\"readline\");
const rl = readline.createInterface({input: process.stdin});
rl.on(\"line\", (line) => {
const sum = line.split(\" \").map(Number).reduce((a, b) => a + b, 0);
console.log(`The sum of the random numbers is: ${sum}`);
rl.close();
});'""",
)
result = combined_process.stdout.read().strip()
print(result)
```
For long-running processes, you can use stdout as an iterator to stream the output.
```
slow_printer = sandbox.exec(
"ruby",
"-e",
"""
10.times do |i|
puts "Line #{i + 1}: #{Time.now}"
STDOUT.flush
sleep(0.5)
end
""",
)
for line in slow_printer.stdout:
print(line, end="")
```
This should print something like
```
Line 1: 2024-10-21 15:30:53 +0000
Line 2: 2024-10-21 15:30:54 +0000
...
Line 10: 2024-10-21 15:30:58 +0000
```
Since Sandboxes are safely separated from the rest of our system,
we can run very dangerous code in them!
```
sandbox.exec("rm", "-rfv", "/", "--no-preserve-root")
```
This command has deleted the entire filesystem, so we can’t run any more commands.
Let’s terminate the Sandbox to clean up after ourselves.
```
sandbox.terminate()
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
python 13_sandboxes/safe_code_execution.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/05_scheduling/hackernews_alerts.py)
Run cron jobs in the cloud to search Hacker News
================================================
In this example, we use Modal to deploy a cron job that periodically queries Hacker News for
new posts matching a given search term, and posts the results to Slack.
Import and define the app
-------------------------
Let’s start off with imports, and defining a Modal app.
```
import os
from datetime import datetime, timedelta
import modal
app = modal.App("example-hn-bot")
```
Now, let’s define an image that has the `slack-sdk` package installed, in which we can run a function
that posts a slack message.
```
slack_sdk_image = modal.Image.debian_slim().pip_install("slack-sdk")
```
Defining the function and importing the secret
----------------------------------------------
Our Slack bot will need access to a bot token.
We can use Modal’s [Secrets](https://modal.com/secrets) interface to accomplish this.
To quickly create a Slack bot secret, navigate to the [create secret](https://modal.com/secrets/create) page, select the Slack secret template
from the list options, and follow the instructions in the “Where to find the credentials?” panel.
Name your secret `hn-bot-slack`.
Now, we define the function `post_to_slack`, which simply instantiates the Slack client using our token,
and then uses it to post a message to a given channel name.
```
@app.function(
image=slack_sdk_image,
secrets=[modal.Secret.from_name("hn-bot-slack", required_keys=["SLACK_BOT_TOKEN"])],
)
async def post_to_slack(message: str):
import slack_sdk
client = slack_sdk.WebClient(token=os.environ["SLACK_BOT_TOKEN"])
client.chat_postMessage(channel="hn-alerts", text=message)
```
Searching Hacker News
---------------------
We are going to use Algolia’s [Hacker News Search API](https://hn.algolia.com/api) to query for posts
matching a given search term in the past X days. Let’s define our search term and query period.
```
QUERY = "serverless"
WINDOW_SIZE_DAYS = 1
```
Let’s also define an image that has the `requests` package installed, so we can query the API.
```
requests_image = modal.Image.debian_slim().pip_install("requests")
```
We can now define our main entrypoint, which queries Algolia for the term and calls `post_to_slack` on all the results. Below, we attach a [schedule](../guide/cron.html) to a wrapper function’s decorator, which means the search will run automatically at the given interval.
```
@app.function(image=requests_image)
def search_hackernews():
import requests
url = "http://hn.algolia.com/api/v1/search"
threshold = datetime.utcnow() - timedelta(days=WINDOW_SIZE_DAYS)
params = {
"query": QUERY,
"numericFilters": f"created_at_i>{threshold.timestamp()}",
}
response = requests.get(url, params, timeout=10).json()
urls = [item["url"] for item in response["hits"] if item.get("url")]
print(f"Query returned {len(urls)} items.")
post_to_slack.for_each(urls)
```
Test running
------------
We can now test run our scheduled function as follows: `modal run hackernews_alerts.py::app.search_hackernews`
Defining the schedule and deploying
-----------------------------------
Let’s define a function that Modal will call every day:
```
@app.function(schedule=modal.Period(days=1))
def run_daily():
search_hackernews.remote()
```
To deploy this as a persistent cron job, run `modal deploy hackernews_alerts.py`.
Once the job is deployed, visit the [apps page](https://modal.com/apps) to see
its execution history, logs, and other stats.
Deploy a personalized music video generation service on Modal
=============================================================
Music videos are [cool](https://youtu.be/Cye-1RP5jso),
but unless you are famous or [pay a lot of money](https://youtu.be/kfVsfOSbJY0),
you don’t get to star in them.
Until now!
[The repo](https://github.com/modal-labs/music-video-gen) includes all the code you need to deploy a custom
music video generator on [Modal](https://modal.com),
a serverless infrastructure platform for data, ML, and AI applications.
Below is a sample video, generated by Modal Developer Advocate [`@charles_irl`](https://twitter.com/charles_irl).
[](https://github.com/user-attachments/assets/5bd90898-7251-4298-808f-6d58ed4c6b6f)
And because Modal is [generic serverless infrastructure](https://twitter.com/charles_irl/status/1819438860771663923),
you can customize this custom music video generator however you wish —
it’s just code and containers!
Setup
-----
In the Python environment of your choosing,
run `pip install modal`.
If you run into trouble with Python environments,
we suggest using [this Google Colab notebook](https://colab.research.google.com/github/modal-labs/music-video-gen/blob/main/notebooks/self_contained.ipynb),
where we’ve set the environment up for you.
It’s a bit of work to get used to running terminal commands in a notebook
if you haven’t done that before, but the Python setup works and running the notebook in Colab is free!
All you need is a Google account.
Then, if you’ve never used Modal on the computer you’re using,
run `modal setup` to create an account on Modal (if you don’t have one)
and set up authentication.
Data Prep
---------
Create a folder inside `data/`, parallel to the sample data, `data/sample`.
You can name it whatever you want.
Place at least four images of yourself in that folder —
ideally eight or more.
Images should be in `.png` or `.jpg` format
and around 400 to 800 pixels on each side.
For best results, we recommend including a variety of images,
in particular ones where you are wearing different clothes and making different faces,
and also some images that have other people in them.
But you can also just take a few photos of yourself right now!
Optionally, add captions in `.txt` files in that same folder.
They should look something like `"[trigger] smiling at the camera, outdoor scene, close-up, selfie"`.
See the sample data for more example image-caption pairs.
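If your photos need converting or resizing first, a quick local script can take care of it. This isn’t part of the repo; it’s just a convenience sketch using Pillow, and the `raw-photos` and `data/me` folder names are placeholders for your own.
```
# Convenience sketch (not part of the repo): convert photos to .png and scale
# the longest side to ~768 px, within the suggested 400-800 px range.
from pathlib import Path

from PIL import Image

SRC = Path("raw-photos")  # placeholder: wherever your original photos live
DST = Path("data/me")     # placeholder: your training folder under data/
DST.mkdir(parents=True, exist_ok=True)

for i, path in enumerate(sorted(SRC.glob("*"))):
    if path.suffix.lower() not in {".jpg", ".jpeg", ".png"}:
        continue
    img = Image.open(path).convert("RGB")
    scale = 768 / max(img.size)
    if scale < 1:  # only shrink, never upscale
        img = img.resize((round(img.width * scale), round(img.height * scale)))
    img.save(DST / f"img_{i:03d}.png")
```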
Training
--------
Start up a JupyterLab server on Modal with
```
modal run train_from_notebook.py
```
Click the `modal.host` URL that appears in the output
to open Jupyter in the browser.
Open the training notebook, `training.ipynb`.
Read the notebook and run it, following the instructions to edit cells as needed.
In particular, change the dataset path to the folder you created —
it has been mounted on the remote cloud machine where the notebook is running.
You can also directly upload data to the `/root/data` folder on the remote machine.
You can even edit caption files inside of JupyterLab!
This data will stick around between runs, and you can find it with
```
modal volume ls finetune-video-data
```
See the help for `modal volume` and its subcommands for details.
The notebook will kick off training, which takes a few minutes.
Take note of the name given to your training run.
By default, it’s a hash like `38c67a92f6ce87882044ab53bf94cce0`,
but you can customize it in the notebook.
This is your `finetune-id`.
If you forget it, you can show all of your `finetune-id`s
by running
```
modal volume ls finetune-video-models
```
Inference
---------
Test out your new fine-tuned model by running:
```
modal run inference.py --finetune-id {your-finetune-id} --num-frames 15
```
You can also provide a `--prompt` to customize the generation.
You can deploy the video generator onto Modal with
```
modal deploy inference.py
```
Modal is serverless, so this won’t cost you any money when it isn’t serving any traffic.
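Because the deployment persists, other Python code can also look the generator up by name and call it, which is what `music_video_generator.py` relies on. Here’s a rough sketch of that pattern; the App and Function names below are placeholders, not the repo’s actual names (check your `modal deploy` output or `modal app list` for the real ones).
```
# Sketch only: calling a deployed Modal Function from any Python environment.
# "music-video-inference" and "generate" are placeholder names.
import modal

generate = modal.Function.from_name("music-video-inference", "generate")

# The argument signature here is illustrative, not the repo's actual interface.
result = generate.remote("your-finetune-id", "a person coding up a storm")
```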
Music video generation
----------------------
Once you’ve deployed an inference endpoint,
you can generate a music video starring yourself by running
```
modal run music_video_generator.py --finetune-id {your-finetune-id}
```
With the default settings, this will create a thirty second video in about five minutes
by running generation in parallel on seven H100s.
The music can be changed by passing in a different song via the `--mp3-file` argument.
The default is a Modal-themed song in `data/coding-up-a-storm.mp3`.
This song was created with [Suno](https://suno.com),
a music generation service — that runs on Modal!
If you want to DIY music generation as well,
see [this example](musicgen.html) in the Modal docs.
The generated clips can be changed by passing a different list of prompts via the `--prompt-file` argument.
The default is a set of prompts created with OpenAI’s GPT-4.5 system.
You can write your own or generate them with a language model.
If you want to serve your own language model,
see [this example](vllm_inference.html) in the Modal docs.
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/07_web_endpoints/discord_bot.py)
Serve a Discord Bot on Modal
============================
In this example we will demonstrate how to use Modal to build and serve a Discord bot that uses [slash commands](https://discord.com/developers/docs/interactions/application-commands).
Slash commands send information from Discord server members to a service at a URL.
Here, we set up a simple [FastAPI app](https://fastapi.tiangolo.com/) to run that service and deploy it easily with Modal’s [`@asgi_app`](../guide/webhooks.html#serving-asgi-and-wsgi-apps) decorator.
As our example service, we hit a simple free API:
the [Free Public APIs API](https://www.freepublicapis.com/api),
a directory of free public APIs.
[Try it out on Discord](https://discord.gg/PmG7P47EPQ)!
Set up our App and its Image
----------------------------
First, we define the [container image](../guide/images.html) that all the pieces of our bot will run in.
We set that as the default image for a Modal [App](../guide/apps.html).
The App is where we’ll attach all the components of our bot.
```
import json
from enum import Enum
import modal
image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "aiohttp~=3.10",  # needed for the async HTTP calls in the Functions below
    "fastapi[standard]==0.115.4",
    "pynacl~=1.5.0",
    "requests~=2.32.3",
)
app = modal.App("example-discord-bot", image=image)
```
Hit the Free Public APIs API
----------------------------
We start by defining the core service that our bot will provide.
In a real application, this might be [music generation](musicgen.html),
a [chatbot](chat_with_pdf_vision.html),
or [interacting with a database](covid_datasette.html).
Here, we just hit a simple free public API:
the [Free Public APIs](https://www.freepublicapis.com) API,
an “API of APIs” that returns information about free public APIs,
like the [Global Shark Attack API](https://www.freepublicapis.com/global-shark-attack-api) and the [Corporate Bullshit Generator](https://www.freepublicapis.com/corporate-bullshit-generator).
We convert the response into a Markdown-formatted message.
We turn our Python function into a Modal Function by attaching the `app.function` decorator.
We make the function `async` and add `@modal.concurrent()` with a large `max_inputs` value, because
communicating with an external API is a classic case for better performance from asynchronous execution.
Modal handles things like the async event loop for us.
```
@app.function()
@modal.concurrent(max_inputs=1000)
async def fetch_api() -> str:
import aiohttp
url = "https://www.freepublicapis.com/api/random"
async with aiohttp.ClientSession() as session:
try:
async with session.get(url) as response:
response.raise_for_status()
data = await response.json()
message = (
f"# {data.get('emoji') or '🤖'} [{data['title']}]({data['source']})"
)
message += f"\n _{''.join(data['description'].splitlines())}_"
except Exception as e:
message = f"# 🤖: Oops! {e}"
return message
```
This core component has nothing to do with Discord,
and it’s nice to be able to interact with and test it in isolation.
For that, we add a `local_entrypoint` that calls the Modal Function.
Notice that we add `.remote` to the function’s name.
Later, when you replace this component of the app with something more interesting,
test it by triggering this entrypoint with `modal run discord_bot.py`.
```
@app.local_entrypoint()
def test_fetch_api():
result = fetch_api.remote()
if result.startswith("# 🤖: Oops! "):
raise Exception(result)
else:
print(result)
```
Integrate our Modal Function with Discord Interactions
------------------------------------------------------
Now we need to map this function onto Discord’s interface —
in particular the [Interactions API](https://discord.com/developers/docs/interactions/overview).
Reviewing the documentation, we see that we need to send a JSON payload
to a specific API URL that will include an `app_id` that identifies our bot
and a `token` that identifies the interaction (loosely, message) that we’re participating in.
So let’s write that out. This function doesn’t need to live on Modal,
since it’s just encapsulating some logic — we don’t want to turn it into a service or an API on its own.
That means we don’t need any Modal decorators.
```
async def send_to_discord(payload: dict, app_id: str, interaction_token: str):
import aiohttp
interaction_url = f"https://discord.com/api/v10/webhooks/{app_id}/{interaction_token}/messages/@original"
async with aiohttp.ClientSession() as session:
async with session.patch(interaction_url, json=payload) as resp:
print("🤖 Discord response: " + await resp.text())
```
Other parts of our application might want to both hit the Free Public APIs API and send the result to Discord,
so we write a Python function for this and promote it to a Modal Function with a decorator.
Notice that we use the `.local` suffix to call our `fetch_api` Function. That means we run
the Function the same way we run any other Python function, rather than treating it as a special
Modal Function. This avoids a bit of extra latency, but couples these two Functions more tightly.
```
@app.function()
@modal.concurrent(max_inputs=1000)
async def reply(app_id: str, interaction_token: str):
message = await fetch_api.local()
await send_to_discord({"content": message}, app_id, interaction_token)
```
Set up a Discord app
--------------------
Now, we need to actually connect to Discord.
We start by creating an application on the Discord Developer Portal.
1. Go to the [Discord Developer Portal](https://discord.com/developers/applications) and
log in with your Discord account.
2. On the portal, go to **Applications** and create a new application by
clicking **New Application** in the top right next to your profile picture.
3. [Create a custom Modal Secret](../guide/secrets.html) for your Discord bot.
On Modal’s Secret creation page, select ‘Discord’. Copy your Discord application’s **Public Key** and **Application ID** (from the **General Information** tab in the Discord Developer Portal)
and paste them as the value of `DISCORD_PUBLIC_KEY` and `DISCORD_CLIENT_ID`.
Additionally, head to the **Bot** tab and use the **Reset Token** button to create a new bot token.
Paste this in the value of an additional key in the Secret, `DISCORD_BOT_TOKEN`.
Name this Secret `discord-secret`.
We access that Secret in code like so:
```
discord_secret = modal.Secret.from_name(
"discord-secret",
required_keys=[ # included so we get nice error messages if we forgot a key
"DISCORD_BOT_TOKEN",
"DISCORD_CLIENT_ID",
"DISCORD_PUBLIC_KEY",
],
)
```
Register a Slash Command
------------------------
Next, we’re going to register a [Slash Command](https://discord.com/developers/docs/interactions/application-commands#slash-commands) for our Discord app. Slash Commands are triggered by users in servers typing `/` and the name of the command.
The Modal Function below will register a Slash Command for your bot named `bored`.
More information about Slash Commands can be found in the Discord docs [here](https://discord.com/developers/docs/interactions/application-commands).
You can run this Function with
```
modal run discord_bot::create_slash_command
```
```
@app.function(secrets=[discord_secret], image=image)
def create_slash_command(force: bool = False):
"""Registers the slash command with Discord. Pass the force flag to re-register."""
import os
import requests
BOT_TOKEN = os.getenv("DISCORD_BOT_TOKEN")
CLIENT_ID = os.getenv("DISCORD_CLIENT_ID")
headers = {
"Content-Type": "application/json",
"Authorization": f"Bot {BOT_TOKEN}",
}
url = f"https://discord.com/api/v10/applications/{CLIENT_ID}/commands"
command_description = {
"name": "api",
"description": "Information about a random free, public API",
}
# first, check if the command already exists
response = requests.get(url, headers=headers)
try:
response.raise_for_status()
except Exception as e:
raise Exception("Failed to create slash command") from e
commands = response.json()
command_exists = any(
command.get("name") == command_description["name"] for command in commands
)
# and only recreate it if the force flag is set
if command_exists and not force:
print(f"🤖: command {command_description['name']} exists")
return
response = requests.post(url, headers=headers, json=command_description)
try:
response.raise_for_status()
except Exception as e:
raise Exception("Failed to create slash command") from e
print(f"🤖: command {command_description['name']} created")
```
Host a Discord Interactions endpoint on Modal
---------------------------------------------
If you look carefully at the definition of the Slash Command above,
you’ll notice that it doesn’t know anything about our bot besides an ID.
To hook the Slash Commands in the Discord UI up to our logic for hitting the Free Public APIs API,
we need to set up a service that listens at some URL and follows a specific protocol,
described [here](https://discord.com/developers/docs/interactions/overview#configuring-an-interactions-endpoint-url).
Here are some of the most important facets:
1. We’ll need to respond within five seconds or Discord will assume we are dead.
Modal’s fast-booting serverless containers usually start faster than that,
but it’s not guaranteed. So we’ll add the `min_containers` parameter to our
Function so that there’s at least one live copy ready to respond quickly at any time.
Modal charges a minimum of about 2¢ an hour for live containers (pricing details [here](../../pricing.html)).
Note that this still fits within Modal’s $30/month of credits on the free tier.
2. We have to respond to Discord that quickly, but we don’t have to respond to the user that quickly.
We instead send an acknowledgement so that they know we’re alive and they can close their connection to us.
We also trigger our `reply` Modal Function, which will respond to the user via Discord’s Interactions API,
but we don’t wait for the result, we just `spawn` the call.
3. The protocol includes some authentication logic that is mandatory
and checked by Discord. We’ll explain in more detail in the next section.
We can set up our interaction endpoint by deploying a FastAPI app on Modal.
This is as easy as creating a Python Function that returns a FastAPI app
and adding the `modal.asgi_app` decorator.
For more details on serving Python web apps on Modal, see [this guide](../guide/webhooks.html).
```
@app.function(secrets=[discord_secret], min_containers=1)
@modal.concurrent(max_inputs=1000)
@modal.asgi_app()
def web_app():
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
web_app = FastAPI()
# must allow requests from other domains, e.g. from Discord's servers
web_app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@web_app.post("/api")
async def get_api(request: Request):
body = await request.body()
# confirm this is a request from Discord
authenticate(request.headers, body)
print("🤖: parsing request")
data = json.loads(body.decode())
if data.get("type") == DiscordInteractionType.PING.value:
print("🤖: acking PING from Discord during auth check")
return {"type": DiscordResponseType.PONG.value}
if data.get("type") == DiscordInteractionType.APPLICATION_COMMAND.value:
print("🤖: handling slash command")
app_id = data["application_id"]
interaction_token = data["token"]
# kick off request asynchronously, will respond when ready
reply.spawn(app_id, interaction_token)
# respond immediately with defer message
return {
"type": DiscordResponseType.DEFERRED_CHANNEL_MESSAGE_WITH_SOURCE.value
}
print(f"🤖: unable to parse request with type {data.get('type')}")
raise HTTPException(status_code=400, detail="Bad request")
return web_app
```
The authentication for Discord is a bit involved and there aren’t,
to our knowledge, any good Python libraries for it.
So we have to implement the protocol “by hand”.
Essentially, Discord sends a header in their request
that we can use to verify the request comes from them.
For that, we use the `DISCORD_PUBLIC_KEY` from
our Application Information page.
The details aren’t super important, but they appear in the `authenticate` function below
(which defers the real cryptography work to [PyNaCl](https://pypi.org/project/PyNaCl/),
a Python wrapper for [`libsodium`](https://github.com/jedisct1/libsodium)).
Discord will also check that we reject unauthorized requests,
so we have to be sure to get this right!
```
def authenticate(headers, body):
import os
from fastapi.exceptions import HTTPException
from nacl.exceptions import BadSignatureError
from nacl.signing import VerifyKey
print("🤖: authenticating request")
# verify the request is from Discord using their public key
public_key = os.getenv("DISCORD_PUBLIC_KEY")
verify_key = VerifyKey(bytes.fromhex(public_key))
signature = headers.get("X-Signature-Ed25519")
timestamp = headers.get("X-Signature-Timestamp")
message = timestamp.encode() + body
try:
verify_key.verify(message, bytes.fromhex(signature))
except BadSignatureError:
# either an unauthorized request or Discord's "negative control" check
raise HTTPException(status_code=401, detail="Invalid request")
```
The code above used a few enums to abstract bits of the Discord protocol.
Now that we’ve walked through all of it,
we’re in a position to understand what those are
and so the code for them appears below.
```
class DiscordInteractionType(Enum):
PING = 1 # hello from Discord during auth check
APPLICATION_COMMAND = 2 # an actual command
class DiscordResponseType(Enum):
PONG = 1 # hello back during auth check
DEFERRED_CHANNEL_MESSAGE_WITH_SOURCE = 5 # we'll send a message later
```
Deploy on Modal
---------------
You can deploy this app on Modal by running the following commands:
```
modal run discord_bot.py # checks the API wrapper, little test
modal run discord_bot.py::create_slash_command # creates the slash command, if missing
modal deploy discord_bot.py # deploys the web app and the API wrapper
```
Copy the Modal URL that is printed in the output and go back to the **General Information** section on the [Discord Developer Portal](https://discord.com/developers/applications).
Paste the URL, making sure to append the path of your `POST` route (here, `/api`), in the **Interactions Endpoint URL** field, then click **Save Changes**. If your
endpoint URL is incorrect or if authentication is incorrectly implemented,
Discord will refuse to save the URL. Once it saves, you can start
handling interactions!
Finish setting up Discord bot
-----------------------------
To start using the Slash Command you just set up, you need to invite the bot to
a Discord server. To do so, go to your application’s **Installation** section on the [Discord Developer Portal](https://discord.com/developers/applications).
Copy the **Discord Provided Link** and visit it to invite the bot to your server.
Now you can open your Discord server and type `/api` in a channel to trigger the bot.
You can see a working version [in our test Discord server](https://discord.gg/PmG7P47EPQ).
DoppelBot: Fine-tune an LLM to replace your CEO
===============================================
*(quick links: [add to your own Slack](https://github.com/modal-labs/doppel-bot#usage); [source code](https://github.com/modal-labs/doppel-bot))*
Internally at Modal, we spend a *lot* of time talking to each other on Slack.
Now, with the advent of open-source large language models, we had started to
wonder if all of this wasn’t a bit redundant. Could we have these language
models bike-shed on Slack for us, so we could spend our time on higher leverage
activities such as [paddleboarding in Tahiti](https://twitter.com/modal_labs/status/1642262543757352960) instead?
To test this, we fine-tuned [Llama 3.1](https://ai.meta.com/blog/meta-llama-3-1/) on [Erik](https://twitter.com/bernhardsson)’s Slack messages, and `@erik-bot` was
born.
![erik-bot](../../_app/immutable/assets/erik-bot-1.CjDvIhCc.jpeg)
Since then, `@erik-bot` has been an invaluable asset to us, in areas ranging from [API design](../../_app/immutable/assets/erik-bot-2.CDmWvLM4.png) to [legal advice](../../_app/immutable/assets/erik-bot-3.C_m8x6a2.png) to thought leadership.
![erik-bot-3](../../_app/immutable/assets/erik-bot-4.CEbrQZVg.png)
We were planning on releasing the weights for `@erik-bot` to the world, but all
our metrics have been going up and to the right a little too much since we’ve
launched him…
So, we are releasing the next best thing. `DoppelBot` is a Slack bot that you
can install in your own workspace, and fine-tune on your own Slack messages.
Follow the instructions [here](https://github.com/modal-labs/doppel-bot#usage) to replace your own CEO with an LLM today.
All the components (scraping, fine-tuning, inference, and Slack event handlers) run
on Modal, and the code itself is open-source and available [here](https://github.com/modal-labs/doppel-bot). If you’re new to Modal, it’s
worth reiterating that **all of these components are also serverless and scale
to zero**. This means that you can deploy and forget about them, because you’ll
only pay for compute when your app is used!
How it works
------------
DoppelBot uses the Slack SDK to scrape messages from a Slack workspace, and
converts them into prompt/response pairs. It uses these to fine-tune a language
model using [Low-Rank Adaptation (LoRA)](https://arxiv.org/abs/2106.09685), a
technique that produces a small adapter that can be merged with the base model
when needed, instead of modifying all the parameters in the base model. The
fine-tuned adapters for each user are stored in a Modal [Volume](../guide/volumes.html). When a user `@`s the bot,
Slack sends a webhook call to Modal, which loads the adapter for that user and
generates a response.
We go into detail on each of these steps below, and provide commands for
running each of them individually. To follow along, [clone the repo](https://github.com/modal-labs/doppel-bot) and [set up a Slack token](https://github.com/modal-labs/doppel-bot#create-a-slack-app) for yourself.
### Scraping slack
The scraper uses Modal’s [`.map()`](../guide/scale.html#scaling-out) to fetch
messages from all public channels in parallel. Each thread is split into
contiguous messages from the target user and contiguous messages from other
users. These will be fed into the model as prompts in the following format:
```
[system]: You are {user}, employee at a fast-growing startup. Below is an input conversation that takes place in the company's internal Slack. Write a response that appropriately continues the conversation.
[user]: <slack thread>
[assistant]: <target user's response>
```
Initial versions of the model were prone to generating short responses
— unsurprising, because a majority of Slack communication is pretty terse.
Adding a minimum character length for the target user’s messages fixed this.
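The splitting and filtering logic isn’t shown here, but the idea is straightforward. Below is a minimal sketch (not the repo’s actual code; the message fields and minimum length are assumptions): it groups a thread into contiguous runs by whether the target user wrote them, then keeps only (context, response) pairs where the response is long enough.
```
from itertools import groupby

MIN_CHARS = 40  # hypothetical minimum length for the target user's side

def to_pairs(thread: list[dict], target_user: str) -> list[tuple[str, str]]:
    """Turn a Slack thread into (context, response) pairs for fine-tuning."""
    runs = [
        (is_target, "\n".join(m["text"] for m in msgs))
        for is_target, msgs in groupby(thread, key=lambda m: m["user"] == target_user)
    ]
    pairs = []
    for (prev_is_target, context), (is_target, response) in zip(runs, runs[1:]):
        if not prev_is_target and is_target and len(response) >= MIN_CHARS:
            pairs.append((context, response))
    return pairs

# toy example: messages as {"user": ..., "text": ...} dicts
thread = [
    {"user": "alice", "text": "Should we rewrite the scheduler?"},
    {"user": "erik", "text": "Only if we can keep the API stable and ship it incrementally..."},
]
print(to_pairs(thread, target_user="erik"))
```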
If you’re following along at home, you can run the scraper with the following
command:
```
modal run -m src.scrape::scrape --user="<user>"
```
Scraped results are stored in a Modal [Volume](../guide/volumes.html), so they can be used by the next step.
### Fine-tuning
Next, we use the prompts to fine-tune a language model. We chose [Llama 3.1](https://ai.meta.com/blog/meta-llama-3-1/) because of its permissive license and high quality relative to its small size. Fine-tuning is
done using [Low-Rank Adaptation (LoRA)](https://arxiv.org/abs/2106.09685), a [parameter-efficient fine-tuning](https://huggingface.co/blog/peft) technique
that produces a small adapter that can be merged with the base model when needed
(~60MB for the rank we’re using).
Our fine-tuning implementation uses [torchtune](https://github.com/pytorch/torchtune), a new PyTorch library for easily configuring fine-tuning runs.
Because of the typically small sample sizes we’re working with, training for
longer than a couple hundred steps (with our batch size of 128) quickly led to
overfitting. Admittedly, we haven’t thoroughly evaluated the hyperparameter
space yet — do reach out to us if you’re interested in collaborating on this!
![train-loss](../../_app/immutable/assets/train-loss.DFD7oOI8.png)
To try this step yourself, run:
```
modal run -m src.finetune --user="<user>"
```
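To make the “small adapter” idea concrete: LoRA only trains low-rank matrices injected into selected layers, so the trainable parameter count (and the saved adapter) stays tiny. The repo drives this through torchtune configs, but the same idea expressed with Hugging Face’s `peft` library looks roughly like this (a sketch with hypothetical hyperparameters, not the repo’s settings):
```
# Illustration of the LoRA setup using peft (the repo itself uses torchtune).
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

lora_config = LoraConfig(
    r=16,  # rank of the adapter matrices (hypothetical choice)
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # which projections get adapters
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # only a small fraction of weights are trainable
```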
### Inference
We use [vLLM](https://github.com/vllm-project/vllm) as our inference engine, which now comes with support for dynamically swapping LoRA adapters [out of the box](https://docs.vllm.ai/en/latest/features/lora.html).
With parametrized functions, every user model gets its own pool of containers
that scales up when there are incoming requests, and scales to 0 when there’s
none. Here’s what that looks like stripped down to the essentials:
```
@app.cls(gpu="L40S")
class Model():
@modal.enter()
def enter(self):
self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(...))
self.loras: dict[str, int] = dict() # per replica LoRA identifier
@method()
def generate(self, input: str):
if (ident := f"{user}-{team_id}") not in self.loras:
self.loras[ident] = len(self.loras) + 1
lora_request = LoRARequest(
ident, self.loras[ident], lora_local_path=checkpoint_path
)
tokenizer = await self.engine.get_tokenizer(lora_request=lora_request)
prompt = tokenizer.apply_chat_template(
conversation=inpt, tokenize=False, add_generation_prompt=True
)
results_generator = self.engine.generate(prompt, lora_request=lora_request,)
```
If you’ve fine-tuned a model already in the previous step, you can run inference
using it now:
```
modal run -m src.inference --user="<user>"
```
(We have a list of sample inputs in the file, but you can also try it out with
your own messages!)
### Slack Bot
Finally, it all comes together in [`bot.py`](https://github.com/modal-labs/doppel-bot/blob/main/src/bot.py). As
you might have guessed, all events from Slack are handled by serverless Modal
functions. We handle 3 types of events:
* [`url_verification`](https://github.com/modal-labs/doppel-bot/blob/24609583c43c0e722f56f85a1c00bb55b46c7754/src/bot.py#L112):
To verify that this is a Slack app, Slack expects us to return a challenge
string.
* [`app_mention`](https://github.com/modal-labs/doppel-bot/blob/main/src/bot.py#L118):
When the bot is mentioned in a channel, we retrieve the recent messages from
that thread, do some basic cleaning and call the user’s model to generate a
response.
```
model = OpenLlamaModel.remote(user, team_id)
result = model.generate(messages)
```
* [`doppel` slash command](https://github.com/modal-labs/doppel-bot/blob/main/src/bot.py#L182):
This command kicks off the scraping -> finetuning pipeline for the user.
To deploy the slackbot in its entirety, you need to run:
```
modal deploy -m src.bot
```
### Multi-Workspace Support
Everything we’ve talked about so far is for a single-workspace Slack app. To
make it work with multiple workspaces, we’ll need to handle [workspace installation and authentication with OAuth](https://api.slack.com/authentication/oauth-v2),
and also store some state for each workspace.
Luckily, Slack’s [Bolt](https://slack.dev/bolt-python/concepts) framework
provides a complete (but frugally documented) OAuth implementation. A neat feature
is that the OAuth state can be backed by a file system, so all we need to do is [point Bolt](https://github.com/modal-labs/doppel-bot/blob/24609583c43c0e722f56f85a1c00bb55b46c7754/src/bot.py#L78) at a Modal [Volume](../guide/volumes.html), and then we don’t need to worry about
managing this state ourselves.
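Concretely, “pointing Bolt at a Volume” just means giving its installation and state stores a base directory that lives on the mounted Volume, since both stores are plain directories of JSON files. A minimal sketch (the path, scopes, and environment variable names are placeholders, not the repo’s actual configuration):
```
import os

from slack_bolt import App
from slack_bolt.oauth.oauth_settings import OAuthSettings
from slack_sdk.oauth.installation_store import FileInstallationStore
from slack_sdk.oauth.state_store import FileOAuthStateStore

VOLUME_DIR = "/data/slack-oauth"  # hypothetical mount point of a modal.Volume

bolt_app = App(
    signing_secret=os.environ["SLACK_SIGNING_SECRET"],
    oauth_settings=OAuthSettings(
        client_id=os.environ["SLACK_CLIENT_ID"],
        client_secret=os.environ["SLACK_CLIENT_SECRET"],
        scopes=["app_mentions:read", "chat:write", "commands"],
        # both stores are directories of JSON files, so a Volume works fine
        installation_store=FileInstallationStore(base_dir=f"{VOLUME_DIR}/installations"),
        state_store=FileOAuthStateStore(expiration_seconds=600, base_dir=f"{VOLUME_DIR}/states"),
    ),
)
```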
To store state for each workspace, we’re using [Neon](https://neon.tech/), a
serverless Postgres database that’s really easy to set up and *just works*. If
you’re interested in developing a multi-workspace app, [follow our instructions](https://github.com/modal-labs/doppel-bot#optional-multi-workspace-app) on how to set up Neon with Modal.
Next Steps
----------
If you’ve made it this far, you have just found a way to increase your team’s
productivity by 10x! Congratulations on the well-earned vacation! 🎉
If you’re interested in learning more about Modal, check out our [docs](../../docs.html) and other [examples](../../examples.html).
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/cloud_bucket_mount_loras.py)
LoRAs Galore: Create a LoRA Playground with Modal, Gradio, and S3
=================================================================
This example shows how to mount an S3 bucket in a Modal app using [`CloudBucketMount`](../reference/modal.CloudBucketMount.html).
We will download a bunch of LoRA adapters from the [HuggingFace Hub](https://huggingface.co/models) into our S3 bucket
then read from that bucket, on the fly, when doing inference.
By default, we use the [IKEA instructions LoRA](https://huggingface.co/ostris/ikea-instructions-lora-sdxl) as an example,
which produces the following image when prompted to generate “IKEA instructions for building a GPU rig for deep learning”:
![IKEA instructions for building a GPU rig for deep learning](../../_app/immutable/assets/ikea-instructions-for-building-a-gpu-rig-for-deep-learning.DcGj0diD.png)
By the end of this example, we’ve deployed a “playground” app where anyone with a browser can try
out these custom models. That’s the power of Modal: custom, autoscaling AI applications, deployed in seconds.
You can try out our deployment [here](https://modal-labs-examples--loras-galore-ui.modal.run).
Basic setup
-----------
```
import io
import os
from pathlib import Path
from typing import Optional
import modal
```
You will need to have an S3 bucket and AWS credentials to run this example. Refer to the documentation
for the detailed [IAM permissions](../guide/cloud-bucket-mounts.html#iam-permissions) those credentials will need.
After you are done creating a bucket and configuring IAM settings,
you now need to create a [Modal Secret](../guide/secrets.html). Navigate to the “Secrets” tab and
click on the AWS card, then fill in the fields with the AWS key and secret created
previously. Name the Secret `s3-bucket-secret`.
```
bucket_secret = modal.Secret.from_name(
"s3-bucket-secret",
required_keys=["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
)
MOUNT_PATH: Path = Path("/mnt/bucket")
LORAS_PATH: Path = MOUNT_PATH / "loras/v5"
BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
CACHE_DIR = "/hf-cache"
```
Modal runs serverless functions inside containers.
The environments those functions run in are defined by
the container `Image`. The line below constructs an image
with the dependencies we need — no need to install them locally.
```
image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install(
"huggingface_hub==0.21.4",
"transformers==4.38.2",
"diffusers==0.26.3",
"peft==0.9.0",
"accelerate==0.27.2",
)
.env({"HF_HUB_CACHE": CACHE_DIR})
)
with image.imports():
# we import these dependencies only inside the container
import diffusers
import huggingface_hub
import torch
```
We attach the S3 bucket to all the Modal functions in this app by mounting it on the filesystem they see,
passing a `CloudBucketMount` to the `volumes` dictionary argument. We can read and write to this mounted bucket
(almost) as if it were a local directory.
```
app = modal.App(
"loras-galore",
image=image,
volumes={
MOUNT_PATH: modal.CloudBucketMount(
"modal-s3mount-test-bucket",
secret=bucket_secret,
)
},
)
```
For the base model, we’ll use a modal.Volume to store the Hugging Face cache.
```
cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
@app.function(image=image, volumes={CACHE_DIR: cache_volume})
def download_model():
loc = huggingface_hub.snapshot_download(repo_id=BASE_MODEL)
print(f"Saved model to {loc}")
```
Acquiring LoRA weights
----------------------
`search_loras()` will use the Hub API to search for LoRAs. We limit LoRAs
to a maximum size to avoid downloading very large model weights;
the default below is 1 GiB, but feel free to adapt it to what works best for you.
```
@app.function(secrets=[bucket_secret])
def search_loras(limit: int, max_model_size: int = 1024 * 1024 * 1024):
api = huggingface_hub.HfApi()
model_ids: list[str] = []
for model in api.list_models(
tags=["lora", f"base_model:{BASE_MODEL}"],
library="diffusers",
sort="downloads", # sort by most downloaded
):
try:
model_size = 0
for file in api.list_files_info(model.id):
model_size += file.size
except huggingface_hub.utils.GatedRepoError:
print(f"gated model ({model.id}); skipping")
continue
# Skip models that are larger than file limit.
if model_size > max_model_size:
print(f"model {model.id} is too large; skipping")
continue
model_ids.append(model.id)
if len(model_ids) >= limit:
return model_ids
return model_ids
```
We want to take the LoRA weights we found and move them from Hugging Face onto S3,
where they’ll be accessible, at short latency and high throughput, for our Modal functions.
Downloading files into this mounted path automatically uploads them to S3.
To speed things up, we will run this function in parallel using Modal’s [`map`](../reference/modal.Function.html#map).
```
@app.function()
def download_lora(repository_id: str) -> Optional[str]:
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# CloudBucketMounts will report 0 bytes of available space leading to many
# unnecessary warnings, so we patch the method that emits those warnings.
from huggingface_hub import file_download
file_download._check_disk_space = lambda x, y: False
repository_path = LORAS_PATH / repository_id
try:
# skip models we've already downloaded
if not repository_path.exists():
huggingface_hub.snapshot_download(
repository_id,
local_dir=repository_path.as_posix().replace(".", "_"),
allow_patterns=["*.safetensors"],
)
downloaded_lora = len(list(repository_path.rglob("*.safetensors"))) > 0
except OSError:
downloaded_lora = False
except FileNotFoundError:
downloaded_lora = False
if downloaded_lora:
return repository_id
else:
return None
```
Inference with LoRAs
--------------------
We define a `StableDiffusionLoRA` class to organize our inference code.
We load Stable Diffusion XL 1.0 as a base model, then, when doing inference,
we load whichever LoRA the user specifies from the S3 bucket.
For more on the decorators we use on the methods below to speed up building and booting,
check out the [container lifecycle hooks guide](https://modal.com/docs/guide/lifecycle-hooks).
```
@app.cls(
gpu="a10g", # A10G GPUs are great for inference
volumes={CACHE_DIR: cache_volume}, # We cache the base model
)
class StableDiffusionLoRA:
@modal.enter() # when a new container starts, we load the base model into the GPU
def load(self):
self.pipe = diffusers.DiffusionPipeline.from_pretrained(
BASE_MODEL, torch_dtype=torch.float16
).to("cuda")
@modal.method() # at inference time, we pull in the LoRA weights and pass the final model the prompt
def run_inference_with_lora(
self, lora_id: str, prompt: str, seed: int = 8888
) -> bytes:
for file in (LORAS_PATH / lora_id).rglob("*.safetensors"):
self.pipe.load_lora_weights(lora_id, weight_name=file.name)
break
lora_scale = 0.9
image = self.pipe(
prompt,
num_inference_steps=10,
cross_attention_kwargs={"scale": lora_scale},
generator=torch.manual_seed(seed),
).images[0]
buffer = io.BytesIO()
image.save(buffer, format="PNG")
return buffer.getvalue()
```
Try it locally!
---------------
To use our inference code from our local command line, we add a `local_entrypoint` to our `app`.
Run it using `modal run cloud_bucket_mount_loras.py`, and pass `--help` to see the available options.
The inference code will run on our machines, but the results will be available on yours.
```
@app.local_entrypoint()
def main(
limit: int = 100,
example_lora: str = "ostris/ikea-instructions-lora-sdxl",
prompt: str = "IKEA instructions for building a GPU rig for deep learning",
seed: int = 8888,
):
# Download LoRAs in parallel.
lora_model_ids = [example_lora]
lora_model_ids += search_loras.remote(limit)
downloaded_loras = []
for model in download_lora.map(lora_model_ids):
if model:
downloaded_loras.append(model)
print(f"downloaded {len(downloaded_loras)} loras => {downloaded_loras}")
# Run inference using one of the downloaded LoRAs.
byte_stream = StableDiffusionLoRA().run_inference_with_lora.remote(
example_lora, prompt, seed
)
dir = Path("/tmp/stable-diffusion-xl")
if not dir.exists():
dir.mkdir(exist_ok=True, parents=True)
output_path = dir / f"{as_slug(prompt.lower())}.png"
print(f"Saving it to {output_path}")
with open(output_path, "wb") as f:
f.write(byte_stream)
```
LoRA Exploradora: A hosted Gradio interface
-------------------------------------------
Command line tools are cool, but we can do better!
With the Gradio library by Hugging Face, we can create a simple web interface
around our Python inference function, then use Modal to host it for anyone to try out.
To set up your own, run `modal deploy cloud_bucket_mount_loras.py` and navigate to the URL it prints out.
If you’re playing with the code, use `modal serve` instead to see changes live.
```
web_image = modal.Image.debian_slim(python_version="3.12").pip_install(
"fastapi[standard]==0.115.4",
"gradio~=5.7.1",
"pillow~=10.2.0",
)
@app.function(
image=web_image,
min_containers=1,
scaledown_window=60 * 20,
# gradio requires sticky sessions
# so we limit the number of concurrent containers to 1
# and allow it to scale to 100 concurrent inputs
max_containers=1,
)
@modal.concurrent(max_inputs=100)
@modal.asgi_app()
def ui():
"""A simple Gradio interface around our LoRA inference."""
import io
import gradio as gr
from fastapi import FastAPI
from gradio.routes import mount_gradio_app
from PIL import Image
# determine which loras are available
lora_ids = [
f"{lora_dir.parent.stem}/{lora_dir.stem}" for lora_dir in LORAS_PATH.glob("*/*")
]
# pick one to be default, set a default prompt
default_lora_id = (
"ostris/ikea-instructions-lora-sdxl"
if "ostris/ikea-instructions-lora-sdxl" in lora_ids
else lora_ids[0]
)
default_prompt = (
"IKEA instructions for building a GPU rig for deep learning"
if default_lora_id == "ostris/ikea-instructions-lora-sdxl"
else "text"
)
# the simple path to making an app on Gradio is an Interface: a UI wrapped around a function.
def go(lora_id: str, prompt: str, seed: int) -> Image:
return Image.open(
io.BytesIO(
StableDiffusionLoRA().run_inference_with_lora.remote(
lora_id, prompt, seed
)
),
)
iface = gr.Interface(
go,
inputs=[ # the inputs to go/our inference function
gr.Dropdown(choices=lora_ids, value=default_lora_id, label="👉 LoRA ID"),
gr.Textbox(default_prompt, label="🎨 Prompt"),
gr.Number(value=8888, label="🎲 Random Seed"),
],
outputs=gr.Image(label="Generated Image"),
# some extra bits to make it look nicer
title="LoRAs Galore",
description="# Try out some of the top custom SDXL models!"
"\n\nPick a LoRA finetune of SDXL from the dropdown, then prompt it to generate an image."
"\n\nCheck out [the code on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/cloud_bucket_mount_loras.py)"
" if you want to create your own version or just see how it works."
"\n\nPowered by [Modal](https://modal.com) 🚀",
theme="soft",
allow_flagging="never",
)
return mount_gradio_app(app=FastAPI(), blocks=iface, path="/")
def as_slug(name):
"""Converts a string, e.g. a prompt, into something we can use as a filename."""
import re
s = str(name).strip().replace(" ", "-")
s = re.sub(r"(?u)[^-\w.]", "", s)
return s
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/obj_detection_webcam/webcam.py)
Real-time object detection via webcam
=====================================
This example creates a web endpoint that uses a Hugging Face model for object detection.
The web page captures an image from the user’s webcam and sends it to a Modal web endpoint.
The Modal web endpoint in turn calls a Modal function that runs the actual model.
If you run this, it will look something like this:
![webcam](../../_app/immutable/assets/webcam.BpPs9Hiu.png)
Live demo
---------
[Take a look at the deployed app](https://modal-labs-examples--example-webcam-object-detection.modal.run/).
A couple of caveats:
* This is not optimized for latency: every prediction takes about 1s, and
there’s an additional overhead on the first prediction since the containers
have to be started and the model initialized.
* This doesn’t work on iPhone, unfortunately, due to some issues with HTML5
webcam components.
Code
----
Starting with imports:
```
import base64
import io
from pathlib import Path
import modal
```
We need to install [transformers](https://github.com/huggingface/transformers), the package Hugging Face uses for its models, as well as [Pillow](https://python-pillow.org/), which lets us work with images in Python,
and a system font for drawing.
This example uses the `facebook/detr-resnet-50` pre-trained model,
which we’ll cache to a Volume for fast cold starts.
```
MODEL_REPO_ID = "facebook/detr-resnet-50"
MODEL_DIR = "/cache"
app = modal.App("example-webcam-object-detection")
image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install(
"huggingface-hub==0.27.1",
"Pillow",
"timm",
"transformers",
)
.apt_install("fonts-freefont-ttf")
.env({"HF_HUB_CACHE": MODEL_DIR})
)
```
Prediction function
-------------------
The object detection function has a few different features worth mentioning:
* There’s a container initialization step in the method decorated with `@enter()`,
which runs on every container start. This lets us load the model only once per
container, so that it’s reused for subsequent function calls.
* We’re running it on multiple CPUs for extra performance
Note that the function takes an image and returns a new image.
The input image comes from the webcam.
The output image contains the bounding boxes and labels, with an alpha channel
that leaves most of the image transparent so that the
web interface can render it on top of the webcam view.
```
with image.imports():
import torch
from huggingface_hub import snapshot_download
from PIL import Image, ImageColor, ImageDraw, ImageFont
from transformers import DetrForObjectDetection, DetrImageProcessor
```
We’ll store the model weights in a Volume and provide a function that you can `modal run` against to download the model weights prior to deploying the App.
Otherwise, the model weights will be downloaded for the first inference
and cached to the Volume when the first container exits.
```
cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
@app.function(image=image, volumes={MODEL_DIR: cache_volume})
def download_model():
loc = snapshot_download(repo_id=MODEL_REPO_ID)
print(f"Saved model to {loc}")
@app.cls(image=image, volumes={MODEL_DIR: cache_volume})
class ObjectDetection:
@modal.enter()
def load_model(self):
self.feature_extractor = DetrImageProcessor.from_pretrained(
MODEL_REPO_ID,
)
self.model = DetrForObjectDetection.from_pretrained(
MODEL_REPO_ID,
)
@modal.method()
def detect(self, img_data_in):
# Based on https://huggingface.co/spaces/nateraw/detr-object-detection/blob/main/app.py
# Read png from input
image = Image.open(io.BytesIO(img_data_in)).convert("RGB")
# Make prediction
inputs = self.feature_extractor(image, return_tensors="pt")
outputs = self.model(**inputs)
img_size = torch.tensor([tuple(reversed(image.size))])
processed_outputs = self.feature_extractor.post_process_object_detection(
outputs=outputs,
target_sizes=img_size,
threshold=0,
)
output_dict = processed_outputs[0]
# Grab boxes
keep = output_dict["scores"] > 0.7
boxes = output_dict["boxes"][keep].tolist()
scores = output_dict["scores"][keep].tolist()
labels = output_dict["labels"][keep].tolist()
# Plot bounding boxes
colors = list(ImageColor.colormap.values())
font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 18)
output_image = Image.new("RGBA", (image.width, image.height))
output_image_draw = ImageDraw.Draw(output_image)
for _score, box, label in zip(scores, boxes, labels):
color = colors[label % len(colors)]
text = self.model.config.id2label[label]
box = tuple(map(int, box))
output_image_draw.rectangle(box, outline=color)
output_image_draw.text(box[:2], text, font=font, fill=color, width=3)
# Return PNG as bytes
with io.BytesIO() as output_buf:
output_image.save(output_buf, format="PNG")
return output_buf.getvalue()
```
Defining the web interface
--------------------------
To keep things clean, we define the web endpoints separately from the prediction
function. This introduces a tiny bit of extra latency (every web request
triggers a Modal function call, which in turn calls another Modal function), but in
practice the overhead is much smaller than the cost of running the prediction
function itself.
We also serve a static HTML page that contains a small amount of JavaScript to
capture the webcam feed and send it to Modal.
```
static_path = Path(__file__).with_name("webcam").resolve()
@app.function(
image=modal.Image.debian_slim(python_version="3.12")
.pip_install("fastapi[standard]==0.115.4")
.add_local_dir(static_path, remote_path="/assets")
)
@modal.asgi_app(label="example-webcam-object-detection")
def fastapi_app():
from fastapi import FastAPI, Request, Response
from fastapi.staticfiles import StaticFiles
web_app = FastAPI()
# The endpoint for the prediction function takes an image as a
# [data URI](https://en.wikipedia.org/wiki/Data_URI_scheme)
# and returns another image, also as a data URI:
@web_app.post("/predict")
async def predict(request: Request):
# Takes a webcam image as a datauri, returns a bounding box image as a datauri
body = await request.body()
img_data_in = base64.b64decode(body.split(b",")[1]) # read data-uri
img_data_out = ObjectDetection().detect.remote(img_data_in)
output_data = b"data:image/png;base64," + base64.b64encode(img_data_out)
return Response(content=output_data)
web_app.mount("/", StaticFiles(directory="/assets", html=True))
return web_app
```
Running this locally
--------------------
You can run this as an ephemeral app, by running
```
modal serve webcam.py
```
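Once the app is being served (or deployed), you don’t have to go through the browser page: the `/predict` route accepts and returns plain data URIs, so you can exercise it from Python as well. A rough sketch, with a placeholder URL standing in for your own deployment:
```
# Sketch: call the /predict endpoint directly. The URL is a placeholder;
# use the one printed by `modal serve` or `modal deploy`.
import base64
from pathlib import Path

import requests

ENDPOINT = "https://your-workspace--example-webcam-object-detection.modal.run/predict"

img_bytes = Path("test.png").read_bytes()
data_uri = b"data:image/png;base64," + base64.b64encode(img_bytes)

resp = requests.post(ENDPOINT, data=data_uri, timeout=60)
resp.raise_for_status()

# the response is also a data URI; decode it back into PNG bytes
out_b64 = resp.content.split(b",", 1)[1]
Path("detections.png").write_bytes(base64.b64decode(out_b64))
```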
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/algolia_indexer.py)
Algolia docsearch crawler
=========================
This tutorial shows you how to use Modal to run the [Algolia docsearch
crawler](https://docsearch.algolia.com/docs/legacy/run-your-own/) to index your
website and make it searchable. This is not just example code - we run the same
code in production to power search on this page (`Ctrl+K` to try it out!).
Basic setup
-----------
Let’s get the imports out of the way.
```
import json
import os
import subprocess
import modal
```
Modal lets you [use and extend existing Docker images](https://modal.com/docs/guide/custom-container#use-an-existing-container-image-with-from_registry),
as long as they have `python` and `pip` available. We’ll use the official crawler image built by Algolia, with a small
adjustment: since this image has `python` symlinked to `python3.6` and Modal is not compatible with Python 3.6, we
install Python 3.11 and symlink that as the `python` executable instead.
```
algolia_image = modal.Image.from_registry(
"algolia/docsearch-scraper:v1.16.0",
add_python="3.11",
setup_dockerfile_commands=["ENTRYPOINT []"],
)
app = modal.App("example-algolia-indexer")
```
Configure the crawler
---------------------
Now, let’s configure the crawler with the website we want to index, and which
CSS selectors we want to scrape. Complete documentation for crawler configuration is available [here](https://docsearch.algolia.com/docs/legacy/config-file).
```
CONFIG = {
"index_name": "modal_docs",
"custom_settings": {
"separatorsToIndex": "._",
"synonyms": [["cls", "class"]],
},
"stop_urls": [
"https://modal.com/docs/reference/modal.Stub",
"https://modal.com/gpu-glossary",
"https://modal.com/docs/reference/changelog",
],
"start_urls": [
{
"url": "https://modal.com/docs/guide",
"selectors_key": "default",
"page_rank": 2,
},
{
"url": "https://modal.com/docs/examples",
"selectors_key": "examples",
"page_rank": 1,
},
{
"url": "https://modal.com/docs/reference",
"selectors_key": "reference",
"page_rank": 1,
},
],
"selectors": {
"default": {
"lvl0": {
"selector": "header .navlink-active",
"global": True,
},
"lvl1": "article h1",
"lvl2": "article h2",
"lvl3": "article h3",
"text": "article p,article ol,article ul",
},
"examples": {
"lvl0": {
"selector": "header .navlink-active",
"global": True,
},
"lvl1": "article h1",
"text": "article p,article ol,article ul",
},
"reference": {
"lvl0": {
"selector": "//div[contains(@class, 'sidebar')]//a[contains(@class, 'active')]//preceding::a[contains(@class, 'header')][1]",
"type": "xpath",
"global": True,
"default_value": "",
"skip": {"when": {"value": ""}},
},
"lvl1": "article h1",
"lvl2": "article h2",
"lvl3": "article h3",
"text": "article p,article ol,article ul",
},
},
}
```
Create an API key
-----------------
If you don’t already have one, sign up for an account on [Algolia](https://www.algolia.com/). Set up
a project and create an API key with `write` access to your index, and with the ACL permissions `addObject`, `editSettings`, and `deleteIndex`. Now, create a Secret on the Modal [Secrets](https://modal.com/secrets) page with the `API_KEY` and `APPLICATION_ID` you just created. You can name this Secret anything you want, but the code below expects it to be called `algolia-secret`.
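In your Modal code, you can then reference that Secret by name. Here is a minimal sketch; the optional `required_keys` argument just makes Modal fail fast if the expected keys are missing:
```
import modal

# assumes the Secret is named "algolia-secret" and holds the keys created above
algolia_secret = modal.Secret.from_name(
    "algolia-secret", required_keys=["API_KEY", "APPLICATION_ID"]
)
```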
The actual function
-------------------
We want to trigger our crawler from our CI/CD pipeline, so we’re serving it as a [web endpoint](../guide/webhooks.html) that can be triggered by a `GET` request during deploy.
You could also consider running the crawler on a [schedule](../guide/cron.html).
The Algolia crawler is written for Python 3.6 and needs to run in the `pipenv` created for it,
so we’re invoking it using a subprocess.
```
@app.function(
image=algolia_image,
secrets=[modal.Secret.from_name("algolia-secret")],
)
def crawl():
# Installed with a 3.6 venv; Python 3.6 is unsupported by Modal, so use a subprocess instead.
subprocess.run(
["pipenv", "run", "python", "-m", "src.index"],
env={**os.environ, "CONFIG": json.dumps(CONFIG)},
)
```
We want to be able to trigger this function through a webhook.
```
@app.function(image=modal.Image.debian_slim().pip_install("fastapi[standard]"))
@modal.fastapi_endpoint()
def crawl_webhook():
crawl.remote()
return "Finished indexing docs"
```
Deploy the indexer
------------------
That’s all the code we need! To deploy your application, run
```
modal deploy algolia_indexer.py
```
If successful, this will print a URL for your new webhook, which you can hit using `curl` or a browser. Logs from webhook invocations can be found on the [apps](https://modal.com/apps) page.
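For example, with a hypothetical deployment URL (yours is printed by `modal deploy` and will differ), triggering the crawl could look like:
```
curl https://your-workspace-name--example-algolia-indexer-crawl-webhook.modal.run
```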
The indexed contents can be found at <https://www.algolia.com/apps/APP_ID/explorer/browse/>, substituting your own APP_ID. Once you’re happy with the results, you can [set up the `docsearch` package with your
website](https://docsearch.algolia.com/docs/docsearch-v3/), and create a search component that uses this index.
Entrypoint for development
--------------------------
To make this easier to test, we also define an entrypoint for when you run `modal run algolia_indexer.py`:
```
@app.local_entrypoint()
def run():
crawl.remote()
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 10_integrations/algolia_indexer.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/04_secrets/db_to_sheet.py)
Write to Google Sheets from Postgres
====================================
In this tutorial, we’ll show how to use Modal to schedule a daily report in a spreadsheet on Google Sheets
that combines data from a PostgreSQL database with data from an external API.
In particular, we’ll extract the city of each user from the database, look up the current weather in that city,
and then build a count/histogram of how many users are experiencing each type of weather.
Entering credentials
--------------------
We begin by setting up some credentials that we’ll need in order to access our database and output
spreadsheet. To do that in a secure manner, we log in to our Modal account on the web and go to
the [Secrets](https://modal.com/secrets) section.
### Database
First we will enter our database credentials. The easiest way to do this is to click **New
secret** and select the **Postgres compatible** Secret preset and fill in the requested
information. Then we press **Next** and name our Secret `postgres-secret` and click **Create**.
### Google Sheets/GCP
We’ll now add another Secret for Google Sheets access through Google Cloud Platform. Click **New
secret** and select the Google Sheets preset.
In order to access the Google Sheets API, we’ll need to create a *Service Account* in Google Cloud
Platform. You can skip this step if you already have a Service Account json file.
1. Sign up for Google Cloud Platform, or log in if you already have an account (<https://cloud.google.com/>).
2. Go to <https://console.cloud.google.com/>.
3. In the navigation pane on the left, go to **IAM & Admin** > **Service Accounts**.
4. Click the **+ CREATE SERVICE ACCOUNT** button.
5. Give the service account a suitable name, like “sheet-access-bot”. Click **Done**. You don’t
have to grant it any specific access privileges at this time.
6. Click your new service account in the list view that appears and navigate to the **Keys** section.
7. Click **Add key** and choose **Create new key**. Use the **JSON** key type and confirm by
clicking **Create**.
8. A json key file should be downloaded to your computer at this point. Copy the contents of that
file and use it as the value for the `SERVICE_ACCOUNT_JSON` field in your new secret.
We’ll name this other Secret `"gsheets-secret"`.
Now you can access the values of your Secrets from Modal Functions that you annotate with the
corresponding `modal.Secret`s, e.g.:
```
import os
import modal
app = modal.App("example-db-to-sheet")
@app.function(secrets=[modal.Secret.from_name("postgres-secret")])
def show_host():
# automatically filled from the specified secret
print("Host is " + os.environ["PGHOST"])
```
Because these Secrets are Python objects, you can construct and manipulate them in your code.
We’ll do that below by defining a variable to hold our Secret for accessing Postgres.
You can additionally specify `required_keys` so that Modal verifies the Secret contains the environment variables the code expects:
```
pg_secret = modal.Secret.from_name(
"postgres-secret",
required_keys=["PGHOST", "PGPORT", "PGDATABASE", "PGUSER", "PGPASSWORD"],
)
```
In order to connect to the database, we’ll use the `psycopg2` Python package. To make it available
to your Modal Function you need to supply it with an `image` argument that tells Modal how to
build the container image that contains that package. We’ll base it off of the `Image.debian_slim` base
image that’s built into Modal, and make sure to install the required binary packages as well as
the `psycopg2` package itself:
```
pg_image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("libpq-dev")
.pip_install("psycopg2~=2.9.9")
)
```
Since the default key names for a **Postgres compatible** Secret correspond to the environment
variables that `psycopg2` looks for, we can now connect to the database without putting explicit
credentials in our code. We’ll create a simple function that queries the city for each
user in the `users` table.
```
@app.function(image=pg_image, secrets=[pg_secret])
def get_db_rows(verbose=True):
import psycopg2
conn = psycopg2.connect() # no explicit credentials needed
cur = conn.cursor()
cur.execute("SELECT city FROM users")
results = [row[0] for row in cur.fetchall()]
if verbose:
print(results)
return results
```
Note that we import `psycopg2` inside our function instead of the global scope. This allows us to
run this Modal Function even from an environment where `psycopg2` is not installed. We can test run
this function using the `modal run` shell command: `modal run db_to_sheet.py::app.get_db_rows`.
To run this function, make sure there is a table called `users` in your database with a column called `city`.
You can populate the table with some example data using the following SQL commands:
```
CREATE TABLE users (city TEXT);
INSERT INTO users VALUES ('Stockholm,,Sweden');
INSERT INTO users VALUES ('New York,NY,USA');
INSERT INTO users VALUES ('Tokyo,,Japan');
```
Applying Python logic
---------------------
For each row in our source data we’ll run an online lookup of the current weather using the <http://openweathermap.org> API. To do this, we’ll add the API key to
another Modal Secret. We’ll use a custom secret called “weather-secret” with the key `OPENWEATHER_API_KEY` containing our API key for OpenWeatherMap.
```
requests_image = modal.Image.debian_slim(python_version="3.11").pip_install(
"requests~=2.31.0"
)
@app.function(
image=requests_image,
secrets=[
modal.Secret.from_name("weather-secret", required_keys=["OPENWEATHER_API_KEY"])
],
)
def city_weather(city):
import requests
url = "https://api.openweathermap.org/data/2.5/weather"
params = {"q": city, "appid": os.environ["OPENWEATHER_API_KEY"]}
response = requests.get(url, params=params)
weather_label = response.json()["weather"][0]["main"]
return weather_label
```
We’ll make use of Modal’s built-in `function.map` method to create our report. `function.map` makes it easy to parallelize work by executing a Function on every element in a sequence of
data. For this example we’ll just do a simple count of rows per weather type,
answering the question “how many of our users are experiencing each type of weather?”
```
from collections import Counter
@app.function()
def create_report(cities):
# run city_weather for each city in parallel
user_weather = city_weather.map(cities)
count_users_by_weather = Counter(user_weather).items()
return count_users_by_weather
```
Let’s try to run this! To make it simple to trigger the function with some
predefined input data, we create a “local entrypoint” that can be
run from the command line with
```
modal run db_to_sheet.py
```
```
@app.local_entrypoint()
def main():
cities = [
"Stockholm,,Sweden",
"New York,NY,USA",
"Tokyo,,Japan",
]
print(create_report.remote(cities))
```
Running the local entrypoint using `modal run db_to_sheet.py` should print something like: `dict_items([('Clouds', 3)])`.
Note that since this file only has a single app, and the app has only one local entrypoint,
we only have to specify the file to run it; the function/entrypoint is inferred.
In this case the logic is quite simple, but in a real-world context you could apply a
machine learning model or any other tool you can build into a container to transform the data.
Sending output to a Google Sheet
--------------------------------
We’ll set up a new Google Sheet to send our report to. Using the “Sharing” dialog in Google
Sheets, share the document to the service account’s email address (the value of the `client_email` field in the json file)
and make the service account an editor of the document.
You may also need to enable the Google Sheets API for your project in the Google Cloud Platform console.
If so, the URL will be printed inside the message of a 403 Forbidden error when you run the function.
It begins with <https://console.developers.google.com/apis/api/sheets.googleapis.com/overview>.
Lastly, we need to point our code to the correct Google Sheet. We’ll need the *key* of the document.
You can find the key in the URL of the Google Sheet. It appears after the `/d/` in the URL, like: `https://docs.google.com/spreadsheets/d/1wOktal......IJR77jD8Do`.
We’ll make use of the `pygsheets` python package to authenticate with
Google Sheets and then update the spreadsheet with information from the report we just created:
```
pygsheets_image = modal.Image.debian_slim(python_version="3.11").pip_install(
"pygsheets~=2.0.6"
)
@app.function(
image=pygsheets_image,
secrets=[
modal.Secret.from_name("gsheets-secret", required_keys=["SERVICE_ACCOUNT_JSON"])
],
)
def update_sheet_report(rows):
import pygsheets
gc = pygsheets.authorize(service_account_env_var="SERVICE_ACCOUNT_JSON")
document_key = "1JxhGsht4wltyPFFOd2hP0eIv6lxZ5pVxJN_ZwNT-l3c"
sh = gc.open_by_key(document_key)
worksheet = sh.sheet1
worksheet.clear("A2")
worksheet.update_values("A2", [list(row) for row in rows])
```
At this point, we have everything we need in order to run the full program. We can put it all together in
another Modal Function, and add a [`schedule`](../guide/cron.html) argument so it runs every day automatically:
```
@app.function(schedule=modal.Period(days=1))
def db_to_sheet():
rows = get_db_rows.remote()
report = create_report.remote(rows)
update_sheet_report.remote(report)
print("Updated sheet with new weather distribution")
for weather, count in report:
print(f"{weather}: {count}")
```
This entire app can now be deployed using `modal deploy db_to_sheet.py`. The [apps page](https://modal.com/apps) shows our cron job’s execution history and lets you navigate to each invocation’s logs.
To trigger a manual run from your local machine during development, you can also run this function using the CLI: `modal run db_to_sheet.py::db_to_sheet`.
Note that all of the `@app.function()` annotated functions above run remotely in isolated containers that are specified per
function, but they are called as seamlessly as if we were using regular Python functions. This is a simple
showcase of how you can mix and match Modal Functions that use different environments and have them feed
into each other or even call each other as if they were all functions in the same local program.
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 04_secrets/db_to_sheet.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/llm-serving/llama_cpp.py)
Run large and small language models with llama.cpp (DeepSeek-R1, Phi-4)
=======================================================================
This example demonstrates how to run small (Phi-4) and large (DeepSeek-R1)
language models on Modal with [`llama.cpp`](https://github.com/ggerganov/llama.cpp).
By default, this example uses DeepSeek-R1 to produce a “Flappy Bird” game in Python —
see the video below. The code used in the video is [here](https://gist.github.com/charlesfrye/a3788c61019c32cb7947f4f5b1c04818),
along with the model’s raw outputs.
Note that getting the game to run required a small bugfix from a human —
our jobs are still safe, for now.
[Watch the demo video](https://modal-cdn.com/example-flap-py.mp4)
```
from pathlib import Path
from typing import Optional
import modal
```
What GPU can run DeepSeek-R1? What GPU can run Phi-4?
-----------------------------------------------------
Our large model is a real whale: [DeepSeek-R1](https://api-docs.deepseek.com/news/news250120),
which has 671B total parameters and so consumes over 100GB of storage,
even when [quantized down to one ternary digit (1.58 bits)](https://unsloth.ai/blog/deepseekr1-dynamic) per parameter.
To make sure we have enough room for it and its activations/KV cache,
we select four L40S GPUs, which together have 192 GB of memory (4 × 48 GB).
[Phi-4](https://huggingface.co/microsoft/phi-4),
on the other hand, is a svelte model with 14B total parameters,
or roughly 5 GB when quantized down to [two bits per parameter](https://huggingface.co/unsloth/phi-4-GGUF).
That’s small enough that it can be comfortably run on a CPU,
especially for a single-user setup like the one we’ll build here.
```
GPU_CONFIG = "L40S:4" # for DeepSeek-R1, literal `None` for phi-4
```
Calling a Modal Function from the command line
----------------------------------------------
To start, we define our `main` function —
the Python function that we’ll run locally to
trigger our inference to run on Modal’s cloud infrastructure.
This function, like the others that form our inference service
running on Modal, is part of a Modal [App](../guide/apps.html).
Specifically, it is a `local_entrypoint`.
Any Python code can call Modal Functions remotely,
but local entrypoints get a command-line interface for free.
```
app = modal.App("example-llama-cpp")
@app.local_entrypoint()
def main(
prompt: Optional[str] = None,
model: str = "DeepSeek-R1", # or "phi-4"
n_predict: int = -1, # max number of tokens to predict, -1 is infinite
args: Optional[str] = None, # string of arguments to pass to llama.cpp's cli
):
"""Run llama.cpp inference on Modal for phi-4 or deepseek r1."""
import shlex
org_name = "unsloth"
# two sample models: the diminutive phi-4 and the chonky deepseek r1
if model.lower() == "phi-4":
model_name = "phi-4-GGUF"
quant = "Q2_K"
model_entrypoint_file = f"phi-4-{quant}.gguf"
model_pattern = f"*{quant}*"
revision = None
parsed_args = DEFAULT_PHI_ARGS if args is None else shlex.split(args)
elif model.lower() == "deepseek-r1":
model_name = "DeepSeek-R1-GGUF"
quant = "UD-IQ1_S"
model_entrypoint_file = (
f"{model}-{quant}/DeepSeek-R1-{quant}-00001-of-00003.gguf"
)
model_pattern = f"*{quant}*"
revision = "02656f62d2aa9da4d3f0cdb34c341d30dd87c3b6"
parsed_args = DEFAULT_DEEPSEEK_R1_ARGS if args is None else shlex.split(args)
else:
raise ValueError(f"Unknown model {model}")
repo_id = f"{org_name}/{model_name}"
download_model.remote(repo_id, [model_pattern], revision)
# call out to a `.remote` Function on Modal for inference
result = llama_cpp_inference.remote(
model_entrypoint_file,
prompt,
n_predict,
parsed_args,
store_output=model.lower() == "deepseek-r1",
)
output_path = Path("/tmp") / f"llama-cpp-{model}.txt"
output_path.parent.mkdir(parents=True, exist_ok=True)
print(f"🦙 writing response to {output_path}")
output_path.write_text(result)
```
You can trigger inference from the command line with
```
modal run llama_cpp.py
```
To try out Phi-4 instead, use the `--model` argument:
```
modal run llama_cpp.py --model="phi-4"
```
Note that this will run for up to 30 minutes, which costs ~$5.
To allow it to proceed even if your local terminal fails,
add the `--detach` flag after `modal run`.
See below for details on getting the outputs.
You can pass prompts with the `--prompt` argument and set the maximum number of tokens
with the `--n-predict` argument.
Additional arguments for `llama-cli` are passed as a string like `--args="--foo 1 --bar"`.
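Putting those flags together, a hypothetical invocation (the prompt text here is just an example) might look like:
```
modal run llama_cpp.py --model="phi-4" --prompt="Write a haiku about GPUs" --n-predict=256
```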
For convenience, we set a number of sensible defaults for DeepSeek-R1,
following the suggestions by the team at unsloth,
who [quantized the model to 1.58 bit](https://unsloth.ai/blog/deepseekr1-dynamic).
```
DEFAULT_DEEPSEEK_R1_ARGS = [ # good default llama.cpp cli args for deepseek-r1
"--cache-type-k",
"q4_0",
"--threads",
"12",
"-no-cnv",
"--prio",
"2",
"--temp",
"0.6",
"--ctx-size",
"8192",
]
DEFAULT_PHI_ARGS = [ # good default llama.cpp cli args for phi-4
"--threads",
"16",
"-no-cnv",
"--ctx-size",
"16384",
]
```
Compiling llama.cpp with CUDA support
-------------------------------------
In order to run inference, we need the model’s weights
and we need code to run inference with those weights.
[`llama.cpp`](https://github.com/ggerganov/llama.cpp) is a no-frills C++ library for running large language models.
It supports highly-quantized versions of models ideal for running
single-user language modeling services on CPU or GPU.
We compile it, with CUDA support, and add it to a Modal [container image](../guide/images.html) using the code below.
For more details on using CUDA on Modal, including why
we need to use the `nvidia/cuda` registry image in this case
(hint: it’s for the [`nvcc` compiler](https://modal.com/gpu-glossary/host-software/nvcc)),
see the [Modal guide to using CUDA](../guide/cuda.html).
```
LLAMA_CPP_RELEASE = "b4568"
MINUTES = 60
cuda_version = "12.4.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
image = (
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
.apt_install("git", "build-essential", "cmake", "curl", "libcurl4-openssl-dev")
.run_commands("git clone https://github.com/ggerganov/llama.cpp")
.run_commands(
"cmake llama.cpp -B llama.cpp/build "
"-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON "
)
.run_commands( # this one takes a few minutes!
"cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli"
)
.run_commands("cp llama.cpp/build/bin/llama-* llama.cpp")
.entrypoint([]) # remove NVIDIA base container entrypoint
)
```
Storing models on Modal
-----------------------
To make the model weights available on Modal,
we download them from Hugging Face.
Modal is serverless, so disks are by default ephemeral.
To make sure our weights don’t disappear between runs,
which would trigger a long download, we store them in a
Modal [Volume](../guide/volumes.html).
For more on how to use Modal Volumes to store model weights,
see [this guide](../guide/model-weights.html).
```
model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)
cache_dir = "/root/.cache/llama.cpp"
download_image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install("huggingface_hub[hf_transfer]==0.26.2")
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
@app.function(
image=download_image, volumes={cache_dir: model_cache}, timeout=30 * MINUTES
)
def download_model(repo_id, allow_patterns, revision: Optional[str] = None):
from huggingface_hub import snapshot_download
print(f"🦙 downloading model from {repo_id} if not present")
snapshot_download(
repo_id=repo_id,
revision=revision,
local_dir=cache_dir,
allow_patterns=allow_patterns,
)
model_cache.commit() # ensure other Modal Functions can see our writes before we quit
print("🦙 model loaded")
```
Storing model outputs on Modal
------------------------------
Contemporary large reasoning models are slow —
for the sample “flappy bird” prompt we provide,
results are sometimes produced only after several (or even tens of) minutes.
That makes their outputs worth storing.
In addition to sending them back to clients,
like our local command line,
we’ll store the results on a Modal Volume for safe-keeping.
```
results = modal.Volume.from_name("llamacpp-results", create_if_missing=True)
results_dir = "/root/results"
```
You can retrieve the results later in a number of ways.
You can use the Volume CLI:
```
modal volume ls llamacpp-results
```
You can attach the Volume to a Modal `shell` to poke around in a familiar terminal environment:
```
modal shell --volume llamacpp-results
# then cd into /mnt
```
Or you can access it from any other Python environment
by using the same `modal.Volume` call as above to instantiate it:
```
results = modal.Volume.from_name("llamacpp-results")
print(dir(results)) # show methods
```
Running llama.cpp as a Modal Function
-------------------------------------
Now, let’s put it all together.
At the top of our `llama_cpp_inference` function,
we add an `app.function` decorator to attach all of our infrastructure:
* the `image` with the dependencies
* the `volumes` with the weights and where we can put outputs
* the `gpu` we want, if any
We also specify a `timeout` after which to cancel the run.
Inside the function, we call the `llama.cpp` CLI
with `subprocess.Popen`. This requires a bit of extra ceremony
because we want to both show the output as we run
and store the output to save and return to the local caller.
For details, see the [Addenda section](#addenda) below.
Alternatively, you might set up an OpenAI-compatible server
using base `llama.cpp` or its [Python wrapper library](https://github.com/abetlen/llama-cpp-python) along with one of [Modal’s decorators for web hosting](../guide/webhooks.html).
```
@app.function(
image=image,
volumes={cache_dir: model_cache, results_dir: results},
gpu=GPU_CONFIG,
timeout=30 * MINUTES,
)
def llama_cpp_inference(
model_entrypoint_file: str,
prompt: Optional[str] = None,
n_predict: int = -1,
args: Optional[list[str]] = None,
store_output: bool = True,
):
import subprocess
from uuid import uuid4
if prompt is None:
prompt = DEFAULT_PROMPT # see end of file
if "deepseek" in model_entrypoint_file.lower():
prompt = "<|User|>" + prompt + "<think>"
if args is None:
args = []
# set layers to "off-load to", aka run on, GPU
if GPU_CONFIG is not None:
n_gpu_layers = 9999 # all
else:
n_gpu_layers = 0
if store_output:
result_id = str(uuid4())
print(f"🦙 running inference with id:{result_id}")
command = [
"/llama.cpp/llama-cli",
"--model",
f"{cache_dir}/{model_entrypoint_file}",
"--n-gpu-layers",
str(n_gpu_layers),
"--prompt",
prompt,
"--n-predict",
str(n_predict),
] + args
print("🦙 running commmand:", command, sep="\n\t")
p = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=False
)
stdout, stderr = collect_output(p)
if p.returncode != 0:
raise subprocess.CalledProcessError(p.returncode, command, stdout, stderr)
if store_output: # save results to a Modal Volume if requested
print(f"🦙 saving results for {result_id}")
result_dir = Path(results_dir) / result_id
result_dir.mkdir(
parents=True,
)
(result_dir / "out.txt").write_text(stdout)
(result_dir / "err.txt").write_text(stderr)
return stdout
```
Addenda
=======
The remainder of this code is less interesting from the perspective
of running LLM inference on Modal but necessary for the code to run.
For example, it includes the default “Flappy Bird in Python” prompt included in [unsloth’s announcement](https://unsloth.ai/blog/deepseekr1-dynamic) of their 1.58 bit quantization of DeepSeek-R1.
```
DEFAULT_PROMPT = """Create a Flappy Bird game in Python. You must include these things:
You must use pygame.
The background color should be randomly chosen and is a light shade. Start with a light blue color.
Pressing SPACE multiple times will accelerate the bird.
The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.
Place on the bottom some land colored as dark brown or yellow chosen randomly.
Make a score shown on the top right side. Increment if you pass pipes and don't hit them.
Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.
When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.
The final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section."""
def stream_output(stream, queue, write_stream):
"""Reads lines from a stream and writes to a queue and a write stream."""
for line in iter(stream.readline, b""):
line = line.decode("utf-8", errors="replace")
write_stream.write(line)
write_stream.flush()
queue.put(line)
stream.close()
def collect_output(process):
"""Collect up the stdout and stderr of a process while still streaming it out."""
import sys
from queue import Queue
from threading import Thread
stdout_queue = Queue()
stderr_queue = Queue()
stdout_thread = Thread(
target=stream_output, args=(process.stdout, stdout_queue, sys.stdout)
)
stderr_thread = Thread(
target=stream_output, args=(process.stderr, stderr_queue, sys.stderr)
)
stdout_thread.start()
stderr_thread.start()
stdout_thread.join()
stderr_thread.join()
process.wait()
stdout_collected = "".join(stdout_queue.queue)
stderr_collected = "".join(stderr_queue.queue)
return stdout_collected, stderr_collected
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/llm-serving/llama_cpp.py --n-predict 1024
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/tailscale/modal_tailscale.py)
Add Modal Apps to Tailscale
===========================
This example demonstrates how to integrate Modal with Tailscale (<https://tailscale.com>).
It outlines the steps to configure Modal containers so that they join the Tailscale network.
We use a custom entrypoint to automatically add containers to a Tailscale network (tailnet).
This configuration enables the containers to interact with one another and with
additional applications within the same tailnet.
```
import modal
```
Install Tailscale and copy custom entrypoint script ([entrypoint.sh](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/tailscale/entrypoint.sh)). The script must be
executable.
```
image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("curl")
.run_commands("curl -fsSL https://tailscale.com/install.sh | sh")
.pip_install("requests==2.32.3", "PySocks==1.7.1")
.add_local_file("./entrypoint.sh", "/root/entrypoint.sh", copy=True)
.dockerfile_commands(
"RUN chmod a+x /root/entrypoint.sh",
'ENTRYPOINT ["/root/entrypoint.sh"]',
)
)
app = modal.App(image=image)
```
Configure Python to use the SOCKS5 proxy globally.
```
with image.imports():
import socket
import socks
socks.set_default_proxy(socks.SOCKS5, "0.0.0.0", 1080)
socket.socket = socks.socksocket
```
Run your function adding a Tailscale secret. We suggest creating a [reusable and ephemeral key](https://tailscale.com/kb/1111/ephemeral-nodes).
```
@app.function(
secrets=[
modal.Secret.from_name("tailscale-auth", required_keys=["TAILSCALE_AUTHKEY"]),
modal.Secret.from_dict(
{
"ALL_PROXY": "socks5://localhost:1080/",
"HTTP_PROXY": "http://localhost:1080/",
"http_proxy": "http://localhost:1080/",
}
),
],
)
def connect_to_machine():
import requests
# Connect to other machines in your tailnet.
resp = requests.get("http://my-tailscale-machine:5000")
print(resp.content)
```
Run this script with `modal run modal_tailscale.py`. You will see Tailscale logs
when the container starts, indicating that you logged in successfully and
that the proxies (SOCKS5 and HTTP) were created successfully. You will also
be able to see Modal containers in your Tailscale dashboard in the “Machines” tab.
Every new container launched will show up as a new “machine”. Containers are
individually addressable using their Tailscale name or IP address.
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 10_integrations/tailscale/modal_tailscale.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/llm-serving/vllm_inference.py)
Run OpenAI-compatible LLM inference with LLaMA 3.1-8B and vLLM
==============================================================
LLMs do more than just model language: they chat, they produce JSON and XML, they run code, and more.
This has complicated their interface far beyond “text-in, text-out”.
OpenAI’s API has emerged as a standard for that interface,
and it is supported by open source LLM serving frameworks like [vLLM](https://docs.vllm.ai/en/latest/).
In this example, we show how to run a vLLM server in OpenAI-compatible mode on Modal.
Our examples repository also includes scripts for running clients and load-testing for OpenAI-compatible APIs [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible).
You can find a video walkthrough of this example and the related scripts on the Modal YouTube channel [here](https://www.youtube.com/watch?v=QmY_7ePR1hM).
Set up the container image
--------------------------
Our first order of business is to define the environment our server will run in:
the [container `Image`](https://modal.com/docs/guide/custom-container).
vLLM can be installed with `pip`.
```
import modal
vllm_image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install(
"vllm==0.7.2",
"huggingface_hub[hf_transfer]==0.26.2",
"flashinfer-python==0.2.0.post2", # pinning, very unstable
extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # faster model transfers
)
```
In its 0.7 release, vLLM added a new version of its backend infrastructure,
the [V1 Engine](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Using this new engine can lead to some [impressive speedups](https://github.com/modal-labs/modal-examples/pull/1064),
but as of version 0.7.2 the new engine does not support all inference engine features
(including important performance optimizations like [speculative decoding](https://docs.vllm.ai/en/v0.7.2/features/spec_decode.html)).
The features we use in this demo are supported, so we turn the engine on by setting an environment variable
on the Modal Image.
```
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
```
Download the model weights
--------------------------
We’ll be running a pretrained foundation model — Meta’s LLaMA 3.1 8B
in the Instruct variant that’s trained to chat and follow instructions,
quantized to 4-bit by [Neural Magic](https://neuralmagic.com/) and uploaded to Hugging Face.
You can read more about the `w4a16` “Machete” weight layout and kernels [here](https://neuralmagic.com/blog/introducing-machete-a-mixed-input-gemm-kernel-optimized-for-nvidia-hopper-gpus/).
```
MODELS_DIR = "/llamas"
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
```
Although vLLM will download weights on-demand, we want to cache them if possible. We’ll use [Modal Volumes](../guide/volumes.html),
which act as a “shared disk” that all Modal Functions can access, for our cache.
```
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
```
Build a vLLM engine and serve it
--------------------------------
The function below spawns a vLLM instance listening at port 8000, serving requests to our model. vLLM will authenticate requests
using the API key we provide it.
We wrap it in the [`@modal.web_server` decorator](../guide/webhooks.html#non-asgi-web-servers) to connect it to the Internet.
```
app = modal.App("example-vllm-openai-compatible")
N_GPU = 1 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
API_KEY = "super-secret-key" # api key, for auth. for production use, replace with a modal.Secret
MINUTES = 60 # seconds
VLLM_PORT = 8000
@app.function(
image=vllm_image,
gpu=f"H100:{N_GPU}",
scaledown_window=15 * MINUTES, # how long should we stay up with no requests?
timeout=10 * MINUTES, # how long should we wait for container start?
volumes={
"/root/.cache/huggingface": hf_cache_vol,
"/root/.cache/vllm": vllm_cache_vol,
},
)
@modal.concurrent(
max_inputs=100
) # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
import subprocess
cmd = [
"vllm",
"serve",
"--uvicorn-log-level=info",
MODEL_NAME,
"--revision",
MODEL_REVISION,
"--host",
"0.0.0.0",
"--port",
str(VLLM_PORT),
"--api-key",
API_KEY,
]
subprocess.Popen(" ".join(cmd), shell=True)
```
Deploy the server
-----------------
To deploy the API on Modal, just run
```
modal deploy vllm_inference.py
```
This will create a new app on Modal, build the container image for it if it hasn’t been built yet,
and deploy the app.
Interact with the server
------------------------
Once it is deployed, you’ll see a URL appear in the command line,
something like `https://your-workspace-name--example-vllm-openai-compatible-serve.modal.run`.
You can find [interactive Swagger UI docs](https://swagger.io/tools/swagger-ui/) at the `/docs` route of that URL, i.e. `https://your-workspace-name--example-vllm-openai-compatible-serve.modal.run/docs`.
These docs describe each route and indicate the expected input and output
and translate requests into `curl` commands.
For simple routes like `/health`, which checks whether the server is responding,
you can even send a request directly from the docs.
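For example, a quick liveness check from your terminal could look like this (substitute the URL printed for your deployment):
```
curl https://your-workspace-name--example-vllm-openai-compatible-serve.modal.run/health
```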
To interact with the API programmatically in Python, we recommend the `openai` library.
See the `client.py` script in the examples repository [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible) to take it for a spin:
```
# pip install openai==1.76.0
python openai_compatible/client.py
```
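If you just want the gist inline, here is a minimal sketch of such a client; the base URL is a placeholder for your own deployment, and the key must match the `API_KEY` configured above:
```
from openai import OpenAI

client = OpenAI(
    api_key="super-secret-key",  # must match the server's API_KEY
    base_url="https://your-workspace-name--example-vllm-openai-compatible-serve.modal.run/v1",
)

response = client.chat.completions.create(
    model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
    messages=[{"role": "user", "content": "Testing! Is this thing on?"}],
)
print(response.choices[0].message.content)
```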
Testing the server
------------------
To make it easier to test the server setup, we also include a `local_entrypoint` that does a healthcheck and then hits the server.
If you execute the command
```
modal run vllm_inference.py
```
a fresh replica of the server will be spun up on Modal while
the code below executes on your local machine.
Think of this like writing simple tests inside of the `if __name__ == "__main__"` block of a Python script, but for cloud deployments!
```
@app.local_entrypoint()
def test(test_timeout=10 * MINUTES):
import json
import time
import urllib
print(f"Running health check for server at {serve.get_web_url()}")
up, start, delay = False, time.time(), 10
while not up:
try:
with urllib.request.urlopen(serve.get_web_url() + "/health") as response:
if response.getcode() == 200:
up = True
except Exception:
if time.time() - start > test_timeout:
break
time.sleep(delay)
assert up, f"Failed health check for server at {serve.get_web_url()}"
print(f"Successful health check for server at {serve.get_web_url()}")
messages = [{"role": "user", "content": "Testing! Is this thing on?"}]
print(f"Sending a sample message to {serve.get_web_url()}", *messages, sep="\n")
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json",
}
payload = json.dumps({"messages": messages, "model": MODEL_NAME})
req = urllib.request.Request(
serve.get_web_url() + "/v1/chat/completions",
data=payload.encode("utf-8"),
headers=headers,
method="POST",
)
with urllib.request.urlopen(req) as response:
print(json.loads(response.read().decode()))
```
We also include a basic example of a load-testing setup using `locust` in the `load_test.py` script [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible):
```
modal run openai_compatible/load_test.py
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/llm-serving/vllm_inference.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/07_web_endpoints/basic_web.py)
Hello world wide web!
=====================
Modal makes it easy to turn your Python functions into serverless web services:
access them via a browser or call them from any client that speaks HTTP, all
without having to worry about setting up servers or managing infrastructure.
This tutorial shows the path with the shortest [“time to 200”](https://shkspr.mobi/blog/2021/05/whats-your-apis-time-to-200/): [`modal.fastapi_endpoint`](../reference/modal.fastapi_endpoint.html).
On Modal, web endpoints have all the superpowers of Modal Functions:
they can be [accelerated with GPUs](../guide/gpu.html),
they can access [Secrets](../guide/secrets.html) or [Volumes](../guide/volumes.html),
and they [automatically scale](../guide/cold-start.html) to handle more traffic.
Under the hood, we use the [FastAPI library](https://fastapi.tiangolo.com/),
which has [high-quality documentation](https://fastapi.tiangolo.com/tutorial/),
linked throughout this tutorial.
Turn a Modal Function into an API endpoint with a single decorator
------------------------------------------------------------------
Modal Functions are already accessible remotely — when you add the `@app.function` decorator to a Python function
and run `modal deploy`, you make it possible for your [other Python functions to call it](../guide/trigger-deployed-functions.html).
That’s great, but it’s not much help if you want to share what you’ve written with someone running code in a different language —
or not running code at all!
And that’s where most of the power of the Internet comes from: sharing information and functionality across different computer systems.
So we provide the `fastapi_endpoint` decorator to wrap your Modal Functions in the lingua franca of the web: HTTP.
Here’s what that looks like:
```
import modal
image = modal.Image.debian_slim().pip_install("fastapi[standard]")
app = modal.App(name="example-lifecycle-web", image=image)
@app.function()
@modal.fastapi_endpoint(
docs=True # adds interactive documentation in the browser
)
def hello():
return "Hello world!"
```
You can turn this function into a web endpoint by running `modal serve basic_web.py`.
In the output, you should see a URL that ends with `hello-dev.modal.run`.
If you navigate to this URL, you should see the `"Hello world!"` message appear in your browser.
You can also find interactive documentation, powered by OpenAPI and Swagger,
if you add `/docs` to the end of the URL.
From this documentation, you can interact with your endpoint, sending HTTP requests and receiving HTTP responses.
For more details, see the [FastAPI documentation](https://fastapi.tiangolo.com/features/#automatic-docs).
By running the endpoint with `modal serve`, you created a temporary endpoint that will disappear if you interrupt your terminal.
These temporary endpoints are great for debugging — when you save a change to any of your dependent files, the endpoint will redeploy.
Try changing the message to something else, hitting save, and then hitting refresh in your browser or re-sending
the request from `/docs` or the command line. You should see the new message, along with logs in your terminal showing the redeploy and the request.
When you’re ready to deploy this endpoint permanently, run `modal deploy basic_web.py`.
Now, your function will be available even when you’ve closed your terminal or turned off your computer.
Send data to a web endpoint
---------------------------
The web endpoint above was a bit silly: it always returns the same message.
Most endpoints need an input to be useful. There are two ways to send data to a web endpoint:
* in the URL as a [query parameter](#sending-data-in-query-parameters)
* in the [body of the request](#sending-data-in-the-request-body) as JSON
### Sending data in query parameters
By default, your function’s arguments are treated as query parameters:
they are extracted from the end of the URL, where they should be added in the form `?arg1=foo&arg2=bar`.
From the Python side, there’s hardly anything to do:
```
@app.function()
@modal.fastapi_endpoint(docs=True)
def greet(user: str) -> str:
return f"Hello {user}!"
```
If you are already running `modal serve basic_web.py`, this endpoint will be available at a URL, printed in your terminal, that ends with `greet-dev.modal.run`.
We provide Python type-hints to get type information in the docs and [automatic validation](https://fastapi.tiangolo.com/tutorial/query-params-str-validations/).
For example, if you navigate directly to the URL for `greet`, you will get a detailed error message
indicating that the `user` parameter is missing. Navigate instead to `/docs` to see how to invoke the endpoint properly.
You can read more about query parameters in the [FastAPI documentation](https://fastapi.tiangolo.com/tutorial/query-params/).
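As a concrete example, once `greet` is being served you can pass the query parameter straight from the command line; the URL below is a placeholder for the one printed in your terminal:
```
curl "https://your-workspace-name--example-lifecycle-web-greet-dev.modal.run?user=Erika"
```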
### Sending data in the request body
For larger and more complex data, it is generally preferable to send data in the body of the HTTP request.
This body is formatted as [JSON](https://developer.mozilla.org/en-US/docs/Learn/JavaScript/Objects/JSON),
the most common data interchange format on the web.
To set up an endpoint that accepts JSON data, add an argument with a `dict` type-hint to your function.
This argument will be populated with the data sent in the request body.
```
@app.function()
@modal.fastapi_endpoint(method="POST", docs=True)
def goodbye(data: dict) -> str:
name = data.get("name") or "world"
return f"Goodbye {name}!"
```
Note that we gave a value of `"POST"` for the `method` argument here.
This argument defines the HTTP request method that the endpoint will respond to,
and it defaults to `"GET"`.
If you head to the URL for the `goodbye` endpoint in your browser,
you will get a 405 Method Not Allowed error, because browsers only send GET requests by default.
While this is technically a separate concern from query parameters versus request bodies
and you can define an endpoint that accepts GET requests and uses data from the body,
it is [considered bad form](https://stackoverflow.com/a/983458).
Navigate to `/docs` for more on how to invoke the endpoint properly.
You will need to send a POST request with a JSON body containing a `name` key.
To get the same typing and validation benefits as with query parameters,
use a [Pydantic model](https://fastapi.tiangolo.com/tutorial/body/) for this argument.
You can read more about request bodies in the [FastAPI documentation](https://fastapi.tiangolo.com/tutorial/body/).
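Here is a minimal sketch of what that could look like; the `Farewell` model and the `goodbye_typed` endpoint are hypothetical additions that reuse the `app` and `image` defined above:
```
from pydantic import BaseModel

class Farewell(BaseModel):
    name: str = "world"

@app.function()
@modal.fastapi_endpoint(method="POST", docs=True)
def goodbye_typed(data: Farewell) -> str:
    # FastAPI parses and validates the JSON body into a Farewell instance
    return f"Goodbye {data.name}!"
```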
Handle expensive startup with `modal.Cls`
-----------------------------------------
Sometimes your endpoint needs to do something before it can handle its first request,
like get a value from a database or set the value of a variable.
If that step is expensive, like [loading a large ML model](../guide/model-weights.html),
it’d be a shame to have to do it every time a request comes in!
Web endpoints can be methods on a [`modal.Cls`](../guide/lifecycle-functions.html#container-lifecycle-functions-and-parameters),
which allows you to manage the container’s lifecycle independently from processing individual requests.
This example will only set the `start_time` instance variable once, on container startup.
```
@app.cls()
class WebApp:
@modal.enter()
def startup(self):
from datetime import datetime, timezone
print("🏁 Starting up!")
self.start_time = datetime.now(timezone.utc)
@modal.fastapi_endpoint(docs=True)
def web(self):
from datetime import datetime, timezone
current_time = datetime.now(timezone.utc)
return {"start_time": self.start_time, "current_time": current_time}
```
Protect web endpoints with proxy authentication
-----------------------------------------------
Sharing your Python functions on the web is great, but it’s not always a good idea
to make those functions available to just anyone.
For example, you might have a function like the one below that
is more expensive to run than to call (and so might be abused by your enemies)
or reveals information that you would rather keep secret.
To protect your Modal web endpoints so that they can’t be triggered except
by members of your [Modal workspace](../guide/workspaces.html),
add the `requires_proxy_auth=True` flag to the `fastapi_endpoint` decorator.
```
@app.function(gpu="h100")
@modal.fastapi_endpoint(requires_proxy_auth=True, docs=False)
def expensive_secret():
return "I didn't care for 'The Godfather'. It insists upon itself."
```
The `expensive-secret` endpoint URL will still be printed to the output when you `modal serve` or `modal deploy`,
along with a “🔑” emoji indicating that it is secured with proxy authentication.
If you head to that URL via the browser, you will get a [`401 Unauthorized`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/401) error code in response.
You should also check the dashboard page for this app (at the URL printed at the very top of the `modal` command output)
so you can see that no containers were spun up to handle the request — this authorization is handled entirely inside Modal’s infrastructure.
You can trigger the web endpoint by [creating a Proxy Auth Token](https://modal.com/settings/proxy-auth-tokens) and then including the token ID and secret in the `Modal-Key` and `Modal-Secret` headers.
From the command line, that might look like
```
export TOKEN_ID=wk-1234abcd
export TOKEN_SECRET=ws-1234abcd
curl -H "Modal-Key: $TOKEN_ID" \
-H "Modal-Secret: $TOKEN_SECRET" \
https://your-workspace-name--expensive-secret.modal.run
```
For more details, see the [guide to proxy authentication](../guide/webhook-proxy-auth.html).
What next?
----------
Modal’s `fastapi_endpoint` decorator is opinionated and designed for relatively simple web applications —
one or a few independent Python functions that you want to expose to the web.
Three additional decorators allow you to serve more complex web applications with greater control:
* [`asgi_app`](../guide/webhooks.html#asgi) to serve applications compliant with the ASGI standard,
like [FastAPI](https://fastapi.tiangolo.com/) (see the sketch after this list)
* [`wsgi_app`](../guide/webhooks.html#wsgi) to serve applications compliant with the WSGI standard,
like [Flask](https://flask.palletsprojects.com/)
* [`web_server`](../guide/webhooks.html#non-asgi-web-servers) to serve any application that listens on a port
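For instance, a minimal `asgi_app` sketch, reusing the `app` and `image` defined above (the route is just illustrative), might look like:
```
@app.function(image=image)
@modal.asgi_app()
def fastapi_app():
    from fastapi import FastAPI

    web_app = FastAPI()

    @web_app.get("/")
    def root():
        return {"message": "Hello from a full ASGI app!"}

    return web_app
```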
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal serve 07_web_endpoints/basic_web.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/13_sandboxes/codelangchain/agent.py)
Build a coding agent with Modal Sandboxes and LangGraph
=======================================================
This example demonstrates how to build an LLM coding “agent” that can generate and execute Python code, using
documentation from the web to inform its approach.
Naturally, we use the agent to generate code that runs language models.
The agent is built with [LangGraph](https://github.com/langchain-ai/langgraph), a library for building
directed graphs of computation popular with AI agent developers,
and uses models from the OpenAI API.
Setup
-----
```
import modal
from .src import edges, nodes, retrieval
from .src.common import COLOR, PYTHON_VERSION, image
```
You will need two [Modal Secrets](../guide/secrets.html) to run this example:
one to access the OpenAI API and another to access the LangSmith API for logging the agent’s behavior.
To create them, head to the [Secrets dashboard](https://modal.com/secrets), select “Create new secret”,
and use the provided templates for OpenAI and LangSmith.
```
app = modal.App(
"example-code-langchain",
image=image,
secrets=[
modal.Secret.from_name("openai-secret", required_keys=["OPENAI_API_KEY"]),
modal.Secret.from_name("langsmith-secret", required_keys=["LANGCHAIN_API_KEY"]),
],
)
```
Creating a Sandbox
------------------
We execute the agent’s code in a Modal [Sandbox](../guide/sandbox.html), which allows us to
run arbitrary code in a safe environment. In this example, we will use the [`transformers`](https://huggingface.co/docs/transformers/index) library to generate text with a pre-trained model. Let’s create a Sandbox with the necessary dependencies.
```
def create_sandbox(app) -> modal.Sandbox:
# Change this image (and the retrieval logic in the retrieval module)
# if you want the agent to give coding advice on other libraries!
agent_image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install(
"torch==2.5.0",
"transformers==4.46.0",
)
return modal.Sandbox.create(
image=agent_image,
timeout=60 * 10, # 10 minutes
app=app,
# Modal sandboxes support GPUs!
gpu="T4",
# you can also pass secrets here -- note that the main app's secrets are not shared
)
```
We also need a way to run our code in the Sandbox. For this, we’ll write a simple wrapper
around the Modal Sandbox `exec` method. We use `exec` because it allows us to run code without spinning up a
new container, and we can reuse the same container for multiple runs, preserving state.
```
def run(code: str, sb: modal.Sandbox) -> tuple[str, str]:
print(
f"{COLOR['HEADER']}📦: Running in sandbox{COLOR['ENDC']}",
f"{COLOR['GREEN']}{code}{COLOR['ENDC']}",
sep="\n",
)
exc = sb.exec("python", "-c", code)
exc.wait()
stdout = exc.stdout.read()
stderr = exc.stderr.read()
if exc.returncode != 0:
print(
f"{COLOR['HEADER']}📦: Failed with exitcode {sb.returncode}{COLOR['ENDC']}"
)
return stdout, stderr
```
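These two helpers compose nicely. As a quick illustration (not part of the original example), here is a hedged sketch of a Modal Function that creates a Sandbox, runs a one-liner inside it, and tears it down, reusing the `create_sandbox` and `run` helpers and the `app` defined above:
```
@app.function()
def sandbox_smoke_test() -> str:
    # create a Sandbox, run a trivial snippet inside it, then clean up
    sb = create_sandbox(app)
    stdout, stderr = run("import torch; print(torch.__version__)", sb)
    sb.terminate()
    return stdout or stderr
```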
Constructing the agent’s graph
------------------------------
Now that we have the sandbox to execute code in, we can construct our agent’s graph. Our graph is
defined in the `edges` and `nodes` modules [associated with this example](https://github.com/modal-labs/modal-examples/tree/main/13_sandboxes/codelangchain).
Nodes are actions that change the state. Edges are transitions between nodes.
The idea is simple: we start at the node `generate`, which invokes the LLM to generate code based on the documentation.
The generated code is executed (in the sandbox) as part of an edge called `check_code_execution` and then the outputs are passed to the LLM for evaluation (the `evaluate_execution` node).
If the LLM determines that the code has executed correctly — which might mean that the code raised an exception! —
we pass along the `decide_to_finish` edge and finish.
```
def construct_graph(sandbox: modal.Sandbox, debug: bool = False):
from langgraph.graph import StateGraph
from .src.common import GraphState
# Crawl the transformers documentation to inform our code generation
context = retrieval.retrieve_docs(debug=debug)
graph = StateGraph(GraphState)
# Attach our nodes to the graph
graph_nodes = nodes.Nodes(context, sandbox, run, debug=debug)
for key, value in graph_nodes.node_map.items():
graph.add_node(key, value)
# Construct the graph by adding edges
graph = edges.enrich(graph)
# Set the starting and ending nodes of the graph
graph.set_entry_point(key="generate")
graph.set_finish_point(key="finish")
return graph
```
We now set up the graph and compile it. See the `src` module for details
on the content of the graph and the nodes we’ve defined.
```
DEFAULT_QUESTION = "How do I generate Python code using a pre-trained model from the transformers library?"
@app.function()
def go(
question: str = DEFAULT_QUESTION,
debug: bool = False,
):
"""Compiles the Python code generation agent graph and runs it, returning the result."""
sb = create_sandbox(app)
graph = construct_graph(sb, debug=debug)
runnable = graph.compile()
result = runnable.invoke(
{"keys": {"question": question, "iterations": 0}},
config={"recursion_limit": 50},
)
sb.terminate()
return result["keys"]["response"]
```
Running the Graph
-----------------
Now let’s call the agent from the command line!
We define a `local_entrypoint` that runs locally and triggers execution on Modal.
You can invoke it by executing the following command from a folder that contains the `codelangchain` directory [from our examples repo](https://github.com/modal-labs/modal-examples/tree/main/13_sandboxes/codelangchain):
```
modal run -m codelangchain.agent --question "How do I run a pre-trained model from the transformers library?"
```
```
@app.local_entrypoint()
def main(
question: str = DEFAULT_QUESTION,
debug: bool = False,
):
"""Sends a question to the Python code generation agent.
Switch to debug mode for shorter context and smaller model."""
if debug:
if question == DEFAULT_QUESTION:
question = "hi there, how are you?"
print(go.remote(question, debug=debug))
```
If things are working properly, you should see output like the following:
```
$ modal run -m codelangchain.agent --question "generate some cool output with transformers"
---DECISION: FINISH---
---FINISHING---
To generate some cool output using transformers, we can use a pre-trained language model from the Hugging Face Transformers library. In this example, we'll use the GPT-2 model to generate text based on a given prompt. The GPT-2 model is a popular choice for text generation tasks due to its ability to produce coherent and contextually relevant text. We'll use the pipeline API from the Transformers library, which simplifies the process of using pre-trained models for various tasks, including text generation.
from transformers import pipeline
# Initialize the text generation pipeline with the GPT-2 model
generator = pipeline('text-generation', model='gpt2')
# Define a prompt for the model to generate text from
prompt = "Once upon a time in a land far, far away"
# Generate text using the model
output = generator(prompt, max_length=50, num_return_sequences=1)
# Print the generated text
print(output[0]['generated_text'])
Result of code execution:
Once upon a time in a land far, far away, and still inhabited even after all the human race, there would be one God: a perfect universal God who has always been and will ever be worshipped. All His acts and deeds are immutable,
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/flan_t5/flan_t5_finetune.py)
Finetuning Flan-T5
==================
Example by [@anishpdalal](https://github.com/anishpdalal)
[Flan-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5) is a highly versatile model that’s been instruction-tuned to
perform well on a variety of text-based tasks such as question answering and summarization. There are smaller model variants available which makes
Flan-T5 a great base model to use for finetuning on a specific instruction dataset with just a single GPU. In this example, we’ll
finetune Flan-T5 on the [Extreme Sum (“XSum”)](https://huggingface.co/datasets/xsum) dataset to summarize news articles.
Defining dependencies
---------------------
The example uses the `datasets` package from Hugging Face to load the XSum dataset. It also uses the `transformers` and `accelerate` packages with a PyTorch backend to finetune and serve the model. Finally, we
install `tensorboard` and serve it via a web app. All packages are installed into a Debian Slim base image
using the `pip_install` function.
```
from pathlib import Path
import modal
VOL_MOUNT_PATH = Path("/vol")
```
Other Flan-T5 models can be found [here](https://huggingface.co/docs/transformers/model_doc/flan-t5)
```
BASE_MODEL = "google/flan-t5-base"
image = modal.Image.debian_slim(python_version="3.12").pip_install(
"accelerate",
"transformers",
"torch",
"datasets",
"tensorboard",
)
app = modal.App(name="example-news-summarizer", image=image)
output_vol = modal.Volume.from_name("finetune-volume", create_if_missing=True)
```
### Handling preemption
As this finetuning job is long-running it’s possible that it experiences a preemption.
The training code is robust to preemption events by periodically saving checkpoints and restoring
from checkpoint on restart. But it’s also helpful to observe in logs when a preemption restart has occurred,
so we track restarts with a `modal.Dict`.
See the [guide on preemptions](../guide/preemption.html#preemption) for more details on preemption handling.
```
restart_tracker_dict = modal.Dict.from_name(
"finetune-restart-tracker", create_if_missing=True
)
def track_restarts(restart_tracker: modal.Dict) -> int:
if not restart_tracker.contains("count"):
preemption_count = 0
print(f"Starting first time. {preemption_count=}")
restart_tracker["count"] = preemption_count
else:
preemption_count = restart_tracker.get("count") + 1
print(f"Restarting after pre-emption. {preemption_count=}")
restart_tracker["count"] = preemption_count
return preemption_count
```
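Because the tracker is a named `modal.Dict`, any other Function (or a quick script) can read it to see how many preemptions a training run has survived. A small sketch (not part of the original example):
```
@app.function()
def report_restarts():
    # the Dict is shared by name across the workspace
    tracker = modal.Dict.from_name("finetune-restart-tracker", create_if_missing=True)
    count = tracker.get("count") if tracker.contains("count") else 0
    print(f"preemption restarts recorded so far: {count}")
```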
Finetuning Flan-T5 on XSum dataset
----------------------------------
Each row in the dataset has a `document` (input news article) and `summary` column.
```
@app.function(
gpu="A10g",
timeout=7200,
volumes={VOL_MOUNT_PATH: output_vol},
)
def finetune(num_train_epochs: int = 1, size_percentage: int = 10):
from datasets import load_dataset
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
)
restarts = track_restarts(restart_tracker_dict)
# Use size percentage to retrieve subset of the dataset to iterate faster
if size_percentage:
xsum_train = load_dataset("xsum", split=f"train[:{size_percentage}%]")
xsum_test = load_dataset("xsum", split=f"test[:{size_percentage}%]")
# Load the whole dataset
else:
xsum = load_dataset("xsum")
xsum_train = xsum["train"]
xsum_test = xsum["test"]
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
# Replace all padding tokens with a large negative number so that the loss function ignores them in
# its calculation
padding_token_id = -100
batch_size = 8
def preprocess(batch):
# prepend summarize: prefix to document to convert the example to a summarization instruction
inputs = ["summarize: " + doc for doc in batch["document"]]
model_inputs = tokenizer(
inputs, max_length=512, truncation=True, padding="max_length"
)
labels = tokenizer(
text_target=batch["summary"],
max_length=128,
truncation=True,
padding="max_length",
)
labels["input_ids"] = [
[l if l != tokenizer.pad_token_id else padding_token_id for l in label]
for label in labels["input_ids"]
]
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_xsum_train = xsum_train.map(
preprocess, batched=True, remove_columns=["document", "summary", "id"]
)
tokenized_xsum_test = xsum_test.map(
preprocess, batched=True, remove_columns=["document", "summary", "id"]
)
data_collator = DataCollatorForSeq2Seq(
tokenizer,
model=model,
label_pad_token_id=padding_token_id,
pad_to_multiple_of=batch_size,
)
training_args = Seq2SeqTrainingArguments(
# Save checkpoints to the mounted volume
output_dir=str(VOL_MOUNT_PATH / "model"),
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
predict_with_generate=True,
learning_rate=3e-5,
num_train_epochs=num_train_epochs,
logging_strategy="steps",
logging_steps=100,
evaluation_strategy="steps",
save_strategy="steps",
save_steps=100,
save_total_limit=2,
load_best_model_at_end=True,
)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=tokenized_xsum_train,
eval_dataset=tokenized_xsum_test,
)
try:
resume = restarts > 0
if resume:
print("resuming from checkpoint")
trainer.train(resume_from_checkpoint=resume)
except KeyboardInterrupt: # handle possible preemption
print("received interrupt; saving state and model")
trainer.save_state()
trainer.save_model()
raise
# Save the trained model and tokenizer to the mounted volume
model.save_pretrained(str(VOL_MOUNT_PATH / "model"))
tokenizer.save_pretrained(str(VOL_MOUNT_PATH / "tokenizer"))
output_vol.commit()
print("✅ done")
```
Monitoring Finetuning with Tensorboard
--------------------------------------
Tensorboard is an application for visualizing training loss. In this example we
serve it as a Modal WSGI app.
```
@app.function(volumes={VOL_MOUNT_PATH: output_vol})
@modal.wsgi_app()
def monitor():
import tensorboard
board = tensorboard.program.TensorBoard()
board.configure(logdir=f"{VOL_MOUNT_PATH}/logs")
(data_provider, deprecated_multiplexer) = board._make_data_provider()
wsgi_app = tensorboard.backend.application.TensorBoardWSGIApp(
board.flags,
board.plugin_loaders,
data_provider,
board.assets_zip_provider,
deprecated_multiplexer,
)
return wsgi_app
```
Model Inference
---------------
```
@app.cls(volumes={VOL_MOUNT_PATH: output_vol})
class Summarizer:
@modal.enter()
def load_model(self):
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
        # Load the saved tokenizer and finetuned model from the training run
tokenizer = AutoTokenizer.from_pretrained(
BASE_MODEL, cache_dir=VOL_MOUNT_PATH / "tokenizer/"
)
model = AutoModelForSeq2SeqLM.from_pretrained(
BASE_MODEL, cache_dir=VOL_MOUNT_PATH / "model/"
)
self.summarizer = pipeline("summarization", tokenizer=tokenizer, model=model)
@modal.method()
def generate(self, input: str) -> str:
return self.summarizer(input)[0]["summary_text"]
@app.local_entrypoint()
def main():
input = """
The 14-time major champion, playing in his first full PGA Tour event for almost 18 months,
carded a level-par second round of 72, but missed the cut by four shots after his first-round 76.
World number one Jason Day and US Open champion Dustin Johnson also missed the cut at Torrey Pines in San Diego.
Overnight leader Rose carded a one-under 71 to put him on eight under. Canada's
Adam Hadwin and USA's Brandt Snedeker are tied in second on seven under, while US PGA champion
Jimmy Walker missed the cut as he finished on three over. Woods is playing in just his
second tournament since 15 months out with a back injury. "It's frustrating not being
able to have a chance to win the tournament," said the 41-year-old, who won his last major,
the US Open, at the same course in 2008. "Overall today was a lot better than yesterday.
I hit it better, I putted well again. I hit a lot of beautiful putts that didn't go in, but
I hit it much better today, which was nice." Scotland's Martin Laird and England's Paul Casey
are both on two under, while Ireland's Shane Lowry is on level par.
"""
model = Summarizer()
response = model.generate.remote(input)
print(response)
```
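If you deploy the app with `modal deploy flan_t5_finetune.py`, you can also call the summarizer from any other Python process by looking the class up by name. A minimal sketch (not part of the example), assuming a recent Modal client where `modal.Cls.from_name` is available and that the app has been deployed under the name used above:
```
import modal

# look up the deployed class by app name and class name
Summarizer = modal.Cls.from_name("example-news-summarizer", "Summarizer")
print(Summarizer().generate.remote("Paste a news article here to summarize it."))
```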
Run via the CLI
---------------
Trigger model finetuning using the following command:
```
modal run --detach flan_t5_finetune.py::finetune --num-train-epochs=1 --size-percentage=10
```
You can view the TensorBoard logs at `https://<username>--example-news-summarizer-monitor-dev.modal.run`.
Then, you can invoke inference via the `local_entrypoint` with this command:
```
modal run flan_t5_finetune.py
```
which prints a generated summary like:
```
World number one Tiger Woods missed the cut at the US Open as he failed to qualify for the final round of the event in Los Angeles.
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/blender/blender_video.py)
Render a video with Blender on many GPUs or CPUs in parallel
============================================================
This example shows how you can render an animated 3D scene using [Blender](https://www.blender.org/)’s Python interface.
You can run it on CPUs to scale out on one hundred containers
or run it on GPUs to get higher throughput per node.
Even for this simple scene, GPUs render >10x faster than CPUs.
The final render looks something like this:
[Watch the final render](https://modal-cdn.com/modal-blender-video.mp4)
Defining a Modal app
--------------------
```
from pathlib import Path
import modal
```
Modal runs your Python functions for you in the cloud.
You organize your code into apps, collections of functions that work together.
```
app = modal.App("examples-blender-video")
```
We need to define the environment each function runs in: its container image.
The block below defines a container image, starting from a basic Debian Linux image,
adding Blender’s system-level dependencies,
and then installing the `bpy` package, which is Blender’s Python API.
```
rendering_image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("xorg", "libxkbcommon0") # X11 (Unix GUI) dependencies
.pip_install("bpy==4.1.0") # Blender as a Python package
)
```
Rendering a single frame
------------------------
We define a function that renders a single frame. We’ll scale this function out on Modal later.
Functions in Modal are defined along with their hardware and their dependencies.
This function can be run with GPU acceleration or without it, and we’ll use a global flag in the code to switch between the two.
```
WITH_GPU = (
True # try changing this to False to run rendering massively in parallel on CPUs!
)
```
We decorate the function with `@app.function` to define it as a Modal function.
Note that in addition to defining the hardware requirements of the function,
we also specify the container image that the function runs in (the one we defined above).
The details of the scene aren’t too important for this example, but we’ll load
a .blend file that we created earlier. This scene contains a rotating
Modal logo made of a transmissive ice-like material, with a generated displacement map. The
animation keyframes were defined in Blender.
```
@app.function(
gpu="L40S" if WITH_GPU else None,
# default limits on Modal free tier
max_containers=10 if WITH_GPU else 100,
image=rendering_image,
)
def render(blend_file: bytes, frame_number: int = 0) -> bytes:
"""Renders the n-th frame of a Blender file as a PNG."""
import bpy
input_path = "/tmp/input.blend"
output_path = f"/tmp/output-{frame_number}.png"
# Blender requires input as a file.
Path(input_path).write_bytes(blend_file)
bpy.ops.wm.open_mainfile(filepath=input_path)
bpy.context.scene.frame_set(frame_number)
bpy.context.scene.render.filepath = output_path
configure_rendering(bpy.context, with_gpu=WITH_GPU)
bpy.ops.render.render(write_still=True)
# Blender renders image outputs to a file as well.
return Path(output_path).read_bytes()
```
### Rendering with acceleration
We can configure the rendering process to use GPU acceleration with NVIDIA CUDA.
We select the [Cycles rendering engine](https://www.cycles-renderer.org/), which is compatible with CUDA,
and then activate the GPU.
```
def configure_rendering(ctx, with_gpu: bool):
# configure the rendering process
ctx.scene.render.engine = "CYCLES"
ctx.scene.render.resolution_x = 3000
ctx.scene.render.resolution_y = 2000
ctx.scene.render.resolution_percentage = 50
ctx.scene.cycles.samples = 128
cycles = ctx.preferences.addons["cycles"]
# Use GPU acceleration if available.
if with_gpu:
cycles.preferences.compute_device_type = "CUDA"
ctx.scene.cycles.device = "GPU"
# reload the devices to update the configuration
cycles.preferences.get_devices()
for device in cycles.preferences.devices:
device.use = True
else:
ctx.scene.cycles.device = "CPU"
# report rendering devices -- a nice snippet for debugging and ensuring the accelerators are being used
for dev in cycles.preferences.devices:
print(f"ID:{dev['id']} Name:{dev['name']} Type:{dev['type']} Use:{dev['use']}")
```
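Before rendering the full animation, it can be handy to smoke-test the setup on a single frame. Here is a hedged sketch (not part of the original script) of an extra entrypoint that renders one frame remotely and saves it locally; it assumes the `IceModal.blend` file used by the main entrypoint below:
```
@app.local_entrypoint()
def render_one(frame_number: int = 1):
    blend_bytes = (Path(__file__).parent / "IceModal.blend").read_bytes()
    png_bytes = render.remote(blend_bytes, frame_number)
    out_path = Path("/tmp") / f"single_frame_{frame_number}.png"
    out_path.write_bytes(png_bytes)
    print(f"Frame saved to {out_path}")
```
You would select it with `modal run blender_video.py::render_one`.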
Combining frames into a video
-----------------------------
Rendering 3D images is fun, and GPUs can make it faster, but rendering 3D videos is better!
We add another function to our app, running on a different, simpler container image
and different hardware, to combine the frames into a video.
```
combination_image = modal.Image.debian_slim(python_version="3.11").apt_install("ffmpeg")
```
The function to combine the frames into a video takes a sequence of byte sequences, one for each rendered frame,
and converts them into a single sequence of bytes, the MP4 file.
```
@app.function(image=combination_image)
def combine(frames_bytes: list[bytes], fps: int = 60) -> bytes:
import subprocess
import tempfile
with tempfile.TemporaryDirectory() as tmpdir:
for i, frame_bytes in enumerate(frames_bytes):
frame_path = Path(tmpdir) / f"frame_{i:05}.png"
frame_path.write_bytes(frame_bytes)
out_path = Path(tmpdir) / "output.mp4"
subprocess.run(
f"ffmpeg -framerate {fps} -pattern_type glob -i '{tmpdir}/*.png' -c:v libx264 -pix_fmt yuv420p {out_path}",
shell=True,
)
return out_path.read_bytes()
```
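Since `combine` is just another Modal Function, you can also re-encode frames from an earlier run without re-rendering them. A small sketch (not part of the original script), assuming frames were previously saved to `/tmp/render` by the entrypoint below:
```
@app.local_entrypoint()
def recombine(frames_dir: str = "/tmp/render", fps: int = 60):
    frame_paths = sorted(
        Path(frames_dir).glob("frame_*.png"),
        key=lambda p: int(p.stem.split("_")[-1]),  # frame_1.png, frame_2.png, ...
    )
    frames = [p.read_bytes() for p in frame_paths]
    (Path(frames_dir) / "output.mp4").write_bytes(combine.remote(frames, fps=fps))
```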
Rendering in parallel in the cloud from the comfort of the command line
-----------------------------------------------------------------------
With these two functions defined, we need only a few more lines to run our rendering at scale on Modal.
First, we need a function that coordinates our functions to `render` frames and `combine` them.
We decorate that function with `@app.local_entrypoint` so that we can run it with `modal run blender_video.py`.
In that function, we use `render.map` to map the `render` function over the range of frames.
We give the `local_entrypoint` two parameters to control the render — the number of frames to render and how many frames to skip.
These demonstrate a basic pattern for controlling Functions on Modal from a local client.
We collect the bytes from each frame into a `list` locally and then send it to `combine` with `.remote`.
The bytes for the video come back to our local machine, and we write them to a file.
The whole rendering process (for four seconds of 1080p 60 FPS video) takes about three minutes to run on 10 L40S GPUs,
with a per-frame latency of about six seconds, and about five minutes to run on 100 CPUs, with a per-frame latency of about one minute.
```
@app.local_entrypoint()
def main(frame_count: int = 250, frame_skip: int = 1):
output_directory = Path("/tmp") / "render"
output_directory.mkdir(parents=True, exist_ok=True)
input_path = Path(__file__).parent / "IceModal.blend"
blend_bytes = input_path.read_bytes()
args = [(blend_bytes, frame) for frame in range(1, frame_count + 1, frame_skip)]
images = list(render.starmap(args))
for i, image in enumerate(images):
frame_path = output_directory / f"frame_{i + 1}.png"
frame_path.write_bytes(image)
print(f"Frame saved to {frame_path}")
video_path = output_directory / "output.mp4"
video_bytes = combine.remote(images)
video_path.write_bytes(video_bytes)
print(f"Video saved to {video_path}")
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/streamlit/serve_streamlit.py)
Run and share Streamlit apps
============================
This example shows you how to run a Streamlit app with `modal serve`, and then deploy it as a serverless web app.
![example streamlit app](../../_app/immutable/assets/streamlit.RHfhqFCX.png)
This example is structured as two files:
1. This module, which defines the Modal objects (name the script `serve_streamlit.py` locally).
2. `app.py`, which is any Streamlit script to be mounted into the Modal
function ([download script](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/streamlit/app.py)).
```
import shlex
import subprocess
from pathlib import Path
import modal
```
Define container dependencies
-----------------------------
The `app.py` script imports three third-party packages, so we include these in the example’s
image definition and then add the `app.py` file itself to the image.
```
streamlit_script_local_path = Path(__file__).parent / "app.py"
streamlit_script_remote_path = "/root/app.py"
image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install("streamlit~=1.35.0", "numpy~=1.26.4", "pandas~=2.2.2")
.add_local_file(
streamlit_script_local_path,
streamlit_script_remote_path,
)
)
app = modal.App(name="example-modal-streamlit", image=image)
if not streamlit_script_local_path.exists():
raise RuntimeError(
"app.py not found! Place the script with your streamlit app in the same directory."
)
```
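If your Streamlit app grows beyond a single `app.py`, you can ship a whole directory into the image instead of one file. A hedged sketch (not from the original example) that assumes a hypothetical `streamlit_app/` folder next to this script:
```
image = image.add_local_dir(
    Path(__file__).parent / "streamlit_app",  # hypothetical multi-file app directory
    remote_path="/root/streamlit_app",
)
```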
Spawning the Streamlit server
-----------------------------
Inside the container, we will run the Streamlit server in a background subprocess using `subprocess.Popen`. We also expose port 8000 using the `@modal.web_server` decorator.
```
@app.function()
@modal.concurrent(max_inputs=100)
@modal.web_server(8000)
def run():
target = shlex.quote(streamlit_script_remote_path)
cmd = f"streamlit run {target} --server.port 8000 --server.enableCORS=false --server.enableXsrfProtection=false"
subprocess.Popen(cmd, shell=True)
```
Iterate and Deploy
------------------
While you’re iterating on your Streamlit app, you can run it “ephemerally” with `modal serve`. This will
run a local process that watches your files and updates the app if anything changes.
```
modal serve serve_streamlit.py
```
Once you’re happy with your changes, you can deploy your application with
```
modal deploy serve_streamlit.py
```
If successful, this will print a URL for your app that you can navigate to from
your browser 🎉 .
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/embeddings/text_embeddings_inference.py)
Run TextEmbeddingsInference (TEI) on Modal
==========================================
This example runs the [Text Embedding Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) toolkit on the Hacker News BigQuery public dataset.
```
import json
import os
import socket
import subprocess
from pathlib import Path
import modal
GPU_CONFIG = "A10G"
MODEL_ID = "BAAI/bge-base-en-v1.5"
BATCH_SIZE = 32
DOCKER_IMAGE = (
"ghcr.io/huggingface/text-embeddings-inference:86-0.4.0" # Ampere 86 for A10s.
# "ghcr.io/huggingface/text-embeddings-inference:0.4.0" # Ampere 80 for A100s.
# "ghcr.io/huggingface/text-embeddings-inference:0.3.0" # Turing for T4s.
)
DATA_PATH = Path("/data/dataset.jsonl")
LAUNCH_FLAGS = [
"--model-id",
MODEL_ID,
"--port",
"8000",
]
def spawn_server() -> subprocess.Popen:
process = subprocess.Popen(["text-embeddings-router"] + LAUNCH_FLAGS)
# Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
while True:
try:
socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
print("Webserver ready!")
return process
except (socket.timeout, ConnectionRefusedError):
# Check if launcher webserving process has exited.
# If so, a connection can never be made.
retcode = process.poll()
if retcode is not None:
raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
def download_model():
# Wait for server to start. This downloads the model weights when not present.
spawn_server().terminate()
volume = modal.Volume.from_name("tei-hn-data", create_if_missing=True)
app = modal.App("example-tei")
tei_image = (
modal.Image.from_registry(
DOCKER_IMAGE,
add_python="3.10",
)
.dockerfile_commands("ENTRYPOINT []")
.run_function(download_model, gpu=GPU_CONFIG)
.pip_install("httpx")
)
with tei_image.imports():
from httpx import AsyncClient
@app.cls(
gpu=GPU_CONFIG,
image=tei_image,
max_containers=20, # Use up to 20 GPU containers at once.
)
@modal.concurrent(
max_inputs=10
) # Allow each container to process up to 10 batches at once.
class TextEmbeddingsInference:
@modal.enter()
def setup_server(self):
self.process = spawn_server()
self.client = AsyncClient(base_url="http://127.0.0.1:8000")
@modal.exit()
def teardown_server(self):
self.process.terminate()
@modal.method()
async def embed(self, inputs_with_ids: list[tuple[int, str]]):
ids, inputs = zip(*inputs_with_ids)
resp = await self.client.post("/embed", json={"inputs": inputs})
resp.raise_for_status()
outputs = resp.json()
return list(zip(ids, outputs))
def download_data():
service_account_info = json.loads(os.environ["SERVICE_ACCOUNT_JSON"])
credentials = service_account.Credentials.from_service_account_info(
service_account_info
)
client = bigquery.Client(credentials=credentials)
iterator = client.list_rows(
"bigquery-public-data.hacker_news.full",
max_results=100_000,
)
df = iterator.to_dataframe(progress_bar_type="tqdm").dropna()
df["id"] = df["id"].astype(int)
df["text"] = df["text"].apply(lambda x: x[:512])
data = list(zip(df["id"], df["text"]))
with open(DATA_PATH, "w") as f:
json.dump(data, f)
volume.commit()
image = modal.Image.debian_slim(python_version="3.10").pip_install(
"google-cloud-bigquery", "pandas", "db-dtypes", "tqdm"
)
with image.imports():
from google.cloud import bigquery
from google.oauth2 import service_account
@app.function(
image=image,
secrets=[modal.Secret.from_name("bigquery")],
volumes={DATA_PATH.parent: volume},
)
def embed_dataset():
model = TextEmbeddingsInference()
if not DATA_PATH.exists():
print("Downloading data. This takes a while...")
download_data()
with open(DATA_PATH) as f:
data = json.loads(f.read())
def generate_batches():
batch = []
for item in data:
batch.append(item)
if len(batch) == BATCH_SIZE:
yield batch
batch = []
    # data is a list[tuple[int, str]] of (id, text) pairs.
    # map() sends one batch of pairs to each embed call.
for output_batch in model.embed.map(generate_batches(), order_outputs=False):
# Do something with the outputs.
pass
```
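You can kick off the embedding job by running the `embed_dataset` Function from the CLI (from a clone of the examples repository):
```
modal run 06_gpu_and_ml/embeddings/text_embeddings_inference.py::embed_dataset
```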
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/comfyui/comfyapp.py)
Run Flux on ComfyUI as an API
=============================
In this example, we show you how to turn a [ComfyUI](https://github.com/comfyanonymous/ComfyUI) workflow into a scalable API endpoint.
Quickstart
----------
To run this simple text-to-image [Flux Schnell workflow](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/comfyui/workflow_api.json) as an API:
1. Deploy ComfyUI behind a web endpoint:
```
modal deploy 06_gpu_and_ml/comfyui/comfyapp.py
```
2. In another terminal, run inference:
```
python 06_gpu_and_ml/comfyui/comfyclient.py --modal-workspace $(modal profile current) --prompt "Surreal dreamscape with floating islands, upside-down waterfalls, and impossible geometric structures, all bathed in a soft, ethereal light"
```
![example comfyui image](https://modal-cdn.com/cdnbot/flux_gen_imagesenr_0w3_209b7170.webp)
The first inference will take ~1m since the container needs to launch the ComfyUI server and load Flux into memory. Successive calls on a warm container should take a few seconds.
Installing ComfyUI
------------------
We use [comfy-cli](https://github.com/Comfy-Org/comfy-cli) to install ComfyUI and its dependencies.
```
import json
import subprocess
import uuid
from pathlib import Path
from typing import Dict
import modal
image = ( # build up a Modal Image to run ComfyUI, step by step
modal.Image.debian_slim( # start from basic Linux with Python
python_version="3.11"
)
.apt_install("git") # install git to clone ComfyUI
.pip_install("fastapi[standard]==0.115.4") # install web dependencies
.pip_install("comfy-cli==1.3.8") # install comfy-cli
.run_commands( # use comfy-cli to install ComfyUI and its dependencies
"comfy --skip-prompt install --fast-deps --nvidia --version 0.3.10"
)
)
```
Downloading custom nodes
------------------------
We’ll also use `comfy-cli` to download custom nodes, in this case the popular [WAS Node Suite](https://github.com/WASasquatch/was-node-suite-comfyui).
Use the [ComfyUI Registry](https://registry.comfy.org/) to find the specific custom node name to use with this command.
```
image = (
image.run_commands( # download a custom node
"comfy node install --fast-deps [email protected]"
)
# Add .run_commands(...) calls for any other custom nodes you want to download
)
```
See [this post](https://modal.com/blog/comfyui-custom-nodes) for more examples
on how to install popular custom nodes like ComfyUI Impact Pack and ComfyUI IPAdapter Plus.
Downloading models
------------------
`comfy-cli` also supports downloading models, but we’ve found it’s faster to use [`hf_hub_download`](https://huggingface.co/docs/huggingface_hub/en/guides/download#download-a-single-file) directly by:
1. Enabling [faster downloads](https://huggingface.co/docs/huggingface_hub/en/guides/download#faster-downloads)
2. Mounting the cache directory to a [Volume](../guide/volumes.html)
By persisting the cache to a Volume, you avoid re-downloading the models every time you rebuild your image.
```
def hf_download():
from huggingface_hub import hf_hub_download
flux_model = hf_hub_download(
repo_id="Comfy-Org/flux1-schnell",
filename="flux1-schnell-fp8.safetensors",
cache_dir="/cache",
)
# symlink the model to the right ComfyUI directory
subprocess.run(
f"ln -s {flux_model} /root/comfy/ComfyUI/models/checkpoints/flux1-schnell-fp8.safetensors",
shell=True,
check=True,
)
vol = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
image = (
# install huggingface_hub with hf_transfer support to speed up downloads
image.pip_install("huggingface_hub[hf_transfer]==0.30.0")
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.run_function(
hf_download,
# persist the HF cache to a Modal Volume so future runs don't re-download models
volumes={"/cache": vol},
)
)
```
Lastly, copy the ComfyUI workflow JSON to the container.
```
image = image.add_local_file(
Path(__file__).parent / "workflow_api.json", "/root/workflow_api.json"
)
```
Running ComfyUI interactively
-----------------------------
Spin up an interactive ComfyUI server by wrapping the `comfy launch` command in a Modal Function
and serving it as a [web server](../guide/webhooks.html#non-asgi-web-servers).
```
app = modal.App(name="example-comfyui", image=image)
@app.function(
max_containers=1, # limit interactive session to 1 container
gpu="L40S", # good starter GPU for inference
volumes={"/cache": vol}, # mounts our cached models
)
@modal.concurrent(
max_inputs=10
) # required for UI startup process which runs several API calls concurrently
@modal.web_server(8000, startup_timeout=60)
def ui():
subprocess.Popen("comfy launch -- --listen 0.0.0.0 --port 8000", shell=True)
```
At this point you can run `modal serve 06_gpu_and_ml/comfyui/comfyapp.py` and open the UI in your browser for the classic ComfyUI experience.
Remember to **close your UI tab** when you are done developing.
This will close the connection with the container serving ComfyUI and you will stop being charged.
Running ComfyUI as an API
-------------------------
To run a workflow as an API:
1. Stand up a “headless” ComfyUI server in the background when the app starts.
2. Define an `infer` method that takes in a workflow path and runs the workflow on the ComfyUI server.
3. Create a web handler `api` as a web endpoint, so that we can run our workflow as a service and accept inputs from clients.
We group all these steps into a single Modal `cls` object, which we’ll call `ComfyUI`.
```
@app.cls(
scaledown_window=300, # 5 minute container keep alive after it processes an input
gpu="L40S",
volumes={"/cache": vol},
)
@modal.concurrent(max_inputs=5) # run 5 inputs per container
class ComfyUI:
port: int = 8000
@modal.enter()
def launch_comfy_background(self):
# launch the ComfyUI server exactly once when the container starts
cmd = f"comfy launch --background -- --port {self.port}"
subprocess.run(cmd, shell=True, check=True)
@modal.method()
def infer(self, workflow_path: str = "/root/workflow_api.json"):
# sometimes the ComfyUI server stops responding (we think because of memory leaks), so this makes sure it's still up
self.poll_server_health()
# runs the comfy run --workflow command as a subprocess
cmd = f"comfy run --workflow {workflow_path} --wait --timeout 1200 --verbose"
subprocess.run(cmd, shell=True, check=True)
# completed workflows write output images to this directory
output_dir = "/root/comfy/ComfyUI/output"
# looks up the name of the output image file based on the workflow
workflow = json.loads(Path(workflow_path).read_text())
file_prefix = [
node.get("inputs")
for node in workflow.values()
if node.get("class_type") == "SaveImage"
][0]["filename_prefix"]
# returns the image as bytes
for f in Path(output_dir).iterdir():
if f.name.startswith(file_prefix):
return f.read_bytes()
@modal.fastapi_endpoint(method="POST")
def api(self, item: Dict):
from fastapi import Response
workflow_data = json.loads(
(Path(__file__).parent / "workflow_api.json").read_text()
)
# insert the prompt
workflow_data["6"]["inputs"]["text"] = item["prompt"]
# give the output image a unique id per client request
client_id = uuid.uuid4().hex
workflow_data["9"]["inputs"]["filename_prefix"] = client_id
# save this updated workflow to a new file
new_workflow_file = f"{client_id}.json"
json.dump(workflow_data, Path(new_workflow_file).open("w"))
# run inference on the currently running container
img_bytes = self.infer.local(new_workflow_file)
return Response(img_bytes, media_type="image/jpeg")
def poll_server_health(self) -> Dict:
import socket
import urllib
try:
# check if the server is up (response should be immediate)
req = urllib.request.Request(f"http://127.0.0.1:{self.port}/system_stats")
urllib.request.urlopen(req, timeout=5)
print("ComfyUI server is healthy")
except (socket.timeout, urllib.error.URLError) as e:
# if no response in 5 seconds, stop the container
print(f"Server health check failed: {str(e)}")
modal.experimental.stop_fetching_inputs()
# all queued inputs will be marked "Failed", so you need to catch these errors in your client and then retry
raise Exception("ComfyUI server is not healthy, stopping container")
```
This serves the `workflow_api.json` in this repo. When deploying your own workflows, make sure you select the “Export (API)” option in the ComfyUI menu:
![comfyui menu](https://modal-cdn.com/cdnbot/comfyui_menugo5j8ahx_27d72c45.webp)
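For reference, here is a minimal client sketch (the repository’s `comfyclient.py` is more complete). It assumes you pass the endpoint URL that `modal deploy` prints for the `ComfyUI.api` endpoint; it POSTs a prompt as JSON and writes the returned JPEG bytes to disk:
```
import json
import urllib.request
from pathlib import Path

def generate(endpoint_url: str, prompt: str, out_path: str = "flux_out.jpg"):
    # endpoint_url is the URL that `modal deploy` prints for the api endpoint
    req = urllib.request.Request(
        endpoint_url,
        data=json.dumps({"prompt": prompt}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as response:
        Path(out_path).write_bytes(response.read())
```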
More resources
--------------
* Use [memory snapshots](../guide/memory-snapshot.html) to speed up cold starts (check out the `memory_snapshot` directory on [Github](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/comfyui))
* Run a ComfyUI workflow as a [Python script](https://modal.com/blog/comfyui-prototype-to-production)
* When to use [A1111 vs ComfyUI](https://modal.com/blog/a1111-vs-comfyui)
* Understand tradeoffs of parallel processing strategies when [scaling ComfyUI](https://modal.com/blog/scaling-comfyui)
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/openai_whisper/batched_whisper.py)
Fast Whisper inference using dynamic batching
=============================================
In this example, we demonstrate how to run [dynamically batched inference](../guide/dynamic-batching.html) for OpenAI’s speech recognition model, [Whisper](https://openai.com/index/whisper/), on Modal.
Batching multiple audio samples together or batching chunks of a single audio sample can help to achieve a 2.8x increase
in inference throughput on an A10G!
We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model.
To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper),
simply replace the `MODEL_NAME` and `MODEL_REVISION` variables.
Setup
-----
Let’s start by importing the Modal client and defining the model that we want to serve.
```
from typing import Optional
import modal
MODEL_DIR = "/model"
MODEL_NAME = "openai/whisper-large-v3"
MODEL_REVISION = "afda370583db9c5359511ed5d989400a6199dfe1"
```
Define a container image
------------------------
We’ll start with Modal’s baseline `debian_slim` image and install the relevant libraries.
```
image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install(
"torch==2.5.1",
"transformers==4.47.1",
"hf-transfer==0.1.8",
"huggingface_hub==0.27.0",
"librosa==0.10.2",
"soundfile==0.12.1",
"accelerate==1.2.1",
"datasets==3.2.0",
)
# Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s.
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": MODEL_DIR})
)
model_cache = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
app = modal.App(
"example-whisper-batched-inference",
image=image,
volumes={MODEL_DIR: model_cache},
)
```
Caching the model weights
-------------------------
We’ll define a function to download the model and cache it in a volume.
You can `modal run` against this function prior to deploying the App.
```
@app.function()
def download_model():
from huggingface_hub import snapshot_download
from transformers.utils import move_cache
snapshot_download(
MODEL_NAME,
ignore_patterns=["*.pt", "*.bin"], # Using safetensors
revision=MODEL_REVISION,
)
move_cache()
```
The model class
---------------
The inference function is best represented using Modal’s [class syntax](../guide/lifecycle-functions.html).
We define a `@modal.enter` method to load the model when the container starts, before it picks up any inputs.
The weights will be loaded from the Hugging Face cache volume so that we don’t need to download them when
we start a new container.
We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching.
This allows us to invoke the function with individual audio samples, and the function will automatically batch them
together before running inference. Batching is critical for making good use of the GPU, since GPUs are designed
for running parallel operations at high throughput.
The `max_batch_size` parameter limits the maximum number of audio samples combined into a single batch.
We use a `max_batch_size` of `64`, the largest power-of-two batch size that fits in the A10G’s 24 GB of GPU memory.
This number will vary depending on the model and the GPU you are using.
The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription.
To tune this parameter, you can set it to the target latency of your application minus the execution time of an inference batch.
This allows the latency of any request to stay within your target latency.
```
@app.cls(
gpu="a10g", # Try using an A100 or H100 if you've got a large model or need big batches!
max_containers=10, # default max GPUs for Modal's free tier
)
class Model:
@modal.enter()
def load_model(self):
import torch
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoProcessor,
pipeline,
)
self.processor = AutoProcessor.from_pretrained(MODEL_NAME)
self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_safetensors=True,
).to("cuda")
self.model.generation_config.language = "<|en|>"
# Create a pipeline for preprocessing and transcribing speech data
self.pipeline = pipeline(
"automatic-speech-recognition",
model=self.model,
tokenizer=self.processor.tokenizer,
feature_extractor=self.processor.feature_extractor,
torch_dtype=torch.float16,
device="cuda",
)
@modal.batched(max_batch_size=64, wait_ms=1000)
def transcribe(self, audio_samples):
import time
start = time.monotonic_ns()
print(f"Transcribing {len(audio_samples)} audio samples")
transcriptions = self.pipeline(audio_samples, batch_size=len(audio_samples))
end = time.monotonic_ns()
print(
f"Transcribed {len(audio_samples)} samples in {round((end - start) / 1e9, 2)}s"
)
return transcriptions
```
Transcribe a dataset
--------------------
In this example, we use the [librispeech\_asr\_dummy dataset](https://huggingface.co/datasets/hf-internal-testing/librispeech_asr_dummy) from Hugging Face’s Datasets library to test the model.
We use [`map.aio`](../reference/modal.Function.html#map) to asynchronously map over the audio files.
This allows us to invoke the batched transcription method on each audio sample in parallel.
```
@app.function()
async def transcribe_hf_dataset(dataset_name):
from datasets import load_dataset
print("📂 Loading dataset", dataset_name)
ds = load_dataset(dataset_name, "clean", split="validation")
print("📂 Dataset loaded")
batched_whisper = Model()
print("📣 Sending data for transcription")
async for transcription in batched_whisper.transcribe.map.aio(ds["audio"]):
yield transcription
```
Run the model
-------------
We define a [`local_entrypoint`](../guide/apps.html#entrypoints-for-ephemeral-apps) to run the transcription. You can run this locally with `modal run batched_whisper.py`.
```
@app.local_entrypoint()
async def main(dataset_name: Optional[str] = None):
if dataset_name is None:
dataset_name = "hf-internal-testing/librispeech_asr_dummy"
for result in transcribe_hf_dataset.remote_gen(dataset_name):
print(result["text"])
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/embeddings/amazon_embeddings.py)
Embed 30 million Amazon reviews at 575k tokens per second with Qwen2-7B
=======================================================================
This example demonstrates how to create embeddings for a large text dataset. This is
often necessary to enable semantic search, translation, and other language
processing tasks. Modal makes it easy to deploy large, capable embedding models and handles
all of the scaling to process very large datasets in parallel on many cloud GPUs.
We create a Modal Function that will handle all of the data loading and submit inputs to an
inference Cls that will automatically scale up to handle hundreds of large
batches in parallel.
Between the time a batch is submitted and the time it is fetched, it is stored via
Modal’s `spawn` system, which can hold onto up to one million inputs for up to a week.
```
import json
import subprocess
from pathlib import Path
import modal
app = modal.App(name="example-amazon-embeddings")
MINUTES = 60 # seconds
HOURS = 60 * MINUTES
```
We define our `main` function as a `local_entrypoint`. This is what we’ll call locally
to start the job on Modal.
You can run it with the command
```
modal run --detach amazon_embeddings.py
```
By default we down-scale to 1/1000th of the data for demonstration purposes.
To launch the full job, set the `--down-scale` parameter to `1`.
But note that this will cost you!
The entrypoint starts the job and gets back a function call (“fc”) ID for each batch.
We can use these IDs to retrieve the embeddings once the job is finished.
Modal will keep the results around for up to 7 days after completion. Take a look at our [job processing guide](../guide/job-queue.html) for more details.
```
@app.local_entrypoint()
def main(
dataset_name: str = "McAuley-Lab/Amazon-Reviews-2023",
dataset_subset: str = "raw_review_Books",
down_scale: float = 0.001,
):
out_path = Path("/tmp") / "embeddings-example-fc-ids.json"
function_ids = launch_job.remote(
dataset_name=dataset_name, dataset_subset=dataset_subset, down_scale=down_scale
)
out_path.write_text(json.dumps(function_ids, indent=2) + "\n")
print(f"output handles saved to {out_path}")
```
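Once the job has finished, you can trade those function call IDs back in for the embeddings. A minimal retrieval sketch (not part of the example), reading the JSON file written by `main` above and fetching each batch’s results with `FunctionCall.from_id`:
```
@app.local_entrypoint()
def fetch_results(ids_path: str = "/tmp/embeddings-example-fc-ids.json"):
    function_ids = json.loads(Path(ids_path).read_text())
    for function_id in function_ids:
        # each result is a list of (entry_index, chunk_index, ..., text, embedding) tuples
        batch = modal.FunctionCall.from_id(function_id).get()
        print(f"{function_id}: {len(batch)} chunks embedded")
```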
Load the data and start the inference job
-----------------------------------------
Next we define the Function that will do the data loading and feed it to our embedding model.
We define a container [Image](../guide/images.html) with the data loading dependencies.
In it, we download the data we need and cache it to the container’s local disk,
which will disappear when the job is finished. We will be saving the review data
along with the embeddings, so we don’t need to keep the dataset around.
Embedding a large dataset like this can take some time, but we don’t need to wait
around for it to finish. We use `spawn` to invoke our embedding Function
and get back a handle with an ID that we can use to get the results later.
This can bottleneck on just sending data over the network for processing, so
we speed things up by using `ThreadPoolExecutor` to submit batches using multiple threads.
Once all of the batches have been sent for inference, we can return the function IDs
to the local client to save.
```
@app.function(
image=modal.Image.debian_slim().pip_install("datasets==3.5.1"), timeout=2 * HOURS
)
def launch_job(dataset_name: str, dataset_subset: str, down_scale: float):
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import load_dataset
from tqdm import tqdm
print("Loading dataset...")
dataset = load_dataset(
dataset_name,
dataset_subset,
split="full",
trust_remote_code=True,
)
data_subset = dataset.select(range(int(len(dataset) * down_scale)))
tei = TextEmbeddingsInference()
batches = generate_batches_of_chunks(data_subset)
start = time.perf_counter()
with ThreadPoolExecutor() as executor:
futures = [executor.submit(tei.embed.spawn, batch) for batch in tqdm(batches)]
function_ids = []
for future in tqdm(as_completed(futures), total=len(futures)):
function_ids.append(future.result().object_id)
print(f"Finished submitting job: {time.perf_counter() - start:.2f}s")
return function_ids
```
Massively scaling up and scaling out embedding inference on many beefy GPUs
---------------------------------------------------------------------------
We’re going to spin up many containers to run inference, and we don’t want each
one to have to download the embedding model from Hugging Face. We can download and save it to a
Modal [Volume](../guide/volumes.html) during the image build step using `run_function`.
We’ll use the [GTE-Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct) model from Alibaba, which performs well on the [Massive Text Embedding Benchmark](https://huggingface.co/spaces/mteb/leaderboard).
```
MODEL_ID = "Alibaba-NLP/gte-Qwen2-7B-instruct"
MODEL_DIR = "/model"
MODEL_CACHE_VOLUME = modal.Volume.from_name(
"embeddings-example-model-cache", create_if_missing=True
)
def download_model():
from huggingface_hub import snapshot_download
snapshot_download(MODEL_ID, cache_dir=MODEL_DIR)
```
For inference, we will use Hugging Face’s [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) framework for embedding model deployment.
Running lots of separate machines is “scaling out”. But we can also “scale up”
by running on large, high-performance machines.
We’ll use L40S GPUs for a good balance between cost and performance. Hugging Face has
prebuilt Docker images we can use as a base for our Modal Image.
We’ll use the one built for the L40S’s [SM89/Ada Lovelace architecture](../../gpu-glossary/device-hardware/streaming-multiprocessor-architecture.html) and install the rest of our dependencies on top.
```
tei_image = "ghcr.io/huggingface/text-embeddings-inference:89-1.7"
inference_image = (
modal.Image.from_registry(tei_image, add_python="3.12")
.dockerfile_commands("ENTRYPOINT []")
.pip_install(
"httpx==0.28.1",
"huggingface_hub[hf_transfer]==0.30.2",
"numpy==2.2.5",
"tqdm==4.67.1",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HOME": MODEL_DIR})
.run_function(download_model, volumes={MODEL_DIR: MODEL_CACHE_VOLUME})
)
```
Next we define our inference class. Modal will auto-scale the number of
containers ready to handle inputs based on the parameters we set in the `@app.cls` and `@modal.concurrent` decorators. Here we limit the total number of containers to
100 and the maximum number of concurrent inputs to 10, which caps us at 1000 concurrent batches.
On Modal’s Starter (free) and Team plans, the maximum number of concurrent GPUs is lower,
reducing the total number of concurrent batches and therefore the throughput.
Customers on Modal’s Enterprise Plan regularly scale up another order of magnitude above this.
If you’re interested in running on thousands of GPUs, [get in touch](https://form.fillout.com/t/onUBuQZ5vCus).
Here we also specify the GPU type and attach the Modal Volume where we saved the
embedding model.
This class will spawn a local Text Embeddings Inference server when the container
starts, and process each batch by receiving the text data over HTTP, returning a list of
tuples with the batch text data and embeddings.
```
@app.cls(
image=inference_image,
gpu="L40S",
volumes={MODEL_DIR: MODEL_CACHE_VOLUME},
max_containers=100,
scaledown_window=5 * MINUTES, # idle for 5 min without inputs before scaling down
retries=3, # handle transient failures and storms in the cloud
timeout=2 * HOURS, # run for at most 2 hours
)
@modal.concurrent(max_inputs=10)
class TextEmbeddingsInference:
@modal.enter()
def open_connection(self):
from httpx import AsyncClient
print("Starting text embedding inference server...")
self.process = spawn_server()
self.client = AsyncClient(base_url="http://127.0.0.1:8000", timeout=30)
@modal.exit()
def terminate_connection(self):
self.process.terminate()
@modal.method()
async def embed(self, batch):
texts = [chunk[-1] for chunk in batch]
res = await self.client.post("/embed", json={"inputs": texts})
return [chunk + (embedding,) for chunk, embedding in zip(batch, res.json())]
```
Helper Functions
----------------
The book review dataset contains ~30M reviews with ~12B total characters,
indicating an average review length of ~400 characters. Some are much longer.
Embedding models have a limit on the number of tokens they can process in a single
input. We will need to split each review into chunks that are under this limit.
The proper way to split text data is to use a tokenizer to ensure that any
single request is under the model’s token limit, and to overlap chunks to provide
semantic context and preserve information. For the sake of this example, we’re
just going to split by a set character length (`CHUNK_SIZE`).
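As an aside, here is a minimal sketch of what tokenizer-based chunking with overlap might look like. It is not used in this example; it assumes the `transformers` package and simply reuses the embedding model’s ID for the tokenizer:
```
# Illustrative only: token-aware chunking with overlap, not the naive slicing used below.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-Qwen2-7B-instruct")


def chunk_by_tokens(text, max_tokens=512, overlap=64):
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    step = max_tokens - overlap
    for start in range(0, len(token_ids), step):
        yield tokenizer.decode(token_ids[start : start + max_tokens])
```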
While the embedding model has a limit on the number of input tokens for a single
embedding, the number of chunks that we can process in a single batch is limited by
the VRAM of the GPU. We set the `BATCH_SIZE` accordingly.
```
BATCH_SIZE = 256
CHUNK_SIZE = 512
def generate_batches_of_chunks(
dataset, chunk_size: int = CHUNK_SIZE, batch_size: int = BATCH_SIZE
):
"""Creates batches of chunks by naively slicing strings according to CHUNK_SIZE."""
batch = []
for entry_index, data in enumerate(dataset):
product_id = data["asin"]
user_id = data["user_id"]
timestamp = data["timestamp"]
title = data["title"]
text = data["text"]
for chunk_index, chunk_start in enumerate(range(0, len(text), chunk_size)):
batch.append(
(
entry_index,
chunk_index,
product_id,
user_id,
timestamp,
title,
text[chunk_start : chunk_start + chunk_size],
)
)
if len(batch) == batch_size:
yield batch
batch = []
if batch:
yield batch
def spawn_server(
model_id: str = MODEL_ID,
port: int = 8000,
max_client_batch_size: int = BATCH_SIZE,
max_batch_tokens: int = BATCH_SIZE * CHUNK_SIZE,
huggingface_hub_cache: str = MODEL_DIR,
):
"""Starts a text embedding inference server in a subprocess."""
import socket
LAUNCH_FLAGS = [
"--model-id",
model_id,
"--port",
str(port),
"--max-client-batch-size",
str(max_client_batch_size),
"--max-batch-tokens",
str(max_batch_tokens),
"--huggingface-hub-cache",
huggingface_hub_cache,
]
process = subprocess.Popen(["text-embeddings-router"] + LAUNCH_FLAGS)
# Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
while True:
try:
socket.create_connection(("127.0.0.1", port), timeout=1).close()
print("Inference server ready!")
return process
except (socket.timeout, ConnectionRefusedError):
retcode = process.poll() # Check if the process has terminated.
if retcode is not None:
raise RuntimeError(f"Launcher exited unexpectedly with code {retcode}")
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run --detach 06_gpu_and_ml/embeddings/amazon_embeddings.py --dataset-subset raw_review_Magazine_Subscriptions
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/dreambooth/diffusers_lora_finetune.py)
Fine-tune Flux on your pet using LoRA
=====================================
This example finetunes the [Flux.1-dev model](https://huggingface.co/black-forest-labs/FLUX.1-dev) on images of a pet (by default, a puppy named Qwerty)
using a technique called textual inversion from [the “Dreambooth” paper](https://dreambooth.github.io/).
Effectively, it teaches a general image generation model a new “proper noun”,
allowing for the personalized generation of art and photos.
We supplement textual inversion with low-rank adaptation (LoRA)
for increased efficiency during training.
It then makes the model shareable with others — without costing $25/day for a GPU server —
by hosting a [Gradio app](https://gradio.app/) on Modal.
It demonstrates a simple, productive, and cost-effective pathway
to building on large pretrained models using Modal’s building blocks, like [GPU-accelerated](../guide/gpu.html) Modal Functions and Classes for compute-intensive work, [Volumes](../guide/volumes.html) for storage,
and [web endpoints](../guide/webhooks.html) for serving.
And with some light customization, you can use it to generate images of your pet!
![Gradio.app image generation interface](../../_app/immutable/assets/gradio-image-generate.DJVgtpVQ.png)
You can find a video walkthrough of this example on the Modal YouTube channel [here](https://www.youtube.com/watch?v=df-8fiByXMI).
Imports and setup
-----------------
We start by importing the necessary libraries and setting up the environment.
```
from dataclasses import dataclass
from pathlib import Path
import modal
```
Building up the environment
---------------------------
Machine learning environments are complex, and the dependencies can be hard to manage.
Modal makes creating and working with environments easy via [containers and container images](https://modal.com/docs/guide/custom-container).
We start from a base image and specify all of our dependencies.
We’ll call out the interesting ones as they come up below.
Note that these dependencies are not installed locally
— they are only installed in the remote environment where our Modal App runs.
```
app = modal.App(name="example-lora-flux")
image = modal.Image.debian_slim(python_version="3.10").pip_install(
"accelerate==0.31.0",
"datasets~=2.13.0",
"fastapi[standard]==0.115.4",
"ftfy~=6.1.0",
"gradio~=5.5.0",
"huggingface-hub==0.26.2",
"hf_transfer==0.1.8",
"numpy<2",
"peft==0.11.1",
"pydantic==2.9.2",
"sentencepiece>=0.1.91,!=0.1.92",
"smart_open~=6.4.0",
"starlette==0.41.2",
"transformers~=4.41.2",
"torch~=2.2.0",
"torchvision~=0.16",
"triton~=2.2.0",
"wandb==0.17.6",
)
```
### Downloading scripts and installing a git repo with `run_commands`
We’ll use an example script from the `diffusers` library to train the model.
We acquire it from GitHub and install it in our environment with a series of commands.
The container environments Modal Functions run in are highly flexible —
see [the docs](https://modal.com/docs/guide/custom-container) for more details.
```
GIT_SHA = "e649678bf55aeaa4b60bd1f68b1ee726278c0304" # specify the commit to fetch
image = (
image.apt_install("git")
# Perform a shallow fetch of just the target `diffusers` commit, checking out
# the commit in the container's home directory, /root. Then install `diffusers`
.run_commands(
"cd /root && git init .",
"cd /root && git remote add origin https://github.com/huggingface/diffusers",
f"cd /root && git fetch --depth=1 origin {GIT_SHA} && git checkout {GIT_SHA}",
"cd /root && pip install -e .",
)
)
```
### Configuration with `dataclass`es
Machine learning apps often have a lot of configuration information.
We collect up all of our configuration into dataclasses to avoid scattering special/magic values throughout code.
```
@dataclass
class SharedConfig:
"""Configuration information shared across project components."""
# The instance name is the "proper noun" we're teaching the model
instance_name: str = "Qwerty"
# That proper noun is usually a member of some class (person, bird),
# and sharing that information with the model helps it generalize better.
class_name: str = "Golden Retriever"
# identifier for pretrained models on Hugging Face
model_name: str = "black-forest-labs/FLUX.1-dev"
```
### Storing data created by our app with `modal.Volume`
The tools we’ve used so far work well for fetching external information,
which defines the environment our app runs in,
but what about data that we create or modify during the app’s execution?
A persisted [`modal.Volume`](../guide/volumes.html) can store and share data across Modal Apps and Functions.
We’ll use one to store both the original and fine-tuned weights we create during training
and then load them back in for inference.
```
volume = modal.Volume.from_name(
"dreambooth-finetuning-volume-flux", create_if_missing=True
)
MODEL_DIR = "/model"
```
Note that access to the Flux.1-dev model on Hugging Face is [gated by a license agreement](https://huggingface.co/docs/hub/en/models-gated) which
you must agree to [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
After you have accepted the license, [create a Modal Secret](https://modal.com/secrets) with the name `huggingface-secret` following the instructions in the template.
```
huggingface_secret = modal.Secret.from_name(
"huggingface-secret", required_keys=["HF_TOKEN"]
)
image = image.env(
{"HF_HUB_ENABLE_HF_TRANSFER": "1"} # turn on faster downloads from HF
)
@app.function(
volumes={MODEL_DIR: volume},
image=image,
secrets=[huggingface_secret],
timeout=600, # 10 minutes
)
def download_models(config):
import torch
from diffusers import DiffusionPipeline
from huggingface_hub import snapshot_download
snapshot_download(
config.model_name,
local_dir=MODEL_DIR,
ignore_patterns=["*.pt", "*.bin"], # using safetensors
)
DiffusionPipeline.from_pretrained(MODEL_DIR, torch_dtype=torch.bfloat16)
```
### Load fine-tuning dataset
Part of the magic of low-rank fine-tuning is that we only need 3-10 images.
So we can fetch just a few images, stored on consumer platforms like Imgur or Google Drive,
whenever we need them — no need for expensive, hard-to-maintain data pipelines.
```
def load_images(image_urls: list[str]) -> Path:
import PIL.Image
from smart_open import open
img_path = Path("/img")
img_path.mkdir(parents=True, exist_ok=True)
for ii, url in enumerate(image_urls):
with open(url, "rb") as f:
image = PIL.Image.open(f)
image.save(img_path / f"{ii}.png")
print(f"{ii + 1} images loaded")
return img_path
```
Low-Rank Adaptation (LoRA) fine-tuning for a text-to-image model
---------------------------------------------------------------
The base model we start from is trained to do a sort of “reverse [ekphrasis](https://en.wikipedia.org/wiki/Ekphrasis)”:
it attempts to recreate a visual work of art or image from only its description.
We can use the model to synthesize wholly new images
by combining the concepts it has learned from the training data.
We use a pretrained model, the Flux model from Black Forest Labs.
In this example, we “finetune” Flux, making only small adjustments to the weights.
Furthermore, we don’t change all the weights in the model.
Instead, using a technique called [*low-rank adaptation*](https://arxiv.org/abs/2106.09685),
we change a much smaller matrix that works “alongside” the existing weights, nudging the model in the direction we want.
We can get away with such a small and simple training process because we’re just teaching the model the meaning of a single new word: the name of our pet.
The result is a model that can generate novel images of our pet:
as an astronaut in space, as painted by Van Gogh or Bastiat, etc.
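To make the low-rank idea concrete, here is a small illustrative sketch of a LoRA-style linear layer in PyTorch. This is not the training code used below (that is handled by the `diffusers` script and `peft`); it just shows a frozen weight matrix augmented with a trainable low-rank product `B @ A`:
```
import torch


class LoRALinear(torch.nn.Module):
    """Illustrative only: a frozen linear layer plus a trainable low-rank update."""

    def __init__(self, base: torch.nn.Linear, rank: int = 16, alpha: float = 16.0):
        super().__init__()
        self.base = base.requires_grad_(False)  # the original weights stay frozen
        # low-rank factors: only rank * (in_features + out_features) extra parameters
        self.A = torch.nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = torch.nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        # base output plus the low-rank "nudge": (x @ A^T) @ B^T, scaled
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scale
```
Because `B` is initialized to zero, training starts from the base model’s behavior, and only the small adapter matrices need to be saved and shared.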
### Finetuning with Hugging Face 🧨 Diffusers and Accelerate
The model weights, training libraries, and training script are all provided by [🤗 Hugging Face](https://huggingface.co).
You can kick off a training job with the command `modal run dreambooth_app.py::app.train`.
It should take about ten minutes.
Training machine learning models takes time and produces a lot of metadata —
metrics for performance and resource utilization,
metrics for model quality and training stability,
and model inputs and outputs like images and text.
This is especially important if you’re fiddling around with the configuration parameters.
This example can optionally use [Weights & Biases](https://wandb.ai) to track all of this training information.
Just sign up for an account, switch the flag below, and add your API key as a [Modal Secret](https://modal.com/secrets).
```
USE_WANDB = False
```
You can see an example W&B dashboard [here](https://wandb.ai/cfrye59/dreambooth-lora-sd-xl).
Check out [this run](https://wandb.ai/cfrye59/dreambooth-lora-sd-xl/runs/ca3v1lsh?workspace=user-cfrye59),
which [despite having high GPU utilization](https://wandb.ai/cfrye59/dreambooth-lora-sd-xl/runs/ca3v1lsh/system) suffered from numerical instability during training and produced only black images — hard to debug without experiment management logs!
You can read more about how the values in `TrainConfig` are chosen and adjusted [in this blog post on Hugging Face](https://huggingface.co/blog/dreambooth).
To run training on images of your own pet, upload the images to separate URLs and edit the contents of the file at `TrainConfig.instance_example_urls_file` to point to them.
Tip: if the results you’re seeing don’t match the prompt too well, and instead produce an image
of your subject without taking the prompt into account, the model has likely overfit. In this case, repeat training with a lower
value of `max_train_steps`. If you used W&B, look back at results earlier in training to determine where to stop.
On the other hand, if the results don’t look like your subject, you might need to increase `max_train_steps`.
```
@dataclass
class TrainConfig(SharedConfig):
"""Configuration for the finetuning step."""
# training prompt looks like `{PREFIX} {INSTANCE_NAME} the {CLASS_NAME} {POSTFIX}`
prefix: str = "a photo of"
postfix: str = ""
# locator for plaintext file with urls for images of target instance
instance_example_urls_file: str = str(
Path(__file__).parent / "instance_example_urls.txt"
)
# Hyperparameters/constants from the huggingface training example
resolution: int = 512
train_batch_size: int = 3
rank: int = 16 # lora rank
gradient_accumulation_steps: int = 1
learning_rate: float = 4e-4
lr_scheduler: str = "constant"
lr_warmup_steps: int = 0
max_train_steps: int = 500
checkpointing_steps: int = 1000
seed: int = 117
@app.function(
image=image,
gpu="A100-80GB", # fine-tuning is VRAM-heavy and requires a high-VRAM GPU
volumes={MODEL_DIR: volume}, # stores fine-tuned model
timeout=1800, # 30 minutes
secrets=[huggingface_secret]
+ (
[modal.Secret.from_name("wandb-secret", required_keys=["WANDB_API_KEY"])]
if USE_WANDB
else []
),
)
def train(instance_example_urls, config):
import subprocess
from accelerate.utils import write_basic_config
# load data locally
img_path = load_images(instance_example_urls)
# set up hugging face accelerate library for fast training
write_basic_config(mixed_precision="bf16")
# define the training prompt
instance_phrase = f"{config.instance_name} the {config.class_name}"
prompt = f"{config.prefix} {instance_phrase} {config.postfix}".strip()
# the model training is packaged as a script, so we have to execute it as a subprocess, which adds some boilerplate
def _exec_subprocess(cmd: list[str]):
"""Executes subprocess and prints log to terminal while subprocess is running."""
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
with process.stdout as pipe:
for line in iter(pipe.readline, b""):
line_str = line.decode()
print(f"{line_str}", end="")
if exitcode := process.wait() != 0:
raise subprocess.CalledProcessError(exitcode, "\n".join(cmd))
# run training -- see huggingface accelerate docs for details
print("launching dreambooth training script")
_exec_subprocess(
[
"accelerate",
"launch",
"examples/dreambooth/train_dreambooth_lora_flux.py",
"--mixed_precision=bf16", # half-precision floats most of the time for faster training
f"--pretrained_model_name_or_path={MODEL_DIR}",
f"--instance_data_dir={img_path}",
f"--output_dir={MODEL_DIR}",
f"--instance_prompt={prompt}",
f"--resolution={config.resolution}",
f"--train_batch_size={config.train_batch_size}",
f"--gradient_accumulation_steps={config.gradient_accumulation_steps}",
f"--learning_rate={config.learning_rate}",
f"--lr_scheduler={config.lr_scheduler}",
f"--lr_warmup_steps={config.lr_warmup_steps}",
f"--max_train_steps={config.max_train_steps}",
f"--checkpointing_steps={config.checkpointing_steps}",
f"--seed={config.seed}", # increased reproducibility by seeding the RNG
]
+ (
[
"--report_to=wandb",
# validation output tracking is useful, but currently broken for Flux LoRA training
# f"--validation_prompt={prompt} in space", # simple test prompt
# f"--validation_epochs={config.max_train_steps // 5}",
]
if USE_WANDB
else []
),
)
# The trained model information has been output to the volume mounted at `MODEL_DIR`.
# To persist this data for use in our web app, we 'commit' the changes
# to the volume.
volume.commit()
```
Running our model
-----------------
To generate images from prompts using our fine-tuned model, we define a Modal Function called `inference`.
Naively, this would seem to be a bad fit for the flexible, serverless infrastructure of Modal:
wouldn’t you need to include the steps to load the model and spin it up in every function call?
In order to initialize the model just once on container startup,
we use Modal’s [container lifecycle](../guide/lifecycle-functions.html) features, which require the function to be part
of a class. Note that the `modal.Volume` we saved the model to is mounted here as well,
so that the fine-tuned model created by `train` is available to us.
```
@app.cls(image=image, gpu="A100", volumes={MODEL_DIR: volume})
class Model:
@modal.enter()
def load_model(self):
import torch
from diffusers import DiffusionPipeline
# Reload the modal.Volume to ensure the latest state is accessible.
volume.reload()
# set up a hugging face inference pipeline using our model
pipe = DiffusionPipeline.from_pretrained(
MODEL_DIR,
torch_dtype=torch.bfloat16,
).to("cuda")
pipe.load_lora_weights(MODEL_DIR)
self.pipe = pipe
@modal.method()
def inference(self, text, config):
image = self.pipe(
text,
num_inference_steps=config.num_inference_steps,
guidance_scale=config.guidance_scale,
).images[0]
return image
```
Wrap the trained model in a Gradio web UI
-----------------------------------------
[Gradio](https://gradio.app) makes it super easy to expose a model’s functionality
in an easy-to-use, responsive web interface.
This model is a text-to-image generator,
so we set up an interface that includes a user-entry text box
and a frame for displaying images.
We also provide some example text inputs to help
guide users and to kick-start their creative juices.
And we couldn’t resist adding some Modal style to it as well!
You can deploy the app on Modal with the command `modal deploy dreambooth_app.py`.
You’ll be able to come back days, weeks, or months later and find it still ready to go,
even though you don’t have to pay for a server to run while you’re not using it.
```
@dataclass
class AppConfig(SharedConfig):
"""Configuration information for inference."""
num_inference_steps: int = 50
guidance_scale: float = 6
web_image = image.add_local_dir(
# Add local web assets to the image
Path(__file__).parent / "assets",
remote_path="/assets",
)
@app.function(
image=web_image,
max_containers=1,
)
@modal.concurrent(max_inputs=1000)
@modal.asgi_app()
def fastapi_app():
import gradio as gr
from fastapi import FastAPI
from fastapi.responses import FileResponse
from gradio.routes import mount_gradio_app
web_app = FastAPI()
# Call out to the inference in a separate Modal environment with a GPU
def go(text=""):
if not text:
text = example_prompts[0]
return Model().inference.remote(text, config)
# set up AppConfig
config = AppConfig()
instance_phrase = f"{config.instance_name} the {config.class_name}"
example_prompts = [
f"{instance_phrase}",
f"a painting of {instance_phrase.title()} With A Pearl Earring, by Vermeer",
f"oil painting of {instance_phrase} flying through space as an astronaut",
f"a painting of {instance_phrase} in cyberpunk city. character design by cory loftis. volumetric light, detailed, rendered in octane",
f"drawing of {instance_phrase} high quality, cartoon, path traced, by studio ghibli and don bluth",
]
modal_docs_url = "https://modal.com/docs"
modal_example_url = f"{modal_docs_url}/examples/dreambooth_app"
description = f"""Describe what they are doing or how a particular artist or style would depict them. Be fantastical! Try the examples below for inspiration.
### Learn how to make a "Dreambooth" for your own pet [here]({modal_example_url}).
"""
# custom styles: an icon, a background, and a theme
@web_app.get("/favicon.ico", include_in_schema=False)
async def favicon():
return FileResponse("/assets/favicon.svg")
@web_app.get("/assets/background.svg", include_in_schema=False)
async def background():
return FileResponse("/assets/background.svg")
with open("/assets/index.css") as f:
css = f.read()
theme = gr.themes.Default(
primary_hue="green", secondary_hue="emerald", neutral_hue="neutral"
)
# add a gradio UI around inference
with gr.Blocks(
theme=theme,
css=css,
title=f"Generate images of {config.instance_name} on Modal",
) as interface:
gr.Markdown(
f"# Generate images of {instance_phrase}.\n\n{description}",
)
with gr.Row():
inp = gr.Textbox( # input text component
label="",
placeholder=f"Describe the version of {instance_phrase} you'd like to see",
lines=10,
)
out = gr.Image( # output image component
height=512, width=512, label="", min_width=512, elem_id="output"
)
with gr.Row():
btn = gr.Button("Dream", variant="primary", scale=2)
btn.click(
fn=go, inputs=inp, outputs=out
) # connect inputs and outputs with inference function
gr.Button( # shameless plug
"⚡️ Powered by Modal",
variant="secondary",
link="https://modal.com",
)
with gr.Column(variant="compact"):
# add in a few examples to inspire users
for ii, prompt in enumerate(example_prompts):
btn = gr.Button(prompt, variant="secondary")
btn.click(fn=lambda idx=ii: example_prompts[idx], outputs=inp)
# mount for execution on Modal
return mount_gradio_app(
app=web_app,
blocks=interface,
path="/",
)
```
Running your fine-tuned model from the command line
---------------------------------------------------
You can use the `modal` command-line interface to set up, customize, and deploy this app:
* `modal run diffusers_lora_finetune.py` will train the model. Change the `instance_example_urls_file` to point to your own pet’s images.
* `modal serve diffusers_lora_finetune.py` will [serve](../guide/webhooks.html#developing-with-modal-serve) the Gradio interface at a temporary location. Great for iterating on code!
* `modal shell diffusers_lora_finetune.py` is a convenient helper to open a bash [shell](../guide/developing-debugging.html#interactive-shell) in our image. Great for debugging environment issues.
Remember, once you’ve trained your own fine-tuned model, you can deploy it permanently — for no cost when it is not being used! —
using `modal deploy diffusers_lora_finetune.py`.
If you just want to try the app out, you can find our deployment [here](https://modal-labs--example-lora-flux-fastapi-app.modal.run).
```
@app.local_entrypoint()
def run( # add more config params here to make training configurable
max_train_steps: int = 250,
):
print("🎨 loading model")
download_models.remote(SharedConfig())
print("🎨 setting up training")
config = TrainConfig(max_train_steps=max_train_steps)
instance_example_urls = (
Path(TrainConfig.instance_example_urls_file).read_text().splitlines()
)
train.remote(instance_example_urls, config)
print("🎨 training finished")
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/dreambooth/diffusers_lora_finetune.py
```
A simple web scraper
====================
In this guide we’ll introduce you to Modal by writing a simple web scraper.
We’ll explain the foundations of a Modal application step by step.
Set up your first Modal app
---------------------------
Modal apps are orchestrated as Python scripts, but can theoretically run
anything you can run in a container. To get you started, make sure to install
the latest Modal Python package and set up an API token (the first two steps of
the Getting started page).
Finding links
-------------
First, we create an empty Python file `scrape.py`. This file will contain our
application code. Let’s write some basic Python code to fetch the contents of a
web page and print the links (href attributes) it finds in the document:
```
import re
import sys
import urllib.request
def get_links(url):
response = urllib.request.urlopen(url)
html = response.read().decode("utf8")
links = []
for match in re.finditer('href="(.*?)"', html):
links.append(match.group(1))
return links
if __name__ == "__main__":
links = get_links(sys.argv[1])
print(links)
```
Now obviously this is just pure standard library Python code, and you can run it
on your machine:
```
$ python scrape.py http://example.com
['https://www.iana.org/domains/example']
```
Running it in Modal
-------------------
To make the `get_links` function run in Modal instead of your local machine, all
you need to do is
* Import `modal`
* Create a [`modal.App`](../reference/modal.App.html) instance
* Add a `@app.function()` annotation to your function
* Replace the `if __name__ == "__main__":` block with a function decorated with [`@app.local_entrypoint()`](../reference/modal.App.html#local_entrypoint)
* Call `get_links` using `get_links.remote`
```
import re
import urllib.request
import modal
app = modal.App(name="link-scraper")
@app.function()
def get_links(url):
...
@app.local_entrypoint()
def main(url):
links = get_links.remote(url)
print(links)
```
You can now run this with the Modal CLI, using `modal run` instead of `python`.
This time, you’ll see additional progress indicators while the script is
running:
```
$ modal run scrape.py --url http://example.com
✓ Initialized.
✓ Created objects.
['https://www.iana.org/domains/example']
✓ App completed.
```
Custom containers
-----------------
In the code above we make use of the Python standard library’s `urllib` module.
This works great for static web pages, but many pages these days use JavaScript
to dynamically load content, which wouldn’t appear in the loaded HTML file.
Let’s use the [Playwright](https://playwright.dev/python/docs/intro) package to
instead launch a headless Chromium browser which can interpret any JavaScript
that might be on the page.
We can pass custom container images (defined using [`modal.Image`](../reference/modal.Image.html)) to the `@app.function()` decorator. We’ll make use of the `modal.Image.debian_slim` pre-bundled image and add
the shell commands to install Playwright and its dependencies:
```
playwright_image = modal.Image.debian_slim(python_version="3.10").run_commands(
"apt-get update",
"apt-get install -y software-properties-common",
"apt-add-repository non-free",
"apt-add-repository contrib",
"pip install playwright==1.42.0",
"playwright install-deps chromium",
"playwright install chromium",
)
```
Note that we don’t have to install Playwright or Chromium on our development
machine since this will all run in Modal. We can now modify our `get_links` function to make use of the new tools:
```
@app.function(image=playwright_image)
async def get_links(cur_url: str):
from playwright.async_api import async_playwright
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page()
await page.goto(cur_url)
links = await page.eval_on_selector_all("a[href]", "elements => elements.map(element => element.href)")
await browser.close()
print("Links", links)
return links
```
Since Playwright has a nice async interface, we’ll redeclare our `get_links` function as async (Modal works with both sync and async functions).
The first time you run the function after making this change, you’ll notice that
the output first shows the progress of building the custom image you specified,
after which your function runs like before. This image is then cached so that on
subsequent runs of the function it will not be rebuilt as long as the image
definition is the same.
Scaling out
-----------
So far, our script only fetches the links for a single page. What if we want to
scrape a large list of links in parallel?
We can do this easily with Modal, because of some magic: the function we wrapped
with the `@app.function()` decorator is no longer an ordinary function, but a
Modal [Function](../reference/modal.Function.html) object. This
means it comes with a `map` property built in, which lets us run this function
for all inputs in parallel, scaling up to as many workers as needed.
Let’s change our code to scrape all urls we feed to it in parallel:
```
@app.local_entrypoint()
def main():
urls = ["http://modal.com", "http://github.com"]
for links in get_links.map(urls):
for link in links:
print(link)
```
Schedules and deployments
-------------------------
Let’s say we want to log the scraped links daily. We move the print loop into
its own Modal function and annotate it with a `modal.Period(days=1)` schedule,
indicating that we want to run it once per day. Since the scheduled function will not
run from our command line, we also add a hard-coded list of links to crawl for
now. In a more realistic setting we could read this from a database or other
accessible data source.
```
@app.function(schedule=modal.Period(days=1))
def daily_scrape():
urls = ["http://modal.com", "http://github.com"]
for links in get_links.map(urls):
for link in links:
print(link)
```
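If you would rather pin the job to a specific time of day instead of a rolling 24-hour period, Modal also supports cron-style schedules. A minimal sketch (the time here is arbitrary):
```
# Illustrative alternative: run every day at 09:00 UTC using a cron expression.
@app.function(schedule=modal.Cron("0 9 * * *"))
def daily_scrape_at_nine():
    urls = ["http://modal.com", "http://github.com"]
    for links in get_links.map(urls):
        for link in links:
            print(link)
```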
To deploy this as a permanent app, run the command
```
modal deploy scrape.py
```
Running this command deploys the function and then exits immediately. We can
see the deployment and all of its runs, including the printed links, on the
Modal [Apps page](https://modal.com/apps). Rerunning the command will redeploy
the code with any changes you have made, overwriting the existing deployment with
the same name (“link-scraper” in this case).
Integrations and Secrets
------------------------
Instead of looking at the links in the run logs of our deployments, let’s say we
wanted to post them to our `#scraped-links` Slack channel. To do this, we can
make use of the [Slack API](https://api.slack.com/) and the `slack-sdk` [PyPI package](https://pypi.org/project/slack-sdk/).
The Slack SDK WebClient requires an API token to get access to our Slack
Workspace, and since it’s bad practice to hardcode credentials into application
code we make use of Modal’s **Secrets**. Secrets are snippets of data that will
be injected as environment variables in the containers running your functions.
The easiest way to create Secrets is to go to the [Secrets section of modal.com](https://modal.com/secrets). You can either create a
free-form secret with any environment variables or make use of presets for
common services. We’ll use the Slack preset, and after filling in the necessary
information we are presented with a snippet of code that can be used to post to
Slack using our credentials:
```
import os
slack_sdk_image = modal.Image.debian_slim().pip_install("slack-sdk")
@app.function(image=slack_sdk_image, secrets=[modal.Secret.from_name("my-slack-secret")])
def bot_token_msg(channel, message):
import slack_sdk
client = slack_sdk.WebClient(token=os.environ["SLACK_BOT_TOKEN"])
client.chat_postMessage(channel=channel, text=message)
```
Copy that code as-is, then amend the `daily_scrape` function to call `bot_token_msg`.
```
@app.function(schedule=modal.Period(days=1))
def daily_scrape():
urls = ["http://modal.com", "http://github.com"]
for links in get_links.map(urls):
for link in links:
bot_token_msg.remote("scraped-links", link)
```
Note that we are freely making function calls across completely different
container images, as if they were regular Python functions in the same program.
We rerun the script which overwrites the old deploy with our updated code, and
now we get a daily feed of our scraped links in our Slack channel 🎉
Summary
-------
We have shown how you can use Modal to develop distributed Python data
applications using custom containers. Through simple constructs we were able to
add parallel execution. With the change of a single line of code we were able
to go from experimental development code to a deployed application. The full
code of this example can be found [here](webscraper.html). We hope
this overview gives you a glimpse of what you are able to build using Modal.
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/07_web_endpoints/fastrtc_flip_webcam.py)
Run a FastRTC app on Modal
==========================
[FastRTC](https://fastrtc.org/) is a Python library for real-time communication on the web.
This example demonstrates how to run a simple FastRTC app in the cloud on Modal.
It’s intended to help you get up and running with real-time streaming applications on Modal
as quickly as possible. If you’re interested in running a production-grade WebRTC app on Modal,
see [this example](webrtc_yolo.html).
In this example, we stream webcam video from a browser to a container on Modal,
where the video is flipped, annotated, and sent back with under 100ms of delay.
You can try it out [here](https://modal-labs-examples--fastrtc-flip-webcam-ui.modal.run/) or just dive straight into the code to run it yourself.
Set up FastRTC on Modal
-----------------------
First, we import the `modal` SDK
and use it to define a [container image](../guide/images.html) with FastRTC and related dependencies.
```
import modal
web_image = modal.Image.debian_slim(python_version="3.12").pip_install(
"fastapi[standard]==0.115.4",
"fastrtc==0.0.23",
"gradio==5.7.1",
"opencv-python-headless==4.11.0.86",
)
```
Then, we set that as the default Image on our Modal [App](../guide/apps.html).
```
app = modal.App("fastrtc-flip-webcam", image=web_image)
```
### Configure WebRTC streaming on Modal
Under the hood, FastRTC uses the WebRTC [APIs](https://www.w3.org/TR/webrtc/) and [protocols](https://datatracker.ietf.org/doc/html/rfc8825).
WebRTC provides low latency (“real-time”) peer-to-peer communication
for Web applications, focusing on audio and video.
Considering that the Web is a platform originally designed
for high-latency, client-server communication of text and images,
that’s no mean feat!
In addition to protocols that implement this communication,
WebRTC includes APIs for describing and manipulating audio/video streams.
In this demo, we set a few simple parameters, like the direction of the webcam
and the minimum frame rate. See the [MDN Web Docs for `MediaTrackConstraints`](https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints) for more.
```
TRACK_CONSTRAINTS = {
"width": {"exact": 640},
"height": {"exact": 480},
"frameRate": {"min": 30},
"facingMode": { # https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackSettings/facingMode
"ideal": "user"
},
}
```
In theory, the Internet is designed for peer-to-peer communication
all the way down to its heart, the Internet Protocol (IP): just send packets between IP addresses.
In practice, peer-to-peer communication on the contemporary Internet is fraught with difficulties,
from restrictive firewalls to finicky work-arounds for [the exhaustion of IPv4 addresses](https://www.a10networks.com/glossary/what-is-ipv4-exhaustion/),
like [Carrier-Grade Network Address Translation (CGNAT)](https://en.wikipedia.org/wiki/Carrier-grade_NAT).
So establishing peer-to-peer connections can be quite involved.
The protocol for doing so is called Interactive Connectivity Establishment (ICE).
It is described in [this RFC](https://datatracker.ietf.org/doc/html/rfc8445#section-2).
ICE involves the peers exchanging a list of connections that might be used.
We use a fairly simple setup here, where our peer on Modal uses the [Session Traversal Utilities for NAT (STUN)](https://datatracker.ietf.org/doc/html/rfc5389) server provided by Google. A STUN server basically just reflects back to a client what their
IP address and port number appear to be when they talk to it. The peer on Modal communicates
that information to the other peer trying to connect to it — in this case, a browser trying to share a webcam feed.
Note the use of `stun` and port `19302` in the URL in place of
something more familiar, like `http` and port `80`.
```
RTC_CONFIG = {"iceServers": [{"url": "stun:stun.l.google.com:19302"}]}
```
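If your users sit behind particularly restrictive NATs or firewalls, STUN alone may not be enough, and a TURN relay can be added to the list of ICE servers. A hedged sketch of what such a configuration might look like; the TURN URL and credentials below are placeholders, not part of this example:
```
# Illustrative only: STUN for address discovery plus a placeholder TURN relay.
RTC_CONFIG_WITH_TURN = {
    "iceServers": [
        {"urls": "stun:stun.l.google.com:19302"},
        {
            "urls": "turn:turn.example.com:3478",  # placeholder TURN server
            "username": "demo-user",  # placeholder credentials
            "credential": "demo-pass",
        },
    ]
}
```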
Running a FastRTC app on Modal
------------------------------
FastRTC builds on top of the [Gradio](https://www.gradio.app/docs) library for defining Web UIs in Python.
Gradio in turn is compatible with the [Asynchronous Server Gateway Interface (ASGI)](https://asgi.readthedocs.io/en/latest/) protocol for asynchronous Python web servers, like [FastAPI](https://fastrtc.org/userguide/streams/),
so we can host it on Modal’s cloud platform using the [`modal.asgi_app` decorator](../guide/webhooks.html#serving-asgi-and-wsgi-apps) on a [Modal Function](../guide/apps.html).
But before we do that, we need to consider limits:
on how many peers can connect to one instance on Modal
and on how long they can stay connected.
We picked some sensible defaults to show how they interact
with the deployment parameters of the Modal Function.
You’ll want to tune these for your application!
```
MAX_CONCURRENT_STREAMS = 10 # number of peers per instance on Modal
MINUTES = 60 # seconds
TIME_LIMIT = 10 * MINUTES # time limit
@app.function(
# gradio requires sticky sessions
# so we limit the number of concurrent containers to 1
# and allow that container to handle concurrent streams
max_containers=1,
scaledown_window=TIME_LIMIT + 1 * MINUTES, # add a small buffer to time limit
)
@modal.concurrent(max_inputs=MAX_CONCURRENT_STREAMS) # inputs per container
@modal.asgi_app() # ASGI on Modal
def ui():
import fastrtc # WebRTC in Gradio
import gradio as gr # WebUIs in Python
from fastapi import FastAPI # asynchronous ASGI server framework
from gradio.routes import mount_gradio_app # connects Gradio and FastAPI
with gr.Blocks() as blocks: # block-wise UI definition
gr.HTML( # simple HTML header
"<h1 style='text-align: center'>"
"Streaming Video Processing with Modal and FastRTC"
"</h1>"
)
with gr.Column(): # a column of UI elements
fastrtc.Stream( # high-level media streaming UI element
modality="video",
mode="send-receive",
handler=flip_vertically, # handler -- handle incoming frame, produce outgoing frame
ui_args={"title": "Click 'Record' to flip your webcam in the cloud"},
rtc_configuration=RTC_CONFIG,
track_constraints=TRACK_CONSTRAINTS,
concurrency_limit=MAX_CONCURRENT_STREAMS, # limit simultaneous connections
time_limit=TIME_LIMIT, # limit time per connection
)
return mount_gradio_app(app=FastAPI(), blocks=blocks, path="/")
```
To try this out for yourself, run
```
modal serve 07_web_endpoints/fastrtc_flip_webcam.py
```
and head to the `modal.run` URL that appears in your terminal.
You can also check on the application’s dashboard
via the `modal.com` URL that appears below it.
The `modal serve` command produces a hot-reloading development server —
try editing the `title` in the `ui_args` above and watch the server redeploy.
This temporary deployment is tied to your terminal session.
To deploy permanently, run
```
modal deploy 07_web_endpoints/fastrtc_flip_webcam.py
```
Note that Modal is a serverless platform with [usage-based pricing](../../pricing.html),
so this application will spin down and cost you nothing when it is not in use.
Addenda
-------
This FastRTC app is very much the “hello world” or “echo server”
of FastRTC: it just flips the incoming webcam stream and adds a “hello” message.
That logic appears below.
```
def flip_vertically(image):
import cv2
import numpy as np
image = image.astype(np.uint8)
if image is None:
print("failed to decode image")
return
# flip vertically and caption to show video was processed on Modal
image = cv2.flip(image, 0)
lines = ["Hello from Modal!"]
caption_image(image, lines)
return image
def caption_image(
img, lines, font_scale=0.8, thickness=2, margin=10, font=None, color=None
):
import cv2
if font is None:
font = cv2.FONT_HERSHEY_SIMPLEX
if color is None:
color = (127, 238, 100, 128) # Modal Green
# get text sizes
sizes = [cv2.getTextSize(line, font, font_scale, thickness)[0] for line in lines]
if not sizes:
return
# position text in bottom right
pos_xs = [img.shape[1] - size[0] - margin for size in sizes]
pos_ys = [img.shape[0] - margin]
for _width, height in reversed(sizes[:-1]):
next_pos = pos_ys[-1] - 2 * height
pos_ys.append(next_pos)
for line, pos in zip(lines, zip(pos_xs, reversed(pos_ys))):
cv2.putText(img, line, pos, font, font_scale, color, thickness)
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal serve 07_web_endpoints/fastrtc_flip_webcam.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/protein-folding/esm3.py)
Build a protein folding dashboard with ESM3, Molstar, and Gradio
================================================================
![Image of dashboard UI for ESM3 protein folding](https://modal-cdn.com/example-esm3-ui.png)
There are perhaps a quadrillion distinct proteins on the planet Earth,
each one a marvel of nanotechnology discovered by painstaking evolution.
We know the amino acid sequence of nearly a billion but we only
know the three-dimensional structure of a few hundred thousand,
gathered by slow, difficult observational methods like X-ray crystallography.
Built upon this data are machine learning models like
EvolutionaryScale’s [ESM3](https://www.evolutionaryscale.ai/blog/esm3-release) that can predict the structure of any sequence in seconds.
In this example, we’ll show how you can use Modal to not
just run the latest protein-folding model but also build tools around it for
you and your team of scientists to understand and analyze the results.
Basic Setup
-----------
```
import base64
import io
from pathlib import Path
from typing import Optional
import modal
MINUTES = 60 # seconds
app = modal.App("example-esm3-dashboard")
```
### Create a Volume to store ESM3 model weights and Entrez sequence data
To minimize cold start times, we’ll store the ESM3 model weights on a Modal [Volume](../guide/volumes.html).
For patterns and best practices for storing model weights on Modal, see [this guide](../guide/model-weights.html).
We’ll use this same distributed storage primitive to store sequence data.
```
volume = modal.Volume.from_name("example-esm3-dashboard", create_if_missing=True)
VOLUME_PATH = Path("/vol")
MODELS_PATH = VOLUME_PATH / "models"
DATA_PATH = VOLUME_PATH / "data"
```
### Define dependencies in container images
The container image for structure inference is based on Modal’s default slim Debian
Linux image with `esm` for loading and running the model, `gemmi` for
managing protein structure file conversions, and `hf_transfer` for faster downloading of the model weights from Hugging Face.
```
esm3_image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install(
"esm==3.1.1",
"torch==2.4.1",
"gemmi==0.7.0",
"huggingface_hub[hf_transfer]==0.26.2",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HOME": str(MODELS_PATH)})
)
```
We’ll also define a separate image, with different dependencies,
for the part of our app that hosts the dashboard.
This helps reduce the complexity of Python dependency management
by “walling off” the different parts, e.g. separating
functions that depend on finicky ML packages
from those that depend on pedantic web packages.
Dependencies include `gradio` for building a web UI in Python and `biotite` for extracting sequences from UniProt accession numbers.
You can read more about how to configure container images on Modal in [this guide](../guide/images.html).
```
web_app_image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install("gradio~=4.44.0", "biotite==0.41.2", "fastapi[standard]==0.115.4")
.add_local_dir(Path(__file__).parent / "frontend", remote_path="/assets")
)
```
Here we “pre-import” libraries that will be used by the functions we run
on Modal in a given image using the `with image.imports` context manager.
```
with esm3_image.imports():
import tempfile
import gemmi
import torch
from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, GenerationConfig
with web_app_image.imports():
import biotite.database.entrez as entrez
import biotite.sequence.io.fasta as fasta
from fastapi import FastAPI
```
Define a `Model` inference class for ESM3
-----------------------------------------
Next, we map the model’s setup and inference code onto Modal.
1. For setup code that only needs to run once, we put it in a method
decorated with `@enter`, which runs on container start. For details,
see [this guide](../guide/cold-start.html).
2. The rest of the inference code goes in a method decorated with `@method`.
3. We accelerate the compute-intensive inference with a GPU, specifically an A10G.
For more on using GPUs on Modal, see [this guide](../guide/gpu.html).
```
@app.cls(
image=esm3_image,
volumes={VOLUME_PATH: volume},
secrets=[modal.Secret.from_name("huggingface-secret")],
gpu="A10G",
timeout=20 * MINUTES,
)
class Model:
@modal.enter()
def enter(self):
self.model = ESM3.from_pretrained("esm3_sm_open_v1")
self.model.to("cuda")
print("using half precision and tensor cores for fast ESM3 inference")
self.model = self.model.half()
torch.backends.cuda.matmul.allow_tf32 = True
self.max_steps = 250
print(f"setting max ESM steps to: {self.max_steps}")
def convert_protein_to_MMCIF(self, esm_protein, output_path):
structure = gemmi.read_pdb_string(esm_protein.to_pdb_string())
doc = structure.make_mmcif_document()
doc.write_file(str(output_path), gemmi.cif.WriteOptions())
def get_generation_config(self, num_steps):
return GenerationConfig(track="structure", num_steps=num_steps)
@modal.method()
def inference(self, sequence: str):
num_steps = min(len(sequence), self.max_steps)
print(f"running ESM3 inference with num_steps={num_steps}")
esm_protein = self.model.generate(
ESMProtein(sequence=sequence), self.get_generation_config(num_steps)
)
print("checking for errors in output")
if hasattr(esm_protein, "error_msg"):
raise ValueError(esm_protein.error_msg)
print("converting ESMProtein into MMCIF file")
save_path = Path(tempfile.mktemp() + ".mmcif")
self.convert_protein_to_MMCIF(esm_protein, save_path)
print("returning MMCIF bytes")
return io.BytesIO(save_path.read_bytes())
```
Serve a dashboard as an `asgi_app`
----------------------------------
In this section we’ll create a web interface around the ESM3 model
that can help scientists and stakeholders understand and interrogate the results of the model.
You can deploy this UI, along with the backing inference endpoint,
with the following command:
```
modal deploy esm3.py
```
### Integrating Modal Functions
The integration between our dashboard and our inference backend
is made simple by the Modal SDK:
because the definition of the `Model` class is available in the same Python
context as the definition of the web UI,
we can instantiate an instance and call its methods with `.remote`.
The inference runs in a GPU-accelerated container with all of ESM3’s
dependencies, while this code executes in a CPU-only container
with only our web dependencies.
```
def run_esm(sequence: str) -> str:
sequence = sequence.strip()
print("running ESM")
mmcif_buffer = Model().inference.remote(sequence)
print("converting mmCIF bytes to base64 for compatibility with HTML")
mmcif_content = mmcif_buffer.read().decode()
mmcif_base64 = base64.b64encode(mmcif_content.encode()).decode()
return get_molstar_html(mmcif_base64)
```
### Building a UI in Python with Gradio
We’ll visualize the results using [Mol\*](https://molstar.org/).
Mol\* (pronounced “molstar”) is an open-source toolkit for
visualizing and analyzing large-scale molecular data, including secondary structures
and residue-specific positions of proteins.
Second, we’ll create links to look up the metadata and structure of known
proteins using the [Universal Protein Resource](https://www.uniprot.org/) database from the UniProt consortium which is supported by the European
Bioinformatics Institute, the National Human Genome Research
Institute, and the Swiss Institute of Bioinformatics. UniProt
is also a hub that links to many other databases, like the RCSB Protein
Data Bank.
To pull sequence data, we’ll use the [Biotite](https://www.biotite-python.org/) library to pull [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from
UniProt which contain labelled sequences.
You should see the URL for this UI in the output of `modal deploy` or on your [Modal app dashboard](https://modal.com/apps) for this app.
```
@app.function(
image=web_app_image,
volumes={VOLUME_PATH: volume},
max_containers=1, # Gradio requires sticky sessions
)
@modal.concurrent(max_inputs=1000) # Gradio can handle many async inputs
@modal.asgi_app()
def ui():
import gradio as gr
from fastapi.responses import FileResponse
from gradio.routes import mount_gradio_app
web_app = FastAPI()
# custom styles: an icon, a background, and some CSS
@web_app.get("/favicon.ico", include_in_schema=False)
async def favicon():
return FileResponse("/assets/favicon.svg")
@web_app.get("/assets/background.svg", include_in_schema=False)
async def background():
return FileResponse("/assets/background.svg")
css = Path("/assets/index.css").read_text()
theme = gr.themes.Default(
primary_hue="green", secondary_hue="emerald", neutral_hue="neutral"
)
title = "Predict & Visualize Protein Structures"
with gr.Blocks(theme=theme, css=css, title=title, js=always_dark()) as interface:
gr.Markdown(f"# {title}")
with gr.Row():
with gr.Column():
gr.Markdown("## Enter UniProt ID ")
uniprot_num_box = gr.Textbox(
label="Enter UniProt ID or select one on the right",
placeholder="e.g. P02768, P69905, etc.",
)
get_sequence_button = gr.Button(
"Retrieve Sequence from UniProt ID", variant="primary"
)
uniprot_link_button = gr.Button(value="View protein on UniProt website")
uniprot_link_button.click(
fn=None,
inputs=uniprot_num_box,
js=get_js_for_uniprot_link(),
)
with gr.Column():
example_uniprots = get_uniprot_examples()
def extract_uniprot_num(example_idx):
uniprot = example_uniprots[example_idx]
return uniprot[uniprot.index("[") + 1 : uniprot.index("]")]
gr.Markdown("## Example UniProt Accession Numbers")
with gr.Row():
half_len = int(len(example_uniprots) / 2)
with gr.Column():
for i, uniprot in enumerate(example_uniprots[:half_len]):
btn = gr.Button(uniprot, variant="secondary")
btn.click(
fn=lambda j=i: extract_uniprot_num(j),
outputs=uniprot_num_box,
)
with gr.Column():
for i, uniprot in enumerate(example_uniprots[half_len:]):
btn = gr.Button(uniprot, variant="secondary")
btn.click(
fn=lambda j=i + half_len: extract_uniprot_num(j),
outputs=uniprot_num_box,
)
gr.Markdown("## Enter Sequence")
sequence_box = gr.Textbox(
label="Enter a sequence or retrieve it from a UniProt ID",
placeholder="e.g. MVTRLE..., PVTTIMHALL..., etc.",
)
get_sequence_button.click(
fn=get_sequence, inputs=[uniprot_num_box], outputs=[sequence_box]
)
run_esm_button = gr.Button("Run ESM3 Folding", variant="primary")
gr.Markdown("## ESM3 Predicted Structure")
molstar_html = gr.HTML()
run_esm_button.click(fn=run_esm, inputs=sequence_box, outputs=molstar_html)
# return a FastAPI app for Modal to serve
return mount_gradio_app(app=web_app, blocks=interface, path="/")
```
Folding from the command line
-----------------------------
If you want to quickly run the ESM3 model without the web interface, you can
run it from the command line like this:
```
modal run esm3
```
This will run the same inference code above on Modal. The results are
returned in the [Crystallographic Information File](https://en.wikipedia.org/wiki/Crystallographic_Information_File) format, which you can render with the online [Molstar Viewer](https://molstar.org/viewer/).
```
@app.local_entrypoint()
def main(sequence: Optional[str] = None, output_dir: Optional[str] = None):
if sequence is None:
print("using sequence for insulin [P01308]")
sequence = "MRTPMLLALLALATLCLAGRADAKPGDAESGKGAAFVSKQEGSEVVKRLRRYLDHWLGAPAPYPDPLEPKREVCELNPDCDELADHIGFQEAYRRFYGPV"
if output_dir is None:
output_dir = Path("/tmp/esm3")
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / "output.mmcif"
print("starting inference on Modal")
results_buffer = Model().inference.remote(sequence)
print(f"writing results to {output_path}")
output_path.write_bytes(results_buffer.read())
```
Addenda
-------
The remainder of this code is boilerplate.
### Extracting Sequences from UniProt Accession Numbers
To retrieve sequence information, we’ll use the `biotite` library, which
will allow us to fetch [fasta](https://en.wikipedia.org/wiki/FASTA_format) sequence files from the [National Center for Biotechnology Information (NCBI) Entrez database](https://www.ncbi.nlm.nih.gov/Web/Search/entrezfs.html).
```
def get_sequence(uniprot_num: str) -> str:
try:
DATA_PATH.mkdir(parents=True, exist_ok=True)
uniprot_num = uniprot_num.strip()
fasta_path = DATA_PATH / f"{uniprot_num}.fasta"
print(f"Fetching {fasta_path} from the entrez database")
entrez.fetch_single_file(
uniprot_num, fasta_path, db_name="protein", ret_type="fasta"
)
fasta_file = fasta.FastaFile.read(fasta_path)
protein_sequence = fasta.get_sequence(fasta_file)
return str(protein_sequence)
except Exception as e:
return f"Error: {e}"
```
### Supporting functions for the Gradio app
The following Python code is used to enhance the Gradio app,
mostly by generating some extra HTML & JS and handling styling.
```
def get_js_for_uniprot_link():
url = "https://www.uniprot.org/uniprotkb/"
end = "/entry#structure"
return f"""(uni_id) => {{ if (!uni_id) return; window.open("{url}" + uni_id + "{end}"); }}"""
def get_molstar_html(mmcif_base64):
return f"""
<iframe
id="molstar_frame"
style="width: 100%; height: 600px; border: none;"
srcdoc='
<!DOCTYPE html>
<html>
<head>
<script src="https://cdn.jsdelivr.net/npm/@rcsb/rcsb-molstar/build/dist/viewer/rcsb-molstar.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@rcsb/rcsb-molstar/build/dist/viewer/rcsb-molstar.css">
</head>
<body>
<div id="protein-viewer" style="width: 1200px; height: 400px; position: center"></div>
<script>
console.log("Initializing viewer...");
(async function() {{
// Create plugin instance
const viewer = new rcsbMolstar.Viewer("protein-viewer");
// CIF data in base64
const mmcifData = "{mmcif_base64}";
// Convert base64 to blob
const blob = new Blob(
[atob(mmcifData)],
{{ type: "text/plain" }}
);
// Create object URL
const url = URL.createObjectURL(blob);
try {{
// Load structure
await viewer.loadStructureFromUrl(url, "mmcif");
}} catch (error) {{
console.error("Error loading structure:", error);
}}
}})();
</script>
</body>
</html>
'>
</iframe>"""
def get_uniprot_examples():
return [
"Albumin [P02768]",
"Insulin [P01308]",
"Hemoglobin [P69905]",
"Lysozyme [P61626]",
"BRCA1 [P38398]",
"Immunoglobulin [P01857]",
"Actin [P60709]",
"Ribonuclease [P07998]",
]
def always_dark():
return """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/protein-folding/esm3.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/s3_bucket_mount.py)
Analyze NYC yellow taxi data with DuckDB on Parquet files from S3
=================================================================
This example shows how to use Modal for a classic data science task: loading table-structured data into cloud stores,
analyzing it, and plotting the results.
In particular, we’ll load public NYC taxi ride data into S3 as Parquet files,
then run SQL queries on it with DuckDB.
We’ll mount the S3 bucket in a Modal app with [`CloudBucketMount`](../reference/modal.CloudBucketMount.html).
We will write to and then read from that bucket, in each case using
Modal’s [parallel execution features](../guide/scale.html) to handle many files at once.
Basic setup
-----------
You will need to have an S3 bucket and AWS credentials to run this example. Refer to the documentation
for the exact [IAM permissions](../guide/cloud-bucket-mounts.html#iam-permissions) your credentials will need.
After you are done creating a bucket and configuring IAM settings,
create a [`Secret`](../guide/secrets.html) to share
the relevant AWS credentials with your Modal apps.
```
from datetime import datetime
from pathlib import Path, PosixPath
import modal
image = modal.Image.debian_slim(python_version="3.12").pip_install(
"requests==2.31.0", "duckdb==0.10.0", "matplotlib==3.8.3"
)
app = modal.App(image=image)
secret = modal.Secret.from_name(
"s3-bucket-secret",
required_keys=["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
)
MOUNT_PATH = PosixPath("/bucket")
YELLOW_TAXI_DATA_PATH = MOUNT_PATH / "yellow_taxi"
```
The dependencies installed above are not available locally. The following block instructs Modal
to only import them inside the container.
```
with image.imports():
import duckdb
import requests
```
Download New York City’s taxi data
----------------------------------
NYC makes data about taxi rides publicly available. The city’s [Taxi & Limousine Commission (TLC)](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) publishes files in the Parquet format. Files are organized by year and month.
We are going to download all available files and store them in an S3 bucket. We do this by
attaching a `modal.CloudBucketMount` with the S3 bucket name and its respective credentials.
The files in the bucket will then be available at `MOUNT_PATH`.
As we’ll see below, this operation can be massively sped up by running it in parallel on Modal.
```
@app.function(
volumes={
MOUNT_PATH: modal.CloudBucketMount("modal-s3mount-test-bucket", secret=secret),
},
)
def download_data(year: int, month: int) -> str:
filename = f"yellow_tripdata_{year}-{month:02d}.parquet"
url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{filename}"
s3_path = MOUNT_PATH / filename
# Skip downloading if file exists.
if not s3_path.exists():
if not YELLOW_TAXI_DATA_PATH.exists():
YELLOW_TAXI_DATA_PATH.mkdir(parents=True, exist_ok=True)
with requests.get(url, stream=True) as r:
r.raise_for_status()
print(f"downloading => {s3_path}")
# It looks like we're writing locally, but this is actually writing to S3!
with open(s3_path, "wb") as file:
for chunk in r.iter_content(chunk_size=8192):
file.write(chunk)
return s3_path.as_posix()
```
Analyze data with DuckDB
------------------------
[DuckDB](https://duckdb.org/) is an analytical database with rich support for Parquet files.
It is also very fast. Below, we define a Modal Function that aggregates yellow taxi trips
within a month (each file contains all the rides from a specific month).
```
@app.function(
volumes={
MOUNT_PATH: modal.CloudBucketMount(
"modal-s3mount-test-bucket",
secret=modal.Secret.from_name("s3-bucket-secret"),
)
},
)
def aggregate_data(path: str) -> list[tuple[datetime, int]]:
print(f"processing => {path}")
# Parse file.
year_month_part = path.split("yellow_tripdata_")[1]
year, month = year_month_part.split("-")
month = month.replace(".parquet", "")
# Make DuckDB query using in-memory storage.
con = duckdb.connect(database=":memory:")
q = """
with sub as (
select tpep_pickup_datetime::date d, count(1) c
from read_parquet(?)
group by 1
)
select d, c from sub
where date_part('year', d) = ? -- filter out garbage
and date_part('month', d) = ? -- same
"""
con.execute(q, (path, year, month))
return list(con.fetchall())
```
Plot daily taxi rides
---------------------
Finally, we want to plot our results.
The plot created shows the number of yellow taxi rides per day in NYC.
This function runs remotely, on Modal, so we don’t need to install plotting libraries locally.
```
@app.function()
def plot(dataset) -> bytes:
import io
import matplotlib.pyplot as plt
# Sorting data by date
dataset.sort(key=lambda x: x[0])
# Unpacking dates and values
dates, values = zip(*dataset)
# Plotting
plt.figure(figsize=(10, 6))
plt.plot(dates, values)
plt.title("Number of NYC yellow taxi trips by weekday, 2018-2023")
plt.ylabel("Number of daily trips")
plt.grid(True)
plt.tight_layout()
# Saving plot as raw bytes to send back
buf = io.BytesIO()
plt.savefig(buf, format="png")
buf.seek(0)
return buf.getvalue()
```
Run everything
--------------
The `@app.local_entrypoint()` defines what happens when we run our Modal program locally.
We invoke it from the CLI by calling `modal run s3_bucket_mount.py`.
We first call `download_data()` with `starmap` (named because it’s kind of like `map(*args)`)
on tuples of inputs `(year, month)`. This will download, in parallel,
all yellow taxi data files into our locally mounted S3 bucket and return a list of
Parquet file paths. Then, we call `aggregate_data()` with `map` on that list. These files are
also read from our S3 bucket. So one function writes files to S3 and the other
reads them back in; both run across many files in parallel.
Finally, we call `plot` to generate the following figure:
![Number of NYC yellow taxi trips by weekday, 2018-2023](../../_app/immutable/assets/nyc_yellow_taxi_trips_s3_mount.DW1A9-sb.png)
This program should run in less than 30 seconds.
```
@app.local_entrypoint()
def main():
# List of tuples[year, month].
inputs = [(year, month) for year in range(2018, 2023) for month in range(1, 13)]
# List of file paths in S3.
parquet_files: list[str] = []
for path in download_data.starmap(inputs):
print(f"done => {path}")
parquet_files.append(path)
# List of datetimes and number of yellow taxi trips.
dataset = []
for r in aggregate_data.map(parquet_files):
dataset += r
dir = Path("/tmp") / "s3_bucket_mount"
if not dir.exists():
dir.mkdir(exist_ok=True, parents=True)
figure = plot.remote(dataset)
path = dir / "nyc_yellow_taxi_trips_s3_mount.png"
with open(path, "wb") as file:
print(f"Saving figure to {path}")
file.write(figure)
```
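As an aside on the `starmap`/`map` distinction used above, here is a toy sketch (the `add` Function and app name are illustrative, not part of this example): `map` takes an iterable of single arguments, while `starmap` takes an iterable of argument tuples that are unpacked into each call.
```
import modal

app = modal.App("example-starmap-demo")  # toy app, separate from the taxi example

@app.function()
def add(x: int, y: int = 0) -> int:
    return x + y

@app.local_entrypoint()
def main():
    # map: each element is passed as the single positional argument
    print(list(add.map([1, 2, 3])))  # add(1), add(2), add(3) -> [1, 2, 3]
    # starmap: each element is a tuple, unpacked into the arguments
    print(list(add.starmap([(1, 10), (2, 20), (3, 30)])))  # add(1, 10), ... -> [11, 22, 33]
```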
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 10_integrations/s3_bucket_mount.py
```
Using MongoDB Atlas Vector and GeoJSON Search with Modal
========================================================
This [example repo](https://github.com/modal-labs/search-california) demonstrates how to use Modal and MongoDB together
to build a full-stack application.
The application is a hybrid search engine,
like the retrieval engines that power RAG chatbots,
but for satellite images of the state of California.
Images can be searched based on their
geospatial and temporal metadata or based on their semantic content
as captured by a pre-trained embedding model.
We use the [Clay foundation model](https://clay-foundation.github.io/model/index.html) for embeddings and we source the images from the European Space Agency’s [Sentinel satellites](https://www.esa.int/Applications/Observing_the_Earth/Copernicus/The_Sentinel_missions).
You can take our deployment of the application for a spin [here](https://modal-labs-examples--clay-hybrid-search.modal.run/).
Overview
--------
At the center of the application is a MongoDB Atlas instance
that stores metadata for a collection of satellite images.
Modal orchestrates the compute around that database:
retrieving data from elsewhere and storing it in the database,
computing vector embeddings for the data in the database,
and serving both a frontend and a client.
The dataflow looks something like this:
1. Every few days, the European Space Agency’s [Sentinel Satellites](https://www.esa.int/Applications/Observing_the_Earth/Copernicus/The_Sentinel_missions) complete a full pass over the entire Earth, including California.
The images are made available via a [public STAC API](https://element84.com/geospatial/introducing-earth-search-v1-new-datasets-now-available/).
2. Every day, we run a job on Modal that queries that STAC API
for new images of California and stores the metadata in a MongoDB Atlas
database instance (a minimal scheduling sketch appears after this list).
3. Asynchronously, we run a job on Modal to check which entries
in the database don’t have an associated embedding.
These images are then sent to a serverless embedding service
running on Modal. We send the resulting embeddings to the database.
4. We host a database client on Modal that allows the application’s
developers to manipulate the data. This client is also used by two
web endpoints for vector and geospatial search queries powered by
Atlas Search.
5. Finally, we run a simple static FastAPI server on Modal that serves
an Alpine JS frontend for executing those queries and rendering their results.
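For step 2, a minimal sketch of how such a daily job is expressed on Modal (the app and function names below are illustrative, and the body is elided):
```
import modal

app = modal.App("example-stac-ingest-sketch")

@app.function(schedule=modal.Period(days=1))  # runs once per day after `modal deploy`
def ingest_new_images():
    # query the public STAC API for new Sentinel scenes over the AOI
    # and upsert their metadata into MongoDB Atlas (details live in backend/extract.py)
    ...
```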
This entire application —
from API queries and frontend UI to GPU inference and hybrid search —
is delivered using nothing but Modal and MongoDB Atlas.
Setting it up for yourself requires only credentials on these platforms
and a few commands, detailed below.
Deploying the Backend
---------------------
### Setup: Modal and MongoDB Atlas
You’ll need a Python environment on your local machine.
Any recent version of Python should do.
Most of the dependencies will be installed in container environments on Modal,
so you don’t need to worry much about your local environment.
Follow the instructions [here](../guide.html#getting-started) to set up your Modal account.
The $30/month of compute included in Modal’s free tier is
more than enough to deploy and host this example.
You’ll also need an account on MongoDB Atlas.
You can find instructions [here](https://www.mongodb.com/docs/atlas/getting-started/).
We prefer the UI, rather than the CLI, for setup.
The free tier is more than sufficient to run this example.
You’ll want to create a database called `modal-examples`.
Make sure it’s accessible from [all IP addresses](https://stackoverflow.com/questions/66035947/allow-access-from-anywhere-mongodb-atlas).
In the process, you will create a database user with a password.
Navigate to the Modal Secrets dashboard [here](../../login%EF%B9%96next=%EA%A4%B7secrets.html) and add this information, as well as the connection string for your database,
to a Modal Secret based on the MongoDB template available in the dashboard.
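As a rough, standalone sketch of the kind of connectivity check the repo’s `MongoClient.ping` performs: a Modal Function can read the credentials from the Secret’s environment variables and connect with `pymongo`. The Secret name `mongodb-secret` and the `MONGODB_URI` key below are assumptions, not necessarily what the template provides.
```
import os

import modal

image = modal.Image.debian_slim().pip_install("pymongo")
app = modal.App("example-atlas-ping-sketch", image=image)

@app.function(secrets=[modal.Secret.from_name("mongodb-secret")])  # assumed Secret name
def ping():
    from pymongo import MongoClient

    client = MongoClient(os.environ["MONGODB_URI"])  # assumed key exposed by the Secret
    client.admin.command("ping")  # raises if the cluster is unreachable
    print("connected to MongoDB Atlas")
```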
### MongoDB Client (`database.py`)
If your Modal Secret and MongoDB Atlas instance are set up correctly,
you should be able to run the following command:
```
modal run -m backend.database::MongoClient.ping
```
Once that command is working, you can start manipulating the database
from Modal.
To start, you’ll want to add an Area of Interest (AOI) to the database:
```
modal run -m backend.database --action add_aoi
```
By default, it’s the state of California as defined by the GeoJSON
in this repository’s `data` folder (originally retrieved from [the `geojsonio` GitHub repository](https://github.com/ropensci/geojsonio/blob/7e4cc683ed3d6eec38a8cae5ce03fa6d82acafc7/inst/examples/california.geojson)).
You can pass a different GeoJSON file to the `add_aoi` action
with the `--target` flag.
The `modal run` command is used for one-off tasks.
To deploy the database client for use in other parts of the app
along with the webhooks that anyone can use to run search queries,
we use `modal deploy`:
```
modal deploy -m backend.database
```
Those webhooks come with interactive OpenAPI docs,
which you can access by navigating to the `/docs` route of the deployment’s URL.
You should see that URL in the terminal output.
You can also find the URL in the app’s [Modal dashboard](../../login%EF%B9%96next=%EA%A4%B7apps.html).
For our deployment, the URL for the interactive docs for the geographic
search endpoint is [`https://modal-labs-examples--clay-mongo-client-geo-search.modal.run/docs`](https://modal-labs-examples--clay-mongo-client-geo-search.modal.run/docs).
If you haven’t yet run the backfill jobs for your database instance,
as described below, this search will not return any results,
but you can use it to check that the database client is deployed.
### Backfill and Updates (`extract.py`)
We add data to the database by querying the Sentinel STAC API for images.
Run the following command to search for images in the AOI
from the preceding week and add them to the database:
```
modal run -m backend.extract
```
You can either check the results via the Atlas UI
or by executing a search query in the database client’s geo search webhook,
as described above.
To regularly update the database with new images,
we deploy the app defined in `extract.py`:
```
modal deploy -m backend.extract
```
This app also runs a regular job to add embeddings to the images
in the database.
But it doesn’t compute the embeddings itself —
embeddings are provided by a separate service,
which is described next.
### Clay Embeddings Service (`embeddings.py`)
To build the environment for the embeddings service
and to test the embedding engine on some sample data,
execute the following command:
```
modal run -m backend.embeddings
```
To deploy this on Modal, we again use `modal deploy`:
```
modal deploy -m backend.embeddings
```
### Putting It All Together
Now that the embedding service is deployed,
we can add vectors by invoking the `enrich_vectors` function in `extract` with `modal run`:
```
modal run -m backend.extract::enrich_vectors
```
This command will ensure all the images in the database have embeddings.
You should be able to observe them on records viewed via the Atlas UI
or by executing a search query via the database client’s geo search webhook,
as described previously.
To use the embeddings for search, we recommend running the frontend UI,
which we walk through next.
Deploying the Frontend
----------------------
The frontend is much simpler than the backend.
It comprises a small Alpine JS app and a FastAPI Python server
to deliver it to client browsers.
You can play with our deployment of the frontend [here](https://modal-labs-examples--clay-hybrid-search.modal.run/).
### Alpine App (`app.js`)
The Alpine app provides a basic interface for constructing geo search queries
by clicking on a map and viewing results.
Clicking on the returned images triggers a vector search for similar images.
Images can be furthermore filtered by date using the date pickers.
### FastAPI Server (`serve.py`)
This app is served to the client by a FastAPI server.
To deploy it, run the following command:
```
modal deploy -m frontend
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/image-to-video/image_to_video.py)
Animate images with Lightricks LTX-Video via CLI, API, and web UI
=================================================================
This example shows how to run [LTX-Video](https://huggingface.co/Lightricks/LTX-Video) on Modal
to generate videos from your local command line, via an API, and in a web UI.
Generating a 5 second video takes ~1 minute from cold start.
Once the container is warm, a 5 second video takes ~15 seconds.
Here is a sample we generated:
[](https://modal-cdn.com/example_image_to_video.mp4)
Basic setup
-----------
```
import io
import random
import time
from pathlib import Path
from typing import Annotated, Optional
import fastapi
import modal
```
All Modal programs need an [`App`](../reference/modal.App.html) —
an object that acts as a recipe for the application.
```
app = modal.App("example-image-to-video")
```
### Configuring dependencies
The model runs remotely, on Modal’s cloud, which means we need to [define the environment it runs in](../guide/images.html).
Below, we start from a lightweight base Linux image
and then install our system and Python dependencies,
like Hugging Face’s `diffusers` library and `torch`.
```
image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("python3-opencv")
.pip_install(
"accelerate==1.4.0",
"diffusers==0.32.2",
"fastapi[standard]==0.115.8",
"huggingface-hub[hf_transfer]==0.29.1",
"imageio==2.37.0",
"imageio-ffmpeg==0.6.0",
"opencv-python==4.11.0.86",
"pillow==11.1.0",
"sentencepiece==0.2.0",
"torch==2.6.0",
"torchvision==0.21.0",
"transformers==4.49.0",
)
)
```
Storing model weights on Modal
------------------------------
We also need the parameters of the model remotely.
They can be loaded at runtime from Hugging Face,
based on a repository ID and a revision (aka a commit SHA).
```
MODEL_ID = "Lightricks/LTX-Video"
MODEL_REVISION_ID = "a6d59ee37c13c58261aa79027d3e41cd41960925"
```
Hugging Face will also cache the weights to disk once they’re downloaded.
But Modal Functions are serverless, and so even disks are ephemeral,
which means the weights would get re-downloaded every time we spin up a new instance.
We can fix this — without any modifications to Hugging Face’s model loading code! —
by pointing the Hugging Face cache at a [Modal Volume](../guide/volumes.html).
```
model_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
MODEL_PATH = "/models" # where the Volume will appear on our Functions' filesystems
image = image.env(
{
"HF_HUB_ENABLE_HF_TRANSFER": "1", # faster downloads
"HF_HUB_CACHE": MODEL_PATH,
}
)
```
Storing model outputs on Modal
------------------------------
Contemporary video models can take a long time to run and they produce large outputs.
That makes them a great candidate for storage on Modal Volumes as well.
Python code running outside of Modal can also access this storage, as we’ll see below.
```
OUTPUT_PATH = "/outputs"
output_volume = modal.Volume.from_name("outputs", create_if_missing=True)
```
Implementing LTX-Video inference on Modal
-----------------------------------------
We wrap the inference logic in a Modal [Cls](../guide/lifecycle-functions.html) that ensures models are loaded and then moved to the GPU once when a new instance
starts, rather than every time we run it.
The `run` function just wraps a `diffusers` pipeline.
It saves the generated video to a Modal Volume, and returns the filename.
We also include a `web` wrapper that makes it possible
to trigger inference via an API call.
For details, see the `/docs` route of the URL ending in `inference-web.modal.run` that appears when you deploy the app.
```
with image.imports(): # loaded on all of our remote Functions
import diffusers
import torch
from PIL import Image
MINUTES = 60
@app.cls(
image=image,
gpu="H100",
timeout=10 * MINUTES,
scaledown_window=10 * MINUTES,
volumes={MODEL_PATH: model_volume, OUTPUT_PATH: output_volume},
)
class Inference:
@modal.enter()
def load_pipeline(self):
self.pipe = diffusers.LTXImageToVideoPipeline.from_pretrained(
MODEL_ID,
revision=MODEL_REVISION_ID,
torch_dtype=torch.bfloat16,
).to("cuda")
@modal.method()
def run(
self,
image_bytes: bytes,
prompt: str,
negative_prompt: Optional[str] = None,
num_frames: Optional[int] = None,
num_inference_steps: Optional[int] = None,
seed: Optional[int] = None,
) -> str:
negative_prompt = (
negative_prompt
or "worst quality, inconsistent motion, blurry, jittery, distorted"
)
width = 768
height = 512
num_frames = num_frames or 25
num_inference_steps = num_inference_steps or 50
seed = seed or random.randint(0, 2**32 - 1)
print(f"Seeding RNG with: {seed}")
torch.manual_seed(seed)
image = diffusers.utils.load_image(Image.open(io.BytesIO(image_bytes)))
video = self.pipe(
image=image,
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=num_frames,
num_inference_steps=num_inference_steps,
).frames[0]
mp4_name = slugify(prompt)
diffusers.utils.export_to_video(
video, f"{Path(OUTPUT_PATH) / mp4_name}", fps=24
)
output_volume.commit()
torch.cuda.empty_cache() # reduce fragmentation
return mp4_name
@modal.fastapi_endpoint(method="POST", docs=True)
def web(
self,
image_bytes: Annotated[bytes, fastapi.File()],
prompt: str,
negative_prompt: Optional[str] = None,
num_frames: Optional[int] = None,
num_inference_steps: Optional[int] = None,
seed: Optional[int] = None,
) -> fastapi.Response:
mp4_name = self.run.local( # run in the same container
image_bytes=image_bytes,
prompt=prompt,
negative_prompt=negative_prompt,
num_frames=num_frames,
num_inference_steps=num_inference_steps,
seed=seed,
)
return fastapi.responses.FileResponse(
path=f"{Path(OUTPUT_PATH) / mp4_name}",
media_type="video/mp4",
filename=mp4_name,
)
```
Generating videos from the command line
---------------------------------------
We add a [local entrypoint](../reference/modal.App.html#local_entrypoint) that calls the `Inference.run` method to run inference from the command line.
The function’s parameters are automatically turned into a CLI.
Run it with
```
modal run image_to_video.py --prompt "A cat looking out the window at a snowy mountain" --image-path /path/to/cat.jpg
```
You can also pass `--help` to see the full list of arguments.
```
@app.local_entrypoint()
def entrypoint(
image_path: str,
prompt: str,
negative_prompt: Optional[str] = None,
num_frames: Optional[int] = None,
num_inference_steps: Optional[int] = None,
seed: Optional[int] = None,
twice: bool = True,
):
import os
import urllib.request
print(f"🎥 Generating a video from the image at {image_path}")
print(f"🎥 using the prompt {prompt}")
if image_path.startswith(("http://", "https://")):
image_bytes = urllib.request.urlopen(image_path).read()
elif os.path.isfile(image_path):
image_bytes = Path(image_path).read_bytes()
else:
raise ValueError(f"{image_path} is not a valid file or URL.")
inference_service = Inference()
for _ in range(1 + twice):
start = time.time()
mp4_name = inference_service.run.remote(
image_bytes=image_bytes,
prompt=prompt,
negative_prompt=negative_prompt,
num_frames=num_frames,
seed=seed,
)
duration = time.time() - start
print(f"🎥 Generated video in {duration:.3f}s")
output_dir = Path("/tmp/image_to_video")
output_dir.mkdir(exist_ok=True, parents=True)
output_path = output_dir / mp4_name
# read in the file from the Modal Volume, then write it to the local disk
output_path.write_bytes(b"".join(output_volume.read_file(mp4_name)))
print(f"🎥 Video saved to {output_path}")
```
Generating videos via an API
----------------------------
The Modal `Cls` above also included a [`fastapi_endpoint`](basic_web.html),
which adds a simple web API to the inference method.
To try it out, run
```
modal deploy image_to_video.py
```
copy the printed URL ending in `inference-web.modal.run`,
and add `/docs` to the end. This will bring up the interactive
Swagger/OpenAPI docs for the endpoint.
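For example, here is a rough sketch of calling the endpoint with `requests`. The URL below is a placeholder for the one printed at deploy time; following the `web` signature above, `image_bytes` should be uploaded as a multipart file and the remaining parameters sent as query parameters.
```
from pathlib import Path

import requests

url = "https://your-workspace--example-image-to-video-inference-web.modal.run"  # placeholder

with open("cat.jpg", "rb") as f:
    response = requests.post(
        url,
        params={"prompt": "A cat looking out the window at a snowy mountain"},
        files={"image_bytes": f},  # FastAPI File() parameters expect a multipart upload
        timeout=10 * 60,  # generation can take a while on a cold start
    )
response.raise_for_status()
Path("output.mp4").write_bytes(response.content)  # the endpoint streams back the MP4
```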
Generating videos in a web UI
-----------------------------
Lastly, we add a simple front-end web UI (written in Alpine.js) for
our image to video backend.
This is also deployed when you run
```
modal deploy image_to_video.py
```
The `Inference` class will serve multiple users from its own auto-scaling pool of warm GPU containers automatically,
and they will spin down when there are no requests.
```
frontend_path = Path(__file__).parent / "frontend"
web_image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install("jinja2==3.1.5", "fastapi[standard]==0.115.8")
.add_local_dir( # mount frontend/client code
frontend_path, remote_path="/assets"
)
)
@app.function(image=web_image)
@modal.concurrent(max_inputs=1000)
@modal.asgi_app()
def ui():
import fastapi.staticfiles
import fastapi.templating
web_app = fastapi.FastAPI()
templates = fastapi.templating.Jinja2Templates(directory="/assets")
@web_app.get("/")
async def read_root(request: fastapi.Request):
return templates.TemplateResponse(
"index.html",
{
"request": request,
"inference_url": Inference().web.get_web_url(),
"model_name": "LTX-Video Image to Video",
"default_prompt": "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background.",
},
)
web_app.mount(
"/static",
fastapi.staticfiles.StaticFiles(directory="/assets"),
name="static",
)
return web_app
def slugify(s: str) -> str:
return f"{time.strftime('%Y%m%d_%H%M%S')}_{''.join(c if c.isalnum() else '-' for c in s[:100]).strip('-')}.mp4"
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/image-to-video/image_to_video.py --prompt 'A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background.' --image-path https\://modal-cdn.com/example_image_to_video_image.png
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/09_job_queues/doc_ocr_jobs.py)
Run a job queue for GOT-OCR
===========================
This tutorial shows you how to use Modal as an infinitely scalable job queue
that can service async tasks from a web app. For the purpose of this tutorial,
we’ve also built a [React + FastAPI web app on Modal](doc_ocr_webapp.html) that works together with it, but note that you don’t need a web app running on Modal
to use this pattern. You can submit async tasks to Modal from any Python
application (for example, a regular Django app running on Kubernetes).
Our job queue will handle a single task: running OCR transcription for images of receipts.
We’ll make use of a pre-trained model:
the [General OCR Theory (GOT) 2.0 model](https://huggingface.co/stepfun-ai/GOT-OCR2_0).
Try it out for yourself [here](https://modal-labs-examples--example-doc-ocr-webapp-wrapper.modal.run/).
[![Webapp frontend](https://modal-cdn.com/doc_ocr_frontend.jpg)](https://modal-labs-examples--example-doc-ocr-webapp-wrapper.modal.run/)
Define an App
-------------
Let’s first import `modal` and define an [`App`](../reference/modal.App.html).
Later, we’ll use the name provided for our `App` to find it from our web app and submit tasks to it.
```
from typing import Optional
import modal
app = modal.App("example-doc-ocr-jobs")
```
We also define the dependencies for our Function by specifying an [Image](../guide/images.html).
```
inference_image = modal.Image.debian_slim(python_version="3.12").pip_install(
"accelerate==0.28.0",
"huggingface_hub[hf_transfer]==0.27.1",
"numpy<2",
"tiktoken==0.6.0",
"torch==2.5.1",
"torchvision==0.20.1",
"transformers==4.48.0",
"verovio==4.3.1",
)
```
Cache the pre-trained model on a Modal Volume
---------------------------------------------
We can obtain the pre-trained model we want to run from Hugging Face
using its name and a revision identifier.
```
MODEL_NAME = "ucaslcl/GOT-OCR2_0"
MODEL_REVISION = "cf6b7386bc89a54f09785612ba74cb12de6fa17c"
```
The logic for loading the model based on this information
is encapsulated in the `setup` function below.
```
def setup():
import warnings
from transformers import AutoModel, AutoTokenizer
with warnings.catch_warnings(): # filter noisy warnings from GOT modeling code
warnings.simplefilter("ignore")
tokenizer = AutoTokenizer.from_pretrained(
MODEL_NAME, revision=MODEL_REVISION, trust_remote_code=True
)
model = AutoModel.from_pretrained(
MODEL_NAME,
revision=MODEL_REVISION,
trust_remote_code=True,
device_map="cuda",
use_safetensors=True,
pad_token_id=tokenizer.eos_token_id,
)
return tokenizer, model
```
The `.from_pretrained` methods from Hugging Face are smart enough
to only download models if they haven’t been downloaded before.
But in Modal’s serverless environment, filesystems are ephemeral,
and so using this code alone would mean that models need to get downloaded
on every request.
So instead, we create a Modal [Volume](../guide/volumes.html) to store the model — a durable filesystem that any Modal Function can access.
```
model_cache = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
```
We also update the environment variables for our Function
to include this new path for the model cache —
and to enable fast downloads with the `hf_transfer` library.
```
MODEL_CACHE_PATH = "/root/models"
inference_image = inference_image.env(
{"HF_HUB_CACHE": MODEL_CACHE_PATH, "HF_HUB_ENABLE_HF_TRANSFER": "1"}
)
```
Run OCR inference on Modal by wrapping with `app.function`
----------------------------------------------------------
Now let’s set up the actual OCR inference.
Using the [`@app.function`](../reference/modal.App.html#function) decorator, we set up a Modal [Function](../reference/modal.Function.html).
We provide arguments to that decorator to customize the hardware, scaling, and other features
of the Function.
Here, we say that this Function should use NVIDIA L40S [GPUs](../guide/gpu.html),
automatically [retry](../guide/retries.html#function-retries) failures up to 3 times,
and have access to our [shared model cache](../guide/volumes.html).
```
@app.function(
gpu="l40s",
retries=3,
volumes={MODEL_CACHE_PATH: model_cache},
image=inference_image,
)
def parse_receipt(image: bytes) -> str:
from tempfile import NamedTemporaryFile
tokenizer, model = setup()
with NamedTemporaryFile(delete=False, mode="wb+") as temp_img_file:
temp_img_file.write(image)
output = model.chat(tokenizer, temp_img_file.name, ocr_type="format")
print("Result: ", output)
return output
```
Deploy
------
Now that we have a function, we can publish it by deploying the app:
```
modal deploy doc_ocr_jobs.py
```
Once it’s published, we can [look up](../guide/trigger-deployed-functions.html) this Function
from another Python process and submit tasks to it:
```
fn = modal.Function.from_name("example-doc-ocr-jobs", "parse_receipt")
fn.spawn(my_image)
```
Modal will auto-scale to handle all the tasks queued, and
then scale back down to 0 when there’s no work left. To see how you could use this from a Python web
app, take a look at the [receipt parser frontend](doc_ocr_webapp.html) tutorial.
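If the submitting process later needs the transcription itself, the handle returned by `spawn` can be kept or re-created by ID. A minimal sketch (here `my_image` is assumed to hold the receipt image bytes, as in the snippet above):
```
import modal

parse_receipt = modal.Function.from_name("example-doc-ocr-jobs", "parse_receipt")

call = parse_receipt.spawn(my_image)  # returns a modal.FunctionCall immediately
call_id = call.object_id  # can be stored, e.g. in your web app's database

# ...later, possibly from a different process:
result = modal.FunctionCall.from_id(call_id).get(timeout=600)  # blocks until the OCR output is ready
print(result)
```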
Run manually
------------
We can also trigger `parse_receipt` manually for easier debugging:
```
modal run doc_ocr_jobs
```
To try it out, you can find some
example receipts [here](https://drive.google.com/drive/folders/1S2D1gXd4YIft4a5wDtW99jfl38e85ouW).
```
@app.local_entrypoint()
def main(receipt_filename: Optional[str] = None):
import urllib.request
from pathlib import Path
if receipt_filename is None:
receipt_filename = Path(__file__).parent / "receipt.png"
else:
receipt_filename = Path(receipt_filename)
if receipt_filename.exists():
image = receipt_filename.read_bytes()
print(f"running OCR on {receipt_filename}")
else:
receipt_url = "https://modal-cdn.com/cdnbot/Brandys-walmart-receipt-8g68_a_hk_f9c25fce.webp"
request = urllib.request.Request(receipt_url)
with urllib.request.urlopen(request) as response:
image = response.read()
print(f"running OCR on sample from URL {receipt_url}")
print(parse_receipt.remote(image))
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 09_job_queues/doc_ocr_jobs.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/stable_diffusion/flux.py)
Run Flux fast on H100s with `torch.compile`
===========================================
In this guide, we’ll run Flux as fast as possible on Modal using open source tools.
We’ll use `torch.compile` and NVIDIA H100 GPUs.
Setting up the image and dependencies
-------------------------------------
```
import time
from io import BytesIO
from pathlib import Path
import modal
```
We’ll make use of the full [CUDA toolkit](../guide/cuda.html) in this example, so we’ll build our container image off of the `nvidia/cuda` base.
```
cuda_version = "12.4.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
cuda_dev_image = modal.Image.from_registry(
f"nvidia/cuda:{tag}", add_python="3.11"
).entrypoint([])
```
Now we install most of our dependencies with `apt` and `pip`.
For Hugging Face’s [Diffusers](https://github.com/huggingface/diffusers) library
we install from GitHub source and so pin to a specific commit.
PyTorch added faster attention kernels for Hopper GPUs in version 2.5, which is the version we pin below.
```
diffusers_commit_sha = "81cf3b2f155f1de322079af28f625349ee21ec6b"
flux_image = (
cuda_dev_image.apt_install(
"git",
"libglib2.0-0",
"libsm6",
"libxrender1",
"libxext6",
"ffmpeg",
"libgl1",
)
.pip_install(
"invisible_watermark==0.2.0",
"transformers==4.44.0",
"huggingface_hub[hf_transfer]==0.26.2",
"accelerate==0.33.0",
"safetensors==0.4.4",
"sentencepiece==0.2.0",
"torch==2.5.0",
f"git+https://github.com/huggingface/diffusers.git@{diffusers_commit_sha}",
"numpy<2",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
)
```
Later, we’ll also use `torch.compile` to increase the speed further.
Torch compilation needs to be re-executed each time a new container starts,
so we turn on some extra caching to reduce compile times for later containers.
```
flux_image = flux_image.env(
{
"TORCHINDUCTOR_CACHE_DIR": "/root/.inductor-cache",
"TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
}
)
```
Finally, we construct our Modal [App](../reference/modal.App.html),
set its default image to the one we just constructed,
and import `FluxPipeline` for downloading and running Flux.1.
```
app = modal.App("example-flux", image=flux_image)
with flux_image.imports():
import torch
from diffusers import FluxPipeline
```
Defining a parameterized `Model` inference class
------------------------------------------------
Next, we map the model’s setup and inference code onto Modal.
1. We run the model setup in the method decorated with `@modal.enter()`. This includes loading the
weights and moving them to the GPU, along with an optional `torch.compile` step (see details below).
The `@modal.enter()` decorator ensures that this method runs only once, when a new container starts,
instead of in the path of every call.
2. We run the actual inference in methods decorated with `@modal.method()`.
```
MINUTES = 60 # seconds
VARIANT = "schnell" # or "dev", but note [dev] requires you to accept terms and conditions on HF
NUM_INFERENCE_STEPS = 4 # use ~50 for [dev], smaller for [schnell]
@app.cls(
gpu="H100", # fastest GPU on Modal
scaledown_window=20 * MINUTES,
timeout=60 * MINUTES, # leave plenty of time for compilation
volumes={ # add Volumes to store serializable compilation artifacts, see section on torch.compile below
"/cache": modal.Volume.from_name("hf-hub-cache", create_if_missing=True),
"/root/.nv": modal.Volume.from_name("nv-cache", create_if_missing=True),
"/root/.triton": modal.Volume.from_name("triton-cache", create_if_missing=True),
"/root/.inductor-cache": modal.Volume.from_name(
"inductor-cache", create_if_missing=True
),
},
)
class Model:
compile: bool = ( # see section on torch.compile below for details
modal.parameter(default=False)
)
@modal.enter()
def enter(self):
pipe = FluxPipeline.from_pretrained(
f"black-forest-labs/FLUX.1-{VARIANT}", torch_dtype=torch.bfloat16
).to("cuda") # move model to GPU
self.pipe = optimize(pipe, compile=self.compile)
@modal.method()
def inference(self, prompt: str) -> bytes:
print("🎨 generating image...")
out = self.pipe(
prompt,
output_type="pil",
num_inference_steps=NUM_INFERENCE_STEPS,
).images[0]
byte_stream = BytesIO()
out.save(byte_stream, format="JPEG")
return byte_stream.getvalue()
```
Calling our inference function
------------------------------
To generate an image we just need to call the `Model`’s `inference` method
with `.remote` appended to it.
You can call `.inference.remote` from any Python environment that has access to your Modal credentials.
The local environment will get back the image as bytes.
Here, we wrap the call in a Modal [`local_entrypoint`](../reference/modal.App.html#local_entrypoint) so that it can be run with `modal run`:
```
modal run flux.py
```
By default, we call `inference` twice to demonstrate how much faster
the inference is after cold start. In our tests, clients received images in about 1.2 seconds.
We save the output bytes to a temporary file.
```
@app.local_entrypoint()
def main(
prompt: str = "a computer screen showing ASCII terminal art of the"
" word 'Modal' in neon green. two programmers are pointing excitedly"
" at the screen.",
twice: bool = True,
compile: bool = False,
):
t0 = time.time()
image_bytes = Model(compile=compile).inference.remote(prompt)
print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")
if twice:
t0 = time.time()
image_bytes = Model(compile=compile).inference.remote(prompt)
print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")
output_path = Path("/tmp") / "flux" / "output.jpg"
output_path.parent.mkdir(exist_ok=True, parents=True)
print(f"🎨 saving output to {output_path}")
output_path.write_bytes(image_bytes)
```
Speeding up Flux with `torch.compile`
-------------------------------------
By default, we do some basic optimizations, like adjusting memory layout
and re-expressing the attention head projections as a single matrix multiplication.
But there are additional speedups to be had!
PyTorch 2 added a compiler that optimizes the
compute graphs created dynamically during PyTorch execution.
This feature helps close the gap with the performance of static graph frameworks
like TensorRT and TensorFlow.
Here, we follow the suggestions from Hugging Face’s [guide to fast diffusion inference](https://huggingface.co/docs/diffusers/en/tutorials/fast_diffusion),
which we verified with our own internal benchmarks.
Review that guide for detailed explanations of the choices made below.
The resulting compiled Flux `schnell` deployment returns images to the client in under a second (~700 ms), according to our testing. *Super schnell*!
Compilation takes up to twenty minutes on the first run.
As of the time of writing in late 2024,
the compilation artifacts cannot be fully serialized,
so some compilation work must be re-executed every time a new container is started.
That includes when scaling up an existing deployment or the first time a Function is invoked with `modal run`.
We cache compilation outputs from `nvcc`, `triton`, and `inductor`,
which can reduce compilation time by up to an order of magnitude.
For details see [this tutorial](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html).
You can turn on compilation with the `--compile` flag.
Try it out with:
```
modal run flux.py --compile
```
The `compile` option is passed by a [`modal.parameter`](../reference/modal.parameter.html#modalparameter) on our class.
Each different choice for a `parameter` creates a [separate auto-scaling deployment](https://modal.com/docs/guide/parameterized-functions).
That means your client can use arbitrary logic to decide whether to hit a compiled or eager endpoint.
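For example, once the app is deployed with `modal deploy flux.py`, a client could choose the variant at call time by looking up the class and passing the parameter. A rough sketch:
```
import modal

# look up the deployed, parameterized class by app name and class name
Model = modal.Cls.from_name("example-flux", "Model")

prompt = "a watercolor painting of a data center at sunset"

# each parameter value gets its own auto-scaling pool of containers
eager_image = Model(compile=False).inference.remote(prompt)
compiled_image = Model(compile=True).inference.remote(prompt)
```
The `optimize` helper called in the `enter` method above is defined below.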
```
def optimize(pipe, compile=True):
# fuse QKV projections in Transformer and VAE
pipe.transformer.fuse_qkv_projections()
pipe.vae.fuse_qkv_projections()
# switch memory layout to Torch's preferred, channels_last
pipe.transformer.to(memory_format=torch.channels_last)
pipe.vae.to(memory_format=torch.channels_last)
if not compile:
return pipe
# set torch compile flags
config = torch._inductor.config
config.disable_progress = False # show progress bar
config.conv_1x1_as_mm = True # treat 1x1 convolutions as matrix muls
# adjust autotuning algorithm
config.coordinate_descent_tuning = True
config.coordinate_descent_check_all_directions = True
config.epilogue_fusion = False # do not fuse pointwise ops into matmuls
# tag the compute-intensive modules, the Transformer and VAE decoder, for compilation
pipe.transformer = torch.compile(
pipe.transformer, mode="max-autotune", fullgraph=True
)
pipe.vae.decode = torch.compile(
pipe.vae.decode, mode="max-autotune", fullgraph=True
)
# trigger torch compilation
print("🔦 running torch compilation (may take up to 20 minutes)...")
pipe(
"dummy prompt to trigger torch compilation",
output_type="pil",
num_inference_steps=NUM_INFERENCE_STEPS, # use ~50 for [dev], smaller for [schnell]
).images[0]
print("🔦 finished torch compilation")
return pipe
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
[Create account to run](../../signup.html)
After creating a free account, install the Modal Python package, and
create an API token.
$
```
pip install modal
```
$
```
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
$
```
git clone https://github.com/modal-labs/modal-examples
```
$
```
cd modal-examples
```
$
```
modal run 06_gpu_and_ml/stable_diffusion/flux.py --no-compile
```
Run Continuous Integration (CI) Tests on Modal
==============================================
[This example repo](https://github.com/modal-labs/ci-on-modal) is a
demonstration of one pattern for running tests on Modal: bring your existing
package and test suite (here `my_pkg` and `tests`) and add a Modal App
(`my_pkg.ci`) with a Function (`pytest`) that runs `pytest`.
That’s as straightforward as
```
# my_pkg/ci.py
@app.function(gpu="any")
def pytest():
import subprocess
subprocess.run(["pytest", "-vs"], check=True, cwd="/root")
```
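For orientation, here is a hypothetical, self-contained sketch of what such a module might look like; the image contents, requirements file, and paths below are assumptions, not the repo’s actual configuration.
```
# hypothetical sketch, not the contents of the real my_pkg/ci.py
import subprocess

import modal

image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install_from_requirements("requirements.txt")  # package + test dependencies
    .add_local_dir(".", remote_path="/root")  # ship the package and tests into the container
)

app = modal.App("ci-on-modal", image=image)

@app.function(gpu="any")
def pytest():
    subprocess.run(["pytest", "-vs"], check=True, cwd="/root")
```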
Setup
-----
* Create a Python virtual environment
* `pip install modal`
* That’s it 😎
Usage
-----
All commands below are run from the root of the repository.
### Run tests remotely on Modal
```
modal run -m my_pkg.ci
```
On the first execution, the [container image](../guide/images.html) for your application will be built.
This image will be cached on Modal and only rebuilt if one of its dependencies,
like the `requirements.txt` file, changes.
### Run tests on Modal from GitHub Actions
The same command can be executed from inside a CI runner on another platform.
We provide a sample GitHub Actions workflow in `.github/workflows/ci.yml`.
To run these tests on GitHub Actions, fork this repo and [create a new GitHub Actions secret](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions) that contains your `MODAL_TOKEN_ID` and `MODAL_TOKEN_SECRET`.
You can find this info in the `.modal.toml` file in your home directory.
Now you can [manually trigger the tests to run on GitHub Actions](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow) or trigger them by making a change on our fork and pushing to `main` or making a pull request.
### Debug tests running remotely
To debug the tests, you can open a shell
in the exact same environment that the tests are run in:
```
modal shell -m my_pkg.ci
```
We used the `shell` feature heavily while developing this pattern!
*Note*: On the Modal worker, the `pytest` command is run from the home directory, `/root`,
which contains the `tests` folder, but the `modal shell` command will
drop you at the top of the filesystem, `/`.
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/10_integrations/dbt/dbt_duckdb.py)
Build your own data warehouse with DuckDB, DBT, and Modal
=========================================================
This example contains a minimal but capable [data warehouse](https://en.wikipedia.org/wiki/Data_warehouse).
It’s comprised of the following:
* [DuckDB](https://duckdb.org) as the warehouse’s [OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) database engine
* [AWS S3](https://aws.amazon.com/s3/) as the data storage provider
* [DBT](https://docs.getdbt.com/docs/introduction) as the data transformation tool
Meet your new serverless cloud data warehouse, powered by Modal!
Configure Modal, S3, and DBT
----------------------------
The only thing in the source code that you must update is the S3 bucket name.
AWS S3 bucket names are globally unique, and the one in this source is used by us to host this example.
Update the `BUCKET_NAME` variable below and also any references to the original value
within `sample_proj_duckdb_s3/models/`. The AWS IAM policy below also includes the bucket
name and that must be updated.
```
from pathlib import Path
import modal
BUCKET_NAME = "modal-example-dbt-duckdb-s3"
LOCAL_DBT_PROJECT = ( # local path
Path(__file__).parent / "sample_proj_duckdb_s3"
)
PROJ_PATH = "/root/dbt" # remote paths
PROFILES_PATH = "/root/dbt_profile"
TARGET_PATH = "/root/target"
```
Most of the DBT code and configuration is taken directly from the classic [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop) demo and modified to support
using `dbt-duckdb` with an S3 bucket.
The DBT `profiles.yml` configuration is taken from [the `dbt-duckdb` docs](https://github.com/jwills/dbt-duckdb#configuring-your-profile).
We also define the environment our application will run in —
a container image, as in Docker.
See [this guide](https://modal.com/docs/guide/custom-container) for details.
```
dbt_image = ( # start from a slim Linux image
modal.Image.debian_slim(python_version="3.11")
.pip_install( # install python packages
"boto3~=1.34", # aws client sdk
"dbt-duckdb~=1.8.1", # dbt and duckdb and a connector
"pandas~=2.2.2", # dataframes
"pyarrow~=16.1.0", # columnar data lib
"fastapi[standard]~=0.115.4", # web app
)
.env( # configure DBT environment variables
{
"DBT_PROJECT_DIR": PROJ_PATH,
"DBT_PROFILES_DIR": PROFILES_PATH,
"DBT_TARGET_PATH": TARGET_PATH,
}
)
# Here we add all local code and configuration into the Modal Image
# so that it will be available when we run DBT on Modal.
.add_local_dir(LOCAL_DBT_PROJECT, remote_path=PROJ_PATH)
.add_local_file(
LOCAL_DBT_PROJECT / "profiles.yml",
remote_path=f"{PROFILES_PATH}/profiles.yml",
)
)
app = modal.App(name="example-dbt-duckdb-s3", image=dbt_image)
dbt_target = modal.Volume.from_name("dbt-target-vol", create_if_missing=True)
```
We’ll also need to authenticate with AWS to store data in S3.
```
s3_secret = modal.Secret.from_name(
"modal-examples-aws-user",
required_keys=["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION"],
)
```
Create this Secret using the “AWS” template from the [Secrets dashboard](https://modal.com/secrets).
Below we will use the provided credentials in a Modal Function to create an S3 bucket and
populate it with `.parquet` data, so be sure to provide credentials for a user
with permission to create S3 buckets and read & write data from them.
The policy required for this example is the following.
Note that you *must* update the bucket name listed in the policy to your
own bucket name.
```
{
"Statement": [
{
"Action": "s3:*",
"Effect": "Allow",
"Resource": [
"arn:aws:s3:::modal-example-dbt-duckdb-s3/*",
"arn:aws:s3:::modal-example-dbt-duckdb-s3"
],
"Sid": "duckdbs3access"
}
],
"Version": "2012-10-17"
}
```
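If you want to sanity-check that the Secret and IAM setup work before running DBT, a small Modal Function can call AWS STS, which needs no extra permissions. This is a minimal sketch and not part of the original example; `check_credentials` is a hypothetical helper you would add to the script yourself.
```
@app.function(secrets=[s3_secret])
def check_credentials() -> None:
    # Hypothetical helper: confirms the keys in the Secret are valid
    # and shows which AWS principal they belong to.
    import boto3
    identity = boto3.client("sts").get_caller_identity()
    print(f"Authenticated as {identity['Arn']}")
```
You could then invoke it with `modal run dbt_duckdb.py::check_credentials`.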
Upload seed data
----------------
To provide source data for DBT to ingest and transform,
the `create_source_data` function below creates an AWS S3 bucket and
populates it with Parquet files based on the CSV data in the `seeds/` directory.
You can kick it off by running this script on Modal:
```
modal run dbt_duckdb.py
```
This script also runs the full data warehouse setup, and the whole process takes a minute or two.
We’ll walk through the rest of the steps below. See the `app.local_entrypoint` below for details.
Note that this is not the typical way that `seeds/` data is used, but it’s useful for this
demonstration. See [the DBT docs](https://docs.getdbt.com/docs/build/seeds) for more info.
```
@app.function(
secrets=[s3_secret],
)
def create_source_data():
import boto3
import pandas as pd
from botocore.exceptions import ClientError
s3_client = boto3.client("s3")
s3_client.create_bucket(Bucket=BUCKET_NAME)
for seed_csv_path in Path(PROJ_PATH, "seeds").glob("*.csv"):
print(f"Found seed file {seed_csv_path}")
name = seed_csv_path.stem
parquet_filename = f"{name}.parquet"
object_key = f"sources/{parquet_filename}"
try:
s3_client.head_object(Bucket=BUCKET_NAME, Key=object_key)
print(
f"File '{object_key}' already exists in bucket '{BUCKET_NAME}'. Skipping."
)
except ClientError:
df = pd.read_csv(seed_csv_path)
df.to_parquet(parquet_filename)
print(f"Uploading '{object_key}' to S3 bucket '{BUCKET_NAME}'")
s3_client.upload_file(parquet_filename, BUCKET_NAME, object_key)
print(f"File '{object_key}' uploaded successfully.")
```
Run DBT on the cloud with Modal
-------------------------------
Modal makes it easy to run Python code in the cloud.
And DBT is a Python tool, so it’s easy to run DBT with Modal:
below, we import the `dbt` library’s `dbtRunner` and pass it commands from our
Python code running on Modal, just as we would on the command line.
Note that this Modal Function has access to our AWS S3 Secret,
the local files associated with our DBT project and profiles,
and a remote Modal Volume that acts as a distributed file system.
```
@app.function(
secrets=[s3_secret],
volumes={TARGET_PATH: dbt_target},
)
def run(command: str) -> None:
from dbt.cli.main import dbtRunner
res = dbtRunner().invoke(command.split(" "))
if res.exception:
print(res.exception)
```
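The wrapper above only prints exceptions, so a failed DBT invocation still counts as a successful Modal Function call. If you would rather have DBT failures surface as failed calls (useful once this runs on a schedule), you can raise instead. The sketch below is a hypothetical variant, not part of the original example; it assumes dbt’s programmatic interface returns a result object with `success` and `exception` attributes, as described in the dbt docs.
```
@app.function(
    secrets=[s3_secret],
    volumes={TARGET_PATH: dbt_target},
)
def run_strict(command: str) -> None:
    # Same as `run`, but raise so the Modal Function call is marked as failed.
    from dbt.cli.main import dbtRunner
    res = dbtRunner().invoke(command.split(" "))
    if res.exception:
        raise res.exception
    if not res.success:
        raise RuntimeError(f"dbt command {command!r} finished with errors")
```
The rest of this example sticks with the simpler `run` wrapper.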
You can run the `run` Function from the command line with `modal run dbt_duckdb.py::run --command run`.
A successful run will log something like the following:
```
03:41:04 Running with dbt=1.5.0
03:41:05 Found 5 models, 8 tests, 0 snapshots, 0 analyses, 313 macros, 0 operations, 3 seed files, 3 sources, 0 exposures, 0 metrics, 0 groups
03:41:05
03:41:06 Concurrency: 1 threads (target='modal')
03:41:06
03:41:06 1 of 5 START sql table model main.stg_customers ................................ [RUN]
03:41:06 1 of 5 OK created sql table model main.stg_customers ........................... [OK in 0.45s]
03:41:06 2 of 5 START sql table model main.stg_orders ................................... [RUN]
03:41:06 2 of 5 OK created sql table model main.stg_orders .............................. [OK in 0.34s]
03:41:06 3 of 5 START sql table model main.stg_payments ................................. [RUN]
03:41:07 3 of 5 OK created sql table model main.stg_payments ............................ [OK in 0.36s]
03:41:07 4 of 5 START sql external model main.customers ................................. [RUN]
03:41:07 4 of 5 OK created sql external model main.customers ............................ [OK in 0.72s]
03:41:07 5 of 5 START sql table model main.orders ....................................... [RUN]
03:41:08 5 of 5 OK created sql table model main.orders .................................. [OK in 0.22s]
03:41:08
03:41:08 Finished running 4 table models, 1 external model in 0 hours 0 minutes and 3.15 seconds (3.15s).
03:41:08 Completed successfully
03:41:08
03:41:08 Done. PASS=5 WARN=0 ERROR=0 SKIP=0 TOTAL=5
```
Look for the `materialized='external'` DBT config in the SQL templates
to see how `dbt-duckdb` is able to write back the transformed data to AWS S3!
After running the `run` command and seeing it succeed, check what’s contained
under the bucket’s `out/` key prefix. You’ll see that DBT has run the transformations
defined in `sample_proj_duckdb_s3/models/` and produced output `.parquet` files.
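One quick way to check is to list the objects under that prefix with `boto3`. This is a local sketch, assuming your own AWS credentials can read the bucket and that you substitute your bucket name for the example one.
```
import boto3
# List the transformed Parquet files that dbt-duckdb wrote back to S3.
s3 = boto3.client("s3")
resp = s3.list_objects_v2(Bucket="modal-example-dbt-duckdb-s3", Prefix="out/")
for obj in resp.get("Contents", []):
    print(obj["Key"], obj["Size"])
```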
Serve fresh data documentation with FastAPI and Modal
-----------------------------------------------------
DBT also automatically generates [rich, interactive data docs](https://docs.getdbt.com/docs/collaborate/explore-projects).
You can serve these docs on Modal.
Just define a simple [FastAPI](https://fastapi.tiangolo.com/) app:
```
@app.function(volumes={TARGET_PATH: dbt_target})
@modal.concurrent(max_inputs=100)
@modal.asgi_app() # wrap a function that returns a FastAPI app in this decorator to host on Modal
def serve_dbt_docs():
import fastapi
from fastapi.staticfiles import StaticFiles
web_app = fastapi.FastAPI()
web_app.mount(
"/",
StaticFiles( # dbt docs are automatically generated and sitting in the Volume
directory=TARGET_PATH, html=True
),
name="static",
)
return web_app
```
And deploy that app to Modal with
```
modal deploy dbt_duckdb.py
# ...
# Created web function serve_dbt_docs => <output-url>
```
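Note that this page serves whatever DBT has written into the `dbt_target` Volume, so generate the docs at least once before visiting it, for example by running the full `modal run dbt_duckdb.py` entrypoint or something like `modal run dbt_duckdb.py::run --command "docs generate"`. The scheduled `daily_build` Function below also regenerates them every day.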
If you navigate to the output URL, you should see something like [![example dbt docs](../../_app/immutable/assets/dbt_docs.BwfMuDI8.png)](https://modal-labs-examples--example-dbt-duckdb-s3-serve-dbt-docs.modal.run)
You can also check out our instance of the docs [here](https://modal-labs-examples--example-dbt-duckdb-s3-serve-dbt-docs.modal.run).
The app will be served “serverlessly” — it will automatically scale up or down
during periods of increased or decreased usage, and you won’t be charged at all
when it has scaled to zero.
Schedule daily updates
----------------------
The following `daily_build` function [runs on a schedule](../guide/cron.html) to keep the DuckDB data warehouse up-to-date. It is also deployed by the same `modal deploy` command for the docs app.
The source data for this warehouse is static,
so the daily executions don’t really “update” anything; they just rebuild. But this example could be extended
to use sources that continually provide new data over time.
It will also generate the DBT docs daily to keep them fresh.
```
@app.function(
schedule=modal.Period(days=1),
secrets=[s3_secret],
volumes={TARGET_PATH: dbt_target},
)
def daily_build() -> None:
run.remote("build")
run.remote("docs generate")
@app.local_entrypoint()
def main():
create_source_data.remote()
run.remote("run")
daily_build.remote()
```
Try this on Modal!
------------------
You can run this example on Modal in 60 seconds.
After [creating a free account](../../signup.html), install the Modal Python package and create an API token:
```
pip install modal
modal setup
```
Clone the [modal-examples](https://github.com/modal-labs/modal-examples) repository and run:
```
git clone https://github.com/modal-labs/modal-examples
cd modal-examples
modal run 10_integrations/dbt/dbt_duckdb.py
```
[View on GitHub](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/sam/segment_anything.py)
Run Facebook’s Segment Anything Model 2 (SAM 2) on Modal
========================================================
This example demonstrates how to deploy Facebook’s [SAM 2](https://github.com/facebookresearch/sam2) on Modal. SAM 2 is a powerful, flexible image and video segmentation model that can be used
for various computer vision tasks like object detection, instance segmentation,
and even as a foundation for more complex computer vision applications.
SAM 2 extends the capabilities of the original SAM to include video segmentation.
In particular, this example segments [this video](https://www.youtube.com/watch?v=WAz1406SjVw) of a man jumping off a cliff.
The output should look something like this:
[Example segmented output video](https://modal-cdn.com/example-segmented-video.mp4)
Set up dependencies for SAM 2
-----------------------------
First, we set up the necessary dependencies, including `torch`, `opencv`, `huggingface_hub`, `torchvision`, and the `sam2` library.
We also install `ffmpeg`, which we will use to manipulate videos,
and a Python wrapper called `ffmpeg-python` for a clean interface.
```
from pathlib import Path
import modal
MODEL_TYPE = "facebook/sam2-hiera-large"
SAM2_GIT_SHA = (
"c2ec8e14a185632b0a5d8b161928ceb50197eddc" # pin commit! research code is fragile
)
image = (
modal.Image.debian_slim(python_version="3.10")
.apt_install("git", "wget", "python3-opencv", "ffmpeg")
.pip_install(
"torch~=2.4.1",
"torchvision==0.19.1",
"opencv-python==4.10.0.84",
"pycocotools~=2.0.8",
"matplotlib~=3.9.2",
"onnxruntime==1.19.2",
"onnx==1.17.0",
"huggingface_hub==0.25.2",
"ffmpeg-python==0.2.0",
f"git+https://github.com/facebookresearch/sam2.git@{SAM2_GIT_SHA}",
)
)
app = modal.App("sam2-app", image=image)
```
Wrapping the SAM 2 model in a Modal class
-----------------------------------------
Next, we define the `Model` class that will handle SAM 2 operations for both image and video.
We use the `@modal.enter()` decorator here for optimization: it makes sure the initialization
method runs only once, when a new container starts, instead of in the path of every call.
We’ll also use a Modal Volume to cache the model weights so that they don’t need to be downloaded
repeatedly when we start new containers.
```
video_vol = modal.Volume.from_name("sam2-inputs", create_if_missing=True)
cache_vol = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
cache_dir = "/cache"
@app.cls(
image=image.env({"HF_HUB_CACHE": cache_dir}),
volumes={"/root/videos": video_vol, cache_dir: cache_vol},
gpu="A100",
)
class Model:
@modal.enter()
def initialize_model(self):
"""Download and initialize model."""
from sam2.sam2_video_predictor import SAM2VideoPredictor
self.video_predictor = SAM2VideoPredictor.from_pretrained(MODEL_TYPE)
@modal.method()
def generate_video_masks(self, video="/root/videos/input.mp4", point_coords=None):
"""Generate masks for a video."""
import ffmpeg
import numpy as np
import torch
from PIL import Image
frames_dir = convert_video_to_frames(video)
# scan all the JPEG files in this directory
frame_names = [
p
for p in frames_dir.iterdir()
if p.suffix in [".jpg", ".jpeg", ".JPG", ".JPEG"]
]
frame_names.sort(key=lambda p: int(p.stem))
# We are hardcoding the input point and label here
# In a real-world scenario, you would want to display the video
# and allow the user to click on the video to select the point
if point_coords is None:
width, height = Image.open(frame_names[0]).size
point_coords = [[width // 2, height // 2]]
points = np.array(point_coords, dtype=np.float32)
# for labels, `1` means positive click and `0` means negative click
labels = np.array([1] * len(points), np.int32)
# run the model on GPU
with (
torch.inference_mode(),
torch.autocast("cuda", dtype=torch.bfloat16),
):
self.inference_state = self.video_predictor.init_state(
video_path=str(frames_dir)
)
# add new prompts and instantly get the output on the same frame
(
frame_idx,
object_ids,
masks,
) = self.video_predictor.add_new_points_or_box(
inference_state=self.inference_state,
frame_idx=0,
obj_id=1,
points=points,
labels=labels,
)
print(f"frame_idx: {frame_idx}, object_ids: {object_ids}, masks: {masks}")
# run propagation throughout the video and collect the results in a dict
video_segments = {} # video_segments contains the per-frame segmentation results
for (
out_frame_idx,
out_obj_ids,
out_mask_logits,
) in self.video_predictor.propagate_in_video(self.inference_state):
video_segments[out_frame_idx] = {
out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
for i, out_obj_id in enumerate(out_obj_ids)
}
out_dir = Path("/root/mask_frames")
out_dir.mkdir(exist_ok=True)
vis_frame_stride = 5 # visualize every 5th frame
save_segmented_frames(
video_segments,
frames_dir,
out_dir,
frame_names,
stride=vis_frame_stride,
)
ffmpeg.input(
f"{out_dir}/frame_*.png",
pattern_type="glob",
framerate=30 / vis_frame_stride,
).filter(
"scale",
"trunc(iw/2)*2",
"trunc(ih/2)*2", # round to even dimensions to encode for "dumb players", https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordu
```