Modal is a serverless cloud platform that provides automatic scaling for Python workloads. It's particularly useful for:
- Parallel data processing
- Machine learning training and inference
- Hyperparameter optimization
- Web scraping at scale
- Batch computations requiring GPUs
Modal automatically scales containers based on workload with zero configuration required.
```python
import modal

app = modal.App("my-scaling-app")

@app.function()
def process_item(item):
    # Automatically scales to handle concurrent requests
    return expensive_computation(item)

@app.local_entrypoint()
def main():
    # Process 10,000 items in parallel across multiple containers
    results = list(process_item.map(range(10_000)))
```
Configuration Options:
- `min_containers`: Keep warm containers ready (default: 0)
- `max_containers`: Upper limit on concurrent containers
- `buffer_containers`: Maintain extra idle containers to reduce queuing
- `scaledown_window`: Idle duration (in seconds) before a container shuts down
```python
@app.function(
    min_containers=2,
    max_containers=100,
    buffer_containers=5,
    scaledown_window=120,  # seconds of idle time before scaling down
)
def scalable_function():
    pass
```
```python
@app.function(gpu="T4")
def train_model(hyperparams):
    # Train with specific hyperparameters
    return model_score

@app.local_entrypoint()
def hyperparameter_search():
    param_grid = [
        {"lr": 0.001, "batch_size": 32},
        {"lr": 0.01, "batch_size": 64},
        # ... hundreds more combinations
    ]
    # Parallel execution across GPUs; .map() returns results in input order,
    # so scores can be paired back with their parameters
    for params, score in zip(param_grid, train_model.map(param_grid)):
        print(f"Params: {params}, Score: {score}")
```
```python
volume = modal.Volume.from_name("my-data-volume")

@app.function(volumes={"/data": volume})
def process_batch(batch_id):
    # Process and store results externally
    data = fetch_batch(batch_id)
    results = process(data)
    save_to_volume(results)

@app.local_entrypoint()
def main():
    # Submit 100,000 jobs without waiting for results
    process_batch.spawn_map(range(100_000))
```
Process multiple inputs in a single container for I/O-bound workloads:
```python
import aiohttp

@app.function()
@modal.concurrent(max_inputs=100, target_inputs=80)
async def fetch_api_data(url: str):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.json()

# A single container handles up to 100 concurrent API calls
```
Optimize GPU utilization by batching requests:
```python
import numpy as np

@app.function(gpu="A100")
@modal.batched(max_batch_size=32, wait_ms=100)
async def predict_batch(inputs: list[np.ndarray]) -> list[dict]:
    # Process the entire batch on the GPU at once
    predictions = model.predict(np.stack(inputs))
    return [{"prediction": pred} for pred in predictions]

# Individual .remote() calls are automatically grouped into batches
result = predict_batch.remote(single_input)
```
For long-running background tasks:
```python
@app.function(timeout=3600)  # 1 hour timeout
def process_video(video_url: str):
    # Long-running video processing
    return processed_result

def submit_job(video_url):
    # Submit the job and return a tracking ID
    fn = modal.Function.from_name("video-processor", "process_video")
    call = fn.spawn(video_url)
    return call.object_id

def get_result(job_id):
    call = modal.FunctionCall.from_id(job_id)
    return call.get()  # Blocks until complete
```
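If you would rather poll than block, `FunctionCall.get()` also accepts a timeout. A minimal sketch of a non-blocking check; the zero-second timeout and the `TimeoutError` handling reflect common usage, but verify them against the Modal version you run:

```python
def poll_result(job_id):
    """Return the result if the job has finished, otherwise None (non-blocking poll)."""
    call = modal.FunctionCall.from_id(job_id)
    try:
        # timeout=0 asks for the result only if it is already available
        return call.get(timeout=0)
    except TimeoutError:
        return None
```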
```python
import optuna
import modal

app = modal.App(
    image=modal.Image.debian_slim().pip_install("optuna", "scikit-learn", "pandas")
)

@app.function()
def objective(trial_params):
    """Single trial evaluation"""
    model = create_model(trial_params)
    score = cross_validate(model)
    return score

@app.function()
def distributed_optuna_search(n_trials=1000):
    """Distributed hyperparameter optimization using Optuna's ask-and-tell API"""
    study = optuna.create_study(direction="maximize")
    # Ask for trials, then materialize plain parameter dicts to ship to workers
    # (the search space mirrors the grid-search example below)
    trials = [study.ask() for _ in range(n_trials)]
    trial_params = [
        {
            "lr": t.suggest_float("lr", 1e-3, 1e-1, log=True),
            "batch_size": t.suggest_categorical("batch_size", [16, 32, 64, 128]),
            "dropout": t.suggest_float("dropout", 0.1, 0.5),
        }
        for t in trials
    ]
    # Evaluate all trials in parallel
    scores = list(objective.map(trial_params))
    # Update the study with the results
    for trial, score in zip(trials, scores):
        study.tell(trial, score)
    return study.best_params
```
```python
@app.function(gpu="T4")
def evaluate_model(hyperparams):
    """Evaluate a single hyperparameter configuration"""
    lr, batch_size, dropout = hyperparams
    model = train_model(lr, batch_size, dropout)
    accuracy = evaluate(model)
    return (hyperparams, accuracy)

@app.local_entrypoint()
def grid_search():
    """Parallel grid search across the parameter space"""
    param_grid = [
        (lr, bs, dr)
        for lr in [0.001, 0.01, 0.1]
        for bs in [16, 32, 64, 128]
        for dr in [0.1, 0.2, 0.3, 0.5]
    ]
    # Run all experiments in parallel
    results = list(evaluate_model.map(param_grid))
    best_params, best_score = max(results, key=lambda x: x[1])
    print(f"Best params: {best_params}, Score: {best_score}")
```
```python
import aiohttp

@app.function()
@modal.concurrent(max_inputs=500)
async def fetch_webpage(url: str):
    """Fetch and process webpage content"""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
    return extract_data(html)

@app.local_entrypoint()
def scrape_websites():
    """Scrape 100,000 websites in parallel"""
    urls = load_url_list()  # 100,000 URLs
    # Process with automatic scaling; .map() returns results in input order
    for url, data in zip(urls, fetch_webpage.map(urls)):
        save_to_database(url, data)
```
```python
@app.function(
    gpu="T4",
    image=modal.Image.debian_slim().pip_install("torch", "torchvision", "pillow"),
)
@modal.batched(max_batch_size=64, wait_ms=50)
async def process_images(image_paths: list[str]) -> list[dict]:
    """Process a batch of images on the GPU"""
    import torch  # available inside the container image defined above

    images = [load_image(path) for path in image_paths]
    tensors = torch.stack([transform(img) for img in images])
    # Process the entire batch on the GPU
    with torch.no_grad():
        features = model(tensors.cuda())
    return [{"features": f.cpu().numpy()} for f in features]
```
```python
# Complex ML environment
ml_image = (
    modal.Image.from_registry("pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime")
    .pip_install("transformers", "datasets", "wandb")
    .run_commands("apt-get update && apt-get install -y git")
)

app = modal.App(image=ml_image)

@app.function(gpu="A100")
def train_llm():
    # Use the pre-configured ML environment
    pass
```
```python
volume = modal.Volume.from_name("my-data-volume")

@app.function(volumes={"/data": volume})
def process_with_storage():
    import pandas as pd  # pandas must be installed in the function's image

    # Read/write to the persistent volume
    data = pd.read_parquet("/data/input.parquet")
    results = process(data)
    results.to_parquet("/data/output.parquet")
    volume.commit()  # explicitly persist writes so other containers see them
```
- Choose the Right Pattern:
  - Use `.map()` for result collection
  - Use `.spawn_map()` for fire-and-forget
  - Use `@modal.concurrent` for I/O-bound tasks
  - Use `@modal.batched` for GPU inference
- Optimize Container Configuration:
  - Set `min_containers` for low-latency requirements
  - Use `buffer_containers` to handle burst traffic
  - Configure appropriate timeouts for long-running tasks
- Resource Management:
  - Pin dependency versions in images
  - Use volumes for shared data
  - Leverage GPU acceleration where beneficial
- Monitoring and Debugging:
  - Use the Modal dashboard for monitoring
  - Implement logging within functions
  - Handle failures gracefully with retries (see the sketch after this list)
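A minimal configuration sketch that applies several of these practices at once; the pinned package versions and the retry/timeout values below are illustrative placeholders, not recommendations from Modal:

```python
import modal

# Pin dependency versions so container builds are reproducible
image = modal.Image.debian_slim().pip_install("pandas==2.1.4", "pyarrow==14.0.2")

app = modal.App("best-practices-demo", image=image)

@app.function(
    retries=3,         # retry transient failures automatically
    timeout=1800,      # 30-minute cap for long-running work
    min_containers=1,  # keep one warm container for low-latency starts
)
def resilient_task(item):
    # Placeholder for your own processing logic
    return item
```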
Ideal Use Cases:
- Hyperparameter tuning (grid search, Bayesian optimization)
- Large-scale data processing and ETL
- Distributed model training
- Batch inference on GPUs
- Web scraping and API data collection
- Parallel simulations and Monte Carlo methods (see the sketch below)
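As a sketch of the last item, here is what a fanned-out Monte Carlo estimate might look like on Modal; the app name, sample counts, and pi example are illustrative, not taken from Modal's documentation:

```python
import random

import modal

app = modal.App("monte-carlo-pi")

@app.function()
def estimate_pi(n_samples: int) -> float:
    """Estimate pi from n_samples random points in the unit square."""
    inside = sum(
        1 for _ in range(n_samples)
        if random.random() ** 2 + random.random() ** 2 <= 1.0
    )
    return 4 * inside / n_samples

@app.local_entrypoint()
def main():
    # Fan 1,000 independent estimates out across containers and average them
    estimates = list(estimate_pi.map([1_000_000] * 1_000))
    print(f"pi ~= {sum(estimates) / len(estimates):.6f}")
```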
Not Recommended For:
- Real-time, sub-millisecond latency requirements
- Stateful, long-lived services
- Applications requiring specific network configurations
```bash
# Install Modal
pip install modal

# Authenticate
modal setup

# Deploy an app
modal deploy my_app.py

# Run a function
modal run my_app.py::function_name

# Run with detached jobs
modal run --detach my_app.py
```
- Use `.spawn_map()` for large batches to avoid result serialization overhead
- Configure `scaledown_window` to balance cost vs. latency
- Use spot instances for fault-tolerant workloads
- Batch small tasks to reduce container startup overhead (see the sketch below)
- Use appropriate GPU types (T4 for inference, A100 for training)
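One way to batch small tasks, as suggested above, is to map over chunks of work rather than individual items so each container invocation amortizes its startup cost. The chunk size, app name, and placeholder processing below are illustrative:

```python
import modal

app = modal.App("chunked-processing")

@app.function()
def process_chunk(items: list) -> list:
    # One container invocation handles many small items
    return [item * 2 for item in items]  # placeholder for real per-item work

@app.local_entrypoint()
def main():
    items = list(range(100_000))
    chunk_size = 1_000
    chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

    # 100 map inputs instead of 100,000, so far fewer container startups
    results = [r for chunk_result in process_chunk.map(chunks) for r in chunk_result]
    print(len(results))
```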