@larkintuckerllc
Last active October 9, 2025 21:02
{
"cells": [
{
"cell_type": "markdown",
"id": "2cfaa4ab-d866-4dc2-8a48-df81fa838347",
"metadata": {},
"source": [
"# imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8f768035-d0aa-49d2-a176-22cf440c0ac7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:24 [__init__.py:216] Automatically detected platform cuda.\n"
]
}
],
"source": [
"from vllm import LLM, SamplingParams"
]
},
{
"cell_type": "markdown",
"id": "0556b47c-0474-4323-ba02-63669a9efada",
"metadata": {},
"source": [
"# constants"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eb6e9a30-838e-4224-b93d-fefe0f394fb2",
"metadata": {},
"outputs": [],
"source": [
"PROMPTS = [\n",
" \"Hello, my name is\",\n",
" \"The president of the United States is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
"]\n",
"TEMPERATURE = 0.8\n",
"TOP_P = 0.95"
]
},
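{
"cell_type": "markdown",
"id": "b7a4f1e2-1111-4a2b-9c3d-5e6f7a8b9c0d",
"metadata": {},
"source": [
"The constants above map directly onto `SamplingParams`. The next cell is a minimal sketch (not part of the original run) showing a few other commonly used vLLM sampling options (`max_tokens`, `seed`, and `n`) with illustrative values."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8b5a2d3-2222-4b3c-8d4e-6f7a8b9c0d1e",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: same temperature/top-p as the constants above, plus extras.\n",
"# max_tokens caps the completion length, seed makes sampling repeatable,\n",
"# and n requests multiple completions per prompt.\n",
"example_sampling_params = SamplingParams(\n",
"    temperature=TEMPERATURE,\n",
"    top_p=TOP_P,\n",
"    max_tokens=16,\n",
"    seed=0,\n",
"    n=1,\n",
")"
]
},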
{
"cell_type": "markdown",
"id": "29d4a29f-6423-4d6f-8179-78f584a5682e",
"metadata": {},
"source": [
"# load model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cf1fa66b-536f-42fd-966c-5126f306cb10",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:27 [utils.py:233] non-default args: {'disable_log_stats': True, 'model': 'facebook/opt-125m'}\n",
"INFO 10-09 20:56:27 [model.py:547] Resolved architecture: OPTForCausalLM\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`torch_dtype` is deprecated! Use `dtype` instead!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:27 [model.py:1510] Using max model len 2048\n",
"INFO 10-09 20:56:31 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:32 [core.py:644] Waiting for init message from front-end.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:32 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=facebook/opt-125m, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={\"level\":3,\"debug_dump_path\":\"\",\"cache_dir\":\"\",\"backend\":\"\",\"custom_ops\":[],\"splitting_ops\":[\"vllm.unified_attention\",\"vllm.unified_attention_with_output\",\"vllm.mamba_mixer2\",\"vllm.mamba_mixer\",\"vllm.short_conv\",\"vllm.linear_attention\",\"vllm.plamo2_mamba_mixer\",\"vllm.gdn_attention\",\"vllm.sparse_attn_indexer\"],\"use_inductor\":true,\"compile_sizes\":[],\"inductor_compile_config\":{\"enable_auto_functionalized_v2\":false},\"inductor_passes\":{},\"cudagraph_mode\":[2,1],\"use_cudagraph\":true,\"cudagraph_num_of_warmups\":1,\"cudagraph_capture_sizes\":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"cudagraph_copy_inputs\":false,\"full_cuda_graph\":false,\"use_inductor_graph_partition\":false,\"pass_config\":{},\"max_capture_size\":512,\"local_cache_dir\":null}\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m ERROR 10-09 20:56:34 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:35 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m WARNING 10-09 20:56:35 [topk_topp_sampler.py:66] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:35 [gpu_model_runner.py:2602] Starting to load model facebook/opt-125m...\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [gpu_model_runner.py:2634] Loading model from scratch...\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [cuda.py:372] Using FlexAttention backend on V1 engine.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [weight_utils.py:392] Using model weights format ['*.safetensors', '*.bin', '*.pt']\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "767620c68528482ab3bdf71329541be1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading pt checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:37 [default_loader.py:267] Loading weights took 0.36 seconds\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:37 [gpu_model_runner.py:2653] Model loading took 0.2389 GiB and 0.897046 seconds\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:548] Using cache directory: /home/jtucker/.cache/vllm/torch_compile_cache/f0b3418c3f/rank_0_0/backbone for vLLM's torch.compile\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:559] Dynamo bytecode transform time: 2.90 s\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 0.366 s\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:42 [monitor.py:34] torch.compile takes 2.90 s in total\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [gpu_worker.py:298] Available KV cache memory: 12.36 GiB\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [kv_cache_utils.py:1087] GPU KV cache size: 359,952 tokens\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [kv_cache_utils.py:1091] Maximum concurrency for 2,048 tokens per request: 175.76x\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m WARNING 10-09 20:56:43 [gpu_model_runner.py:3663] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with FlexAttentionMetadataBuilder backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|█| 67/67 [00:01<00\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:46 [gpu_model_runner.py:3480] Graph capturing finished in 2 secs, took 0.17 GiB\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:46 [core.py:210] init engine (profile, create kv cache, warmup model) took 8.33 seconds\n",
"INFO 10-09 20:56:46 [llm.py:306] Supported_tasks: ['generate']\n"
]
}
],
"source": [
"llm = LLM(model=\"facebook/opt-125m\")"
]
},
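{
"cell_type": "markdown",
"id": "d9c6b3e4-3333-4c4d-9e5f-7a8b9c0d1e2f",
"metadata": {},
"source": [
"`LLM(model=\"facebook/opt-125m\")` leaves every other engine option at its default. As a minimal sketch (not executed here, to avoid allocating a second engine next to the one above), the next cell collects a few constructor arguments vLLM exposes for precision and memory control (`dtype`, `gpu_memory_utilization`, `max_model_len`, and `enforce_eager`) with illustrative values."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0d7c4f5-4444-4d5e-af60-8b9c0d1e2f30",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: alternative engine arguments, collected in a dict rather than\n",
"# re-instantiating LLM (the engine above already holds the GPU memory).\n",
"alt_engine_kwargs = dict(\n",
"    model=\"facebook/opt-125m\",\n",
"    dtype=\"float16\",             # weight/activation precision\n",
"    gpu_memory_utilization=0.8,  # fraction of GPU memory vLLM may claim\n",
"    max_model_len=1024,          # cap on the context length\n",
"    enforce_eager=True,          # skip CUDA graph capture for faster startup\n",
")\n",
"# llm = LLM(**alt_engine_kwargs)"
]
},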
{
"cell_type": "markdown",
"id": "3cc7e0a7-b9db-4f9c-9484-c81d7fbf20b4",
"metadata": {},
"source": [
"# generate"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9eafd8fc-f082-492e-aaef-7aa8f1d06381",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c109d979a7df47c29569957c0132cf04",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Adding requests: 0%| | 0/4 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ae3b018b838a465292f1aaa3677f72e8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processed prompts: 0%| | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, o"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generated Outputs:\n",
"------------------------------------------------------------\n",
"Prompt: 'Hello, my name is'\n",
"Output: ' Joel, I am a 4yo, I am very naughty, and I like'\n",
"------------------------------------------------------------\n",
"Prompt: 'The president of the United States is'\n",
"Output: ' reportedly holding back from issuing a statement about the Ukraine crisis after he called the International'\n",
"------------------------------------------------------------\n",
"Prompt: 'The capital of France is'\n",
"Output: ' the capital of the French colony of Ireland.\\nLaws of France are not'\n",
"------------------------------------------------------------\n",
"Prompt: 'The future of AI is'\n",
"Output: \" in the hands of the vast majority of people - you've probably seen it already\"\n",
"------------------------------------------------------------\n"
]
}
],
"source": [
"sampling_params = SamplingParams(temperature=TEMPERATURE, top_p=TOP_P)\n",
"outputs = llm.generate(PROMPTS, sampling_params)\n",
"print(\"\\nGenerated Outputs:\\n\" + \"-\" * 60)\n",
"for output in outputs:\n",
" prompt = output.prompt\n",
" generated_text = output.outputs[0].text\n",
" print(f\"Prompt: {prompt!r}\")\n",
" print(f\"Output: {generated_text!r}\")\n",
" print(\"-\" * 60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}