@larkintuckerllc
Last active October 9, 2025 21:02
{
"cells": [
{
"cell_type": "markdown",
"id": "2cfaa4ab-d866-4dc2-8a48-df81fa838347",
"metadata": {},
"source": [
"# imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8f768035-d0aa-49d2-a176-22cf440c0ac7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:24 [__init__.py:216] Automatically detected platform cuda.\n"
]
}
],
"source": [
"from vllm import LLM, SamplingParams"
]
},
{
"cell_type": "markdown",
"id": "0556b47c-0474-4323-ba02-63669a9efada",
"metadata": {},
"source": [
"# constants"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eb6e9a30-838e-4224-b93d-fefe0f394fb2",
"metadata": {},
"outputs": [],
"source": [
"PROMPTS = [\n",
" \"Hello, my name is\",\n",
" \"The president of the United States is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
"]\n",
"TEMPERATURE = 0.8\n",
"TOP_P = 0.95"
]
},
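{
"cell_type": "markdown",
"id": "b7a4f1e2-1111-4a2b-9c3d-5e6f7a8b9c0d",
"metadata": {},
"source": [
"The constants above map directly onto `SamplingParams`. The next cell is a minimal sketch (not part of the original run) showing a few other commonly used vLLM sampling options (`max_tokens`, `seed`, and `n`) with illustrative values."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8b5a2d3-2222-4b3c-8d4e-6f7a8b9c0d1e",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: same temperature/top-p as the constants above, plus extras.\n",
"# max_tokens caps the completion length, seed makes sampling repeatable,\n",
"# and n requests multiple completions per prompt.\n",
"example_sampling_params = SamplingParams(\n",
"    temperature=TEMPERATURE,\n",
"    top_p=TOP_P,\n",
"    max_tokens=16,\n",
"    seed=0,\n",
"    n=1,\n",
")"
]
},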
{
"cell_type": "markdown",
"id": "29d4a29f-6423-4d6f-8179-78f584a5682e",
"metadata": {},
"source": [
"# load model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cf1fa66b-536f-42fd-966c-5126f306cb10",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:27 [utils.py:233] non-default args: {'disable_log_stats': True, 'model': 'facebook/opt-125m'}\n",
"INFO 10-09 20:56:27 [model.py:547] Resolved architecture: OPTForCausalLM\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`torch_dtype` is deprecated! Use `dtype` instead!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:27 [model.py:1510] Using max model len 2048\n",
"INFO 10-09 20:56:31 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:32 [core.py:644] Waiting for init message from front-end.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:32 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=facebook/opt-125m, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={\"level\":3,\"debug_dump_path\":\"\",\"cache_dir\":\"\",\"backend\":\"\",\"custom_ops\":[],\"splitting_ops\":[\"vllm.unified_attention\",\"vllm.unified_attention_with_output\",\"vllm.mamba_mixer2\",\"vllm.mamba_mixer\",\"vllm.short_conv\",\"vllm.linear_attention\",\"vllm.plamo2_mamba_mixer\",\"vllm.gdn_attention\",\"vllm.sparse_attn_indexer\"],\"use_inductor\":true,\"compile_sizes\":[],\"inductor_compile_config\":{\"enable_auto_functionalized_v2\":false},\"inductor_passes\":{},\"cudagraph_mode\":[2,1],\"use_cudagraph\":true,\"cudagraph_num_of_warmups\":1,\"cudagraph_capture_sizes\":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"cudagraph_copy_inputs\":false,\"full_cuda_graph\":false,\"use_inductor_graph_partition\":false,\"pass_config\":{},\"max_capture_size\":512,\"local_cache_dir\":null}\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m ERROR 10-09 20:56:34 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:35 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m WARNING 10-09 20:56:35 [topk_topp_sampler.py:66] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:35 [gpu_model_runner.py:2602] Starting to load model facebook/opt-125m...\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [gpu_model_runner.py:2634] Loading model from scratch...\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [cuda.py:372] Using FlexAttention backend on V1 engine.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [weight_utils.py:392] Using model weights format ['*.safetensors', '*.bin', '*.pt']\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "767620c68528482ab3bdf71329541be1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading pt checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:37 [default_loader.py:267] Loading weights took 0.36 seconds\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:37 [gpu_model_runner.py:2653] Model loading took 0.2389 GiB and 0.897046 seconds\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:548] Using cache directory: /home/jtucker/.cache/vllm/torch_compile_cache/f0b3418c3f/rank_0_0/backbone for vLLM's torch.compile\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:559] Dynamo bytecode transform time: 2.90 s\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 0.366 s\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:42 [monitor.py:34] torch.compile takes 2.90 s in total\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [gpu_worker.py:298] Available KV cache memory: 12.36 GiB\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [kv_cache_utils.py:1087] GPU KV cache size: 359,952 tokens\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [kv_cache_utils.py:1091] Maximum concurrency for 2,048 tokens per request: 175.76x\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m WARNING 10-09 20:56:43 [gpu_model_runner.py:3663] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with FlexAttentionMetadataBuilder backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|█| 67/67 [00:01<00\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:46 [gpu_model_runner.py:3480] Graph capturing finished in 2 secs, took 0.17 GiB\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:46 [core.py:210] init engine (profile, create kv cache, warmup model) took 8.33 seconds\n",
"INFO 10-09 20:56:46 [llm.py:306] Supported_tasks: ['generate']\n"
]
}
],
"source": [
"llm = LLM(model=\"facebook/opt-125m\")"
]
},
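{
"cell_type": "markdown",
"id": "d9c6b3e4-3333-4c4d-9e5f-7a8b9c0d1e2f",
"metadata": {},
"source": [
"`LLM(model=\"facebook/opt-125m\")` leaves every other engine option at its default. As a minimal sketch (not executed here, to avoid allocating a second engine next to the one above), the next cell collects a few constructor arguments vLLM exposes for precision and memory control (`dtype`, `gpu_memory_utilization`, `max_model_len`, and `enforce_eager`) with illustrative values."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0d7c4f5-4444-4d5e-af60-8b9c0d1e2f30",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: alternative engine arguments, collected in a dict rather than\n",
"# re-instantiating LLM (the engine above already holds the GPU memory).\n",
"alt_engine_kwargs = dict(\n",
"    model=\"facebook/opt-125m\",\n",
"    dtype=\"float16\",             # weight/activation precision\n",
"    gpu_memory_utilization=0.8,  # fraction of GPU memory vLLM may claim\n",
"    max_model_len=1024,          # cap on the context length\n",
"    enforce_eager=True,          # skip CUDA graph capture for faster startup\n",
")\n",
"# llm = LLM(**alt_engine_kwargs)"
]
},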
{
"cell_type": "markdown",
"id": "3cc7e0a7-b9db-4f9c-9484-c81d7fbf20b4",
"metadata": {},
"source": [
"# generate"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9eafd8fc-f082-492e-aaef-7aa8f1d06381",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c109d979a7df47c29569957c0132cf04",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Adding requests: 0%| | 0/4 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ae3b018b838a465292f1aaa3677f72e8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processed prompts: 0%| | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, o"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generated Outputs:\n",
"------------------------------------------------------------\n",
"Prompt: 'Hello, my name is'\n",
"Output: ' Joel, I am a 4yo, I am very naughty, and I like'\n",
"------------------------------------------------------------\n",
"Prompt: 'The president of the United States is'\n",
"Output: ' reportedly holding back from issuing a statement about the Ukraine crisis after he called the International'\n",
"------------------------------------------------------------\n",
"Prompt: 'The capital of France is'\n",
"Output: ' the capital of the French colony of Ireland.\\nLaws of France are not'\n",
"------------------------------------------------------------\n",
"Prompt: 'The future of AI is'\n",
"Output: \" in the hands of the vast majority of people - you've probably seen it already\"\n",
"------------------------------------------------------------\n"
]
}
],
"source": [
"sampling_params = SamplingParams(temperature=TEMPERATURE, top_p=TOP_P)\n",
"outputs = llm.generate(PROMPTS, sampling_params)\n",
"print(\"\\nGenerated Outputs:\\n\" + \"-\" * 60)\n",
"for output in outputs:\n",
" prompt = output.prompt\n",
" generated_text = output.outputs[0].text\n",
" print(f\"Prompt: {prompt!r}\")\n",
" print(f\"Output: {generated_text!r}\")\n",
" print(\"-\" * 60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}