{
"cells": [
{
"cell_type": "markdown",
"id": "2cfaa4ab-d866-4dc2-8a48-df81fa838347",
"metadata": {},
"source": [
"# imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8f768035-d0aa-49d2-a176-22cf440c0ac7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:24 [__init__.py:216] Automatically detected platform cuda.\n"
]
}
],
"source": [
"from vllm import LLM, SamplingParams"
]
},
{
"cell_type": "markdown",
"id": "0556b47c-0474-4323-ba02-63669a9efada",
"metadata": {},
"source": [
"# constants"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eb6e9a30-838e-4224-b93d-fefe0f394fb2",
"metadata": {},
"outputs": [],
"source": [
"PROMPTS = [\n",
" \"Hello, my name is\",\n",
" \"The president of the United States is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
"]\n",
"TEMPERATURE = 0.8\n",
"TOP_P = 0.95"
]
},
{
"cell_type": "markdown",
"id": "29d4a29f-6423-4d6f-8179-78f584a5682e",
"metadata": {},
"source": [
"# load model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cf1fa66b-536f-42fd-966c-5126f306cb10",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:27 [utils.py:233] non-default args: {'disable_log_stats': True, 'model': 'facebook/opt-125m'}\n",
"INFO 10-09 20:56:27 [model.py:547] Resolved architecture: OPTForCausalLM\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`torch_dtype` is deprecated! Use `dtype` instead!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 10-09 20:56:27 [model.py:1510] Using max model len 2048\n",
"INFO 10-09 20:56:31 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:32 [core.py:644] Waiting for init message from front-end.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:32 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=facebook/opt-125m, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={\"level\":3,\"debug_dump_path\":\"\",\"cache_dir\":\"\",\"backend\":\"\",\"custom_ops\":[],\"splitting_ops\":[\"vllm.unified_attention\",\"vllm.unified_attention_with_output\",\"vllm.mamba_mixer2\",\"vllm.mamba_mixer\",\"vllm.short_conv\",\"vllm.linear_attention\",\"vllm.plamo2_mamba_mixer\",\"vllm.gdn_attention\",\"vllm.sparse_attn_indexer\"],\"use_inductor\":true,\"compile_sizes\":[],\"inductor_compile_config\":{\"enable_auto_functionalized_v2\":false},\"inductor_passes\":{},\"cudagraph_mode\":[2,1],\"use_cudagraph\":true,\"cudagraph_num_of_warmups\":1,\"cudagraph_capture_sizes\":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"cudagraph_copy_inputs\":false,\"full_cuda_graph\":false,\"use_inductor_graph_partition\":false,\"pass_config\":{},\"max_capture_size\":512,\"local_cache_dir\":null}\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m ERROR 10-09 20:56:34 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:35 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m WARNING 10-09 20:56:35 [topk_topp_sampler.py:66] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:35 [gpu_model_runner.py:2602] Starting to load model facebook/opt-125m...\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [gpu_model_runner.py:2634] Loading model from scratch...\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [cuda.py:372] Using FlexAttention backend on V1 engine.\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:36 [weight_utils.py:392] Using model weights format ['*.safetensors', '*.bin', '*.pt']\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "767620c68528482ab3bdf71329541be1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading pt checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:37 [default_loader.py:267] Loading weights took 0.36 seconds\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:37 [gpu_model_runner.py:2653] Model loading took 0.2389 GiB and 0.897046 seconds\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:548] Using cache directory: /home/jtucker/.cache/vllm/torch_compile_cache/f0b3418c3f/rank_0_0/backbone for vLLM's torch.compile\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:559] Dynamo bytecode transform time: 2.90 s\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:41 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 0.366 s\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:42 [monitor.py:34] torch.compile takes 2.90 s in total\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [gpu_worker.py:298] Available KV cache memory: 12.36 GiB\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [kv_cache_utils.py:1087] GPU KV cache size: 359,952 tokens\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:43 [kv_cache_utils.py:1091] Maximum concurrency for 2,048 tokens per request: 175.76x\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m WARNING 10-09 20:56:43 [gpu_model_runner.py:3663] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with FlexAttentionMetadataBuilder backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|█| 67/67 [00:01<00\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:46 [gpu_model_runner.py:3480] Graph capturing finished in 2 secs, took 0.17 GiB\n",
"\u001b[1;36m(EngineCore_DP0 pid=3068)\u001b[0;0m INFO 10-09 20:56:46 [core.py:210] init engine (profile, create kv cache, warmup model) took 8.33 seconds\n",
"INFO 10-09 20:56:46 [llm.py:306] Supported_tasks: ['generate']\n"
]
}
],
"source": [
"llm = LLM(model=\"facebook/opt-125m\")"
]
},
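{
"cell_type": "markdown",
"id": "3f9c1a7e-5b2d-4c8a-9e1f-6a0b2c3d4e5f",
"metadata": {},
"source": [
"An unexecuted variant (sketch only): the same constructor with a couple of commonly used options spelled out. `gpu_memory_utilization` and `max_model_len` are standard `LLM()` arguments; the values below are illustrative, not the settings used in the run above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d2e4f6a-8b9c-4d0e-a1f2-3c4d5e6f7a8b",
"metadata": {},
"outputs": [],
"source": [
"# Unexecuted sketch: the same constructor with explicit resource settings\n",
"# (illustrative values; run this instead of the previous cell, not after it).\n",
"llm = LLM(\n",
"    model=\"facebook/opt-125m\",\n",
"    gpu_memory_utilization=0.90,  # fraction of GPU memory vLLM may use\n",
"    max_model_len=2048,           # maximum sequence length; bounds the KV cache\n",
")"
]
},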
{
"cell_type": "markdown",
"id": "3cc7e0a7-b9db-4f9c-9484-c81d7fbf20b4",
"metadata": {},
"source": [
"# generate"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9eafd8fc-f082-492e-aaef-7aa8f1d06381",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c109d979a7df47c29569957c0132cf04",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Adding requests: 0%| | 0/4 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ae3b018b838a465292f1aaa3677f72e8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processed prompts: 0%| | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, o"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generated Outputs:\n",
"------------------------------------------------------------\n",
"Prompt: 'Hello, my name is'\n",
"Output: ' Joel, I am a 4yo, I am very naughty, and I like'\n",
"------------------------------------------------------------\n",
"Prompt: 'The president of the United States is'\n",
"Output: ' reportedly holding back from issuing a statement about the Ukraine crisis after he called the International'\n",
"------------------------------------------------------------\n",
"Prompt: 'The capital of France is'\n",
"Output: ' the capital of the French colony of Ireland.\\nLaws of France are not'\n",
"------------------------------------------------------------\n",
"Prompt: 'The future of AI is'\n",
"Output: \" in the hands of the vast majority of people - you've probably seen it already\"\n",
"------------------------------------------------------------\n"
]
}
],
"source": [
"sampling_params = SamplingParams(temperature=TEMPERATURE, top_p=TOP_P)\n",
"outputs = llm.generate(PROMPTS, sampling_params)\n",
"print(\"\\nGenerated Outputs:\\n\" + \"-\" * 60)\n",
"for output in outputs:\n",
" prompt = output.prompt\n",
" generated_text = output.outputs[0].text\n",
" print(f\"Prompt: {prompt!r}\")\n",
" print(f\"Output: {generated_text!r}\")\n",
" print(\"-\" * 60)"
]
},
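{
"cell_type": "markdown",
"id": "a4b5c6d7-1e2f-4a3b-8c9d-0f1e2d3c4b5a",
"metadata": {},
"source": [
"A further unexecuted sketch: `SamplingParams` also accepts `max_tokens` to cap the completion length, and `temperature=0.0` gives greedy decoding. The names `greedy_params` and `greedy_outputs` below are illustrative only."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c9d0e1-2f3a-4b4c-9d5e-6f7a8b9c0d1e",
"metadata": {},
"outputs": [],
"source": [
"# Unexecuted sketch: greedy decoding with an explicit completion-length cap.\n",
"greedy_params = SamplingParams(temperature=0.0, max_tokens=32)\n",
"greedy_outputs = llm.generate(PROMPTS, greedy_params)\n",
"for output in greedy_outputs:\n",
"    print(f\"Prompt: {output.prompt!r}\")\n",
"    print(f\"Output: {output.outputs[0].text!r}\")"
]
}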
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}