Skip to content

Instantly share code, notes, and snippets.

{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "vllm",
"type": "debugpy",
"request": "launch",
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "vllm",
"type": "debugpy",
"request": "launch",
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "vllm",
"type": "debugpy",
"request": "launch",
Traceback (most recent call last):
File "/home/xiowei/vllm/vllm/v1/executor/multiproc_executor.py", line 465, in worker_busy_loop
output = func(*args, **kwargs)
File "/home/xiowei/vllm/vllm/v1/worker/tpu_worker.py", line 160, in determine_available_memory
self.model_runner.profile_run(self.model_runner.max_num_tokens)
File "/home/xiowei/vllm/vllm/v1/worker/tpu_model_runner.py", line 1166, in profile_run
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
File "/home/xiowei/vllm/vllm/model_executor/models/gemma3_mm.py", line 588, in get_multimodal_embeddings
return self._process_image_input(image_input)
File "/home/xiowei/vllm/vllm/model_executor/models/gemma3_mm.py", line 569, in _process_image_input
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "vllm",
"type": "debugpy",
"request": "launch",
WARNING:root:libtpu.so and TPU device found. Setting PJRT_DEVICE=TPU.
INFO 04-21 23:15:22 [__init__.py:239] Automatically detected platform tpu.
xw32 line246 engine_args=EngineArgs(model='google/gemma-3-4b-it', served_model_name=None, tokenizer='google/gemma-3-4b-it', hf_config_path=None, task='auto', skip_tokenizer_init=False, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path='', download_dir=None, load_format='auto', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', seed=None, max_model_len=None, distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=4, data_parallel_size=1, enable_expert_parallel=False, max_parallel_loading_workers=None, block_size=None, enable_prefix_caching=None, prefix_caching_hash_algo='builtin', disable_sliding_window=False, disable_cascade_attn=False, use_v2_block_manager=True, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, max_num_batched_tokens=256, max_num_partial_prefills=1, max_long_part
def selective_scan_ref(
u, # B L D
delta, # B L D
A, # D N
B, # B L N
C, # B L N
D, # D
h_init: jax.Array | None = None,
) -> jax.Array:
"""Reference function equivalent to pallas_selective_scan."""
# Default problem sizes for the equivalence check below. They are kept as
# module-level names so existing references to them keep working.
bs = 4
seq_len = 4096
d_inner = 1536
d_state = 16


def check_delta_a_equivalence(
    bs=bs,
    seq_len=seq_len,
    d_inner=d_inner,
    d_state=d_state,
    seed=0,
):
    """Check that two ways of forming exp(delta * A) give the same result.

    Draws a random ``delta`` of shape ``(bs, seq_len, d_inner)`` and a random
    ``A`` of shape ``(d_inner, d_state)``, then compares

    * ``exp(einsum('b l d, d n -> b l d n', delta, A))`` with
    * ``exp(delta[..., None] * A)`` (plain broadcasting).

    Args:
        bs: Size of the leading (``b``) axis of ``delta``.
        seq_len: Size of the ``l`` axis of ``delta``.
        d_inner: Size of the ``d`` axis shared by ``delta`` and ``A``.
        d_state: Size of the ``n`` axis of ``A``.
        seed: Integer seed used to build the PRNG key.

    Returns:
        ``True`` if both results agree under ``jnp.allclose`` with its
        default tolerances, ``False`` otherwise.
    """
    # Split the key so delta and A come from independent random streams;
    # drawing both from the very same key is discouraged by JAX.
    delta_key, a_key = random.split(random.key(seed))
    delta = random.uniform(
        delta_key, (bs, seq_len, d_inner), dtype=jnp.float32
    )
    A = random.uniform(a_key, (d_inner, d_state), dtype=jnp.float32)

    # No index is summed over in this einsum, so it is an outer-style
    # elementwise product with output shape (bs, seq_len, d_inner, d_state).
    deltaA_1 = jnp.exp(jnp.einsum('b l d, d n -> b l d n', delta, A))
    # Same product expressed through broadcasting: (b, l, d, 1) * (d, n).
    deltaA_2 = jnp.exp(delta[..., None] * A)
    return bool(jnp.allclose(deltaA_1, deltaA_2))


if __name__ == "__main__":
    # Run the full-size check only when executed as a script, so importing
    # this module does not allocate the large (b, l, d, n) tensors.
    assert check_delta_a_equivalence()