Created
April 14, 2026 19:36
-
-
Save gcr/45b85410ddbf21c153ab0364b6c1aee5 to your computer and use it in GitHub Desktop.
vllm bug reproduction: long batches
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Updating https://github.com/IBM/vLLM-Hook/ (HEAD) | |
| Updated https://github.com/IBM/vLLM-Hook/ (05494dad8a74db027a21b506b763b00577bd0763) | |
| INFO 04-14 19:30:29 [utils.py:233] non-default args: {'trust_remote_code': True, 'download_dir': './cache/', 'dtype': torch.float16, 'max_model_len': 20480, 'enable_prefix_caching': False, 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'enforce_eager': True, 'worker_cls': 'vllm_hook_plugins.workers.probe_hidden_states_worker.ProbeHiddenStatesWorker', 'model': 'Qwen/Qwen2.5-3B-Instruct'} | |
| WARNING 04-14 19:30:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_USE_V1 | |
| WARNING 04-14 19:30:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_HOOK_DIR | |
| WARNING 04-14 19:30:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_HOOK_FLAG | |
| WARNING 04-14 19:30:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_RUN_ID | |
| WARNING 04-14 19:30:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_HOOK_LAYERS | |
| WARNING 04-14 19:30:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_HOOK_HS_MODE | |
| INFO 04-14 19:30:29 [model.py:549] Resolved architecture: Qwen2ForCausalLM | |
| WARNING 04-14 19:30:29 [model.py:2016] Casting torch.bfloat16 to torch.float16. | |
| INFO 04-14 19:30:29 [model.py:1678] Using max model len 20480 | |
| INFO 04-14 19:30:29 [scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. | |
| INFO 04-14 19:30:29 [vllm.py:790] Asynchronous scheduling is enabled. | |
| WARNING 04-14 19:30:29 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none | |
| WARNING 04-14 19:30:29 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored. | |
| INFO 04-14 19:30:29 [vllm.py:1025] Cudagraph is disabled under eager mode | |
| INFO 04-14 19:30:29 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant | |
| (EngineCore pid=18015) INFO 04-14 19:30:36 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-3B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=20480, download_dir='./cache/', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-3B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 
'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} | |
| (EngineCore pid=18015) INFO 04-14 19:30:37 [parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.0.19:49913 backend=nccl | |
| (EngineCore pid=18015) INFO 04-14 19:30:37 [parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A | |
| (EngineCore pid=18015) INFO 04-14 19:30:37 [gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-3B-Instruct... | |
| (EngineCore pid=18015) INFO 04-14 19:30:38 [cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. | |
| (EngineCore pid=18015) INFO 04-14 19:30:38 [flash_attn.py:596] Using FlashAttention version 2 | |
| (EngineCore pid=18015) | |
| Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00<?, ?it/s] | |
| (EngineCore pid=18015) | |
| Loading safetensors checkpoint shards: 50% Completed | 1/2 [00:01<00:01, 1.93s/it] | |
| (EngineCore pid=18015) | |
| Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:03<00:00, 1.43s/it] | |
| (EngineCore pid=18015) | |
| Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:03<00:00, 1.50s/it] | |
| (EngineCore pid=18015) | |
| (EngineCore pid=18015) INFO 04-14 19:30:42 [default_loader.py:384] Loading weights took 3.03 seconds | |
| (EngineCore pid=18015) INFO 04-14 19:30:42 [gpu_model_runner.py:4820] Model loading took 5.79 GiB memory and 3.984216 seconds | |
| (EngineCore pid=18015) Installed 3 hidden-state hooks on layers: ['model.layers.15', 'model.layers.20', 'model.layers.30'] | |
| (EngineCore pid=18015) Hooks installed successfully | |
| (EngineCore pid=18015) INFO 04-14 19:30:45 [gpu_worker.py:436] Available KV cache memory: 21.1 GiB | |
| (EngineCore pid=18015) INFO 04-14 19:30:45 [kv_cache_utils.py:1319] GPU KV cache size: 614,448 tokens | |
| (EngineCore pid=18015) INFO 04-14 19:30:45 [kv_cache_utils.py:1324] Maximum concurrency for 20,480 tokens per request: 30.00x | |
| (EngineCore pid=18015) INFO 04-14 19:30:45 [core.py:283] init engine (profile, create kv cache, warmup model) took 2.74 seconds | |
| (EngineCore pid=18015) INFO 04-14 19:30:46 [vllm.py:790] Asynchronous scheduling is enabled. | |
| (EngineCore pid=18015) WARNING 04-14 19:30:46 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none | |
| (EngineCore pid=18015) WARNING 04-14 19:30:46 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored. | |
| (EngineCore pid=18015) INFO 04-14 19:30:46 [vllm.py:1025] Cudagraph is disabled under eager mode | |
| (EngineCore pid=18015) INFO 04-14 19:30:46 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant | |
| ====== SHORT BATCHES ====== | |
| laborum excepturi nemo necessitatibus temporibus | |
| quas inventore illum temporibus reiciendis | |
| est nostrum sint cumque numquam | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2574.16it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:19, 5.00it/s, est. speed input: 60.04 toks/s, output: 5.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 5.00it/s, est. speed input: 4947.24 toks/s, output: 486.92 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 486.55it/s, est. speed input: 4947.24 toks/s, output: 486.92 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3766.47it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:30, 3.27it/s, est. speed input: 39.27 toks/s, output: 32.72 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.27it/s, est. speed input: 3287.98 toks/s, output: 3236.14 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 323.45it/s, est. speed input: 3287.98 toks/s, output: 3236.14 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3743.48it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1097.77it/s, est. speed input: 11155.97 toks/s, output: 1097.95 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1095.98it/s, est. speed input: 11155.97 toks/s, output: 1097.95 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3691.45it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:27, 3.60it/s, est. speed input: 43.21 toks/s, output: 36.01 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.60it/s, est. speed input: 3627.23 toks/s, output: 3570.02 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 356.83it/s, est. speed input: 3627.23 toks/s, output: 3570.02 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3749.57it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1110.63it/s, est. speed input: 11286.72 toks/s, output: 1110.84 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1108.54it/s, est. speed input: 11286.72 toks/s, output: 1110.84 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3674.64it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:27, 3.65it/s, est. speed input: 43.84 toks/s, output: 36.53 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.65it/s, est. speed input: 3665.68 toks/s, output: 3607.86 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 360.58it/s, est. speed input: 3665.68 toks/s, output: 3607.86 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2764.16it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1161.45it/s, est. speed input: 11803.45 toks/s, output: 1161.68 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1159.34it/s, est. speed input: 11803.45 toks/s, output: 1161.68 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2712.78it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:26, 3.67it/s, est. speed input: 44.09 toks/s, output: 36.74 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.67it/s, est. speed input: 3694.45 toks/s, output: 3636.19 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 363.45it/s, est. speed input: 3694.45 toks/s, output: 3636.19 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2691.19it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1182.87it/s, est. speed input: 12020.78 toks/s, output: 1183.07 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1180.92it/s, est. speed input: 12020.78 toks/s, output: 1183.07 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2701.35it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:26, 3.72it/s, est. speed input: 44.60 toks/s, output: 37.16 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.72it/s, est. speed input: 3733.17 toks/s, output: 3674.30 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 367.24it/s, est. speed input: 3733.17 toks/s, output: 3674.30 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2652.51it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1105.68it/s, est. speed input: 11236.27 toks/s, output: 1105.87 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1103.80it/s, est. speed input: 11236.27 toks/s, output: 1105.87 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2703.18it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:26, 3.68it/s, est. speed input: 44.14 toks/s, output: 36.78 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.68it/s, est. speed input: 3687.52 toks/s, output: 3629.36 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 362.72it/s, est. speed input: 3687.52 toks/s, output: 3629.36 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2661.16it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1210.46it/s, est. speed input: 12301.40 toks/s, output: 1210.69 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1208.13it/s, est. speed input: 12301.40 toks/s, output: 1210.69 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2916.03it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:26, 3.70it/s, est. speed input: 44.45 toks/s, output: 37.04 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.70it/s, est. speed input: 3724.89 toks/s, output: 3666.14 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 366.36it/s, est. speed input: 3724.89 toks/s, output: 3666.14 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3648.36it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1094.54it/s, est. speed input: 11123.12 toks/s, output: 1094.73 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1092.65it/s, est. speed input: 11123.12 toks/s, output: 1094.73 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3590.03it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:27, 3.60it/s, est. speed input: 43.27 toks/s, output: 36.05 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.60it/s, est. speed input: 3628.61 toks/s, output: 3571.39 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 356.95it/s, est. speed input: 3628.61 toks/s, output: 3571.39 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3508.62it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1131.45it/s, est. speed input: 11498.16 toks/s, output: 1131.64 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1129.35it/s, est. speed input: 11498.16 toks/s, output: 1131.64 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3691.58it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:26, 3.69it/s, est. speed input: 44.27 toks/s, output: 36.89 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.69it/s, est. speed input: 3707.18 toks/s, output: 3648.71 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 364.67it/s, est. speed input: 3707.18 toks/s, output: 3648.71 toks/s] | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 3339.65it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1122.44it/s, est. speed input: 11407.66 toks/s, output: 1122.74 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 1120.60it/s, est. speed input: 11407.66 toks/s, output: 1122.74 toks/s] | |
| Hooks deactivated. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 2641.95it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:27, 3.64it/s, est. speed input: 43.70 toks/s, output: 36.41 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 3.64it/s, est. speed input: 3658.71 toks/s, output: 3601.02 toks/s] | |
| Processed prompts: 100%|██████████| 100/100 [00:00<00:00, 359.93it/s, est. speed input: 3658.71 toks/s, output: 3601.02 toks/s] | |
| ====== LONG BATCHES ====== | |
| dicta deserunt vel sequi tempora fugiat omnis itaque a dolores animi illum cum nisi exercitationem eveniet hic necessitatibus molestiae excepturi id odio rerum nam atque magni provident expedita ea ex rem quam esse dolorem repudiandae officiis saepe dolorum maiores doloribus libero nulla iste quis possimus fuga soluta ratione culpa unde delectus eligendi iure neque iusto quaerat perspiciatis harum veniam sit dolor minima ipsum nobis laudantium facilis in aspernatur ut vero voluptatibus consequatur voluptatum laboriosam voluptatem similique maxime porro error nostrum optio cupiditate eaque commodi eius natus asperiores sapiente autem temporibus consequuntur consectetur corporis quas sed obcaecati eum recusandae beatae facere | |
| voluptate laborum sed pariatur libero consequuntur molestiae necessitatibus reiciendis deserunt voluptas et mollitia nulla ullam sunt tenetur temporibus facilis at eveniet iure unde doloremque velit quaerat labore distinctio vero architecto repudiandae dolor officia quos nisi praesentium nam autem esse blanditiis exercitationem hic nostrum aut beatae maiores quibusdam minima soluta tempora saepe id corrupti excepturi odio perferendis doloribus amet expedita quo ipsum asperiores possimus incidunt harum nesciunt neque vel explicabo provident aspernatur recusandae inventore eius quam vitae aperiam omnis rerum assumenda ratione perspiciatis facere enim consectetur culpa dolore maxime natus magnam ea quis voluptates animi ad dolorem alias nemo tempore voluptatibus | |
| consequuntur fugit voluptas similique incidunt alias vitae perferendis delectus impedit nisi ad id eius ea aperiam quidem tempora quo rerum rem odit deleniti distinctio voluptates quis nostrum quas eveniet exercitationem officia corrupti enim placeat quae consequatur odio asperiores nesciunt repudiandae magnam ab voluptatum libero doloribus adipisci reprehenderit omnis dolorum autem aut debitis sunt quam explicabo unde aliquam provident ipsam dolorem ducimus laudantium error illo deserunt dignissimos voluptatibus ullam culpa veritatis officiis corporis obcaecati possimus quibusdam vero nemo neque non quisquam accusamus molestias ratione est nulla minus sit optio sed quia fugiat dicta sequi mollitia a suscipit recusandae totam iure necessitatibus | |
| Logged run ID. | |
| Created hook flag. | |
| Rendering prompts: 0%| | 0/100 [00:00<?, ?it/s] | |
| Rendering prompts: 100%|██████████| 100/100 [00:00<00:00, 1289.29it/s] | |
| Processed prompts: 0%| | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] | |
| Processed prompts: 1%| | 1/100 [00:00<00:17, 5.72it/s, est. speed input: 1098.01 toks/s, output: 5.72 toks/s](EngineCore pid=18015) ERROR 04-14 19:30:51 [dump_input.py:72] Dumping input data for V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-3B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=20480, download_dir='./cache/', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-3B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 
'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}, | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [dump_input.py:79] Dumping scheduler output for model execution: SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=2044-b8218796,prompt_token_ids_len=198,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3490, 3491, 3492, 3493, 3494, 3495, 3496, 3497, 3498, 3499, 3500, 3501, 3502],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2045-9b42f813,prompt_token_ids_len=192,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3503, 3504, 3505, 3506, 3507, 3508, 3509, 3510, 3511, 3512, 3513, 3514],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2046-9b825d98,prompt_token_ids_len=203,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, 
min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3515, 3516, 3517, 3518, 3519, 3520, 3521, 3522, 3523, 3524, 3525, 3526, 3527],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2047-b88c2641,prompt_token_ids_len=193,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3528, 3529, 3530, 3531, 3532, 3533, 3534, 3535, 3536, 3537, 3538, 3539, 3540],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2048-88794fbd,prompt_token_ids_len=194,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3541, 3542, 3543, 3544, 3545, 3546, 3547, 3548, 3549, 3550, 3551, 3552, 3553],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2049-a93587e2,prompt_token_ids_len=189,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, 
top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3554, 3555, 3556, 3557, 3558, 3559, 3560, 3561, 3562, 3563, 3564, 3565],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2050-bda1d854,prompt_token_ids_len=194,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3566, 3567, 3568, 3569, 3570, 3571, 3572, 3573, 3574, 3575, 3576, 3577, 3578],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2051-a1795cf6,prompt_token_ids_len=198,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3579, 3580, 3581, 3582, 3583, 3584, 3585, 3586, 3587, 3588, 3589, 3590, 3591],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), 
NewRequestData(req_id=2052-b561229a,prompt_token_ids_len=205,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3592, 3593, 3594, 3595, 3596, 3597, 3598, 3599, 3600, 3601, 3602, 3603, 3604],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2053-ba229a96,prompt_token_ids_len=207,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3605, 3606, 3607, 3608, 3609, 3610, 3611, 3612, 3613, 3614, 3615, 3616, 3617],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2054-9fcddb51,prompt_token_ids_len=189,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, 
extra_args=None),block_ids=([3618, 3619, 3620, 3621, 3622, 3623, 3624, 3625, 3626, 3627, 3628, 3629],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2055-a3f3a119,prompt_token_ids_len=192,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3630, 3631, 3632, 3633, 3634, 3635, 3636, 3637, 3638, 3639, 3640, 3641],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2056-becfb274,prompt_token_ids_len=193,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3642, 3643, 3644, 3645, 3646, 3647, 3648, 3649, 3650, 3651, 3652, 3653, 3654],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2057-861bd315,prompt_token_ids_len=187,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, 
ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3655, 3656, 3657, 3658, 3659, 3660, 3661, 3662, 3663, 3664, 3665, 3666],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2058-8eeee144,prompt_token_ids_len=195,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3667, 3668, 3669, 3670, 3671, 3672, 3673, 3674, 3675, 3676, 3677, 3678, 3679],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2059-b62632f1,prompt_token_ids_len=190,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3680, 3681, 3682, 3683, 3684, 3685, 3686, 3687, 3688, 3689, 3690, 3691],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2060-a87b758c,prompt_token_ids_len=195,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, 
temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3692, 3693, 3694, 3695, 3696, 3697, 3698, 3699, 3700, 3701, 3702, 3703, 3704],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2061-a62f6842,prompt_token_ids_len=194,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3705, 3706, 3707, 3708, 3709, 3710, 3711, 3712, 3713, 3714, 3715, 3716, 3717],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2062-a542a2a1,prompt_token_ids_len=198,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3718, 3719, 3720, 3721, 3722, 3723, 3724, 3725, 3726, 3727, 3728, 3729, 3730],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), 
NewRequestData(req_id=2063-98f23884,prompt_token_ids_len=191,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3731, 3732, 3733, 3734, 3735, 3736, 3737, 3738, 3739, 3740, 3741, 3742],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2064-9bd1562b,prompt_token_ids_len=191,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3743, 3744, 3745, 3746, 3747, 3748, 3749, 3750, 3751, 3752, 3753, 3754],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2065-9c4d4728,prompt_token_ids_len=195,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, 
extra_args=None),block_ids=([3755, 3756, 3757, 3758, 3759, 3760, 3761, 3762, 3763, 3764, 3765, 3766, 3767],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2066-8451469f,prompt_token_ids_len=186,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3768, 3769, 3770, 3771, 3772, 3773, 3774, 3775, 3776, 3777, 3778, 3779],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2067-af29f731,prompt_token_ids_len=190,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3780, 3781, 3782, 3783, 3784, 3785, 3786, 3787, 3788, 3789, 3790, 3791],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2068-b06c9747,prompt_token_ids_len=204,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, 
ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3792, 3793, 3794, 3795, 3796, 3797, 3798, 3799, 3800, 3801, 3802, 3803, 3804],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2069-a8487070,prompt_token_ids_len=197,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3805, 3806, 3807, 3808, 3809, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2070-a7c2d1d3,prompt_token_ids_len=193,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3818, 3819, 3820, 3821, 3822, 3823, 3824, 3825, 3826, 3827, 3828, 3829, 3830],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2071-863139ad,prompt_token_ids_len=196,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, 
temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3831, 3832, 3833, 3834, 3835, 3836, 3837, 3838, 3839, 3840, 3841, 3842, 3843],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2072-96af88d2,prompt_token_ids_len=196,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3844, 3845, 3846, 3847, 3848, 3849, 3850, 3851, 3852, 3853, 3854, 3855, 3856],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2073-bf63bbdc,prompt_token_ids_len=193,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), 
NewRequestData(req_id=2074-a0d0fec9,prompt_token_ids_len=192,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2075-9442b2b1,prompt_token_ids_len=202,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3882, 3883, 3884, 3885, 3886, 3887, 3888, 3889, 3890, 3891, 3892, 3893, 3894],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2076-bef91e27,prompt_token_ids_len=186,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, 
extra_args=None),block_ids=([3895, 3896, 3897, 3898, 3899, 3900, 3901, 3902, 3903, 3904, 3905, 3906],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2077-8061f48d,prompt_token_ids_len=184,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3907, 3908, 3909, 3910, 3911, 3912, 3913, 3914, 3915, 3916, 3917, 3918],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2078-8a4934e0,prompt_token_ids_len=198,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3919, 3920, 3921, 3922, 3923, 3924, 3925, 3926, 3927, 3928, 3929, 3930, 3931],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2079-a28b2e92,prompt_token_ids_len=198,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, 
ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3932, 3933, 3934, 3935, 3936, 3937, 3938, 3939, 3940, 3941, 3942, 3943, 3944],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2080-a82a711b,prompt_token_ids_len=184,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952, 3953, 3954, 3955, 3956],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2081-b034f0fd,prompt_token_ids_len=189,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3957, 3958, 3959, 3960, 3961, 3962, 3963, 3964, 3965, 3966, 3967, 3968],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2082-a609022b,prompt_token_ids_len=191,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, 
temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3969, 3970, 3971, 3972, 3973, 3974, 3975, 3976, 3977, 3978, 3979, 3980],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2083-a4a2c089,prompt_token_ids_len=193,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3981, 3982, 3983, 3984, 3985, 3986, 3987, 3988, 3989, 3990, 3991, 3992, 3993],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), NewRequestData(req_id=2084-a7480792,prompt_token_ids_len=199,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([3994, 3995, 3996, 3997, 3998, 3999, 4000, 4001, 4002, 4003, 4004, 4005, 4006],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None), 
NewRequestData(req_id=2085-b8286a10,prompt_token_ids_len=191,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None),block_ids=([4007, 4008, 4009, 4010],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None)], scheduled_cached_reqs=CachedRequestData(req_ids=['2043-8f8151aa'],resumed_req_ids=set(),new_token_ids_lens=[],all_token_ids_lens={},new_block_ids=[([3478, 3479, 3480, 3481, 3482, 3483, 3484, 3485, 3486, 3487, 3488, 3489],)],num_computed_tokens=[10],num_output_tokens=[0]), num_scheduled_tokens={2085-b8286a10: 50, 2063-98f23884: 191, 2061-a62f6842: 194, 2079-a28b2e92: 198, 2083-a4a2c089: 193, 2057-861bd315: 187, 2051-a1795cf6: 198, 2071-863139ad: 196, 2084-a7480792: 199, 2069-a8487070: 197, 2065-9c4d4728: 195, 2055-a3f3a119: 192, 2059-b62632f1: 190, 2068-b06c9747: 204, 2081-b034f0fd: 189, 2044-b8218796: 198, 2064-9bd1562b: 191, 2076-bef91e27: 186, 2080-a82a711b: 184, 2077-8061f48d: 184, 2043-8f8151aa: 188, 2060-a87b758c: 195, 2070-a7c2d1d3: 193, 2067-af29f731: 190, 2075-9442b2b1: 202, 2054-9fcddb51: 189, 2050-bda1d854: 194, 2066-8451469f: 186, 2082-a609022b: 191, 2048-88794fbd: 194, 2046-9b825d98: 203, 2047-b88c2641: 193, 2058-8eeee144: 195, 2072-96af88d2: 196, 2049-a93587e2: 189, 2062-a542a2a1: 198, 2053-ba229a96: 207, 2073-bf63bbdc: 193, 2056-becfb274: 193, 2052-b561229a: 205, 2078-8a4934e0: 198, 2045-9b42f813: 192, 2074-a0d0fec9: 192}, total_num_scheduled_tokens=8192, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[0], finished_req_ids=['2000-8003b899'], 
free_encoder_mm_hashes=[], preempted_req_ids=[], has_structured_output_requests=false, pending_structured_output_tokens=false, num_invalid_spec_tokens=null, kv_connector_metadata=null, ec_connector_metadata=null, new_block_ids_to_zero=null) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] EngineCore encountered a fatal error. | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] Traceback (most recent call last): | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 1101, in run_engine_core | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] engine_core.run_busy_loop() | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 1142, in run_busy_loop | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] self._process_engine_step() | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 1181, in _process_engine_step | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] outputs, model_executed = self.step_fn() | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 451, in step_with_batch_queue | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] exec_future = self.model_executor.execute_model( | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] scheduler_output, non_block=True | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/executor/uniproc_executor.py", line 114, in execute_model | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] output.result() | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.local/share/uv/python/cpython-3.13.9-linux-x86_64-gnu/lib/python3.13/concurrent/futures/_base.py", line 449, in result | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return self.__get_result() | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.local/share/uv/python/cpython-3.13.9-linux-x86_64-gnu/lib/python3.13/concurrent/futures/_base.py", line 401, in __get_result | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] raise self._exception | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/executor/uniproc_executor.py", line 84, in collective_rpc | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] result = run_method(self.driver_worker, method, args, kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/serial_utils.py", line 510, in run_method | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return func(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 332, in execute_model | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return self.worker.execute_model(scheduler_output) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/workers/probe_hidden_states_worker.py", line 166, in execute_model | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return super().execute_model(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return func(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 803, in execute_model | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] output = self.model_runner.execute_model( | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] scheduler_output, intermediate_tensors | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return func(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4034, in execute_model | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] model_output = self._model_forward( | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] input_ids=input_ids, | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ...<3 lines>... | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] **model_kwargs, | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/gpu_model_runner.py", line 3515, in _model_forward | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return self.model( | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] input_ids=input_ids, | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ...<3 lines>... | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] **model_kwargs, | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return self._call_impl(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return forward_call(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/model_executor/models/qwen2.py", line 583, in forward | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] hidden_states = self.model( | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] input_ids, positions, intermediate_tensors, inputs_embeds | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/compilation/decorators.py", line 452, in __call__ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return self.forward(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/model_executor/models/qwen2.py", line 444, in forward | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] hidden_states, residual = layer(positions, hidden_states, residual) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return self._call_impl(*args, **kwargs) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1882, in _call_impl | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] return inner() | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1843, in inner | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] hook_result = hook(self, args, result) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/workers/probe_hidden_states_worker.py", line 149, in <lambda> | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] lambda m, i, o, n=name, ln=layer_num: hs_hook(o, n, ln) | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~~^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/workers/probe_hidden_states_worker.py", line 117, in hs_hook | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] hidden[last_indices[i + 1] - 1].detach().cpu() | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] ~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ERROR 04-14 19:30:51 [core.py:1110] IndexError: index 8201 is out of bounds for dimension 0 with size 8192 | |
| (EngineCore pid=18015) Process EngineCore: | |
| Traceback (most recent call last): | |
| (EngineCore pid=18015) Traceback (most recent call last): | |
| File "/home/kimmy/vllm-hook-bug-demo/./repeated_run.py", line 79, in <module> | |
| result = llm.generate(test_cases, temperature=0.0, max_tokens=10) | |
| File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/hook_llm.py", line 127, in generate | |
| return self._generate_with_hooks( | |
| ~~~~~~~~~~~~~~~~~~~~~~~~~^ | |
| prompts, sampling_params, cleanup, | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ...<2 lines>... | |
| **kwargs | |
| ^^^^^^^^ | |
| ) | |
| ^ | |
| File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/hook_llm.py", line 152, in _generate_with_hooks | |
| self.llm.generate(prompts, prefill_params) | |
| ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/entrypoints/llm.py", line 500, in generate | |
| return self._run_completion( | |
| ~~~~~~~~~~~~~~~~~~~~^ | |
| prompts=prompts, | |
| ^^^^^^^^^^^^^^^^ | |
| ...<5 lines>... | |
| priority=priority, | |
| ^^^^^^^^^^^^^^^^^^ | |
| ) | |
| ^ | |
| File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/entrypoints/llm.py", line 1859, in _run_completion | |
| return self._run_engine(use_tqdm=use_tqdm, output_type=output_type) | |
| ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/entrypoints/llm.py", line 2011, in _run_engine | |
| step_outputs = self.llm_engine.step() | |
| File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/llm_engine.py", line 302, in step | |
| outputs = self.engine_core.get_output() | |
| File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core_client.py", line 780, in get_output | |
| raise self._format_exception(outputs) from None | |
| vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause. | |
| (EngineCore pid=18015) File "/home/kimmy/.local/share/uv/python/cpython-3.13.9-linux-x86_64-gnu/lib/python3.13/multiprocessing/process.py", line 313, in _bootstrap | |
| (EngineCore pid=18015) self.run() | |
| (EngineCore pid=18015) ~~~~~~~~^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.local/share/uv/python/cpython-3.13.9-linux-x86_64-gnu/lib/python3.13/multiprocessing/process.py", line 108, in run | |
| (EngineCore pid=18015) self._target(*self._args, **self._kwargs) | |
| (EngineCore pid=18015) ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core | |
| (EngineCore pid=18015) raise e | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 1101, in run_engine_core | |
| (EngineCore pid=18015) engine_core.run_busy_loop() | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 1142, in run_busy_loop | |
| (EngineCore pid=18015) self._process_engine_step() | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 1181, in _process_engine_step | |
| (EngineCore pid=18015) outputs, model_executed = self.step_fn() | |
| (EngineCore pid=18015) ~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/engine/core.py", line 451, in step_with_batch_queue | |
| (EngineCore pid=18015) exec_future = self.model_executor.execute_model( | |
| (EngineCore pid=18015) scheduler_output, non_block=True | |
| (EngineCore pid=18015) ) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/executor/uniproc_executor.py", line 114, in execute_model | |
| (EngineCore pid=18015) output.result() | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.local/share/uv/python/cpython-3.13.9-linux-x86_64-gnu/lib/python3.13/concurrent/futures/_base.py", line 449, in result | |
| (EngineCore pid=18015) return self.__get_result() | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~~~~~^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.local/share/uv/python/cpython-3.13.9-linux-x86_64-gnu/lib/python3.13/concurrent/futures/_base.py", line 401, in __get_result | |
| (EngineCore pid=18015) raise self._exception | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/executor/uniproc_executor.py", line 84, in collective_rpc | |
| (EngineCore pid=18015) result = run_method(self.driver_worker, method, args, kwargs) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/serial_utils.py", line 510, in run_method | |
| (EngineCore pid=18015) return func(*args, **kwargs) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/worker_base.py", line 332, in execute_model | |
| (EngineCore pid=18015) return self.worker.execute_model(scheduler_output) | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/workers/probe_hidden_states_worker.py", line 166, in execute_model | |
| (EngineCore pid=18015) return super().execute_model(*args, **kwargs) | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context | |
| (EngineCore pid=18015) return func(*args, **kwargs) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/gpu_worker.py", line 803, in execute_model | |
| (EngineCore pid=18015) output = self.model_runner.execute_model( | |
| (EngineCore pid=18015) scheduler_output, intermediate_tensors | |
| (EngineCore pid=18015) ) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context | |
| (EngineCore pid=18015) return func(*args, **kwargs) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/gpu_model_runner.py", line 4034, in execute_model | |
| (EngineCore pid=18015) model_output = self._model_forward( | |
| (EngineCore pid=18015) input_ids=input_ids, | |
| (EngineCore pid=18015) ...<3 lines>... | |
| (EngineCore pid=18015) **model_kwargs, | |
| (EngineCore pid=18015) ) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/v1/worker/gpu_model_runner.py", line 3515, in _model_forward | |
| (EngineCore pid=18015) return self.model( | |
| (EngineCore pid=18015) ~~~~~~~~~~^ | |
| (EngineCore pid=18015) input_ids=input_ids, | |
| (EngineCore pid=18015) ^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ...<3 lines>... | |
| (EngineCore pid=18015) **model_kwargs, | |
| (EngineCore pid=18015) ^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) ) | |
| (EngineCore pid=18015) ^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl | |
| (EngineCore pid=18015) return self._call_impl(*args, **kwargs) | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl | |
| (EngineCore pid=18015) return forward_call(*args, **kwargs) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/model_executor/models/qwen2.py", line 583, in forward | |
| (EngineCore pid=18015) hidden_states = self.model( | |
| (EngineCore pid=18015) input_ids, positions, intermediate_tensors, inputs_embeds | |
| (EngineCore pid=18015) ) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/compilation/decorators.py", line 452, in __call__ | |
| (EngineCore pid=18015) return self.forward(*args, **kwargs) | |
| (EngineCore pid=18015) ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm/model_executor/models/qwen2.py", line 444, in forward | |
| (EngineCore pid=18015) hidden_states, residual = layer(positions, hidden_states, residual) | |
| (EngineCore pid=18015) ~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl | |
| (EngineCore pid=18015) return self._call_impl(*args, **kwargs) | |
| (EngineCore pid=18015) ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1882, in _call_impl | |
| (EngineCore pid=18015) return inner() | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1843, in inner | |
| (EngineCore pid=18015) hook_result = hook(self, args, result) | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/workers/probe_hidden_states_worker.py", line 149, in <lambda> | |
| (EngineCore pid=18015) lambda m, i, o, n=name, ln=layer_num: hs_hook(o, n, ln) | |
| (EngineCore pid=18015) ~~~~~~~^^^^^^^^^^ | |
| (EngineCore pid=18015) File "/home/kimmy/.cache/uv/environments-v2/repeated-run-f38d1a8bfa3607e8/lib/python3.13/site-packages/vllm_hook_plugins/workers/probe_hidden_states_worker.py", line 117, in hs_hook | |
| (EngineCore pid=18015) hidden[last_indices[i + 1] - 1].detach().cpu() | |
| (EngineCore pid=18015) ~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| (EngineCore pid=18015) IndexError: index 8201 is out of bounds for dimension 0 with size 8192 | |
| Processed prompts: 1%| | 1/100 [00:00<01:27, 1.13it/s, est. speed input: 1098.01 toks/s, output: 5.72 toks/s] | |
| /home/kimmy/.local/share/uv/python/cpython-3.13.9-linux-x86_64-gnu/lib/python3.13/multiprocessing/resource_tracker.py:324: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown: {'/mp-2uz7v44g'} | |
| warnings.warn( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env -S -- uv run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "lorem-text",
#     # Requires CUDA on Linux
#     "vllm-hook-plugins ; sys_platform == 'linux'",
#     "vllm>=0.19.0 ; sys_platform == 'linux'",
# ]
# [tool.uv.sources]
# vllm-hook-plugins = { git = "https://github.com/IBM/vLLM-Hook/", subdirectory = "vllm_hook_plugins/" }
# ///
"""Reproduction script for a vLLM-Hook failure on long prompt batches.

Runs the same generate/analyze cycle twice: once with short prompts
(5 lorem words each) and once with long prompts (100 words each).
The short phase completes; the long phase is where the reported
IndexError inside the hidden-state hook is observed (see log above).
"""
import json
import multiprocessing as mp
import os
from pathlib import Path

import torch
from lorem_text import lorem

# Must happen before vllm_hook_plugins is imported: force spawn-based
# workers and the v1 engine, which the hook plugin targets.
mp.set_start_method("spawn", force=True)
os.environ["VLLM_USE_V1"] = "1"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm_hook_plugins import HookLLM

if __name__ == "__main__":
    cache_dir = "./cache/"
    model = "Qwen/Qwen2.5-3B-Instruct"

    # Write the probe configuration the hook worker reads at startup:
    # capture last-token hidden states from three layers.
    config_path = Path(cache_dir) / "model_config.json"
    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(json.dumps({
        "model_info": {
            "name": model
        },
        "hidden_states": {
            "layers": [15, 20, 30],
            "mode": "last_token"
        }
    }))

    llm = HookLLM(
        model=model,
        worker_name="probe_hidden_states",
        analyzer_name="hidden_states",
        config_file=str(config_path),
        download_dir=cache_dir,
        gpu_memory_utilization=0.7,
        max_model_len=20480,
        trust_remote_code=True,
        dtype=torch.float16,
        enable_prefix_caching=False,
        enable_hook=True,
        tensor_parallel_size=1,
    )

    # Same procedure for both phases; only the prompt length differs.
    for label, word_count in (("SHORT BATCHES", 5), ("LONG BATCHES", 100)):
        print(f"====== {label} ======")
        prompts = [lorem.words(word_count) for _ in range(100)]
        print("\n".join(prompts[:3]))
        # Repeat to show the behavior is consistent across runs,
        # not a one-off warm-up effect.
        for _ in range(10):
            result = llm.generate(prompts, temperature=0.0, max_tokens=10)
            stats = llm.analyze(analyzer_spec={"reduce": "none"})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment