@gcr · Created April 14, 2026 19:39

#!/usr/bin/env -S -- uv run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "lorem-text",
#     # Requires CUDA on Linux
#     "vllm-hook-plugins ; sys_platform == 'linux'",
#     "vllm>=0.19.0 ; sys_platform == 'linux'",
# ]
#
# [tool.uv.sources]
# vllm-hook-plugins = { git = "https://github.com/IBM/vLLM-Hook/", subdirectory = "vllm_hook_plugins/" }
# ///
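
# Run with `uv run <this file>`, or execute it directly: the shebang invokes
# `uv run --script`, and uv installs the inline dependencies declared above on
# the fly. The vLLM packages are gated to Linux, so the hook path needs a
# CUDA-capable Linux machine.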
import json
import multiprocessing as mp
import os
from pathlib import Path

import torch  # pulled in transitively by vllm
from lorem_text import lorem

# vLLM launches worker processes; the start method and multiprocessing flags
# must be set before anything imports vllm, so do it before the HookLLM import.
mp.set_start_method("spawn", force=True)
os.environ["VLLM_USE_V1"] = "1"  # opt in to the vLLM V1 engine
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
from vllm_hook_plugins import HookLLM
if __name__ == "__main__":
    cache_dir = "./cache/"
    model = "Qwen/Qwen2.5-3B-Instruct"

    # Hook config: capture hidden states at decoder layers 15, 20, and 30,
    # keeping only the last token's representation per request.
    model_config_json_path = Path(cache_dir) / "model_config.json"
    model_config_json = {
        "model_info": {"name": model},
        "hidden_states": {
            "layers": [15, 20, 30],
            "mode": "last_token",
        },
    }
    model_config_json_path.parent.mkdir(parents=True, exist_ok=True)
    model_config_json_path.write_text(json.dumps(model_config_json))
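
    # NOTE (assumption): HookLLM appears to wrap vLLM's LLM entrypoint.
    # worker_name / analyzer_name / config_file / enable_hook configure the
    # hook plugin; the remaining kwargs look like standard vLLM engine args
    # forwarded as-is (see the IBM/vLLM-Hook repository for the actual API).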
    llm = HookLLM(
        model=model,
        worker_name="probe_hidden_states",
        analyzer_name="hidden_states",
        config_file=str(model_config_json_path),
        download_dir=cache_dir,
        gpu_memory_utilization=0.7,
        max_model_len=20480,
        trust_remote_code=True,
        dtype=torch.float16,
        enable_prefix_caching=False,
        enable_hook=True,
        tensor_parallel_size=1,
    )
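
    # Usage pattern (as sketched in this script): generate() runs a batch while
    # the hook captures hidden states, then analyze() runs the "hidden_states"
    # analyzer over what was captured; {"reduce": "none"} presumably skips any
    # aggregation across tokens and requests.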
print ("====== SHORT BATCHES ======")
test_cases = [
lorem.words(5)
for _ in range(100)
]
print("\n".join(test_cases[:3]))
for _ in range(10):
result = llm.generate(test_cases, temperature=0.0, max_tokens=10)
stats = llm.analyze(analyzer_spec={"reduce": "none"})
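
    # Repeat with ~20x longer prompts (100 lorem words vs. 5) to exercise the
    # hook over much longer prefills.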
print ("====== LONG BATCHES ======")
test_cases = [
lorem.words(100)
for _ in range(100)
]
print("\n".join(test_cases[:3]))
for _ in range(10):
result = llm.generate(test_cases, temperature=0.0, max_tokens=10)
stats = llm.analyze(analyzer_spec={"reduce": "none"})