Created
November 13, 2025 09:00
-
-
Save yiliu30/317186949eac45058cde459fbc980ce4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First model path candidate.
# NOTE: model_path is assigned again further down in this file, so this value
# is overridden before the active benchmark command runs.
model_path="inc-res/quantized_model_ds_mxfp8/"

# Previously used benchmark command (kept for reference, disabled):
# VLLM_ENABLE_AR_EXT=1 \
# VLLM_AR_MXFP4_MODULAR_MOE=1 \
# VLLM_ENABLE_AR_EXT=1 \
# VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 \
# VLLM_ENABLE_STATIC_MOE=0 \
# VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 \
# VLLM_USE_DEEP_GEMM=0 \
# VLLM_ENABLE_V1_MULTIPROCESSING=1 \
# vllm bench throughput \
# --model $model_path \
# --dataset-name sonnet \
# --dataset-path ../../vllm/benchmarks/sonnet.txt \
# --num-prompts 10

# Recorded results (pasted console output, kept verbatim):
# - BS 10
# VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 \
# VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 \
# Throughput: 0.40 requests/s, 276.36 total tokens/s, 60.52 output tokens/s
# Total num prompt tokens: 5350
# Total num output tokens: 1500
# - BS 512
# Throughput: 12.86 requests/s, 8879.82 total tokens/s, 1929.68 output tokens/s
# Total num prompt tokens: 276610
# Total num output tokens: 76800
# run_bench.sh: line 62: syntax error [message truncated in the original notes]

# VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 \
# VLLM_MXFP4_PRE_UNPACK_WEIGHTS=1 \
# Throughput: 3.39 requests/s, 2324.83 total tokens/s, 509.09 output tokens/s
# Total num prompt tokens: 5350
# Total num output tokens: 1500
# - BS 512
# Throughput: 32.56 requests/s, 22471.77 total tokens/s, 4883.37 output tokens/s
# Total num prompt tokens: 276610
# Total num output tokens: 76800
# Active run: `vllm bench throughput` with --profile against the checkpoint
# at $model_path, using 512 prompts from the sonnet dataset.
#
# The VLLM_* assignments are prefixes of the single `vllm` command below, so
# the shell places them only in that command's environment; they do not stay
# set in this script afterwards. The profiler output directory is chosen via
# VLLM_TORCH_PROFILER_DIR.
#
# To get verbose logs, add `VLLM_LOGGING_LEVEL=DEBUG \` to the prefix list.
# (A comment line cannot be placed inside the continuation chain itself.)
model_path="quantized_model_qwen_mxfp4"

VLLM_TORCH_PROFILER_RECORD_SHAPES=1 \
VLLM_TORCH_PROFILER_WITH_FLOPS=1 \
VLLM_TORCH_PROFILER_DIR=./vllm_profiler/qwen_mxfp4-online-dequant-debug-bs512 \
VLLM_ENABLE_AR_EXT=1 \
VLLM_AR_MXFP4_MODULAR_MOE=1 \
VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 \
VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 \
VLLM_ENABLE_STATIC_MOE=0 \
VLLM_USE_DEEP_GEMM=0 \
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
vllm bench throughput \
  --model "${model_path}" \
  --dataset-name sonnet \
  --dataset-path ../../vllm/benchmarks/sonnet.txt \
  --num-prompts 512 \
  --profile \
  --input-len 32 \
  --output-len 4
#!/bin/bash
# NOTE(review): this shebang is not on the first line of the file, so it is
# treated as an ordinary comment here. Run the script explicitly with bash
# (the array below is a bash feature) or move the shebang to line 1.
#
# Model Benchmarking Script
# Usage: ./run_bench.sh

# Define the model path
# MODEL_PATH="/path/to/quantized_model"

# Define the dataset
DATASET_NAME="sonnet"
DATASET_PATH="../../vllm/benchmarks/sonnet.txt"

# Define the list of num-prompts to benchmark.
# These settings are only consumed by the (currently commented-out) sweep
# loop that follows.
# NUM_PROMPTS_LIST=(32 64 128 256 512)
NUM_PROMPTS_LIST=(4)
# Loop through each num-prompts value and run the benchmark
# for NUM_PROMPTS in "${NUM_PROMPTS_LIST[@]}"; do
#   echo "Benchmarking with num-prompts: $NUM_PROMPTS"
#   VLLM_ENABLE_AR_EXT=1 \
#   VLLM_AR_MXFP4_MODULAR_MOE=1 \
#   VLLM_ENABLE_AR_EXT=1 \
#   VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 \
#   VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 \
#   VLLM_ENABLE_STATIC_MOE=0 \
#   VLLM_USE_DEEP_GEMM=0 \
#   VLLM_ENABLE_V1_MULTIPROCESSING=1 \
#   vllm bench throughput \
#     --model $model_path \
#     --dataset-name $DATASET_NAME \
#     --dataset-path $DATASET_PATH \
#     --num-prompts $NUM_PROMPTS
#   echo "Completed benchmarking with num-prompts: $NUM_PROMPTS"
#   echo "---------------------------------------------"
# done
# echo "All benchmarks completed."
# # --profile
# lm_eval --model vllm \
#   --model_args "pretrained=${model_path},tensor_parallel_size=${tp_size},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,enable_expert_parallel=True" \
#   --tasks $task_name \
#   --batch_size 16 \
#   --limit 256 \
#   --log_samples \
#   --seed 42 \
#   --profile \
#   --output_path ${output_dir} \
#   --show_config 2>&1 | tee ${output_dir}/log.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment