- [Apple to Apple][INC Reference] Gaudi perf issue
- [Expert Consultation][No reference] SageAttention accuracy issue
- https://arxiv.org/pdf/2505.11594 delta_s
- [Human Steer] task.md cross-project workspace:
  - ar - vllm - omni
  - ct -> llmc -> vllm: quant primitive -> quant model -> inference model
- setup driver (as root) -> user_install_cmd.sh https://github.com/yiliu30/torch-xpu-setup
```shell
model_path="/dataset/auto-round/qwen_moe/"
taskname=gsm8k
taskname=longbench_hotpotqa
timestamp=$(date +%Y%m%d_%H%M%S)
model_path="/storage/yiliu7/meta-llama/Llama-3.1-8B-Instruct"
output_log_file_name="${taskname}_${timestamp}"
MAX_MODEL_LEN=40960
max_length=${MAX_MODEL_LEN}
taskname=gsm8k
```
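The variables above compose a timestamped log-file name from the task name; a minimal sketch of that pattern (the task name is taken from the snippet, the echo is illustrative):

```shell
# Compose a timestamped log-file name: <taskname>_<YYYYmmdd_HHMMSS>.
taskname=gsm8k
timestamp=$(date +%Y%m%d_%H%M%S)
output_log_file_name="${taskname}_${timestamp}"
echo "${output_log_file_name}"
```

Reassigning `taskname` before `output_log_file_name` is computed (as the snippet does) means only the last assignment takes effect.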
```python
import os
from functools import wraps

# from vllm import envs
from loguru import logger


def with_thread_limits():
    """
    Decorator to temporarily set OMP_NUM_THREADS and PyTorch threads,
    and restore them after the function call.
    """
```
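The snippet above is cut off after the docstring. A self-contained sketch of how such a save/set/restore decorator can be completed is below; it handles only `OMP_NUM_THREADS` to stay dependency-free (the original presumably also calls PyTorch's `torch.get_num_threads` / `torch.set_num_threads`), and the `num_threads` parameter is an assumption:

```python
import os
from functools import wraps


def with_thread_limits(num_threads=1):
    """Temporarily set OMP_NUM_THREADS around the wrapped call, then restore it.

    num_threads is an illustrative parameter; the torch thread calls from the
    original snippet are omitted to keep this sketch stdlib-only.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            prev = os.environ.get("OMP_NUM_THREADS")  # remember old value
            os.environ["OMP_NUM_THREADS"] = str(num_threads)
            try:
                return func(*args, **kwargs)
            finally:
                # Restore (or unset) the variable even if func raises.
                if prev is None:
                    os.environ.pop("OMP_NUM_THREADS", None)
                else:
                    os.environ["OMP_NUM_THREADS"] = prev
        return wrapper
    return decorator


@with_thread_limits(num_threads=2)
def limited():
    return os.environ["OMP_NUM_THREADS"]
```

The `try`/`finally` is the important part: the previous thread setting is restored even when the wrapped function raises.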
```shell
## install uv on OS
curl -LsSf https://astral.sh/uv/install.sh | sh

## create new project
uv init myproj

## install packages
uv add django requests "pandas>=2.3"

## remove package
```
```diff
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                  router_weights,
                  permuted_weights=True,
                  activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if not enable_moe_chunk:
```
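The patch above gates a chunked execution path for the MoE op. The underlying idea, processing tokens in fixed-size chunks to bound peak activation memory instead of dispatching them to the experts all at once, can be sketched in plain Python (`chunk_apply`, `fn`, and `chunk_size` are illustrative names, not INC APIs):

```python
def chunk_apply(fn, tokens, chunk_size):
    """Apply fn to fixed-size slices of tokens and concatenate the results.

    Mirrors the memory-saving idea behind enable_moe_chunk: rather than
    running the expert computation over every token at once, process
    chunk_size tokens at a time so peak memory stays bounded.
    """
    out = []
    for start in range(0, len(tokens), chunk_size):
        out.extend(fn(tokens[start:start + chunk_size]))
    return out
```

The result is identical to applying `fn` to the whole batch whenever `fn` is elementwise over tokens, which is what makes the optimization safe to gate behind a flag.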
```shell
model_path=inc-res/quantized_model_ds_mxfp8/
# VLLM_ENABLE_AR_EXT=1 \
# VLLM_AR_MXFP4_MODULAR_MOE=1 \
# VLLM_ENABLE_AR_EXT=1 \
# VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 \
# VLLM_ENABLE_STATIC_MOE=0 \
# VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 \
# VLLM_USE_DEEP_GEMM=0 \
# VLLM_ENABLE_V1_MULTIPROCESSING=1 \
```
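Toggles like these are meant to be uncommented and prepended to the launch command. A `VAR=value` prefix exports the variable only for that one command, which is what makes per-run experiments cheap; a minimal demo of the mechanism (the flag name comes from the list above, the inner command is illustrative):

```shell
# An assignment prefix scopes the variable to the single command it precedes.
out=$(VLLM_USE_DEEP_GEMM=0 bash -c 'echo "${VLLM_USE_DEEP_GEMM}"')
echo "inside=${out}"
echo "after=${VLLM_USE_DEEP_GEMM:-unset}"
```

The variable is visible inside the child command but is not set in the calling shell afterwards.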
```shell
#!/bin/bash
#
# https://docs.docker.com/build/buildkit/
# https://github.com/docker/buildx/releases/
# https://github.com/docker/buildx
## docker builder prune --all
## docker buildx du --verbose
## For Ubuntu 24.04 try: sudo apt install docker-buildx
```
```diff
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index 69c03d8efb8..f3668018c43 100755
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -930,6 +930,22 @@ class PatchedVllmMixtureOfExpertsOp(PatchedModuleBase):
                  router_weights,
                  permuted_weights=True,
                  activation="silu"):
+        enable_moe_chunk = hasattr(self.orig_mod, "enable_moe_chunk") and self.orig_mod.enable_moe_chunk
+        if enable_moe_chunk:
```
```shell
#!/bin/bash
# Check if a model name is passed as an argument, otherwise use the default model path
if [ -z "$1" ]; then
    model_path="Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound"
else
    model_path="$1"
fi
tp_size=1
model_name=$(basename "${model_path}")
```
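The if/else default above can also be written in one line with Bash parameter expansion; a sketch using the same default value as the script:

```shell
# ${1:-default} substitutes the default when $1 is unset or empty,
# replacing the four-line if/else block.
model_path="${1:-Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound}"
model_name=$(basename "${model_path}")
echo "${model_name}"
```

`basename` strips any leading directory components, so passing a full path like `/storage/.../Llama-3.1-8B-Instruct` still yields a clean model name.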