Skip to content

Instantly share code, notes, and snippets.

@justinchuby
Created March 9, 2026 18:07
Show Gist options
  • Select an option

  • Save justinchuby/2ec93c1c46405ed405a70c1357c1b180 to your computer and use it in GitHub Desktop.

Select an option

Save justinchuby/2ec93c1c46405ed405a70c1357c1b180 to your computer and use it in GitHub Desktop.
Model Builder flags
# Mask-specific variables.
# NOTE(review): the empty-string entries look like placeholders that are filled in
# with tensor/node names during graph construction — confirm against the builder code.
# TODO: Reconcile differences between `seqlens_k` and `key_total_seq_lens` in the GroupQueryAttention and SparseAttention implementations. Ideally the same subgraph can be shared for both.
self.mask_attrs = {
# "mask_name": "", # Name of node that outputs 4D causal attention mask (used as add_qk in MultiHeadAttention)
"seqlens_k": "", # Sum of each row in attention mask - 1 (used as input to GroupQueryAttention)
"total_seq_len": "", # Size of total sequence length in attention mask (used as input to GroupQueryAttention and SparseAttention)
# "block_row_indices": "", # Row indices of CSR format of block mask (used as input to SparseAttention)
# "block_col_indices": "", # Col indices of CSR format of block mask (used as input to SparseAttention)
# "key_total_seq_lens": "", # Sum of each row in attention mask (used as input to SparseAttention)
}
# Embedding-specific variables
self.embed_attrs = {
"scale": 1, # Scale value to multiply output of Embedding layer by (overridden from model config when applicable)
}
# LayerNorm-specific variables.
# Fall back to 1e-06 when the config does not define `rms_norm_eps`.
epsilon = config.rms_norm_eps if hasattr(config, "rms_norm_eps") else 1e-06
self.layernorm_attrs = {
"simple": True, # Use SimplifiedLayerNorm/SkipSimplifiedLayerNorm vs. LayerNorm/SkipLayerNorm
# The two flags below can occur in combination:
"first_layernorm": True, # 1st LayerNorm = LayerNorm, then SkipLayerNorm for all subsequent LayerNorms
"last_layernorm": False, # Last LayerNorm = SkipLayerNorm with only output 0 (no output 3)
# Graph-construction state (set while building, hence commented out here):
# "root_input": "", # Root input from parent node for LayerNorm and SkipLayerNorm
# "skip_input": "", # Skip input from parent node for SkipLayerNorm
# "output_0": "", # Output 0 for LayerNorm and SkipLayerNorm
# "output_3": "", # Output 3 for SkipLayerNorm
"add_offset": 0, # Offset value for LayerNorm weight
# From model config:
"epsilon": epsilon, # Epsilon value to avoid `sqrt(0)` in LayerNorm
# Casting controls:
"cast": { # Casting LayerNorm-specific variables
"use_fp32": False, # Use float32 precision to compute LayerNorm
"root_input": False, # Cast root_input
"skip_input": False, # Cast skip_input
"output_0": False, # Cast output_0
"output_3": False, # Cast output_3
},
}
# MatMul-specific variables.
# LoRA adapters are detected via the PEFT config attached to the model config.
is_lora = hasattr(config, "peft_type") and config.peft_type == "LORA"
self.matmul_attrs = {
"use_lora": is_lora, # Use LoRA/QLoRA format
}
# RotaryEmbedding-specific variables
position_scale = config.rope_position_scale if hasattr(config, "rope_position_scale") else 1
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
# A rotary dim of 0 lets the RotaryEmbedding kernel use its default (full head size);
# only a partial factor != 1.0 produces an explicit reduced dimension.
rotemb_dim = int(self.head_size * partial_rotary_factor) if partial_rotary_factor != 1.0 else 0
# Base frequency fallback chain: `rope_theta`, then `rope_embedding_base`, then 10000.
rope_theta = (
config.rope_theta
if hasattr(config, "rope_theta")
else config.rope_embedding_base
if hasattr(config, "rope_embedding_base")
else 10000
)
self.rope_attrs = {
# Caches are shared if they are the same
"create_caches": True, # Create cos/sin caches for rotary embeddings
"save_caches": True, # Auto-save cos/sin caches for rotary embeddings after creation
# From model config:
"cache_length": self.context_length, # Cache length to use when creating cos/sin caches for rotary embeddings
"theta": rope_theta, # Base value if calculating cos/sin caches from scratch
"partial_rotary_factor": partial_rotary_factor, # Factor for partial rotary embeddings
"interleaved": 0, # Interleave the rotary embeddings (e.g. [0, 0, 0, 1, 1, 1] to [0, 1, 0, 1, 0, 1], RotaryEmbedding kernel expects a default value of 0)
"rotary_embedding_dim": rotemb_dim, # For partial rotary embeddings (RotaryEmbedding kernel expects a default value of 0)
"rescale_factors": 1, # Rescale factors when calculating `inv_freq` in rotary embeddings
"t_dtype": torch.int64, # Torch dtype when calculating `t` in rotary embeddings
"position_scale": position_scale, # Scale value when calculating `t` in rotary embeddings
"mscale": 1, # Magnitude scaling factor when scaling `emb.cos()/emb.sin()` in rotary embeddings
# Policies can differ per model:
"mscale_policy": "", # Magnitude scaling policy when scaling `emb.cos()/emb.sin()` in rotary embeddings
}
# Models that define `rope_scaling` (e.g. long-context variants) need extra RoPE setup.
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
self.make_rope_init(config)
# Attention-specific variables (MHA, GQA, GQA + Rot.Emb., etc.)
attn_softcap = (
config.attn_logit_softcapping
if hasattr(config, "attn_logit_softcapping") and config.attn_logit_softcapping is not None
else 0.0
) # default is 0.0 in GroupQueryAttention kernel
# Block-sparse attention-specific variables (each falls back to an "off" value
# when the corresponding attribute is absent from the model config).
sparse_block_size = config.blocksparse_block_size if hasattr(config, "blocksparse_block_size") else 0
kernel_block_size = (
config.blocksparse_triton_kernel_block_size
if hasattr(config, "blocksparse_triton_kernel_block_size")
else 0
)
local_blocks = config.blocksparse_num_local_blocks if hasattr(config, "blocksparse_num_local_blocks") else 0
vert_block_stride = config.blocksparse_vert_stride if hasattr(config, "blocksparse_vert_stride") else 0
homo_head = config.blocksparse_homo_head_pattern if hasattr(config, "blocksparse_homo_head_pattern") else False
self.attention_attrs = {
# Graph-construction state (set while building, hence commented out here):
# "q_path": "", # Q path to attention
# "k_path": "", # K path to attention
# "v_path": "", # V path to attention
# Attention op selection (set later; see make_attention_init below):
# "op_type": "MultiHeadAttention", # Attention op to use
# From model config:
"scale": 1 / np.sqrt(self.head_size), # Scale value after calculating Q x K' in attention
"softcap": attn_softcap, # Softcap value to prevent values from exploding in attention
# RoPE fusion options:
"use_rope_in_attn": False, # Use rotary embeddings within attention (instead of a separate RotaryEmbedding op)
"use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V)
# Phi model family:
"block_sparse": { # Block-sparse attention-specific variables
"sparse_block_size": sparse_block_size, # Sparse block size for SparseAttention op
"kernel_block_size": kernel_block_size, # Kernel block size for sparse attention
"local_blocks": local_blocks, # Number of local blocks for sparse attention
"vert_stride": vert_block_stride, # Vertical stride to use for sparse attention
"homo_head": homo_head, # Use homo head pattern for sparse attention
},
# Optional per-path normalization and attention sinks:
"q_norm": False, # LayerNorm after MatMul in Q path
"k_norm": False, # LayerNorm after MatMul in K path
"sinks": False, # Sink values for softmax in attention
}
self.make_attention_init()
# MLP-specific variables
self.mlp_attrs = {
# The two style flags are mutually exclusive:
"use_proj": True, # Use projection style for MLP (GateProj/UpProj/DownProj)
"use_fc": False, # Use fully-connected style for MLP (FC1/FC2)
# "output_0": "", # Output 0 for MLP layer
}
# MoE-specific variables.
# Op type is determined by precision: INT4 models use the quantized QMoE op.
moe_op_type = "QMoE" if self.onnx_dtype == ir.DataType.INT4 else "MoE"
# Values read from model config / extra_options (0 or None when absent):
num_experts = config.num_local_experts if hasattr(config, "num_local_experts") else 0
top_k_experts = config.num_experts_per_tok if hasattr(config, "num_experts_per_tok") else 0
expert_weight_bits = 8 if extra_options.get("use_8bits_moe", False) else 4
swiglu_limit = config.swiglu_limit if hasattr(config, "swiglu_limit") else None
self.moe_attrs = {
"op_type": moe_op_type, # MoE op to use
"num_experts": num_experts, # Number of experts in MoE layer
"top_k": top_k_experts, # Number of experts to select in MoE layer
"activation_alpha": 1.0, # Alpha parameter used in activation function
"activation_beta": 0.0, # Beta parameter used in activation function
"activation_type": self.activation, # Activation function for MoE layer
"expert_weight_bits": expert_weight_bits, # Number of bits used in quantized MoE weights (only INT4 or INT8 are supported).
# Model-dependent settings (derived from the modeling logic):
"normalize_routing_weights": False, # Normalize routing weights in MoE layer
"swiglu_fusion": 0, # Fusion level for SwiGLU activation function (depends on the model)
"swiglu_limit": swiglu_limit, # Value used to clamp results into a certain range in SwiGLU activation function
# Phi-3.5 specific:
"use_sparse_mixer": False, # Use SparseMixer in MoE layer (used in Phi-3.5 MoE)
}
# LM head-specific variables
lm_head_softcap = (
config.final_logit_softcapping
if hasattr(config, "final_logit_softcapping") and config.final_logit_softcapping is not None
else 0.0
) # 0.0 default mirrors the attention softcap convention above (this one guards final logits, not the GQA kernel)
self.lm_head_attrs = {
"scale": 1, # Scale value to multiply output of LM head by
# Gemma specific:
"mask": None, # LM head mask for tokens in the vocabulary
"softcap": lm_head_softcap, # Softcap value to prevent values from exploding in LM head
}
if hasattr(config, "dummy_token_indices"):
# Create LM head mask for tokens in the vocabulary:
# True at the positions listed in config.dummy_token_indices, False elsewhere.
dummy_tokens_mask = torch.zeros(self.vocab_size).bool()
dummy_tokens_mask[config.dummy_token_indices] = True
self.lm_head_attrs["mask"] = dummy_tokens_mask
# Quantization-specific variables (INT4, INT8, etc.)
int4_algo_config = self.make_int4_algo_config(extra_options.get("int4_algo_config", "default"))
self.int4_block_size = extra_options.get("int4_block_size", 32)
# CPU, WebGPU, and TRT-RTX support block-wise quantization for QMoE.
# TRT-RTX defaults to 128; others default to 32 for consistency with MatMulNBits.
supported_blockwise_eps = ["cpu", "webgpu", "trt-rtx"]
default_qmoe_block_size = 128 if self.ep == "trt-rtx" else 32
self.qmoe_block_size = int(extra_options.get("qmoe_block_size", default_qmoe_block_size))
# Validate that unsupported EPs don't explicitly request block-wise quantization
# (an explicit `qmoe_block_size` is an error; the default above is merely ignored).
if self.ep not in supported_blockwise_eps and "qmoe_block_size" in extra_options and moe_op_type == "QMoE":
raise ValueError(
f"The 'qmoe_block_size' option is not supported for {self.ep} execution provider with QMoE. "
f"Block-wise quantization is only supported for: {', '.join(supported_blockwise_eps)}."
)
self.quant_attrs = {
"int4": {
"accuracy_level": int(
extra_options.get("int4_accuracy_level", 4 if self.ep in ["cpu", "webgpu"] else 0)
),
"qmoe_block_size": int(self.qmoe_block_size),
"qdq_block_size": int(self.int4_block_size),
"is_symmetric": extra_options.get("int4_is_symmetric", True),
"op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul",)),
"nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []),
"algo_config": int4_algo_config,
},
"use_qdq": extra_options.get("use_qdq", False),
}
# Propagate block_size to MoE/QMoE op when supported.
# QMoE on supported EPs uses block-wise quantization via the 'block_size' attribute.
# Ensure the attribute is set on the MoE op so runtime kernels can honor it.
# TODO: clean up — this back-patches self.moe_attrs after self.quant_attrs is built.
if self.moe_attrs.get("op_type") == "QMoE" and self.ep in supported_blockwise_eps:
self.moe_attrs["block_size"] = int(self.qmoe_block_size)
# When the input model is already quantized, carry its quantization config forward.
if self.quant_type is not None:
# Create quantized attributes from quantization config
self.quant_attrs["config"] = config.quantization_config
self.quant_attrs["use_g_idx"] = (
config.quantization_config["desc_act"] if "desc_act" in config.quantization_config else False
)
# `shared_embeddings` controls embedding and lm_head quantization since they often share
# weights and should be quantized together for consistency. If the lm_head is unquantized,
# then we should not quantize the embeddings even if the quantization config says to,
# since that would lead to a large accuracy drop.
# Determine if lm_head is unquantized. int4/8 can have options to int4_nodes_to_exclude. FP models are always unquantized.
self.unquantized_lm_head = "/lm_head/MatMul" in self.quant_attrs["int4"][
"nodes_to_exclude"
] or self.onnx_dtype in {ir.DataType.FLOAT, ir.DataType.FLOAT16, ir.DataType.BFLOAT16}
self.shared_embeddings = extra_options.get(
"shared_embeddings",
config.tie_word_embeddings
if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings is not None
else False,
)
# Algo configs that treat the LM head specially (per the flag name, an INT8 LM head —
# confirm against the quantizer implementation).
self.int8_lm_head = extra_options.get("int4_algo_config", "default") in {
"k_quant_mixed",
"k_quant_last",
"rtn_last",
}
# shared_embeddings conflicts with exclude_embeds and exclude_lm_head
if self.shared_embeddings and (self.exclude_embeds or self.exclude_lm_head):
self.shared_embeddings = False
elif self.shared_embeddings and not self.unquantized_lm_head:
# matmul_nbits_quantizer.py has a different naming for default quantization, so lm_head.MatMul.weight_Q{}G{} does not match.
self.shared_embeddings = self.int8_lm_head or extra_options.get("int4_algo_config", "default") in {
"rtn",
"k_quant",
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment