@KCCat
Created May 6, 2026 15:41
# Quantize output tensor and token embedding to q8_0; quantize expert layers to q8_0
llama-quantize --output-tensor-type q8_0 --token-embedding-type q8_0 --tensor-type "(ssm_.+|attn_.+|shexp)\.weight=bf16" --tensor-type "exps\.weight=q8_0" {{BF16.gguf}} {{q8.gguf}} Q8_0
# Quantize output tensor and token embedding to q8_0; quantize expert layers to q6_k
llama-quantize --output-tensor-type q8_0 --token-embedding-type q8_0 --tensor-type "(ssm_.+|attn_.+|shexp)\.weight=bf16" --tensor-type "exps\.weight=q6_k" {{BF16.gguf}} {{q6k.gguf}} Q8_0
# Leave output tensor and token embedding unquantized (bf16); quantize expert layers to q6_k
llama-quantize --output-tensor-type bf16 --token-embedding-type bf16 --tensor-type "(ssm_.+|attn_.+|shexp)\.weight=bf16" --tensor-type "exps\.weight=q6_k" {{BF16.gguf}} {{q6k.gguf}} Q8_0
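Before quantizing, it can help to check which tensor names each `--tensor-type` regex would actually match. The sketch below uses `grep -E` on a few hypothetical MoE tensor names (illustrative only; dump your own model's tensor list to confirm the names):

```shell
# Hypothetical tensor names (check your model's actual tensor list).
names='blk.0.attn_q.weight
blk.0.ssm_in.weight
blk.0.ffn_down_shexp.weight
blk.0.ffn_down_exps.weight'

# Names kept at bf16 by the first --tensor-type rule:
echo "$names" | grep -E '(ssm_.+|attn_.+|shexp)\.weight'
# Names quantized by the exps rule:
echo "$names" | grep -E 'exps\.weight'
```

Here the attention, SSM, and shared-expert (`shexp`) weights fall under the bf16 rule, while the routed-expert (`exps`) weights fall under the quantized rule, so the two patterns partition the four names cleanly.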
# Save the KLD baseline (bf16 logits)
llama-perplexity -m {{BF16.gguf}} --no-mmap -ub 2048 -c 2048 -ctk bf16 -ctv bf16 -fa off -f {{newstest2019.jp_zh.txt}} --save-all-logits {{./bf16-kld-base}}
# Measure KLD of a quantized model against the baseline
llama-perplexity -m {{q.gguf}} --no-mmap -ub 2048 -c 2048 -ctk bf16 -ctv bf16 -fa off -f {{newstest2019.jp_zh.txt}} --kl-divergence --kl-divergence-base {{./bf16-kld-base}}
# --no-mmap : load the whole model into VRAM/RAM instead of memory-mapping it
# -ctk bf16 -ctv bf16 -fa off : when recording the baseline, keep flash attention off and the KV cache in bf16; adjust these as needed for the measurement runs.
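For reference, the metric reported by `--kl-divergence` is the Kullback-Leibler divergence KL(P‖Q) = Σ p·ln(p/q) between the baseline and quantized token distributions, averaged over positions. A toy awk sketch of the formula on a made-up two-token distribution:

```shell
# Toy example: KL(P||Q) for P=(0.9, 0.1) vs Q=(0.8, 0.2).
# KL = 0.9*ln(0.9/0.8) + 0.1*ln(0.1/0.2)
awk 'BEGIN {
  split("0.9 0.1", p); split("0.8 0.2", q)
  kl = 0
  for (i = 1; i <= 2; i++) kl += p[i] * log(p[i] / q[i])
  printf "%.4f\n", kl   # prints 0.0367
}'
```

A smaller mean KLD means the quantized model's output distribution stays closer to the bf16 baseline on the test text.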