Slime GLM-5.1-tiny
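# Tiny GLM-5 MoE debug shape: 256 routed experts with top-8 routing, one
# shared expert, and one dense layer followed by one MoE layer.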
MOE_ROUTED_EXPERTS=256
MOE_ACTIVE_ROUTED_EXPERTS=8
MOE_SHARED_EXPERTS=1
NHIDDEN=8
MOE_FFN_HIDDEN=32
MOE_SHARED_EXPERT_INTERMEDIATE_SIZE=$(($MOE_FFN_HIDDEN * $MOE_SHARED_EXPERTS))
FFN_HIDDEN=32
N_DENSE_LAYERS=1
N_MOE_LAYERS=1
NHEADS=8
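# Optional sanity check (not in the original gist): the router top-k must not
# exceed the number of routed experts, so fail fast before building the args.
if (( MOE_ACTIVE_ROUTED_EXPERTS > MOE_ROUTED_EXPERTS )); then
    echo "Error: moe-router-topk ($MOE_ACTIVE_ROUTED_EXPERTS) exceeds num-experts ($MOE_ROUTED_EXPERTS)" >&2
    exit 1
fi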
MODEL_ARGS=(
--spec "slime_plugins.models.glm5.glm5" "get_glm5_spec"
# quoted so bash does not glob-expand the bracket pattern
--moe-layer-freq "[0]*$N_DENSE_LAYERS+[1]*$N_MOE_LAYERS"
--num-experts $MOE_ROUTED_EXPERTS
--moe-shared-expert-intermediate-size $MOE_SHARED_EXPERT_INTERMEDIATE_SIZE
--moe-router-topk $MOE_ACTIVE_ROUTED_EXPERTS
--moe-grouped-gemm
--moe-permute-fusion
--moe-ffn-hidden-size $MOE_FFN_HIDDEN
--moe-router-score-function sigmoid
--moe-router-pre-softmax
--moe-router-enable-expert-bias
--moe-router-bias-update-rate 0
--moe-router-load-balancing-type seq_aux_loss
--moe-router-topk-scaling-factor 2.5
--moe-aux-loss-coeff 0
--moe-router-dtype fp32
--make-vocab-size-divisible-by 16
--num-layers $((N_DENSE_LAYERS + N_MOE_LAYERS))
--hidden-size $NHIDDEN
--ffn-hidden-size $FFN_HIDDEN
--num-attention-heads $NHEADS
--disable-bias-linear
--swiglu
--untie-embeddings-and-output-weights
--position-embedding-type rope
--no-position-embedding
--normalization RMSNorm
--qk-layernorm
--multi-latent-attention
--q-lora-rank 32
--kv-lora-rank 512
--qk-head-dim 192
--v-head-dim 256
--kv-channels 192
--qk-pos-emb-head-dim 64
--vocab-size 154880
--rotary-base 1000000
--enable-experimental
# slime-specific args
--allgather-cp
)
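# Usage sketch (hypothetical launcher; slime's real entry point also needs
# parallelism, data, and optimizer argument groups not shown in this gist):
#   python train.py "${MODEL_ARGS[@]}"
# To inspect the fully expanded flag list, one flag or value per line:
printf '%s\n' "${MODEL_ARGS[@]}"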