export CUDA_HOME=/usr/local/cuda
gpus=${gpus:-0,1,2,3,4,5,6,7}
dataset=${dataset:-sft_train_data}
output_dir=${output_dir:-experiments/$(date +"%Y%m%d_%H%M%S")}
port=$(shuf -i 10000-20000 -n 1)

deepspeed --include localhost:${gpus} --master_port=$port grpo_demo.py \
    --deepspeed "ds_zero2.json" \
    --model_name_or_path "/path/to/Qwen2.5-1.5B-Instruct/" \
    --output_dir outputs/Qwen2.5-1.5B-GRPO-gsm8k \
    --run_name Qwen2.5-1.5B-GRPO-gsm8k \
    --learning_rate 1e-5 \
    --adam_beta1 0.9 \
    --adam_beta2 0.99 \
    --weight_decay 0.1 \
    --warmup_ratio 0.1 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --bf16 True \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 2 \
    --num_generations 16 \
    --max_prompt_length 512 \
    --max_completion_length 768 \
    --num_train_epochs 5 \
    --save_steps 100 \
    --max_grad_norm 0.1 \
    --report_to tensorboard \
    --log_on_each_node False
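
For reference, below is a minimal sketch of what `grpo_demo.py` could look like, assuming it is built on TRL's `GRPOTrainer`: the launcher flags above map onto `GRPOConfig` (training hyperparameters) and `ModelConfig` (`--model_name_or_path`) fields parsed by `HfArgumentParser`. The dataset id `openai/gsm8k` and the reward function are illustrative placeholders, not the actual contents of the script.

```python
# Hypothetical skeleton of grpo_demo.py, assuming a TRL GRPOTrainer setup.
# The dataset id and the reward function are placeholders for illustration only.
from datasets import load_dataset
from transformers import HfArgumentParser
from trl import GRPOConfig, GRPOTrainer, ModelConfig


def length_penalty_reward(completions, **kwargs):
    """Placeholder reward: prefers shorter completions. A real GSM8K setup
    would instead score answer correctness and output formatting."""
    return [-float(len(c)) for c in completions]


def main():
    # The launcher flags split across these two dataclasses:
    # --model_name_or_path goes to ModelConfig, everything else to GRPOConfig.
    parser = HfArgumentParser((ModelConfig, GRPOConfig))
    model_args, training_args = parser.parse_args_into_dataclasses()

    # GRPOTrainer expects a "prompt" column; GSM8K ships "question"/"answer".
    dataset = load_dataset("openai/gsm8k", "main", split="train")
    dataset = dataset.map(lambda ex: {"prompt": ex["question"]})

    trainer = GRPOTrainer(
        model=model_args.model_name_or_path,
        reward_funcs=length_penalty_reward,
        args=training_args,
        train_dataset=dataset,
    )
    trainer.train()


if __name__ == "__main__":
    main()
```

Note that with 8 GPUs, `per_device_train_batch_size 1`, and `gradient_accumulation_steps 2`, the effective batch of 16 completions per optimizer step is divisible by `num_generations 16`, which GRPO requires so that each prompt's generations stay grouped together.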