# Base model configuration
base_model: mistralai/Mistral-Small-24B-Base-2501
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
tokenizer_use_fast: true

# Device settings - simpler approach for multi-GPU
# Use balanced loading with 4-bit quantization
device_map: "balanced"

# Memory settings - optimized for dual A40 GPUs
load_in_4bit: true       # Essential for fitting 24B on A40s
load_in_8bit: false      # Don't use 8-bit alongside 4-bit
bf16: true               # Use bfloat16 for better numerical stability than fp16
low_cpu_mem_usage: true  # Reduces CPU memory usage during model loading

# Memory and speed optimizations
flash_attention: true         # Significant memory savings AND speed improvement
gradient_checkpointing: true

# Dataset configuration
datasets:
  - path: json
    data_files: ./chatlogs.jsonl
    type: completion

# Output tracking
dataset_prepared_path: last_run_prepared
output_dir: ./outputs/irc-mistral-24b-run1
val_set_size: 0.01

# Sequence and training settings
sequence_len: 4096          # Captures ~200 IRC messages
sample_packing: true        # Essential for efficient training with conversation data
pad_to_sequence_len: true   # Helps with stable memory usage
train_on_inputs: true       # Train on both inputs and outputs for conversation modeling
eval_sample_packing: false  # Not supported together with sample_packing

# LoRA configuration
adapter: lora
lora_r: 128        # Scaled up for the 24B model (from 64 for 7B)
lora_alpha: 256    # Scaled up for the 24B model (from 128 for 7B)
lora_dropout: 0.1  # Maintained from successful 7B runs
lora_target_modules:  # Target all key transformer components
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - gate_proj
  - down_proj
  - up_proj

# Training hyperparameters - optimized for speed while maintaining quality
micro_batch_size: 1
gradient_accumulation_steps: 16  # Maintain effective batch size
num_epochs: 2                    # Consistent with successful 7B runs
optimizer: adamw_torch           # Standard optimizer choice for LLMs
lr_scheduler: cosine             # Smooth learning rate decay
learning_rate: 0.00008           # Scaled down for 24B stability
weight_decay: 0.01               # Consistent with successful 7B runs
warmup_ratio: 0.05               # Add proper warmup for training stability

# Evaluation and checkpointing
evals_per_epoch: 3
include_tokens_per_second: true  # Track performance metrics

# Performance and quality monitoring
group_by_length: true          # Group similar sequence lengths for efficiency
shuffle_merged_datasets: true  # Ensure proper dataset shuffling

# Wandb integration
wandb_project: irc-llm-training
wandb_entity: davidar
wandb_name: irc-mistral-24b-run1
wandb_log_model: "false"
eval_table_size: 5  # Show 5 samples in the WandB UI for qualitative assessment

# Mistral model configuration
is_mistral_derived_model: true
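
# Usage notes (assumptions for illustration, not part of the tuned settings above):
# - The `completion` dataset type reads a plain-text field from each JSONL line;
#   by default this is a "text" key, e.g. {"text": "<alice> hello\n<bob> hi"}.
#   If chatlogs.jsonl uses a different key, add a `field:` entry under the dataset.
# - A typical launch, assuming axolotl is installed and this file is saved as
#   irc-mistral-24b.yml (filename is illustrative):
#     accelerate launch -m axolotl.cli.train irc-mistral-24b.yml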