# Base model configuration
base_model: mistralai/Mistral-Small-24B-Base-2501
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
tokenizer_use_fast: true

# Device settings - simpler approach for multi-GPU
# Use balanced loading with 4-bit quantization
device_map: "balanced"

# Memory settings - optimized for dual A40 GPUs
load_in_4bit: true       # Essential for fitting 24B on A40s
load_in_8bit: false      # Don't use 8-bit alongside 4-bit
bf16: true               # Use bfloat16 for better numerical stability than fp16
low_cpu_mem_usage: true  # Reduces CPU memory usage during model loading

# Memory and speed optimizations
flash_attention: true         # Significant memory savings AND speed improvement
gradient_checkpointing: true

# Dataset configuration
datasets:
  - path: json
    data_files: ./chatlogs.jsonl
    type: completion

# Output tracking
dataset_prepared_path: last_run_prepared
output_dir: ./outputs/irc-mistral-24b-run1
val_set_size: 0.01

# Sequence and training settings
sequence_len: 4096          # Captures ~200 IRC messages
sample_packing: true        # Essential for efficient training with conversation data
pad_to_sequence_len: true   # Helps with stable memory usage
train_on_inputs: true       # Train on both inputs and outputs for conversation modeling
eval_sample_packing: false  # Not supported together with sample_packing

# LoRA configuration
adapter: lora
lora_r: 128        # Scaled up for the 24B model (from 64 for 7B)
lora_alpha: 256    # Scaled up for the 24B model (from 128 for 7B)
lora_dropout: 0.1  # Maintained from successful 7B runs
lora_target_modules:  # Target all key transformer components
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - gate_proj
  - down_proj
  - up_proj

# Training hyperparameters - optimized for speed while maintaining quality
micro_batch_size: 1
gradient_accumulation_steps: 16  # Maintain effective batch size
num_epochs: 2                    # Consistent with successful 7B runs
optimizer: adamw_torch           # Standard optimizer choice for LLMs
lr_scheduler: cosine             # Smooth learning rate decay
learning_rate: 0.00008           # Scaled down for 24B stability
weight_decay: 0.01               # Consistent with successful 7B runs
warmup_ratio: 0.05               # Add proper warmup for training stability

# Evaluation and checkpointing
evals_per_epoch: 3
include_tokens_per_second: true  # Track performance metrics

# Performance and quality monitoring
group_by_length: true          # Group similar sequence lengths for efficiency
shuffle_merged_datasets: true  # Ensure proper dataset shuffling

# Wandb integration
wandb_project: irc-llm-training
wandb_entity: davidar
wandb_name: irc-mistral-24b-run1
wandb_log_model: "false"
eval_table_size: 5  # Show 5 samples in the WandB UI for qualitative assessment

# Mistral model configuration
is_mistral_derived_model: true
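
# Usage notes (assumptions for illustration, not part of the tuned settings above):
# - The `completion` dataset type reads a plain-text field from each JSONL line;
#   by default this is a "text" key, e.g. {"text": "<alice> hello\n<bob> hi"}.
#   If chatlogs.jsonl uses a different key, add a `field:` entry under the dataset.
# - A typical launch, assuming axolotl is installed and this file is saved as
#   irc-mistral-24b.yml (filename is illustrative):
#     accelerate launch -m axolotl.cli.train irc-mistral-24b.yml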