{
  "learning_rate": 2e-05,
  "batch_size": 8,
  "gradient_accumulation_steps": 6,
  "num_epochs": 3,
  "max_length": 2048,
  "warmup_ratio": 0.03,
  "weight_decay": 0.01,
  "max_grad_norm": 1.0,
  "seed": 42,
  "eval_frequency": 5,
  "logging_steps": 10,
  "dataloader_num_workers": 8,
  "pin_memory": true,
  "dataloader_persistent_workers": true,
  "prefetch_factor": 4,
  "early_stopping_patience": 3,
  "early_stopping_min_delta": 0.001,
  "checkpoint_dir": "/root/llama3.2-3b-training",
  "save_best_model": true,
  "save_last_checkpoint": true,
  "save_every_n_epochs": 1,
  "save_every_n_steps": 5000,
  "keep_last_n_checkpoints": 2
}