{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 18, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16666666666666666, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -0.3458607792854309, "logits/rejected": -0.4615038335323334, "logps/chosen": -171.37403869628906, "logps/rejected": -149.31100463867188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.3333333333333333, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -0.29447269439697266, "logits/rejected": -0.32842525839805603, "logps/chosen": -127.80609130859375, "logps/rejected": -132.02676391601562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -0.19436776638031006, "logits/rejected": -0.3515459895133972, "logps/chosen": -182.87603759765625, "logps/rejected": -161.9700469970703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3 }, { "epoch": 0.6666666666666666, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -0.5648705959320068, "logits/rejected": -0.49730727076530457, "logps/chosen": -143.06080627441406, "logps/rejected": -177.17178344726562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4 }, { "epoch": 0.8333333333333334, "grad_norm": 409.343017578125, "learning_rate": 1e-07, "logits/chosen": -0.287241667509079, "logits/rejected": -0.4986671507358551, "logps/chosen": -179.01197814941406, "logps/rejected": -129.13258361816406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5 }, { "epoch": 1.0, "grad_norm": 481.6926574707031, "learning_rate": 2e-07, "logits/chosen": -0.39032474160194397, "logits/rejected": -0.6146747469902039, "logps/chosen": -171.80819702148438, "logps/rejected": -150.6540985107422, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6 }, { "epoch": 1.1666666666666667, "grad_norm": 455.3298645019531, "learning_rate": 3e-07, "logits/chosen": -0.6076681613922119, "logits/rejected": -0.5999705195426941, "logps/chosen": -149.00877380371094, "logps/rejected": -155.99354553222656, "loss": 0.6708, "rewards/accuracies": 0.9375, "rewards/chosen": 0.042467109858989716, "rewards/margins": 0.05151805654168129, "rewards/rejected": -0.009050942026078701, "step": 7 }, { "epoch": 1.3333333333333333, "grad_norm": 196.02310180664062, "learning_rate": 4e-07, "logits/chosen": -0.4407089054584503, "logits/rejected": -0.3964877724647522, "logps/chosen": -130.4448699951172, "logps/rejected": -153.34071350097656, "loss": 0.5316, "rewards/accuracies": 0.875, "rewards/chosen": 0.2952026128768921, "rewards/margins": 0.30782628059387207, "rewards/rejected": -0.012623678892850876, "step": 8 }, { "epoch": 1.5, "grad_norm": 160.99330139160156, "learning_rate": 5e-07, "logits/chosen": -0.386033833026886, "logits/rejected": -0.5585586428642273, "logps/chosen": -202.30712890625, "logps/rejected": -163.55845642089844, "loss": 0.3526, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9829663038253784, "rewards/margins": 1.25348961353302, "rewards/rejected": -0.27052345871925354, "step": 9 }, { "epoch": 1.6666666666666665, "grad_norm": 180.8558807373047, "learning_rate": 4.6153846153846156e-07, "logits/chosen": -0.1295984983444214, "logits/rejected": -0.22666746377944946, "logps/chosen": -163.62046813964844, "logps/rejected": -173.85043334960938, "loss": 0.2326, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1647255420684814, "rewards/margins": 1.9450981616973877, "rewards/rejected": -0.7803725004196167, "step": 10 }, { "epoch": 1.8333333333333335, "grad_norm": 116.59342193603516, "learning_rate": 4.2307692307692304e-07, "logits/chosen": -0.07035604119300842, "logits/rejected": -0.20303675532341003, "logps/chosen": -144.70648193359375, "logps/rejected": -186.86166381835938, "loss": 0.1318, "rewards/accuracies": 0.96875, "rewards/chosen": 1.164882779121399, "rewards/margins": 4.173832416534424, "rewards/rejected": -3.0089497566223145, "step": 11 }, { "epoch": 2.0, "grad_norm": 49.43299865722656, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -0.1386706531047821, "logits/rejected": -0.14760421216487885, "logps/chosen": -176.37342834472656, "logps/rejected": -199.35305786132812, "loss": 0.1436, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6035472750663757, "rewards/margins": 4.147317409515381, "rewards/rejected": -3.5437700748443604, "step": 12 }, { "epoch": 2.1666666666666665, "grad_norm": 90.89972686767578, "learning_rate": 3.461538461538461e-07, "logits/chosen": 0.24338586628437042, "logits/rejected": 0.160726860165596, "logps/chosen": -170.33824157714844, "logps/rejected": -210.45208740234375, "loss": 0.0537, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9079462885856628, "rewards/margins": 7.290245056152344, "rewards/rejected": -6.382298946380615, "step": 13 }, { "epoch": 2.3333333333333335, "grad_norm": 143.731201171875, "learning_rate": 3.076923076923077e-07, "logits/chosen": 0.29322168231010437, "logits/rejected": 0.26528558135032654, "logps/chosen": -184.0413055419922, "logps/rejected": -253.95401000976562, "loss": 0.0489, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2606481611728668, "rewards/margins": 7.589580535888672, "rewards/rejected": -7.32893180847168, "step": 14 }, { "epoch": 2.5, "grad_norm": 69.44292449951172, "learning_rate": 2.692307692307692e-07, "logits/chosen": 0.2146293967962265, "logits/rejected": 0.3031017780303955, "logps/chosen": -199.20730590820312, "logps/rejected": -366.293212890625, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 0.819592297077179, "rewards/margins": 11.11633586883545, "rewards/rejected": -10.296743392944336, "step": 15 }, { "epoch": 2.6666666666666665, "grad_norm": 51.55795669555664, "learning_rate": 2.3076923076923078e-07, "logits/chosen": 0.3652637004852295, "logits/rejected": 0.19149763882160187, "logps/chosen": -174.45452880859375, "logps/rejected": -178.36778259277344, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": 0.8945200443267822, "rewards/margins": 7.584080696105957, "rewards/rejected": -6.689560890197754, "step": 16 }, { "epoch": 2.8333333333333335, "grad_norm": 15.279855728149414, "learning_rate": 1.9230769230769231e-07, "logits/chosen": 0.10915929824113846, "logits/rejected": 0.22957958281040192, "logps/chosen": -113.55706024169922, "logps/rejected": -238.6990509033203, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 1.0251195430755615, "rewards/margins": 9.426384925842285, "rewards/rejected": -8.401266098022461, "step": 17 }, { "epoch": 3.0, "grad_norm": 38.921119689941406, "learning_rate": 1.5384615384615385e-07, "logits/chosen": 0.2816791236400604, "logits/rejected": 0.16515518724918365, "logps/chosen": -206.7659912109375, "logps/rejected": -269.8388977050781, "loss": 0.0225, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8032644391059875, "rewards/margins": 9.53320598602295, "rewards/rejected": -8.729942321777344, "step": 18 } ], "logging_steps": 1.0, "max_steps": 18, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3634086543360.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }