{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9863013698630136, "eval_steps": 50.0, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009962640099626401, "grad_norm": 8.793932914733887, "learning_rate": 1e-05, "logits/chosen": -2.7779335975646973, "logits/rejected": -2.2469589710235596, "logps/chosen": -224.9627685546875, "logps/rejected": -241.19189453125, "loss": 0.6931473016738892, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.049813200498132, "grad_norm": 8.137606620788574, "learning_rate": 5e-05, "logits/chosen": -2.5251381397247314, "logits/rejected": -1.9867161512374878, "logps/chosen": -224.61813354492188, "logps/rejected": -258.90093994140625, "loss": 0.6376858949661255, "rewards/accuracies": 0.5, "rewards/chosen": 0.1259877234697342, "rewards/margins": 0.17248225212097168, "rewards/rejected": -0.046494536101818085, "step": 5 }, { "epoch": 0.099626400996264, "grad_norm": 1.5547863245010376, "learning_rate": 0.0001, "logits/chosen": -3.2977802753448486, "logits/rejected": -3.09322452545166, "logps/chosen": -192.3331756591797, "logps/rejected": -252.07101440429688, "loss": 0.2510965585708618, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.6293911933898926, "rewards/margins": 2.6744332313537598, "rewards/rejected": -0.045041993260383606, "step": 10 }, { "epoch": 0.149439601494396, "grad_norm": 2.169137716293335, "learning_rate": 9.98292246503335e-05, "logits/chosen": -4.032714366912842, "logits/rejected": -3.964298963546753, "logps/chosen": -212.45834350585938, "logps/rejected": -317.91064453125, "loss": 0.17307039499282836, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.576660394668579, "rewards/margins": 7.408028602600098, "rewards/rejected": -5.831367492675781, "step": 15 }, { "epoch": 0.199252801992528, "grad_norm": 8.901778221130371, "learning_rate": 9.931806517013612e-05, "logits/chosen": -3.712759017944336, "logits/rejected": -3.651376724243164, "logps/chosen": -212.9801025390625, "logps/rejected": -334.33697509765625, "loss": 0.17700881958007814, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6058607697486877, "rewards/margins": 8.40457534790039, "rewards/rejected": -9.010436058044434, "step": 20 }, { "epoch": 0.24906600249066002, "grad_norm": 0.7356922626495361, "learning_rate": 9.847001329696653e-05, "logits/chosen": -3.5607047080993652, "logits/rejected": -3.4833595752716064, "logps/chosen": -196.169677734375, "logps/rejected": -339.6310729980469, "loss": 0.0830357849597931, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.2101283073425293, "rewards/margins": 10.467849731445312, "rewards/rejected": -7.257721900939941, "step": 25 }, { "epoch": 0.298879202988792, "grad_norm": 0.57375568151474, "learning_rate": 9.729086208503174e-05, "logits/chosen": -3.4932701587677, "logits/rejected": -3.3746368885040283, "logps/chosen": -167.539306640625, "logps/rejected": -310.1274108886719, "loss": 0.0729694426059723, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 5.267261505126953, "rewards/margins": 10.698602676391602, "rewards/rejected": -5.431341171264648, "step": 30 }, { "epoch": 0.34869240348692404, "grad_norm": 2.0027079582214355, "learning_rate": 9.578866633275288e-05, "logits/chosen": -3.4459826946258545, "logits/rejected": -3.367497205734253, "logps/chosen": -196.55429077148438, "logps/rejected": -359.9834289550781, "loss": 0.07084929347038268, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.655911684036255, "rewards/margins": 12.158984184265137, "rewards/rejected": -9.503072738647461, "step": 35 }, { "epoch": 0.398505603985056, "grad_norm": 1.0954598188400269, "learning_rate": 9.397368756032445e-05, "logits/chosen": -3.567638874053955, "logits/rejected": -3.5081989765167236, "logps/chosen": -218.6673583984375, "logps/rejected": -372.1232604980469, "loss": 0.017929962277412413, "rewards/accuracies": 1.0, "rewards/chosen": 0.9934258460998535, "rewards/margins": 11.813644409179688, "rewards/rejected": -10.820219039916992, "step": 40 }, { "epoch": 0.44831880448318806, "grad_norm": 0.40222010016441345, "learning_rate": 9.185832391312644e-05, "logits/chosen": -3.7192935943603516, "logits/rejected": -3.6724600791931152, "logps/chosen": -215.64779663085938, "logps/rejected": -372.94232177734375, "loss": 0.02694471478462219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6867231130599976, "rewards/margins": 13.12153148651123, "rewards/rejected": -11.434808731079102, "step": 45 }, { "epoch": 0.49813200498132004, "grad_norm": 1.5209081172943115, "learning_rate": 8.945702546981969e-05, "logits/chosen": -3.8341193199157715, "logits/rejected": -3.8093173503875732, "logps/chosen": -214.366455078125, "logps/rejected": -387.33905029296875, "loss": 0.04207087457180023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5063580274581909, "rewards/margins": 14.116338729858398, "rewards/rejected": -13.609980583190918, "step": 50 }, { "epoch": 0.547945205479452, "grad_norm": 0.08626959472894669, "learning_rate": 8.678619553365659e-05, "logits/chosen": -3.7327990531921387, "logits/rejected": -3.682675838470459, "logps/chosen": -231.30410766601562, "logps/rejected": -409.13372802734375, "loss": 0.006610387563705444, "rewards/accuracies": 1.0, "rewards/chosen": -0.7696245908737183, "rewards/margins": 14.068384170532227, "rewards/rejected": -14.838006973266602, "step": 55 }, { "epoch": 0.597758405977584, "grad_norm": 0.436791330575943, "learning_rate": 8.386407858128706e-05, "logits/chosen": -3.347792387008667, "logits/rejected": -3.2968430519104004, "logps/chosen": -215.88504028320312, "logps/rejected": -402.4597473144531, "loss": 0.017044056951999665, "rewards/accuracies": 1.0, "rewards/chosen": -0.36254170536994934, "rewards/margins": 14.750783920288086, "rewards/rejected": -15.113327026367188, "step": 60 }, { "epoch": 0.6475716064757161, "grad_norm": 0.1343541443347931, "learning_rate": 8.07106356344834e-05, "logits/chosen": -3.1761727333068848, "logits/rejected": -3.100703716278076, "logps/chosen": -255.2406768798828, "logps/rejected": -462.63153076171875, "loss": 0.05564171075820923, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.898350954055786, "rewards/margins": 17.227405548095703, "rewards/rejected": -20.125757217407227, "step": 65 }, { "epoch": 0.6973848069738481, "grad_norm": 5.078628063201904, "learning_rate": 7.734740790612136e-05, "logits/chosen": -3.098306179046631, "logits/rejected": -3.000185489654541, "logps/chosen": -228.07308959960938, "logps/rejected": -464.58038330078125, "loss": 0.0879410982131958, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.1209559440612793, "rewards/margins": 18.309709548950195, "rewards/rejected": -20.4306640625, "step": 70 }, { "epoch": 0.7471980074719801, "grad_norm": 3.92928147315979, "learning_rate": 7.379736965185368e-05, "logits/chosen": -3.045614242553711, "logits/rejected": -3.0131278038024902, "logps/chosen": -285.69573974609375, "logps/rejected": -476.64642333984375, "loss": 0.015967796742916106, "rewards/accuracies": 1.0, "rewards/chosen": -5.4438796043396, "rewards/margins": 15.962544441223145, "rewards/rejected": -21.406421661376953, "step": 75 }, { "epoch": 0.797011207970112, "grad_norm": 1.83503258228302, "learning_rate": 7.008477123264848e-05, "logits/chosen": -3.0136358737945557, "logits/rejected": -2.960451602935791, "logps/chosen": -298.04095458984375, "logps/rejected": -485.3680725097656, "loss": 0.02975378930568695, "rewards/accuracies": 1.0, "rewards/chosen": -7.388276100158691, "rewards/margins": 15.169245719909668, "rewards/rejected": -22.55752182006836, "step": 80 }, { "epoch": 0.8468244084682441, "grad_norm": 15.052701950073242, "learning_rate": 6.623497346023418e-05, "logits/chosen": -3.0762691497802734, "logits/rejected": -3.040611743927002, "logps/chosen": -311.65093994140625, "logps/rejected": -470.753173828125, "loss": 0.1816992402076721, "rewards/accuracies": 0.9375, "rewards/chosen": -9.027438163757324, "rewards/margins": 13.331235885620117, "rewards/rejected": -22.35867691040039, "step": 85 }, { "epoch": 0.8966376089663761, "grad_norm": 0.4199621379375458, "learning_rate": 6.227427435703997e-05, "logits/chosen": -3.1899125576019287, "logits/rejected": -3.135105609893799, "logps/chosen": -248.90359497070312, "logps/rejected": -442.48529052734375, "loss": 0.05473928451538086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.370785713195801, "rewards/margins": 15.46155071258545, "rewards/rejected": -17.832334518432617, "step": 90 }, { "epoch": 0.9464508094645081, "grad_norm": 0.6793515086174011, "learning_rate": 5.8229729514036705e-05, "logits/chosen": -3.3520407676696777, "logits/rejected": -3.2851250171661377, "logps/chosen": -239.3043212890625, "logps/rejected": -392.2827453613281, "loss": 0.044746071100234985, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4673506021499634, "rewards/margins": 12.658899307250977, "rewards/rejected": -14.126248359680176, "step": 95 }, { "epoch": 0.9962640099626401, "grad_norm": 0.16750197112560272, "learning_rate": 5.4128967273616625e-05, "logits/chosen": -3.4172706604003906, "logits/rejected": -3.370835065841675, "logps/chosen": -244.941650390625, "logps/rejected": -414.86767578125, "loss": 0.02344870865345001, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4666581153869629, "rewards/margins": 14.117535591125488, "rewards/rejected": -14.584193229675293, "step": 100 }, { "epoch": 1.0398505603985055, "grad_norm": 0.3164665400981903, "learning_rate": 5e-05, "logits/chosen": -3.4470374584198, "logits/rejected": -3.4258203506469727, "logps/chosen": -236.54843139648438, "logps/rejected": -401.71307373046875, "loss": 0.03152187168598175, "rewards/accuracies": 0.9857142567634583, "rewards/chosen": -1.0938712358474731, "rewards/margins": 13.463799476623535, "rewards/rejected": -14.557671546936035, "step": 105 }, { "epoch": 1.0896637608966375, "grad_norm": 0.09464748948812485, "learning_rate": 4.5871032726383386e-05, "logits/chosen": -3.564382553100586, "logits/rejected": -3.4938838481903076, "logps/chosen": -250.39602661132812, "logps/rejected": -458.6958923339844, "loss": 0.02826254963874817, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.154679775238037, "rewards/margins": 16.77099609375, "rewards/rejected": -18.925676345825195, "step": 110 }, { "epoch": 1.1394769613947697, "grad_norm": 0.0911044105887413, "learning_rate": 4.17702704859633e-05, "logits/chosen": -3.531079053878784, "logits/rejected": -3.478334903717041, "logps/chosen": -228.14724731445312, "logps/rejected": -384.11590576171875, "loss": 0.024063107371330262, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1524519920349121, "rewards/margins": 12.754313468933105, "rewards/rejected": -12.906765937805176, "step": 115 }, { "epoch": 1.1892901618929017, "grad_norm": 0.08798651397228241, "learning_rate": 3.772572564296005e-05, "logits/chosen": -3.5636494159698486, "logits/rejected": -3.5160324573516846, "logps/chosen": -223.90261840820312, "logps/rejected": -409.7283630371094, "loss": 0.004514996334910393, "rewards/accuracies": 1.0, "rewards/chosen": -1.6805578470230103, "rewards/margins": 14.167802810668945, "rewards/rejected": -15.848361015319824, "step": 120 }, { "epoch": 1.2391033623910337, "grad_norm": 0.8251333236694336, "learning_rate": 3.3765026539765834e-05, "logits/chosen": -3.4935951232910156, "logits/rejected": -3.443601608276367, "logps/chosen": -256.41790771484375, "logps/rejected": -442.462158203125, "loss": 0.05417940616607666, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.516829013824463, "rewards/margins": 15.455021858215332, "rewards/rejected": -17.971851348876953, "step": 125 }, { "epoch": 1.2889165628891657, "grad_norm": 0.012223007157444954, "learning_rate": 2.991522876735154e-05, "logits/chosen": -3.4438552856445312, "logits/rejected": -3.417296886444092, "logps/chosen": -236.6774444580078, "logps/rejected": -401.44287109375, "loss": 0.043891748785972594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2101138830184937, "rewards/margins": 14.020986557006836, "rewards/rejected": -15.231101989746094, "step": 130 }, { "epoch": 1.3387297633872977, "grad_norm": 0.02159416303038597, "learning_rate": 2.6202630348146324e-05, "logits/chosen": -3.4722747802734375, "logits/rejected": -3.4099507331848145, "logps/chosen": -238.39004516601562, "logps/rejected": -418.53826904296875, "loss": 0.008346886187791825, "rewards/accuracies": 1.0, "rewards/chosen": -1.311889886856079, "rewards/margins": 15.225858688354492, "rewards/rejected": -16.537750244140625, "step": 135 }, { "epoch": 1.3885429638854296, "grad_norm": 1.4822603464126587, "learning_rate": 2.2652592093878666e-05, "logits/chosen": -3.3921477794647217, "logits/rejected": -3.339038133621216, "logps/chosen": -236.64584350585938, "logps/rejected": -429.93536376953125, "loss": 0.0617010772228241, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4343018531799316, "rewards/margins": 15.99664306640625, "rewards/rejected": -17.430944442749023, "step": 140 }, { "epoch": 1.4383561643835616, "grad_norm": 0.0011595916002988815, "learning_rate": 1.928936436551661e-05, "logits/chosen": -3.385234832763672, "logits/rejected": -3.2963836193084717, "logps/chosen": -237.45166015625, "logps/rejected": -461.8202209472656, "loss": 0.004302332177758217, "rewards/accuracies": 1.0, "rewards/chosen": -1.762447714805603, "rewards/margins": 18.12508773803711, "rewards/rejected": -19.887537002563477, "step": 145 }, { "epoch": 1.4881693648816936, "grad_norm": 0.038154516369104385, "learning_rate": 1.6135921418712956e-05, "logits/chosen": -3.431607723236084, "logits/rejected": -3.365023374557495, "logps/chosen": -243.6700897216797, "logps/rejected": -441.6103515625, "loss": 0.006278228759765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.4231467247009277, "rewards/margins": 16.2125244140625, "rewards/rejected": -18.635671615600586, "step": 150 }, { "epoch": 1.5379825653798256, "grad_norm": 0.0559125691652298, "learning_rate": 1.3213804466343421e-05, "logits/chosen": -3.4251346588134766, "logits/rejected": -3.371931791305542, "logps/chosen": -244.56710815429688, "logps/rejected": -438.7330017089844, "loss": 0.0028001811355352403, "rewards/accuracies": 1.0, "rewards/chosen": -2.70890736579895, "rewards/margins": 15.939045906066895, "rewards/rejected": -18.647953033447266, "step": 155 }, { "epoch": 1.5877957658779578, "grad_norm": 0.35731741786003113, "learning_rate": 1.0542974530180327e-05, "logits/chosen": -3.3813865184783936, "logits/rejected": -3.315295696258545, "logps/chosen": -251.46725463867188, "logps/rejected": -439.66827392578125, "loss": 0.011943523585796357, "rewards/accuracies": 1.0, "rewards/chosen": -1.925512671470642, "rewards/margins": 15.476956367492676, "rewards/rejected": -17.402469635009766, "step": 160 }, { "epoch": 1.6376089663760895, "grad_norm": 2.2385966777801514, "learning_rate": 8.141676086873572e-06, "logits/chosen": -3.311697483062744, "logits/rejected": -3.256464719772339, "logps/chosen": -249.8314666748047, "logps/rejected": -445.3863220214844, "loss": 0.03819578289985657, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6976162195205688, "rewards/margins": 16.54266357421875, "rewards/rejected": -18.24028205871582, "step": 165 }, { "epoch": 1.6874221668742218, "grad_norm": 0.05370226129889488, "learning_rate": 6.026312439675552e-06, "logits/chosen": -3.3330516815185547, "logits/rejected": -3.2565295696258545, "logps/chosen": -239.17922973632812, "logps/rejected": -452.05328369140625, "loss": 0.00578184649348259, "rewards/accuracies": 1.0, "rewards/chosen": -2.259666919708252, "rewards/margins": 17.493772506713867, "rewards/rejected": -19.75343894958496, "step": 170 }, { "epoch": 1.7372353673723535, "grad_norm": 0.08332812786102295, "learning_rate": 4.2113336672471245e-06, "logits/chosen": -3.325460910797119, "logits/rejected": -3.2441654205322266, "logps/chosen": -248.5566864013672, "logps/rejected": -467.3125915527344, "loss": 0.004923052340745926, "rewards/accuracies": 1.0, "rewards/chosen": -2.043891429901123, "rewards/margins": 17.867572784423828, "rewards/rejected": -19.91146469116211, "step": 175 }, { "epoch": 1.7870485678704857, "grad_norm": 0.04057246446609497, "learning_rate": 2.7091379149682685e-06, "logits/chosen": -3.3428382873535156, "logits/rejected": -3.286740779876709, "logps/chosen": -241.89547729492188, "logps/rejected": -422.464599609375, "loss": 0.0025127800181508064, "rewards/accuracies": 1.0, "rewards/chosen": -1.7920238971710205, "rewards/margins": 15.13818359375, "rewards/rejected": -16.930208206176758, "step": 180 }, { "epoch": 1.8368617683686177, "grad_norm": 0.09395255893468857, "learning_rate": 1.5299867030334814e-06, "logits/chosen": -3.3060898780822754, "logits/rejected": -3.256986618041992, "logps/chosen": -225.3962860107422, "logps/rejected": -433.19036865234375, "loss": 0.019294042885303498, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8883010745048523, "rewards/margins": 16.79229736328125, "rewards/rejected": -17.68059730529785, "step": 185 }, { "epoch": 1.8866749688667497, "grad_norm": 0.19430460035800934, "learning_rate": 6.819348298638839e-07, "logits/chosen": -3.289118528366089, "logits/rejected": -3.237431049346924, "logps/chosen": -221.5372772216797, "logps/rejected": -431.51580810546875, "loss": 0.04272543787956238, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9622983932495117, "rewards/margins": 17.078460693359375, "rewards/rejected": -18.040760040283203, "step": 190 }, { "epoch": 1.9364881693648817, "grad_norm": 0.02927769534289837, "learning_rate": 1.7077534966650766e-07, "logits/chosen": -3.3257076740264893, "logits/rejected": -3.2454612255096436, "logps/chosen": -259.30596923828125, "logps/rejected": -466.82440185546875, "loss": 0.005100560188293457, "rewards/accuracies": 1.0, "rewards/chosen": -3.5521626472473145, "rewards/margins": 17.442373275756836, "rewards/rejected": -20.994539260864258, "step": 195 }, { "epoch": 1.9863013698630136, "grad_norm": 0.252530574798584, "learning_rate": 0.0, "logits/chosen": -3.3383991718292236, "logits/rejected": -3.2897567749023438, "logps/chosen": -252.06442260742188, "logps/rejected": -447.2425231933594, "loss": 0.002530140429735184, "rewards/accuracies": 1.0, "rewards/chosen": -2.556124210357666, "rewards/margins": 16.36166000366211, "rewards/rejected": -18.91778564453125, "step": 200 } ], "logging_steps": 5, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5688915966588273e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }