{ "best_global_step": 210, "best_metric": 0.09402994, "best_model_checkpoint": "/fsx-neo/dedicated-fsx-data-repo-neo-us-east-1/kayleexl/tree_reasoning/logical-reasoning/ms-swift/output_dpo/v7-20260217-182416/checkpoint-210", "epoch": 1.985781990521327, "eval_steps": 50, "global_step": 210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009478672985781991, "grad_norm": 19.55022430419922, "learning_rate": 9.090909090909091e-06, "logits/chosen": -0.8645371198654175, "logits/rejected": -0.8560649752616882, "logps/chosen": -212.84078979492188, "logps/rejected": -181.89553833007812, "loss": 1.9682148694992065, "memory(GiB)": 153.35, "nll_loss": 1.275067687034607, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "train_speed(iter/s)": 0.001829 }, { "epoch": 0.04739336492890995, "grad_norm": 10.874584197998047, "learning_rate": 4.545454545454546e-05, "logits/chosen": -0.8818354606628418, "logits/rejected": -0.8391438722610474, "logps/chosen": -206.83299255371094, "logps/rejected": -210.26193237304688, "loss": 2.225569725036621, "memory(GiB)": 171.5, "nll_loss": 1.5596290826797485, "rewards/accuracies": 0.390625, "rewards/chosen": 0.42497575283050537, "rewards/margins": 0.14753574132919312, "rewards/rejected": 0.27743998169898987, "step": 5, "train_speed(iter/s)": 0.001962 }, { "epoch": 0.0947867298578199, "grad_norm": 3.7358806133270264, "learning_rate": 9.090909090909092e-05, "logits/chosen": -0.42944854497909546, "logits/rejected": -0.4029228687286377, "logps/chosen": -153.91217041015625, "logps/rejected": -171.6427764892578, "loss": 1.2244630813598634, "memory(GiB)": 178.87, "nll_loss": 0.9248598217964172, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 8.403377532958984, "rewards/margins": 3.649811267852783, "rewards/rejected": 4.753565788269043, "step": 10, "train_speed(iter/s)": 0.002002 }, { "epoch": 0.14218009478672985, "grad_norm": 2.5456342697143555, "learning_rate": 9.990034266657467e-05, "logits/chosen": 0.14313745498657227, "logits/rejected": 0.15165017545223236, "logps/chosen": -64.2056655883789, "logps/rejected": -108.23348236083984, "loss": 0.9479263305664063, "memory(GiB)": 178.87, "nll_loss": 0.5506663918495178, "rewards/accuracies": 0.8125, "rewards/chosen": 13.307535171508789, "rewards/margins": 4.5142388343811035, "rewards/rejected": 8.793294906616211, "step": 15, "train_speed(iter/s)": 0.001977 }, { "epoch": 0.1895734597156398, "grad_norm": 9.239604949951172, "learning_rate": 9.949616551002787e-05, "logits/chosen": -0.7042765617370605, "logits/rejected": -0.6620756983757019, "logps/chosen": -57.892356872558594, "logps/rejected": -133.2845916748047, "loss": 0.6575197696685791, "memory(GiB)": 178.87, "nll_loss": 0.44857126474380493, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 15.717184066772461, "rewards/margins": 7.5960211753845215, "rewards/rejected": 8.121164321899414, "step": 20, "train_speed(iter/s)": 0.001969 }, { "epoch": 0.23696682464454977, "grad_norm": 3.75114369392395, "learning_rate": 9.87837549867887e-05, "logits/chosen": -0.5987659096717834, "logits/rejected": -0.5469285249710083, "logps/chosen": -39.35096740722656, "logps/rejected": -128.31814575195312, "loss": 0.4350598335266113, "memory(GiB)": 178.87, "nll_loss": 0.30207258462905884, "rewards/accuracies": 0.9375, "rewards/chosen": 19.038375854492188, "rewards/margins": 10.453929901123047, "rewards/rejected": 8.584444999694824, "step": 25, "train_speed(iter/s)": 0.001984 }, { "epoch": 0.2843601895734597, "grad_norm": 7.996090412139893, "learning_rate": 9.776754757575975e-05, "logits/chosen": -0.7030301094055176, "logits/rejected": -0.6584943532943726, "logps/chosen": -42.931304931640625, "logps/rejected": -132.83807373046875, "loss": 0.625270938873291, "memory(GiB)": 178.87, "nll_loss": 0.3963403105735779, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 16.53431510925293, "rewards/margins": 9.225370407104492, "rewards/rejected": 7.308945655822754, "step": 30, "train_speed(iter/s)": 0.001981 }, { "epoch": 0.33175355450236965, "grad_norm": 0.928902804851532, "learning_rate": 9.645387162638652e-05, "logits/chosen": -0.7767706513404846, "logits/rejected": -0.7252510190010071, "logps/chosen": -30.822132110595703, "logps/rejected": -122.3298110961914, "loss": 0.47859888076782225, "memory(GiB)": 178.87, "nll_loss": 0.25827503204345703, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 18.511274337768555, "rewards/margins": 9.673491477966309, "rewards/rejected": 8.837784767150879, "step": 35, "train_speed(iter/s)": 0.001982 }, { "epoch": 0.3791469194312796, "grad_norm": 4.358126163482666, "learning_rate": 9.485090794937319e-05, "logits/chosen": -0.8528544306755066, "logits/rejected": -0.8117485046386719, "logps/chosen": -28.26708984375, "logps/rejected": -153.4051055908203, "loss": 0.38582923412323, "memory(GiB)": 178.87, "nll_loss": 0.20131754875183105, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 19.518356323242188, "rewards/margins": 13.451945304870605, "rewards/rejected": 6.066409587860107, "step": 40, "train_speed(iter/s)": 0.001983 }, { "epoch": 0.4265402843601896, "grad_norm": 9.361577987670898, "learning_rate": 9.29686388713456e-05, "logits/chosen": -1.4655885696411133, "logits/rejected": -1.4526116847991943, "logps/chosen": -41.395565032958984, "logps/rejected": -169.83302307128906, "loss": 0.4372711658477783, "memory(GiB)": 178.87, "nll_loss": 0.3007845878601074, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 16.87301254272461, "rewards/margins": 13.00146198272705, "rewards/rejected": 3.871551513671875, "step": 45, "train_speed(iter/s)": 0.00198 }, { "epoch": 0.47393364928909953, "grad_norm": 1.3884881734848022, "learning_rate": 9.081878607071996e-05, "logits/chosen": -1.4089267253875732, "logits/rejected": -1.3798249959945679, "logps/chosen": -29.07427978515625, "logps/rejected": -174.85606384277344, "loss": 0.28701162338256836, "memory(GiB)": 178.87, "nll_loss": 0.17918026447296143, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 20.193706512451172, "rewards/margins": 15.493553161621094, "rewards/rejected": 4.700153350830078, "step": 50, "train_speed(iter/s)": 0.001981 }, { "epoch": 0.47393364928909953, "eval_logits/chosen": -1.2822269201278687, "eval_logits/rejected": -1.255699634552002, "eval_logps/chosen": -34.7473258972168, "eval_logps/rejected": -143.84861755371094, "eval_loss": 0.4424428939819336, "eval_nll_loss": 0.2968122363090515, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 19.00676155090332, "eval_rewards/margins": 12.395697593688965, "eval_rewards/rejected": 6.611064910888672, "eval_runtime": 230.2546, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.074, "step": 50 }, { "epoch": 0.5213270142180095, "grad_norm": 1.1772722005844116, "learning_rate": 8.841473758189854e-05, "logits/chosen": -1.1759651899337769, "logits/rejected": -1.1313683986663818, "logps/chosen": -26.088220596313477, "logps/rejected": -150.84393310546875, "loss": 0.22870185375213622, "memory(GiB)": 178.87, "nll_loss": 0.16189467906951904, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 19.440641403198242, "rewards/margins": 13.270869255065918, "rewards/rejected": 6.169772148132324, "step": 55, "train_speed(iter/s)": 0.001958 }, { "epoch": 0.5687203791469194, "grad_norm": 2.0721075534820557, "learning_rate": 8.577146442236857e-05, "logits/chosen": -1.0945132970809937, "logits/rejected": -1.060734748840332, "logps/chosen": -22.90542984008789, "logps/rejected": -128.8797607421875, "loss": 0.24675798416137695, "memory(GiB)": 178.87, "nll_loss": 0.17432959377765656, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 17.570837020874023, "rewards/margins": 11.27192497253418, "rewards/rejected": 6.298913955688477, "step": 60, "train_speed(iter/s)": 0.00195 }, { "epoch": 0.6161137440758294, "grad_norm": 2.0549778938293457, "learning_rate": 8.290542736190188e-05, "logits/chosen": -1.080885648727417, "logits/rejected": -1.057293176651001, "logps/chosen": -17.660358428955078, "logps/rejected": -124.87294006347656, "loss": 0.36183264255523684, "memory(GiB)": 178.87, "nll_loss": 0.1854233592748642, "rewards/accuracies": 0.9375, "rewards/chosen": 18.208276748657227, "rewards/margins": 11.001307487487793, "rewards/rejected": 7.206968784332275, "step": 65, "train_speed(iter/s)": 0.001945 }, { "epoch": 0.6635071090047393, "grad_norm": 1.9049893617630005, "learning_rate": 7.983447441444281e-05, "logits/chosen": -1.4264296293258667, "logits/rejected": -1.4030673503875732, "logps/chosen": -22.567996978759766, "logps/rejected": -163.80955505371094, "loss": 0.2892845392227173, "memory(GiB)": 178.87, "nll_loss": 0.15383335947990417, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 18.27324676513672, "rewards/margins": 15.088933944702148, "rewards/rejected": 3.1843135356903076, "step": 70, "train_speed(iter/s)": 0.001941 }, { "epoch": 0.7109004739336493, "grad_norm": 1.0275962352752686, "learning_rate": 7.657772969104508e-05, "logits/chosen": -1.3237228393554688, "logits/rejected": -1.3014802932739258, "logps/chosen": -27.62123680114746, "logps/rejected": -179.39981079101562, "loss": 0.22349367141723633, "memory(GiB)": 178.87, "nll_loss": 0.16326689720153809, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 19.444076538085938, "rewards/margins": 16.357398986816406, "rewards/rejected": 3.0866756439208984, "step": 75, "train_speed(iter/s)": 0.001941 }, { "epoch": 0.7582938388625592, "grad_norm": 2.1720211505889893, "learning_rate": 7.31554743060174e-05, "logits/chosen": -0.9713658094406128, "logits/rejected": -0.9438395500183105, "logps/chosen": -21.5306396484375, "logps/rejected": -158.5912628173828, "loss": 0.2255859136581421, "memory(GiB)": 178.87, "nll_loss": 0.15225784480571747, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 20.218997955322266, "rewards/margins": 14.551397323608398, "rewards/rejected": 5.6676025390625, "step": 80, "train_speed(iter/s)": 0.001941 }, { "epoch": 0.8056872037914692, "grad_norm": 2.247673988342285, "learning_rate": 6.958902007792466e-05, "logits/chosen": -0.7944511771202087, "logits/rejected": -0.7699103355407715, "logps/chosen": -11.273658752441406, "logps/rejected": -142.4789581298828, "loss": 0.1783364772796631, "memory(GiB)": 178.87, "nll_loss": 0.10094492137432098, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 19.9717960357666, "rewards/margins": 13.428759574890137, "rewards/rejected": 6.543033599853516, "step": 85, "train_speed(iter/s)": 0.001943 }, { "epoch": 0.8530805687203792, "grad_norm": 5.502572059631348, "learning_rate": 6.590057681196191e-05, "logits/chosen": -0.7691094875335693, "logits/rejected": -0.7428280711174011, "logps/chosen": -16.701950073242188, "logps/rejected": -178.28001403808594, "loss": 0.21286754608154296, "memory(GiB)": 178.87, "nll_loss": 0.1458219736814499, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 18.113544464111328, "rewards/margins": 14.766156196594238, "rewards/rejected": 3.3473877906799316, "step": 90, "train_speed(iter/s)": 0.001941 }, { "epoch": 0.9004739336492891, "grad_norm": 6.771712779998779, "learning_rate": 6.211311399018916e-05, "logits/chosen": -1.2176296710968018, "logits/rejected": -1.2004112005233765, "logps/chosen": -10.625171661376953, "logps/rejected": -197.5986785888672, "loss": 0.14389824867248535, "memory(GiB)": 178.87, "nll_loss": 0.08421098440885544, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 21.306598663330078, "rewards/margins": 20.043148040771484, "rewards/rejected": 1.2634522914886475, "step": 95, "train_speed(iter/s)": 0.001943 }, { "epoch": 0.9478672985781991, "grad_norm": 0.7936939001083374, "learning_rate": 5.8250217730939973e-05, "logits/chosen": -1.2689450979232788, "logits/rejected": -1.2652291059494019, "logps/chosen": -24.885387420654297, "logps/rejected": -166.68470764160156, "loss": 0.22324090003967284, "memory(GiB)": 178.87, "nll_loss": 0.16993048787117004, "rewards/accuracies": 1.0, "rewards/chosen": 18.267993927001953, "rewards/margins": 15.335573196411133, "rewards/rejected": 2.932422637939453, "step": 100, "train_speed(iter/s)": 0.001939 }, { "epoch": 0.9478672985781991, "eval_logits/chosen": -1.276165246963501, "eval_logits/rejected": -1.2667920589447021, "eval_logps/chosen": -17.997478485107422, "eval_logps/rejected": -172.5413818359375, "eval_loss": 0.18138757348060608, "eval_nll_loss": 0.15586450695991516, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 20.681747436523438, "eval_rewards/margins": 16.939956665039062, "eval_rewards/rejected": 3.741788625717163, "eval_runtime": 230.8449, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.074, "step": 100 }, { "epoch": 0.995260663507109, "grad_norm": 0.7884749174118042, "learning_rate": 5.433594390817756e-05, "logits/chosen": -1.217622995376587, "logits/rejected": -1.2057679891586304, "logps/chosen": -13.138340950012207, "logps/rejected": -187.07447814941406, "loss": 0.12456157207489013, "memory(GiB)": 178.87, "nll_loss": 0.09173186123371124, "rewards/accuracies": 1.0, "rewards/chosen": 19.977113723754883, "rewards/margins": 18.163719177246094, "rewards/rejected": 1.8133970499038696, "step": 105, "train_speed(iter/s)": 0.001931 }, { "epoch": 1.037914691943128, "grad_norm": 0.41725555062294006, "learning_rate": 5.039466834548568e-05, "logits/chosen": -1.0610564947128296, "logits/rejected": -1.0492563247680664, "logps/chosen": -17.22906494140625, "logps/rejected": -178.35047912597656, "loss": 0.10526471138000489, "memory(GiB)": 178.87, "nll_loss": 0.0985727533698082, "rewards/accuracies": 1.0, "rewards/chosen": 19.968387603759766, "rewards/margins": 18.300251007080078, "rewards/rejected": 1.6681346893310547, "step": 110, "train_speed(iter/s)": 0.001939 }, { "epoch": 1.085308056872038, "grad_norm": 0.5403364300727844, "learning_rate": 4.64509350175992e-05, "logits/chosen": -0.9350749254226685, "logits/rejected": -0.9276103973388672, "logps/chosen": -15.192606925964355, "logps/rejected": -196.2861785888672, "loss": 0.10088248252868652, "memory(GiB)": 178.87, "nll_loss": 0.09568502753973007, "rewards/accuracies": 1.0, "rewards/chosen": 20.20822525024414, "rewards/margins": 17.932273864746094, "rewards/rejected": 2.2759501934051514, "step": 115, "train_speed(iter/s)": 0.001939 }, { "epoch": 1.132701421800948, "grad_norm": 0.33638796210289, "learning_rate": 4.2529303204786953e-05, "logits/chosen": -0.8360708355903625, "logits/rejected": -0.8255692720413208, "logps/chosen": -12.213701248168945, "logps/rejected": -178.4746856689453, "loss": 0.08850648403167724, "memory(GiB)": 178.87, "nll_loss": 0.07898052781820297, "rewards/accuracies": 1.0, "rewards/chosen": 20.77065086364746, "rewards/margins": 17.60364532470703, "rewards/rejected": 3.167004346847534, "step": 120, "train_speed(iter/s)": 0.00194 }, { "epoch": 1.180094786729858, "grad_norm": 0.42006343603134155, "learning_rate": 3.8654194551920485e-05, "logits/chosen": -0.8648909330368042, "logits/rejected": -0.8533682823181152, "logps/chosen": -13.303857803344727, "logps/rejected": -196.53375244140625, "loss": 0.11154735088348389, "memory(GiB)": 178.87, "nll_loss": 0.09461511671543121, "rewards/accuracies": 1.0, "rewards/chosen": 19.808521270751953, "rewards/margins": 18.673742294311523, "rewards/rejected": 1.1347795724868774, "step": 125, "train_speed(iter/s)": 0.00194 }, { "epoch": 1.2274881516587677, "grad_norm": 0.4501703679561615, "learning_rate": 3.484974098465636e-05, "logits/chosen": -1.0564872026443481, "logits/rejected": -1.0499274730682373, "logps/chosen": -9.447141647338867, "logps/rejected": -210.6065216064453, "loss": 0.07890591621398926, "memory(GiB)": 178.87, "nll_loss": 0.0762052983045578, "rewards/accuracies": 1.0, "rewards/chosen": 20.350528717041016, "rewards/margins": 20.427692413330078, "rewards/rejected": -0.07716653496026993, "step": 130, "train_speed(iter/s)": 0.00194 }, { "epoch": 1.2748815165876777, "grad_norm": 0.48629000782966614, "learning_rate": 3.11396344298212e-05, "logits/chosen": -1.122717022895813, "logits/rejected": -1.1127209663391113, "logps/chosen": -8.969633102416992, "logps/rejected": -178.16427612304688, "loss": 0.07699697613716125, "memory(GiB)": 178.87, "nll_loss": 0.07132184505462646, "rewards/accuracies": 1.0, "rewards/chosen": 20.212081909179688, "rewards/margins": 18.33711814880371, "rewards/rejected": 1.8749620914459229, "step": 135, "train_speed(iter/s)": 0.001941 }, { "epoch": 1.3222748815165877, "grad_norm": 0.4852510094642639, "learning_rate": 2.754697927585399e-05, "logits/chosen": -1.0894103050231934, "logits/rejected": -1.0850476026535034, "logps/chosen": -13.88347053527832, "logps/rejected": -190.9267120361328, "loss": 0.11755204200744629, "memory(GiB)": 178.87, "nll_loss": 0.09814213216304779, "rewards/accuracies": 1.0, "rewards/chosen": 20.125789642333984, "rewards/margins": 17.44692611694336, "rewards/rejected": 2.678863048553467, "step": 140, "train_speed(iter/s)": 0.001938 }, { "epoch": 1.3696682464454977, "grad_norm": 1.0523611307144165, "learning_rate": 2.4094148492096125e-05, "logits/chosen": -1.18220055103302, "logits/rejected": -1.183691382408142, "logps/chosen": -11.095239639282227, "logps/rejected": -180.09048461914062, "loss": 0.07882866859436036, "memory(GiB)": 178.87, "nll_loss": 0.06894151866436005, "rewards/accuracies": 1.0, "rewards/chosen": 19.55763816833496, "rewards/margins": 17.858949661254883, "rewards/rejected": 1.6986896991729736, "step": 145, "train_speed(iter/s)": 0.001938 }, { "epoch": 1.4170616113744074, "grad_norm": 0.9743487238883972, "learning_rate": 2.0802644302934683e-05, "logits/chosen": -1.2402594089508057, "logits/rejected": -1.229536771774292, "logps/chosen": -10.528487205505371, "logps/rejected": -173.75906372070312, "loss": 0.05581583380699158, "memory(GiB)": 178.87, "nll_loss": 0.04851926118135452, "rewards/accuracies": 1.0, "rewards/chosen": 21.25326156616211, "rewards/margins": 17.409027099609375, "rewards/rejected": 3.84423565864563, "step": 150, "train_speed(iter/s)": 0.001939 }, { "epoch": 1.4170616113744074, "eval_logits/chosen": -1.2280174493789673, "eval_logits/rejected": -1.2243937253952026, "eval_logps/chosen": -11.513480186462402, "eval_logps/rejected": -170.34632873535156, "eval_loss": 0.11434541642665863, "eval_nll_loss": 0.10074843466281891, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 21.33014678955078, "eval_rewards/margins": 17.368852615356445, "eval_rewards/rejected": 3.9612925052642822, "eval_runtime": 228.4343, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.074, "step": 150 }, { "epoch": 1.4644549763033177, "grad_norm": 1.9418814182281494, "learning_rate": 1.7692964284439505e-05, "logits/chosen": -1.2662538290023804, "logits/rejected": -1.2612764835357666, "logps/chosen": -8.698439598083496, "logps/rejected": -195.09063720703125, "loss": 0.07216010689735412, "memory(GiB)": 178.87, "nll_loss": 0.05235465615987778, "rewards/accuracies": 1.0, "rewards/chosen": 20.530902862548828, "rewards/margins": 19.212444305419922, "rewards/rejected": 1.3184587955474854, "step": 155, "train_speed(iter/s)": 0.001934 }, { "epoch": 1.5118483412322274, "grad_norm": 2.3436055183410645, "learning_rate": 1.4784473717366387e-05, "logits/chosen": -1.2591499090194702, "logits/rejected": -1.2514972686767578, "logps/chosen": -9.602866172790527, "logps/rejected": -204.9786376953125, "loss": 0.06997905969619751, "memory(GiB)": 178.87, "nll_loss": 0.0632362961769104, "rewards/accuracies": 1.0, "rewards/chosen": 20.80245590209961, "rewards/margins": 19.767711639404297, "rewards/rejected": 1.0347453355789185, "step": 160, "train_speed(iter/s)": 0.001935 }, { "epoch": 1.5592417061611374, "grad_norm": 0.4779791235923767, "learning_rate": 1.2095284991437733e-05, "logits/chosen": -1.2291038036346436, "logits/rejected": -1.2240577936172485, "logps/chosen": -12.217391014099121, "logps/rejected": -200.1668243408203, "loss": 0.0741503119468689, "memory(GiB)": 178.87, "nll_loss": 0.0690564215183258, "rewards/accuracies": 1.0, "rewards/chosen": 19.801494598388672, "rewards/margins": 18.78908920288086, "rewards/rejected": 1.0124043226242065, "step": 165, "train_speed(iter/s)": 0.001934 }, { "epoch": 1.6066350710900474, "grad_norm": 0.6247928142547607, "learning_rate": 9.642144811900739e-06, "logits/chosen": -1.225555181503296, "logits/rejected": -1.2201206684112549, "logps/chosen": -9.304153442382812, "logps/rejected": -219.2526092529297, "loss": 0.056455212831497195, "memory(GiB)": 178.87, "nll_loss": 0.04982581362128258, "rewards/accuracies": 1.0, "rewards/chosen": 21.17973518371582, "rewards/margins": 21.15430450439453, "rewards/rejected": 0.025428902357816696, "step": 170, "train_speed(iter/s)": 0.001935 }, { "epoch": 1.6540284360189572, "grad_norm": 0.7543458342552185, "learning_rate": 7.440329910775273e-06, "logits/chosen": -1.2147386074066162, "logits/rejected": -1.2046931982040405, "logps/chosen": -11.757909774780273, "logps/rejected": -177.62388610839844, "loss": 0.11545271873474121, "memory(GiB)": 178.87, "nll_loss": 0.0637175664305687, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 19.637483596801758, "rewards/margins": 17.161773681640625, "rewards/rejected": 2.4757096767425537, "step": 175, "train_speed(iter/s)": 0.001935 }, { "epoch": 1.7014218009478674, "grad_norm": 0.9932089447975159, "learning_rate": 5.5035519122409895e-06, "logits/chosen": -1.3096096515655518, "logits/rejected": -1.3018414974212646, "logps/chosen": -10.068361282348633, "logps/rejected": -190.3312225341797, "loss": 0.07646466493606567, "memory(GiB)": 178.87, "nll_loss": 0.07355433702468872, "rewards/accuracies": 1.0, "rewards/chosen": 19.979000091552734, "rewards/margins": 18.556865692138672, "rewards/rejected": 1.4221333265304565, "step": 180, "train_speed(iter/s)": 0.001934 }, { "epoch": 1.7488151658767772, "grad_norm": 1.022765040397644, "learning_rate": 3.843871944606969e-06, "logits/chosen": -1.3755590915679932, "logits/rejected": -1.3671270608901978, "logps/chosen": -8.571699142456055, "logps/rejected": -195.33099365234375, "loss": 0.06720139980316162, "memory(GiB)": 178.87, "nll_loss": 0.06385985761880875, "rewards/accuracies": 1.0, "rewards/chosen": 20.94257164001465, "rewards/margins": 19.358797073364258, "rewards/rejected": 1.5837746858596802, "step": 185, "train_speed(iter/s)": 0.001935 }, { "epoch": 1.7962085308056872, "grad_norm": 0.2912954092025757, "learning_rate": 2.4716255306108605e-06, "logits/chosen": -1.4053622484207153, "logits/rejected": -1.397859811782837, "logps/chosen": -7.519402503967285, "logps/rejected": -204.46128845214844, "loss": 0.05536782741546631, "memory(GiB)": 178.87, "nll_loss": 0.05153592675924301, "rewards/accuracies": 1.0, "rewards/chosen": 21.256345748901367, "rewards/margins": 20.680620193481445, "rewards/rejected": 0.5757262706756592, "step": 190, "train_speed(iter/s)": 0.001936 }, { "epoch": 1.8436018957345972, "grad_norm": 0.3675084114074707, "learning_rate": 1.3953582237871521e-06, "logits/chosen": -1.3809669017791748, "logits/rejected": -1.3703250885009766, "logps/chosen": -15.006329536437988, "logps/rejected": -199.88511657714844, "loss": 0.08639336824417114, "memory(GiB)": 178.87, "nll_loss": 0.07968685030937195, "rewards/accuracies": 1.0, "rewards/chosen": 20.411216735839844, "rewards/margins": 19.984678268432617, "rewards/rejected": 0.4265367090702057, "step": 195, "train_speed(iter/s)": 0.001936 }, { "epoch": 1.890995260663507, "grad_norm": 0.941786527633667, "learning_rate": 6.217723917238128e-07, "logits/chosen": -1.3778371810913086, "logits/rejected": -1.3684101104736328, "logps/chosen": -7.696736812591553, "logps/rejected": -223.34890747070312, "loss": 0.0544640064239502, "memory(GiB)": 178.87, "nll_loss": 0.04780174046754837, "rewards/accuracies": 1.0, "rewards/chosen": 19.833518981933594, "rewards/margins": 21.463306427001953, "rewards/rejected": -1.6297862529754639, "step": 200, "train_speed(iter/s)": 0.001936 }, { "epoch": 1.890995260663507, "eval_logits/chosen": -1.3681780099868774, "eval_logits/rejected": -1.363707423210144, "eval_logps/chosen": -9.783992767333984, "eval_logps/rejected": -174.166015625, "eval_loss": 0.09675905108451843, "eval_nll_loss": 0.08484382927417755, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 21.503095626831055, "eval_rewards/margins": 17.923770904541016, "eval_rewards/rejected": 3.5793240070343018, "eval_runtime": 228.9543, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.074, "step": 200 }, { "epoch": 1.9383886255924172, "grad_norm": 0.34630173444747925, "learning_rate": 1.5568547761034004e-07, "logits/chosen": -1.3920785188674927, "logits/rejected": -1.3849624395370483, "logps/chosen": -9.088478088378906, "logps/rejected": -196.94784545898438, "loss": 0.0544456422328949, "memory(GiB)": 178.87, "nll_loss": 0.04648340493440628, "rewards/accuracies": 1.0, "rewards/chosen": 20.052505493164062, "rewards/margins": 19.269899368286133, "rewards/rejected": 0.7826067805290222, "step": 205, "train_speed(iter/s)": 0.001931 }, { "epoch": 1.985781990521327, "grad_norm": 0.3001299798488617, "learning_rate": 0.0, "logits/chosen": -1.345251202583313, "logits/rejected": -1.3417621850967407, "logps/chosen": -13.094474792480469, "logps/rejected": -177.3877410888672, "loss": 0.10830415487289428, "memory(GiB)": 178.87, "nll_loss": 0.06853027641773224, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 19.333545684814453, "rewards/margins": 17.912960052490234, "rewards/rejected": 1.4205853939056396, "step": 210, "train_speed(iter/s)": 0.00193 }, { "epoch": 1.985781990521327, "eval_logits/chosen": -1.3710763454437256, "eval_logits/rejected": -1.364762783050537, "eval_logps/chosen": -9.65355110168457, "eval_logps/rejected": -172.0828094482422, "eval_loss": 0.09402994066476822, "eval_nll_loss": 0.08348451554775238, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 21.51613998413086, "eval_rewards/margins": 17.72849464416504, "eval_rewards/rejected": 3.787644863128662, "eval_runtime": 228.9559, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.074, "step": 210 } ], "logging_steps": 5, "max_steps": 210, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.9852517843992576e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }