{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completions/clipped_ratio": 0.0078125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 485.28125, "completions/mean_terminated_length": 464.91339111328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.004266666666666667, "grad_norm": 0.520917236560749, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 231948.0, "reward": 0.2526041865348816, "reward_std": 0.36488077044487, "rewards/accuracy_reward/mean": 0.2526041567325592, "rewards/accuracy_reward/std": 0.4350726902484894, "step": 1 }, { "completions/clipped_ratio": 0.005859375000000028, "completions/max_length": 3072.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 479.0228042602539, "completions/mean_terminated_length": 463.7552032470703, "completions/min_length": 8.25, "completions/min_terminated_length": 8.25, "epoch": 0.021333333333333333, "grad_norm": 0.605783285453845, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "num_tokens": 1138331.0, "reward": 0.209635429084301, "reward_std": 0.3193562999367714, "rewards/accuracy_reward/mean": 0.2096354141831398, "rewards/accuracy_reward/std": 0.4065614491701126, "step": 5 }, { "completions/clipped_ratio": 0.010416666666666675, "completions/max_length": 3072.0, "completions/max_terminated_length": 2966.2, "completions/mean_length": 526.5625305175781, "completions/mean_terminated_length": 499.9768310546875, "completions/min_length": 8.2, "completions/min_terminated_length": 8.2, "epoch": 0.042666666666666665, "grad_norm": 0.5867270283630543, "learning_rate": 3.75e-07, "loss": 0.0, "num_tokens": 2354663.0, "reward": 0.20729167461395265, "reward_std": 0.3129316449165344, "rewards/accuracy_reward/mean": 0.2072916656732559, "rewards/accuracy_reward/std": 0.4027693569660187, "step": 10 }, { "completions/clipped_ratio": 0.00625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2882.8, "completions/mean_length": 497.3052185058594, "completions/mean_terminated_length": 481.084326171875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.064, "grad_norm": 0.4911174024789341, "learning_rate": 5.833333333333334e-07, "loss": 0.0, "num_tokens": 3548565.0, "reward": 0.24270834326744078, "reward_std": 0.3443186104297638, "rewards/accuracy_reward/mean": 0.24270833432674407, "rewards/accuracy_reward/std": 0.4274510681629181, "step": 15 }, { "completions/clipped_ratio": 0.009895833333333326, "completions/max_length": 3072.0, "completions/max_terminated_length": 2833.6, "completions/mean_length": 538.1422119140625, "completions/mean_terminated_length": 512.8737670898438, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.08533333333333333, "grad_norm": 0.41318732667873204, "learning_rate": 7.916666666666666e-07, "loss": 0.0, "num_tokens": 4802754.0, "reward": 0.309375011920929, "reward_std": 0.35375809073448183, "rewards/accuracy_reward/mean": 0.30937500298023224, "rewards/accuracy_reward/std": 0.45929681658744814, "step": 20 }, { "completions/clipped_ratio": 0.011458333333333348, "completions/max_length": 3072.0, "completions/max_terminated_length": 2732.2, "completions/mean_length": 593.0505493164062, "completions/mean_terminated_length": 564.4264831542969, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "epoch": 0.10666666666666667, "grad_norm": 0.35558586141255843, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 6160243.0, "reward": 0.4468750298023224, "reward_std": 0.3761806607246399, "rewards/accuracy_reward/mean": 0.4505040287971497, "rewards/accuracy_reward/std": 0.4906519949436188, "step": 25 }, { "completions/clipped_ratio": 0.0078124999999999774, "completions/max_length": 3072.0, "completions/max_terminated_length": 2722.4, "completions/mean_length": 617.7427368164062, "completions/mean_terminated_length": 598.4890502929687, "completions/min_length": 71.2, "completions/min_terminated_length": 71.2, "epoch": 0.128, "grad_norm": 0.278119169832429, "learning_rate": 9.986018985905899e-07, "loss": 0.0, "num_tokens": 7556165.0, "reward": 0.6734375476837158, "reward_std": 0.3005207061767578, "rewards/accuracy_reward/mean": 0.6734375119209289, "rewards/accuracy_reward/std": 0.4609092056751251, "step": 30 }, { "completions/clipped_ratio": 0.011458333333333303, "completions/max_length": 3072.0, "completions/max_terminated_length": 2779.2, "completions/mean_length": 672.75576171875, "completions/mean_terminated_length": 645.0045166015625, "completions/min_length": 131.6, "completions/min_terminated_length": 131.6, "epoch": 0.14933333333333335, "grad_norm": 0.15489867326898885, "learning_rate": 9.944154131125642e-07, "loss": 0.0, "num_tokens": 9074500.0, "reward": 0.7635416865348816, "reward_std": 0.2427810400724411, "rewards/accuracy_reward/mean": 0.7635416746139526, "rewards/accuracy_reward/std": 0.41730746626853943, "step": 35 }, { "completions/clipped_ratio": 0.020833333333333325, "completions/max_length": 3072.0, "completions/max_terminated_length": 2880.8, "completions/mean_length": 725.8890869140625, "completions/mean_terminated_length": 675.9776489257813, "completions/min_length": 164.2, "completions/min_terminated_length": 164.2, "epoch": 0.17066666666666666, "grad_norm": 0.11987732931652527, "learning_rate": 9.874639560909118e-07, "loss": 0.0, "num_tokens": 10692547.0, "reward": 0.7078125238418579, "reward_std": 0.21871779561042787, "rewards/accuracy_reward/mean": 0.707812511920929, "rewards/accuracy_reward/std": 0.4528683304786682, "step": 40 }, { "completions/clipped_ratio": 0.0265625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 758.0343872070313, "completions/mean_terminated_length": 694.8237915039062, "completions/min_length": 191.6, "completions/min_terminated_length": 191.6, "epoch": 0.192, "grad_norm": 0.18580966989387382, "learning_rate": 9.777864028930705e-07, "loss": 0.0, "num_tokens": 12377449.0, "reward": 0.7604166865348816, "reward_std": 0.21717487871646882, "rewards/accuracy_reward/mean": 0.7604166746139527, "rewards/accuracy_reward/std": 0.42273242473602296, "step": 45 }, { "completions/clipped_ratio": 0.021875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2662.2, "completions/mean_length": 726.3640869140625, "completions/mean_terminated_length": 673.8896240234375, "completions/min_length": 193.6, "completions/min_terminated_length": 193.6, "epoch": 0.21333333333333335, "grad_norm": 0.14629752797045345, "learning_rate": 9.65436874322102e-07, "loss": 0.0, "num_tokens": 13965976.0, "reward": 0.7838541865348816, "reward_std": 0.1965931236743927, "rewards/accuracy_reward/mean": 0.7838541746139527, "rewards/accuracy_reward/std": 0.40644299387931826, "step": 50 }, { "completions/clipped_ratio": 0.021354166666666674, "completions/max_length": 3072.0, "completions/max_terminated_length": 2811.2, "completions/mean_length": 761.35732421875, "completions/mean_terminated_length": 711.0923706054688, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.23466666666666666, "grad_norm": 0.1425119082261623, "learning_rate": 9.504844339512094e-07, "loss": 0.0, "num_tokens": 15644142.0, "reward": 0.7031250238418579, "reward_std": 0.20790221095085143, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4502527713775635, "step": 55 }, { "completions/clipped_ratio": 0.016666666666666673, "completions/max_length": 3072.0, "completions/max_terminated_length": 2749.4, "completions/mean_length": 753.8932495117188, "completions/mean_terminated_length": 714.2966186523438, "completions/min_length": 187.4, "completions/min_terminated_length": 187.4, "epoch": 0.256, "grad_norm": 0.1461473553004321, "learning_rate": 9.330127018922193e-07, "loss": 0.0, "num_tokens": 17331905.0, "reward": 0.6765625238418579, "reward_std": 0.23596890568733214, "rewards/accuracy_reward/mean": 0.6765625, "rewards/accuracy_reward/std": 0.4681254982948303, "step": 60 }, { "completions/clipped_ratio": 0.018750000000000024, "completions/max_length": 3072.0, "completions/max_terminated_length": 2817.2, "completions/mean_length": 685.3172119140625, "completions/mean_terminated_length": 640.050048828125, "completions/min_length": 170.4, "completions/min_terminated_length": 170.4, "epoch": 0.2773333333333333, "grad_norm": 0.14125514308235385, "learning_rate": 9.131193871579974e-07, "loss": 0.0, "num_tokens": 18859586.0, "reward": 0.7755208492279053, "reward_std": 0.2070410817861557, "rewards/accuracy_reward/mean": 0.7755208373069763, "rewards/accuracy_reward/std": 0.41079028844833376, "step": 65 }, { "completions/clipped_ratio": 0.0125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2726.4, "completions/mean_length": 656.052099609375, "completions/mean_terminated_length": 625.34423828125, "completions/min_length": 189.2, "completions/min_terminated_length": 189.2, "epoch": 0.2986666666666667, "grad_norm": 0.15750340319303233, "learning_rate": 8.909157412340149e-07, "loss": 0.0, "num_tokens": 20338386.0, "reward": 0.7687500238418579, "reward_std": 0.15872803926467896, "rewards/accuracy_reward/mean": 0.76875, "rewards/accuracy_reward/std": 0.41025813221931456, "step": 70 }, { "completions/clipped_ratio": 0.011979166666666674, "completions/max_length": 3072.0, "completions/max_terminated_length": 2759.4, "completions/mean_length": 715.2005249023438, "completions/mean_terminated_length": 686.7471069335937, "completions/min_length": 186.4, "completions/min_terminated_length": 186.4, "epoch": 0.32, "grad_norm": 0.12783574425441635, "learning_rate": 8.66525935914913e-07, "loss": 0.0, "num_tokens": 21919939.0, "reward": 0.7572916984558106, "reward_std": 0.19151964485645295, "rewards/accuracy_reward/mean": 0.7572916626930237, "rewards/accuracy_reward/std": 0.4187034428119659, "step": 75 }, { "completions/clipped_ratio": 0.008854166666666653, "completions/max_length": 3055.8, "completions/max_terminated_length": 2853.8, "completions/mean_length": 679.8994995117188, "completions/mean_terminated_length": 658.6912231445312, "completions/min_length": 179.4, "completions/min_terminated_length": 179.4, "epoch": 0.3413333333333333, "grad_norm": 0.1408805660232959, "learning_rate": 8.400863688854596e-07, "loss": 0.0, "num_tokens": 23447586.0, "reward": 0.7567708611488342, "reward_std": 0.18215587139129638, "rewards/accuracy_reward/mean": 0.7567708253860473, "rewards/accuracy_reward/std": 0.4212845742702484, "step": 80 }, { "completions/clipped_ratio": 0.0057291666666666515, "completions/max_length": 3072.0, "completions/max_terminated_length": 2647.2, "completions/mean_length": 653.5302124023438, "completions/mean_terminated_length": 639.655078125, "completions/min_length": 167.2, "completions/min_terminated_length": 167.2, "epoch": 0.3626666666666667, "grad_norm": 0.12563994203099282, "learning_rate": 8.117449009293668e-07, "loss": 0.0, "num_tokens": 24923104.0, "reward": 0.8052083492279053, "reward_std": 0.16214540898799895, "rewards/accuracy_reward/mean": 0.8052083373069763, "rewards/accuracy_reward/std": 0.39457470178604126, "step": 85 }, { "completions/clipped_ratio": 0.00625, "completions/max_length": 2936.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 619.6573120117188, "completions/mean_terminated_length": 604.2880737304688, "completions/min_length": 172.6, "completions/min_terminated_length": 172.6, "epoch": 0.384, "grad_norm": 0.1114838740449701, "learning_rate": 7.81660029031811e-07, "loss": 0.0, "num_tokens": 26341614.0, "reward": 0.770312511920929, "reward_std": 0.13281314820051193, "rewards/accuracy_reward/mean": 0.7703125, "rewards/accuracy_reward/std": 0.41931731104850767, "step": 90 }, { "completions/clipped_ratio": 0.009895833333333303, "completions/max_length": 3072.0, "completions/max_terminated_length": 2587.8, "completions/mean_length": 620.375537109375, "completions/mean_terminated_length": 595.8724243164063, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.4053333333333333, "grad_norm": 0.14635688952368842, "learning_rate": 7.5e-07, "loss": 0.0, "num_tokens": 27745255.0, "reward": 0.7864583730697632, "reward_std": 0.17735956311225892, "rewards/accuracy_reward/mean": 0.7864583134651184, "rewards/accuracy_reward/std": 0.40625172257423403, "step": 95 }, { "completions/clipped_ratio": 0.009895833333333326, "completions/max_length": 3072.0, "completions/max_terminated_length": 2805.6, "completions/mean_length": 661.9760620117188, "completions/mean_terminated_length": 637.8560546875, "completions/min_length": 170.2, "completions/min_terminated_length": 170.2, "epoch": 0.4266666666666667, "grad_norm": 0.11608458667846797, "learning_rate": 7.16941869558779e-07, "loss": 0.0, "num_tokens": 29252673.0, "reward": 0.768750011920929, "reward_std": 0.1529282122850418, "rewards/accuracy_reward/mean": 0.76875, "rewards/accuracy_reward/std": 0.4190669238567352, "step": 100 }, { "completions/clipped_ratio": 0.003645833333333348, "completions/max_length": 2994.4, "completions/max_terminated_length": 2802.2, "completions/mean_length": 656.750537109375, "completions/mean_terminated_length": 648.051611328125, "completions/min_length": 188.6, "completions/min_terminated_length": 188.6, "epoch": 0.448, "grad_norm": 0.13559667526065217, "learning_rate": 6.826705121831976e-07, "loss": 0.0, "num_tokens": 30724534.0, "reward": 0.7687500238418579, "reward_std": 0.13878327757120132, "rewards/accuracy_reward/mean": 0.76875, "rewards/accuracy_reward/std": 0.41612801551818845, "step": 105 }, { "completions/clipped_ratio": 0.005208333333333326, "completions/max_length": 2990.6, "completions/max_terminated_length": 2880.0, "completions/mean_length": 630.3677368164062, "completions/mean_terminated_length": 617.6440063476563, "completions/min_length": 165.2, "completions/min_terminated_length": 165.2, "epoch": 0.4693333333333333, "grad_norm": 0.12193914078660352, "learning_rate": 6.473775872054521e-07, "loss": 0.0, "num_tokens": 32158544.0, "reward": 0.7708333492279053, "reward_std": 0.11787779033184051, "rewards/accuracy_reward/mean": 0.7708333373069763, "rewards/accuracy_reward/std": 0.4162010133266449, "step": 110 }, { "completions/clipped_ratio": 0.011458333333333303, "completions/max_length": 3072.0, "completions/max_terminated_length": 2242.6, "completions/mean_length": 627.9906494140625, "completions/mean_terminated_length": 599.6399780273438, "completions/min_length": 164.2, "completions/min_terminated_length": 164.2, "epoch": 0.49066666666666664, "grad_norm": 0.1639782288788134, "learning_rate": 6.112604669781572e-07, "loss": 0.0, "num_tokens": 33592490.0, "reward": 0.7885416865348815, "reward_std": 0.16564108431339264, "rewards/accuracy_reward/mean": 0.7885416626930237, "rewards/accuracy_reward/std": 0.40830116868019106, "step": 115 }, { "completions/clipped_ratio": 0.005729166666666674, "completions/max_length": 2994.2, "completions/max_terminated_length": 2480.0, "completions/mean_length": 637.9521118164063, "completions/mean_terminated_length": 623.8312622070313, "completions/min_length": 180.2, "completions/min_terminated_length": 180.2, "epoch": 0.512, "grad_norm": 0.15931651060828983, "learning_rate": 5.745211330880872e-07, "loss": 0.0, "num_tokens": 35031990.0, "reward": 0.8166666746139526, "reward_std": 0.13533624559640883, "rewards/accuracy_reward/mean": 0.8166666626930237, "rewards/accuracy_reward/std": 0.3768895387649536, "step": 120 }, { "completions/clipped_ratio": 0.0031249999999999776, "completions/max_length": 3072.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 648.9698120117188, "completions/mean_terminated_length": 641.4311157226563, "completions/min_length": 189.2, "completions/min_terminated_length": 189.2, "epoch": 0.5333333333333333, "grad_norm": 0.13986988084150442, "learning_rate": 5.373650467932121e-07, "loss": 0.0, "num_tokens": 36495572.0, "reward": 0.7572916984558106, "reward_std": 0.13581392168998718, "rewards/accuracy_reward/mean": 0.7572916626930237, "rewards/accuracy_reward/std": 0.42815017104148867, "step": 125 }, { "completions/clipped_ratio": 0.005208333333333348, "completions/max_length": 2953.8, "completions/max_terminated_length": 2685.8, "completions/mean_length": 660.8781494140625, "completions/mean_terminated_length": 648.2672607421875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5546666666666666, "grad_norm": 0.09078765454466155, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 37966550.0, "reward": 0.7791666865348816, "reward_std": 0.12897869795560837, "rewards/accuracy_reward/mean": 0.7791666746139526, "rewards/accuracy_reward/std": 0.41312212944030763, "step": 130 }, { "completions/clipped_ratio": 0.008854166666666673, "completions/max_length": 3024.0, "completions/max_terminated_length": 2550.2, "completions/mean_length": 698.8916870117188, "completions/mean_terminated_length": 677.5604736328125, "completions/min_length": 168.8, "completions/min_terminated_length": 168.8, "epoch": 0.576, "grad_norm": 0.18041131034423796, "learning_rate": 4.626349532067879e-07, "loss": 0.0, "num_tokens": 39554662.0, "reward": 0.7265625238418579, "reward_std": 0.20888787209987641, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.4385166049003601, "step": 135 }, { "completions/clipped_ratio": 0.006249999999999978, "completions/max_length": 2983.0, "completions/max_terminated_length": 2668.6, "completions/mean_length": 676.8656494140625, "completions/mean_terminated_length": 661.9003540039063, "completions/min_length": 168.8, "completions/min_terminated_length": 168.8, "epoch": 0.5973333333333334, "grad_norm": 0.12763732327998065, "learning_rate": 4.254788669119127e-07, "loss": 0.0, "num_tokens": 41075236.0, "reward": 0.7583333611488342, "reward_std": 0.15409551858901976, "rewards/accuracy_reward/mean": 0.7583333373069763, "rewards/accuracy_reward/std": 0.42535403966903684, "step": 140 }, { "completions/clipped_ratio": 0.003645833333333326, "completions/max_length": 3072.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 616.8364624023437, "completions/mean_terminated_length": 607.891357421875, "completions/min_length": 165.6, "completions/min_terminated_length": 165.6, "epoch": 0.6186666666666667, "grad_norm": 0.10043809375346945, "learning_rate": 3.8873953302184283e-07, "loss": 0.0, "num_tokens": 42476678.0, "reward": 0.8140625357627869, "reward_std": 0.1411786586046219, "rewards/accuracy_reward/mean": 0.814062488079071, "rewards/accuracy_reward/std": 0.383864825963974, "step": 145 }, { "completions/clipped_ratio": 0.005208333333333326, "completions/max_length": 2962.0, "completions/max_terminated_length": 2659.2, "completions/mean_length": 704.5172119140625, "completions/mean_terminated_length": 692.15576171875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.64, "grad_norm": 0.17289725073315046, "learning_rate": 3.526224127945478e-07, "loss": 0.0, "num_tokens": 44059187.0, "reward": 0.7557291865348816, "reward_std": 0.15531594157218934, "rewards/accuracy_reward/mean": 0.7557291746139526, "rewards/accuracy_reward/std": 0.4264496624469757, "step": 150 }, { "completions/clipped_ratio": 0.003645833333333326, "completions/max_length": 2967.6, "completions/max_terminated_length": 2764.0, "completions/mean_length": 674.6942749023438, "completions/mean_terminated_length": 666.0243530273438, "completions/min_length": 200.4, "completions/min_terminated_length": 200.4, "epoch": 0.6613333333333333, "grad_norm": 0.1350951173580443, "learning_rate": 3.173294878168025e-07, "loss": 0.0, "num_tokens": 45597192.0, "reward": 0.7666666865348816, "reward_std": 0.15866111516952514, "rewards/accuracy_reward/mean": 0.7666666746139527, "rewards/accuracy_reward/std": 0.41897391676902773, "step": 155 }, { "completions/clipped_ratio": 0.007291666666666652, "completions/max_length": 3072.0, "completions/max_terminated_length": 2769.4, "completions/mean_length": 655.6703369140625, "completions/mean_terminated_length": 637.7800170898438, "completions/min_length": 183.4, "completions/min_terminated_length": 183.4, "epoch": 0.6826666666666666, "grad_norm": 0.12153652252369943, "learning_rate": 2.8305813044122093e-07, "loss": 0.0, "num_tokens": 47069847.0, "reward": 0.7557291984558105, "reward_std": 0.1263695999979973, "rewards/accuracy_reward/mean": 0.7557291507720947, "rewards/accuracy_reward/std": 0.4225205659866333, "step": 160 }, { "completions/clipped_ratio": 0.005729166666666674, "completions/max_length": 2966.6, "completions/max_terminated_length": 2621.0, "completions/mean_length": 643.0937744140625, "completions/mean_terminated_length": 629.2384765625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.704, "grad_norm": 0.1495780093047341, "learning_rate": 2.500000000000001e-07, "loss": 0.0, "num_tokens": 48532479.0, "reward": 0.7630208611488343, "reward_std": 0.1342099890112877, "rewards/accuracy_reward/mean": 0.7630208253860473, "rewards/accuracy_reward/std": 0.4203058660030365, "step": 165 }, { "completions/clipped_ratio": 0.004166666666666674, "completions/max_length": 2989.8, "completions/max_terminated_length": 2589.6, "completions/mean_length": 645.0854248046875, "completions/mean_terminated_length": 634.89453125, "completions/min_length": 188.6, "completions/min_terminated_length": 188.6, "epoch": 0.7253333333333334, "grad_norm": 0.1220322619144304, "learning_rate": 2.1833997096818895e-07, "loss": 0.0, "num_tokens": 49999139.0, "reward": 0.8135416746139527, "reward_std": 0.13035370856523515, "rewards/accuracy_reward/mean": 0.8135416746139527, "rewards/accuracy_reward/std": 0.38940677642822263, "step": 170 }, { "completions/clipped_ratio": 0.00625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2509.8, "completions/mean_length": 697.6880493164062, "completions/mean_terminated_length": 682.7364379882813, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7466666666666667, "grad_norm": 0.11867987998261496, "learning_rate": 1.8825509907063326e-07, "loss": 0.0, "num_tokens": 51583392.0, "reward": 0.7697916746139526, "reward_std": 0.17446283400058746, "rewards/accuracy_reward/mean": 0.7697916626930237, "rewards/accuracy_reward/std": 0.4213572680950165, "step": 175 }, { "completions/clipped_ratio": 0.002604166666666674, "completions/max_length": 2956.6, "completions/max_terminated_length": 2636.4, "completions/mean_length": 654.2448120117188, "completions/mean_terminated_length": 647.95419921875, "completions/min_length": 179.6, "completions/min_terminated_length": 179.6, "epoch": 0.768, "grad_norm": 0.15075792052642348, "learning_rate": 1.599136311145402e-07, "loss": 0.0, "num_tokens": 53082926.0, "reward": 0.7802083492279053, "reward_std": 0.14422165006399154, "rewards/accuracy_reward/mean": 0.7802083373069764, "rewards/accuracy_reward/std": 0.4107910990715027, "step": 180 }, { "completions/clipped_ratio": 0.00625, "completions/max_length": 2964.6, "completions/max_terminated_length": 2699.0, "completions/mean_length": 665.9271118164063, "completions/mean_terminated_length": 650.812109375, "completions/min_length": 187.4, "completions/min_terminated_length": 187.4, "epoch": 0.7893333333333333, "grad_norm": 0.13022354229475042, "learning_rate": 1.3347406408508694e-07, "loss": 0.0, "num_tokens": 54587154.0, "reward": 0.7880208611488342, "reward_std": 0.150584477186203, "rewards/accuracy_reward/mean": 0.7880208253860473, "rewards/accuracy_reward/std": 0.40536269545555115, "step": 185 }, { "completions/clipped_ratio": 0.006250000000000022, "completions/max_length": 3031.0, "completions/max_terminated_length": 2392.6, "completions/mean_length": 632.9718994140625, "completions/mean_terminated_length": 617.5747314453125, "completions/min_length": 173.4, "completions/min_terminated_length": 173.4, "epoch": 0.8106666666666666, "grad_norm": 0.1380258547721106, "learning_rate": 1.090842587659851e-07, "loss": 0.0, "num_tokens": 56028048.0, "reward": 0.7979166865348816, "reward_std": 0.1520715445280075, "rewards/accuracy_reward/mean": 0.7979166626930236, "rewards/accuracy_reward/std": 0.39996551275253295, "step": 190 }, { "completions/clipped_ratio": 0.003125, "completions/max_length": 3059.2, "completions/max_terminated_length": 2665.2, "completions/mean_length": 669.544287109375, "completions/mean_terminated_length": 661.9738891601562, "completions/min_length": 203.8, "completions/min_terminated_length": 203.8, "epoch": 0.832, "grad_norm": 0.14757967977113381, "learning_rate": 8.688061284200265e-08, "loss": 0.0, "num_tokens": 57527497.0, "reward": 0.782812523841858, "reward_std": 0.1453171357512474, "rewards/accuracy_reward/mean": 0.7828125, "rewards/accuracy_reward/std": 0.40937721729278564, "step": 195 }, { "completions/clipped_ratio": 0.005729166666666674, "completions/max_length": 3072.0, "completions/max_terminated_length": 2780.8, "completions/mean_length": 659.2458618164062, "completions/mean_terminated_length": 645.3434936523438, "completions/min_length": 196.6, "completions/min_terminated_length": 196.6, "epoch": 0.8533333333333334, "grad_norm": 0.1274348951804976, "learning_rate": 6.698729810778064e-08, "loss": 0.0, "num_tokens": 59008697.0, "reward": 0.785937511920929, "reward_std": 0.13709688782691956, "rewards/accuracy_reward/mean": 0.785937511920929, "rewards/accuracy_reward/std": 0.4097227990627289, "step": 200 }, { "completions/clipped_ratio": 0.0031249999999999776, "completions/max_length": 3072.0, "completions/max_terminated_length": 2707.2, "completions/mean_length": 673.1026245117188, "completions/mean_terminated_length": 665.640576171875, "completions/min_length": 171.2, "completions/min_terminated_length": 171.2, "epoch": 0.8746666666666667, "grad_norm": 0.1367770121399511, "learning_rate": 4.951556604879048e-08, "loss": 0.0, "num_tokens": 60514390.0, "reward": 0.8041666746139526, "reward_std": 0.15395441949367522, "rewards/accuracy_reward/mean": 0.8041666746139526, "rewards/accuracy_reward/std": 0.39167559146881104, "step": 205 }, { "completions/clipped_ratio": 0.00625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2710.8, "completions/mean_length": 679.5974243164062, "completions/mean_terminated_length": 664.3808837890625, "completions/min_length": 177.4, "completions/min_terminated_length": 177.4, "epoch": 0.896, "grad_norm": 0.16010582334244083, "learning_rate": 3.456312567789793e-08, "loss": 0.0, "num_tokens": 62067569.0, "reward": 0.7369791984558105, "reward_std": 0.16339227557182312, "rewards/accuracy_reward/mean": 0.7369791626930237, "rewards/accuracy_reward/std": 0.43414112329483034, "step": 210 }, { "completions/clipped_ratio": 0.00625, "completions/max_length": 2965.4, "completions/max_terminated_length": 2622.6, "completions/mean_length": 707.8198120117188, "completions/mean_terminated_length": 692.9181030273437, "completions/min_length": 169.2, "completions/min_terminated_length": 169.2, "epoch": 0.9173333333333333, "grad_norm": 0.1209267022932848, "learning_rate": 2.2213597106929605e-08, "loss": 0.0, "num_tokens": 63633403.0, "reward": 0.746875011920929, "reward_std": 0.14179472625255585, "rewards/accuracy_reward/mean": 0.746875, "rewards/accuracy_reward/std": 0.432002580165863, "step": 215 }, { "completions/clipped_ratio": 0.004687500000000022, "completions/max_length": 3072.0, "completions/max_terminated_length": 2537.2, "completions/mean_length": 596.109912109375, "completions/mean_terminated_length": 584.4670288085938, "completions/min_length": 180.6, "completions/min_terminated_length": 180.6, "epoch": 0.9386666666666666, "grad_norm": 0.18645596334487735, "learning_rate": 1.253604390908819e-08, "loss": 0.0, "num_tokens": 65006918.0, "reward": 0.7916666865348816, "reward_std": 0.15821044147014618, "rewards/accuracy_reward/mean": 0.7967406034469604, "rewards/accuracy_reward/std": 0.40229756832122804, "step": 220 }, { "completions/clipped_ratio": 0.005208333333333348, "completions/max_length": 3072.0, "completions/max_terminated_length": 2708.0, "completions/mean_length": 658.136474609375, "completions/mean_terminated_length": 645.4781127929688, "completions/min_length": 169.4, "completions/min_terminated_length": 169.4, "epoch": 0.96, "grad_norm": 0.13788320837467583, "learning_rate": 5.5845868874357385e-09, "loss": 0.0, "num_tokens": 66499368.0, "reward": 0.7505208492279053, "reward_std": 0.1512398064136505, "rewards/accuracy_reward/mean": 0.7505208253860474, "rewards/accuracy_reward/std": 0.4306790828704834, "step": 225 }, { "completions/clipped_ratio": 0.0046875, "completions/max_length": 3019.8, "completions/max_terminated_length": 2794.2, "completions/mean_length": 651.5031494140625, "completions/mean_terminated_length": 640.1200805664063, "completions/min_length": 174.6, "completions/min_terminated_length": 174.6, "epoch": 0.9813333333333333, "grad_norm": 0.16453090993765349, "learning_rate": 1.3981014094099353e-09, "loss": 0.0, "num_tokens": 67974774.0, "reward": 0.7588541746139527, "reward_std": 0.16142528057098388, "rewards/accuracy_reward/mean": 0.7588541626930236, "rewards/accuracy_reward/std": 0.4191846787929535, "step": 230 }, { "completions/clipped_ratio": 0.0006510416666666574, "completions/max_length": 2756.5, "completions/max_terminated_length": 2658.25, "completions/mean_length": 666.6046752929688, "completions/mean_terminated_length": 665.0616302490234, "completions/min_length": 202.5, "completions/min_terminated_length": 202.5, "epoch": 0.9984, "num_tokens": 69188684.0, "reward": 0.802083358168602, "reward_std": 0.115657864138484, "rewards/accuracy_reward/mean": 0.8020833283662796, "rewards/accuracy_reward/std": 0.3962276577949524, "step": 234, "total_flos": 0.0, "train_loss": -0.0001143101905266668, "train_runtime": 41451.8465, "train_samples_per_second": 0.181, "train_steps_per_second": 0.006 } ], "logging_steps": 5, "max_steps": 234, "num_input_tokens_seen": 69188684, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }