{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 212, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009478672985781991, "grad_norm": 1.1718926429748535, "learning_rate": 0.0, "loss": 2.6629, "step": 1 }, { "epoch": 0.018957345971563982, "grad_norm": 1.1212024688720703, "learning_rate": 6.666666666666667e-07, "loss": 2.5806, "step": 2 }, { "epoch": 0.02843601895734597, "grad_norm": 1.225563406944275, "learning_rate": 1.3333333333333334e-06, "loss": 2.6841, "step": 3 }, { "epoch": 0.037914691943127965, "grad_norm": 1.2255617380142212, "learning_rate": 2.0000000000000003e-06, "loss": 2.6513, "step": 4 }, { "epoch": 0.04739336492890995, "grad_norm": 1.000268578529358, "learning_rate": 2.666666666666667e-06, "loss": 2.6722, "step": 5 }, { "epoch": 0.05687203791469194, "grad_norm": 0.9953453540802002, "learning_rate": 3.3333333333333333e-06, "loss": 2.4666, "step": 6 }, { "epoch": 0.06635071090047394, "grad_norm": 1.158920168876648, "learning_rate": 4.000000000000001e-06, "loss": 2.6296, "step": 7 }, { "epoch": 0.07582938388625593, "grad_norm": 1.0453510284423828, "learning_rate": 4.666666666666667e-06, "loss": 2.599, "step": 8 }, { "epoch": 0.08530805687203792, "grad_norm": 0.9960002899169922, "learning_rate": 5.333333333333334e-06, "loss": 2.6475, "step": 9 }, { "epoch": 0.0947867298578199, "grad_norm": 1.2907602787017822, "learning_rate": 6e-06, "loss": 2.5586, "step": 10 }, { "epoch": 0.10426540284360189, "grad_norm": 1.0515801906585693, "learning_rate": 6.666666666666667e-06, "loss": 2.5806, "step": 11 }, { "epoch": 0.11374407582938388, "grad_norm": 1.070308804512024, "learning_rate": 7.333333333333333e-06, "loss": 2.7149, "step": 12 }, { "epoch": 0.12322274881516587, "grad_norm": 0.9910991191864014, "learning_rate": 8.000000000000001e-06, "loss": 2.6353, "step": 13 }, { "epoch": 0.13270142180094788, "grad_norm": 0.8306079506874084, "learning_rate": 8.666666666666668e-06, "loss": 2.5439, "step": 14 }, { "epoch": 0.14218009478672985, "grad_norm": 0.945061981678009, "learning_rate": 9.333333333333334e-06, "loss": 2.5597, "step": 15 }, { "epoch": 0.15165876777251186, "grad_norm": 0.8972373604774475, "learning_rate": 1e-05, "loss": 2.4496, "step": 16 }, { "epoch": 0.16113744075829384, "grad_norm": 0.9250657558441162, "learning_rate": 1.0666666666666667e-05, "loss": 2.5905, "step": 17 }, { "epoch": 0.17061611374407584, "grad_norm": 0.8590799570083618, "learning_rate": 1.1333333333333334e-05, "loss": 2.4993, "step": 18 }, { "epoch": 0.18009478672985782, "grad_norm": 0.8903495669364929, "learning_rate": 1.2e-05, "loss": 2.4999, "step": 19 }, { "epoch": 0.1895734597156398, "grad_norm": 0.9213452935218811, "learning_rate": 1.2666666666666667e-05, "loss": 2.4677, "step": 20 }, { "epoch": 0.1990521327014218, "grad_norm": 0.8921183943748474, "learning_rate": 1.3333333333333333e-05, "loss": 2.4343, "step": 21 }, { "epoch": 0.20853080568720378, "grad_norm": 0.767433226108551, "learning_rate": 1.4e-05, "loss": 2.5294, "step": 22 }, { "epoch": 0.21800947867298578, "grad_norm": 0.8856372237205505, "learning_rate": 1.4666666666666666e-05, "loss": 2.4663, "step": 23 }, { "epoch": 0.22748815165876776, "grad_norm": 0.8512458801269531, "learning_rate": 1.5333333333333334e-05, "loss": 2.5763, "step": 24 }, { "epoch": 0.23696682464454977, "grad_norm": 0.8298342823982239, "learning_rate": 1.6000000000000003e-05, "loss": 2.4612, "step": 25 }, { "epoch": 0.24644549763033174, "grad_norm": 1.0803931951522827, "learning_rate": 1.6666666666666667e-05, "loss": 2.515, "step": 26 }, { "epoch": 0.2559241706161137, "grad_norm": 0.8421900868415833, "learning_rate": 1.7333333333333336e-05, "loss": 2.5489, "step": 27 }, { "epoch": 0.26540284360189575, "grad_norm": 0.9771528244018555, "learning_rate": 1.8e-05, "loss": 2.4214, "step": 28 }, { "epoch": 0.27488151658767773, "grad_norm": 1.0094590187072754, "learning_rate": 1.866666666666667e-05, "loss": 2.3959, "step": 29 }, { "epoch": 0.2843601895734597, "grad_norm": 0.9626856446266174, "learning_rate": 1.9333333333333333e-05, "loss": 2.4792, "step": 30 }, { "epoch": 0.2938388625592417, "grad_norm": 0.843460738658905, "learning_rate": 2e-05, "loss": 2.5486, "step": 31 }, { "epoch": 0.3033175355450237, "grad_norm": 0.8346056342124939, "learning_rate": 1.99994050500015e-05, "loss": 2.3746, "step": 32 }, { "epoch": 0.3127962085308057, "grad_norm": 0.9779443144798279, "learning_rate": 1.999762027079909e-05, "loss": 2.4543, "step": 33 }, { "epoch": 0.3222748815165877, "grad_norm": 1.2702308893203735, "learning_rate": 1.9994645874763657e-05, "loss": 2.4247, "step": 34 }, { "epoch": 0.33175355450236965, "grad_norm": 0.9161669611930847, "learning_rate": 1.999048221581858e-05, "loss": 2.5287, "step": 35 }, { "epoch": 0.3412322274881517, "grad_norm": 0.8298777937889099, "learning_rate": 1.9985129789397633e-05, "loss": 2.4522, "step": 36 }, { "epoch": 0.35071090047393366, "grad_norm": 0.8846487402915955, "learning_rate": 1.9978589232386036e-05, "loss": 2.4225, "step": 37 }, { "epoch": 0.36018957345971564, "grad_norm": 0.7903144955635071, "learning_rate": 1.9970861323044667e-05, "loss": 2.421, "step": 38 }, { "epoch": 0.3696682464454976, "grad_norm": 0.8443423509597778, "learning_rate": 1.9961946980917457e-05, "loss": 2.4323, "step": 39 }, { "epoch": 0.3791469194312796, "grad_norm": 0.9062912464141846, "learning_rate": 1.995184726672197e-05, "loss": 2.4179, "step": 40 }, { "epoch": 0.3886255924170616, "grad_norm": 0.8565083146095276, "learning_rate": 1.9940563382223196e-05, "loss": 2.3007, "step": 41 }, { "epoch": 0.3981042654028436, "grad_norm": 0.806698739528656, "learning_rate": 1.9928096670090552e-05, "loss": 2.3509, "step": 42 }, { "epoch": 0.4075829383886256, "grad_norm": 0.8576929569244385, "learning_rate": 1.9914448613738107e-05, "loss": 2.4125, "step": 43 }, { "epoch": 0.41706161137440756, "grad_norm": 1.650278925895691, "learning_rate": 1.989962083714808e-05, "loss": 2.3508, "step": 44 }, { "epoch": 0.4265402843601896, "grad_norm": 0.8928993344306946, "learning_rate": 1.988361510467761e-05, "loss": 2.3976, "step": 45 }, { "epoch": 0.43601895734597157, "grad_norm": 0.8806268572807312, "learning_rate": 1.9866433320848793e-05, "loss": 2.3067, "step": 46 }, { "epoch": 0.44549763033175355, "grad_norm": 0.8654487729072571, "learning_rate": 1.9848077530122083e-05, "loss": 2.4552, "step": 47 }, { "epoch": 0.4549763033175355, "grad_norm": 1.3034582138061523, "learning_rate": 1.9828549916653013e-05, "loss": 2.3354, "step": 48 }, { "epoch": 0.46445497630331756, "grad_norm": 0.9486224055290222, "learning_rate": 1.9807852804032306e-05, "loss": 2.4246, "step": 49 }, { "epoch": 0.47393364928909953, "grad_norm": 0.9173024296760559, "learning_rate": 1.9785988655009386e-05, "loss": 2.4309, "step": 50 }, { "epoch": 0.4834123222748815, "grad_norm": 0.9614086747169495, "learning_rate": 1.9762960071199334e-05, "loss": 2.1943, "step": 51 }, { "epoch": 0.4928909952606635, "grad_norm": 0.9894729852676392, "learning_rate": 1.9738769792773338e-05, "loss": 2.3974, "step": 52 }, { "epoch": 0.5023696682464455, "grad_norm": 0.8646323084831238, "learning_rate": 1.9713420698132614e-05, "loss": 2.3535, "step": 53 }, { "epoch": 0.5118483412322274, "grad_norm": 0.9549570083618164, "learning_rate": 1.9686915803565934e-05, "loss": 2.3486, "step": 54 }, { "epoch": 0.5213270142180095, "grad_norm": 0.8866903781890869, "learning_rate": 1.9659258262890683e-05, "loss": 2.3381, "step": 55 }, { "epoch": 0.5308056872037915, "grad_norm": 0.8965452909469604, "learning_rate": 1.963045136707763e-05, "loss": 2.3125, "step": 56 }, { "epoch": 0.5402843601895735, "grad_norm": 1.01371169090271, "learning_rate": 1.960049854385929e-05, "loss": 2.4584, "step": 57 }, { "epoch": 0.5497630331753555, "grad_norm": 0.9794333577156067, "learning_rate": 1.956940335732209e-05, "loss": 2.4287, "step": 58 }, { "epoch": 0.5592417061611374, "grad_norm": 1.146530270576477, "learning_rate": 1.953716950748227e-05, "loss": 2.4154, "step": 59 }, { "epoch": 0.5687203791469194, "grad_norm": 0.9316839575767517, "learning_rate": 1.9503800829845613e-05, "loss": 2.4184, "step": 60 }, { "epoch": 0.5781990521327014, "grad_norm": 0.8945372700691223, "learning_rate": 1.946930129495106e-05, "loss": 2.3579, "step": 61 }, { "epoch": 0.5876777251184834, "grad_norm": 0.9607009291648865, "learning_rate": 1.9433675007898255e-05, "loss": 2.4485, "step": 62 }, { "epoch": 0.5971563981042654, "grad_norm": 0.986282467842102, "learning_rate": 1.9396926207859085e-05, "loss": 2.4574, "step": 63 }, { "epoch": 0.6066350710900474, "grad_norm": 0.9698584079742432, "learning_rate": 1.935905926757326e-05, "loss": 2.3278, "step": 64 }, { "epoch": 0.6161137440758294, "grad_norm": 0.9542692303657532, "learning_rate": 1.932007869282799e-05, "loss": 2.3371, "step": 65 }, { "epoch": 0.6255924170616114, "grad_norm": 0.9010556936264038, "learning_rate": 1.9279989121921846e-05, "loss": 2.34, "step": 66 }, { "epoch": 0.6350710900473934, "grad_norm": 0.9130122661590576, "learning_rate": 1.9238795325112867e-05, "loss": 2.3215, "step": 67 }, { "epoch": 0.6445497630331753, "grad_norm": 1.236346960067749, "learning_rate": 1.9196502204050925e-05, "loss": 2.3667, "step": 68 }, { "epoch": 0.6540284360189573, "grad_norm": 1.457154393196106, "learning_rate": 1.9153114791194475e-05, "loss": 2.3762, "step": 69 }, { "epoch": 0.6635071090047393, "grad_norm": 0.8446010947227478, "learning_rate": 1.910863824921176e-05, "loss": 2.2987, "step": 70 }, { "epoch": 0.6729857819905213, "grad_norm": 0.9021546840667725, "learning_rate": 1.9063077870366504e-05, "loss": 2.3261, "step": 71 }, { "epoch": 0.6824644549763034, "grad_norm": 1.0001662969589233, "learning_rate": 1.901643907588816e-05, "loss": 2.3167, "step": 72 }, { "epoch": 0.6919431279620853, "grad_norm": 0.9423337578773499, "learning_rate": 1.8968727415326885e-05, "loss": 2.4218, "step": 73 }, { "epoch": 0.7014218009478673, "grad_norm": 0.9422402381896973, "learning_rate": 1.8919948565893144e-05, "loss": 2.3388, "step": 74 }, { "epoch": 0.7109004739336493, "grad_norm": 0.9633351564407349, "learning_rate": 1.887010833178222e-05, "loss": 2.3728, "step": 75 }, { "epoch": 0.7203791469194313, "grad_norm": 0.9586493968963623, "learning_rate": 1.881921264348355e-05, "loss": 2.3689, "step": 76 }, { "epoch": 0.7298578199052133, "grad_norm": 0.8738155961036682, "learning_rate": 1.876726755707508e-05, "loss": 2.2707, "step": 77 }, { "epoch": 0.7393364928909952, "grad_norm": 0.9077432751655579, "learning_rate": 1.8714279253502616e-05, "loss": 2.385, "step": 78 }, { "epoch": 0.7488151658767772, "grad_norm": 0.8949682116508484, "learning_rate": 1.866025403784439e-05, "loss": 2.2765, "step": 79 }, { "epoch": 0.7582938388625592, "grad_norm": 0.8834524750709534, "learning_rate": 1.860519833856079e-05, "loss": 2.274, "step": 80 }, { "epoch": 0.7677725118483413, "grad_norm": 1.1563708782196045, "learning_rate": 1.854911870672947e-05, "loss": 2.3958, "step": 81 }, { "epoch": 0.7772511848341233, "grad_norm": 0.914176881313324, "learning_rate": 1.849202181526579e-05, "loss": 2.2595, "step": 82 }, { "epoch": 0.7867298578199052, "grad_norm": 1.2525924444198608, "learning_rate": 1.843391445812886e-05, "loss": 2.3114, "step": 83 }, { "epoch": 0.7962085308056872, "grad_norm": 0.9277448654174805, "learning_rate": 1.837480354951308e-05, "loss": 2.2949, "step": 84 }, { "epoch": 0.8056872037914692, "grad_norm": 0.9295048117637634, "learning_rate": 1.8314696123025456e-05, "loss": 2.2778, "step": 85 }, { "epoch": 0.8151658767772512, "grad_norm": 0.9221338033676147, "learning_rate": 1.8253599330848638e-05, "loss": 2.35, "step": 86 }, { "epoch": 0.8246445497630331, "grad_norm": 0.910460889339447, "learning_rate": 1.819152044288992e-05, "loss": 2.2655, "step": 87 }, { "epoch": 0.8341232227488151, "grad_norm": 1.0350533723831177, "learning_rate": 1.8128466845916156e-05, "loss": 2.4712, "step": 88 }, { "epoch": 0.8436018957345972, "grad_norm": 1.0058749914169312, "learning_rate": 1.806444604267483e-05, "loss": 2.4232, "step": 89 }, { "epoch": 0.8530805687203792, "grad_norm": 0.8652654886245728, "learning_rate": 1.7999465651001297e-05, "loss": 2.3399, "step": 90 }, { "epoch": 0.8625592417061612, "grad_norm": 0.9553002119064331, "learning_rate": 1.7933533402912354e-05, "loss": 2.2772, "step": 91 }, { "epoch": 0.8720379146919431, "grad_norm": 0.9334009885787964, "learning_rate": 1.786665714368617e-05, "loss": 2.3421, "step": 92 }, { "epoch": 0.8815165876777251, "grad_norm": 0.9072061777114868, "learning_rate": 1.7798844830928818e-05, "loss": 2.2691, "step": 93 }, { "epoch": 0.8909952606635071, "grad_norm": 0.9546661376953125, "learning_rate": 1.773010453362737e-05, "loss": 2.3345, "step": 94 }, { "epoch": 0.9004739336492891, "grad_norm": 1.0151286125183105, "learning_rate": 1.766044443118978e-05, "loss": 2.2977, "step": 95 }, { "epoch": 0.909952606635071, "grad_norm": 0.8759846687316895, "learning_rate": 1.758987281247162e-05, "loss": 2.3405, "step": 96 }, { "epoch": 0.919431279620853, "grad_norm": 1.0017191171646118, "learning_rate": 1.7518398074789776e-05, "loss": 2.3445, "step": 97 }, { "epoch": 0.9289099526066351, "grad_norm": 0.9899027943611145, "learning_rate": 1.7446028722923266e-05, "loss": 2.3084, "step": 98 }, { "epoch": 0.9383886255924171, "grad_norm": 0.9640536904335022, "learning_rate": 1.737277336810124e-05, "loss": 2.2341, "step": 99 }, { "epoch": 0.9478672985781991, "grad_norm": 0.917986273765564, "learning_rate": 1.7298640726978357e-05, "loss": 2.3085, "step": 100 }, { "epoch": 0.957345971563981, "grad_norm": 0.9574633240699768, "learning_rate": 1.7223639620597556e-05, "loss": 2.3822, "step": 101 }, { "epoch": 0.966824644549763, "grad_norm": 0.9915265440940857, "learning_rate": 1.7147778973340466e-05, "loss": 2.3087, "step": 102 }, { "epoch": 0.976303317535545, "grad_norm": 0.9175536632537842, "learning_rate": 1.7071067811865477e-05, "loss": 2.3098, "step": 103 }, { "epoch": 0.985781990521327, "grad_norm": 0.9363272190093994, "learning_rate": 1.699351526403367e-05, "loss": 2.3394, "step": 104 }, { "epoch": 0.995260663507109, "grad_norm": 1.0321186780929565, "learning_rate": 1.6915130557822698e-05, "loss": 2.1852, "step": 105 }, { "epoch": 1.0, "grad_norm": 1.5194014310836792, "learning_rate": 1.6835923020228714e-05, "loss": 2.3883, "step": 106 }, { "epoch": 1.009478672985782, "grad_norm": 0.9262144565582275, "learning_rate": 1.6755902076156606e-05, "loss": 2.299, "step": 107 }, { "epoch": 1.018957345971564, "grad_norm": 0.943099319934845, "learning_rate": 1.6675077247298475e-05, "loss": 2.3014, "step": 108 }, { "epoch": 1.028436018957346, "grad_norm": 0.9212830066680908, "learning_rate": 1.659345815100069e-05, "loss": 2.2374, "step": 109 }, { "epoch": 1.037914691943128, "grad_norm": 0.9187367558479309, "learning_rate": 1.6511054499119493e-05, "loss": 2.2557, "step": 110 }, { "epoch": 1.04739336492891, "grad_norm": 0.9865244626998901, "learning_rate": 1.6427876096865394e-05, "loss": 2.2526, "step": 111 }, { "epoch": 1.0568720379146919, "grad_norm": 0.9716766476631165, "learning_rate": 1.6343932841636455e-05, "loss": 2.252, "step": 112 }, { "epoch": 1.066350710900474, "grad_norm": 0.9111779928207397, "learning_rate": 1.6259234721840595e-05, "loss": 2.3602, "step": 113 }, { "epoch": 1.0758293838862558, "grad_norm": 1.201020359992981, "learning_rate": 1.6173791815707053e-05, "loss": 2.2086, "step": 114 }, { "epoch": 1.085308056872038, "grad_norm": 0.9832363724708557, "learning_rate": 1.608761429008721e-05, "loss": 2.2785, "step": 115 }, { "epoch": 1.09478672985782, "grad_norm": 1.136888027191162, "learning_rate": 1.6000712399244813e-05, "loss": 2.169, "step": 116 }, { "epoch": 1.1042654028436019, "grad_norm": 1.6922038793563843, "learning_rate": 1.5913096483635827e-05, "loss": 2.3197, "step": 117 }, { "epoch": 1.113744075829384, "grad_norm": 0.9305840730667114, "learning_rate": 1.5824776968678024e-05, "loss": 2.3094, "step": 118 }, { "epoch": 1.1232227488151658, "grad_norm": 0.9827175140380859, "learning_rate": 1.573576436351046e-05, "loss": 2.2807, "step": 119 }, { "epoch": 1.132701421800948, "grad_norm": 1.0022766590118408, "learning_rate": 1.5646069259743007e-05, "loss": 2.246, "step": 120 }, { "epoch": 1.1421800947867298, "grad_norm": 1.0958722829818726, "learning_rate": 1.5555702330196024e-05, "loss": 2.3214, "step": 121 }, { "epoch": 1.1516587677725119, "grad_norm": 1.084460973739624, "learning_rate": 1.5464674327630437e-05, "loss": 2.2418, "step": 122 }, { "epoch": 1.161137440758294, "grad_norm": 1.0701860189437866, "learning_rate": 1.5372996083468242e-05, "loss": 2.4143, "step": 123 }, { "epoch": 1.1706161137440758, "grad_norm": 0.9905973672866821, "learning_rate": 1.528067850650368e-05, "loss": 2.2589, "step": 124 }, { "epoch": 1.180094786729858, "grad_norm": 1.046353816986084, "learning_rate": 1.5187732581605217e-05, "loss": 2.2511, "step": 125 }, { "epoch": 1.1895734597156398, "grad_norm": 1.0096780061721802, "learning_rate": 1.509416936840842e-05, "loss": 2.2505, "step": 126 }, { "epoch": 1.1990521327014219, "grad_norm": 1.0571825504302979, "learning_rate": 1.5000000000000002e-05, "loss": 2.3112, "step": 127 }, { "epoch": 1.2085308056872037, "grad_norm": 1.090767741203308, "learning_rate": 1.4905235681593079e-05, "loss": 2.1126, "step": 128 }, { "epoch": 1.2180094786729858, "grad_norm": 1.2226624488830566, "learning_rate": 1.4809887689193878e-05, "loss": 2.2798, "step": 129 }, { "epoch": 1.2274881516587677, "grad_norm": 1.1004244089126587, "learning_rate": 1.4713967368259981e-05, "loss": 2.3454, "step": 130 }, { "epoch": 1.2369668246445498, "grad_norm": 0.963148295879364, "learning_rate": 1.4617486132350343e-05, "loss": 2.2193, "step": 131 }, { "epoch": 1.2464454976303316, "grad_norm": 1.0694116353988647, "learning_rate": 1.45204554617672e-05, "loss": 2.146, "step": 132 }, { "epoch": 1.2559241706161137, "grad_norm": 1.0674649477005005, "learning_rate": 1.4422886902190014e-05, "loss": 2.3284, "step": 133 }, { "epoch": 1.2654028436018958, "grad_norm": 1.1457089185714722, "learning_rate": 1.4324792063301662e-05, "loss": 2.2393, "step": 134 }, { "epoch": 1.2748815165876777, "grad_norm": 1.0317751169204712, "learning_rate": 1.4226182617406996e-05, "loss": 2.2869, "step": 135 }, { "epoch": 1.2843601895734598, "grad_norm": 1.0415928363800049, "learning_rate": 1.4127070298043949e-05, "loss": 2.231, "step": 136 }, { "epoch": 1.2938388625592416, "grad_norm": 1.224968433380127, "learning_rate": 1.4027466898587375e-05, "loss": 2.1672, "step": 137 }, { "epoch": 1.3033175355450237, "grad_norm": 1.0224634408950806, "learning_rate": 1.3927384270845744e-05, "loss": 2.3001, "step": 138 }, { "epoch": 1.3127962085308056, "grad_norm": 1.098523497581482, "learning_rate": 1.3826834323650899e-05, "loss": 2.2753, "step": 139 }, { "epoch": 1.3222748815165877, "grad_norm": 1.060198426246643, "learning_rate": 1.372582902144103e-05, "loss": 2.2222, "step": 140 }, { "epoch": 1.3317535545023698, "grad_norm": 1.015621304512024, "learning_rate": 1.3624380382837017e-05, "loss": 2.2784, "step": 141 }, { "epoch": 1.3412322274881516, "grad_norm": 1.0406889915466309, "learning_rate": 1.3522500479212337e-05, "loss": 2.2017, "step": 142 }, { "epoch": 1.3507109004739337, "grad_norm": 1.1058852672576904, "learning_rate": 1.342020143325669e-05, "loss": 2.3003, "step": 143 }, { "epoch": 1.3601895734597156, "grad_norm": 1.0425012111663818, "learning_rate": 1.3317495417533523e-05, "loss": 2.2575, "step": 144 }, { "epoch": 1.3696682464454977, "grad_norm": 1.1006783246994019, "learning_rate": 1.3214394653031616e-05, "loss": 2.242, "step": 145 }, { "epoch": 1.3791469194312795, "grad_norm": 2.838590145111084, "learning_rate": 1.3110911407710909e-05, "loss": 2.263, "step": 146 }, { "epoch": 1.3886255924170616, "grad_norm": 1.3155434131622314, "learning_rate": 1.300705799504273e-05, "loss": 2.2381, "step": 147 }, { "epoch": 1.3981042654028437, "grad_norm": 0.9725741147994995, "learning_rate": 1.2902846772544625e-05, "loss": 2.1814, "step": 148 }, { "epoch": 1.4075829383886256, "grad_norm": 1.072764277458191, "learning_rate": 1.2798290140309924e-05, "loss": 2.2758, "step": 149 }, { "epoch": 1.4170616113744074, "grad_norm": 1.0916640758514404, "learning_rate": 1.2693400539532263e-05, "loss": 2.1405, "step": 150 }, { "epoch": 1.4265402843601895, "grad_norm": 1.2010222673416138, "learning_rate": 1.2588190451025209e-05, "loss": 2.2559, "step": 151 }, { "epoch": 1.4360189573459716, "grad_norm": 1.0151344537734985, "learning_rate": 1.2482672393737164e-05, "loss": 2.1031, "step": 152 }, { "epoch": 1.4454976303317535, "grad_norm": 1.0264817476272583, "learning_rate": 1.2376858923261732e-05, "loss": 2.2273, "step": 153 }, { "epoch": 1.4549763033175356, "grad_norm": 1.0587806701660156, "learning_rate": 1.2270762630343734e-05, "loss": 2.2498, "step": 154 }, { "epoch": 1.4644549763033177, "grad_norm": 2.114945411682129, "learning_rate": 1.2164396139381029e-05, "loss": 2.2337, "step": 155 }, { "epoch": 1.4739336492890995, "grad_norm": 1.2019884586334229, "learning_rate": 1.205777210692235e-05, "loss": 2.2812, "step": 156 }, { "epoch": 1.4834123222748814, "grad_norm": 2.341759443283081, "learning_rate": 1.1950903220161286e-05, "loss": 2.3088, "step": 157 }, { "epoch": 1.4928909952606635, "grad_norm": 1.7187213897705078, "learning_rate": 1.1843802195426634e-05, "loss": 2.2101, "step": 158 }, { "epoch": 1.5023696682464456, "grad_norm": 1.2256115674972534, "learning_rate": 1.1736481776669307e-05, "loss": 2.2221, "step": 159 }, { "epoch": 1.5118483412322274, "grad_norm": 1.133819580078125, "learning_rate": 1.162895473394589e-05, "loss": 2.2709, "step": 160 }, { "epoch": 1.5213270142180095, "grad_norm": 1.2712329626083374, "learning_rate": 1.1521233861899168e-05, "loss": 2.3259, "step": 161 }, { "epoch": 1.5308056872037916, "grad_norm": 1.1242793798446655, "learning_rate": 1.1413331978235677e-05, "loss": 2.2541, "step": 162 }, { "epoch": 1.5402843601895735, "grad_norm": 1.1369976997375488, "learning_rate": 1.130526192220052e-05, "loss": 2.3066, "step": 163 }, { "epoch": 1.5497630331753554, "grad_norm": 1.1637866497039795, "learning_rate": 1.1197036553049626e-05, "loss": 2.2572, "step": 164 }, { "epoch": 1.5592417061611374, "grad_norm": 1.1549181938171387, "learning_rate": 1.1088668748519646e-05, "loss": 2.3492, "step": 165 }, { "epoch": 1.5687203791469195, "grad_norm": 1.8263827562332153, "learning_rate": 1.098017140329561e-05, "loss": 2.2654, "step": 166 }, { "epoch": 1.5781990521327014, "grad_norm": 2.184051036834717, "learning_rate": 1.0871557427476585e-05, "loss": 2.1792, "step": 167 }, { "epoch": 1.5876777251184833, "grad_norm": 1.0918291807174683, "learning_rate": 1.0762839745039526e-05, "loss": 2.3085, "step": 168 }, { "epoch": 1.5971563981042654, "grad_norm": 1.1960687637329102, "learning_rate": 1.0654031292301432e-05, "loss": 2.2613, "step": 169 }, { "epoch": 1.6066350710900474, "grad_norm": 1.1339130401611328, "learning_rate": 1.0545145016380065e-05, "loss": 2.2101, "step": 170 }, { "epoch": 1.6161137440758293, "grad_norm": 1.076588749885559, "learning_rate": 1.0436193873653362e-05, "loss": 2.2652, "step": 171 }, { "epoch": 1.6255924170616114, "grad_norm": 1.0522938966751099, "learning_rate": 1.0327190828217763e-05, "loss": 2.2018, "step": 172 }, { "epoch": 1.6350710900473935, "grad_norm": 1.137953758239746, "learning_rate": 1.0218148850345613e-05, "loss": 2.1993, "step": 173 }, { "epoch": 1.6445497630331753, "grad_norm": 1.147080898284912, "learning_rate": 1.0109080914941825e-05, "loss": 2.2474, "step": 174 }, { "epoch": 1.6540284360189572, "grad_norm": 1.111204981803894, "learning_rate": 1e-05, "loss": 2.258, "step": 175 }, { "epoch": 1.6635071090047393, "grad_norm": 1.1011035442352295, "learning_rate": 9.890919085058179e-06, "loss": 2.2643, "step": 176 }, { "epoch": 1.6729857819905214, "grad_norm": 1.18864107131958, "learning_rate": 9.78185114965439e-06, "loss": 2.3374, "step": 177 }, { "epoch": 1.6824644549763033, "grad_norm": 1.4493237733840942, "learning_rate": 9.67280917178224e-06, "loss": 2.2591, "step": 178 }, { "epoch": 1.6919431279620853, "grad_norm": 1.218247413635254, "learning_rate": 9.563806126346643e-06, "loss": 2.2945, "step": 179 }, { "epoch": 1.7014218009478674, "grad_norm": 1.1910370588302612, "learning_rate": 9.454854983619936e-06, "loss": 2.2092, "step": 180 }, { "epoch": 1.7109004739336493, "grad_norm": 1.2177734375, "learning_rate": 9.34596870769857e-06, "loss": 2.3225, "step": 181 }, { "epoch": 1.7203791469194312, "grad_norm": 1.3048025369644165, "learning_rate": 9.237160254960477e-06, "loss": 2.3232, "step": 182 }, { "epoch": 1.7298578199052133, "grad_norm": 1.2348284721374512, "learning_rate": 9.128442572523418e-06, "loss": 2.2683, "step": 183 }, { "epoch": 1.7393364928909953, "grad_norm": 1.0725488662719727, "learning_rate": 9.019828596704394e-06, "loss": 2.1446, "step": 184 }, { "epoch": 1.7488151658767772, "grad_norm": 1.0923892259597778, "learning_rate": 8.911331251480357e-06, "loss": 2.197, "step": 185 }, { "epoch": 1.758293838862559, "grad_norm": 1.2618827819824219, "learning_rate": 8.802963446950378e-06, "loss": 2.2826, "step": 186 }, { "epoch": 1.7677725118483414, "grad_norm": 1.2491176128387451, "learning_rate": 8.694738077799487e-06, "loss": 2.3177, "step": 187 }, { "epoch": 1.7772511848341233, "grad_norm": 1.2089641094207764, "learning_rate": 8.586668021764328e-06, "loss": 2.2569, "step": 188 }, { "epoch": 1.7867298578199051, "grad_norm": 1.1279925107955933, "learning_rate": 8.478766138100834e-06, "loss": 2.2282, "step": 189 }, { "epoch": 1.7962085308056872, "grad_norm": 1.251165747642517, "learning_rate": 8.371045266054114e-06, "loss": 2.2188, "step": 190 }, { "epoch": 1.8056872037914693, "grad_norm": 1.268534541130066, "learning_rate": 8.263518223330698e-06, "loss": 2.342, "step": 191 }, { "epoch": 1.8151658767772512, "grad_norm": 1.401013970375061, "learning_rate": 8.156197804573368e-06, "loss": 2.2666, "step": 192 }, { "epoch": 1.824644549763033, "grad_norm": 1.194873332977295, "learning_rate": 8.04909677983872e-06, "loss": 2.3066, "step": 193 }, { "epoch": 1.8341232227488151, "grad_norm": 1.1607239246368408, "learning_rate": 7.942227893077652e-06, "loss": 2.1649, "step": 194 }, { "epoch": 1.8436018957345972, "grad_norm": 1.2142109870910645, "learning_rate": 7.835603860618973e-06, "loss": 2.247, "step": 195 }, { "epoch": 1.853080568720379, "grad_norm": 1.1745545864105225, "learning_rate": 7.72923736965627e-06, "loss": 2.2399, "step": 196 }, { "epoch": 1.8625592417061612, "grad_norm": 1.1588683128356934, "learning_rate": 7.623141076738271e-06, "loss": 2.2041, "step": 197 }, { "epoch": 1.8720379146919433, "grad_norm": 1.2988152503967285, "learning_rate": 7.5173276062628364e-06, "loss": 2.2678, "step": 198 }, { "epoch": 1.8815165876777251, "grad_norm": 1.1855872869491577, "learning_rate": 7.411809548974792e-06, "loss": 2.2461, "step": 199 }, { "epoch": 1.890995260663507, "grad_norm": 1.379715919494629, "learning_rate": 7.306599460467741e-06, "loss": 2.2438, "step": 200 }, { "epoch": 1.900473933649289, "grad_norm": 1.2153151035308838, "learning_rate": 7.201709859690081e-06, "loss": 2.3228, "step": 201 }, { "epoch": 1.9099526066350712, "grad_norm": 1.2174468040466309, "learning_rate": 7.097153227455379e-06, "loss": 2.3403, "step": 202 }, { "epoch": 1.919431279620853, "grad_norm": 1.1492794752120972, "learning_rate": 6.992942004957271e-06, "loss": 2.2773, "step": 203 }, { "epoch": 1.9289099526066351, "grad_norm": 1.3066004514694214, "learning_rate": 6.889088592289092e-06, "loss": 2.1981, "step": 204 }, { "epoch": 1.9383886255924172, "grad_norm": 2.041125774383545, "learning_rate": 6.785605346968387e-06, "loss": 2.2094, "step": 205 }, { "epoch": 1.947867298578199, "grad_norm": 1.2032493352890015, "learning_rate": 6.682504582466482e-06, "loss": 2.2519, "step": 206 }, { "epoch": 1.957345971563981, "grad_norm": 1.9550625085830688, "learning_rate": 6.579798566743314e-06, "loss": 2.3024, "step": 207 }, { "epoch": 1.966824644549763, "grad_norm": 1.1052924394607544, "learning_rate": 6.4774995207876654e-06, "loss": 2.2448, "step": 208 }, { "epoch": 1.9763033175355451, "grad_norm": 1.244057059288025, "learning_rate": 6.375619617162985e-06, "loss": 2.2521, "step": 209 }, { "epoch": 1.985781990521327, "grad_norm": 1.2490400075912476, "learning_rate": 6.274170978558971e-06, "loss": 2.2775, "step": 210 }, { "epoch": 1.9952606635071088, "grad_norm": 1.2607371807098389, "learning_rate": 6.173165676349103e-06, "loss": 2.2871, "step": 211 }, { "epoch": 2.0, "grad_norm": 1.732487440109253, "learning_rate": 6.072615729154261e-06, "loss": 2.3964, "step": 212 } ], "logging_steps": 1, "max_steps": 318, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3818863769915392e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }