{ "best_global_step": 34848, "best_metric": 0.016729312017560005, "best_model_checkpoint": "./phonetic_wav2vec2\\checkpoints\\checkpoint-34848", "epoch": 10.0, "eval_steps": 500, "global_step": 43560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01147842056932966, "grad_norm": 13.107927322387695, "learning_rate": 1.998875114784206e-05, "loss": 16.8453, "step": 50 }, { "epoch": 0.02295684113865932, "grad_norm": 6.318288326263428, "learning_rate": 1.997727272727273e-05, "loss": 3.987, "step": 100 }, { "epoch": 0.03443526170798898, "grad_norm": 2.223297357559204, "learning_rate": 1.9965794306703398e-05, "loss": 3.632, "step": 150 }, { "epoch": 0.04591368227731864, "grad_norm": 4.863732814788818, "learning_rate": 1.995431588613407e-05, "loss": 3.4343, "step": 200 }, { "epoch": 0.0573921028466483, "grad_norm": 2.74699068069458, "learning_rate": 1.994283746556474e-05, "loss": 3.3426, "step": 250 }, { "epoch": 0.06887052341597796, "grad_norm": 1.5760693550109863, "learning_rate": 1.993135904499541e-05, "loss": 3.284, "step": 300 }, { "epoch": 0.08034894398530762, "grad_norm": 2.6991376876831055, "learning_rate": 1.991988062442608e-05, "loss": 3.1762, "step": 350 }, { "epoch": 0.09182736455463728, "grad_norm": 2.397361993789673, "learning_rate": 1.990840220385675e-05, "loss": 3.1427, "step": 400 }, { "epoch": 0.10330578512396695, "grad_norm": 2.45298433303833, "learning_rate": 1.989692378328742e-05, "loss": 3.0952, "step": 450 }, { "epoch": 0.1147842056932966, "grad_norm": 2.8498406410217285, "learning_rate": 1.9885445362718093e-05, "loss": 3.0068, "step": 500 }, { "epoch": 0.12626262626262627, "grad_norm": 2.892845869064331, "learning_rate": 1.987396694214876e-05, "loss": 2.8898, "step": 550 }, { "epoch": 0.13774104683195593, "grad_norm": 3.208521604537964, "learning_rate": 1.986248852157943e-05, "loss": 2.7498, "step": 600 }, { "epoch": 0.14921946740128558, "grad_norm": 3.8007776737213135, "learning_rate": 1.9851010101010103e-05, "loss": 2.6005, "step": 650 }, { "epoch": 0.16069788797061524, "grad_norm": 2.7883293628692627, "learning_rate": 1.9839531680440774e-05, "loss": 2.437, "step": 700 }, { "epoch": 0.1721763085399449, "grad_norm": 2.9501352310180664, "learning_rate": 1.9828053259871442e-05, "loss": 2.281, "step": 750 }, { "epoch": 0.18365472910927455, "grad_norm": 4.725921630859375, "learning_rate": 1.9816574839302113e-05, "loss": 2.0765, "step": 800 }, { "epoch": 0.19513314967860423, "grad_norm": 5.996265411376953, "learning_rate": 1.9805096418732784e-05, "loss": 1.9618, "step": 850 }, { "epoch": 0.2066115702479339, "grad_norm": 3.274717330932617, "learning_rate": 1.9793617998163455e-05, "loss": 1.8363, "step": 900 }, { "epoch": 0.21808999081726355, "grad_norm": 7.056197643280029, "learning_rate": 1.9782139577594123e-05, "loss": 1.6849, "step": 950 }, { "epoch": 0.2295684113865932, "grad_norm": 3.918736696243286, "learning_rate": 1.9770661157024794e-05, "loss": 1.5448, "step": 1000 }, { "epoch": 0.24104683195592286, "grad_norm": 8.081425666809082, "learning_rate": 1.9759182736455465e-05, "loss": 1.3842, "step": 1050 }, { "epoch": 0.25252525252525254, "grad_norm": 11.406682968139648, "learning_rate": 1.9747704315886137e-05, "loss": 1.2925, "step": 1100 }, { "epoch": 0.26400367309458217, "grad_norm": 14.28106689453125, "learning_rate": 1.9736225895316804e-05, "loss": 1.2177, "step": 1150 }, { "epoch": 0.27548209366391185, "grad_norm": 7.810894012451172, "learning_rate": 1.9724747474747476e-05, "loss": 1.098, "step": 1200 }, { "epoch": 0.2869605142332415, "grad_norm": 10.361730575561523, "learning_rate": 1.9713269054178147e-05, "loss": 1.0432, "step": 1250 }, { "epoch": 0.29843893480257117, "grad_norm": 12.806737899780273, "learning_rate": 1.9701790633608818e-05, "loss": 0.9738, "step": 1300 }, { "epoch": 0.30991735537190085, "grad_norm": 8.492304801940918, "learning_rate": 1.9690312213039486e-05, "loss": 0.8986, "step": 1350 }, { "epoch": 0.3213957759412305, "grad_norm": 13.812825202941895, "learning_rate": 1.9678833792470157e-05, "loss": 0.833, "step": 1400 }, { "epoch": 0.33287419651056016, "grad_norm": 10.016443252563477, "learning_rate": 1.9667355371900828e-05, "loss": 0.8182, "step": 1450 }, { "epoch": 0.3443526170798898, "grad_norm": 10.108992576599121, "learning_rate": 1.96558769513315e-05, "loss": 0.7655, "step": 1500 }, { "epoch": 0.3558310376492195, "grad_norm": 10.642434120178223, "learning_rate": 1.9644398530762167e-05, "loss": 0.7019, "step": 1550 }, { "epoch": 0.3673094582185491, "grad_norm": 16.46535301208496, "learning_rate": 1.9632920110192838e-05, "loss": 0.6561, "step": 1600 }, { "epoch": 0.3787878787878788, "grad_norm": 8.467092514038086, "learning_rate": 1.962144168962351e-05, "loss": 0.6837, "step": 1650 }, { "epoch": 0.39026629935720847, "grad_norm": 7.166141986846924, "learning_rate": 1.960996326905418e-05, "loss": 0.6416, "step": 1700 }, { "epoch": 0.4017447199265381, "grad_norm": 10.553528785705566, "learning_rate": 1.959848484848485e-05, "loss": 0.595, "step": 1750 }, { "epoch": 0.4132231404958678, "grad_norm": 9.658117294311523, "learning_rate": 1.958700642791552e-05, "loss": 0.5735, "step": 1800 }, { "epoch": 0.4247015610651974, "grad_norm": 10.819725036621094, "learning_rate": 1.957552800734619e-05, "loss": 0.5459, "step": 1850 }, { "epoch": 0.4361799816345271, "grad_norm": 71.3021469116211, "learning_rate": 1.956404958677686e-05, "loss": 0.5819, "step": 1900 }, { "epoch": 0.4476584022038568, "grad_norm": 8.742064476013184, "learning_rate": 1.9552571166207533e-05, "loss": 0.5227, "step": 1950 }, { "epoch": 0.4591368227731864, "grad_norm": 5.44516134262085, "learning_rate": 1.95410927456382e-05, "loss": 0.508, "step": 2000 }, { "epoch": 0.4706152433425161, "grad_norm": 14.485369682312012, "learning_rate": 1.9529614325068872e-05, "loss": 0.4995, "step": 2050 }, { "epoch": 0.4820936639118457, "grad_norm": 7.328050136566162, "learning_rate": 1.9518135904499543e-05, "loss": 0.4802, "step": 2100 }, { "epoch": 0.4935720844811754, "grad_norm": 9.694188117980957, "learning_rate": 1.9506657483930214e-05, "loss": 0.4895, "step": 2150 }, { "epoch": 0.5050505050505051, "grad_norm": 13.486627578735352, "learning_rate": 1.9495179063360882e-05, "loss": 0.4827, "step": 2200 }, { "epoch": 0.5165289256198347, "grad_norm": 9.643098831176758, "learning_rate": 1.9483700642791553e-05, "loss": 0.4536, "step": 2250 }, { "epoch": 0.5280073461891643, "grad_norm": 7.223649978637695, "learning_rate": 1.9472222222222224e-05, "loss": 0.4245, "step": 2300 }, { "epoch": 0.539485766758494, "grad_norm": 18.04121208190918, "learning_rate": 1.9460743801652895e-05, "loss": 0.4195, "step": 2350 }, { "epoch": 0.5509641873278237, "grad_norm": 9.087358474731445, "learning_rate": 1.9449265381083563e-05, "loss": 0.4392, "step": 2400 }, { "epoch": 0.5624426078971534, "grad_norm": 6.228804588317871, "learning_rate": 1.9437786960514234e-05, "loss": 0.428, "step": 2450 }, { "epoch": 0.573921028466483, "grad_norm": 4.25866174697876, "learning_rate": 1.9426308539944905e-05, "loss": 0.3793, "step": 2500 }, { "epoch": 0.5853994490358126, "grad_norm": 14.782279968261719, "learning_rate": 1.9414830119375577e-05, "loss": 0.3933, "step": 2550 }, { "epoch": 0.5968778696051423, "grad_norm": 11.153953552246094, "learning_rate": 1.9403351698806244e-05, "loss": 0.3883, "step": 2600 }, { "epoch": 0.608356290174472, "grad_norm": 9.42304515838623, "learning_rate": 1.9391873278236916e-05, "loss": 0.3872, "step": 2650 }, { "epoch": 0.6198347107438017, "grad_norm": 16.37066650390625, "learning_rate": 1.9380394857667587e-05, "loss": 0.3781, "step": 2700 }, { "epoch": 0.6313131313131313, "grad_norm": 11.046980857849121, "learning_rate": 1.9368916437098258e-05, "loss": 0.3789, "step": 2750 }, { "epoch": 0.642791551882461, "grad_norm": 30.32474136352539, "learning_rate": 1.9357438016528926e-05, "loss": 0.3876, "step": 2800 }, { "epoch": 0.6542699724517906, "grad_norm": 15.86506175994873, "learning_rate": 1.9345959595959597e-05, "loss": 0.3471, "step": 2850 }, { "epoch": 0.6657483930211203, "grad_norm": 13.437570571899414, "learning_rate": 1.9334481175390268e-05, "loss": 0.3394, "step": 2900 }, { "epoch": 0.67722681359045, "grad_norm": 12.49765682220459, "learning_rate": 1.932300275482094e-05, "loss": 0.3394, "step": 2950 }, { "epoch": 0.6887052341597796, "grad_norm": 11.368181228637695, "learning_rate": 1.9311524334251607e-05, "loss": 0.3547, "step": 3000 }, { "epoch": 0.7001836547291093, "grad_norm": 7.207362174987793, "learning_rate": 1.9300045913682278e-05, "loss": 0.3123, "step": 3050 }, { "epoch": 0.711662075298439, "grad_norm": 16.953811645507812, "learning_rate": 1.928856749311295e-05, "loss": 0.3037, "step": 3100 }, { "epoch": 0.7231404958677686, "grad_norm": 10.758194923400879, "learning_rate": 1.927708907254362e-05, "loss": 0.3224, "step": 3150 }, { "epoch": 0.7346189164370982, "grad_norm": 61.555606842041016, "learning_rate": 1.9265610651974288e-05, "loss": 0.3006, "step": 3200 }, { "epoch": 0.7460973370064279, "grad_norm": 12.163214683532715, "learning_rate": 1.925413223140496e-05, "loss": 0.2921, "step": 3250 }, { "epoch": 0.7575757575757576, "grad_norm": 8.236907005310059, "learning_rate": 1.924265381083563e-05, "loss": 0.3121, "step": 3300 }, { "epoch": 0.7690541781450873, "grad_norm": 7.487551212310791, "learning_rate": 1.9231175390266302e-05, "loss": 0.3106, "step": 3350 }, { "epoch": 0.7805325987144169, "grad_norm": 10.351700782775879, "learning_rate": 1.921969696969697e-05, "loss": 0.2994, "step": 3400 }, { "epoch": 0.7920110192837465, "grad_norm": 31.927387237548828, "learning_rate": 1.920821854912764e-05, "loss": 0.289, "step": 3450 }, { "epoch": 0.8034894398530762, "grad_norm": 10.100557327270508, "learning_rate": 1.9196740128558312e-05, "loss": 0.2788, "step": 3500 }, { "epoch": 0.8149678604224059, "grad_norm": 9.068075180053711, "learning_rate": 1.9185261707988983e-05, "loss": 0.2614, "step": 3550 }, { "epoch": 0.8264462809917356, "grad_norm": 8.70858383178711, "learning_rate": 1.917378328741965e-05, "loss": 0.2881, "step": 3600 }, { "epoch": 0.8379247015610652, "grad_norm": 10.465408325195312, "learning_rate": 1.9162304866850322e-05, "loss": 0.278, "step": 3650 }, { "epoch": 0.8494031221303948, "grad_norm": 47.335506439208984, "learning_rate": 1.9150826446280993e-05, "loss": 0.2709, "step": 3700 }, { "epoch": 0.8608815426997245, "grad_norm": 9.927533149719238, "learning_rate": 1.9139348025711664e-05, "loss": 0.273, "step": 3750 }, { "epoch": 0.8723599632690542, "grad_norm": 13.918431282043457, "learning_rate": 1.9127869605142332e-05, "loss": 0.257, "step": 3800 }, { "epoch": 0.8838383838383839, "grad_norm": 9.076457023620605, "learning_rate": 1.9116391184573003e-05, "loss": 0.2807, "step": 3850 }, { "epoch": 0.8953168044077136, "grad_norm": 10.926765441894531, "learning_rate": 1.9104912764003674e-05, "loss": 0.2336, "step": 3900 }, { "epoch": 0.9067952249770431, "grad_norm": 22.74556541442871, "learning_rate": 1.9093434343434346e-05, "loss": 0.2735, "step": 3950 }, { "epoch": 0.9182736455463728, "grad_norm": 9.520804405212402, "learning_rate": 1.9081955922865017e-05, "loss": 0.2379, "step": 4000 }, { "epoch": 0.9297520661157025, "grad_norm": 14.446419715881348, "learning_rate": 1.9070477502295684e-05, "loss": 0.2477, "step": 4050 }, { "epoch": 0.9412304866850322, "grad_norm": 3.0111658573150635, "learning_rate": 1.9058999081726356e-05, "loss": 0.2489, "step": 4100 }, { "epoch": 0.9527089072543617, "grad_norm": 26.547456741333008, "learning_rate": 1.9047520661157027e-05, "loss": 0.2313, "step": 4150 }, { "epoch": 0.9641873278236914, "grad_norm": 22.840234756469727, "learning_rate": 1.9036042240587698e-05, "loss": 0.2481, "step": 4200 }, { "epoch": 0.9756657483930211, "grad_norm": 8.96515941619873, "learning_rate": 1.9024563820018366e-05, "loss": 0.2443, "step": 4250 }, { "epoch": 0.9871441689623508, "grad_norm": 44.00730514526367, "learning_rate": 1.9013085399449037e-05, "loss": 0.2527, "step": 4300 }, { "epoch": 0.9986225895316805, "grad_norm": 6.602777481079102, "learning_rate": 1.9001606978879708e-05, "loss": 0.2331, "step": 4350 }, { "epoch": 1.0, "eval_loss": 0.10477270185947418, "eval_runtime": 80.7825, "eval_samples_per_second": 95.85, "eval_steps_per_second": 5.991, "step": 4356 }, { "epoch": 1.0101010101010102, "grad_norm": 19.036680221557617, "learning_rate": 1.899012855831038e-05, "loss": 0.2318, "step": 4400 }, { "epoch": 1.0215794306703398, "grad_norm": 15.221837997436523, "learning_rate": 1.8978650137741047e-05, "loss": 0.243, "step": 4450 }, { "epoch": 1.0330578512396693, "grad_norm": 18.07478904724121, "learning_rate": 1.8967171717171718e-05, "loss": 0.2226, "step": 4500 }, { "epoch": 1.044536271808999, "grad_norm": 12.655671119689941, "learning_rate": 1.895569329660239e-05, "loss": 0.2299, "step": 4550 }, { "epoch": 1.0560146923783287, "grad_norm": 12.812118530273438, "learning_rate": 1.894421487603306e-05, "loss": 0.2099, "step": 4600 }, { "epoch": 1.0674931129476584, "grad_norm": 13.74438762664795, "learning_rate": 1.893273645546373e-05, "loss": 0.2023, "step": 4650 }, { "epoch": 1.078971533516988, "grad_norm": 9.28097915649414, "learning_rate": 1.89212580348944e-05, "loss": 0.2303, "step": 4700 }, { "epoch": 1.0904499540863177, "grad_norm": 9.024033546447754, "learning_rate": 1.890977961432507e-05, "loss": 0.2306, "step": 4750 }, { "epoch": 1.1019283746556474, "grad_norm": 9.018461227416992, "learning_rate": 1.8898301193755742e-05, "loss": 0.2254, "step": 4800 }, { "epoch": 1.113406795224977, "grad_norm": 8.497326850891113, "learning_rate": 1.888682277318641e-05, "loss": 0.2283, "step": 4850 }, { "epoch": 1.1248852157943068, "grad_norm": 10.593148231506348, "learning_rate": 1.887534435261708e-05, "loss": 0.2187, "step": 4900 }, { "epoch": 1.1363636363636362, "grad_norm": 6.4472761154174805, "learning_rate": 1.8863865932047752e-05, "loss": 0.2084, "step": 4950 }, { "epoch": 1.1478420569329661, "grad_norm": 5.391965389251709, "learning_rate": 1.8852387511478423e-05, "loss": 0.2021, "step": 5000 }, { "epoch": 1.1593204775022956, "grad_norm": 12.448431968688965, "learning_rate": 1.884090909090909e-05, "loss": 0.2192, "step": 5050 }, { "epoch": 1.1707988980716253, "grad_norm": 8.670736312866211, "learning_rate": 1.8829430670339762e-05, "loss": 0.1852, "step": 5100 }, { "epoch": 1.182277318640955, "grad_norm": 18.672786712646484, "learning_rate": 1.8817952249770433e-05, "loss": 0.1903, "step": 5150 }, { "epoch": 1.1937557392102847, "grad_norm": 6.486687660217285, "learning_rate": 1.8806473829201104e-05, "loss": 0.2006, "step": 5200 }, { "epoch": 1.2052341597796143, "grad_norm": 12.253338813781738, "learning_rate": 1.8794995408631772e-05, "loss": 0.1986, "step": 5250 }, { "epoch": 1.216712580348944, "grad_norm": 6.225432872772217, "learning_rate": 1.8783516988062443e-05, "loss": 0.1895, "step": 5300 }, { "epoch": 1.2281910009182737, "grad_norm": 21.496686935424805, "learning_rate": 1.8772038567493114e-05, "loss": 0.1939, "step": 5350 }, { "epoch": 1.2396694214876034, "grad_norm": 15.535727500915527, "learning_rate": 1.8760560146923786e-05, "loss": 0.1935, "step": 5400 }, { "epoch": 1.251147842056933, "grad_norm": 17.813045501708984, "learning_rate": 1.8749081726354453e-05, "loss": 0.1969, "step": 5450 }, { "epoch": 1.2626262626262625, "grad_norm": 3.9138498306274414, "learning_rate": 1.8737603305785125e-05, "loss": 0.1601, "step": 5500 }, { "epoch": 1.2741046831955922, "grad_norm": 8.116711616516113, "learning_rate": 1.8726124885215796e-05, "loss": 0.1778, "step": 5550 }, { "epoch": 1.285583103764922, "grad_norm": 19.589506149291992, "learning_rate": 1.8714646464646467e-05, "loss": 0.1777, "step": 5600 }, { "epoch": 1.2970615243342516, "grad_norm": 1.9838666915893555, "learning_rate": 1.8703168044077135e-05, "loss": 0.1742, "step": 5650 }, { "epoch": 1.3085399449035813, "grad_norm": 6.865425109863281, "learning_rate": 1.8691689623507806e-05, "loss": 0.187, "step": 5700 }, { "epoch": 1.320018365472911, "grad_norm": 6.143863677978516, "learning_rate": 1.8680211202938477e-05, "loss": 0.1747, "step": 5750 }, { "epoch": 1.3314967860422406, "grad_norm": 4.835748195648193, "learning_rate": 1.8668732782369148e-05, "loss": 0.1614, "step": 5800 }, { "epoch": 1.3429752066115703, "grad_norm": 9.543431282043457, "learning_rate": 1.8657254361799816e-05, "loss": 0.1586, "step": 5850 }, { "epoch": 1.3544536271809, "grad_norm": 9.114174842834473, "learning_rate": 1.8645775941230487e-05, "loss": 0.1859, "step": 5900 }, { "epoch": 1.3659320477502295, "grad_norm": 11.661153793334961, "learning_rate": 1.8634297520661158e-05, "loss": 0.1678, "step": 5950 }, { "epoch": 1.3774104683195592, "grad_norm": 9.939918518066406, "learning_rate": 1.862281910009183e-05, "loss": 0.1728, "step": 6000 }, { "epoch": 1.3888888888888888, "grad_norm": 1.8444671630859375, "learning_rate": 1.8611340679522497e-05, "loss": 0.1712, "step": 6050 }, { "epoch": 1.4003673094582185, "grad_norm": 4.69300651550293, "learning_rate": 1.859986225895317e-05, "loss": 0.152, "step": 6100 }, { "epoch": 1.4118457300275482, "grad_norm": 13.470431327819824, "learning_rate": 1.858838383838384e-05, "loss": 0.1458, "step": 6150 }, { "epoch": 1.423324150596878, "grad_norm": 7.658647060394287, "learning_rate": 1.857690541781451e-05, "loss": 0.1669, "step": 6200 }, { "epoch": 1.4348025711662076, "grad_norm": 7.40774393081665, "learning_rate": 1.856542699724518e-05, "loss": 0.1614, "step": 6250 }, { "epoch": 1.4462809917355373, "grad_norm": 8.834815979003906, "learning_rate": 1.855394857667585e-05, "loss": 0.1713, "step": 6300 }, { "epoch": 1.457759412304867, "grad_norm": 10.573930740356445, "learning_rate": 1.854247015610652e-05, "loss": 0.1301, "step": 6350 }, { "epoch": 1.4692378328741964, "grad_norm": 18.1447811126709, "learning_rate": 1.8530991735537192e-05, "loss": 0.1872, "step": 6400 }, { "epoch": 1.4807162534435263, "grad_norm": 21.671560287475586, "learning_rate": 1.8519513314967863e-05, "loss": 0.1555, "step": 6450 }, { "epoch": 1.4921946740128558, "grad_norm": 6.022467613220215, "learning_rate": 1.850803489439853e-05, "loss": 0.1561, "step": 6500 }, { "epoch": 1.5036730945821855, "grad_norm": 19.958520889282227, "learning_rate": 1.8496556473829202e-05, "loss": 0.1546, "step": 6550 }, { "epoch": 1.5151515151515151, "grad_norm": 5.951800346374512, "learning_rate": 1.8485078053259873e-05, "loss": 0.1472, "step": 6600 }, { "epoch": 1.5266299357208448, "grad_norm": 4.502755165100098, "learning_rate": 1.8473599632690544e-05, "loss": 0.1532, "step": 6650 }, { "epoch": 1.5381083562901745, "grad_norm": 6.9762701988220215, "learning_rate": 1.8462121212121212e-05, "loss": 0.1753, "step": 6700 }, { "epoch": 1.549586776859504, "grad_norm": 3.4291458129882812, "learning_rate": 1.8450642791551883e-05, "loss": 0.1614, "step": 6750 }, { "epoch": 1.5610651974288339, "grad_norm": 6.287264823913574, "learning_rate": 1.8439164370982555e-05, "loss": 0.1583, "step": 6800 }, { "epoch": 1.5725436179981633, "grad_norm": 15.749639511108398, "learning_rate": 1.8427685950413226e-05, "loss": 0.1487, "step": 6850 }, { "epoch": 1.5840220385674932, "grad_norm": 6.084504127502441, "learning_rate": 1.8416207529843893e-05, "loss": 0.1604, "step": 6900 }, { "epoch": 1.5955004591368227, "grad_norm": 3.519554615020752, "learning_rate": 1.8404729109274565e-05, "loss": 0.1543, "step": 6950 }, { "epoch": 1.6069788797061524, "grad_norm": 8.225354194641113, "learning_rate": 1.8393250688705236e-05, "loss": 0.148, "step": 7000 }, { "epoch": 1.618457300275482, "grad_norm": 3.9564664363861084, "learning_rate": 1.8381772268135907e-05, "loss": 0.1379, "step": 7050 }, { "epoch": 1.6299357208448118, "grad_norm": 5.005980968475342, "learning_rate": 1.8370293847566575e-05, "loss": 0.1555, "step": 7100 }, { "epoch": 1.6414141414141414, "grad_norm": 13.176301002502441, "learning_rate": 1.8358815426997246e-05, "loss": 0.1523, "step": 7150 }, { "epoch": 1.6528925619834711, "grad_norm": 6.749804973602295, "learning_rate": 1.8347337006427917e-05, "loss": 0.1554, "step": 7200 }, { "epoch": 1.6643709825528008, "grad_norm": 4.850983142852783, "learning_rate": 1.8335858585858588e-05, "loss": 0.1467, "step": 7250 }, { "epoch": 1.6758494031221303, "grad_norm": 4.673733234405518, "learning_rate": 1.8324380165289256e-05, "loss": 0.1277, "step": 7300 }, { "epoch": 1.6873278236914602, "grad_norm": 17.790790557861328, "learning_rate": 1.8312901744719927e-05, "loss": 0.1721, "step": 7350 }, { "epoch": 1.6988062442607896, "grad_norm": 4.262311935424805, "learning_rate": 1.83014233241506e-05, "loss": 0.1271, "step": 7400 }, { "epoch": 1.7102846648301195, "grad_norm": 15.977132797241211, "learning_rate": 1.828994490358127e-05, "loss": 0.1421, "step": 7450 }, { "epoch": 1.721763085399449, "grad_norm": 9.290844917297363, "learning_rate": 1.8278466483011937e-05, "loss": 0.1566, "step": 7500 }, { "epoch": 1.7332415059687787, "grad_norm": 5.980143070220947, "learning_rate": 1.826698806244261e-05, "loss": 0.1322, "step": 7550 }, { "epoch": 1.7447199265381084, "grad_norm": 4.977065086364746, "learning_rate": 1.825550964187328e-05, "loss": 0.1362, "step": 7600 }, { "epoch": 1.756198347107438, "grad_norm": 3.3617167472839355, "learning_rate": 1.824403122130395e-05, "loss": 0.1411, "step": 7650 }, { "epoch": 1.7676767676767677, "grad_norm": 11.891363143920898, "learning_rate": 1.823255280073462e-05, "loss": 0.1314, "step": 7700 }, { "epoch": 1.7791551882460972, "grad_norm": 2.5021345615386963, "learning_rate": 1.822107438016529e-05, "loss": 0.1143, "step": 7750 }, { "epoch": 1.790633608815427, "grad_norm": 3.1529271602630615, "learning_rate": 1.820959595959596e-05, "loss": 0.1396, "step": 7800 }, { "epoch": 1.8021120293847566, "grad_norm": 9.28099250793457, "learning_rate": 1.8198117539026632e-05, "loss": 0.1381, "step": 7850 }, { "epoch": 1.8135904499540865, "grad_norm": 11.942655563354492, "learning_rate": 1.81866391184573e-05, "loss": 0.1308, "step": 7900 }, { "epoch": 1.825068870523416, "grad_norm": 4.391113758087158, "learning_rate": 1.817516069788797e-05, "loss": 0.134, "step": 7950 }, { "epoch": 1.8365472910927456, "grad_norm": 15.060769081115723, "learning_rate": 1.8163682277318642e-05, "loss": 0.1188, "step": 8000 }, { "epoch": 1.8480257116620753, "grad_norm": 3.9280049800872803, "learning_rate": 1.8152203856749313e-05, "loss": 0.1264, "step": 8050 }, { "epoch": 1.859504132231405, "grad_norm": 9.125816345214844, "learning_rate": 1.814072543617998e-05, "loss": 0.124, "step": 8100 }, { "epoch": 1.8709825528007347, "grad_norm": 7.657766342163086, "learning_rate": 1.8129247015610652e-05, "loss": 0.1606, "step": 8150 }, { "epoch": 1.8824609733700641, "grad_norm": 12.375577926635742, "learning_rate": 1.8117768595041323e-05, "loss": 0.147, "step": 8200 }, { "epoch": 1.893939393939394, "grad_norm": 4.459670066833496, "learning_rate": 1.8106290174471995e-05, "loss": 0.1495, "step": 8250 }, { "epoch": 1.9054178145087235, "grad_norm": 10.317970275878906, "learning_rate": 1.8094811753902662e-05, "loss": 0.1306, "step": 8300 }, { "epoch": 1.9168962350780534, "grad_norm": 4.797674179077148, "learning_rate": 1.8083333333333334e-05, "loss": 0.1366, "step": 8350 }, { "epoch": 1.9283746556473829, "grad_norm": 23.560129165649414, "learning_rate": 1.8071854912764005e-05, "loss": 0.1174, "step": 8400 }, { "epoch": 1.9398530762167125, "grad_norm": 8.83558464050293, "learning_rate": 1.8060376492194676e-05, "loss": 0.1267, "step": 8450 }, { "epoch": 1.9513314967860422, "grad_norm": 7.331394195556641, "learning_rate": 1.8048898071625344e-05, "loss": 0.148, "step": 8500 }, { "epoch": 1.962809917355372, "grad_norm": 2.2308285236358643, "learning_rate": 1.8037419651056015e-05, "loss": 0.1334, "step": 8550 }, { "epoch": 1.9742883379247016, "grad_norm": 9.97135066986084, "learning_rate": 1.8025941230486686e-05, "loss": 0.1438, "step": 8600 }, { "epoch": 1.985766758494031, "grad_norm": 3.0034067630767822, "learning_rate": 1.8014462809917357e-05, "loss": 0.1266, "step": 8650 }, { "epoch": 1.997245179063361, "grad_norm": 2.510180711746216, "learning_rate": 1.800298438934803e-05, "loss": 0.1291, "step": 8700 }, { "epoch": 2.0, "eval_loss": 0.04918314889073372, "eval_runtime": 76.3259, "eval_samples_per_second": 101.447, "eval_steps_per_second": 6.341, "step": 8712 }, { "epoch": 2.0087235996326904, "grad_norm": 4.513134956359863, "learning_rate": 1.7991505968778696e-05, "loss": 0.1129, "step": 8750 }, { "epoch": 2.0202020202020203, "grad_norm": 4.570469856262207, "learning_rate": 1.7980027548209367e-05, "loss": 0.1234, "step": 8800 }, { "epoch": 2.03168044077135, "grad_norm": 5.6341094970703125, "learning_rate": 1.796854912764004e-05, "loss": 0.1184, "step": 8850 }, { "epoch": 2.0431588613406797, "grad_norm": 7.645381450653076, "learning_rate": 1.795707070707071e-05, "loss": 0.1408, "step": 8900 }, { "epoch": 2.054637281910009, "grad_norm": 5.244714260101318, "learning_rate": 1.7945592286501377e-05, "loss": 0.1224, "step": 8950 }, { "epoch": 2.0661157024793386, "grad_norm": 10.038858413696289, "learning_rate": 1.793411386593205e-05, "loss": 0.1304, "step": 9000 }, { "epoch": 2.0775941230486685, "grad_norm": 1.395878791809082, "learning_rate": 1.792263544536272e-05, "loss": 0.1161, "step": 9050 }, { "epoch": 2.089072543617998, "grad_norm": 16.125038146972656, "learning_rate": 1.791115702479339e-05, "loss": 0.122, "step": 9100 }, { "epoch": 2.100550964187328, "grad_norm": 9.266343116760254, "learning_rate": 1.789967860422406e-05, "loss": 0.1408, "step": 9150 }, { "epoch": 2.1120293847566574, "grad_norm": 5.7215704917907715, "learning_rate": 1.788820018365473e-05, "loss": 0.1123, "step": 9200 }, { "epoch": 2.1235078053259873, "grad_norm": 18.952178955078125, "learning_rate": 1.78767217630854e-05, "loss": 0.1132, "step": 9250 }, { "epoch": 2.1349862258953167, "grad_norm": 7.574341773986816, "learning_rate": 1.7865243342516072e-05, "loss": 0.135, "step": 9300 }, { "epoch": 2.1464646464646466, "grad_norm": 16.399192810058594, "learning_rate": 1.785376492194674e-05, "loss": 0.1042, "step": 9350 }, { "epoch": 2.157943067033976, "grad_norm": 5.161776065826416, "learning_rate": 1.784228650137741e-05, "loss": 0.1243, "step": 9400 }, { "epoch": 2.169421487603306, "grad_norm": 5.728438854217529, "learning_rate": 1.7830808080808082e-05, "loss": 0.1322, "step": 9450 }, { "epoch": 2.1808999081726355, "grad_norm": 4.479170322418213, "learning_rate": 1.7819329660238753e-05, "loss": 0.1285, "step": 9500 }, { "epoch": 2.192378328741965, "grad_norm": 10.023076057434082, "learning_rate": 1.780785123966942e-05, "loss": 0.1407, "step": 9550 }, { "epoch": 2.203856749311295, "grad_norm": 7.020942687988281, "learning_rate": 1.7796372819100092e-05, "loss": 0.1221, "step": 9600 }, { "epoch": 2.2153351698806243, "grad_norm": 37.59624481201172, "learning_rate": 1.7784894398530764e-05, "loss": 0.1168, "step": 9650 }, { "epoch": 2.226813590449954, "grad_norm": 10.755002975463867, "learning_rate": 1.7773415977961435e-05, "loss": 0.1301, "step": 9700 }, { "epoch": 2.2382920110192837, "grad_norm": 5.69231653213501, "learning_rate": 1.7761937557392102e-05, "loss": 0.0952, "step": 9750 }, { "epoch": 2.2497704315886136, "grad_norm": 3.6895787715911865, "learning_rate": 1.7750459136822774e-05, "loss": 0.1019, "step": 9800 }, { "epoch": 2.261248852157943, "grad_norm": 2.6681618690490723, "learning_rate": 1.7738980716253445e-05, "loss": 0.1298, "step": 9850 }, { "epoch": 2.2727272727272725, "grad_norm": 2.6417527198791504, "learning_rate": 1.7727502295684116e-05, "loss": 0.1111, "step": 9900 }, { "epoch": 2.2842056932966024, "grad_norm": 8.144768714904785, "learning_rate": 1.7716023875114784e-05, "loss": 0.1153, "step": 9950 }, { "epoch": 2.2956841138659323, "grad_norm": 2.0761029720306396, "learning_rate": 1.7704545454545455e-05, "loss": 0.1195, "step": 10000 }, { "epoch": 2.3071625344352618, "grad_norm": 22.608049392700195, "learning_rate": 1.7693067033976126e-05, "loss": 0.1157, "step": 10050 }, { "epoch": 2.318640955004591, "grad_norm": 34.84150695800781, "learning_rate": 1.7681588613406797e-05, "loss": 0.1196, "step": 10100 }, { "epoch": 2.330119375573921, "grad_norm": 14.069072723388672, "learning_rate": 1.7670110192837465e-05, "loss": 0.0931, "step": 10150 }, { "epoch": 2.3415977961432506, "grad_norm": 1.8730385303497314, "learning_rate": 1.7658631772268136e-05, "loss": 0.1379, "step": 10200 }, { "epoch": 2.3530762167125805, "grad_norm": 9.990337371826172, "learning_rate": 1.7647153351698807e-05, "loss": 0.1028, "step": 10250 }, { "epoch": 2.36455463728191, "grad_norm": 6.857508659362793, "learning_rate": 1.763567493112948e-05, "loss": 0.106, "step": 10300 }, { "epoch": 2.37603305785124, "grad_norm": 3.2709596157073975, "learning_rate": 1.7624196510560146e-05, "loss": 0.1124, "step": 10350 }, { "epoch": 2.3875114784205693, "grad_norm": 10.294021606445312, "learning_rate": 1.7612718089990817e-05, "loss": 0.1088, "step": 10400 }, { "epoch": 2.398989898989899, "grad_norm": 25.204206466674805, "learning_rate": 1.760123966942149e-05, "loss": 0.1127, "step": 10450 }, { "epoch": 2.4104683195592287, "grad_norm": 7.042158603668213, "learning_rate": 1.758976124885216e-05, "loss": 0.1113, "step": 10500 }, { "epoch": 2.421946740128558, "grad_norm": 10.08143424987793, "learning_rate": 1.7578282828282828e-05, "loss": 0.0981, "step": 10550 }, { "epoch": 2.433425160697888, "grad_norm": 3.9184889793395996, "learning_rate": 1.75668044077135e-05, "loss": 0.1098, "step": 10600 }, { "epoch": 2.4449035812672175, "grad_norm": 3.8381764888763428, "learning_rate": 1.755532598714417e-05, "loss": 0.1254, "step": 10650 }, { "epoch": 2.4563820018365474, "grad_norm": 20.33098602294922, "learning_rate": 1.754384756657484e-05, "loss": 0.115, "step": 10700 }, { "epoch": 2.467860422405877, "grad_norm": 6.342065334320068, "learning_rate": 1.753236914600551e-05, "loss": 0.1041, "step": 10750 }, { "epoch": 2.479338842975207, "grad_norm": 5.98074197769165, "learning_rate": 1.752089072543618e-05, "loss": 0.1093, "step": 10800 }, { "epoch": 2.4908172635445363, "grad_norm": 4.1304216384887695, "learning_rate": 1.750941230486685e-05, "loss": 0.0807, "step": 10850 }, { "epoch": 2.502295684113866, "grad_norm": 8.746936798095703, "learning_rate": 1.7497933884297522e-05, "loss": 0.0957, "step": 10900 }, { "epoch": 2.5137741046831956, "grad_norm": 57.13322067260742, "learning_rate": 1.748645546372819e-05, "loss": 0.1242, "step": 10950 }, { "epoch": 2.525252525252525, "grad_norm": 9.396512985229492, "learning_rate": 1.747497704315886e-05, "loss": 0.1077, "step": 11000 }, { "epoch": 2.536730945821855, "grad_norm": 49.20916748046875, "learning_rate": 1.7463498622589536e-05, "loss": 0.1002, "step": 11050 }, { "epoch": 2.5482093663911844, "grad_norm": 6.599617004394531, "learning_rate": 1.7452020202020204e-05, "loss": 0.1067, "step": 11100 }, { "epoch": 2.5596877869605144, "grad_norm": 3.373964548110962, "learning_rate": 1.7440541781450875e-05, "loss": 0.1057, "step": 11150 }, { "epoch": 2.571166207529844, "grad_norm": 18.471527099609375, "learning_rate": 1.7429063360881543e-05, "loss": 0.1218, "step": 11200 }, { "epoch": 2.5826446280991737, "grad_norm": 4.968184471130371, "learning_rate": 1.7417584940312217e-05, "loss": 0.1121, "step": 11250 }, { "epoch": 2.594123048668503, "grad_norm": 44.23242950439453, "learning_rate": 1.7406106519742885e-05, "loss": 0.1251, "step": 11300 }, { "epoch": 2.6056014692378326, "grad_norm": 4.141283988952637, "learning_rate": 1.7394628099173556e-05, "loss": 0.1115, "step": 11350 }, { "epoch": 2.6170798898071626, "grad_norm": 8.843578338623047, "learning_rate": 1.7383149678604224e-05, "loss": 0.1039, "step": 11400 }, { "epoch": 2.6285583103764925, "grad_norm": 3.805959463119507, "learning_rate": 1.73716712580349e-05, "loss": 0.1009, "step": 11450 }, { "epoch": 2.640036730945822, "grad_norm": 5.138465881347656, "learning_rate": 1.7360192837465566e-05, "loss": 0.0964, "step": 11500 }, { "epoch": 2.6515151515151514, "grad_norm": 14.198784828186035, "learning_rate": 1.7348714416896237e-05, "loss": 0.1019, "step": 11550 }, { "epoch": 2.6629935720844813, "grad_norm": 11.450353622436523, "learning_rate": 1.7337235996326905e-05, "loss": 0.0919, "step": 11600 }, { "epoch": 2.6744719926538107, "grad_norm": 4.942192077636719, "learning_rate": 1.732575757575758e-05, "loss": 0.1011, "step": 11650 }, { "epoch": 2.6859504132231407, "grad_norm": 3.038707971572876, "learning_rate": 1.7314279155188247e-05, "loss": 0.0823, "step": 11700 }, { "epoch": 2.69742883379247, "grad_norm": 1.4619789123535156, "learning_rate": 1.730280073461892e-05, "loss": 0.1075, "step": 11750 }, { "epoch": 2.7089072543618, "grad_norm": 9.916839599609375, "learning_rate": 1.7291322314049586e-05, "loss": 0.1023, "step": 11800 }, { "epoch": 2.7203856749311295, "grad_norm": 4.720894813537598, "learning_rate": 1.727984389348026e-05, "loss": 0.0984, "step": 11850 }, { "epoch": 2.731864095500459, "grad_norm": 8.532073020935059, "learning_rate": 1.726836547291093e-05, "loss": 0.0928, "step": 11900 }, { "epoch": 2.743342516069789, "grad_norm": 5.163074970245361, "learning_rate": 1.72568870523416e-05, "loss": 0.1174, "step": 11950 }, { "epoch": 2.7548209366391183, "grad_norm": 52.860877990722656, "learning_rate": 1.7245408631772268e-05, "loss": 0.102, "step": 12000 }, { "epoch": 2.766299357208448, "grad_norm": 5.281815052032471, "learning_rate": 1.7233930211202942e-05, "loss": 0.0964, "step": 12050 }, { "epoch": 2.7777777777777777, "grad_norm": 4.720059871673584, "learning_rate": 1.722245179063361e-05, "loss": 0.1207, "step": 12100 }, { "epoch": 2.7892561983471076, "grad_norm": 36.368900299072266, "learning_rate": 1.721097337006428e-05, "loss": 0.0945, "step": 12150 }, { "epoch": 2.800734618916437, "grad_norm": 4.791744232177734, "learning_rate": 1.719949494949495e-05, "loss": 0.0986, "step": 12200 }, { "epoch": 2.8122130394857665, "grad_norm": 1.9031206369400024, "learning_rate": 1.7188016528925623e-05, "loss": 0.1235, "step": 12250 }, { "epoch": 2.8236914600550964, "grad_norm": 12.855907440185547, "learning_rate": 1.717653810835629e-05, "loss": 0.1059, "step": 12300 }, { "epoch": 2.8351698806244263, "grad_norm": 6.975632667541504, "learning_rate": 1.7165059687786962e-05, "loss": 0.0972, "step": 12350 }, { "epoch": 2.846648301193756, "grad_norm": 6.8774638175964355, "learning_rate": 1.715358126721763e-05, "loss": 0.1102, "step": 12400 }, { "epoch": 2.8581267217630852, "grad_norm": 8.191630363464355, "learning_rate": 1.7142102846648305e-05, "loss": 0.0838, "step": 12450 }, { "epoch": 2.869605142332415, "grad_norm": 3.605515480041504, "learning_rate": 1.7130624426078973e-05, "loss": 0.0868, "step": 12500 }, { "epoch": 2.8810835629017446, "grad_norm": 10.86032485961914, "learning_rate": 1.7119146005509644e-05, "loss": 0.08, "step": 12550 }, { "epoch": 2.8925619834710745, "grad_norm": 11.766641616821289, "learning_rate": 1.710766758494031e-05, "loss": 0.1212, "step": 12600 }, { "epoch": 2.904040404040404, "grad_norm": 5.032931804656982, "learning_rate": 1.7096189164370983e-05, "loss": 0.0835, "step": 12650 }, { "epoch": 2.915518824609734, "grad_norm": 4.484053134918213, "learning_rate": 1.7084710743801654e-05, "loss": 0.0992, "step": 12700 }, { "epoch": 2.9269972451790633, "grad_norm": 19.982370376586914, "learning_rate": 1.7073232323232325e-05, "loss": 0.0907, "step": 12750 }, { "epoch": 2.938475665748393, "grad_norm": 15.969244956970215, "learning_rate": 1.7061753902662993e-05, "loss": 0.0894, "step": 12800 }, { "epoch": 2.9499540863177227, "grad_norm": 3.1585652828216553, "learning_rate": 1.7050275482093664e-05, "loss": 0.102, "step": 12850 }, { "epoch": 2.9614325068870526, "grad_norm": 7.989742755889893, "learning_rate": 1.7038797061524335e-05, "loss": 0.1088, "step": 12900 }, { "epoch": 2.972910927456382, "grad_norm": 3.577751874923706, "learning_rate": 1.7027318640955006e-05, "loss": 0.1175, "step": 12950 }, { "epoch": 2.9843893480257115, "grad_norm": 23.6451416015625, "learning_rate": 1.7015840220385674e-05, "loss": 0.0965, "step": 13000 }, { "epoch": 2.9958677685950414, "grad_norm": 23.72952651977539, "learning_rate": 1.7004361799816345e-05, "loss": 0.0999, "step": 13050 }, { "epoch": 3.0, "eval_loss": 0.038582693785429, "eval_runtime": 76.0619, "eval_samples_per_second": 101.799, "eval_steps_per_second": 6.363, "step": 13068 }, { "epoch": 3.007346189164371, "grad_norm": 27.318923950195312, "learning_rate": 1.6992883379247016e-05, "loss": 0.1051, "step": 13100 }, { "epoch": 3.018824609733701, "grad_norm": 3.265124797821045, "learning_rate": 1.6981404958677687e-05, "loss": 0.0742, "step": 13150 }, { "epoch": 3.0303030303030303, "grad_norm": 12.635414123535156, "learning_rate": 1.6969926538108355e-05, "loss": 0.093, "step": 13200 }, { "epoch": 3.04178145087236, "grad_norm": 7.196776390075684, "learning_rate": 1.6958448117539026e-05, "loss": 0.1015, "step": 13250 }, { "epoch": 3.0532598714416896, "grad_norm": 4.578052997589111, "learning_rate": 1.6946969696969698e-05, "loss": 0.0887, "step": 13300 }, { "epoch": 3.064738292011019, "grad_norm": 4.053121089935303, "learning_rate": 1.693549127640037e-05, "loss": 0.0735, "step": 13350 }, { "epoch": 3.076216712580349, "grad_norm": 4.015145778656006, "learning_rate": 1.6924012855831037e-05, "loss": 0.0937, "step": 13400 }, { "epoch": 3.0876951331496785, "grad_norm": 31.265470504760742, "learning_rate": 1.6912534435261708e-05, "loss": 0.1064, "step": 13450 }, { "epoch": 3.0991735537190084, "grad_norm": 3.9313504695892334, "learning_rate": 1.6901056014692382e-05, "loss": 0.1015, "step": 13500 }, { "epoch": 3.110651974288338, "grad_norm": 14.833806991577148, "learning_rate": 1.688957759412305e-05, "loss": 0.0842, "step": 13550 }, { "epoch": 3.1221303948576677, "grad_norm": 3.8324055671691895, "learning_rate": 1.687809917355372e-05, "loss": 0.0863, "step": 13600 }, { "epoch": 3.133608815426997, "grad_norm": 15.220489501953125, "learning_rate": 1.686662075298439e-05, "loss": 0.0988, "step": 13650 }, { "epoch": 3.145087235996327, "grad_norm": 7.489311695098877, "learning_rate": 1.6855142332415064e-05, "loss": 0.0877, "step": 13700 }, { "epoch": 3.1565656565656566, "grad_norm": 4.19669771194458, "learning_rate": 1.684366391184573e-05, "loss": 0.1144, "step": 13750 }, { "epoch": 3.168044077134986, "grad_norm": 3.2063517570495605, "learning_rate": 1.6832185491276402e-05, "loss": 0.0875, "step": 13800 }, { "epoch": 3.179522497704316, "grad_norm": 8.29419994354248, "learning_rate": 1.682070707070707e-05, "loss": 0.1016, "step": 13850 }, { "epoch": 3.1910009182736454, "grad_norm": 7.446157932281494, "learning_rate": 1.6809228650137745e-05, "loss": 0.0832, "step": 13900 }, { "epoch": 3.2024793388429753, "grad_norm": 4.334078311920166, "learning_rate": 1.6797750229568413e-05, "loss": 0.0967, "step": 13950 }, { "epoch": 3.2139577594123048, "grad_norm": 3.4967422485351562, "learning_rate": 1.6786271808999084e-05, "loss": 0.0975, "step": 14000 }, { "epoch": 3.2254361799816347, "grad_norm": 3.725661277770996, "learning_rate": 1.677479338842975e-05, "loss": 0.0934, "step": 14050 }, { "epoch": 3.236914600550964, "grad_norm": 2.0020086765289307, "learning_rate": 1.6763314967860426e-05, "loss": 0.077, "step": 14100 }, { "epoch": 3.248393021120294, "grad_norm": 4.054165363311768, "learning_rate": 1.6751836547291094e-05, "loss": 0.0827, "step": 14150 }, { "epoch": 3.2598714416896235, "grad_norm": 11.6398344039917, "learning_rate": 1.6740358126721765e-05, "loss": 0.0786, "step": 14200 }, { "epoch": 3.271349862258953, "grad_norm": 25.29025650024414, "learning_rate": 1.6728879706152433e-05, "loss": 0.079, "step": 14250 }, { "epoch": 3.282828282828283, "grad_norm": 7.283995151519775, "learning_rate": 1.6717401285583107e-05, "loss": 0.0811, "step": 14300 }, { "epoch": 3.2943067033976123, "grad_norm": 12.82802963256836, "learning_rate": 1.6705922865013775e-05, "loss": 0.0746, "step": 14350 }, { "epoch": 3.3057851239669422, "grad_norm": 1.6801600456237793, "learning_rate": 1.6694444444444446e-05, "loss": 0.0878, "step": 14400 }, { "epoch": 3.3172635445362717, "grad_norm": 2.1131277084350586, "learning_rate": 1.6682966023875114e-05, "loss": 0.1034, "step": 14450 }, { "epoch": 3.3287419651056016, "grad_norm": 4.358730316162109, "learning_rate": 1.667148760330579e-05, "loss": 0.0795, "step": 14500 }, { "epoch": 3.340220385674931, "grad_norm": 1.4821679592132568, "learning_rate": 1.6660009182736456e-05, "loss": 0.0819, "step": 14550 }, { "epoch": 3.351698806244261, "grad_norm": 38.24598693847656, "learning_rate": 1.6648530762167128e-05, "loss": 0.0882, "step": 14600 }, { "epoch": 3.3631772268135904, "grad_norm": 24.0532169342041, "learning_rate": 1.6637052341597795e-05, "loss": 0.1021, "step": 14650 }, { "epoch": 3.3746556473829203, "grad_norm": 4.011071681976318, "learning_rate": 1.662557392102847e-05, "loss": 0.0844, "step": 14700 }, { "epoch": 3.38613406795225, "grad_norm": 4.307682991027832, "learning_rate": 1.6614095500459138e-05, "loss": 0.0812, "step": 14750 }, { "epoch": 3.3976124885215793, "grad_norm": 12.250354766845703, "learning_rate": 1.660261707988981e-05, "loss": 0.0878, "step": 14800 }, { "epoch": 3.409090909090909, "grad_norm": 4.938148498535156, "learning_rate": 1.6591138659320477e-05, "loss": 0.0865, "step": 14850 }, { "epoch": 3.4205693296602386, "grad_norm": 3.517449140548706, "learning_rate": 1.657966023875115e-05, "loss": 0.0954, "step": 14900 }, { "epoch": 3.4320477502295685, "grad_norm": 3.1695127487182617, "learning_rate": 1.656818181818182e-05, "loss": 0.0872, "step": 14950 }, { "epoch": 3.443526170798898, "grad_norm": 12.07983684539795, "learning_rate": 1.655670339761249e-05, "loss": 0.0846, "step": 15000 }, { "epoch": 3.455004591368228, "grad_norm": 1.8993598222732544, "learning_rate": 1.6545224977043158e-05, "loss": 0.0931, "step": 15050 }, { "epoch": 3.4664830119375574, "grad_norm": 2.263831853866577, "learning_rate": 1.6533746556473832e-05, "loss": 0.0728, "step": 15100 }, { "epoch": 3.477961432506887, "grad_norm": 2.992927074432373, "learning_rate": 1.65222681359045e-05, "loss": 0.0768, "step": 15150 }, { "epoch": 3.4894398530762167, "grad_norm": 1.4961662292480469, "learning_rate": 1.651078971533517e-05, "loss": 0.0948, "step": 15200 }, { "epoch": 3.5009182736455466, "grad_norm": 1.779676079750061, "learning_rate": 1.649931129476584e-05, "loss": 0.0791, "step": 15250 }, { "epoch": 3.512396694214876, "grad_norm": 4.589473247528076, "learning_rate": 1.6487832874196514e-05, "loss": 0.0938, "step": 15300 }, { "epoch": 3.5238751147842056, "grad_norm": 11.575867652893066, "learning_rate": 1.647635445362718e-05, "loss": 0.0774, "step": 15350 }, { "epoch": 3.5353535353535355, "grad_norm": 63.73720169067383, "learning_rate": 1.6464876033057853e-05, "loss": 0.0791, "step": 15400 }, { "epoch": 3.546831955922865, "grad_norm": 1.5093408823013306, "learning_rate": 1.645339761248852e-05, "loss": 0.0858, "step": 15450 }, { "epoch": 3.558310376492195, "grad_norm": 28.32036781311035, "learning_rate": 1.6441919191919195e-05, "loss": 0.0868, "step": 15500 }, { "epoch": 3.5697887970615243, "grad_norm": 9.240126609802246, "learning_rate": 1.6430440771349863e-05, "loss": 0.0844, "step": 15550 }, { "epoch": 3.581267217630854, "grad_norm": 2.6464617252349854, "learning_rate": 1.6418962350780534e-05, "loss": 0.0754, "step": 15600 }, { "epoch": 3.5927456382001837, "grad_norm": 13.340537071228027, "learning_rate": 1.6407483930211202e-05, "loss": 0.0797, "step": 15650 }, { "epoch": 3.604224058769513, "grad_norm": 9.116503715515137, "learning_rate": 1.6396005509641876e-05, "loss": 0.0797, "step": 15700 }, { "epoch": 3.615702479338843, "grad_norm": 4.237601280212402, "learning_rate": 1.6384527089072544e-05, "loss": 0.0642, "step": 15750 }, { "epoch": 3.6271808999081725, "grad_norm": 9.713337898254395, "learning_rate": 1.6373048668503215e-05, "loss": 0.0737, "step": 15800 }, { "epoch": 3.6386593204775024, "grad_norm": 2.1654858589172363, "learning_rate": 1.6361570247933886e-05, "loss": 0.083, "step": 15850 }, { "epoch": 3.650137741046832, "grad_norm": 30.951499938964844, "learning_rate": 1.6350091827364558e-05, "loss": 0.0932, "step": 15900 }, { "epoch": 3.6616161616161618, "grad_norm": 8.739038467407227, "learning_rate": 1.633861340679523e-05, "loss": 0.0733, "step": 15950 }, { "epoch": 3.6730945821854912, "grad_norm": 3.219669818878174, "learning_rate": 1.6327134986225896e-05, "loss": 0.0803, "step": 16000 }, { "epoch": 3.6845730027548207, "grad_norm": 3.333641767501831, "learning_rate": 1.6315656565656568e-05, "loss": 0.0942, "step": 16050 }, { "epoch": 3.6960514233241506, "grad_norm": 4.10658073425293, "learning_rate": 1.630417814508724e-05, "loss": 0.0806, "step": 16100 }, { "epoch": 3.7075298438934805, "grad_norm": 12.419151306152344, "learning_rate": 1.629269972451791e-05, "loss": 0.0771, "step": 16150 }, { "epoch": 3.71900826446281, "grad_norm": 7.276992321014404, "learning_rate": 1.6281221303948578e-05, "loss": 0.0821, "step": 16200 }, { "epoch": 3.7304866850321394, "grad_norm": 5.558807373046875, "learning_rate": 1.626974288337925e-05, "loss": 0.0726, "step": 16250 }, { "epoch": 3.7419651056014693, "grad_norm": 1.9196701049804688, "learning_rate": 1.625826446280992e-05, "loss": 0.0907, "step": 16300 }, { "epoch": 3.753443526170799, "grad_norm": 1.9817266464233398, "learning_rate": 1.624678604224059e-05, "loss": 0.0776, "step": 16350 }, { "epoch": 3.7649219467401287, "grad_norm": 4.608344554901123, "learning_rate": 1.623530762167126e-05, "loss": 0.0675, "step": 16400 }, { "epoch": 3.776400367309458, "grad_norm": 3.517875909805298, "learning_rate": 1.622382920110193e-05, "loss": 0.0873, "step": 16450 }, { "epoch": 3.787878787878788, "grad_norm": 21.386676788330078, "learning_rate": 1.62123507805326e-05, "loss": 0.0663, "step": 16500 }, { "epoch": 3.7993572084481175, "grad_norm": 6.083133697509766, "learning_rate": 1.6200872359963273e-05, "loss": 0.0676, "step": 16550 }, { "epoch": 3.810835629017447, "grad_norm": 8.673909187316895, "learning_rate": 1.618939393939394e-05, "loss": 0.0787, "step": 16600 }, { "epoch": 3.822314049586777, "grad_norm": 22.145618438720703, "learning_rate": 1.617791551882461e-05, "loss": 0.0882, "step": 16650 }, { "epoch": 3.8337924701561064, "grad_norm": 7.471016883850098, "learning_rate": 1.6166437098255283e-05, "loss": 0.0669, "step": 16700 }, { "epoch": 3.8452708907254363, "grad_norm": 11.069772720336914, "learning_rate": 1.6154958677685954e-05, "loss": 0.0801, "step": 16750 }, { "epoch": 3.8567493112947657, "grad_norm": 18.581052780151367, "learning_rate": 1.614348025711662e-05, "loss": 0.0633, "step": 16800 }, { "epoch": 3.8682277318640956, "grad_norm": 2.5162479877471924, "learning_rate": 1.6132001836547293e-05, "loss": 0.0948, "step": 16850 }, { "epoch": 3.879706152433425, "grad_norm": 3.725700855255127, "learning_rate": 1.6120523415977964e-05, "loss": 0.0774, "step": 16900 }, { "epoch": 3.8911845730027546, "grad_norm": 0.9673944711685181, "learning_rate": 1.6109044995408635e-05, "loss": 0.074, "step": 16950 }, { "epoch": 3.9026629935720845, "grad_norm": 114.74365234375, "learning_rate": 1.6097566574839303e-05, "loss": 0.0824, "step": 17000 }, { "epoch": 3.9141414141414144, "grad_norm": 19.110740661621094, "learning_rate": 1.6086088154269974e-05, "loss": 0.0799, "step": 17050 }, { "epoch": 3.925619834710744, "grad_norm": 12.809351921081543, "learning_rate": 1.6074609733700645e-05, "loss": 0.0779, "step": 17100 }, { "epoch": 3.9370982552800733, "grad_norm": 6.999904155731201, "learning_rate": 1.6063131313131316e-05, "loss": 0.0774, "step": 17150 }, { "epoch": 3.948576675849403, "grad_norm": 1.5391592979431152, "learning_rate": 1.6051652892561984e-05, "loss": 0.0954, "step": 17200 }, { "epoch": 3.9600550964187327, "grad_norm": 7.3448944091796875, "learning_rate": 1.6040174471992655e-05, "loss": 0.079, "step": 17250 }, { "epoch": 3.9715335169880626, "grad_norm": 5.708894729614258, "learning_rate": 1.6028696051423323e-05, "loss": 0.0646, "step": 17300 }, { "epoch": 3.983011937557392, "grad_norm": 2.655522584915161, "learning_rate": 1.6017217630853998e-05, "loss": 0.0852, "step": 17350 }, { "epoch": 3.994490358126722, "grad_norm": 9.058150291442871, "learning_rate": 1.6005739210284665e-05, "loss": 0.077, "step": 17400 }, { "epoch": 4.0, "eval_loss": 0.026178738102316856, "eval_runtime": 76.4114, "eval_samples_per_second": 101.333, "eval_steps_per_second": 6.334, "step": 17424 }, { "epoch": 4.005968778696052, "grad_norm": 13.794172286987305, "learning_rate": 1.5994260789715337e-05, "loss": 0.0893, "step": 17450 }, { "epoch": 4.017447199265381, "grad_norm": 9.89699649810791, "learning_rate": 1.5982782369146004e-05, "loss": 0.0907, "step": 17500 }, { "epoch": 4.028925619834711, "grad_norm": 12.411093711853027, "learning_rate": 1.597130394857668e-05, "loss": 0.0992, "step": 17550 }, { "epoch": 4.040404040404041, "grad_norm": 0.8096949458122253, "learning_rate": 1.5959825528007347e-05, "loss": 0.0706, "step": 17600 }, { "epoch": 4.05188246097337, "grad_norm": 10.733264923095703, "learning_rate": 1.5948347107438018e-05, "loss": 0.0863, "step": 17650 }, { "epoch": 4.0633608815427, "grad_norm": 4.159455299377441, "learning_rate": 1.5936868686868686e-05, "loss": 0.0698, "step": 17700 }, { "epoch": 4.0748393021120295, "grad_norm": 50.61993408203125, "learning_rate": 1.592539026629936e-05, "loss": 0.0581, "step": 17750 }, { "epoch": 4.086317722681359, "grad_norm": 6.64367151260376, "learning_rate": 1.5913911845730028e-05, "loss": 0.0519, "step": 17800 }, { "epoch": 4.097796143250688, "grad_norm": 2.9516868591308594, "learning_rate": 1.59024334251607e-05, "loss": 0.0774, "step": 17850 }, { "epoch": 4.109274563820018, "grad_norm": 10.22035026550293, "learning_rate": 1.5890955004591367e-05, "loss": 0.079, "step": 17900 }, { "epoch": 4.120752984389348, "grad_norm": 12.595512390136719, "learning_rate": 1.587947658402204e-05, "loss": 0.0729, "step": 17950 }, { "epoch": 4.132231404958677, "grad_norm": 8.707656860351562, "learning_rate": 1.586799816345271e-05, "loss": 0.0807, "step": 18000 }, { "epoch": 4.143709825528007, "grad_norm": 10.56053638458252, "learning_rate": 1.585651974288338e-05, "loss": 0.0585, "step": 18050 }, { "epoch": 4.155188246097337, "grad_norm": 0.2249426394701004, "learning_rate": 1.5845041322314048e-05, "loss": 0.0571, "step": 18100 }, { "epoch": 4.166666666666667, "grad_norm": 3.3897705078125, "learning_rate": 1.5833562901744723e-05, "loss": 0.0786, "step": 18150 }, { "epoch": 4.178145087235996, "grad_norm": 7.3916168212890625, "learning_rate": 1.5822084481175394e-05, "loss": 0.0608, "step": 18200 }, { "epoch": 4.189623507805326, "grad_norm": 8.739521026611328, "learning_rate": 1.581060606060606e-05, "loss": 0.0822, "step": 18250 }, { "epoch": 4.201101928374656, "grad_norm": 1.7405649423599243, "learning_rate": 1.5799127640036733e-05, "loss": 0.074, "step": 18300 }, { "epoch": 4.212580348943986, "grad_norm": 12.468271255493164, "learning_rate": 1.5787649219467404e-05, "loss": 0.0649, "step": 18350 }, { "epoch": 4.224058769513315, "grad_norm": 12.860107421875, "learning_rate": 1.5776170798898075e-05, "loss": 0.0804, "step": 18400 }, { "epoch": 4.235537190082645, "grad_norm": 1.63035249710083, "learning_rate": 1.5764692378328743e-05, "loss": 0.0637, "step": 18450 }, { "epoch": 4.2470156106519745, "grad_norm": 1.4469000101089478, "learning_rate": 1.5753213957759414e-05, "loss": 0.0838, "step": 18500 }, { "epoch": 4.2584940312213035, "grad_norm": 2.655168056488037, "learning_rate": 1.5741735537190085e-05, "loss": 0.0694, "step": 18550 }, { "epoch": 4.2699724517906334, "grad_norm": 5.923055171966553, "learning_rate": 1.5730257116620756e-05, "loss": 0.0673, "step": 18600 }, { "epoch": 4.281450872359963, "grad_norm": 3.5013010501861572, "learning_rate": 1.5718778696051424e-05, "loss": 0.084, "step": 18650 }, { "epoch": 4.292929292929293, "grad_norm": 7.023970127105713, "learning_rate": 1.5707300275482095e-05, "loss": 0.0632, "step": 18700 }, { "epoch": 4.304407713498622, "grad_norm": 2.661977529525757, "learning_rate": 1.5695821854912767e-05, "loss": 0.0712, "step": 18750 }, { "epoch": 4.315886134067952, "grad_norm": 9.35297966003418, "learning_rate": 1.5684343434343438e-05, "loss": 0.0824, "step": 18800 }, { "epoch": 4.327364554637282, "grad_norm": 4.116842269897461, "learning_rate": 1.5672865013774105e-05, "loss": 0.0573, "step": 18850 }, { "epoch": 4.338842975206612, "grad_norm": 5.965562343597412, "learning_rate": 1.5661386593204777e-05, "loss": 0.0788, "step": 18900 }, { "epoch": 4.350321395775941, "grad_norm": 1.7762409448623657, "learning_rate": 1.5649908172635448e-05, "loss": 0.0616, "step": 18950 }, { "epoch": 4.361799816345271, "grad_norm": 7.686234951019287, "learning_rate": 1.563842975206612e-05, "loss": 0.0747, "step": 19000 }, { "epoch": 4.373278236914601, "grad_norm": 23.71392822265625, "learning_rate": 1.5626951331496787e-05, "loss": 0.0709, "step": 19050 }, { "epoch": 4.38475665748393, "grad_norm": 3.0295040607452393, "learning_rate": 1.5615472910927458e-05, "loss": 0.081, "step": 19100 }, { "epoch": 4.39623507805326, "grad_norm": 72.48674011230469, "learning_rate": 1.560399449035813e-05, "loss": 0.0642, "step": 19150 }, { "epoch": 4.40771349862259, "grad_norm": 3.4693922996520996, "learning_rate": 1.55925160697888e-05, "loss": 0.0579, "step": 19200 }, { "epoch": 4.41919191919192, "grad_norm": 23.947956085205078, "learning_rate": 1.5581037649219468e-05, "loss": 0.0763, "step": 19250 }, { "epoch": 4.430670339761249, "grad_norm": 3.694312334060669, "learning_rate": 1.556955922865014e-05, "loss": 0.0603, "step": 19300 }, { "epoch": 4.4421487603305785, "grad_norm": 1.5703692436218262, "learning_rate": 1.555808080808081e-05, "loss": 0.0793, "step": 19350 }, { "epoch": 4.453627180899908, "grad_norm": 6.27282190322876, "learning_rate": 1.554660238751148e-05, "loss": 0.0628, "step": 19400 }, { "epoch": 4.465105601469238, "grad_norm": 9.569915771484375, "learning_rate": 1.553512396694215e-05, "loss": 0.0739, "step": 19450 }, { "epoch": 4.476584022038567, "grad_norm": 7.230015277862549, "learning_rate": 1.552364554637282e-05, "loss": 0.0598, "step": 19500 }, { "epoch": 4.488062442607897, "grad_norm": 11.842591285705566, "learning_rate": 1.551216712580349e-05, "loss": 0.0833, "step": 19550 }, { "epoch": 4.499540863177227, "grad_norm": 17.185081481933594, "learning_rate": 1.5500688705234163e-05, "loss": 0.0718, "step": 19600 }, { "epoch": 4.511019283746556, "grad_norm": 8.17883014678955, "learning_rate": 1.548921028466483e-05, "loss": 0.0867, "step": 19650 }, { "epoch": 4.522497704315886, "grad_norm": 8.615788459777832, "learning_rate": 1.5477731864095502e-05, "loss": 0.0809, "step": 19700 }, { "epoch": 4.533976124885216, "grad_norm": 4.707847595214844, "learning_rate": 1.5466253443526173e-05, "loss": 0.0738, "step": 19750 }, { "epoch": 4.545454545454545, "grad_norm": 6.768332481384277, "learning_rate": 1.5454775022956844e-05, "loss": 0.0695, "step": 19800 }, { "epoch": 4.556932966023875, "grad_norm": 11.24840259552002, "learning_rate": 1.5443296602387512e-05, "loss": 0.0594, "step": 19850 }, { "epoch": 4.568411386593205, "grad_norm": 3.4467811584472656, "learning_rate": 1.5431818181818183e-05, "loss": 0.0698, "step": 19900 }, { "epoch": 4.579889807162535, "grad_norm": 0.09050484746694565, "learning_rate": 1.5420339761248854e-05, "loss": 0.0741, "step": 19950 }, { "epoch": 4.591368227731865, "grad_norm": 3.4562206268310547, "learning_rate": 1.5408861340679525e-05, "loss": 0.0587, "step": 20000 }, { "epoch": 4.602846648301194, "grad_norm": 0.6945027112960815, "learning_rate": 1.5397382920110193e-05, "loss": 0.0655, "step": 20050 }, { "epoch": 4.6143250688705235, "grad_norm": 8.564302444458008, "learning_rate": 1.5385904499540864e-05, "loss": 0.081, "step": 20100 }, { "epoch": 4.625803489439853, "grad_norm": 5.788010120391846, "learning_rate": 1.5374426078971535e-05, "loss": 0.0819, "step": 20150 }, { "epoch": 4.637281910009182, "grad_norm": 9.682851791381836, "learning_rate": 1.5362947658402207e-05, "loss": 0.063, "step": 20200 }, { "epoch": 4.648760330578512, "grad_norm": 23.960376739501953, "learning_rate": 1.5351469237832874e-05, "loss": 0.0624, "step": 20250 }, { "epoch": 4.660238751147842, "grad_norm": 10.756389617919922, "learning_rate": 1.5339990817263546e-05, "loss": 0.0772, "step": 20300 }, { "epoch": 4.671717171717171, "grad_norm": 4.357571125030518, "learning_rate": 1.5328512396694217e-05, "loss": 0.0555, "step": 20350 }, { "epoch": 4.683195592286501, "grad_norm": 4.373106956481934, "learning_rate": 1.5317033976124888e-05, "loss": 0.0535, "step": 20400 }, { "epoch": 4.694674012855831, "grad_norm": 5.709158897399902, "learning_rate": 1.5305555555555556e-05, "loss": 0.0608, "step": 20450 }, { "epoch": 4.706152433425161, "grad_norm": 10.952962875366211, "learning_rate": 1.5294077134986227e-05, "loss": 0.061, "step": 20500 }, { "epoch": 4.71763085399449, "grad_norm": 3.540158748626709, "learning_rate": 1.5282598714416898e-05, "loss": 0.0538, "step": 20550 }, { "epoch": 4.72910927456382, "grad_norm": 3.774223804473877, "learning_rate": 1.527112029384757e-05, "loss": 0.0759, "step": 20600 }, { "epoch": 4.74058769513315, "grad_norm": 30.93947982788086, "learning_rate": 1.525964187327824e-05, "loss": 0.0524, "step": 20650 }, { "epoch": 4.75206611570248, "grad_norm": 3.6765692234039307, "learning_rate": 1.5248163452708908e-05, "loss": 0.0726, "step": 20700 }, { "epoch": 4.763544536271809, "grad_norm": 3.635209083557129, "learning_rate": 1.523668503213958e-05, "loss": 0.0595, "step": 20750 }, { "epoch": 4.775022956841139, "grad_norm": 5.130997657775879, "learning_rate": 1.5225206611570249e-05, "loss": 0.0712, "step": 20800 }, { "epoch": 4.7865013774104685, "grad_norm": 1.4420883655548096, "learning_rate": 1.521372819100092e-05, "loss": 0.0623, "step": 20850 }, { "epoch": 4.797979797979798, "grad_norm": 2.573410749435425, "learning_rate": 1.520224977043159e-05, "loss": 0.0691, "step": 20900 }, { "epoch": 4.8094582185491275, "grad_norm": 3.910555362701416, "learning_rate": 1.519077134986226e-05, "loss": 0.0673, "step": 20950 }, { "epoch": 4.820936639118457, "grad_norm": 10.7109375, "learning_rate": 1.517929292929293e-05, "loss": 0.0538, "step": 21000 }, { "epoch": 4.832415059687787, "grad_norm": 2.241975784301758, "learning_rate": 1.5167814508723601e-05, "loss": 0.0738, "step": 21050 }, { "epoch": 4.843893480257116, "grad_norm": 0.9203261137008667, "learning_rate": 1.515633608815427e-05, "loss": 0.0761, "step": 21100 }, { "epoch": 4.855371900826446, "grad_norm": 6.893440246582031, "learning_rate": 1.5144857667584942e-05, "loss": 0.0687, "step": 21150 }, { "epoch": 4.866850321395776, "grad_norm": 1.8644015789031982, "learning_rate": 1.5133379247015611e-05, "loss": 0.063, "step": 21200 }, { "epoch": 4.878328741965106, "grad_norm": 14.406646728515625, "learning_rate": 1.5121900826446282e-05, "loss": 0.0614, "step": 21250 }, { "epoch": 4.889807162534435, "grad_norm": 5.122009754180908, "learning_rate": 1.5110422405876952e-05, "loss": 0.0676, "step": 21300 }, { "epoch": 4.901285583103765, "grad_norm": 8.310937881469727, "learning_rate": 1.5098943985307623e-05, "loss": 0.063, "step": 21350 }, { "epoch": 4.912764003673095, "grad_norm": 24.121702194213867, "learning_rate": 1.5087465564738293e-05, "loss": 0.0568, "step": 21400 }, { "epoch": 4.924242424242424, "grad_norm": 1.7431272268295288, "learning_rate": 1.5075987144168964e-05, "loss": 0.0627, "step": 21450 }, { "epoch": 4.935720844811754, "grad_norm": 3.9782192707061768, "learning_rate": 1.5064508723599633e-05, "loss": 0.06, "step": 21500 }, { "epoch": 4.947199265381084, "grad_norm": 4.4752044677734375, "learning_rate": 1.5053030303030304e-05, "loss": 0.0719, "step": 21550 }, { "epoch": 4.958677685950414, "grad_norm": 3.175583839416504, "learning_rate": 1.5041551882460974e-05, "loss": 0.0504, "step": 21600 }, { "epoch": 4.970156106519743, "grad_norm": 9.640055656433105, "learning_rate": 1.5030073461891645e-05, "loss": 0.059, "step": 21650 }, { "epoch": 4.9816345270890725, "grad_norm": 16.657512664794922, "learning_rate": 1.5018595041322314e-05, "loss": 0.0687, "step": 21700 }, { "epoch": 4.993112947658402, "grad_norm": 3.0832114219665527, "learning_rate": 1.5007116620752986e-05, "loss": 0.0694, "step": 21750 }, { "epoch": 5.0, "eval_loss": 0.023562723770737648, "eval_runtime": 75.7751, "eval_samples_per_second": 102.184, "eval_steps_per_second": 6.387, "step": 21780 }, { "epoch": 5.004591368227731, "grad_norm": 8.29771900177002, "learning_rate": 1.4995638200183655e-05, "loss": 0.066, "step": 21800 }, { "epoch": 5.016069788797061, "grad_norm": 13.276568412780762, "learning_rate": 1.4984159779614326e-05, "loss": 0.0731, "step": 21850 }, { "epoch": 5.027548209366391, "grad_norm": 8.002633094787598, "learning_rate": 1.4972681359044996e-05, "loss": 0.0599, "step": 21900 }, { "epoch": 5.039026629935721, "grad_norm": 0.4478878974914551, "learning_rate": 1.4961202938475667e-05, "loss": 0.065, "step": 21950 }, { "epoch": 5.05050505050505, "grad_norm": 6.891519069671631, "learning_rate": 1.4949724517906336e-05, "loss": 0.0806, "step": 22000 }, { "epoch": 5.06198347107438, "grad_norm": 4.401981353759766, "learning_rate": 1.4938246097337008e-05, "loss": 0.0661, "step": 22050 }, { "epoch": 5.07346189164371, "grad_norm": 6.784157752990723, "learning_rate": 1.4926767676767677e-05, "loss": 0.064, "step": 22100 }, { "epoch": 5.08494031221304, "grad_norm": 3.379763603210449, "learning_rate": 1.4915289256198348e-05, "loss": 0.0637, "step": 22150 }, { "epoch": 5.096418732782369, "grad_norm": 11.847981452941895, "learning_rate": 1.4903810835629018e-05, "loss": 0.0653, "step": 22200 }, { "epoch": 5.107897153351699, "grad_norm": 0.8654012680053711, "learning_rate": 1.4892332415059689e-05, "loss": 0.0671, "step": 22250 }, { "epoch": 5.119375573921029, "grad_norm": 41.94646072387695, "learning_rate": 1.4880853994490358e-05, "loss": 0.0713, "step": 22300 }, { "epoch": 5.130853994490358, "grad_norm": 1.4954692125320435, "learning_rate": 1.486937557392103e-05, "loss": 0.0523, "step": 22350 }, { "epoch": 5.142332415059688, "grad_norm": 3.803544759750366, "learning_rate": 1.4857897153351699e-05, "loss": 0.0656, "step": 22400 }, { "epoch": 5.1538108356290175, "grad_norm": 4.063436508178711, "learning_rate": 1.484641873278237e-05, "loss": 0.0677, "step": 22450 }, { "epoch": 5.1652892561983474, "grad_norm": 4.6195597648620605, "learning_rate": 1.483494031221304e-05, "loss": 0.0514, "step": 22500 }, { "epoch": 5.1767676767676765, "grad_norm": 8.505845069885254, "learning_rate": 1.482346189164371e-05, "loss": 0.0701, "step": 22550 }, { "epoch": 5.188246097337006, "grad_norm": 3.2405736446380615, "learning_rate": 1.481198347107438e-05, "loss": 0.056, "step": 22600 }, { "epoch": 5.199724517906336, "grad_norm": 4.152194023132324, "learning_rate": 1.4800505050505051e-05, "loss": 0.0494, "step": 22650 }, { "epoch": 5.211202938475665, "grad_norm": 1.7295576333999634, "learning_rate": 1.478902662993572e-05, "loss": 0.0528, "step": 22700 }, { "epoch": 5.222681359044995, "grad_norm": 15.780937194824219, "learning_rate": 1.4777548209366392e-05, "loss": 0.0574, "step": 22750 }, { "epoch": 5.234159779614325, "grad_norm": 4.668914794921875, "learning_rate": 1.4766069788797061e-05, "loss": 0.0548, "step": 22800 }, { "epoch": 5.245638200183655, "grad_norm": 8.875202178955078, "learning_rate": 1.4754591368227733e-05, "loss": 0.0667, "step": 22850 }, { "epoch": 5.257116620752984, "grad_norm": 1.8688851594924927, "learning_rate": 1.4743112947658404e-05, "loss": 0.0567, "step": 22900 }, { "epoch": 5.268595041322314, "grad_norm": 3.337451934814453, "learning_rate": 1.4731634527089073e-05, "loss": 0.0718, "step": 22950 }, { "epoch": 5.280073461891644, "grad_norm": 23.922536849975586, "learning_rate": 1.4720156106519744e-05, "loss": 0.0614, "step": 23000 }, { "epoch": 5.291551882460974, "grad_norm": 6.531986236572266, "learning_rate": 1.4708677685950414e-05, "loss": 0.0479, "step": 23050 }, { "epoch": 5.303030303030303, "grad_norm": 18.948335647583008, "learning_rate": 1.4697199265381085e-05, "loss": 0.0716, "step": 23100 }, { "epoch": 5.314508723599633, "grad_norm": 5.292490482330322, "learning_rate": 1.4685720844811755e-05, "loss": 0.0739, "step": 23150 }, { "epoch": 5.325987144168963, "grad_norm": 8.712806701660156, "learning_rate": 1.4674242424242426e-05, "loss": 0.0573, "step": 23200 }, { "epoch": 5.337465564738292, "grad_norm": 2.29594349861145, "learning_rate": 1.4662764003673095e-05, "loss": 0.078, "step": 23250 }, { "epoch": 5.3489439853076215, "grad_norm": 1.0702353715896606, "learning_rate": 1.4651285583103766e-05, "loss": 0.0571, "step": 23300 }, { "epoch": 5.360422405876951, "grad_norm": 4.020137786865234, "learning_rate": 1.4639807162534436e-05, "loss": 0.0556, "step": 23350 }, { "epoch": 5.371900826446281, "grad_norm": 0.4657507836818695, "learning_rate": 1.4628328741965107e-05, "loss": 0.0701, "step": 23400 }, { "epoch": 5.38337924701561, "grad_norm": 7.942168712615967, "learning_rate": 1.4616850321395776e-05, "loss": 0.0708, "step": 23450 }, { "epoch": 5.39485766758494, "grad_norm": 60.43735885620117, "learning_rate": 1.4605371900826448e-05, "loss": 0.0763, "step": 23500 }, { "epoch": 5.40633608815427, "grad_norm": 0.18964794278144836, "learning_rate": 1.4593893480257117e-05, "loss": 0.0734, "step": 23550 }, { "epoch": 5.4178145087236, "grad_norm": 11.646339416503906, "learning_rate": 1.4582415059687788e-05, "loss": 0.0597, "step": 23600 }, { "epoch": 5.429292929292929, "grad_norm": 4.795817852020264, "learning_rate": 1.4570936639118458e-05, "loss": 0.0691, "step": 23650 }, { "epoch": 5.440771349862259, "grad_norm": 4.878921985626221, "learning_rate": 1.4559458218549129e-05, "loss": 0.0575, "step": 23700 }, { "epoch": 5.452249770431589, "grad_norm": 10.02763557434082, "learning_rate": 1.4547979797979798e-05, "loss": 0.0593, "step": 23750 }, { "epoch": 5.463728191000918, "grad_norm": 69.54541015625, "learning_rate": 1.453650137741047e-05, "loss": 0.0603, "step": 23800 }, { "epoch": 5.475206611570248, "grad_norm": 19.735029220581055, "learning_rate": 1.4525022956841139e-05, "loss": 0.0619, "step": 23850 }, { "epoch": 5.486685032139578, "grad_norm": 3.271541118621826, "learning_rate": 1.451354453627181e-05, "loss": 0.0732, "step": 23900 }, { "epoch": 5.498163452708908, "grad_norm": 5.735942840576172, "learning_rate": 1.450206611570248e-05, "loss": 0.0604, "step": 23950 }, { "epoch": 5.509641873278237, "grad_norm": 0.7948861718177795, "learning_rate": 1.449058769513315e-05, "loss": 0.0445, "step": 24000 }, { "epoch": 5.5211202938475665, "grad_norm": 30.460046768188477, "learning_rate": 1.447910927456382e-05, "loss": 0.0511, "step": 24050 }, { "epoch": 5.532598714416896, "grad_norm": 9.005084991455078, "learning_rate": 1.4467630853994491e-05, "loss": 0.0446, "step": 24100 }, { "epoch": 5.544077134986226, "grad_norm": 1.7187128067016602, "learning_rate": 1.4456152433425161e-05, "loss": 0.059, "step": 24150 }, { "epoch": 5.555555555555555, "grad_norm": 3.7255849838256836, "learning_rate": 1.4444674012855832e-05, "loss": 0.0632, "step": 24200 }, { "epoch": 5.567033976124885, "grad_norm": 17.45580291748047, "learning_rate": 1.4433195592286502e-05, "loss": 0.0613, "step": 24250 }, { "epoch": 5.578512396694215, "grad_norm": 2.1115639209747314, "learning_rate": 1.4421717171717173e-05, "loss": 0.0658, "step": 24300 }, { "epoch": 5.589990817263544, "grad_norm": 4.710561275482178, "learning_rate": 1.4410238751147842e-05, "loss": 0.0442, "step": 24350 }, { "epoch": 5.601469237832874, "grad_norm": 5.299075603485107, "learning_rate": 1.4398760330578513e-05, "loss": 0.0668, "step": 24400 }, { "epoch": 5.612947658402204, "grad_norm": 9.545364379882812, "learning_rate": 1.4387281910009183e-05, "loss": 0.0602, "step": 24450 }, { "epoch": 5.624426078971534, "grad_norm": 20.036054611206055, "learning_rate": 1.4375803489439854e-05, "loss": 0.0542, "step": 24500 }, { "epoch": 5.635904499540863, "grad_norm": 0.9716740250587463, "learning_rate": 1.4364325068870523e-05, "loss": 0.0569, "step": 24550 }, { "epoch": 5.647382920110193, "grad_norm": 0.8161399364471436, "learning_rate": 1.4352846648301195e-05, "loss": 0.0806, "step": 24600 }, { "epoch": 5.658861340679523, "grad_norm": 3.2273051738739014, "learning_rate": 1.4341368227731864e-05, "loss": 0.0573, "step": 24650 }, { "epoch": 5.670339761248853, "grad_norm": 5.651678562164307, "learning_rate": 1.4329889807162535e-05, "loss": 0.0701, "step": 24700 }, { "epoch": 5.681818181818182, "grad_norm": 11.217434883117676, "learning_rate": 1.4318411386593205e-05, "loss": 0.0707, "step": 24750 }, { "epoch": 5.693296602387512, "grad_norm": 3.718209981918335, "learning_rate": 1.4306932966023876e-05, "loss": 0.0683, "step": 24800 }, { "epoch": 5.7047750229568415, "grad_norm": 6.940698146820068, "learning_rate": 1.4295454545454545e-05, "loss": 0.0615, "step": 24850 }, { "epoch": 5.7162534435261705, "grad_norm": 2.7064337730407715, "learning_rate": 1.4283976124885217e-05, "loss": 0.0697, "step": 24900 }, { "epoch": 5.7277318640955, "grad_norm": 7.539518356323242, "learning_rate": 1.4272497704315886e-05, "loss": 0.074, "step": 24950 }, { "epoch": 5.73921028466483, "grad_norm": 9.48625373840332, "learning_rate": 1.4261019283746557e-05, "loss": 0.0508, "step": 25000 }, { "epoch": 5.750688705234159, "grad_norm": 3.839548349380493, "learning_rate": 1.4249540863177227e-05, "loss": 0.04, "step": 25050 }, { "epoch": 5.762167125803489, "grad_norm": 1.1874516010284424, "learning_rate": 1.4238062442607898e-05, "loss": 0.0693, "step": 25100 }, { "epoch": 5.773645546372819, "grad_norm": 4.769851207733154, "learning_rate": 1.4226584022038567e-05, "loss": 0.0451, "step": 25150 }, { "epoch": 5.785123966942149, "grad_norm": 1.1769243478775024, "learning_rate": 1.4215105601469238e-05, "loss": 0.0571, "step": 25200 }, { "epoch": 5.796602387511478, "grad_norm": 3.589566946029663, "learning_rate": 1.4203627180899911e-05, "loss": 0.063, "step": 25250 }, { "epoch": 5.808080808080808, "grad_norm": 2.4310691356658936, "learning_rate": 1.4192148760330579e-05, "loss": 0.0573, "step": 25300 }, { "epoch": 5.819559228650138, "grad_norm": 14.951000213623047, "learning_rate": 1.4180670339761252e-05, "loss": 0.0539, "step": 25350 }, { "epoch": 5.831037649219468, "grad_norm": 8.794573783874512, "learning_rate": 1.416919191919192e-05, "loss": 0.0684, "step": 25400 }, { "epoch": 5.842516069788797, "grad_norm": 3.2409491539001465, "learning_rate": 1.4157713498622591e-05, "loss": 0.0625, "step": 25450 }, { "epoch": 5.853994490358127, "grad_norm": 3.78106951713562, "learning_rate": 1.414623507805326e-05, "loss": 0.0571, "step": 25500 }, { "epoch": 5.865472910927457, "grad_norm": 22.782184600830078, "learning_rate": 1.4134756657483931e-05, "loss": 0.0621, "step": 25550 }, { "epoch": 5.876951331496786, "grad_norm": 7.916112899780273, "learning_rate": 1.4123278236914601e-05, "loss": 0.0591, "step": 25600 }, { "epoch": 5.8884297520661155, "grad_norm": 3.8059706687927246, "learning_rate": 1.4111799816345272e-05, "loss": 0.0515, "step": 25650 }, { "epoch": 5.899908172635445, "grad_norm": 1.811797857284546, "learning_rate": 1.4100321395775942e-05, "loss": 0.0535, "step": 25700 }, { "epoch": 5.911386593204775, "grad_norm": 1.024452805519104, "learning_rate": 1.4088842975206613e-05, "loss": 0.0404, "step": 25750 }, { "epoch": 5.922865013774104, "grad_norm": 14.647411346435547, "learning_rate": 1.4077364554637282e-05, "loss": 0.053, "step": 25800 }, { "epoch": 5.934343434343434, "grad_norm": 1.5343114137649536, "learning_rate": 1.4065886134067953e-05, "loss": 0.0527, "step": 25850 }, { "epoch": 5.945821854912764, "grad_norm": 3.019164800643921, "learning_rate": 1.4054407713498623e-05, "loss": 0.0627, "step": 25900 }, { "epoch": 5.957300275482094, "grad_norm": 13.243929862976074, "learning_rate": 1.4042929292929294e-05, "loss": 0.0544, "step": 25950 }, { "epoch": 5.968778696051423, "grad_norm": 11.462658882141113, "learning_rate": 1.4031450872359964e-05, "loss": 0.0403, "step": 26000 }, { "epoch": 5.980257116620753, "grad_norm": 8.393050193786621, "learning_rate": 1.4019972451790635e-05, "loss": 0.059, "step": 26050 }, { "epoch": 5.991735537190083, "grad_norm": 1.0723531246185303, "learning_rate": 1.4008494031221304e-05, "loss": 0.0684, "step": 26100 }, { "epoch": 6.0, "eval_loss": 0.02229415252804756, "eval_runtime": 76.1274, "eval_samples_per_second": 101.711, "eval_steps_per_second": 6.358, "step": 26136 }, { "epoch": 6.003213957759412, "grad_norm": 84.26821899414062, "learning_rate": 1.3997015610651975e-05, "loss": 0.0644, "step": 26150 }, { "epoch": 6.014692378328742, "grad_norm": 2.227952480316162, "learning_rate": 1.3985537190082645e-05, "loss": 0.0491, "step": 26200 }, { "epoch": 6.026170798898072, "grad_norm": 15.150116920471191, "learning_rate": 1.3974058769513316e-05, "loss": 0.0567, "step": 26250 }, { "epoch": 6.037649219467402, "grad_norm": 3.5049636363983154, "learning_rate": 1.3962580348943985e-05, "loss": 0.055, "step": 26300 }, { "epoch": 6.049127640036731, "grad_norm": 1.4526829719543457, "learning_rate": 1.3951101928374657e-05, "loss": 0.0457, "step": 26350 }, { "epoch": 6.0606060606060606, "grad_norm": 3.735417604446411, "learning_rate": 1.3939623507805326e-05, "loss": 0.0575, "step": 26400 }, { "epoch": 6.0720844811753905, "grad_norm": 12.61750602722168, "learning_rate": 1.3928145087235997e-05, "loss": 0.0567, "step": 26450 }, { "epoch": 6.08356290174472, "grad_norm": 7.446401596069336, "learning_rate": 1.3916666666666667e-05, "loss": 0.0545, "step": 26500 }, { "epoch": 6.095041322314049, "grad_norm": 4.25519323348999, "learning_rate": 1.3905188246097338e-05, "loss": 0.0582, "step": 26550 }, { "epoch": 6.106519742883379, "grad_norm": 2.447784185409546, "learning_rate": 1.3893709825528007e-05, "loss": 0.0471, "step": 26600 }, { "epoch": 6.117998163452709, "grad_norm": 3.3071296215057373, "learning_rate": 1.3882231404958678e-05, "loss": 0.0569, "step": 26650 }, { "epoch": 6.129476584022038, "grad_norm": 3.5032758712768555, "learning_rate": 1.3870752984389348e-05, "loss": 0.0522, "step": 26700 }, { "epoch": 6.140955004591368, "grad_norm": 1.7730631828308105, "learning_rate": 1.3859274563820019e-05, "loss": 0.0544, "step": 26750 }, { "epoch": 6.152433425160698, "grad_norm": 3.405440092086792, "learning_rate": 1.3847796143250689e-05, "loss": 0.0543, "step": 26800 }, { "epoch": 6.163911845730028, "grad_norm": 6.4918131828308105, "learning_rate": 1.383631772268136e-05, "loss": 0.0505, "step": 26850 }, { "epoch": 6.175390266299357, "grad_norm": 2.4078381061553955, "learning_rate": 1.382483930211203e-05, "loss": 0.0667, "step": 26900 }, { "epoch": 6.186868686868687, "grad_norm": 1.5094975233078003, "learning_rate": 1.38133608815427e-05, "loss": 0.0496, "step": 26950 }, { "epoch": 6.198347107438017, "grad_norm": 8.32642650604248, "learning_rate": 1.380188246097337e-05, "loss": 0.0525, "step": 27000 }, { "epoch": 6.209825528007346, "grad_norm": 1.715224266052246, "learning_rate": 1.3790404040404041e-05, "loss": 0.0461, "step": 27050 }, { "epoch": 6.221303948576676, "grad_norm": 3.4211654663085938, "learning_rate": 1.377892561983471e-05, "loss": 0.0652, "step": 27100 }, { "epoch": 6.232782369146006, "grad_norm": 18.152185440063477, "learning_rate": 1.3767447199265382e-05, "loss": 0.0529, "step": 27150 }, { "epoch": 6.2442607897153355, "grad_norm": 4.866644859313965, "learning_rate": 1.3755968778696051e-05, "loss": 0.0479, "step": 27200 }, { "epoch": 6.2557392102846645, "grad_norm": 6.297235012054443, "learning_rate": 1.3744490358126722e-05, "loss": 0.0614, "step": 27250 }, { "epoch": 6.267217630853994, "grad_norm": 8.21524429321289, "learning_rate": 1.3733011937557392e-05, "loss": 0.062, "step": 27300 }, { "epoch": 6.278696051423324, "grad_norm": 5.716306686401367, "learning_rate": 1.3721533516988063e-05, "loss": 0.0662, "step": 27350 }, { "epoch": 6.290174471992654, "grad_norm": 13.417250633239746, "learning_rate": 1.3710055096418732e-05, "loss": 0.0486, "step": 27400 }, { "epoch": 6.301652892561983, "grad_norm": 9.384743690490723, "learning_rate": 1.3698576675849404e-05, "loss": 0.0499, "step": 27450 }, { "epoch": 6.313131313131313, "grad_norm": 3.7721974849700928, "learning_rate": 1.3687098255280073e-05, "loss": 0.0477, "step": 27500 }, { "epoch": 6.324609733700643, "grad_norm": 4.3138298988342285, "learning_rate": 1.3675619834710744e-05, "loss": 0.0644, "step": 27550 }, { "epoch": 6.336088154269972, "grad_norm": 7.6092329025268555, "learning_rate": 1.3664141414141414e-05, "loss": 0.0479, "step": 27600 }, { "epoch": 6.347566574839302, "grad_norm": 25.45762062072754, "learning_rate": 1.3652662993572085e-05, "loss": 0.06, "step": 27650 }, { "epoch": 6.359044995408632, "grad_norm": 8.24765396118164, "learning_rate": 1.3641184573002758e-05, "loss": 0.0472, "step": 27700 }, { "epoch": 6.370523415977962, "grad_norm": 24.076820373535156, "learning_rate": 1.3629706152433425e-05, "loss": 0.0622, "step": 27750 }, { "epoch": 6.382001836547291, "grad_norm": 1.1568782329559326, "learning_rate": 1.3618227731864098e-05, "loss": 0.0425, "step": 27800 }, { "epoch": 6.393480257116621, "grad_norm": 24.59627342224121, "learning_rate": 1.3606749311294766e-05, "loss": 0.0417, "step": 27850 }, { "epoch": 6.404958677685951, "grad_norm": 5.506301403045654, "learning_rate": 1.3595270890725439e-05, "loss": 0.0509, "step": 27900 }, { "epoch": 6.41643709825528, "grad_norm": 1.0578150749206543, "learning_rate": 1.3583792470156107e-05, "loss": 0.0457, "step": 27950 }, { "epoch": 6.4279155188246095, "grad_norm": 2.7465415000915527, "learning_rate": 1.357231404958678e-05, "loss": 0.0424, "step": 28000 }, { "epoch": 6.4393939393939394, "grad_norm": 6.444134712219238, "learning_rate": 1.3560835629017447e-05, "loss": 0.0473, "step": 28050 }, { "epoch": 6.450872359963269, "grad_norm": 6.196394920349121, "learning_rate": 1.354935720844812e-05, "loss": 0.0562, "step": 28100 }, { "epoch": 6.462350780532598, "grad_norm": 21.69679832458496, "learning_rate": 1.3537878787878788e-05, "loss": 0.0599, "step": 28150 }, { "epoch": 6.473829201101928, "grad_norm": 2.0899710655212402, "learning_rate": 1.3526400367309461e-05, "loss": 0.059, "step": 28200 }, { "epoch": 6.485307621671258, "grad_norm": 1.6385844945907593, "learning_rate": 1.3514921946740129e-05, "loss": 0.051, "step": 28250 }, { "epoch": 6.496786042240588, "grad_norm": 1.325900912284851, "learning_rate": 1.3503443526170802e-05, "loss": 0.046, "step": 28300 }, { "epoch": 6.508264462809917, "grad_norm": 0.2050769031047821, "learning_rate": 1.349196510560147e-05, "loss": 0.0708, "step": 28350 }, { "epoch": 6.519742883379247, "grad_norm": 4.3140082359313965, "learning_rate": 1.3480486685032142e-05, "loss": 0.0599, "step": 28400 }, { "epoch": 6.531221303948577, "grad_norm": 2.6614534854888916, "learning_rate": 1.346900826446281e-05, "loss": 0.0486, "step": 28450 }, { "epoch": 6.542699724517906, "grad_norm": 19.535537719726562, "learning_rate": 1.3457529843893483e-05, "loss": 0.0574, "step": 28500 }, { "epoch": 6.554178145087236, "grad_norm": 2.166963815689087, "learning_rate": 1.344605142332415e-05, "loss": 0.0609, "step": 28550 }, { "epoch": 6.565656565656566, "grad_norm": 3.7270150184631348, "learning_rate": 1.3434573002754823e-05, "loss": 0.0378, "step": 28600 }, { "epoch": 6.577134986225896, "grad_norm": 5.435728073120117, "learning_rate": 1.3423094582185491e-05, "loss": 0.0442, "step": 28650 }, { "epoch": 6.588613406795225, "grad_norm": 20.308040618896484, "learning_rate": 1.3411616161616164e-05, "loss": 0.0651, "step": 28700 }, { "epoch": 6.600091827364555, "grad_norm": 12.638875007629395, "learning_rate": 1.3400137741046832e-05, "loss": 0.0466, "step": 28750 }, { "epoch": 6.6115702479338845, "grad_norm": 3.274132251739502, "learning_rate": 1.3388659320477505e-05, "loss": 0.0578, "step": 28800 }, { "epoch": 6.623048668503214, "grad_norm": 11.721548080444336, "learning_rate": 1.3377180899908172e-05, "loss": 0.0594, "step": 28850 }, { "epoch": 6.634527089072543, "grad_norm": 53.534358978271484, "learning_rate": 1.3365702479338845e-05, "loss": 0.0533, "step": 28900 }, { "epoch": 6.646005509641873, "grad_norm": 1.387225866317749, "learning_rate": 1.3354224058769513e-05, "loss": 0.0451, "step": 28950 }, { "epoch": 6.657483930211203, "grad_norm": 1.3664370775222778, "learning_rate": 1.3342745638200186e-05, "loss": 0.0506, "step": 29000 }, { "epoch": 6.668962350780532, "grad_norm": 27.240333557128906, "learning_rate": 1.3331267217630854e-05, "loss": 0.0468, "step": 29050 }, { "epoch": 6.680440771349862, "grad_norm": 0.6074036955833435, "learning_rate": 1.3319788797061527e-05, "loss": 0.0412, "step": 29100 }, { "epoch": 6.691919191919192, "grad_norm": 6.00919771194458, "learning_rate": 1.3308310376492194e-05, "loss": 0.0634, "step": 29150 }, { "epoch": 6.703397612488522, "grad_norm": 4.426175117492676, "learning_rate": 1.3296831955922867e-05, "loss": 0.0562, "step": 29200 }, { "epoch": 6.714876033057851, "grad_norm": 6.7816290855407715, "learning_rate": 1.3285353535353535e-05, "loss": 0.0522, "step": 29250 }, { "epoch": 6.726354453627181, "grad_norm": 5.750943183898926, "learning_rate": 1.3273875114784208e-05, "loss": 0.0607, "step": 29300 }, { "epoch": 6.737832874196511, "grad_norm": 2.613318920135498, "learning_rate": 1.3262396694214876e-05, "loss": 0.0679, "step": 29350 }, { "epoch": 6.749311294765841, "grad_norm": 9.185781478881836, "learning_rate": 1.3250918273645549e-05, "loss": 0.0566, "step": 29400 }, { "epoch": 6.76078971533517, "grad_norm": 22.78681755065918, "learning_rate": 1.3239439853076216e-05, "loss": 0.0618, "step": 29450 }, { "epoch": 6.7722681359045, "grad_norm": 15.7301025390625, "learning_rate": 1.322796143250689e-05, "loss": 0.0533, "step": 29500 }, { "epoch": 6.7837465564738295, "grad_norm": 5.148148536682129, "learning_rate": 1.3216483011937557e-05, "loss": 0.0433, "step": 29550 }, { "epoch": 6.7952249770431585, "grad_norm": 1.9450081586837769, "learning_rate": 1.320500459136823e-05, "loss": 0.053, "step": 29600 }, { "epoch": 6.806703397612488, "grad_norm": 4.65706205368042, "learning_rate": 1.3193526170798898e-05, "loss": 0.052, "step": 29650 }, { "epoch": 6.818181818181818, "grad_norm": 26.548072814941406, "learning_rate": 1.318204775022957e-05, "loss": 0.0556, "step": 29700 }, { "epoch": 6.829660238751147, "grad_norm": 3.8676860332489014, "learning_rate": 1.3170569329660238e-05, "loss": 0.0577, "step": 29750 }, { "epoch": 6.841138659320477, "grad_norm": 0.8915737271308899, "learning_rate": 1.3159090909090911e-05, "loss": 0.0604, "step": 29800 }, { "epoch": 6.852617079889807, "grad_norm": 6.853430271148682, "learning_rate": 1.3147612488521579e-05, "loss": 0.0511, "step": 29850 }, { "epoch": 6.864095500459137, "grad_norm": 6.921698093414307, "learning_rate": 1.3136134067952252e-05, "loss": 0.041, "step": 29900 }, { "epoch": 6.875573921028466, "grad_norm": 4.4696149826049805, "learning_rate": 1.312465564738292e-05, "loss": 0.0448, "step": 29950 }, { "epoch": 6.887052341597796, "grad_norm": 7.3403520584106445, "learning_rate": 1.3113177226813592e-05, "loss": 0.0498, "step": 30000 }, { "epoch": 6.898530762167126, "grad_norm": 34.43242263793945, "learning_rate": 1.3101698806244264e-05, "loss": 0.0523, "step": 30050 }, { "epoch": 6.910009182736456, "grad_norm": 8.639431953430176, "learning_rate": 1.3090220385674931e-05, "loss": 0.0433, "step": 30100 }, { "epoch": 6.921487603305785, "grad_norm": 2.6271109580993652, "learning_rate": 1.3078741965105604e-05, "loss": 0.0481, "step": 30150 }, { "epoch": 6.932966023875115, "grad_norm": 16.850522994995117, "learning_rate": 1.3067263544536272e-05, "loss": 0.0591, "step": 30200 }, { "epoch": 6.944444444444445, "grad_norm": 10.231602668762207, "learning_rate": 1.3055785123966945e-05, "loss": 0.0703, "step": 30250 }, { "epoch": 6.955922865013774, "grad_norm": 1.738244652748108, "learning_rate": 1.3044306703397613e-05, "loss": 0.0365, "step": 30300 }, { "epoch": 6.967401285583104, "grad_norm": 2.247811794281006, "learning_rate": 1.3032828282828285e-05, "loss": 0.0646, "step": 30350 }, { "epoch": 6.9788797061524335, "grad_norm": 0.34755614399909973, "learning_rate": 1.3021349862258953e-05, "loss": 0.0459, "step": 30400 }, { "epoch": 6.990358126721763, "grad_norm": 6.685552597045898, "learning_rate": 1.3009871441689626e-05, "loss": 0.0525, "step": 30450 }, { "epoch": 7.0, "eval_loss": 0.01915881782770157, "eval_runtime": 76.6273, "eval_samples_per_second": 101.048, "eval_steps_per_second": 6.316, "step": 30492 }, { "epoch": 7.001836547291092, "grad_norm": 5.556710243225098, "learning_rate": 1.2998393021120294e-05, "loss": 0.045, "step": 30500 }, { "epoch": 7.013314967860422, "grad_norm": 0.7394046783447266, "learning_rate": 1.2986914600550967e-05, "loss": 0.054, "step": 30550 }, { "epoch": 7.024793388429752, "grad_norm": 5.4557061195373535, "learning_rate": 1.2975436179981634e-05, "loss": 0.0583, "step": 30600 }, { "epoch": 7.036271808999082, "grad_norm": 4.826509475708008, "learning_rate": 1.2963957759412307e-05, "loss": 0.0471, "step": 30650 }, { "epoch": 7.047750229568411, "grad_norm": 1.5092955827713013, "learning_rate": 1.2952479338842975e-05, "loss": 0.0455, "step": 30700 }, { "epoch": 7.059228650137741, "grad_norm": 1.1466028690338135, "learning_rate": 1.2941000918273648e-05, "loss": 0.0488, "step": 30750 }, { "epoch": 7.070707070707071, "grad_norm": 1.5234471559524536, "learning_rate": 1.2929522497704316e-05, "loss": 0.0448, "step": 30800 }, { "epoch": 7.0821854912764, "grad_norm": 47.59840774536133, "learning_rate": 1.2918044077134989e-05, "loss": 0.0577, "step": 30850 }, { "epoch": 7.09366391184573, "grad_norm": 6.229769229888916, "learning_rate": 1.2906565656565656e-05, "loss": 0.0487, "step": 30900 }, { "epoch": 7.10514233241506, "grad_norm": 12.753508567810059, "learning_rate": 1.289508723599633e-05, "loss": 0.0507, "step": 30950 }, { "epoch": 7.11662075298439, "grad_norm": 32.12752151489258, "learning_rate": 1.2883608815426997e-05, "loss": 0.0491, "step": 31000 }, { "epoch": 7.128099173553719, "grad_norm": 13.951565742492676, "learning_rate": 1.287213039485767e-05, "loss": 0.0549, "step": 31050 }, { "epoch": 7.139577594123049, "grad_norm": 3.9033539295196533, "learning_rate": 1.2860651974288338e-05, "loss": 0.0435, "step": 31100 }, { "epoch": 7.1510560146923785, "grad_norm": 29.589632034301758, "learning_rate": 1.284917355371901e-05, "loss": 0.0528, "step": 31150 }, { "epoch": 7.162534435261708, "grad_norm": 0.29234039783477783, "learning_rate": 1.2837695133149678e-05, "loss": 0.05, "step": 31200 }, { "epoch": 7.174012855831037, "grad_norm": 33.43843078613281, "learning_rate": 1.2826216712580351e-05, "loss": 0.0415, "step": 31250 }, { "epoch": 7.185491276400367, "grad_norm": 6.711655139923096, "learning_rate": 1.2814738292011019e-05, "loss": 0.047, "step": 31300 }, { "epoch": 7.196969696969697, "grad_norm": 0.34917572140693665, "learning_rate": 1.2803259871441692e-05, "loss": 0.0432, "step": 31350 }, { "epoch": 7.208448117539026, "grad_norm": 0.1620061844587326, "learning_rate": 1.279178145087236e-05, "loss": 0.0374, "step": 31400 }, { "epoch": 7.219926538108356, "grad_norm": 10.62319278717041, "learning_rate": 1.2780303030303032e-05, "loss": 0.0531, "step": 31450 }, { "epoch": 7.231404958677686, "grad_norm": 2.900805711746216, "learning_rate": 1.27688246097337e-05, "loss": 0.0331, "step": 31500 }, { "epoch": 7.242883379247016, "grad_norm": 8.705903053283691, "learning_rate": 1.2757346189164373e-05, "loss": 0.0581, "step": 31550 }, { "epoch": 7.254361799816345, "grad_norm": 9.089072227478027, "learning_rate": 1.274586776859504e-05, "loss": 0.0614, "step": 31600 }, { "epoch": 7.265840220385675, "grad_norm": 1.296374797821045, "learning_rate": 1.2734389348025714e-05, "loss": 0.0567, "step": 31650 }, { "epoch": 7.277318640955005, "grad_norm": 8.041852951049805, "learning_rate": 1.2722910927456381e-05, "loss": 0.0538, "step": 31700 }, { "epoch": 7.288797061524335, "grad_norm": 1.5367282629013062, "learning_rate": 1.2711432506887054e-05, "loss": 0.0498, "step": 31750 }, { "epoch": 7.300275482093664, "grad_norm": 32.02037048339844, "learning_rate": 1.2699954086317722e-05, "loss": 0.0417, "step": 31800 }, { "epoch": 7.311753902662994, "grad_norm": 0.8069809675216675, "learning_rate": 1.2688475665748395e-05, "loss": 0.0446, "step": 31850 }, { "epoch": 7.3232323232323235, "grad_norm": 6.496412754058838, "learning_rate": 1.2676997245179063e-05, "loss": 0.0471, "step": 31900 }, { "epoch": 7.3347107438016526, "grad_norm": 2.1507184505462646, "learning_rate": 1.2665518824609736e-05, "loss": 0.0399, "step": 31950 }, { "epoch": 7.3461891643709825, "grad_norm": 0.3067164123058319, "learning_rate": 1.2654040404040403e-05, "loss": 0.034, "step": 32000 }, { "epoch": 7.357667584940312, "grad_norm": 4.468069553375244, "learning_rate": 1.2642561983471076e-05, "loss": 0.0514, "step": 32050 }, { "epoch": 7.369146005509642, "grad_norm": 0.5549958348274231, "learning_rate": 1.2631083562901744e-05, "loss": 0.0446, "step": 32100 }, { "epoch": 7.380624426078971, "grad_norm": 3.4569270610809326, "learning_rate": 1.2619605142332417e-05, "loss": 0.045, "step": 32150 }, { "epoch": 7.392102846648301, "grad_norm": 0.9428212642669678, "learning_rate": 1.2608126721763085e-05, "loss": 0.0463, "step": 32200 }, { "epoch": 7.403581267217631, "grad_norm": 14.596854209899902, "learning_rate": 1.2596648301193758e-05, "loss": 0.0711, "step": 32250 }, { "epoch": 7.41505968778696, "grad_norm": 0.37691494822502136, "learning_rate": 1.2585169880624425e-05, "loss": 0.0414, "step": 32300 }, { "epoch": 7.42653810835629, "grad_norm": 8.298638343811035, "learning_rate": 1.2573691460055098e-05, "loss": 0.0472, "step": 32350 }, { "epoch": 7.43801652892562, "grad_norm": 4.2591705322265625, "learning_rate": 1.256221303948577e-05, "loss": 0.0705, "step": 32400 }, { "epoch": 7.44949494949495, "grad_norm": 17.276363372802734, "learning_rate": 1.2550734618916439e-05, "loss": 0.0498, "step": 32450 }, { "epoch": 7.460973370064279, "grad_norm": 3.541402578353882, "learning_rate": 1.253925619834711e-05, "loss": 0.0472, "step": 32500 }, { "epoch": 7.472451790633609, "grad_norm": 26.691303253173828, "learning_rate": 1.252777777777778e-05, "loss": 0.0542, "step": 32550 }, { "epoch": 7.483930211202939, "grad_norm": 0.3552834689617157, "learning_rate": 1.251629935720845e-05, "loss": 0.0431, "step": 32600 }, { "epoch": 7.495408631772268, "grad_norm": 11.444090843200684, "learning_rate": 1.250482093663912e-05, "loss": 0.0359, "step": 32650 }, { "epoch": 7.506887052341598, "grad_norm": 0.06625628471374512, "learning_rate": 1.2493342516069791e-05, "loss": 0.056, "step": 32700 }, { "epoch": 7.5183654729109275, "grad_norm": 1.0319030284881592, "learning_rate": 1.248186409550046e-05, "loss": 0.046, "step": 32750 }, { "epoch": 7.529843893480257, "grad_norm": 0.6352688074111938, "learning_rate": 1.2470385674931132e-05, "loss": 0.0533, "step": 32800 }, { "epoch": 7.541322314049586, "grad_norm": 3.268843412399292, "learning_rate": 1.2458907254361801e-05, "loss": 0.0418, "step": 32850 }, { "epoch": 7.552800734618916, "grad_norm": 30.818876266479492, "learning_rate": 1.2447428833792473e-05, "loss": 0.0445, "step": 32900 }, { "epoch": 7.564279155188246, "grad_norm": 22.012792587280273, "learning_rate": 1.2435950413223142e-05, "loss": 0.0332, "step": 32950 }, { "epoch": 7.575757575757576, "grad_norm": 5.019522666931152, "learning_rate": 1.2424471992653813e-05, "loss": 0.0558, "step": 33000 }, { "epoch": 7.587235996326905, "grad_norm": 8.356417655944824, "learning_rate": 1.2412993572084483e-05, "loss": 0.0426, "step": 33050 }, { "epoch": 7.598714416896235, "grad_norm": 6.667174816131592, "learning_rate": 1.2401515151515154e-05, "loss": 0.0591, "step": 33100 }, { "epoch": 7.610192837465565, "grad_norm": 4.474754810333252, "learning_rate": 1.2390036730945823e-05, "loss": 0.0462, "step": 33150 }, { "epoch": 7.621671258034894, "grad_norm": 4.941595077514648, "learning_rate": 1.2378558310376494e-05, "loss": 0.0515, "step": 33200 }, { "epoch": 7.633149678604224, "grad_norm": 4.913635730743408, "learning_rate": 1.2367079889807164e-05, "loss": 0.0449, "step": 33250 }, { "epoch": 7.644628099173554, "grad_norm": 18.17800521850586, "learning_rate": 1.2355601469237835e-05, "loss": 0.0506, "step": 33300 }, { "epoch": 7.656106519742884, "grad_norm": 5.670760154724121, "learning_rate": 1.2344123048668505e-05, "loss": 0.0394, "step": 33350 }, { "epoch": 7.667584940312213, "grad_norm": 7.622896194458008, "learning_rate": 1.2332644628099176e-05, "loss": 0.0499, "step": 33400 }, { "epoch": 7.679063360881543, "grad_norm": 7.355600833892822, "learning_rate": 1.2321166207529845e-05, "loss": 0.06, "step": 33450 }, { "epoch": 7.6905417814508725, "grad_norm": 10.804757118225098, "learning_rate": 1.2309687786960516e-05, "loss": 0.0406, "step": 33500 }, { "epoch": 7.702020202020202, "grad_norm": 15.45306396484375, "learning_rate": 1.2298209366391186e-05, "loss": 0.0492, "step": 33550 }, { "epoch": 7.7134986225895315, "grad_norm": 5.345247745513916, "learning_rate": 1.2286730945821857e-05, "loss": 0.0411, "step": 33600 }, { "epoch": 7.724977043158861, "grad_norm": 6.32952356338501, "learning_rate": 1.2275252525252526e-05, "loss": 0.0521, "step": 33650 }, { "epoch": 7.736455463728191, "grad_norm": 0.02861524373292923, "learning_rate": 1.2263774104683198e-05, "loss": 0.0374, "step": 33700 }, { "epoch": 7.74793388429752, "grad_norm": 8.649580001831055, "learning_rate": 1.2252295684113867e-05, "loss": 0.0451, "step": 33750 }, { "epoch": 7.75941230486685, "grad_norm": 8.837603569030762, "learning_rate": 1.2240817263544538e-05, "loss": 0.0535, "step": 33800 }, { "epoch": 7.77089072543618, "grad_norm": 12.859838485717773, "learning_rate": 1.2229338842975208e-05, "loss": 0.0436, "step": 33850 }, { "epoch": 7.78236914600551, "grad_norm": 8.033489227294922, "learning_rate": 1.2217860422405879e-05, "loss": 0.049, "step": 33900 }, { "epoch": 7.793847566574839, "grad_norm": 0.8371500968933105, "learning_rate": 1.2206382001836548e-05, "loss": 0.049, "step": 33950 }, { "epoch": 7.805325987144169, "grad_norm": 15.77631950378418, "learning_rate": 1.219490358126722e-05, "loss": 0.0548, "step": 34000 }, { "epoch": 7.816804407713499, "grad_norm": 10.52796745300293, "learning_rate": 1.2183425160697889e-05, "loss": 0.0424, "step": 34050 }, { "epoch": 7.828282828282829, "grad_norm": 4.152801990509033, "learning_rate": 1.217194674012856e-05, "loss": 0.0436, "step": 34100 }, { "epoch": 7.839761248852158, "grad_norm": 0.09426809847354889, "learning_rate": 1.216046831955923e-05, "loss": 0.0449, "step": 34150 }, { "epoch": 7.851239669421488, "grad_norm": 8.26068115234375, "learning_rate": 1.21489898989899e-05, "loss": 0.0614, "step": 34200 }, { "epoch": 7.862718089990818, "grad_norm": 3.072333574295044, "learning_rate": 1.213751147842057e-05, "loss": 0.0501, "step": 34250 }, { "epoch": 7.874196510560147, "grad_norm": 6.068106174468994, "learning_rate": 1.2126033057851241e-05, "loss": 0.0631, "step": 34300 }, { "epoch": 7.8856749311294765, "grad_norm": 3.627613067626953, "learning_rate": 1.2114554637281911e-05, "loss": 0.0394, "step": 34350 }, { "epoch": 7.897153351698806, "grad_norm": 8.929315567016602, "learning_rate": 1.2103076216712582e-05, "loss": 0.0412, "step": 34400 }, { "epoch": 7.908631772268135, "grad_norm": 0.3535842299461365, "learning_rate": 1.2091597796143252e-05, "loss": 0.0495, "step": 34450 }, { "epoch": 7.920110192837465, "grad_norm": 1.072227120399475, "learning_rate": 1.2080119375573923e-05, "loss": 0.0466, "step": 34500 }, { "epoch": 7.931588613406795, "grad_norm": 8.14429759979248, "learning_rate": 1.2068640955004592e-05, "loss": 0.0506, "step": 34550 }, { "epoch": 7.943067033976125, "grad_norm": 3.011425495147705, "learning_rate": 1.2057162534435263e-05, "loss": 0.0547, "step": 34600 }, { "epoch": 7.954545454545455, "grad_norm": 8.273822784423828, "learning_rate": 1.2045684113865933e-05, "loss": 0.0449, "step": 34650 }, { "epoch": 7.966023875114784, "grad_norm": 7.706473350524902, "learning_rate": 1.2034205693296604e-05, "loss": 0.0658, "step": 34700 }, { "epoch": 7.977502295684114, "grad_norm": 5.488572120666504, "learning_rate": 1.2022727272727275e-05, "loss": 0.0634, "step": 34750 }, { "epoch": 7.988980716253444, "grad_norm": 174.94033813476562, "learning_rate": 1.2011248852157945e-05, "loss": 0.0434, "step": 34800 }, { "epoch": 8.0, "eval_loss": 0.016729312017560005, "eval_runtime": 76.2552, "eval_samples_per_second": 101.541, "eval_steps_per_second": 6.347, "step": 34848 }, { "epoch": 8.000459136822773, "grad_norm": 1.1001577377319336, "learning_rate": 1.1999770431588616e-05, "loss": 0.0526, "step": 34850 }, { "epoch": 8.011937557392104, "grad_norm": 1.4751074314117432, "learning_rate": 1.1988292011019285e-05, "loss": 0.0406, "step": 34900 }, { "epoch": 8.023415977961433, "grad_norm": 4.765047073364258, "learning_rate": 1.1976813590449956e-05, "loss": 0.0479, "step": 34950 }, { "epoch": 8.034894398530762, "grad_norm": 4.843242168426514, "learning_rate": 1.1965335169880626e-05, "loss": 0.0423, "step": 35000 }, { "epoch": 8.046372819100092, "grad_norm": 0.6305195689201355, "learning_rate": 1.1953856749311297e-05, "loss": 0.058, "step": 35050 }, { "epoch": 8.057851239669422, "grad_norm": 0.30947351455688477, "learning_rate": 1.1942378328741967e-05, "loss": 0.0413, "step": 35100 }, { "epoch": 8.06932966023875, "grad_norm": 0.871351420879364, "learning_rate": 1.1930899908172638e-05, "loss": 0.0371, "step": 35150 }, { "epoch": 8.080808080808081, "grad_norm": 2.7750191688537598, "learning_rate": 1.1919421487603307e-05, "loss": 0.0468, "step": 35200 }, { "epoch": 8.09228650137741, "grad_norm": 8.000408172607422, "learning_rate": 1.1907943067033978e-05, "loss": 0.0535, "step": 35250 }, { "epoch": 8.10376492194674, "grad_norm": 3.2866196632385254, "learning_rate": 1.1896464646464648e-05, "loss": 0.0507, "step": 35300 }, { "epoch": 8.11524334251607, "grad_norm": 2.6242516040802, "learning_rate": 1.1884986225895319e-05, "loss": 0.0486, "step": 35350 }, { "epoch": 8.1267217630854, "grad_norm": 4.227886199951172, "learning_rate": 1.1873507805325988e-05, "loss": 0.0402, "step": 35400 }, { "epoch": 8.13820018365473, "grad_norm": 18.829917907714844, "learning_rate": 1.186202938475666e-05, "loss": 0.0549, "step": 35450 }, { "epoch": 8.149678604224059, "grad_norm": 1.0419710874557495, "learning_rate": 1.1850550964187329e-05, "loss": 0.0329, "step": 35500 }, { "epoch": 8.161157024793388, "grad_norm": 5.157543659210205, "learning_rate": 1.1839072543618e-05, "loss": 0.0517, "step": 35550 }, { "epoch": 8.172635445362719, "grad_norm": 6.277066230773926, "learning_rate": 1.182759412304867e-05, "loss": 0.0462, "step": 35600 }, { "epoch": 8.184113865932048, "grad_norm": 4.789900302886963, "learning_rate": 1.1816115702479341e-05, "loss": 0.0481, "step": 35650 }, { "epoch": 8.195592286501377, "grad_norm": 8.018492698669434, "learning_rate": 1.180463728191001e-05, "loss": 0.0468, "step": 35700 }, { "epoch": 8.207070707070708, "grad_norm": 1.7527846097946167, "learning_rate": 1.1793158861340681e-05, "loss": 0.0464, "step": 35750 }, { "epoch": 8.218549127640037, "grad_norm": 0.3079041540622711, "learning_rate": 1.1781680440771351e-05, "loss": 0.0381, "step": 35800 }, { "epoch": 8.230027548209366, "grad_norm": 1.6494468450546265, "learning_rate": 1.1770202020202022e-05, "loss": 0.0517, "step": 35850 }, { "epoch": 8.241505968778696, "grad_norm": 3.2260401248931885, "learning_rate": 1.1758723599632692e-05, "loss": 0.0434, "step": 35900 }, { "epoch": 8.252984389348025, "grad_norm": 2.658402919769287, "learning_rate": 1.1747245179063363e-05, "loss": 0.0433, "step": 35950 }, { "epoch": 8.264462809917354, "grad_norm": 3.176905870437622, "learning_rate": 1.1735766758494032e-05, "loss": 0.0588, "step": 36000 }, { "epoch": 8.275941230486685, "grad_norm": 7.359071254730225, "learning_rate": 1.1724288337924703e-05, "loss": 0.0337, "step": 36050 }, { "epoch": 8.287419651056014, "grad_norm": 14.401073455810547, "learning_rate": 1.1712809917355373e-05, "loss": 0.058, "step": 36100 }, { "epoch": 8.298898071625345, "grad_norm": 5.240502834320068, "learning_rate": 1.1701331496786044e-05, "loss": 0.044, "step": 36150 }, { "epoch": 8.310376492194674, "grad_norm": 0.6861463189125061, "learning_rate": 1.1689853076216714e-05, "loss": 0.0406, "step": 36200 }, { "epoch": 8.321854912764003, "grad_norm": 3.6862287521362305, "learning_rate": 1.1678374655647385e-05, "loss": 0.0443, "step": 36250 }, { "epoch": 8.333333333333334, "grad_norm": 8.920327186584473, "learning_rate": 1.1666896235078054e-05, "loss": 0.035, "step": 36300 }, { "epoch": 8.344811753902663, "grad_norm": 2.025834083557129, "learning_rate": 1.1655417814508725e-05, "loss": 0.0402, "step": 36350 }, { "epoch": 8.356290174471992, "grad_norm": 2.055579900741577, "learning_rate": 1.1643939393939395e-05, "loss": 0.0406, "step": 36400 }, { "epoch": 8.367768595041323, "grad_norm": 3.195596933364868, "learning_rate": 1.1632460973370066e-05, "loss": 0.0377, "step": 36450 }, { "epoch": 8.379247015610652, "grad_norm": 4.6741108894348145, "learning_rate": 1.1620982552800735e-05, "loss": 0.0483, "step": 36500 }, { "epoch": 8.39072543617998, "grad_norm": 3.2810351848602295, "learning_rate": 1.1609504132231407e-05, "loss": 0.0465, "step": 36550 }, { "epoch": 8.402203856749312, "grad_norm": 0.6095794439315796, "learning_rate": 1.1598025711662076e-05, "loss": 0.0485, "step": 36600 }, { "epoch": 8.41368227731864, "grad_norm": 117.99384307861328, "learning_rate": 1.1586547291092747e-05, "loss": 0.0597, "step": 36650 }, { "epoch": 8.425160697887971, "grad_norm": 7.3065595626831055, "learning_rate": 1.1575068870523417e-05, "loss": 0.0387, "step": 36700 }, { "epoch": 8.4366391184573, "grad_norm": 2.3449833393096924, "learning_rate": 1.1563590449954088e-05, "loss": 0.0298, "step": 36750 }, { "epoch": 8.44811753902663, "grad_norm": 38.62355041503906, "learning_rate": 1.1552112029384757e-05, "loss": 0.0417, "step": 36800 }, { "epoch": 8.45959595959596, "grad_norm": 0.17742887139320374, "learning_rate": 1.1540633608815428e-05, "loss": 0.0444, "step": 36850 }, { "epoch": 8.47107438016529, "grad_norm": 12.625749588012695, "learning_rate": 1.1529155188246098e-05, "loss": 0.0411, "step": 36900 }, { "epoch": 8.482552800734618, "grad_norm": 9.287821769714355, "learning_rate": 1.1517676767676769e-05, "loss": 0.0403, "step": 36950 }, { "epoch": 8.494031221303949, "grad_norm": 1.3574144840240479, "learning_rate": 1.1506198347107439e-05, "loss": 0.0488, "step": 37000 }, { "epoch": 8.505509641873278, "grad_norm": 3.9417381286621094, "learning_rate": 1.149471992653811e-05, "loss": 0.0385, "step": 37050 }, { "epoch": 8.516988062442607, "grad_norm": 0.6221179366111755, "learning_rate": 1.1483241505968781e-05, "loss": 0.0372, "step": 37100 }, { "epoch": 8.528466483011938, "grad_norm": 0.9008545875549316, "learning_rate": 1.147176308539945e-05, "loss": 0.0337, "step": 37150 }, { "epoch": 8.539944903581267, "grad_norm": 1.975165605545044, "learning_rate": 1.1460284664830122e-05, "loss": 0.0355, "step": 37200 }, { "epoch": 8.551423324150598, "grad_norm": 6.644387722015381, "learning_rate": 1.1448806244260791e-05, "loss": 0.0395, "step": 37250 }, { "epoch": 8.562901744719927, "grad_norm": 15.01865291595459, "learning_rate": 1.1437327823691462e-05, "loss": 0.0307, "step": 37300 }, { "epoch": 8.574380165289256, "grad_norm": 4.933335304260254, "learning_rate": 1.1425849403122132e-05, "loss": 0.0487, "step": 37350 }, { "epoch": 8.585858585858587, "grad_norm": 1.2647886276245117, "learning_rate": 1.1414370982552803e-05, "loss": 0.052, "step": 37400 }, { "epoch": 8.597337006427916, "grad_norm": 4.2719526290893555, "learning_rate": 1.1402892561983472e-05, "loss": 0.04, "step": 37450 }, { "epoch": 8.608815426997245, "grad_norm": 12.289006233215332, "learning_rate": 1.1391414141414143e-05, "loss": 0.0515, "step": 37500 }, { "epoch": 8.620293847566575, "grad_norm": 25.737457275390625, "learning_rate": 1.1379935720844813e-05, "loss": 0.0377, "step": 37550 }, { "epoch": 8.631772268135904, "grad_norm": 1.7222784757614136, "learning_rate": 1.1368457300275484e-05, "loss": 0.04, "step": 37600 }, { "epoch": 8.643250688705233, "grad_norm": 2.792962074279785, "learning_rate": 1.1356978879706154e-05, "loss": 0.0405, "step": 37650 }, { "epoch": 8.654729109274564, "grad_norm": 44.52890396118164, "learning_rate": 1.1345500459136825e-05, "loss": 0.0403, "step": 37700 }, { "epoch": 8.666207529843893, "grad_norm": 2.666327476501465, "learning_rate": 1.1334022038567494e-05, "loss": 0.0454, "step": 37750 }, { "epoch": 8.677685950413224, "grad_norm": 3.5977885723114014, "learning_rate": 1.1322543617998165e-05, "loss": 0.0475, "step": 37800 }, { "epoch": 8.689164370982553, "grad_norm": 8.373371124267578, "learning_rate": 1.1311065197428835e-05, "loss": 0.039, "step": 37850 }, { "epoch": 8.700642791551882, "grad_norm": 4.166191101074219, "learning_rate": 1.1299586776859506e-05, "loss": 0.0398, "step": 37900 }, { "epoch": 8.712121212121213, "grad_norm": 9.34977912902832, "learning_rate": 1.1288108356290175e-05, "loss": 0.0405, "step": 37950 }, { "epoch": 8.723599632690542, "grad_norm": 1.3293687105178833, "learning_rate": 1.1276629935720847e-05, "loss": 0.0371, "step": 38000 }, { "epoch": 8.73507805325987, "grad_norm": 8.244464874267578, "learning_rate": 1.1265151515151516e-05, "loss": 0.0316, "step": 38050 }, { "epoch": 8.746556473829202, "grad_norm": 5.400195598602295, "learning_rate": 1.1253673094582187e-05, "loss": 0.0438, "step": 38100 }, { "epoch": 8.75803489439853, "grad_norm": 0.05519772320985794, "learning_rate": 1.1242194674012857e-05, "loss": 0.0571, "step": 38150 }, { "epoch": 8.76951331496786, "grad_norm": 6.469328880310059, "learning_rate": 1.1230716253443528e-05, "loss": 0.0362, "step": 38200 }, { "epoch": 8.78099173553719, "grad_norm": 1.1116520166397095, "learning_rate": 1.1219237832874197e-05, "loss": 0.0341, "step": 38250 }, { "epoch": 8.79247015610652, "grad_norm": 5.681356430053711, "learning_rate": 1.1207759412304869e-05, "loss": 0.0484, "step": 38300 }, { "epoch": 8.80394857667585, "grad_norm": 4.1637043952941895, "learning_rate": 1.1196280991735538e-05, "loss": 0.0312, "step": 38350 }, { "epoch": 8.81542699724518, "grad_norm": 4.019221782684326, "learning_rate": 1.118480257116621e-05, "loss": 0.03, "step": 38400 }, { "epoch": 8.826905417814508, "grad_norm": 25.72397232055664, "learning_rate": 1.1173324150596879e-05, "loss": 0.0552, "step": 38450 }, { "epoch": 8.83838383838384, "grad_norm": 5.5635223388671875, "learning_rate": 1.116184573002755e-05, "loss": 0.0448, "step": 38500 }, { "epoch": 8.849862258953168, "grad_norm": 5.186960220336914, "learning_rate": 1.115036730945822e-05, "loss": 0.0423, "step": 38550 }, { "epoch": 8.861340679522497, "grad_norm": 5.871897220611572, "learning_rate": 1.113888888888889e-05, "loss": 0.0337, "step": 38600 }, { "epoch": 8.872819100091828, "grad_norm": 34.960208892822266, "learning_rate": 1.112741046831956e-05, "loss": 0.0445, "step": 38650 }, { "epoch": 8.884297520661157, "grad_norm": 1.9662295579910278, "learning_rate": 1.1115932047750231e-05, "loss": 0.0376, "step": 38700 }, { "epoch": 8.895775941230486, "grad_norm": 0.4357619881629944, "learning_rate": 1.11044536271809e-05, "loss": 0.0431, "step": 38750 }, { "epoch": 8.907254361799817, "grad_norm": 92.24552154541016, "learning_rate": 1.1092975206611572e-05, "loss": 0.044, "step": 38800 }, { "epoch": 8.918732782369146, "grad_norm": 9.879362106323242, "learning_rate": 1.1081496786042241e-05, "loss": 0.0438, "step": 38850 }, { "epoch": 8.930211202938477, "grad_norm": 2.515532970428467, "learning_rate": 1.1070018365472912e-05, "loss": 0.0323, "step": 38900 }, { "epoch": 8.941689623507806, "grad_norm": 6.299307823181152, "learning_rate": 1.1058539944903582e-05, "loss": 0.0415, "step": 38950 }, { "epoch": 8.953168044077135, "grad_norm": 7.774527549743652, "learning_rate": 1.1047061524334253e-05, "loss": 0.045, "step": 39000 }, { "epoch": 8.964646464646465, "grad_norm": 11.112778663635254, "learning_rate": 1.1035583103764922e-05, "loss": 0.048, "step": 39050 }, { "epoch": 8.976124885215794, "grad_norm": 11.962413787841797, "learning_rate": 1.1024104683195594e-05, "loss": 0.0404, "step": 39100 }, { "epoch": 8.987603305785123, "grad_norm": 10.883230209350586, "learning_rate": 1.1012626262626263e-05, "loss": 0.045, "step": 39150 }, { "epoch": 8.999081726354454, "grad_norm": 0.13980256021022797, "learning_rate": 1.1001147842056934e-05, "loss": 0.0377, "step": 39200 }, { "epoch": 9.0, "eval_loss": 0.01716773770749569, "eval_runtime": 76.2391, "eval_samples_per_second": 101.562, "eval_steps_per_second": 6.348, "step": 39204 }, { "epoch": 9.010560146923783, "grad_norm": 15.515166282653809, "learning_rate": 1.0989669421487604e-05, "loss": 0.0355, "step": 39250 }, { "epoch": 9.022038567493112, "grad_norm": 18.02402687072754, "learning_rate": 1.0978191000918275e-05, "loss": 0.0447, "step": 39300 }, { "epoch": 9.033516988062443, "grad_norm": 11.112682342529297, "learning_rate": 1.0966712580348944e-05, "loss": 0.0382, "step": 39350 }, { "epoch": 9.044995408631772, "grad_norm": 12.32161808013916, "learning_rate": 1.0955234159779616e-05, "loss": 0.0457, "step": 39400 }, { "epoch": 9.056473829201101, "grad_norm": 5.1924214363098145, "learning_rate": 1.0943755739210287e-05, "loss": 0.035, "step": 39450 }, { "epoch": 9.067952249770432, "grad_norm": 2.899353504180908, "learning_rate": 1.0932277318640956e-05, "loss": 0.0329, "step": 39500 }, { "epoch": 9.079430670339761, "grad_norm": 2.215280055999756, "learning_rate": 1.0920798898071627e-05, "loss": 0.0333, "step": 39550 }, { "epoch": 9.090909090909092, "grad_norm": 3.4734854698181152, "learning_rate": 1.0909320477502297e-05, "loss": 0.0454, "step": 39600 }, { "epoch": 9.10238751147842, "grad_norm": 21.926847457885742, "learning_rate": 1.0897842056932968e-05, "loss": 0.0451, "step": 39650 }, { "epoch": 9.11386593204775, "grad_norm": 0.6164843440055847, "learning_rate": 1.0886363636363637e-05, "loss": 0.0463, "step": 39700 }, { "epoch": 9.12534435261708, "grad_norm": 1.438193917274475, "learning_rate": 1.0874885215794309e-05, "loss": 0.0394, "step": 39750 }, { "epoch": 9.13682277318641, "grad_norm": 16.018314361572266, "learning_rate": 1.0863406795224978e-05, "loss": 0.0492, "step": 39800 }, { "epoch": 9.148301193755739, "grad_norm": 3.2010951042175293, "learning_rate": 1.085192837465565e-05, "loss": 0.0772, "step": 39850 }, { "epoch": 9.15977961432507, "grad_norm": 1.5158915519714355, "learning_rate": 1.0840449954086319e-05, "loss": 0.0366, "step": 39900 }, { "epoch": 9.171258034894398, "grad_norm": 0.26671886444091797, "learning_rate": 1.082897153351699e-05, "loss": 0.04, "step": 39950 }, { "epoch": 9.182736455463727, "grad_norm": 0.10536785423755646, "learning_rate": 1.081749311294766e-05, "loss": 0.037, "step": 40000 }, { "epoch": 9.194214876033058, "grad_norm": 9.137739181518555, "learning_rate": 1.080601469237833e-05, "loss": 0.0446, "step": 40050 }, { "epoch": 9.205693296602387, "grad_norm": 5.278678894042969, "learning_rate": 1.0794536271809e-05, "loss": 0.0415, "step": 40100 }, { "epoch": 9.217171717171718, "grad_norm": 16.726768493652344, "learning_rate": 1.0783057851239671e-05, "loss": 0.0442, "step": 40150 }, { "epoch": 9.228650137741047, "grad_norm": 1.2483489513397217, "learning_rate": 1.077157943067034e-05, "loss": 0.0336, "step": 40200 }, { "epoch": 9.240128558310376, "grad_norm": 0.604836642742157, "learning_rate": 1.0760101010101012e-05, "loss": 0.0387, "step": 40250 }, { "epoch": 9.251606978879707, "grad_norm": 8.892909049987793, "learning_rate": 1.0748622589531681e-05, "loss": 0.0336, "step": 40300 }, { "epoch": 9.263085399449036, "grad_norm": 10.292204856872559, "learning_rate": 1.0737144168962352e-05, "loss": 0.0392, "step": 40350 }, { "epoch": 9.274563820018365, "grad_norm": 1.8617104291915894, "learning_rate": 1.0725665748393022e-05, "loss": 0.0397, "step": 40400 }, { "epoch": 9.286042240587696, "grad_norm": 3.834566354751587, "learning_rate": 1.0714187327823693e-05, "loss": 0.0566, "step": 40450 }, { "epoch": 9.297520661157025, "grad_norm": 2.3097574710845947, "learning_rate": 1.0702708907254363e-05, "loss": 0.024, "step": 40500 }, { "epoch": 9.308999081726354, "grad_norm": 2.451305866241455, "learning_rate": 1.0691230486685034e-05, "loss": 0.0412, "step": 40550 }, { "epoch": 9.320477502295685, "grad_norm": 24.204326629638672, "learning_rate": 1.0679752066115703e-05, "loss": 0.0356, "step": 40600 }, { "epoch": 9.331955922865014, "grad_norm": 29.253292083740234, "learning_rate": 1.0668273645546374e-05, "loss": 0.0413, "step": 40650 }, { "epoch": 9.343434343434343, "grad_norm": 2.595519781112671, "learning_rate": 1.0656795224977044e-05, "loss": 0.0368, "step": 40700 }, { "epoch": 9.354912764003673, "grad_norm": 0.10153094679117203, "learning_rate": 1.0645316804407715e-05, "loss": 0.0439, "step": 40750 }, { "epoch": 9.366391184573002, "grad_norm": 1.2798782587051392, "learning_rate": 1.0633838383838384e-05, "loss": 0.0391, "step": 40800 }, { "epoch": 9.377869605142333, "grad_norm": 1.1510014533996582, "learning_rate": 1.0622359963269056e-05, "loss": 0.0344, "step": 40850 }, { "epoch": 9.389348025711662, "grad_norm": 2.036505937576294, "learning_rate": 1.0610881542699725e-05, "loss": 0.0384, "step": 40900 }, { "epoch": 9.400826446280991, "grad_norm": 2.8112146854400635, "learning_rate": 1.0599403122130396e-05, "loss": 0.0304, "step": 40950 }, { "epoch": 9.412304866850322, "grad_norm": 2.634861707687378, "learning_rate": 1.0587924701561066e-05, "loss": 0.0472, "step": 41000 }, { "epoch": 9.423783287419651, "grad_norm": 0.061108626425266266, "learning_rate": 1.0576446280991737e-05, "loss": 0.0467, "step": 41050 }, { "epoch": 9.43526170798898, "grad_norm": 4.1158246994018555, "learning_rate": 1.0564967860422406e-05, "loss": 0.037, "step": 41100 }, { "epoch": 9.44674012855831, "grad_norm": 3.366605520248413, "learning_rate": 1.0553489439853078e-05, "loss": 0.0404, "step": 41150 }, { "epoch": 9.45821854912764, "grad_norm": 7.561800956726074, "learning_rate": 1.0542011019283747e-05, "loss": 0.0448, "step": 41200 }, { "epoch": 9.469696969696969, "grad_norm": 7.5595011711120605, "learning_rate": 1.0530532598714418e-05, "loss": 0.0419, "step": 41250 }, { "epoch": 9.4811753902663, "grad_norm": 0.12694787979125977, "learning_rate": 1.0519054178145088e-05, "loss": 0.0372, "step": 41300 }, { "epoch": 9.492653810835629, "grad_norm": 7.773220062255859, "learning_rate": 1.0507575757575759e-05, "loss": 0.0332, "step": 41350 }, { "epoch": 9.50413223140496, "grad_norm": 2.2563424110412598, "learning_rate": 1.0496097337006428e-05, "loss": 0.0423, "step": 41400 }, { "epoch": 9.515610651974288, "grad_norm": 2.7214303016662598, "learning_rate": 1.04846189164371e-05, "loss": 0.0374, "step": 41450 }, { "epoch": 9.527089072543617, "grad_norm": 1.6576491594314575, "learning_rate": 1.0473140495867769e-05, "loss": 0.0409, "step": 41500 }, { "epoch": 9.538567493112948, "grad_norm": 0.7711743712425232, "learning_rate": 1.046166207529844e-05, "loss": 0.0358, "step": 41550 }, { "epoch": 9.550045913682277, "grad_norm": 7.214943885803223, "learning_rate": 1.045018365472911e-05, "loss": 0.0388, "step": 41600 }, { "epoch": 9.561524334251606, "grad_norm": 1.1326754093170166, "learning_rate": 1.043870523415978e-05, "loss": 0.0407, "step": 41650 }, { "epoch": 9.573002754820937, "grad_norm": 0.5054967403411865, "learning_rate": 1.042722681359045e-05, "loss": 0.0389, "step": 41700 }, { "epoch": 9.584481175390266, "grad_norm": 34.790584564208984, "learning_rate": 1.0415748393021121e-05, "loss": 0.0362, "step": 41750 }, { "epoch": 9.595959595959595, "grad_norm": 6.4580206871032715, "learning_rate": 1.040426997245179e-05, "loss": 0.0298, "step": 41800 }, { "epoch": 9.607438016528926, "grad_norm": 3.479710817337036, "learning_rate": 1.0392791551882462e-05, "loss": 0.0492, "step": 41850 }, { "epoch": 9.618916437098255, "grad_norm": 2.8853957653045654, "learning_rate": 1.0381313131313133e-05, "loss": 0.0349, "step": 41900 }, { "epoch": 9.630394857667586, "grad_norm": 1.1016287803649902, "learning_rate": 1.0369834710743803e-05, "loss": 0.0351, "step": 41950 }, { "epoch": 9.641873278236915, "grad_norm": 0.6443769931793213, "learning_rate": 1.0358356290174474e-05, "loss": 0.0383, "step": 42000 }, { "epoch": 9.653351698806244, "grad_norm": 15.486309051513672, "learning_rate": 1.0346877869605143e-05, "loss": 0.0468, "step": 42050 }, { "epoch": 9.664830119375575, "grad_norm": 2.9086568355560303, "learning_rate": 1.0335399449035814e-05, "loss": 0.0398, "step": 42100 }, { "epoch": 9.676308539944904, "grad_norm": 0.7633037567138672, "learning_rate": 1.0323921028466484e-05, "loss": 0.0287, "step": 42150 }, { "epoch": 9.687786960514233, "grad_norm": 1.5675113201141357, "learning_rate": 1.0312442607897155e-05, "loss": 0.0342, "step": 42200 }, { "epoch": 9.699265381083563, "grad_norm": 1.1701419353485107, "learning_rate": 1.0300964187327825e-05, "loss": 0.044, "step": 42250 }, { "epoch": 9.710743801652892, "grad_norm": 12.629388809204102, "learning_rate": 1.0289485766758496e-05, "loss": 0.041, "step": 42300 }, { "epoch": 9.722222222222221, "grad_norm": 0.04379061236977577, "learning_rate": 1.0278007346189165e-05, "loss": 0.0339, "step": 42350 }, { "epoch": 9.733700642791552, "grad_norm": 6.223551273345947, "learning_rate": 1.0266528925619836e-05, "loss": 0.0313, "step": 42400 }, { "epoch": 9.745179063360881, "grad_norm": 0.9906750917434692, "learning_rate": 1.0255050505050506e-05, "loss": 0.0294, "step": 42450 }, { "epoch": 9.756657483930212, "grad_norm": 4.081332683563232, "learning_rate": 1.0243572084481177e-05, "loss": 0.0311, "step": 42500 }, { "epoch": 9.768135904499541, "grad_norm": 6.388052940368652, "learning_rate": 1.0232093663911846e-05, "loss": 0.0445, "step": 42550 }, { "epoch": 9.77961432506887, "grad_norm": 3.943826913833618, "learning_rate": 1.0220615243342518e-05, "loss": 0.0329, "step": 42600 }, { "epoch": 9.7910927456382, "grad_norm": 26.89732551574707, "learning_rate": 1.0209136822773187e-05, "loss": 0.0479, "step": 42650 }, { "epoch": 9.80257116620753, "grad_norm": 2.8657171726226807, "learning_rate": 1.0197658402203858e-05, "loss": 0.0384, "step": 42700 }, { "epoch": 9.814049586776859, "grad_norm": 5.352105617523193, "learning_rate": 1.0186179981634528e-05, "loss": 0.0366, "step": 42750 }, { "epoch": 9.82552800734619, "grad_norm": 0.2520829737186432, "learning_rate": 1.0174701561065199e-05, "loss": 0.0424, "step": 42800 }, { "epoch": 9.837006427915519, "grad_norm": 0.595446765422821, "learning_rate": 1.0163223140495868e-05, "loss": 0.0362, "step": 42850 }, { "epoch": 9.848484848484848, "grad_norm": 4.6209211349487305, "learning_rate": 1.015174471992654e-05, "loss": 0.0408, "step": 42900 }, { "epoch": 9.859963269054179, "grad_norm": 38.87813186645508, "learning_rate": 1.0140266299357209e-05, "loss": 0.0569, "step": 42950 }, { "epoch": 9.871441689623508, "grad_norm": 0.21165037155151367, "learning_rate": 1.012878787878788e-05, "loss": 0.0401, "step": 43000 }, { "epoch": 9.882920110192838, "grad_norm": 0.5416375994682312, "learning_rate": 1.011730945821855e-05, "loss": 0.0298, "step": 43050 }, { "epoch": 9.894398530762167, "grad_norm": 4.422178268432617, "learning_rate": 1.010583103764922e-05, "loss": 0.0299, "step": 43100 }, { "epoch": 9.905876951331496, "grad_norm": 7.897778511047363, "learning_rate": 1.009435261707989e-05, "loss": 0.0436, "step": 43150 }, { "epoch": 9.917355371900827, "grad_norm": 3.7302865982055664, "learning_rate": 1.0082874196510561e-05, "loss": 0.0327, "step": 43200 }, { "epoch": 9.928833792470156, "grad_norm": 3.7748823165893555, "learning_rate": 1.0071395775941231e-05, "loss": 0.0414, "step": 43250 }, { "epoch": 9.940312213039485, "grad_norm": 2.044543981552124, "learning_rate": 1.0059917355371902e-05, "loss": 0.0278, "step": 43300 }, { "epoch": 9.951790633608816, "grad_norm": 3.267146348953247, "learning_rate": 1.0048438934802572e-05, "loss": 0.0388, "step": 43350 }, { "epoch": 9.963269054178145, "grad_norm": 3.9683544635772705, "learning_rate": 1.0036960514233243e-05, "loss": 0.0322, "step": 43400 }, { "epoch": 9.974747474747474, "grad_norm": 2.942369222640991, "learning_rate": 1.0025482093663912e-05, "loss": 0.0513, "step": 43450 }, { "epoch": 9.986225895316805, "grad_norm": 2.7790541648864746, "learning_rate": 1.0014003673094583e-05, "loss": 0.0361, "step": 43500 }, { "epoch": 9.997704315886134, "grad_norm": 22.74088478088379, "learning_rate": 1.0002525252525253e-05, "loss": 0.0387, "step": 43550 }, { "epoch": 10.0, "eval_loss": 0.01870335452258587, "eval_runtime": 111.7828, "eval_samples_per_second": 69.268, "eval_steps_per_second": 4.33, "step": 43560 } ], "logging_steps": 50, "max_steps": 87120, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6314999178260347e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }