{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999347130639159, "eval_steps": 500, "global_step": 7658, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013057387216817914, "grad_norm": 7.0214951132907455, "learning_rate": 4.9999999999999996e-05, "loss": 5.3688, "step": 10 }, { "epoch": 0.002611477443363583, "grad_norm": 5.526045891152434, "learning_rate": 6.505149978319905e-05, "loss": 2.6941, "step": 20 }, { "epoch": 0.003917216165045374, "grad_norm": 2.7324451334570155, "learning_rate": 7.385606273598311e-05, "loss": 1.316, "step": 30 }, { "epoch": 0.005222954886727166, "grad_norm": 2.282593737859076, "learning_rate": 8.01029995663981e-05, "loss": 0.9746, "step": 40 }, { "epoch": 0.006528693608408957, "grad_norm": 1.8793409902100164, "learning_rate": 8.494850021680092e-05, "loss": 0.8996, "step": 50 }, { "epoch": 0.007834432330090749, "grad_norm": 1.7828447068912778, "learning_rate": 8.890756251918216e-05, "loss": 0.7875, "step": 60 }, { "epoch": 0.00914017105177254, "grad_norm": 1.568201519588466, "learning_rate": 9.225490200071284e-05, "loss": 0.6908, "step": 70 }, { "epoch": 0.010445909773454332, "grad_norm": 2.082981810512909, "learning_rate": 9.515449934959716e-05, "loss": 0.6858, "step": 80 }, { "epoch": 0.011751648495136123, "grad_norm": 1.3994825238930793, "learning_rate": 9.771212547196623e-05, "loss": 0.6509, "step": 90 }, { "epoch": 0.013057387216817914, "grad_norm": 1.6390187012026554, "learning_rate": 9.999999999999999e-05, "loss": 0.6263, "step": 100 }, { "epoch": 0.014363125938499706, "grad_norm": 0.9466340282583945, "learning_rate": 9.98809208785393e-05, "loss": 0.601, "step": 110 }, { "epoch": 0.015668864660181497, "grad_norm": 1.5229715668412607, "learning_rate": 9.974861074358295e-05, "loss": 0.5954, "step": 120 }, { "epoch": 0.01697460338186329, "grad_norm": 1.3112830688402828, "learning_rate": 9.961630060862662e-05, "loss": 0.5855, "step": 130 }, { "epoch": 0.01828034210354508, "grad_norm": 1.3809120875716931, "learning_rate": 9.948399047367028e-05, "loss": 0.5903, "step": 140 }, { "epoch": 0.01958608082522687, "grad_norm": 1.1446570482318412, "learning_rate": 9.935168033871395e-05, "loss": 0.5185, "step": 150 }, { "epoch": 0.020891819546908663, "grad_norm": 1.099138110422475, "learning_rate": 9.921937020375761e-05, "loss": 0.5654, "step": 160 }, { "epoch": 0.022197558268590455, "grad_norm": 1.2141818063147904, "learning_rate": 9.908706006880127e-05, "loss": 0.5041, "step": 170 }, { "epoch": 0.023503296990272246, "grad_norm": 1.1517042950358218, "learning_rate": 9.895474993384494e-05, "loss": 0.564, "step": 180 }, { "epoch": 0.024809035711954038, "grad_norm": 0.9796723680759294, "learning_rate": 9.88224397988886e-05, "loss": 0.5086, "step": 190 }, { "epoch": 0.02611477443363583, "grad_norm": 1.1902408719107485, "learning_rate": 9.869012966393227e-05, "loss": 0.4893, "step": 200 }, { "epoch": 0.02742051315531762, "grad_norm": 0.9121307914712619, "learning_rate": 9.855781952897593e-05, "loss": 0.5281, "step": 210 }, { "epoch": 0.028726251876999412, "grad_norm": 0.8683470165994364, "learning_rate": 9.84255093940196e-05, "loss": 0.5765, "step": 220 }, { "epoch": 0.030031990598681203, "grad_norm": 1.1996034985153523, "learning_rate": 9.829319925906324e-05, "loss": 0.5024, "step": 230 }, { "epoch": 0.031337729320362995, "grad_norm": 0.8937790685228523, "learning_rate": 9.816088912410691e-05, "loss": 0.4751, "step": 240 }, { "epoch": 0.032643468042044786, "grad_norm": 0.890527069670524, "learning_rate": 9.802857898915057e-05, "loss": 0.5339, "step": 250 }, { "epoch": 0.03394920676372658, "grad_norm": 0.9513384944623474, "learning_rate": 9.789626885419424e-05, "loss": 0.5377, "step": 260 }, { "epoch": 0.03525494548540837, "grad_norm": 1.2850114799131154, "learning_rate": 9.77639587192379e-05, "loss": 0.4389, "step": 270 }, { "epoch": 0.03656068420709016, "grad_norm": 1.2028503176228915, "learning_rate": 9.763164858428157e-05, "loss": 0.482, "step": 280 }, { "epoch": 0.03786642292877195, "grad_norm": 0.8383158153343386, "learning_rate": 9.749933844932523e-05, "loss": 0.492, "step": 290 }, { "epoch": 0.03917216165045374, "grad_norm": 0.9206639912616187, "learning_rate": 9.736702831436888e-05, "loss": 0.4448, "step": 300 }, { "epoch": 0.040477900372135535, "grad_norm": 0.8383883754593989, "learning_rate": 9.723471817941254e-05, "loss": 0.4812, "step": 310 }, { "epoch": 0.041783639093817326, "grad_norm": 1.0258817006500343, "learning_rate": 9.710240804445621e-05, "loss": 0.4532, "step": 320 }, { "epoch": 0.04308937781549912, "grad_norm": 0.7927596640945009, "learning_rate": 9.697009790949987e-05, "loss": 0.4549, "step": 330 }, { "epoch": 0.04439511653718091, "grad_norm": 0.8307862111837001, "learning_rate": 9.683778777454354e-05, "loss": 0.4194, "step": 340 }, { "epoch": 0.0457008552588627, "grad_norm": 0.9313521655236179, "learning_rate": 9.67054776395872e-05, "loss": 0.4661, "step": 350 }, { "epoch": 0.04700659398054449, "grad_norm": 1.0271573891738304, "learning_rate": 9.657316750463085e-05, "loss": 0.5069, "step": 360 }, { "epoch": 0.048312332702226284, "grad_norm": 1.0535262872455642, "learning_rate": 9.644085736967452e-05, "loss": 0.4792, "step": 370 }, { "epoch": 0.049618071423908075, "grad_norm": 0.8750590787378165, "learning_rate": 9.630854723471818e-05, "loss": 0.4602, "step": 380 }, { "epoch": 0.050923810145589866, "grad_norm": 1.052964032935776, "learning_rate": 9.617623709976184e-05, "loss": 0.4975, "step": 390 }, { "epoch": 0.05222954886727166, "grad_norm": 1.1918907361394102, "learning_rate": 9.604392696480551e-05, "loss": 0.5042, "step": 400 }, { "epoch": 0.05353528758895345, "grad_norm": 0.6794092255651759, "learning_rate": 9.591161682984917e-05, "loss": 0.4352, "step": 410 }, { "epoch": 0.05484102631063524, "grad_norm": 0.9715220585303527, "learning_rate": 9.577930669489284e-05, "loss": 0.4554, "step": 420 }, { "epoch": 0.05614676503231703, "grad_norm": 0.8414840824319874, "learning_rate": 9.56469965599365e-05, "loss": 0.4089, "step": 430 }, { "epoch": 0.057452503753998824, "grad_norm": 0.6883489226314008, "learning_rate": 9.551468642498016e-05, "loss": 0.449, "step": 440 }, { "epoch": 0.058758242475680615, "grad_norm": 0.9291967417707445, "learning_rate": 9.538237629002383e-05, "loss": 0.4148, "step": 450 }, { "epoch": 0.06006398119736241, "grad_norm": 1.0317189870505563, "learning_rate": 9.525006615506749e-05, "loss": 0.4898, "step": 460 }, { "epoch": 0.0613697199190442, "grad_norm": 0.7983144737592249, "learning_rate": 9.511775602011114e-05, "loss": 0.448, "step": 470 }, { "epoch": 0.06267545864072599, "grad_norm": 0.9106639223946725, "learning_rate": 9.498544588515481e-05, "loss": 0.4366, "step": 480 }, { "epoch": 0.06398119736240779, "grad_norm": 0.8124092599596697, "learning_rate": 9.485313575019847e-05, "loss": 0.3993, "step": 490 }, { "epoch": 0.06528693608408957, "grad_norm": 1.1326736647436941, "learning_rate": 9.472082561524214e-05, "loss": 0.4928, "step": 500 }, { "epoch": 0.06659267480577137, "grad_norm": 0.9221935550054978, "learning_rate": 9.45885154802858e-05, "loss": 0.4515, "step": 510 }, { "epoch": 0.06789841352745316, "grad_norm": 0.9127561844278823, "learning_rate": 9.445620534532945e-05, "loss": 0.4216, "step": 520 }, { "epoch": 0.06920415224913495, "grad_norm": 0.8331354387005372, "learning_rate": 9.432389521037311e-05, "loss": 0.4453, "step": 530 }, { "epoch": 0.07050989097081674, "grad_norm": 0.8418704678293376, "learning_rate": 9.419158507541678e-05, "loss": 0.4409, "step": 540 }, { "epoch": 0.07181562969249854, "grad_norm": 0.7363083824312942, "learning_rate": 9.405927494046044e-05, "loss": 0.4807, "step": 550 }, { "epoch": 0.07312136841418032, "grad_norm": 0.7294356390732534, "learning_rate": 9.39269648055041e-05, "loss": 0.4828, "step": 560 }, { "epoch": 0.07442710713586212, "grad_norm": 1.1473553177363394, "learning_rate": 9.379465467054777e-05, "loss": 0.4137, "step": 570 }, { "epoch": 0.0757328458575439, "grad_norm": 0.850745450813387, "learning_rate": 9.366234453559143e-05, "loss": 0.4461, "step": 580 }, { "epoch": 0.0770385845792257, "grad_norm": 0.8129818357433601, "learning_rate": 9.353003440063509e-05, "loss": 0.4441, "step": 590 }, { "epoch": 0.07834432330090749, "grad_norm": 0.6381617873606064, "learning_rate": 9.339772426567875e-05, "loss": 0.4351, "step": 600 }, { "epoch": 0.07965006202258929, "grad_norm": 1.150672415512087, "learning_rate": 9.326541413072241e-05, "loss": 0.405, "step": 610 }, { "epoch": 0.08095580074427107, "grad_norm": 0.7928681684584729, "learning_rate": 9.313310399576608e-05, "loss": 0.3906, "step": 620 }, { "epoch": 0.08226153946595287, "grad_norm": 0.6367292845299426, "learning_rate": 9.300079386080974e-05, "loss": 0.405, "step": 630 }, { "epoch": 0.08356727818763465, "grad_norm": 0.8779444643953495, "learning_rate": 9.28684837258534e-05, "loss": 0.4411, "step": 640 }, { "epoch": 0.08487301690931645, "grad_norm": 0.713411212870052, "learning_rate": 9.273617359089707e-05, "loss": 0.4342, "step": 650 }, { "epoch": 0.08617875563099824, "grad_norm": 0.6358041113268869, "learning_rate": 9.260386345594073e-05, "loss": 0.4781, "step": 660 }, { "epoch": 0.08748449435268003, "grad_norm": 0.9800618765485707, "learning_rate": 9.24715533209844e-05, "loss": 0.4599, "step": 670 }, { "epoch": 0.08879023307436182, "grad_norm": 0.926030362362806, "learning_rate": 9.233924318602806e-05, "loss": 0.4243, "step": 680 }, { "epoch": 0.09009597179604362, "grad_norm": 0.7587936617462951, "learning_rate": 9.220693305107173e-05, "loss": 0.423, "step": 690 }, { "epoch": 0.0914017105177254, "grad_norm": 0.8777471178826035, "learning_rate": 9.207462291611538e-05, "loss": 0.4455, "step": 700 }, { "epoch": 0.0927074492394072, "grad_norm": 0.8410323396176128, "learning_rate": 9.194231278115904e-05, "loss": 0.3687, "step": 710 }, { "epoch": 0.09401318796108898, "grad_norm": 0.7707193329049491, "learning_rate": 9.18100026462027e-05, "loss": 0.4498, "step": 720 }, { "epoch": 0.09531892668277078, "grad_norm": 0.9111299912049232, "learning_rate": 9.167769251124637e-05, "loss": 0.4297, "step": 730 }, { "epoch": 0.09662466540445257, "grad_norm": 0.8483195776365777, "learning_rate": 9.154538237629002e-05, "loss": 0.4844, "step": 740 }, { "epoch": 0.09793040412613437, "grad_norm": 0.6475520453488, "learning_rate": 9.141307224133368e-05, "loss": 0.4443, "step": 750 }, { "epoch": 0.09923614284781615, "grad_norm": 0.7637644072788764, "learning_rate": 9.128076210637735e-05, "loss": 0.4184, "step": 760 }, { "epoch": 0.10054188156949795, "grad_norm": 0.9609531606030173, "learning_rate": 9.114845197142101e-05, "loss": 0.4541, "step": 770 }, { "epoch": 0.10184762029117973, "grad_norm": 0.7069950344550908, "learning_rate": 9.101614183646468e-05, "loss": 0.3806, "step": 780 }, { "epoch": 0.10315335901286153, "grad_norm": 0.7403433889244767, "learning_rate": 9.088383170150834e-05, "loss": 0.4303, "step": 790 }, { "epoch": 0.10445909773454332, "grad_norm": 0.8041753054026264, "learning_rate": 9.0751521566552e-05, "loss": 0.4172, "step": 800 }, { "epoch": 0.10576483645622511, "grad_norm": 0.8036242187073421, "learning_rate": 9.061921143159567e-05, "loss": 0.3918, "step": 810 }, { "epoch": 0.1070705751779069, "grad_norm": 0.6898044362997443, "learning_rate": 9.048690129663933e-05, "loss": 0.4461, "step": 820 }, { "epoch": 0.1083763138995887, "grad_norm": 0.5709377447055167, "learning_rate": 9.035459116168298e-05, "loss": 0.4045, "step": 830 }, { "epoch": 0.10968205262127048, "grad_norm": 0.6220375429427475, "learning_rate": 9.022228102672665e-05, "loss": 0.4083, "step": 840 }, { "epoch": 0.11098779134295228, "grad_norm": 0.9323474858085096, "learning_rate": 9.008997089177031e-05, "loss": 0.481, "step": 850 }, { "epoch": 0.11229353006463406, "grad_norm": 0.881090958172637, "learning_rate": 8.995766075681398e-05, "loss": 0.4208, "step": 860 }, { "epoch": 0.11359926878631586, "grad_norm": 0.7296968162362631, "learning_rate": 8.982535062185764e-05, "loss": 0.4381, "step": 870 }, { "epoch": 0.11490500750799765, "grad_norm": 0.5739147796276914, "learning_rate": 8.96930404869013e-05, "loss": 0.4131, "step": 880 }, { "epoch": 0.11621074622967945, "grad_norm": 0.7078280035536865, "learning_rate": 8.956073035194497e-05, "loss": 0.3974, "step": 890 }, { "epoch": 0.11751648495136123, "grad_norm": 0.9554922790649535, "learning_rate": 8.942842021698863e-05, "loss": 0.4267, "step": 900 }, { "epoch": 0.11882222367304303, "grad_norm": 0.7706108206903026, "learning_rate": 8.92961100820323e-05, "loss": 0.3829, "step": 910 }, { "epoch": 0.12012796239472481, "grad_norm": 0.7174489954464013, "learning_rate": 8.916379994707596e-05, "loss": 0.3812, "step": 920 }, { "epoch": 0.12143370111640661, "grad_norm": 0.6950854318931686, "learning_rate": 8.903148981211961e-05, "loss": 0.4256, "step": 930 }, { "epoch": 0.1227394398380884, "grad_norm": 0.9125502859810919, "learning_rate": 8.889917967716327e-05, "loss": 0.4485, "step": 940 }, { "epoch": 0.1240451785597702, "grad_norm": 0.8512902284971161, "learning_rate": 8.876686954220693e-05, "loss": 0.4636, "step": 950 }, { "epoch": 0.12535091728145198, "grad_norm": 0.8051474001539173, "learning_rate": 8.863455940725059e-05, "loss": 0.4107, "step": 960 }, { "epoch": 0.12665665600313378, "grad_norm": 0.8939131740898922, "learning_rate": 8.850224927229425e-05, "loss": 0.4738, "step": 970 }, { "epoch": 0.12796239472481558, "grad_norm": 0.7739593115495323, "learning_rate": 8.836993913733792e-05, "loss": 0.4449, "step": 980 }, { "epoch": 0.12926813344649735, "grad_norm": 0.8462434894540093, "learning_rate": 8.823762900238158e-05, "loss": 0.3884, "step": 990 }, { "epoch": 0.13057387216817914, "grad_norm": 0.6719241963243315, "learning_rate": 8.810531886742525e-05, "loss": 0.4159, "step": 1000 }, { "epoch": 0.13187961088986094, "grad_norm": 0.6966284900804444, "learning_rate": 8.797300873246891e-05, "loss": 0.4042, "step": 1010 }, { "epoch": 0.13318534961154274, "grad_norm": 0.7910476587709023, "learning_rate": 8.784069859751257e-05, "loss": 0.4261, "step": 1020 }, { "epoch": 0.1344910883332245, "grad_norm": 0.592066096261948, "learning_rate": 8.770838846255624e-05, "loss": 0.4433, "step": 1030 }, { "epoch": 0.1357968270549063, "grad_norm": 0.7244590191230463, "learning_rate": 8.75760783275999e-05, "loss": 0.3646, "step": 1040 }, { "epoch": 0.1371025657765881, "grad_norm": 0.794590652611156, "learning_rate": 8.744376819264357e-05, "loss": 0.4376, "step": 1050 }, { "epoch": 0.1384083044982699, "grad_norm": 0.8975959590033455, "learning_rate": 8.731145805768722e-05, "loss": 0.415, "step": 1060 }, { "epoch": 0.13971404321995168, "grad_norm": 0.6436981619710428, "learning_rate": 8.717914792273088e-05, "loss": 0.4323, "step": 1070 }, { "epoch": 0.14101978194163348, "grad_norm": 0.8555043967490022, "learning_rate": 8.704683778777455e-05, "loss": 0.3922, "step": 1080 }, { "epoch": 0.14232552066331527, "grad_norm": 1.0193037531210019, "learning_rate": 8.691452765281821e-05, "loss": 0.3967, "step": 1090 }, { "epoch": 0.14363125938499707, "grad_norm": 0.790368797522844, "learning_rate": 8.678221751786187e-05, "loss": 0.3726, "step": 1100 }, { "epoch": 0.14493699810667884, "grad_norm": 0.6667906742515436, "learning_rate": 8.664990738290554e-05, "loss": 0.4354, "step": 1110 }, { "epoch": 0.14624273682836064, "grad_norm": 0.6473596328671104, "learning_rate": 8.65175972479492e-05, "loss": 0.434, "step": 1120 }, { "epoch": 0.14754847555004244, "grad_norm": 0.8695385254465634, "learning_rate": 8.638528711299287e-05, "loss": 0.4147, "step": 1130 }, { "epoch": 0.14885421427172424, "grad_norm": 0.7978572027689828, "learning_rate": 8.625297697803653e-05, "loss": 0.4016, "step": 1140 }, { "epoch": 0.150159952993406, "grad_norm": 0.8287621846436213, "learning_rate": 8.61206668430802e-05, "loss": 0.4096, "step": 1150 }, { "epoch": 0.1514656917150878, "grad_norm": 0.47875792483329627, "learning_rate": 8.598835670812386e-05, "loss": 0.3925, "step": 1160 }, { "epoch": 0.1527714304367696, "grad_norm": 0.5953643036815572, "learning_rate": 8.585604657316751e-05, "loss": 0.4285, "step": 1170 }, { "epoch": 0.1540771691584514, "grad_norm": 0.6386746462414951, "learning_rate": 8.572373643821117e-05, "loss": 0.4104, "step": 1180 }, { "epoch": 0.15538290788013318, "grad_norm": 0.7018609573559429, "learning_rate": 8.559142630325482e-05, "loss": 0.4088, "step": 1190 }, { "epoch": 0.15668864660181497, "grad_norm": 0.7809318043607342, "learning_rate": 8.545911616829849e-05, "loss": 0.3807, "step": 1200 }, { "epoch": 0.15799438532349677, "grad_norm": 0.9389472238938235, "learning_rate": 8.532680603334215e-05, "loss": 0.3765, "step": 1210 }, { "epoch": 0.15930012404517857, "grad_norm": 0.87248877800822, "learning_rate": 8.519449589838582e-05, "loss": 0.3916, "step": 1220 }, { "epoch": 0.16060586276686034, "grad_norm": 0.8209765134898813, "learning_rate": 8.506218576342948e-05, "loss": 0.3462, "step": 1230 }, { "epoch": 0.16191160148854214, "grad_norm": 0.9725065823624313, "learning_rate": 8.492987562847314e-05, "loss": 0.4371, "step": 1240 }, { "epoch": 0.16321734021022394, "grad_norm": 0.8011747057792274, "learning_rate": 8.479756549351681e-05, "loss": 0.3502, "step": 1250 }, { "epoch": 0.16452307893190574, "grad_norm": 0.8475634666587575, "learning_rate": 8.466525535856047e-05, "loss": 0.3755, "step": 1260 }, { "epoch": 0.1658288176535875, "grad_norm": 0.9933583970681755, "learning_rate": 8.453294522360414e-05, "loss": 0.3982, "step": 1270 }, { "epoch": 0.1671345563752693, "grad_norm": 0.7860430505297333, "learning_rate": 8.44006350886478e-05, "loss": 0.4183, "step": 1280 }, { "epoch": 0.1684402950969511, "grad_norm": 0.7594700810060895, "learning_rate": 8.426832495369145e-05, "loss": 0.3869, "step": 1290 }, { "epoch": 0.1697460338186329, "grad_norm": 0.9198889130451715, "learning_rate": 8.413601481873512e-05, "loss": 0.4092, "step": 1300 }, { "epoch": 0.17105177254031467, "grad_norm": 0.9022871581556421, "learning_rate": 8.400370468377878e-05, "loss": 0.3794, "step": 1310 }, { "epoch": 0.17235751126199647, "grad_norm": 0.9610552142608674, "learning_rate": 8.387139454882244e-05, "loss": 0.428, "step": 1320 }, { "epoch": 0.17366324998367827, "grad_norm": 0.6749523572162098, "learning_rate": 8.373908441386611e-05, "loss": 0.3687, "step": 1330 }, { "epoch": 0.17496898870536007, "grad_norm": 1.1407658004205217, "learning_rate": 8.360677427890977e-05, "loss": 0.3833, "step": 1340 }, { "epoch": 0.17627472742704184, "grad_norm": 1.2578248716967748, "learning_rate": 8.347446414395344e-05, "loss": 0.3924, "step": 1350 }, { "epoch": 0.17758046614872364, "grad_norm": 0.8688609275978233, "learning_rate": 8.33421540089971e-05, "loss": 0.3955, "step": 1360 }, { "epoch": 0.17888620487040544, "grad_norm": 0.7599569177679184, "learning_rate": 8.320984387404076e-05, "loss": 0.4139, "step": 1370 }, { "epoch": 0.18019194359208723, "grad_norm": 0.9220154852758501, "learning_rate": 8.307753373908443e-05, "loss": 0.424, "step": 1380 }, { "epoch": 0.181497682313769, "grad_norm": 0.6478544991942465, "learning_rate": 8.294522360412808e-05, "loss": 0.4353, "step": 1390 }, { "epoch": 0.1828034210354508, "grad_norm": 0.7259646284350896, "learning_rate": 8.281291346917174e-05, "loss": 0.4007, "step": 1400 }, { "epoch": 0.1841091597571326, "grad_norm": 0.739703955573357, "learning_rate": 8.268060333421541e-05, "loss": 0.4013, "step": 1410 }, { "epoch": 0.1854148984788144, "grad_norm": 0.7182765909658406, "learning_rate": 8.254829319925906e-05, "loss": 0.4494, "step": 1420 }, { "epoch": 0.18672063720049617, "grad_norm": 0.7950163178824874, "learning_rate": 8.241598306430272e-05, "loss": 0.4154, "step": 1430 }, { "epoch": 0.18802637592217797, "grad_norm": 0.7188206726080598, "learning_rate": 8.228367292934639e-05, "loss": 0.4086, "step": 1440 }, { "epoch": 0.18933211464385977, "grad_norm": 0.6141548073966633, "learning_rate": 8.215136279439005e-05, "loss": 0.4104, "step": 1450 }, { "epoch": 0.19063785336554157, "grad_norm": 0.9371585126936247, "learning_rate": 8.201905265943371e-05, "loss": 0.3973, "step": 1460 }, { "epoch": 0.19194359208722334, "grad_norm": 0.940163279416942, "learning_rate": 8.188674252447738e-05, "loss": 0.3977, "step": 1470 }, { "epoch": 0.19324933080890513, "grad_norm": 0.6162587838829751, "learning_rate": 8.175443238952104e-05, "loss": 0.3469, "step": 1480 }, { "epoch": 0.19455506953058693, "grad_norm": 0.7700029985339574, "learning_rate": 8.16221222545647e-05, "loss": 0.3571, "step": 1490 }, { "epoch": 0.19586080825226873, "grad_norm": 0.7834228618660923, "learning_rate": 8.148981211960837e-05, "loss": 0.3819, "step": 1500 }, { "epoch": 0.1971665469739505, "grad_norm": 0.8795211361523141, "learning_rate": 8.135750198465203e-05, "loss": 0.3525, "step": 1510 }, { "epoch": 0.1984722856956323, "grad_norm": 0.6981170950193488, "learning_rate": 8.12251918496957e-05, "loss": 0.4582, "step": 1520 }, { "epoch": 0.1997780244173141, "grad_norm": 0.5328031828246813, "learning_rate": 8.109288171473935e-05, "loss": 0.4044, "step": 1530 }, { "epoch": 0.2010837631389959, "grad_norm": 0.7686508132468608, "learning_rate": 8.096057157978301e-05, "loss": 0.3842, "step": 1540 }, { "epoch": 0.20238950186067767, "grad_norm": 0.7809022971900477, "learning_rate": 8.082826144482668e-05, "loss": 0.3837, "step": 1550 }, { "epoch": 0.20369524058235947, "grad_norm": 0.5611493169032764, "learning_rate": 8.069595130987034e-05, "loss": 0.3534, "step": 1560 }, { "epoch": 0.20500097930404126, "grad_norm": 0.8938023407169691, "learning_rate": 8.0563641174914e-05, "loss": 0.4131, "step": 1570 }, { "epoch": 0.20630671802572306, "grad_norm": 1.1145122351340218, "learning_rate": 8.043133103995767e-05, "loss": 0.3941, "step": 1580 }, { "epoch": 0.20761245674740483, "grad_norm": 0.8717980848992837, "learning_rate": 8.029902090500133e-05, "loss": 0.446, "step": 1590 }, { "epoch": 0.20891819546908663, "grad_norm": 0.8760551575072543, "learning_rate": 8.016671077004498e-05, "loss": 0.4052, "step": 1600 }, { "epoch": 0.21022393419076843, "grad_norm": 0.6407079074392377, "learning_rate": 8.003440063508865e-05, "loss": 0.3851, "step": 1610 }, { "epoch": 0.21152967291245023, "grad_norm": 0.8066960599097299, "learning_rate": 7.990209050013231e-05, "loss": 0.3986, "step": 1620 }, { "epoch": 0.212835411634132, "grad_norm": 0.6759203212762297, "learning_rate": 7.976978036517598e-05, "loss": 0.399, "step": 1630 }, { "epoch": 0.2141411503558138, "grad_norm": 0.8000617666053981, "learning_rate": 7.963747023021964e-05, "loss": 0.3748, "step": 1640 }, { "epoch": 0.2154468890774956, "grad_norm": 1.014334383251508, "learning_rate": 7.950516009526329e-05, "loss": 0.392, "step": 1650 }, { "epoch": 0.2167526277991774, "grad_norm": 0.7051474331387947, "learning_rate": 7.937284996030696e-05, "loss": 0.4346, "step": 1660 }, { "epoch": 0.21805836652085916, "grad_norm": 0.701564773927804, "learning_rate": 7.924053982535062e-05, "loss": 0.4095, "step": 1670 }, { "epoch": 0.21936410524254096, "grad_norm": 0.6736104560781098, "learning_rate": 7.910822969039428e-05, "loss": 0.399, "step": 1680 }, { "epoch": 0.22066984396422276, "grad_norm": 0.6783966264076667, "learning_rate": 7.897591955543795e-05, "loss": 0.4239, "step": 1690 }, { "epoch": 0.22197558268590456, "grad_norm": 0.6184822028848435, "learning_rate": 7.884360942048161e-05, "loss": 0.3465, "step": 1700 }, { "epoch": 0.22328132140758633, "grad_norm": 0.5772691836417206, "learning_rate": 7.871129928552528e-05, "loss": 0.3695, "step": 1710 }, { "epoch": 0.22458706012926813, "grad_norm": 0.6300253366485316, "learning_rate": 7.857898915056894e-05, "loss": 0.3831, "step": 1720 }, { "epoch": 0.22589279885094993, "grad_norm": 0.7614909210021444, "learning_rate": 7.84466790156126e-05, "loss": 0.3543, "step": 1730 }, { "epoch": 0.22719853757263173, "grad_norm": 0.7167831641275993, "learning_rate": 7.831436888065627e-05, "loss": 0.3834, "step": 1740 }, { "epoch": 0.2285042762943135, "grad_norm": 0.8564594793963834, "learning_rate": 7.818205874569993e-05, "loss": 0.4398, "step": 1750 }, { "epoch": 0.2298100150159953, "grad_norm": 0.6490871785602306, "learning_rate": 7.804974861074358e-05, "loss": 0.4188, "step": 1760 }, { "epoch": 0.2311157537376771, "grad_norm": 0.7947065049211688, "learning_rate": 7.791743847578725e-05, "loss": 0.3828, "step": 1770 }, { "epoch": 0.2324214924593589, "grad_norm": 0.6708789509916131, "learning_rate": 7.778512834083091e-05, "loss": 0.4061, "step": 1780 }, { "epoch": 0.23372723118104066, "grad_norm": 0.9089620921859204, "learning_rate": 7.765281820587458e-05, "loss": 0.3527, "step": 1790 }, { "epoch": 0.23503296990272246, "grad_norm": 0.7631958034048972, "learning_rate": 7.752050807091824e-05, "loss": 0.3641, "step": 1800 }, { "epoch": 0.23633870862440426, "grad_norm": 0.9288441167845626, "learning_rate": 7.73881979359619e-05, "loss": 0.3861, "step": 1810 }, { "epoch": 0.23764444734608606, "grad_norm": 0.6396894921010845, "learning_rate": 7.725588780100555e-05, "loss": 0.3635, "step": 1820 }, { "epoch": 0.23895018606776783, "grad_norm": 0.7082487695704852, "learning_rate": 7.712357766604922e-05, "loss": 0.461, "step": 1830 }, { "epoch": 0.24025592478944963, "grad_norm": 0.784537132667547, "learning_rate": 7.699126753109288e-05, "loss": 0.4049, "step": 1840 }, { "epoch": 0.24156166351113142, "grad_norm": 0.7467341087789225, "learning_rate": 7.685895739613655e-05, "loss": 0.3679, "step": 1850 }, { "epoch": 0.24286740223281322, "grad_norm": 0.8236880297357595, "learning_rate": 7.672664726118021e-05, "loss": 0.3943, "step": 1860 }, { "epoch": 0.244173140954495, "grad_norm": 0.7556845025054512, "learning_rate": 7.659433712622387e-05, "loss": 0.4157, "step": 1870 }, { "epoch": 0.2454788796761768, "grad_norm": 0.6360791477832024, "learning_rate": 7.646202699126754e-05, "loss": 0.3831, "step": 1880 }, { "epoch": 0.2467846183978586, "grad_norm": 0.6303418719933962, "learning_rate": 7.632971685631119e-05, "loss": 0.4453, "step": 1890 }, { "epoch": 0.2480903571195404, "grad_norm": 0.5939177823249843, "learning_rate": 7.619740672135485e-05, "loss": 0.3846, "step": 1900 }, { "epoch": 0.24939609584122216, "grad_norm": 0.7017019771793869, "learning_rate": 7.606509658639852e-05, "loss": 0.3356, "step": 1910 }, { "epoch": 0.25070183456290396, "grad_norm": 0.6633493508823003, "learning_rate": 7.593278645144218e-05, "loss": 0.3924, "step": 1920 }, { "epoch": 0.25200757328458573, "grad_norm": 0.9559386511697058, "learning_rate": 7.580047631648585e-05, "loss": 0.3909, "step": 1930 }, { "epoch": 0.25331331200626755, "grad_norm": 0.692403498664843, "learning_rate": 7.566816618152951e-05, "loss": 0.4283, "step": 1940 }, { "epoch": 0.2546190507279493, "grad_norm": 0.7592635716034934, "learning_rate": 7.553585604657317e-05, "loss": 0.3854, "step": 1950 }, { "epoch": 0.25592478944963115, "grad_norm": 0.8355795315073584, "learning_rate": 7.540354591161684e-05, "loss": 0.3713, "step": 1960 }, { "epoch": 0.2572305281713129, "grad_norm": 0.47916243333799696, "learning_rate": 7.52712357766605e-05, "loss": 0.3744, "step": 1970 }, { "epoch": 0.2585362668929947, "grad_norm": 0.8653076631569674, "learning_rate": 7.513892564170417e-05, "loss": 0.4415, "step": 1980 }, { "epoch": 0.2598420056146765, "grad_norm": 0.8156425576285299, "learning_rate": 7.500661550674783e-05, "loss": 0.4027, "step": 1990 }, { "epoch": 0.2611477443363583, "grad_norm": 0.9597473121294312, "learning_rate": 7.487430537179148e-05, "loss": 0.3969, "step": 2000 }, { "epoch": 0.26245348305804006, "grad_norm": 0.7549626154771166, "learning_rate": 7.474199523683515e-05, "loss": 0.3675, "step": 2010 }, { "epoch": 0.2637592217797219, "grad_norm": 0.6154826505731877, "learning_rate": 7.460968510187881e-05, "loss": 0.366, "step": 2020 }, { "epoch": 0.26506496050140366, "grad_norm": 1.0580428481311195, "learning_rate": 7.447737496692247e-05, "loss": 0.3883, "step": 2030 }, { "epoch": 0.2663706992230855, "grad_norm": 1.1923011172328544, "learning_rate": 7.434506483196612e-05, "loss": 0.377, "step": 2040 }, { "epoch": 0.26767643794476725, "grad_norm": 0.9688140054971308, "learning_rate": 7.421275469700979e-05, "loss": 0.3862, "step": 2050 }, { "epoch": 0.268982176666449, "grad_norm": 0.9989048095302998, "learning_rate": 7.408044456205345e-05, "loss": 0.4097, "step": 2060 }, { "epoch": 0.27028791538813085, "grad_norm": 0.655029339029875, "learning_rate": 7.394813442709712e-05, "loss": 0.3445, "step": 2070 }, { "epoch": 0.2715936541098126, "grad_norm": 0.8180852421034762, "learning_rate": 7.381582429214078e-05, "loss": 0.3812, "step": 2080 }, { "epoch": 0.2728993928314944, "grad_norm": 0.8789518315935385, "learning_rate": 7.368351415718444e-05, "loss": 0.41, "step": 2090 }, { "epoch": 0.2742051315531762, "grad_norm": 0.5982810419180924, "learning_rate": 7.355120402222811e-05, "loss": 0.3222, "step": 2100 }, { "epoch": 0.275510870274858, "grad_norm": 0.8485036823014819, "learning_rate": 7.341889388727177e-05, "loss": 0.4533, "step": 2110 }, { "epoch": 0.2768166089965398, "grad_norm": 0.8875732639162344, "learning_rate": 7.328658375231542e-05, "loss": 0.3527, "step": 2120 }, { "epoch": 0.2781223477182216, "grad_norm": 0.8111954766908331, "learning_rate": 7.315427361735909e-05, "loss": 0.4169, "step": 2130 }, { "epoch": 0.27942808643990336, "grad_norm": 0.7374257771379444, "learning_rate": 7.302196348240275e-05, "loss": 0.3717, "step": 2140 }, { "epoch": 0.2807338251615852, "grad_norm": 0.6082137000751072, "learning_rate": 7.288965334744642e-05, "loss": 0.3968, "step": 2150 }, { "epoch": 0.28203956388326695, "grad_norm": 0.6583201858022675, "learning_rate": 7.275734321249008e-05, "loss": 0.3928, "step": 2160 }, { "epoch": 0.2833453026049487, "grad_norm": 0.8867612359893449, "learning_rate": 7.262503307753374e-05, "loss": 0.4004, "step": 2170 }, { "epoch": 0.28465104132663055, "grad_norm": 1.0731335842873515, "learning_rate": 7.249272294257741e-05, "loss": 0.3734, "step": 2180 }, { "epoch": 0.2859567800483123, "grad_norm": 0.6847809644200603, "learning_rate": 7.236041280762107e-05, "loss": 0.3908, "step": 2190 }, { "epoch": 0.28726251876999415, "grad_norm": 0.6593196156966425, "learning_rate": 7.222810267266474e-05, "loss": 0.3639, "step": 2200 }, { "epoch": 0.2885682574916759, "grad_norm": 0.7624075216466923, "learning_rate": 7.20957925377084e-05, "loss": 0.3686, "step": 2210 }, { "epoch": 0.2898739962133577, "grad_norm": 0.7977596110722943, "learning_rate": 7.196348240275206e-05, "loss": 0.3302, "step": 2220 }, { "epoch": 0.2911797349350395, "grad_norm": 0.573162160889478, "learning_rate": 7.183117226779572e-05, "loss": 0.4212, "step": 2230 }, { "epoch": 0.2924854736567213, "grad_norm": 0.6982307854439872, "learning_rate": 7.169886213283938e-05, "loss": 0.3792, "step": 2240 }, { "epoch": 0.29379121237840305, "grad_norm": 0.7303994668682778, "learning_rate": 7.156655199788304e-05, "loss": 0.3662, "step": 2250 }, { "epoch": 0.2950969511000849, "grad_norm": 0.7454812485507484, "learning_rate": 7.14342418629267e-05, "loss": 0.3755, "step": 2260 }, { "epoch": 0.29640268982176665, "grad_norm": 0.6901721610990712, "learning_rate": 7.130193172797036e-05, "loss": 0.3816, "step": 2270 }, { "epoch": 0.2977084285434485, "grad_norm": 0.6875693961855176, "learning_rate": 7.116962159301402e-05, "loss": 0.4125, "step": 2280 }, { "epoch": 0.29901416726513025, "grad_norm": 0.7465504461299693, "learning_rate": 7.103731145805769e-05, "loss": 0.4327, "step": 2290 }, { "epoch": 0.300319905986812, "grad_norm": 1.066401978095089, "learning_rate": 7.090500132310135e-05, "loss": 0.3304, "step": 2300 }, { "epoch": 0.30162564470849385, "grad_norm": 0.7695932077689371, "learning_rate": 7.077269118814501e-05, "loss": 0.404, "step": 2310 }, { "epoch": 0.3029313834301756, "grad_norm": 1.0806277114784977, "learning_rate": 7.064038105318868e-05, "loss": 0.3843, "step": 2320 }, { "epoch": 0.3042371221518574, "grad_norm": 0.7136484119583236, "learning_rate": 7.050807091823234e-05, "loss": 0.4043, "step": 2330 }, { "epoch": 0.3055428608735392, "grad_norm": 0.8168316318378807, "learning_rate": 7.0375760783276e-05, "loss": 0.3525, "step": 2340 }, { "epoch": 0.306848599595221, "grad_norm": 0.997603205860958, "learning_rate": 7.024345064831967e-05, "loss": 0.4038, "step": 2350 }, { "epoch": 0.3081543383169028, "grad_norm": 0.8994788383155479, "learning_rate": 7.011114051336332e-05, "loss": 0.3733, "step": 2360 }, { "epoch": 0.3094600770385846, "grad_norm": 0.6907067484368589, "learning_rate": 6.997883037840699e-05, "loss": 0.4113, "step": 2370 }, { "epoch": 0.31076581576026635, "grad_norm": 0.7642399860115726, "learning_rate": 6.984652024345065e-05, "loss": 0.3848, "step": 2380 }, { "epoch": 0.3120715544819482, "grad_norm": 0.691730701205516, "learning_rate": 6.971421010849431e-05, "loss": 0.3478, "step": 2390 }, { "epoch": 0.31337729320362995, "grad_norm": 0.8190867542983772, "learning_rate": 6.958189997353798e-05, "loss": 0.3568, "step": 2400 }, { "epoch": 0.3146830319253117, "grad_norm": 0.7455522928040189, "learning_rate": 6.944958983858164e-05, "loss": 0.3184, "step": 2410 }, { "epoch": 0.31598877064699354, "grad_norm": 0.5748397850934389, "learning_rate": 6.93172797036253e-05, "loss": 0.3546, "step": 2420 }, { "epoch": 0.3172945093686753, "grad_norm": 0.8074055526542768, "learning_rate": 6.918496956866897e-05, "loss": 0.3586, "step": 2430 }, { "epoch": 0.31860024809035714, "grad_norm": 0.7434459226292305, "learning_rate": 6.905265943371263e-05, "loss": 0.3793, "step": 2440 }, { "epoch": 0.3199059868120389, "grad_norm": 0.7089530318410264, "learning_rate": 6.89203492987563e-05, "loss": 0.41, "step": 2450 }, { "epoch": 0.3212117255337207, "grad_norm": 0.5945382776226636, "learning_rate": 6.878803916379995e-05, "loss": 0.3668, "step": 2460 }, { "epoch": 0.3225174642554025, "grad_norm": 0.5835994786644748, "learning_rate": 6.865572902884361e-05, "loss": 0.3738, "step": 2470 }, { "epoch": 0.3238232029770843, "grad_norm": 1.0241574564233715, "learning_rate": 6.852341889388726e-05, "loss": 0.3951, "step": 2480 }, { "epoch": 0.32512894169876605, "grad_norm": 0.7147255869111713, "learning_rate": 6.839110875893093e-05, "loss": 0.3405, "step": 2490 }, { "epoch": 0.3264346804204479, "grad_norm": 0.695266623973948, "learning_rate": 6.825879862397459e-05, "loss": 0.3808, "step": 2500 }, { "epoch": 0.32774041914212965, "grad_norm": 0.966320496860587, "learning_rate": 6.812648848901826e-05, "loss": 0.4211, "step": 2510 }, { "epoch": 0.3290461578638115, "grad_norm": 0.7307499127897409, "learning_rate": 6.799417835406192e-05, "loss": 0.3941, "step": 2520 }, { "epoch": 0.33035189658549324, "grad_norm": 0.7625573954064706, "learning_rate": 6.786186821910558e-05, "loss": 0.3695, "step": 2530 }, { "epoch": 0.331657635307175, "grad_norm": 0.6913173334795023, "learning_rate": 6.772955808414925e-05, "loss": 0.3946, "step": 2540 }, { "epoch": 0.33296337402885684, "grad_norm": 0.8108629160370623, "learning_rate": 6.759724794919291e-05, "loss": 0.3618, "step": 2550 }, { "epoch": 0.3342691127505386, "grad_norm": 0.7964814283593564, "learning_rate": 6.746493781423658e-05, "loss": 0.3639, "step": 2560 }, { "epoch": 0.3355748514722204, "grad_norm": 0.997933668993109, "learning_rate": 6.733262767928024e-05, "loss": 0.3926, "step": 2570 }, { "epoch": 0.3368805901939022, "grad_norm": 0.6258007771960417, "learning_rate": 6.72003175443239e-05, "loss": 0.394, "step": 2580 }, { "epoch": 0.338186328915584, "grad_norm": 0.6802913820517577, "learning_rate": 6.706800740936756e-05, "loss": 0.3323, "step": 2590 }, { "epoch": 0.3394920676372658, "grad_norm": 0.7159963548016552, "learning_rate": 6.693569727441122e-05, "loss": 0.3477, "step": 2600 }, { "epoch": 0.3407978063589476, "grad_norm": 0.7690385509947173, "learning_rate": 6.680338713945488e-05, "loss": 0.3588, "step": 2610 }, { "epoch": 0.34210354508062935, "grad_norm": 0.7065378508784833, "learning_rate": 6.667107700449855e-05, "loss": 0.3829, "step": 2620 }, { "epoch": 0.34340928380231117, "grad_norm": 0.7314858112617808, "learning_rate": 6.653876686954221e-05, "loss": 0.3718, "step": 2630 }, { "epoch": 0.34471502252399294, "grad_norm": 0.7064516264021813, "learning_rate": 6.640645673458588e-05, "loss": 0.4166, "step": 2640 }, { "epoch": 0.3460207612456747, "grad_norm": 0.8220074189639662, "learning_rate": 6.627414659962954e-05, "loss": 0.3985, "step": 2650 }, { "epoch": 0.34732649996735654, "grad_norm": 0.6457028113287738, "learning_rate": 6.61418364646732e-05, "loss": 0.3624, "step": 2660 }, { "epoch": 0.3486322386890383, "grad_norm": 0.7966342138746413, "learning_rate": 6.600952632971687e-05, "loss": 0.3658, "step": 2670 }, { "epoch": 0.34993797741072014, "grad_norm": 0.6489537352654261, "learning_rate": 6.587721619476053e-05, "loss": 0.3329, "step": 2680 }, { "epoch": 0.3512437161324019, "grad_norm": 0.6325325561055318, "learning_rate": 6.574490605980418e-05, "loss": 0.3354, "step": 2690 }, { "epoch": 0.3525494548540837, "grad_norm": 0.6731029108410079, "learning_rate": 6.561259592484785e-05, "loss": 0.4055, "step": 2700 }, { "epoch": 0.3538551935757655, "grad_norm": 0.6521860569821696, "learning_rate": 6.548028578989151e-05, "loss": 0.3329, "step": 2710 }, { "epoch": 0.3551609322974473, "grad_norm": 0.6871520559095162, "learning_rate": 6.534797565493516e-05, "loss": 0.3535, "step": 2720 }, { "epoch": 0.3564666710191291, "grad_norm": 0.822595410669817, "learning_rate": 6.521566551997883e-05, "loss": 0.3695, "step": 2730 }, { "epoch": 0.35777240974081087, "grad_norm": 0.7305262583248748, "learning_rate": 6.508335538502249e-05, "loss": 0.3546, "step": 2740 }, { "epoch": 0.35907814846249264, "grad_norm": 0.775640452487442, "learning_rate": 6.495104525006615e-05, "loss": 0.3935, "step": 2750 }, { "epoch": 0.36038388718417447, "grad_norm": 0.5622398775947159, "learning_rate": 6.481873511510982e-05, "loss": 0.3796, "step": 2760 }, { "epoch": 0.36168962590585624, "grad_norm": 0.8432831255834533, "learning_rate": 6.468642498015348e-05, "loss": 0.3809, "step": 2770 }, { "epoch": 0.362995364627538, "grad_norm": 0.6852005146951986, "learning_rate": 6.455411484519715e-05, "loss": 0.3718, "step": 2780 }, { "epoch": 0.36430110334921983, "grad_norm": 0.7838127164310449, "learning_rate": 6.442180471024081e-05, "loss": 0.3264, "step": 2790 }, { "epoch": 0.3656068420709016, "grad_norm": 0.9290637314095141, "learning_rate": 6.428949457528447e-05, "loss": 0.4082, "step": 2800 }, { "epoch": 0.36691258079258343, "grad_norm": 0.5903236399026356, "learning_rate": 6.415718444032814e-05, "loss": 0.3317, "step": 2810 }, { "epoch": 0.3682183195142652, "grad_norm": 0.7064838669446578, "learning_rate": 6.402487430537179e-05, "loss": 0.3453, "step": 2820 }, { "epoch": 0.369524058235947, "grad_norm": 0.8900365749101484, "learning_rate": 6.389256417041545e-05, "loss": 0.3321, "step": 2830 }, { "epoch": 0.3708297969576288, "grad_norm": 0.7842132664851053, "learning_rate": 6.376025403545912e-05, "loss": 0.3633, "step": 2840 }, { "epoch": 0.37213553567931057, "grad_norm": 0.5929414972262719, "learning_rate": 6.362794390050278e-05, "loss": 0.3434, "step": 2850 }, { "epoch": 0.37344127440099234, "grad_norm": 0.503386620178694, "learning_rate": 6.349563376554645e-05, "loss": 0.3556, "step": 2860 }, { "epoch": 0.37474701312267417, "grad_norm": 0.5903771809459383, "learning_rate": 6.336332363059011e-05, "loss": 0.3812, "step": 2870 }, { "epoch": 0.37605275184435594, "grad_norm": 0.8542853334494567, "learning_rate": 6.323101349563377e-05, "loss": 0.3799, "step": 2880 }, { "epoch": 0.37735849056603776, "grad_norm": 0.7476902212131092, "learning_rate": 6.309870336067744e-05, "loss": 0.3665, "step": 2890 }, { "epoch": 0.37866422928771953, "grad_norm": 0.91201012608746, "learning_rate": 6.29663932257211e-05, "loss": 0.3844, "step": 2900 }, { "epoch": 0.3799699680094013, "grad_norm": 0.7249871540242889, "learning_rate": 6.283408309076475e-05, "loss": 0.3739, "step": 2910 }, { "epoch": 0.38127570673108313, "grad_norm": 0.6768959061914, "learning_rate": 6.270177295580842e-05, "loss": 0.3423, "step": 2920 }, { "epoch": 0.3825814454527649, "grad_norm": 0.49879786127518344, "learning_rate": 6.256946282085208e-05, "loss": 0.3589, "step": 2930 }, { "epoch": 0.38388718417444667, "grad_norm": 1.1391501458158868, "learning_rate": 6.243715268589575e-05, "loss": 0.391, "step": 2940 }, { "epoch": 0.3851929228961285, "grad_norm": 0.7110665513965606, "learning_rate": 6.23048425509394e-05, "loss": 0.3504, "step": 2950 }, { "epoch": 0.38649866161781027, "grad_norm": 0.7021872565181878, "learning_rate": 6.217253241598306e-05, "loss": 0.3528, "step": 2960 }, { "epoch": 0.3878044003394921, "grad_norm": 0.7624387457719385, "learning_rate": 6.204022228102672e-05, "loss": 0.3651, "step": 2970 }, { "epoch": 0.38911013906117387, "grad_norm": 0.694263239359064, "learning_rate": 6.190791214607039e-05, "loss": 0.3604, "step": 2980 }, { "epoch": 0.39041587778285564, "grad_norm": 0.6008928620877932, "learning_rate": 6.177560201111405e-05, "loss": 0.3882, "step": 2990 }, { "epoch": 0.39172161650453746, "grad_norm": 0.762962247285366, "learning_rate": 6.164329187615772e-05, "loss": 0.3781, "step": 3000 }, { "epoch": 0.39302735522621923, "grad_norm": 0.638291362563214, "learning_rate": 6.151098174120138e-05, "loss": 0.3359, "step": 3010 }, { "epoch": 0.394333093947901, "grad_norm": 0.7133584352397153, "learning_rate": 6.137867160624504e-05, "loss": 0.3696, "step": 3020 }, { "epoch": 0.39563883266958283, "grad_norm": 0.6500556876238234, "learning_rate": 6.124636147128871e-05, "loss": 0.37, "step": 3030 }, { "epoch": 0.3969445713912646, "grad_norm": 0.707461384834344, "learning_rate": 6.111405133633237e-05, "loss": 0.3987, "step": 3040 }, { "epoch": 0.3982503101129464, "grad_norm": 0.7220757436346421, "learning_rate": 6.098174120137603e-05, "loss": 0.3649, "step": 3050 }, { "epoch": 0.3995560488346282, "grad_norm": 0.7715850908947294, "learning_rate": 6.0849431066419694e-05, "loss": 0.3244, "step": 3060 }, { "epoch": 0.40086178755630997, "grad_norm": 0.7649024906573362, "learning_rate": 6.071712093146336e-05, "loss": 0.3462, "step": 3070 }, { "epoch": 0.4021675262779918, "grad_norm": 0.6677816209901085, "learning_rate": 6.0584810796507016e-05, "loss": 0.3768, "step": 3080 }, { "epoch": 0.40347326499967356, "grad_norm": 0.7519608310304735, "learning_rate": 6.045250066155068e-05, "loss": 0.3574, "step": 3090 }, { "epoch": 0.40477900372135533, "grad_norm": 1.0302063729574247, "learning_rate": 6.0320190526594344e-05, "loss": 0.3632, "step": 3100 }, { "epoch": 0.40608474244303716, "grad_norm": 0.8124786856872885, "learning_rate": 6.018788039163801e-05, "loss": 0.3804, "step": 3110 }, { "epoch": 0.40739048116471893, "grad_norm": 1.1663215316506883, "learning_rate": 6.005557025668166e-05, "loss": 0.3456, "step": 3120 }, { "epoch": 0.40869621988640076, "grad_norm": 0.8165657207013097, "learning_rate": 5.992326012172532e-05, "loss": 0.371, "step": 3130 }, { "epoch": 0.41000195860808253, "grad_norm": 0.8140851081568773, "learning_rate": 5.979094998676899e-05, "loss": 0.3892, "step": 3140 }, { "epoch": 0.4113076973297643, "grad_norm": 0.6077355158139854, "learning_rate": 5.965863985181265e-05, "loss": 0.3577, "step": 3150 }, { "epoch": 0.4126134360514461, "grad_norm": 0.8023067990609025, "learning_rate": 5.952632971685631e-05, "loss": 0.3539, "step": 3160 }, { "epoch": 0.4139191747731279, "grad_norm": 0.694962984227201, "learning_rate": 5.939401958189997e-05, "loss": 0.3999, "step": 3170 }, { "epoch": 0.41522491349480967, "grad_norm": 0.683070552832424, "learning_rate": 5.9261709446943636e-05, "loss": 0.3493, "step": 3180 }, { "epoch": 0.4165306522164915, "grad_norm": 0.6909656666701103, "learning_rate": 5.91293993119873e-05, "loss": 0.3804, "step": 3190 }, { "epoch": 0.41783639093817326, "grad_norm": 0.8473135084371841, "learning_rate": 5.8997089177030965e-05, "loss": 0.3754, "step": 3200 }, { "epoch": 0.4191421296598551, "grad_norm": 0.9173915852527912, "learning_rate": 5.886477904207462e-05, "loss": 0.4104, "step": 3210 }, { "epoch": 0.42044786838153686, "grad_norm": 0.7009398877601288, "learning_rate": 5.8732468907118286e-05, "loss": 0.415, "step": 3220 }, { "epoch": 0.42175360710321863, "grad_norm": 1.0537743952899077, "learning_rate": 5.860015877216195e-05, "loss": 0.3792, "step": 3230 }, { "epoch": 0.42305934582490046, "grad_norm": 0.7195938231845664, "learning_rate": 5.8467848637205614e-05, "loss": 0.3421, "step": 3240 }, { "epoch": 0.4243650845465822, "grad_norm": 0.6768335148724509, "learning_rate": 5.833553850224928e-05, "loss": 0.3656, "step": 3250 }, { "epoch": 0.425670823268264, "grad_norm": 0.7214819545803853, "learning_rate": 5.8203228367292936e-05, "loss": 0.3722, "step": 3260 }, { "epoch": 0.4269765619899458, "grad_norm": 0.6431922805809357, "learning_rate": 5.80709182323366e-05, "loss": 0.3659, "step": 3270 }, { "epoch": 0.4282823007116276, "grad_norm": 0.8133347562563679, "learning_rate": 5.7938608097380264e-05, "loss": 0.3418, "step": 3280 }, { "epoch": 0.4295880394333094, "grad_norm": 0.9284250977269741, "learning_rate": 5.780629796242393e-05, "loss": 0.363, "step": 3290 }, { "epoch": 0.4308937781549912, "grad_norm": 0.8037356080027969, "learning_rate": 5.767398782746759e-05, "loss": 0.3696, "step": 3300 }, { "epoch": 0.43219951687667296, "grad_norm": 1.1698954520428022, "learning_rate": 5.7541677692511256e-05, "loss": 0.3339, "step": 3310 }, { "epoch": 0.4335052555983548, "grad_norm": 0.7813109401549374, "learning_rate": 5.7409367557554914e-05, "loss": 0.3405, "step": 3320 }, { "epoch": 0.43481099432003656, "grad_norm": 0.8684126879631643, "learning_rate": 5.727705742259858e-05, "loss": 0.3541, "step": 3330 }, { "epoch": 0.43611673304171833, "grad_norm": 0.9602030399474978, "learning_rate": 5.714474728764223e-05, "loss": 0.3443, "step": 3340 }, { "epoch": 0.43742247176340016, "grad_norm": 0.723423934843899, "learning_rate": 5.701243715268589e-05, "loss": 0.3599, "step": 3350 }, { "epoch": 0.4387282104850819, "grad_norm": 0.9036096850355771, "learning_rate": 5.6880127017729557e-05, "loss": 0.3559, "step": 3360 }, { "epoch": 0.44003394920676375, "grad_norm": 0.5925348401371253, "learning_rate": 5.674781688277322e-05, "loss": 0.3408, "step": 3370 }, { "epoch": 0.4413396879284455, "grad_norm": 0.8390343357770222, "learning_rate": 5.6615506747816885e-05, "loss": 0.3922, "step": 3380 }, { "epoch": 0.4426454266501273, "grad_norm": 0.84168674416005, "learning_rate": 5.648319661286054e-05, "loss": 0.3458, "step": 3390 }, { "epoch": 0.4439511653718091, "grad_norm": 0.6132065178793368, "learning_rate": 5.6350886477904206e-05, "loss": 0.4061, "step": 3400 }, { "epoch": 0.4452569040934909, "grad_norm": 0.7795412129846988, "learning_rate": 5.621857634294787e-05, "loss": 0.3272, "step": 3410 }, { "epoch": 0.44656264281517266, "grad_norm": 0.928901988938315, "learning_rate": 5.6086266207991534e-05, "loss": 0.3874, "step": 3420 }, { "epoch": 0.4478683815368545, "grad_norm": 0.850754366375525, "learning_rate": 5.59539560730352e-05, "loss": 0.3631, "step": 3430 }, { "epoch": 0.44917412025853626, "grad_norm": 0.6165303888379535, "learning_rate": 5.5821645938078856e-05, "loss": 0.3691, "step": 3440 }, { "epoch": 0.4504798589802181, "grad_norm": 0.7842930339530104, "learning_rate": 5.568933580312252e-05, "loss": 0.3936, "step": 3450 }, { "epoch": 0.45178559770189985, "grad_norm": 0.51794810832377, "learning_rate": 5.5557025668166184e-05, "loss": 0.3327, "step": 3460 }, { "epoch": 0.4530913364235816, "grad_norm": 0.6895988390760774, "learning_rate": 5.542471553320985e-05, "loss": 0.3782, "step": 3470 }, { "epoch": 0.45439707514526345, "grad_norm": 0.6735225640790709, "learning_rate": 5.529240539825351e-05, "loss": 0.3517, "step": 3480 }, { "epoch": 0.4557028138669452, "grad_norm": 0.5211084501012763, "learning_rate": 5.5160095263297177e-05, "loss": 0.387, "step": 3490 }, { "epoch": 0.457008552588627, "grad_norm": 0.7384932211476657, "learning_rate": 5.5027785128340834e-05, "loss": 0.3846, "step": 3500 }, { "epoch": 0.4583142913103088, "grad_norm": 0.6164135133073266, "learning_rate": 5.48954749933845e-05, "loss": 0.3444, "step": 3510 }, { "epoch": 0.4596200300319906, "grad_norm": 0.7767558913020326, "learning_rate": 5.476316485842816e-05, "loss": 0.3206, "step": 3520 }, { "epoch": 0.4609257687536724, "grad_norm": 0.969684850230047, "learning_rate": 5.4630854723471826e-05, "loss": 0.3818, "step": 3530 }, { "epoch": 0.4622315074753542, "grad_norm": 0.8630512426351037, "learning_rate": 5.449854458851549e-05, "loss": 0.3375, "step": 3540 }, { "epoch": 0.46353724619703596, "grad_norm": 0.8318796166869811, "learning_rate": 5.436623445355915e-05, "loss": 0.3964, "step": 3550 }, { "epoch": 0.4648429849187178, "grad_norm": 0.7071654927945067, "learning_rate": 5.4233924318602805e-05, "loss": 0.3917, "step": 3560 }, { "epoch": 0.46614872364039955, "grad_norm": 0.6454484407304377, "learning_rate": 5.410161418364646e-05, "loss": 0.3582, "step": 3570 }, { "epoch": 0.4674544623620813, "grad_norm": 0.6503030049049471, "learning_rate": 5.3969304048690126e-05, "loss": 0.3895, "step": 3580 }, { "epoch": 0.46876020108376315, "grad_norm": 0.6551631558901667, "learning_rate": 5.383699391373379e-05, "loss": 0.3477, "step": 3590 }, { "epoch": 0.4700659398054449, "grad_norm": 0.7188061640550096, "learning_rate": 5.3704683778777455e-05, "loss": 0.3337, "step": 3600 }, { "epoch": 0.47137167852712675, "grad_norm": 0.7114930103425791, "learning_rate": 5.357237364382112e-05, "loss": 0.3276, "step": 3610 }, { "epoch": 0.4726774172488085, "grad_norm": 0.5836693870944046, "learning_rate": 5.3440063508864776e-05, "loss": 0.3104, "step": 3620 }, { "epoch": 0.4739831559704903, "grad_norm": 0.5961390168625231, "learning_rate": 5.330775337390844e-05, "loss": 0.404, "step": 3630 }, { "epoch": 0.4752888946921721, "grad_norm": 0.557357402781801, "learning_rate": 5.3175443238952104e-05, "loss": 0.3491, "step": 3640 }, { "epoch": 0.4765946334138539, "grad_norm": 0.8999505092954329, "learning_rate": 5.304313310399577e-05, "loss": 0.3257, "step": 3650 }, { "epoch": 0.47790037213553566, "grad_norm": 0.573656512680599, "learning_rate": 5.291082296903943e-05, "loss": 0.3842, "step": 3660 }, { "epoch": 0.4792061108572175, "grad_norm": 0.693797723071788, "learning_rate": 5.27785128340831e-05, "loss": 0.3541, "step": 3670 }, { "epoch": 0.48051184957889925, "grad_norm": 0.895401849318473, "learning_rate": 5.2646202699126754e-05, "loss": 0.3653, "step": 3680 }, { "epoch": 0.4818175883005811, "grad_norm": 1.2462508602614697, "learning_rate": 5.251389256417042e-05, "loss": 0.3765, "step": 3690 }, { "epoch": 0.48312332702226285, "grad_norm": 0.5707772023560279, "learning_rate": 5.238158242921408e-05, "loss": 0.3095, "step": 3700 }, { "epoch": 0.4844290657439446, "grad_norm": 0.933586586166466, "learning_rate": 5.2249272294257746e-05, "loss": 0.3492, "step": 3710 }, { "epoch": 0.48573480446562645, "grad_norm": 0.6759885845135221, "learning_rate": 5.211696215930141e-05, "loss": 0.3681, "step": 3720 }, { "epoch": 0.4870405431873082, "grad_norm": 0.6737159875219385, "learning_rate": 5.198465202434507e-05, "loss": 0.3804, "step": 3730 }, { "epoch": 0.48834628190899, "grad_norm": 0.7915288712677737, "learning_rate": 5.185234188938873e-05, "loss": 0.3281, "step": 3740 }, { "epoch": 0.4896520206306718, "grad_norm": 0.8582008723635511, "learning_rate": 5.1720031754432396e-05, "loss": 0.3563, "step": 3750 }, { "epoch": 0.4909577593523536, "grad_norm": 0.694326759700088, "learning_rate": 5.158772161947606e-05, "loss": 0.3362, "step": 3760 }, { "epoch": 0.4922634980740354, "grad_norm": 0.6685533319845902, "learning_rate": 5.1455411484519724e-05, "loss": 0.3671, "step": 3770 }, { "epoch": 0.4935692367957172, "grad_norm": 0.7234839909074131, "learning_rate": 5.1323101349563375e-05, "loss": 0.3712, "step": 3780 }, { "epoch": 0.49487497551739895, "grad_norm": 0.6356609557158871, "learning_rate": 5.119079121460704e-05, "loss": 0.3404, "step": 3790 }, { "epoch": 0.4961807142390808, "grad_norm": 0.683800144066809, "learning_rate": 5.1058481079650696e-05, "loss": 0.3749, "step": 3800 }, { "epoch": 0.49748645296076255, "grad_norm": 0.838581215032931, "learning_rate": 5.092617094469436e-05, "loss": 0.3821, "step": 3810 }, { "epoch": 0.4987921916824443, "grad_norm": 0.723338471243074, "learning_rate": 5.0793860809738024e-05, "loss": 0.3516, "step": 3820 }, { "epoch": 0.5000979304041261, "grad_norm": 0.8636733704767072, "learning_rate": 5.066155067478169e-05, "loss": 0.3514, "step": 3830 }, { "epoch": 0.5014036691258079, "grad_norm": 0.6729791744063756, "learning_rate": 5.052924053982535e-05, "loss": 0.3423, "step": 3840 }, { "epoch": 0.5027094078474897, "grad_norm": 0.6110412085829554, "learning_rate": 5.039693040486902e-05, "loss": 0.3619, "step": 3850 }, { "epoch": 0.5040151465691715, "grad_norm": 0.5114763349435761, "learning_rate": 5.0264620269912674e-05, "loss": 0.3815, "step": 3860 }, { "epoch": 0.5053208852908533, "grad_norm": 0.7103584875680274, "learning_rate": 5.013231013495634e-05, "loss": 0.3544, "step": 3870 }, { "epoch": 0.5066266240125351, "grad_norm": 0.9361168595695444, "learning_rate": 5e-05, "loss": 0.3152, "step": 3880 }, { "epoch": 0.5079323627342169, "grad_norm": 0.6094952677251722, "learning_rate": 4.9867689865043667e-05, "loss": 0.344, "step": 3890 }, { "epoch": 0.5092381014558987, "grad_norm": 0.6152737199310468, "learning_rate": 4.973537973008733e-05, "loss": 0.3232, "step": 3900 }, { "epoch": 0.5105438401775805, "grad_norm": 0.7351562526915447, "learning_rate": 4.960306959513099e-05, "loss": 0.3656, "step": 3910 }, { "epoch": 0.5118495788992623, "grad_norm": 0.7886316520977281, "learning_rate": 4.947075946017465e-05, "loss": 0.3521, "step": 3920 }, { "epoch": 0.513155317620944, "grad_norm": 0.6868536560624379, "learning_rate": 4.9338449325218316e-05, "loss": 0.3575, "step": 3930 }, { "epoch": 0.5144610563426258, "grad_norm": 0.7636461679680541, "learning_rate": 4.9206139190261974e-05, "loss": 0.3559, "step": 3940 }, { "epoch": 0.5157667950643077, "grad_norm": 0.8708760137517473, "learning_rate": 4.907382905530564e-05, "loss": 0.374, "step": 3950 }, { "epoch": 0.5170725337859894, "grad_norm": 0.8155352425390439, "learning_rate": 4.89415189203493e-05, "loss": 0.4114, "step": 3960 }, { "epoch": 0.5183782725076712, "grad_norm": 0.6125473401308895, "learning_rate": 4.8809208785392966e-05, "loss": 0.3557, "step": 3970 }, { "epoch": 0.519684011229353, "grad_norm": 0.7000544321921915, "learning_rate": 4.867689865043662e-05, "loss": 0.3552, "step": 3980 }, { "epoch": 0.5209897499510348, "grad_norm": 0.8399287812926355, "learning_rate": 4.854458851548029e-05, "loss": 0.3847, "step": 3990 }, { "epoch": 0.5222954886727166, "grad_norm": 0.7404298209906728, "learning_rate": 4.841227838052395e-05, "loss": 0.3714, "step": 4000 }, { "epoch": 0.5236012273943984, "grad_norm": 0.918070276686145, "learning_rate": 4.8279968245567616e-05, "loss": 0.3417, "step": 4010 }, { "epoch": 0.5249069661160801, "grad_norm": 0.8947915282909988, "learning_rate": 4.814765811061128e-05, "loss": 0.389, "step": 4020 }, { "epoch": 0.526212704837762, "grad_norm": 0.6199696115977593, "learning_rate": 4.801534797565494e-05, "loss": 0.3489, "step": 4030 }, { "epoch": 0.5275184435594438, "grad_norm": 0.814086611350882, "learning_rate": 4.78830378406986e-05, "loss": 0.3749, "step": 4040 }, { "epoch": 0.5288241822811256, "grad_norm": 0.7922936323495489, "learning_rate": 4.775072770574226e-05, "loss": 0.3428, "step": 4050 }, { "epoch": 0.5301299210028073, "grad_norm": 0.7059926199965918, "learning_rate": 4.761841757078592e-05, "loss": 0.342, "step": 4060 }, { "epoch": 0.5314356597244891, "grad_norm": 0.6360020863993846, "learning_rate": 4.748610743582959e-05, "loss": 0.3421, "step": 4070 }, { "epoch": 0.532741398446171, "grad_norm": 0.7140327284156396, "learning_rate": 4.735379730087325e-05, "loss": 0.3511, "step": 4080 }, { "epoch": 0.5340471371678527, "grad_norm": 0.808478556612797, "learning_rate": 4.722148716591691e-05, "loss": 0.335, "step": 4090 }, { "epoch": 0.5353528758895345, "grad_norm": 0.5853575460924064, "learning_rate": 4.708917703096057e-05, "loss": 0.3592, "step": 4100 }, { "epoch": 0.5366586146112163, "grad_norm": 0.5932507028870644, "learning_rate": 4.6956866896004236e-05, "loss": 0.3074, "step": 4110 }, { "epoch": 0.537964353332898, "grad_norm": 0.6012483176918041, "learning_rate": 4.68245567610479e-05, "loss": 0.3304, "step": 4120 }, { "epoch": 0.5392700920545799, "grad_norm": 0.8443330424339736, "learning_rate": 4.6692246626091565e-05, "loss": 0.3719, "step": 4130 }, { "epoch": 0.5405758307762617, "grad_norm": 0.6326393246751175, "learning_rate": 4.655993649113523e-05, "loss": 0.3354, "step": 4140 }, { "epoch": 0.5418815694979434, "grad_norm": 0.7026969470945176, "learning_rate": 4.6427626356178886e-05, "loss": 0.3522, "step": 4150 }, { "epoch": 0.5431873082196252, "grad_norm": 0.6531041974107171, "learning_rate": 4.629531622122254e-05, "loss": 0.3746, "step": 4160 }, { "epoch": 0.5444930469413071, "grad_norm": 0.6482802600779038, "learning_rate": 4.616300608626621e-05, "loss": 0.381, "step": 4170 }, { "epoch": 0.5457987856629888, "grad_norm": 0.750177651565516, "learning_rate": 4.603069595130987e-05, "loss": 0.3336, "step": 4180 }, { "epoch": 0.5471045243846706, "grad_norm": 0.6216364974132169, "learning_rate": 4.5898385816353536e-05, "loss": 0.3846, "step": 4190 }, { "epoch": 0.5484102631063524, "grad_norm": 0.6093631084044545, "learning_rate": 4.57660756813972e-05, "loss": 0.3543, "step": 4200 }, { "epoch": 0.5497160018280343, "grad_norm": 0.6133054404317404, "learning_rate": 4.563376554644086e-05, "loss": 0.3389, "step": 4210 }, { "epoch": 0.551021740549716, "grad_norm": 0.7662023723049465, "learning_rate": 4.550145541148452e-05, "loss": 0.3556, "step": 4220 }, { "epoch": 0.5523274792713978, "grad_norm": 0.8094877728662864, "learning_rate": 4.5369145276528185e-05, "loss": 0.3969, "step": 4230 }, { "epoch": 0.5536332179930796, "grad_norm": 0.6492528412358544, "learning_rate": 4.523683514157185e-05, "loss": 0.3329, "step": 4240 }, { "epoch": 0.5549389567147613, "grad_norm": 0.5511819814686476, "learning_rate": 4.5104525006615514e-05, "loss": 0.3592, "step": 4250 }, { "epoch": 0.5562446954364432, "grad_norm": 0.7826369096464542, "learning_rate": 4.497221487165917e-05, "loss": 0.3518, "step": 4260 }, { "epoch": 0.557550434158125, "grad_norm": 0.9016876799518607, "learning_rate": 4.483990473670283e-05, "loss": 0.297, "step": 4270 }, { "epoch": 0.5588561728798067, "grad_norm": 0.7257742557583019, "learning_rate": 4.470759460174649e-05, "loss": 0.3474, "step": 4280 }, { "epoch": 0.5601619116014885, "grad_norm": 0.7356287986471893, "learning_rate": 4.4575284466790157e-05, "loss": 0.3295, "step": 4290 }, { "epoch": 0.5614676503231704, "grad_norm": 0.8205676393579607, "learning_rate": 4.444297433183382e-05, "loss": 0.361, "step": 4300 }, { "epoch": 0.5627733890448521, "grad_norm": 0.9422114310116798, "learning_rate": 4.4310664196877485e-05, "loss": 0.3777, "step": 4310 }, { "epoch": 0.5640791277665339, "grad_norm": 0.5469170573921535, "learning_rate": 4.417835406192115e-05, "loss": 0.2994, "step": 4320 }, { "epoch": 0.5653848664882157, "grad_norm": 0.8384407275890108, "learning_rate": 4.4046043926964806e-05, "loss": 0.3867, "step": 4330 }, { "epoch": 0.5666906052098974, "grad_norm": 0.8550619018220096, "learning_rate": 4.391373379200847e-05, "loss": 0.3582, "step": 4340 }, { "epoch": 0.5679963439315793, "grad_norm": 0.6419130944419994, "learning_rate": 4.3781423657052134e-05, "loss": 0.3593, "step": 4350 }, { "epoch": 0.5693020826532611, "grad_norm": 0.7641174478817735, "learning_rate": 4.36491135220958e-05, "loss": 0.3485, "step": 4360 }, { "epoch": 0.5706078213749429, "grad_norm": 0.5877119236073053, "learning_rate": 4.3516803387139456e-05, "loss": 0.3328, "step": 4370 }, { "epoch": 0.5719135600966246, "grad_norm": 0.5550533265972072, "learning_rate": 4.338449325218312e-05, "loss": 0.309, "step": 4380 }, { "epoch": 0.5732192988183065, "grad_norm": 0.705221754577661, "learning_rate": 4.325218311722678e-05, "loss": 0.3913, "step": 4390 }, { "epoch": 0.5745250375399883, "grad_norm": 0.6947936672355854, "learning_rate": 4.311987298227044e-05, "loss": 0.37, "step": 4400 }, { "epoch": 0.57583077626167, "grad_norm": 0.5319231622371322, "learning_rate": 4.2987562847314106e-05, "loss": 0.3554, "step": 4410 }, { "epoch": 0.5771365149833518, "grad_norm": 0.773927024452756, "learning_rate": 4.285525271235777e-05, "loss": 0.3852, "step": 4420 }, { "epoch": 0.5784422537050337, "grad_norm": 0.8146250090809857, "learning_rate": 4.2722942577401434e-05, "loss": 0.3634, "step": 4430 }, { "epoch": 0.5797479924267154, "grad_norm": 0.79605786142863, "learning_rate": 4.259063244244509e-05, "loss": 0.345, "step": 4440 }, { "epoch": 0.5810537311483972, "grad_norm": 0.7712384447574988, "learning_rate": 4.2458322307488755e-05, "loss": 0.3845, "step": 4450 }, { "epoch": 0.582359469870079, "grad_norm": 0.8475698172691328, "learning_rate": 4.232601217253242e-05, "loss": 0.3163, "step": 4460 }, { "epoch": 0.5836652085917607, "grad_norm": 0.6049884397133825, "learning_rate": 4.2193702037576083e-05, "loss": 0.3386, "step": 4470 }, { "epoch": 0.5849709473134426, "grad_norm": 0.6783545309574633, "learning_rate": 4.206139190261974e-05, "loss": 0.3515, "step": 4480 }, { "epoch": 0.5862766860351244, "grad_norm": 0.587049976889506, "learning_rate": 4.1929081767663405e-05, "loss": 0.3832, "step": 4490 }, { "epoch": 0.5875824247568061, "grad_norm": 0.7817671688615186, "learning_rate": 4.179677163270707e-05, "loss": 0.3322, "step": 4500 }, { "epoch": 0.5888881634784879, "grad_norm": 0.6841578121072844, "learning_rate": 4.1664461497750726e-05, "loss": 0.3222, "step": 4510 }, { "epoch": 0.5901939022001698, "grad_norm": 0.6626977950932726, "learning_rate": 4.153215136279439e-05, "loss": 0.3739, "step": 4520 }, { "epoch": 0.5914996409218516, "grad_norm": 0.7031127519993573, "learning_rate": 4.1399841227838055e-05, "loss": 0.3479, "step": 4530 }, { "epoch": 0.5928053796435333, "grad_norm": 0.7702940881345544, "learning_rate": 4.126753109288172e-05, "loss": 0.4106, "step": 4540 }, { "epoch": 0.5941111183652151, "grad_norm": 0.7017827594427748, "learning_rate": 4.113522095792538e-05, "loss": 0.3646, "step": 4550 }, { "epoch": 0.595416857086897, "grad_norm": 0.740858803131998, "learning_rate": 4.100291082296904e-05, "loss": 0.3238, "step": 4560 }, { "epoch": 0.5967225958085787, "grad_norm": 0.9817239235770329, "learning_rate": 4.0870600688012704e-05, "loss": 0.3675, "step": 4570 }, { "epoch": 0.5980283345302605, "grad_norm": 0.9092954046964721, "learning_rate": 4.073829055305637e-05, "loss": 0.3314, "step": 4580 }, { "epoch": 0.5993340732519423, "grad_norm": 0.6129411847252844, "learning_rate": 4.0605980418100026e-05, "loss": 0.3751, "step": 4590 }, { "epoch": 0.600639811973624, "grad_norm": 0.7998535882464389, "learning_rate": 4.047367028314369e-05, "loss": 0.303, "step": 4600 }, { "epoch": 0.6019455506953059, "grad_norm": 0.5589142297736447, "learning_rate": 4.0341360148187354e-05, "loss": 0.2928, "step": 4610 }, { "epoch": 0.6032512894169877, "grad_norm": 0.8602164371459833, "learning_rate": 4.020905001323101e-05, "loss": 0.3579, "step": 4620 }, { "epoch": 0.6045570281386694, "grad_norm": 0.7126329737366726, "learning_rate": 4.0076739878274675e-05, "loss": 0.372, "step": 4630 }, { "epoch": 0.6058627668603512, "grad_norm": 0.76924716032001, "learning_rate": 3.994442974331834e-05, "loss": 0.3677, "step": 4640 }, { "epoch": 0.6071685055820331, "grad_norm": 0.7127728105017538, "learning_rate": 3.9812119608362004e-05, "loss": 0.3643, "step": 4650 }, { "epoch": 0.6084742443037148, "grad_norm": 0.655725530951708, "learning_rate": 3.967980947340567e-05, "loss": 0.3551, "step": 4660 }, { "epoch": 0.6097799830253966, "grad_norm": 0.6806383049846684, "learning_rate": 3.954749933844933e-05, "loss": 0.3153, "step": 4670 }, { "epoch": 0.6110857217470784, "grad_norm": 1.0049194560335168, "learning_rate": 3.941518920349299e-05, "loss": 0.3124, "step": 4680 }, { "epoch": 0.6123914604687603, "grad_norm": 0.600891089492612, "learning_rate": 3.928287906853665e-05, "loss": 0.3178, "step": 4690 }, { "epoch": 0.613697199190442, "grad_norm": 0.7290341922042528, "learning_rate": 3.915056893358031e-05, "loss": 0.4044, "step": 4700 }, { "epoch": 0.6150029379121238, "grad_norm": 0.6272834228208501, "learning_rate": 3.9018258798623975e-05, "loss": 0.3239, "step": 4710 }, { "epoch": 0.6163086766338056, "grad_norm": 0.7121846651854263, "learning_rate": 3.888594866366764e-05, "loss": 0.3923, "step": 4720 }, { "epoch": 0.6176144153554873, "grad_norm": 0.7273747873408097, "learning_rate": 3.87536385287113e-05, "loss": 0.3749, "step": 4730 }, { "epoch": 0.6189201540771692, "grad_norm": 0.8225480428717336, "learning_rate": 3.862132839375496e-05, "loss": 0.3996, "step": 4740 }, { "epoch": 0.620225892798851, "grad_norm": 0.5477909794519085, "learning_rate": 3.8489018258798624e-05, "loss": 0.3652, "step": 4750 }, { "epoch": 0.6215316315205327, "grad_norm": 0.5995332990378268, "learning_rate": 3.835670812384229e-05, "loss": 0.3422, "step": 4760 }, { "epoch": 0.6228373702422145, "grad_norm": 0.7754391276101182, "learning_rate": 3.822439798888595e-05, "loss": 0.3711, "step": 4770 }, { "epoch": 0.6241431089638964, "grad_norm": 0.5711600207408024, "learning_rate": 3.809208785392962e-05, "loss": 0.3553, "step": 4780 }, { "epoch": 0.6254488476855781, "grad_norm": 0.5543313706034462, "learning_rate": 3.7959777718973274e-05, "loss": 0.3265, "step": 4790 }, { "epoch": 0.6267545864072599, "grad_norm": 0.7819826442893762, "learning_rate": 3.782746758401694e-05, "loss": 0.3676, "step": 4800 }, { "epoch": 0.6280603251289417, "grad_norm": 0.6835713245932021, "learning_rate": 3.7695157449060596e-05, "loss": 0.3322, "step": 4810 }, { "epoch": 0.6293660638506234, "grad_norm": 0.7515164773977503, "learning_rate": 3.756284731410426e-05, "loss": 0.3833, "step": 4820 }, { "epoch": 0.6306718025723053, "grad_norm": 0.5728997475398802, "learning_rate": 3.7430537179147924e-05, "loss": 0.3496, "step": 4830 }, { "epoch": 0.6319775412939871, "grad_norm": 0.6941210331165345, "learning_rate": 3.729822704419159e-05, "loss": 0.3131, "step": 4840 }, { "epoch": 0.6332832800156689, "grad_norm": 0.5660526549850581, "learning_rate": 3.716591690923525e-05, "loss": 0.3404, "step": 4850 }, { "epoch": 0.6345890187373506, "grad_norm": 0.7525392606482112, "learning_rate": 3.703360677427891e-05, "loss": 0.3604, "step": 4860 }, { "epoch": 0.6358947574590325, "grad_norm": 0.5623921606568412, "learning_rate": 3.6901296639322573e-05, "loss": 0.3279, "step": 4870 }, { "epoch": 0.6372004961807143, "grad_norm": 0.8113331400057858, "learning_rate": 3.676898650436624e-05, "loss": 0.3614, "step": 4880 }, { "epoch": 0.638506234902396, "grad_norm": 0.655129559026077, "learning_rate": 3.66366763694099e-05, "loss": 0.3472, "step": 4890 }, { "epoch": 0.6398119736240778, "grad_norm": 0.829051397268011, "learning_rate": 3.6504366234453566e-05, "loss": 0.3438, "step": 4900 }, { "epoch": 0.6411177123457596, "grad_norm": 0.7424223288234809, "learning_rate": 3.637205609949722e-05, "loss": 0.3236, "step": 4910 }, { "epoch": 0.6424234510674414, "grad_norm": 0.7759519637866501, "learning_rate": 3.623974596454088e-05, "loss": 0.3594, "step": 4920 }, { "epoch": 0.6437291897891232, "grad_norm": 0.6684045052464821, "learning_rate": 3.6107435829584545e-05, "loss": 0.3433, "step": 4930 }, { "epoch": 0.645034928510805, "grad_norm": 0.7747377043516256, "learning_rate": 3.597512569462821e-05, "loss": 0.3567, "step": 4940 }, { "epoch": 0.6463406672324867, "grad_norm": 0.8596470519307204, "learning_rate": 3.584281555967187e-05, "loss": 0.3456, "step": 4950 }, { "epoch": 0.6476464059541686, "grad_norm": 0.7469962285270341, "learning_rate": 3.571050542471554e-05, "loss": 0.3311, "step": 4960 }, { "epoch": 0.6489521446758504, "grad_norm": 0.8284240931408812, "learning_rate": 3.5578195289759194e-05, "loss": 0.3764, "step": 4970 }, { "epoch": 0.6502578833975321, "grad_norm": 0.6079556688715883, "learning_rate": 3.544588515480286e-05, "loss": 0.3456, "step": 4980 }, { "epoch": 0.6515636221192139, "grad_norm": 0.7302510508059331, "learning_rate": 3.531357501984652e-05, "loss": 0.4029, "step": 4990 }, { "epoch": 0.6528693608408958, "grad_norm": 0.5104303525055096, "learning_rate": 3.5181264884890187e-05, "loss": 0.3515, "step": 5000 }, { "epoch": 0.6541750995625776, "grad_norm": 0.8736525064327274, "learning_rate": 3.504895474993385e-05, "loss": 0.3389, "step": 5010 }, { "epoch": 0.6554808382842593, "grad_norm": 0.7314193327605255, "learning_rate": 3.4916644614977515e-05, "loss": 0.3505, "step": 5020 }, { "epoch": 0.6567865770059411, "grad_norm": 0.6475605519372453, "learning_rate": 3.478433448002117e-05, "loss": 0.3229, "step": 5030 }, { "epoch": 0.658092315727623, "grad_norm": 0.9880646422540837, "learning_rate": 3.465202434506483e-05, "loss": 0.3787, "step": 5040 }, { "epoch": 0.6593980544493047, "grad_norm": 0.8138543265812863, "learning_rate": 3.4519714210108494e-05, "loss": 0.3591, "step": 5050 }, { "epoch": 0.6607037931709865, "grad_norm": 0.7999982066614137, "learning_rate": 3.438740407515216e-05, "loss": 0.3766, "step": 5060 }, { "epoch": 0.6620095318926683, "grad_norm": 0.9253846039277362, "learning_rate": 3.425509394019582e-05, "loss": 0.343, "step": 5070 }, { "epoch": 0.66331527061435, "grad_norm": 0.8649877265537332, "learning_rate": 3.4122783805239486e-05, "loss": 0.3679, "step": 5080 }, { "epoch": 0.6646210093360319, "grad_norm": 0.7737849501800617, "learning_rate": 3.399047367028314e-05, "loss": 0.3304, "step": 5090 }, { "epoch": 0.6659267480577137, "grad_norm": 0.6513101067291575, "learning_rate": 3.385816353532681e-05, "loss": 0.332, "step": 5100 }, { "epoch": 0.6672324867793954, "grad_norm": 0.4570766650616556, "learning_rate": 3.372585340037047e-05, "loss": 0.3519, "step": 5110 }, { "epoch": 0.6685382255010772, "grad_norm": 0.7020142585454845, "learning_rate": 3.3593543265414136e-05, "loss": 0.3897, "step": 5120 }, { "epoch": 0.669843964222759, "grad_norm": 0.7362863258898942, "learning_rate": 3.346123313045779e-05, "loss": 0.3823, "step": 5130 }, { "epoch": 0.6711497029444408, "grad_norm": 0.6568860671977359, "learning_rate": 3.332892299550146e-05, "loss": 0.3345, "step": 5140 }, { "epoch": 0.6724554416661226, "grad_norm": 0.7121206348568369, "learning_rate": 3.3196612860545114e-05, "loss": 0.3584, "step": 5150 }, { "epoch": 0.6737611803878044, "grad_norm": 0.7480394109345886, "learning_rate": 3.306430272558878e-05, "loss": 0.3365, "step": 5160 }, { "epoch": 0.6750669191094862, "grad_norm": 0.614629002385429, "learning_rate": 3.293199259063244e-05, "loss": 0.3798, "step": 5170 }, { "epoch": 0.676372657831168, "grad_norm": 0.5328025936040346, "learning_rate": 3.279968245567611e-05, "loss": 0.3743, "step": 5180 }, { "epoch": 0.6776783965528498, "grad_norm": 0.5901154265953442, "learning_rate": 3.266737232071977e-05, "loss": 0.3345, "step": 5190 }, { "epoch": 0.6789841352745316, "grad_norm": 0.6589386299759271, "learning_rate": 3.2535062185763435e-05, "loss": 0.3424, "step": 5200 }, { "epoch": 0.6802898739962133, "grad_norm": 0.9064986797696243, "learning_rate": 3.240275205080709e-05, "loss": 0.3627, "step": 5210 }, { "epoch": 0.6815956127178951, "grad_norm": 0.6009358102278358, "learning_rate": 3.2270441915850756e-05, "loss": 0.3181, "step": 5220 }, { "epoch": 0.682901351439577, "grad_norm": 0.7648260529673933, "learning_rate": 3.213813178089442e-05, "loss": 0.3396, "step": 5230 }, { "epoch": 0.6842070901612587, "grad_norm": 0.7447660035718667, "learning_rate": 3.200582164593808e-05, "loss": 0.3591, "step": 5240 }, { "epoch": 0.6855128288829405, "grad_norm": 0.6896661068999579, "learning_rate": 3.187351151098174e-05, "loss": 0.323, "step": 5250 }, { "epoch": 0.6868185676046223, "grad_norm": 0.9190265149690328, "learning_rate": 3.1741201376025406e-05, "loss": 0.3146, "step": 5260 }, { "epoch": 0.6881243063263041, "grad_norm": 0.5287154544825814, "learning_rate": 3.1608891241069063e-05, "loss": 0.3486, "step": 5270 }, { "epoch": 0.6894300450479859, "grad_norm": 0.7672367091888096, "learning_rate": 3.147658110611273e-05, "loss": 0.3549, "step": 5280 }, { "epoch": 0.6907357837696677, "grad_norm": 0.7201321972656557, "learning_rate": 3.134427097115639e-05, "loss": 0.3378, "step": 5290 }, { "epoch": 0.6920415224913494, "grad_norm": 0.7021513072836311, "learning_rate": 3.1211960836200056e-05, "loss": 0.3448, "step": 5300 }, { "epoch": 0.6933472612130313, "grad_norm": 0.6700433848129983, "learning_rate": 3.107965070124372e-05, "loss": 0.3352, "step": 5310 }, { "epoch": 0.6946529999347131, "grad_norm": 0.7339484087928275, "learning_rate": 3.094734056628738e-05, "loss": 0.3418, "step": 5320 }, { "epoch": 0.6959587386563949, "grad_norm": 0.584669398674776, "learning_rate": 3.081503043133104e-05, "loss": 0.3381, "step": 5330 }, { "epoch": 0.6972644773780766, "grad_norm": 0.8524273962246642, "learning_rate": 3.0682720296374705e-05, "loss": 0.3646, "step": 5340 }, { "epoch": 0.6985702160997584, "grad_norm": 0.665968909550134, "learning_rate": 3.055041016141836e-05, "loss": 0.3726, "step": 5350 }, { "epoch": 0.6998759548214403, "grad_norm": 0.6076679293007453, "learning_rate": 3.0418100026462027e-05, "loss": 0.3146, "step": 5360 }, { "epoch": 0.701181693543122, "grad_norm": 0.9217082287094234, "learning_rate": 3.0285789891505688e-05, "loss": 0.3547, "step": 5370 }, { "epoch": 0.7024874322648038, "grad_norm": 0.6644173103506473, "learning_rate": 3.0153479756549352e-05, "loss": 0.323, "step": 5380 }, { "epoch": 0.7037931709864856, "grad_norm": 0.696013802209398, "learning_rate": 3.0021169621593016e-05, "loss": 0.3657, "step": 5390 }, { "epoch": 0.7050989097081674, "grad_norm": 0.7746845947653378, "learning_rate": 2.9888859486636677e-05, "loss": 0.322, "step": 5400 }, { "epoch": 0.7064046484298492, "grad_norm": 0.7986715081505823, "learning_rate": 2.975654935168034e-05, "loss": 0.386, "step": 5410 }, { "epoch": 0.707710387151531, "grad_norm": 0.625051116401338, "learning_rate": 2.9624239216724005e-05, "loss": 0.3678, "step": 5420 }, { "epoch": 0.7090161258732127, "grad_norm": 0.590432601447704, "learning_rate": 2.9491929081767666e-05, "loss": 0.3101, "step": 5430 }, { "epoch": 0.7103218645948945, "grad_norm": 0.690310682548942, "learning_rate": 2.935961894681133e-05, "loss": 0.3479, "step": 5440 }, { "epoch": 0.7116276033165764, "grad_norm": 0.6674317717647725, "learning_rate": 2.922730881185499e-05, "loss": 0.3534, "step": 5450 }, { "epoch": 0.7129333420382582, "grad_norm": 0.7618332051114953, "learning_rate": 2.9094998676898648e-05, "loss": 0.3329, "step": 5460 }, { "epoch": 0.7142390807599399, "grad_norm": 0.7512787849568524, "learning_rate": 2.8962688541942312e-05, "loss": 0.3664, "step": 5470 }, { "epoch": 0.7155448194816217, "grad_norm": 0.5729848095058256, "learning_rate": 2.8830378406985976e-05, "loss": 0.3379, "step": 5480 }, { "epoch": 0.7168505582033036, "grad_norm": 0.6445975264163951, "learning_rate": 2.8698068272029637e-05, "loss": 0.3186, "step": 5490 }, { "epoch": 0.7181562969249853, "grad_norm": 0.6706219363582558, "learning_rate": 2.85657581370733e-05, "loss": 0.3477, "step": 5500 }, { "epoch": 0.7194620356466671, "grad_norm": 0.5402479117069005, "learning_rate": 2.8433448002116965e-05, "loss": 0.3694, "step": 5510 }, { "epoch": 0.7207677743683489, "grad_norm": 0.7258389859612734, "learning_rate": 2.8301137867160626e-05, "loss": 0.3771, "step": 5520 }, { "epoch": 0.7220735130900306, "grad_norm": 0.674968495813283, "learning_rate": 2.816882773220429e-05, "loss": 0.3001, "step": 5530 }, { "epoch": 0.7233792518117125, "grad_norm": 0.8387936410792364, "learning_rate": 2.803651759724795e-05, "loss": 0.3581, "step": 5540 }, { "epoch": 0.7246849905333943, "grad_norm": 1.3352234978250883, "learning_rate": 2.7904207462291615e-05, "loss": 0.3155, "step": 5550 }, { "epoch": 0.725990729255076, "grad_norm": 0.8017176555568436, "learning_rate": 2.777189732733528e-05, "loss": 0.3628, "step": 5560 }, { "epoch": 0.7272964679767578, "grad_norm": 0.645823766941924, "learning_rate": 2.7639587192378936e-05, "loss": 0.3302, "step": 5570 }, { "epoch": 0.7286022066984397, "grad_norm": 0.5979987189597915, "learning_rate": 2.7507277057422597e-05, "loss": 0.3448, "step": 5580 }, { "epoch": 0.7299079454201214, "grad_norm": 0.7486055209678033, "learning_rate": 2.737496692246626e-05, "loss": 0.4082, "step": 5590 }, { "epoch": 0.7312136841418032, "grad_norm": 0.7626823211323887, "learning_rate": 2.7242656787509925e-05, "loss": 0.3553, "step": 5600 }, { "epoch": 0.732519422863485, "grad_norm": 0.5880294292188434, "learning_rate": 2.7110346652553586e-05, "loss": 0.3484, "step": 5610 }, { "epoch": 0.7338251615851669, "grad_norm": 0.7236797067760615, "learning_rate": 2.697803651759725e-05, "loss": 0.3301, "step": 5620 }, { "epoch": 0.7351309003068486, "grad_norm": 0.6114617442254638, "learning_rate": 2.684572638264091e-05, "loss": 0.3932, "step": 5630 }, { "epoch": 0.7364366390285304, "grad_norm": 0.700900906480859, "learning_rate": 2.6713416247684575e-05, "loss": 0.3391, "step": 5640 }, { "epoch": 0.7377423777502122, "grad_norm": 0.658992175285034, "learning_rate": 2.658110611272824e-05, "loss": 0.3497, "step": 5650 }, { "epoch": 0.739048116471894, "grad_norm": 0.6903728923807146, "learning_rate": 2.64487959777719e-05, "loss": 0.3701, "step": 5660 }, { "epoch": 0.7403538551935758, "grad_norm": 0.5582864995036794, "learning_rate": 2.6316485842815564e-05, "loss": 0.3215, "step": 5670 }, { "epoch": 0.7416595939152576, "grad_norm": 0.979083781281091, "learning_rate": 2.618417570785922e-05, "loss": 0.3779, "step": 5680 }, { "epoch": 0.7429653326369393, "grad_norm": 0.7270077031724468, "learning_rate": 2.6051865572902885e-05, "loss": 0.382, "step": 5690 }, { "epoch": 0.7442710713586211, "grad_norm": 0.45368860482202605, "learning_rate": 2.5919555437946546e-05, "loss": 0.3139, "step": 5700 }, { "epoch": 0.745576810080303, "grad_norm": 0.8230760747696388, "learning_rate": 2.578724530299021e-05, "loss": 0.2873, "step": 5710 }, { "epoch": 0.7468825488019847, "grad_norm": 0.8618249282783758, "learning_rate": 2.5654935168033874e-05, "loss": 0.3583, "step": 5720 }, { "epoch": 0.7481882875236665, "grad_norm": 0.6599004507239166, "learning_rate": 2.5522625033077535e-05, "loss": 0.3213, "step": 5730 }, { "epoch": 0.7494940262453483, "grad_norm": 0.8268087097480011, "learning_rate": 2.53903148981212e-05, "loss": 0.3156, "step": 5740 }, { "epoch": 0.75079976496703, "grad_norm": 0.6304290381323213, "learning_rate": 2.525800476316486e-05, "loss": 0.3725, "step": 5750 }, { "epoch": 0.7521055036887119, "grad_norm": 0.591230032671368, "learning_rate": 2.5125694628208524e-05, "loss": 0.333, "step": 5760 }, { "epoch": 0.7534112424103937, "grad_norm": 0.8002827555791459, "learning_rate": 2.4993384493252184e-05, "loss": 0.3627, "step": 5770 }, { "epoch": 0.7547169811320755, "grad_norm": 0.7649735624696388, "learning_rate": 2.4861074358295845e-05, "loss": 0.3685, "step": 5780 }, { "epoch": 0.7560227198537572, "grad_norm": 0.7113061956468756, "learning_rate": 2.472876422333951e-05, "loss": 0.3458, "step": 5790 }, { "epoch": 0.7573284585754391, "grad_norm": 0.5507106427599706, "learning_rate": 2.4596454088383173e-05, "loss": 0.3331, "step": 5800 }, { "epoch": 0.7586341972971209, "grad_norm": 0.7562278669590585, "learning_rate": 2.4464143953426834e-05, "loss": 0.34, "step": 5810 }, { "epoch": 0.7599399360188026, "grad_norm": 0.8856942774091859, "learning_rate": 2.4331833818470495e-05, "loss": 0.3138, "step": 5820 }, { "epoch": 0.7612456747404844, "grad_norm": 0.8415331561575933, "learning_rate": 2.419952368351416e-05, "loss": 0.3673, "step": 5830 }, { "epoch": 0.7625514134621663, "grad_norm": 0.8059486415808219, "learning_rate": 2.406721354855782e-05, "loss": 0.36, "step": 5840 }, { "epoch": 0.763857152183848, "grad_norm": 0.7433892100602566, "learning_rate": 2.3934903413601484e-05, "loss": 0.3452, "step": 5850 }, { "epoch": 0.7651628909055298, "grad_norm": 0.8312767821944339, "learning_rate": 2.3802593278645148e-05, "loss": 0.371, "step": 5860 }, { "epoch": 0.7664686296272116, "grad_norm": 0.8077682584262603, "learning_rate": 2.3670283143688805e-05, "loss": 0.3233, "step": 5870 }, { "epoch": 0.7677743683488933, "grad_norm": 0.8494171652126219, "learning_rate": 2.353797300873247e-05, "loss": 0.3716, "step": 5880 }, { "epoch": 0.7690801070705752, "grad_norm": 0.6968025318311023, "learning_rate": 2.3405662873776133e-05, "loss": 0.3474, "step": 5890 }, { "epoch": 0.770385845792257, "grad_norm": 0.6583668642621959, "learning_rate": 2.3273352738819794e-05, "loss": 0.3551, "step": 5900 }, { "epoch": 0.7716915845139387, "grad_norm": 0.6765517479373635, "learning_rate": 2.3141042603863458e-05, "loss": 0.3511, "step": 5910 }, { "epoch": 0.7729973232356205, "grad_norm": 0.644535144623749, "learning_rate": 2.300873246890712e-05, "loss": 0.3401, "step": 5920 }, { "epoch": 0.7743030619573024, "grad_norm": 0.885617685466154, "learning_rate": 2.287642233395078e-05, "loss": 0.3439, "step": 5930 }, { "epoch": 0.7756088006789842, "grad_norm": 0.7419291293186455, "learning_rate": 2.2744112198994444e-05, "loss": 0.3732, "step": 5940 }, { "epoch": 0.7769145394006659, "grad_norm": 0.7056818808578822, "learning_rate": 2.2611802064038108e-05, "loss": 0.3368, "step": 5950 }, { "epoch": 0.7782202781223477, "grad_norm": 0.6420576552747186, "learning_rate": 2.247949192908177e-05, "loss": 0.3214, "step": 5960 }, { "epoch": 0.7795260168440296, "grad_norm": 0.6469900027485412, "learning_rate": 2.2347181794125433e-05, "loss": 0.3191, "step": 5970 }, { "epoch": 0.7808317555657113, "grad_norm": 0.9011044498548404, "learning_rate": 2.2214871659169094e-05, "loss": 0.3767, "step": 5980 }, { "epoch": 0.7821374942873931, "grad_norm": 0.7552148184349973, "learning_rate": 2.2082561524212754e-05, "loss": 0.3868, "step": 5990 }, { "epoch": 0.7834432330090749, "grad_norm": 0.5916607999954083, "learning_rate": 2.195025138925642e-05, "loss": 0.3765, "step": 6000 }, { "epoch": 0.7847489717307566, "grad_norm": 0.8536523532631002, "learning_rate": 2.1817941254300082e-05, "loss": 0.4403, "step": 6010 }, { "epoch": 0.7860547104524385, "grad_norm": 0.5320420219381146, "learning_rate": 2.1685631119343743e-05, "loss": 0.3301, "step": 6020 }, { "epoch": 0.7873604491741203, "grad_norm": 0.7214062267808119, "learning_rate": 2.1553320984387404e-05, "loss": 0.359, "step": 6030 }, { "epoch": 0.788666187895802, "grad_norm": 1.0129434745263957, "learning_rate": 2.1421010849431068e-05, "loss": 0.3268, "step": 6040 }, { "epoch": 0.7899719266174838, "grad_norm": 0.4819400541061572, "learning_rate": 2.128870071447473e-05, "loss": 0.3122, "step": 6050 }, { "epoch": 0.7912776653391657, "grad_norm": 0.6191768896861363, "learning_rate": 2.1156390579518393e-05, "loss": 0.316, "step": 6060 }, { "epoch": 0.7925834040608474, "grad_norm": 0.7414687562017633, "learning_rate": 2.1024080444562057e-05, "loss": 0.3497, "step": 6070 }, { "epoch": 0.7938891427825292, "grad_norm": 0.6184025648274264, "learning_rate": 2.0891770309605714e-05, "loss": 0.405, "step": 6080 }, { "epoch": 0.795194881504211, "grad_norm": 0.7989261253041049, "learning_rate": 2.075946017464938e-05, "loss": 0.4093, "step": 6090 }, { "epoch": 0.7965006202258929, "grad_norm": 0.6649426852535906, "learning_rate": 2.0627150039693043e-05, "loss": 0.3366, "step": 6100 }, { "epoch": 0.7978063589475746, "grad_norm": 0.8485614610290491, "learning_rate": 2.0494839904736703e-05, "loss": 0.3464, "step": 6110 }, { "epoch": 0.7991120976692564, "grad_norm": 0.7069753459518193, "learning_rate": 2.0362529769780367e-05, "loss": 0.3579, "step": 6120 }, { "epoch": 0.8004178363909382, "grad_norm": 0.7227036258138947, "learning_rate": 2.0230219634824028e-05, "loss": 0.3442, "step": 6130 }, { "epoch": 0.8017235751126199, "grad_norm": 0.9197826379428903, "learning_rate": 2.009790949986769e-05, "loss": 0.342, "step": 6140 }, { "epoch": 0.8030293138343018, "grad_norm": 0.6481949524977022, "learning_rate": 1.9965599364911353e-05, "loss": 0.3202, "step": 6150 }, { "epoch": 0.8043350525559836, "grad_norm": 0.8399465931705289, "learning_rate": 1.9833289229955017e-05, "loss": 0.3873, "step": 6160 }, { "epoch": 0.8056407912776653, "grad_norm": 0.7290610435674404, "learning_rate": 1.9700979094998678e-05, "loss": 0.3476, "step": 6170 }, { "epoch": 0.8069465299993471, "grad_norm": 0.6899792824013538, "learning_rate": 1.9568668960042342e-05, "loss": 0.3538, "step": 6180 }, { "epoch": 0.808252268721029, "grad_norm": 1.2527054590309432, "learning_rate": 1.9436358825086003e-05, "loss": 0.3239, "step": 6190 }, { "epoch": 0.8095580074427107, "grad_norm": 0.8627242711599293, "learning_rate": 1.9304048690129663e-05, "loss": 0.3479, "step": 6200 }, { "epoch": 0.8108637461643925, "grad_norm": 0.9091866628899391, "learning_rate": 1.9171738555173327e-05, "loss": 0.3671, "step": 6210 }, { "epoch": 0.8121694848860743, "grad_norm": 0.6730426185991659, "learning_rate": 1.9039428420216988e-05, "loss": 0.3246, "step": 6220 }, { "epoch": 0.813475223607756, "grad_norm": 0.739618379913077, "learning_rate": 1.8907118285260652e-05, "loss": 0.3405, "step": 6230 }, { "epoch": 0.8147809623294379, "grad_norm": 0.6662013136634514, "learning_rate": 1.8774808150304316e-05, "loss": 0.3333, "step": 6240 }, { "epoch": 0.8160867010511197, "grad_norm": 0.9066509710571504, "learning_rate": 1.8642498015347977e-05, "loss": 0.3829, "step": 6250 }, { "epoch": 0.8173924397728015, "grad_norm": 0.8923658722919103, "learning_rate": 1.8510187880391638e-05, "loss": 0.328, "step": 6260 }, { "epoch": 0.8186981784944832, "grad_norm": 0.9199392786136853, "learning_rate": 1.8377877745435302e-05, "loss": 0.3561, "step": 6270 }, { "epoch": 0.8200039172161651, "grad_norm": 0.6463420752583502, "learning_rate": 1.8245567610478963e-05, "loss": 0.3315, "step": 6280 }, { "epoch": 0.8213096559378469, "grad_norm": 0.8789103610293677, "learning_rate": 1.8113257475522627e-05, "loss": 0.3323, "step": 6290 }, { "epoch": 0.8226153946595286, "grad_norm": 0.6238349532758262, "learning_rate": 1.7980947340566288e-05, "loss": 0.3331, "step": 6300 }, { "epoch": 0.8239211333812104, "grad_norm": 0.757834190895548, "learning_rate": 1.7848637205609948e-05, "loss": 0.3409, "step": 6310 }, { "epoch": 0.8252268721028923, "grad_norm": 0.6535233000839505, "learning_rate": 1.7716327070653612e-05, "loss": 0.3414, "step": 6320 }, { "epoch": 0.826532610824574, "grad_norm": 0.5888308963153944, "learning_rate": 1.7584016935697276e-05, "loss": 0.374, "step": 6330 }, { "epoch": 0.8278383495462558, "grad_norm": 0.47914357342522756, "learning_rate": 1.7451706800740937e-05, "loss": 0.3417, "step": 6340 }, { "epoch": 0.8291440882679376, "grad_norm": 0.7148002333669002, "learning_rate": 1.73193966657846e-05, "loss": 0.3261, "step": 6350 }, { "epoch": 0.8304498269896193, "grad_norm": 0.6278156725745362, "learning_rate": 1.7187086530828262e-05, "loss": 0.3457, "step": 6360 }, { "epoch": 0.8317555657113012, "grad_norm": 0.9696144507126726, "learning_rate": 1.7054776395871923e-05, "loss": 0.3385, "step": 6370 }, { "epoch": 0.833061304432983, "grad_norm": 0.9649014230989341, "learning_rate": 1.6922466260915587e-05, "loss": 0.3627, "step": 6380 }, { "epoch": 0.8343670431546647, "grad_norm": 0.6680216705169412, "learning_rate": 1.679015612595925e-05, "loss": 0.3427, "step": 6390 }, { "epoch": 0.8356727818763465, "grad_norm": 0.603124795630466, "learning_rate": 1.6657845991002912e-05, "loss": 0.3405, "step": 6400 }, { "epoch": 0.8369785205980284, "grad_norm": 0.8898996426587519, "learning_rate": 1.6525535856046572e-05, "loss": 0.3554, "step": 6410 }, { "epoch": 0.8382842593197102, "grad_norm": 0.6172993476536083, "learning_rate": 1.6393225721090237e-05, "loss": 0.2937, "step": 6420 }, { "epoch": 0.8395899980413919, "grad_norm": 0.8823600496438655, "learning_rate": 1.6260915586133897e-05, "loss": 0.3823, "step": 6430 }, { "epoch": 0.8408957367630737, "grad_norm": 0.8760184469279744, "learning_rate": 1.612860545117756e-05, "loss": 0.3597, "step": 6440 }, { "epoch": 0.8422014754847555, "grad_norm": 0.9955381983446654, "learning_rate": 1.5996295316221226e-05, "loss": 0.3068, "step": 6450 }, { "epoch": 0.8435072142064373, "grad_norm": 0.5598513800418826, "learning_rate": 1.5863985181264883e-05, "loss": 0.3456, "step": 6460 }, { "epoch": 0.8448129529281191, "grad_norm": 0.7488974691598855, "learning_rate": 1.5731675046308547e-05, "loss": 0.3598, "step": 6470 }, { "epoch": 0.8461186916498009, "grad_norm": 0.8306291263548443, "learning_rate": 1.559936491135221e-05, "loss": 0.3574, "step": 6480 }, { "epoch": 0.8474244303714826, "grad_norm": 0.7467018497151834, "learning_rate": 1.5467054776395872e-05, "loss": 0.3333, "step": 6490 }, { "epoch": 0.8487301690931645, "grad_norm": 0.6679438302907792, "learning_rate": 1.5334744641439536e-05, "loss": 0.3478, "step": 6500 }, { "epoch": 0.8500359078148463, "grad_norm": 0.46076534741389913, "learning_rate": 1.5202434506483198e-05, "loss": 0.3589, "step": 6510 }, { "epoch": 0.851341646536528, "grad_norm": 0.664165191415085, "learning_rate": 1.5070124371526859e-05, "loss": 0.3757, "step": 6520 }, { "epoch": 0.8526473852582098, "grad_norm": 0.5202770295787932, "learning_rate": 1.4937814236570521e-05, "loss": 0.3541, "step": 6530 }, { "epoch": 0.8539531239798916, "grad_norm": 0.6784784904303647, "learning_rate": 1.4805504101614184e-05, "loss": 0.3423, "step": 6540 }, { "epoch": 0.8552588627015734, "grad_norm": 0.6677478555341161, "learning_rate": 1.4673193966657846e-05, "loss": 0.3541, "step": 6550 }, { "epoch": 0.8565646014232552, "grad_norm": 0.735132276151151, "learning_rate": 1.454088383170151e-05, "loss": 0.3459, "step": 6560 }, { "epoch": 0.857870340144937, "grad_norm": 0.8899398808373948, "learning_rate": 1.440857369674517e-05, "loss": 0.3406, "step": 6570 }, { "epoch": 0.8591760788666188, "grad_norm": 0.7739823396900144, "learning_rate": 1.4276263561788834e-05, "loss": 0.3339, "step": 6580 }, { "epoch": 0.8604818175883006, "grad_norm": 1.0129091767391734, "learning_rate": 1.4143953426832496e-05, "loss": 0.3415, "step": 6590 }, { "epoch": 0.8617875563099824, "grad_norm": 0.5766706459601169, "learning_rate": 1.4011643291876158e-05, "loss": 0.3272, "step": 6600 }, { "epoch": 0.8630932950316642, "grad_norm": 0.8521630883134714, "learning_rate": 1.3879333156919821e-05, "loss": 0.3501, "step": 6610 }, { "epoch": 0.8643990337533459, "grad_norm": 0.6862097728564349, "learning_rate": 1.3747023021963485e-05, "loss": 0.3345, "step": 6620 }, { "epoch": 0.8657047724750278, "grad_norm": 0.7167847452803705, "learning_rate": 1.3614712887007144e-05, "loss": 0.3695, "step": 6630 }, { "epoch": 0.8670105111967096, "grad_norm": 0.7374905746104996, "learning_rate": 1.3482402752050808e-05, "loss": 0.3572, "step": 6640 }, { "epoch": 0.8683162499183913, "grad_norm": 0.7379677874067828, "learning_rate": 1.335009261709447e-05, "loss": 0.3234, "step": 6650 }, { "epoch": 0.8696219886400731, "grad_norm": 0.5988997538682022, "learning_rate": 1.3217782482138133e-05, "loss": 0.3266, "step": 6660 }, { "epoch": 0.8709277273617549, "grad_norm": 0.4655358170781221, "learning_rate": 1.3085472347181795e-05, "loss": 0.3427, "step": 6670 }, { "epoch": 0.8722334660834367, "grad_norm": 0.6848541770238878, "learning_rate": 1.2953162212225456e-05, "loss": 0.3339, "step": 6680 }, { "epoch": 0.8735392048051185, "grad_norm": 0.6518078426646481, "learning_rate": 1.2820852077269119e-05, "loss": 0.3191, "step": 6690 }, { "epoch": 0.8748449435268003, "grad_norm": 0.5519939322503953, "learning_rate": 1.2688541942312781e-05, "loss": 0.3433, "step": 6700 }, { "epoch": 0.876150682248482, "grad_norm": 0.6214421905586885, "learning_rate": 1.2556231807356445e-05, "loss": 0.3424, "step": 6710 }, { "epoch": 0.8774564209701639, "grad_norm": 0.6191050718046756, "learning_rate": 1.2423921672400106e-05, "loss": 0.3072, "step": 6720 }, { "epoch": 0.8787621596918457, "grad_norm": 0.6212138829785082, "learning_rate": 1.2291611537443768e-05, "loss": 0.3207, "step": 6730 }, { "epoch": 0.8800678984135275, "grad_norm": 0.6839725569555116, "learning_rate": 1.2159301402487432e-05, "loss": 0.3798, "step": 6740 }, { "epoch": 0.8813736371352092, "grad_norm": 0.8245019360403081, "learning_rate": 1.2026991267531093e-05, "loss": 0.2962, "step": 6750 }, { "epoch": 0.882679375856891, "grad_norm": 0.7030784907979168, "learning_rate": 1.1894681132574755e-05, "loss": 0.3305, "step": 6760 }, { "epoch": 0.8839851145785729, "grad_norm": 0.5649286720423399, "learning_rate": 1.176237099761842e-05, "loss": 0.3569, "step": 6770 }, { "epoch": 0.8852908533002546, "grad_norm": 0.8210213117750729, "learning_rate": 1.163006086266208e-05, "loss": 0.3345, "step": 6780 }, { "epoch": 0.8865965920219364, "grad_norm": 0.8121204301943723, "learning_rate": 1.1497750727705743e-05, "loss": 0.3231, "step": 6790 }, { "epoch": 0.8879023307436182, "grad_norm": 0.7991738073127946, "learning_rate": 1.1365440592749405e-05, "loss": 0.2986, "step": 6800 }, { "epoch": 0.8892080694653, "grad_norm": 0.8782007546171526, "learning_rate": 1.1233130457793068e-05, "loss": 0.3537, "step": 6810 }, { "epoch": 0.8905138081869818, "grad_norm": 0.7104772150141395, "learning_rate": 1.110082032283673e-05, "loss": 0.3067, "step": 6820 }, { "epoch": 0.8918195469086636, "grad_norm": 0.8322458996452081, "learning_rate": 1.0968510187880392e-05, "loss": 0.3304, "step": 6830 }, { "epoch": 0.8931252856303453, "grad_norm": 0.7556961072598447, "learning_rate": 1.0836200052924055e-05, "loss": 0.3477, "step": 6840 }, { "epoch": 0.8944310243520271, "grad_norm": 0.44104295710388514, "learning_rate": 1.0703889917967717e-05, "loss": 0.3543, "step": 6850 }, { "epoch": 0.895736763073709, "grad_norm": 0.6765612529142849, "learning_rate": 1.057157978301138e-05, "loss": 0.3633, "step": 6860 }, { "epoch": 0.8970425017953907, "grad_norm": 0.6253376263728544, "learning_rate": 1.0439269648055042e-05, "loss": 0.3497, "step": 6870 }, { "epoch": 0.8983482405170725, "grad_norm": 0.8589034278375015, "learning_rate": 1.0306959513098703e-05, "loss": 0.3254, "step": 6880 }, { "epoch": 0.8996539792387543, "grad_norm": 0.9789655584886806, "learning_rate": 1.0174649378142365e-05, "loss": 0.3671, "step": 6890 }, { "epoch": 0.9009597179604362, "grad_norm": 0.7454482541236567, "learning_rate": 1.004233924318603e-05, "loss": 0.31, "step": 6900 }, { "epoch": 0.9022654566821179, "grad_norm": 1.5045992714785235, "learning_rate": 9.91002910822969e-06, "loss": 0.3406, "step": 6910 }, { "epoch": 0.9035711954037997, "grad_norm": 0.7202640944472604, "learning_rate": 9.777718973273352e-06, "loss": 0.3236, "step": 6920 }, { "epoch": 0.9048769341254815, "grad_norm": 0.7691959020023031, "learning_rate": 9.645408838317017e-06, "loss": 0.3551, "step": 6930 }, { "epoch": 0.9061826728471633, "grad_norm": 0.8596287788941775, "learning_rate": 9.513098703360677e-06, "loss": 0.3278, "step": 6940 }, { "epoch": 0.9074884115688451, "grad_norm": 0.7895778773755453, "learning_rate": 9.38078856840434e-06, "loss": 0.3222, "step": 6950 }, { "epoch": 0.9087941502905269, "grad_norm": 0.5333469491945221, "learning_rate": 9.248478433448004e-06, "loss": 0.3393, "step": 6960 }, { "epoch": 0.9100998890122086, "grad_norm": 0.6568547287295069, "learning_rate": 9.116168298491665e-06, "loss": 0.3129, "step": 6970 }, { "epoch": 0.9114056277338904, "grad_norm": 0.6803634167277283, "learning_rate": 8.983858163535327e-06, "loss": 0.3454, "step": 6980 }, { "epoch": 0.9127113664555723, "grad_norm": 0.6948661640899675, "learning_rate": 8.85154802857899e-06, "loss": 0.3528, "step": 6990 }, { "epoch": 0.914017105177254, "grad_norm": 0.7196739429044744, "learning_rate": 8.719237893622652e-06, "loss": 0.3659, "step": 7000 }, { "epoch": 0.9153228438989358, "grad_norm": 0.5976334648967417, "learning_rate": 8.586927758666314e-06, "loss": 0.3079, "step": 7010 }, { "epoch": 0.9166285826206176, "grad_norm": 0.7173623360458893, "learning_rate": 8.454617623709977e-06, "loss": 0.3679, "step": 7020 }, { "epoch": 0.9179343213422994, "grad_norm": 0.7419919022883544, "learning_rate": 8.322307488753639e-06, "loss": 0.3529, "step": 7030 }, { "epoch": 0.9192400600639812, "grad_norm": 0.6341039921345587, "learning_rate": 8.189997353797301e-06, "loss": 0.3122, "step": 7040 }, { "epoch": 0.920545798785663, "grad_norm": 0.6066816827257387, "learning_rate": 8.057687218840964e-06, "loss": 0.3243, "step": 7050 }, { "epoch": 0.9218515375073448, "grad_norm": 0.6485866623414742, "learning_rate": 7.925377083884626e-06, "loss": 0.2987, "step": 7060 }, { "epoch": 0.9231572762290265, "grad_norm": 0.7355455742870073, "learning_rate": 7.793066948928287e-06, "loss": 0.3681, "step": 7070 }, { "epoch": 0.9244630149507084, "grad_norm": 0.8318357837349304, "learning_rate": 7.660756813971951e-06, "loss": 0.3613, "step": 7080 }, { "epoch": 0.9257687536723902, "grad_norm": 0.7914663977180142, "learning_rate": 7.528446679015614e-06, "loss": 0.3563, "step": 7090 }, { "epoch": 0.9270744923940719, "grad_norm": 0.676893067212301, "learning_rate": 7.396136544059275e-06, "loss": 0.3339, "step": 7100 }, { "epoch": 0.9283802311157537, "grad_norm": 0.7445250134710878, "learning_rate": 7.263826409102938e-06, "loss": 0.37, "step": 7110 }, { "epoch": 0.9296859698374356, "grad_norm": 0.6748908372976828, "learning_rate": 7.131516274146601e-06, "loss": 0.371, "step": 7120 }, { "epoch": 0.9309917085591173, "grad_norm": 0.9008341638482246, "learning_rate": 6.9992061391902616e-06, "loss": 0.3254, "step": 7130 }, { "epoch": 0.9322974472807991, "grad_norm": 0.7278126362465884, "learning_rate": 6.866896004233925e-06, "loss": 0.3408, "step": 7140 }, { "epoch": 0.9336031860024809, "grad_norm": 0.8116705858882163, "learning_rate": 6.734585869277587e-06, "loss": 0.3293, "step": 7150 }, { "epoch": 0.9349089247241626, "grad_norm": 1.0648603881250955, "learning_rate": 6.602275734321249e-06, "loss": 0.3313, "step": 7160 }, { "epoch": 0.9362146634458445, "grad_norm": 0.9391900652976399, "learning_rate": 6.469965599364912e-06, "loss": 0.3655, "step": 7170 }, { "epoch": 0.9375204021675263, "grad_norm": 0.7954449941951462, "learning_rate": 6.337655464408574e-06, "loss": 0.3679, "step": 7180 }, { "epoch": 0.938826140889208, "grad_norm": 0.6180432852757723, "learning_rate": 6.205345329452236e-06, "loss": 0.3188, "step": 7190 }, { "epoch": 0.9401318796108898, "grad_norm": 0.9344345100874922, "learning_rate": 6.0730351944958985e-06, "loss": 0.3128, "step": 7200 }, { "epoch": 0.9414376183325717, "grad_norm": 0.6435906842865081, "learning_rate": 5.940725059539561e-06, "loss": 0.3125, "step": 7210 }, { "epoch": 0.9427433570542535, "grad_norm": 0.6660329759103202, "learning_rate": 5.808414924583223e-06, "loss": 0.3413, "step": 7220 }, { "epoch": 0.9440490957759352, "grad_norm": 0.7919248766957494, "learning_rate": 5.676104789626886e-06, "loss": 0.2987, "step": 7230 }, { "epoch": 0.945354834497617, "grad_norm": 0.6397632114634846, "learning_rate": 5.543794654670547e-06, "loss": 0.3496, "step": 7240 }, { "epoch": 0.9466605732192989, "grad_norm": 0.6386278717066863, "learning_rate": 5.411484519714211e-06, "loss": 0.3079, "step": 7250 }, { "epoch": 0.9479663119409806, "grad_norm": 0.8024569710422961, "learning_rate": 5.279174384757873e-06, "loss": 0.3362, "step": 7260 }, { "epoch": 0.9492720506626624, "grad_norm": 0.7611381737446198, "learning_rate": 5.146864249801535e-06, "loss": 0.3467, "step": 7270 }, { "epoch": 0.9505777893843442, "grad_norm": 0.6542924612380807, "learning_rate": 5.014554114845198e-06, "loss": 0.3109, "step": 7280 }, { "epoch": 0.951883528106026, "grad_norm": 0.7191512573453717, "learning_rate": 4.88224397988886e-06, "loss": 0.33, "step": 7290 }, { "epoch": 0.9531892668277078, "grad_norm": 0.6629837983573698, "learning_rate": 4.749933844932522e-06, "loss": 0.3351, "step": 7300 }, { "epoch": 0.9544950055493896, "grad_norm": 0.6712407020785167, "learning_rate": 4.617623709976184e-06, "loss": 0.3389, "step": 7310 }, { "epoch": 0.9558007442710713, "grad_norm": 0.7238215021291633, "learning_rate": 4.485313575019847e-06, "loss": 0.3081, "step": 7320 }, { "epoch": 0.9571064829927531, "grad_norm": 0.6396688577098572, "learning_rate": 4.353003440063509e-06, "loss": 0.3163, "step": 7330 }, { "epoch": 0.958412221714435, "grad_norm": 0.5737213551972973, "learning_rate": 4.2206933051071715e-06, "loss": 0.3002, "step": 7340 }, { "epoch": 0.9597179604361167, "grad_norm": 0.5855896034900631, "learning_rate": 4.088383170150833e-06, "loss": 0.3427, "step": 7350 }, { "epoch": 0.9610236991577985, "grad_norm": 0.6909807225235872, "learning_rate": 3.956073035194496e-06, "loss": 0.338, "step": 7360 }, { "epoch": 0.9623294378794803, "grad_norm": 0.5647533212537805, "learning_rate": 3.823762900238159e-06, "loss": 0.2482, "step": 7370 }, { "epoch": 0.9636351766011622, "grad_norm": 0.64004791948944, "learning_rate": 3.6914527652818208e-06, "loss": 0.3558, "step": 7380 }, { "epoch": 0.9649409153228439, "grad_norm": 0.7218815388867011, "learning_rate": 3.5591426303254828e-06, "loss": 0.3347, "step": 7390 }, { "epoch": 0.9662466540445257, "grad_norm": 0.7279984281829391, "learning_rate": 3.4268324953691456e-06, "loss": 0.3116, "step": 7400 }, { "epoch": 0.9675523927662075, "grad_norm": 0.5731000076760607, "learning_rate": 3.2945223604128076e-06, "loss": 0.3204, "step": 7410 }, { "epoch": 0.9688581314878892, "grad_norm": 0.5636373233322279, "learning_rate": 3.16221222545647e-06, "loss": 0.3379, "step": 7420 }, { "epoch": 0.9701638702095711, "grad_norm": 0.6065007786953994, "learning_rate": 3.0299020905001325e-06, "loss": 0.3178, "step": 7430 }, { "epoch": 0.9714696089312529, "grad_norm": 0.6109965180647138, "learning_rate": 2.8975919555437945e-06, "loss": 0.315, "step": 7440 }, { "epoch": 0.9727753476529346, "grad_norm": 0.7041732521694112, "learning_rate": 2.7652818205874573e-06, "loss": 0.3555, "step": 7450 }, { "epoch": 0.9740810863746164, "grad_norm": 0.7338180109751901, "learning_rate": 2.6329716856311197e-06, "loss": 0.3423, "step": 7460 }, { "epoch": 0.9753868250962983, "grad_norm": 0.7678966837867073, "learning_rate": 2.5006615506747817e-06, "loss": 0.3228, "step": 7470 }, { "epoch": 0.97669256381798, "grad_norm": 0.712088574728428, "learning_rate": 2.368351415718444e-06, "loss": 0.3494, "step": 7480 }, { "epoch": 0.9779983025396618, "grad_norm": 0.6925539887628469, "learning_rate": 2.2360412807621066e-06, "loss": 0.3055, "step": 7490 }, { "epoch": 0.9793040412613436, "grad_norm": 0.6441957713092538, "learning_rate": 2.103731145805769e-06, "loss": 0.3455, "step": 7500 }, { "epoch": 0.9806097799830253, "grad_norm": 0.6629616103352994, "learning_rate": 1.971421010849431e-06, "loss": 0.325, "step": 7510 }, { "epoch": 0.9819155187047072, "grad_norm": 0.7019670624024492, "learning_rate": 1.8391108758930936e-06, "loss": 0.3281, "step": 7520 }, { "epoch": 0.983221257426389, "grad_norm": 0.6139780680833851, "learning_rate": 1.7068007409367558e-06, "loss": 0.335, "step": 7530 }, { "epoch": 0.9845269961480708, "grad_norm": 0.6802229682663576, "learning_rate": 1.5744906059804182e-06, "loss": 0.329, "step": 7540 }, { "epoch": 0.9858327348697525, "grad_norm": 0.7553252017561364, "learning_rate": 1.4421804710240806e-06, "loss": 0.342, "step": 7550 }, { "epoch": 0.9871384735914344, "grad_norm": 0.7302809765512707, "learning_rate": 1.3098703360677429e-06, "loss": 0.3493, "step": 7560 }, { "epoch": 0.9884442123131162, "grad_norm": 0.749406573268671, "learning_rate": 1.1775602011114053e-06, "loss": 0.342, "step": 7570 }, { "epoch": 0.9897499510347979, "grad_norm": 0.4983501069323526, "learning_rate": 1.0452500661550675e-06, "loss": 0.3418, "step": 7580 }, { "epoch": 0.9910556897564797, "grad_norm": 0.9492797781054759, "learning_rate": 9.129399311987299e-07, "loss": 0.3422, "step": 7590 }, { "epoch": 0.9923614284781616, "grad_norm": 0.55769549725853, "learning_rate": 7.806297962423922e-07, "loss": 0.3233, "step": 7600 }, { "epoch": 0.9936671671998433, "grad_norm": 0.5908153496773096, "learning_rate": 6.483196612860545e-07, "loss": 0.3595, "step": 7610 }, { "epoch": 0.9949729059215251, "grad_norm": 0.8641210205213213, "learning_rate": 5.160095263297169e-07, "loss": 0.3401, "step": 7620 }, { "epoch": 0.9962786446432069, "grad_norm": 0.6757227008366056, "learning_rate": 3.836993913733792e-07, "loss": 0.3506, "step": 7630 }, { "epoch": 0.9975843833648886, "grad_norm": 0.5109458531979302, "learning_rate": 2.5138925641704157e-07, "loss": 0.3386, "step": 7640 }, { "epoch": 0.9988901220865705, "grad_norm": 0.7189438739294972, "learning_rate": 1.1907912146070389e-07, "loss": 0.3297, "step": 7650 } ], "logging_steps": 10, "max_steps": 7658, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }