diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,95633 @@ +{ + "best_global_step": 11942, + "best_metric": 3.06417274, + "best_model_checkpoint": "/inspire/hdd/project/deepanalysis/guitao-25013/Muse/workspace/Finals/ckpt/Muse_0.6b_main_5e-4/v2-20251228-192522/checkpoint-11942", + "epoch": 7.0, + "eval_steps": 500, + "global_step": 11942, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005863383172090296, + "grad_norm": 406.2271101390586, + "learning_rate": 2.9308323563892146e-07, + "loss": 20.440696716308594, + "step": 1, + "token_acc": 0.005169762453967869 + }, + { + "epoch": 0.0011726766344180592, + "grad_norm": 412.16447882895346, + "learning_rate": 5.861664712778429e-07, + "loss": 20.451984405517578, + "step": 2, + "token_acc": 0.005461987590491526 + }, + { + "epoch": 0.001759014951627089, + "grad_norm": 418.20941122468446, + "learning_rate": 8.792497069167644e-07, + "loss": 20.480960845947266, + "step": 3, + "token_acc": 0.005607774672857289 + }, + { + "epoch": 0.0023453532688361184, + "grad_norm": 410.87151701566614, + "learning_rate": 1.1723329425556858e-06, + "loss": 20.430644989013672, + "step": 4, + "token_acc": 0.005623277022171703 + }, + { + "epoch": 0.002931691586045148, + "grad_norm": 409.3267148139974, + "learning_rate": 1.4654161781946073e-06, + "loss": 20.39999771118164, + "step": 5, + "token_acc": 0.005599030271045839 + }, + { + "epoch": 0.003518029903254178, + "grad_norm": 388.1571609711306, + "learning_rate": 1.7584994138335288e-06, + "loss": 20.17621612548828, + "step": 6, + "token_acc": 0.0054469173645299825 + }, + { + "epoch": 0.004104368220463207, + "grad_norm": 313.74906069914255, + "learning_rate": 2.0515826494724504e-06, + "loss": 19.58666229248047, + "step": 7, + "token_acc": 0.005114621384660843 + }, + { + "epoch": 0.004690706537672237, + "grad_norm": 281.5294972237458, + "learning_rate": 2.3446658851113717e-06, + "loss": 19.404088973999023, + "step": 8, + "token_acc": 0.005057536731744876 + }, + { + "epoch": 0.005277044854881266, + "grad_norm": 146.13008801719596, + "learning_rate": 2.637749120750293e-06, + "loss": 18.61431121826172, + "step": 9, + "token_acc": 0.005126710240583648 + }, + { + "epoch": 0.005863383172090296, + "grad_norm": 140.23502708044148, + "learning_rate": 2.9308323563892146e-06, + "loss": 18.53695297241211, + "step": 10, + "token_acc": 0.00499539431729611 + }, + { + "epoch": 0.006449721489299325, + "grad_norm": 127.48319880970146, + "learning_rate": 3.2239155920281363e-06, + "loss": 18.369373321533203, + "step": 11, + "token_acc": 0.005458429543062689 + }, + { + "epoch": 0.007036059806508356, + "grad_norm": 91.25765890361949, + "learning_rate": 3.5169988276670575e-06, + "loss": 17.680696487426758, + "step": 12, + "token_acc": 0.00642902841568142 + }, + { + "epoch": 0.007622398123717385, + "grad_norm": 84.66330309793499, + "learning_rate": 3.810082063305979e-06, + "loss": 17.540504455566406, + "step": 13, + "token_acc": 0.006386651475459239 + }, + { + "epoch": 0.008208736440926415, + "grad_norm": 84.53039848160421, + "learning_rate": 4.103165298944901e-06, + "loss": 17.426589965820312, + "step": 14, + "token_acc": 0.006393642687734901 + }, + { + "epoch": 0.008795074758135445, + "grad_norm": 86.68350785550199, + "learning_rate": 4.396248534583822e-06, + "loss": 17.20493507385254, + "step": 15, + "token_acc": 0.006940054885741413 + }, + { + "epoch": 0.009381413075344474, + "grad_norm": 93.15417829751227, + "learning_rate": 4.689331770222743e-06, + "loss": 16.8516845703125, + "step": 16, + "token_acc": 0.0071443162087631495 + }, + { + "epoch": 0.009967751392553504, + "grad_norm": 95.75185430500625, + "learning_rate": 4.982415005861665e-06, + "loss": 16.221324920654297, + "step": 17, + "token_acc": 0.007623914578918941 + }, + { + "epoch": 0.010554089709762533, + "grad_norm": 78.27687919781866, + "learning_rate": 5.275498241500586e-06, + "loss": 15.901122093200684, + "step": 18, + "token_acc": 0.00788330992196805 + }, + { + "epoch": 0.011140428026971563, + "grad_norm": 68.95359329805592, + "learning_rate": 5.568581477139508e-06, + "loss": 15.643229484558105, + "step": 19, + "token_acc": 0.007641320724639917 + }, + { + "epoch": 0.011726766344180592, + "grad_norm": 69.12087738108575, + "learning_rate": 5.861664712778429e-06, + "loss": 15.347555160522461, + "step": 20, + "token_acc": 0.008035478591415448 + }, + { + "epoch": 0.012313104661389622, + "grad_norm": 61.85968622002263, + "learning_rate": 6.1547479484173505e-06, + "loss": 15.043094635009766, + "step": 21, + "token_acc": 0.007827971204153596 + }, + { + "epoch": 0.01289944297859865, + "grad_norm": 49.45410654333595, + "learning_rate": 6.447831184056273e-06, + "loss": 14.705094337463379, + "step": 22, + "token_acc": 0.007564304131815609 + }, + { + "epoch": 0.013485781295807681, + "grad_norm": 40.375815666363415, + "learning_rate": 6.740914419695194e-06, + "loss": 14.266519546508789, + "step": 23, + "token_acc": 0.007753145017100947 + }, + { + "epoch": 0.014072119613016711, + "grad_norm": 37.38537767245995, + "learning_rate": 7.033997655334115e-06, + "loss": 13.957818031311035, + "step": 24, + "token_acc": 0.007307753542429549 + }, + { + "epoch": 0.01465845793022574, + "grad_norm": 33.6695480176108, + "learning_rate": 7.327080890973036e-06, + "loss": 13.675622940063477, + "step": 25, + "token_acc": 0.007562456005206327 + }, + { + "epoch": 0.01524479624743477, + "grad_norm": 28.340626533649406, + "learning_rate": 7.620164126611958e-06, + "loss": 13.420007705688477, + "step": 26, + "token_acc": 0.007324099751416719 + }, + { + "epoch": 0.0158311345646438, + "grad_norm": 25.85556440334715, + "learning_rate": 7.913247362250878e-06, + "loss": 13.171337127685547, + "step": 27, + "token_acc": 0.007610073360902352 + }, + { + "epoch": 0.01641747288185283, + "grad_norm": 22.00322568760864, + "learning_rate": 8.206330597889802e-06, + "loss": 12.96188735961914, + "step": 28, + "token_acc": 0.0071992851485550655 + }, + { + "epoch": 0.017003811199061858, + "grad_norm": 22.931083569226036, + "learning_rate": 8.499413833528722e-06, + "loss": 12.78353500366211, + "step": 29, + "token_acc": 0.007740123363203294 + }, + { + "epoch": 0.01759014951627089, + "grad_norm": 16.299858980962625, + "learning_rate": 8.792497069167644e-06, + "loss": 12.634174346923828, + "step": 30, + "token_acc": 0.007655337227827934 + }, + { + "epoch": 0.01817648783347992, + "grad_norm": 15.892915389936697, + "learning_rate": 9.085580304806565e-06, + "loss": 12.513818740844727, + "step": 31, + "token_acc": 0.00838201743396326 + }, + { + "epoch": 0.018762826150688947, + "grad_norm": 13.103249979050556, + "learning_rate": 9.378663540445487e-06, + "loss": 12.393245697021484, + "step": 32, + "token_acc": 0.0096021912022602 + }, + { + "epoch": 0.019349164467897976, + "grad_norm": 10.526629503930657, + "learning_rate": 9.671746776084409e-06, + "loss": 12.305274963378906, + "step": 33, + "token_acc": 0.009416344045408287 + }, + { + "epoch": 0.019935502785107008, + "grad_norm": 11.034153807902563, + "learning_rate": 9.96483001172333e-06, + "loss": 12.234249114990234, + "step": 34, + "token_acc": 0.009817216368435272 + }, + { + "epoch": 0.020521841102316037, + "grad_norm": 11.369897198382668, + "learning_rate": 1.0257913247362251e-05, + "loss": 12.171164512634277, + "step": 35, + "token_acc": 0.010525034982052687 + }, + { + "epoch": 0.021108179419525065, + "grad_norm": 7.9678722582631645, + "learning_rate": 1.0550996483001172e-05, + "loss": 12.120739936828613, + "step": 36, + "token_acc": 0.010051593165849991 + }, + { + "epoch": 0.021694517736734097, + "grad_norm": 6.368991130360932, + "learning_rate": 1.0844079718640094e-05, + "loss": 12.073944091796875, + "step": 37, + "token_acc": 0.009734524520581022 + }, + { + "epoch": 0.022280856053943126, + "grad_norm": 6.38252883921138, + "learning_rate": 1.1137162954279016e-05, + "loss": 12.034017562866211, + "step": 38, + "token_acc": 0.009590650620317526 + }, + { + "epoch": 0.022867194371152155, + "grad_norm": 7.924309869588609, + "learning_rate": 1.1430246189917938e-05, + "loss": 11.9993896484375, + "step": 39, + "token_acc": 0.010022894937674892 + }, + { + "epoch": 0.023453532688361183, + "grad_norm": 9.058949431991275, + "learning_rate": 1.1723329425556858e-05, + "loss": 11.971410751342773, + "step": 40, + "token_acc": 0.009991763546265915 + }, + { + "epoch": 0.024039871005570215, + "grad_norm": 5.575744466881461, + "learning_rate": 1.2016412661195779e-05, + "loss": 11.937782287597656, + "step": 41, + "token_acc": 0.009981918543302703 + }, + { + "epoch": 0.024626209322779244, + "grad_norm": 4.104321671559125, + "learning_rate": 1.2309495896834701e-05, + "loss": 11.91295337677002, + "step": 42, + "token_acc": 0.010011369961669247 + }, + { + "epoch": 0.025212547639988273, + "grad_norm": 4.428976160604956, + "learning_rate": 1.2602579132473623e-05, + "loss": 11.9051513671875, + "step": 43, + "token_acc": 0.008911209871034638 + }, + { + "epoch": 0.0257988859571973, + "grad_norm": 6.286880923171877, + "learning_rate": 1.2895662368112545e-05, + "loss": 11.88243293762207, + "step": 44, + "token_acc": 0.00932694862883434 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 2.7217172639780425, + "learning_rate": 1.3188745603751466e-05, + "loss": 11.856371879577637, + "step": 45, + "token_acc": 0.009457271649877254 + }, + { + "epoch": 0.026971562591615362, + "grad_norm": 3.5129047119292545, + "learning_rate": 1.3481828839390388e-05, + "loss": 11.833991050720215, + "step": 46, + "token_acc": 0.009954296567402753 + }, + { + "epoch": 0.02755790090882439, + "grad_norm": 3.631314687151238, + "learning_rate": 1.3774912075029308e-05, + "loss": 11.818042755126953, + "step": 47, + "token_acc": 0.00974159406980461 + }, + { + "epoch": 0.028144239226033423, + "grad_norm": 2.7061434715967256, + "learning_rate": 1.406799531066823e-05, + "loss": 11.795839309692383, + "step": 48, + "token_acc": 0.010114053269194533 + }, + { + "epoch": 0.02873057754324245, + "grad_norm": 2.025945906052117, + "learning_rate": 1.4361078546307152e-05, + "loss": 11.78170394897461, + "step": 49, + "token_acc": 0.00965653692444547 + }, + { + "epoch": 0.02931691586045148, + "grad_norm": 2.5478209922208266, + "learning_rate": 1.4654161781946073e-05, + "loss": 11.755483627319336, + "step": 50, + "token_acc": 0.010177322843888137 + }, + { + "epoch": 0.02990325417766051, + "grad_norm": 2.1008328982135307, + "learning_rate": 1.4947245017584995e-05, + "loss": 11.737262725830078, + "step": 51, + "token_acc": 0.00994983299427297 + }, + { + "epoch": 0.03048959249486954, + "grad_norm": 1.8469441975521643, + "learning_rate": 1.5240328253223915e-05, + "loss": 11.708173751831055, + "step": 52, + "token_acc": 0.010665230272017236 + }, + { + "epoch": 0.03107593081207857, + "grad_norm": 1.5613251133142243, + "learning_rate": 1.553341148886284e-05, + "loss": 11.68775749206543, + "step": 53, + "token_acc": 0.010381088131673077 + }, + { + "epoch": 0.0316622691292876, + "grad_norm": 1.4698207661359133, + "learning_rate": 1.5826494724501756e-05, + "loss": 11.66779613494873, + "step": 54, + "token_acc": 0.009833599549249058 + }, + { + "epoch": 0.03224860744649663, + "grad_norm": 1.8460961634365187, + "learning_rate": 1.611957796014068e-05, + "loss": 11.639435768127441, + "step": 55, + "token_acc": 0.009808991012105984 + }, + { + "epoch": 0.03283494576370566, + "grad_norm": 1.5322585399900048, + "learning_rate": 1.6412661195779604e-05, + "loss": 11.597869873046875, + "step": 56, + "token_acc": 0.010166980877996229 + }, + { + "epoch": 0.03342128408091469, + "grad_norm": 1.7763741255224632, + "learning_rate": 1.6705744431418524e-05, + "loss": 11.565065383911133, + "step": 57, + "token_acc": 0.009926121865523092 + }, + { + "epoch": 0.034007622398123716, + "grad_norm": 1.512973788254566, + "learning_rate": 1.6998827667057444e-05, + "loss": 11.529674530029297, + "step": 58, + "token_acc": 0.009541249531284511 + }, + { + "epoch": 0.034593960715332744, + "grad_norm": 1.7183466708739061, + "learning_rate": 1.7291910902696368e-05, + "loss": 11.476898193359375, + "step": 59, + "token_acc": 0.010312098545578402 + }, + { + "epoch": 0.03518029903254178, + "grad_norm": 1.580856942478647, + "learning_rate": 1.758499413833529e-05, + "loss": 11.43745231628418, + "step": 60, + "token_acc": 0.009987452006606464 + }, + { + "epoch": 0.03576663734975081, + "grad_norm": 1.688630094041116, + "learning_rate": 1.787807737397421e-05, + "loss": 11.390299797058105, + "step": 61, + "token_acc": 0.010081490959432815 + }, + { + "epoch": 0.03635297566695984, + "grad_norm": 1.5114146317310522, + "learning_rate": 1.817116060961313e-05, + "loss": 11.349411010742188, + "step": 62, + "token_acc": 0.009682069716251167 + }, + { + "epoch": 0.036939313984168866, + "grad_norm": 1.2803492701078971, + "learning_rate": 1.846424384525205e-05, + "loss": 11.30108642578125, + "step": 63, + "token_acc": 0.009598904815762525 + }, + { + "epoch": 0.037525652301377894, + "grad_norm": 1.3579777563209132, + "learning_rate": 1.8757327080890974e-05, + "loss": 11.251730918884277, + "step": 64, + "token_acc": 0.009494669682006818 + }, + { + "epoch": 0.03811199061858692, + "grad_norm": 1.4758649599916502, + "learning_rate": 1.9050410316529894e-05, + "loss": 11.187777519226074, + "step": 65, + "token_acc": 0.009765726590864169 + }, + { + "epoch": 0.03869832893579595, + "grad_norm": 1.685003032531641, + "learning_rate": 1.9343493552168818e-05, + "loss": 11.127204895019531, + "step": 66, + "token_acc": 0.009664113140836771 + }, + { + "epoch": 0.03928466725300499, + "grad_norm": 2.1922224414208324, + "learning_rate": 1.9636576787807738e-05, + "loss": 11.059297561645508, + "step": 67, + "token_acc": 0.009575511602198475 + }, + { + "epoch": 0.039871005570214016, + "grad_norm": 2.3194743247234113, + "learning_rate": 1.992966002344666e-05, + "loss": 10.98441219329834, + "step": 68, + "token_acc": 0.009746186611111684 + }, + { + "epoch": 0.040457343887423045, + "grad_norm": 1.5235055597217073, + "learning_rate": 2.0222743259085582e-05, + "loss": 10.910924911499023, + "step": 69, + "token_acc": 0.01034937890285831 + }, + { + "epoch": 0.04104368220463207, + "grad_norm": 1.9147325017366896, + "learning_rate": 2.0515826494724503e-05, + "loss": 10.82236385345459, + "step": 70, + "token_acc": 0.009599121765713144 + }, + { + "epoch": 0.0416300205218411, + "grad_norm": 2.8991858566970525, + "learning_rate": 2.0808909730363423e-05, + "loss": 10.746939659118652, + "step": 71, + "token_acc": 0.009460101329955669 + }, + { + "epoch": 0.04221635883905013, + "grad_norm": 2.246200722657236, + "learning_rate": 2.1101992966002344e-05, + "loss": 10.647573471069336, + "step": 72, + "token_acc": 0.010042107779976887 + }, + { + "epoch": 0.04280269715625916, + "grad_norm": 2.010490388482763, + "learning_rate": 2.1395076201641264e-05, + "loss": 10.567008972167969, + "step": 73, + "token_acc": 0.00965858873464549 + }, + { + "epoch": 0.043389035473468195, + "grad_norm": 2.1101483851767027, + "learning_rate": 2.1688159437280188e-05, + "loss": 10.481674194335938, + "step": 74, + "token_acc": 0.00989701499776354 + }, + { + "epoch": 0.04397537379067722, + "grad_norm": 2.6334525423699335, + "learning_rate": 2.1981242672919108e-05, + "loss": 10.381482124328613, + "step": 75, + "token_acc": 0.010316418404159646 + }, + { + "epoch": 0.04456171210788625, + "grad_norm": 2.912172133202095, + "learning_rate": 2.2274325908558032e-05, + "loss": 10.294354438781738, + "step": 76, + "token_acc": 0.009850675640149324 + }, + { + "epoch": 0.04514805042509528, + "grad_norm": 2.3797273274710036, + "learning_rate": 2.2567409144196952e-05, + "loss": 10.213769912719727, + "step": 77, + "token_acc": 0.010241388484334845 + }, + { + "epoch": 0.04573438874230431, + "grad_norm": 2.945535177249324, + "learning_rate": 2.2860492379835876e-05, + "loss": 10.099189758300781, + "step": 78, + "token_acc": 0.009869874397761462 + }, + { + "epoch": 0.04632072705951334, + "grad_norm": 2.971770277897452, + "learning_rate": 2.3153575615474797e-05, + "loss": 9.993583679199219, + "step": 79, + "token_acc": 0.010431967935535318 + }, + { + "epoch": 0.046907065376722366, + "grad_norm": 2.547720468269889, + "learning_rate": 2.3446658851113717e-05, + "loss": 9.909547805786133, + "step": 80, + "token_acc": 0.010044648712742459 + }, + { + "epoch": 0.047493403693931395, + "grad_norm": 2.1454924184368163, + "learning_rate": 2.3739742086752637e-05, + "loss": 9.846504211425781, + "step": 81, + "token_acc": 0.010159731663403445 + }, + { + "epoch": 0.04807974201114043, + "grad_norm": 2.2665870973230287, + "learning_rate": 2.4032825322391558e-05, + "loss": 9.754265785217285, + "step": 82, + "token_acc": 0.009307710734342977 + }, + { + "epoch": 0.04866608032834946, + "grad_norm": 2.2607988486840003, + "learning_rate": 2.432590855803048e-05, + "loss": 9.701478958129883, + "step": 83, + "token_acc": 0.009814831657847852 + }, + { + "epoch": 0.04925241864555849, + "grad_norm": 1.7210474669988394, + "learning_rate": 2.4618991793669402e-05, + "loss": 9.639317512512207, + "step": 84, + "token_acc": 0.009418293617988417 + }, + { + "epoch": 0.049838756962767516, + "grad_norm": 1.9291926373706727, + "learning_rate": 2.4912075029308322e-05, + "loss": 9.624462127685547, + "step": 85, + "token_acc": 0.00989620739097431 + }, + { + "epoch": 0.050425095279976545, + "grad_norm": 2.005482804094446, + "learning_rate": 2.5205158264947246e-05, + "loss": 9.52865219116211, + "step": 86, + "token_acc": 0.00879396344849723 + }, + { + "epoch": 0.051011433597185574, + "grad_norm": 1.516405972085273, + "learning_rate": 2.5498241500586167e-05, + "loss": 9.478717803955078, + "step": 87, + "token_acc": 0.00946927640135774 + }, + { + "epoch": 0.0515977719143946, + "grad_norm": 2.466584861081511, + "learning_rate": 2.579132473622509e-05, + "loss": 9.46435546875, + "step": 88, + "token_acc": 0.00986529307929467 + }, + { + "epoch": 0.05218411023160364, + "grad_norm": 1.4310252606493354, + "learning_rate": 2.608440797186401e-05, + "loss": 9.38080883026123, + "step": 89, + "token_acc": 0.009301760346752612 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 1.6057792268393976, + "learning_rate": 2.637749120750293e-05, + "loss": 9.322309494018555, + "step": 90, + "token_acc": 0.009803260379883084 + }, + { + "epoch": 0.053356786866021695, + "grad_norm": 1.2211016643267802, + "learning_rate": 2.667057444314185e-05, + "loss": 9.337228775024414, + "step": 91, + "token_acc": 0.010295949169358073 + }, + { + "epoch": 0.053943125183230724, + "grad_norm": 1.37593460058677, + "learning_rate": 2.6963657678780775e-05, + "loss": 9.242362022399902, + "step": 92, + "token_acc": 0.0102168605941473 + }, + { + "epoch": 0.05452946350043975, + "grad_norm": 1.348385407206309, + "learning_rate": 2.7256740914419696e-05, + "loss": 9.236753463745117, + "step": 93, + "token_acc": 0.01001013718997094 + }, + { + "epoch": 0.05511580181764878, + "grad_norm": 1.1206301898574862, + "learning_rate": 2.7549824150058616e-05, + "loss": 9.238260269165039, + "step": 94, + "token_acc": 0.009541506872784301 + }, + { + "epoch": 0.05570214013485781, + "grad_norm": 1.345623166185279, + "learning_rate": 2.7842907385697537e-05, + "loss": 9.263555526733398, + "step": 95, + "token_acc": 0.010272638240962079 + }, + { + "epoch": 0.056288478452066845, + "grad_norm": 1.1896193598625675, + "learning_rate": 2.813599062133646e-05, + "loss": 9.129605293273926, + "step": 96, + "token_acc": 0.012232874370405438 + }, + { + "epoch": 0.056874816769275874, + "grad_norm": 0.9356573544307287, + "learning_rate": 2.8429073856975384e-05, + "loss": 9.214315414428711, + "step": 97, + "token_acc": 0.011531789767618967 + }, + { + "epoch": 0.0574611550864849, + "grad_norm": 1.2848383130859589, + "learning_rate": 2.8722157092614305e-05, + "loss": 9.171603202819824, + "step": 98, + "token_acc": 0.011461781454034188 + }, + { + "epoch": 0.05804749340369393, + "grad_norm": 0.9553656306915715, + "learning_rate": 2.9015240328253225e-05, + "loss": 9.156946182250977, + "step": 99, + "token_acc": 0.011090126436273988 + }, + { + "epoch": 0.05863383172090296, + "grad_norm": 0.9865674826661757, + "learning_rate": 2.9308323563892145e-05, + "loss": 9.10972785949707, + "step": 100, + "token_acc": 0.01165684761034624 + }, + { + "epoch": 0.05922017003811199, + "grad_norm": 1.155982251271156, + "learning_rate": 2.9601406799531066e-05, + "loss": 9.144954681396484, + "step": 101, + "token_acc": 0.01193571507044243 + }, + { + "epoch": 0.05980650835532102, + "grad_norm": 0.8102995038859054, + "learning_rate": 2.989449003516999e-05, + "loss": 9.072721481323242, + "step": 102, + "token_acc": 0.01121621144663665 + }, + { + "epoch": 0.06039284667253005, + "grad_norm": 0.775186952799585, + "learning_rate": 3.018757327080891e-05, + "loss": 9.186348915100098, + "step": 103, + "token_acc": 0.011551146598398943 + }, + { + "epoch": 0.06097918498973908, + "grad_norm": 0.761217397882543, + "learning_rate": 3.048065650644783e-05, + "loss": 9.047115325927734, + "step": 104, + "token_acc": 0.011618798955613577 + }, + { + "epoch": 0.06156552330694811, + "grad_norm": 0.7213021351684632, + "learning_rate": 3.077373974208675e-05, + "loss": 9.156316757202148, + "step": 105, + "token_acc": 0.011375447209672786 + }, + { + "epoch": 0.06215186162415714, + "grad_norm": 0.7309509073181476, + "learning_rate": 3.106682297772568e-05, + "loss": 9.148033142089844, + "step": 106, + "token_acc": 0.012079490499722524 + }, + { + "epoch": 0.06273819994136617, + "grad_norm": 0.71620452889025, + "learning_rate": 3.13599062133646e-05, + "loss": 9.068413734436035, + "step": 107, + "token_acc": 0.011607522944027414 + }, + { + "epoch": 0.0633245382585752, + "grad_norm": 0.6771575928881343, + "learning_rate": 3.165298944900351e-05, + "loss": 9.069038391113281, + "step": 108, + "token_acc": 0.011099353621377666 + }, + { + "epoch": 0.06391087657578423, + "grad_norm": 0.6711069252190972, + "learning_rate": 3.194607268464244e-05, + "loss": 9.122047424316406, + "step": 109, + "token_acc": 0.011947090355173344 + }, + { + "epoch": 0.06449721489299326, + "grad_norm": 0.6158664940699149, + "learning_rate": 3.223915592028136e-05, + "loss": 9.059281349182129, + "step": 110, + "token_acc": 0.011221092233253866 + }, + { + "epoch": 0.06508355321020229, + "grad_norm": 0.6541226788161765, + "learning_rate": 3.253223915592028e-05, + "loss": 9.042486190795898, + "step": 111, + "token_acc": 0.011933036580314684 + }, + { + "epoch": 0.06566989152741132, + "grad_norm": 0.5883955641324334, + "learning_rate": 3.282532239155921e-05, + "loss": 9.05929946899414, + "step": 112, + "token_acc": 0.011732247975197016 + }, + { + "epoch": 0.06625622984462035, + "grad_norm": 0.7627940108714882, + "learning_rate": 3.311840562719812e-05, + "loss": 9.178930282592773, + "step": 113, + "token_acc": 0.011931200114896979 + }, + { + "epoch": 0.06684256816182937, + "grad_norm": 0.5941328029995522, + "learning_rate": 3.341148886283705e-05, + "loss": 9.0415620803833, + "step": 114, + "token_acc": 0.011746869853882949 + }, + { + "epoch": 0.0674289064790384, + "grad_norm": 0.5648062486746203, + "learning_rate": 3.370457209847597e-05, + "loss": 9.050302505493164, + "step": 115, + "token_acc": 0.01133997295476865 + }, + { + "epoch": 0.06801524479624743, + "grad_norm": 0.6145887557531267, + "learning_rate": 3.399765533411489e-05, + "loss": 9.054960250854492, + "step": 116, + "token_acc": 0.012100911975666616 + }, + { + "epoch": 0.06860158311345646, + "grad_norm": 0.6102056970676166, + "learning_rate": 3.429073856975381e-05, + "loss": 9.081972122192383, + "step": 117, + "token_acc": 0.01301256789860475 + }, + { + "epoch": 0.06918792143066549, + "grad_norm": 0.6291909427439372, + "learning_rate": 3.4583821805392736e-05, + "loss": 8.970478057861328, + "step": 118, + "token_acc": 0.01245429528011244 + }, + { + "epoch": 0.06977425974787452, + "grad_norm": 0.6645018971262254, + "learning_rate": 3.487690504103165e-05, + "loss": 9.07176399230957, + "step": 119, + "token_acc": 0.012789338134439861 + }, + { + "epoch": 0.07036059806508356, + "grad_norm": 0.6594780162967981, + "learning_rate": 3.516998827667058e-05, + "loss": 9.01324462890625, + "step": 120, + "token_acc": 0.011675599037382705 + }, + { + "epoch": 0.07094693638229259, + "grad_norm": 0.6039936107825037, + "learning_rate": 3.54630715123095e-05, + "loss": 9.027917861938477, + "step": 121, + "token_acc": 0.01210492997977365 + }, + { + "epoch": 0.07153327469950162, + "grad_norm": 0.5376271255533, + "learning_rate": 3.575615474794842e-05, + "loss": 9.073873519897461, + "step": 122, + "token_acc": 0.012944805841006463 + }, + { + "epoch": 0.07211961301671065, + "grad_norm": 0.5554818213186559, + "learning_rate": 3.6049237983587345e-05, + "loss": 9.039239883422852, + "step": 123, + "token_acc": 0.011444788680887667 + }, + { + "epoch": 0.07270595133391967, + "grad_norm": 0.5884672473219501, + "learning_rate": 3.634232121922626e-05, + "loss": 9.021541595458984, + "step": 124, + "token_acc": 0.011900584556713426 + }, + { + "epoch": 0.0732922896511287, + "grad_norm": 0.5396993704088476, + "learning_rate": 3.6635404454865186e-05, + "loss": 9.010172843933105, + "step": 125, + "token_acc": 0.011912235628030156 + }, + { + "epoch": 0.07387862796833773, + "grad_norm": 0.5989403347793538, + "learning_rate": 3.69284876905041e-05, + "loss": 9.069947242736816, + "step": 126, + "token_acc": 0.012838447957519643 + }, + { + "epoch": 0.07446496628554676, + "grad_norm": 0.6878989358274504, + "learning_rate": 3.722157092614303e-05, + "loss": 9.054113388061523, + "step": 127, + "token_acc": 0.01162454683969947 + }, + { + "epoch": 0.07505130460275579, + "grad_norm": 1.0466514286625217, + "learning_rate": 3.751465416178195e-05, + "loss": 9.123021125793457, + "step": 128, + "token_acc": 0.012989581607180084 + }, + { + "epoch": 0.07563764291996482, + "grad_norm": 1.4583369467166205, + "learning_rate": 3.780773739742087e-05, + "loss": 9.07336139678955, + "step": 129, + "token_acc": 0.012118592430957352 + }, + { + "epoch": 0.07622398123717385, + "grad_norm": 0.6613674916373863, + "learning_rate": 3.810082063305979e-05, + "loss": 9.000986099243164, + "step": 130, + "token_acc": 0.012802827245847347 + }, + { + "epoch": 0.07681031955438287, + "grad_norm": 0.798447665040762, + "learning_rate": 3.839390386869871e-05, + "loss": 9.099397659301758, + "step": 131, + "token_acc": 0.012345867804403122 + }, + { + "epoch": 0.0773966578715919, + "grad_norm": 1.1620166447954958, + "learning_rate": 3.8686987104337636e-05, + "loss": 9.042312622070312, + "step": 132, + "token_acc": 0.013380480085938385 + }, + { + "epoch": 0.07798299618880093, + "grad_norm": 0.6241317601800273, + "learning_rate": 3.8980070339976556e-05, + "loss": 9.096951484680176, + "step": 133, + "token_acc": 0.01236691735428148 + }, + { + "epoch": 0.07856933450600997, + "grad_norm": 0.644152757164242, + "learning_rate": 3.9273153575615476e-05, + "loss": 9.075241088867188, + "step": 134, + "token_acc": 0.013975434391851407 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 0.6360182345956721, + "learning_rate": 3.95662368112544e-05, + "loss": 9.120025634765625, + "step": 135, + "token_acc": 0.013047885454446469 + }, + { + "epoch": 0.07974201114042803, + "grad_norm": 0.8471619740332605, + "learning_rate": 3.985932004689332e-05, + "loss": 8.983390808105469, + "step": 136, + "token_acc": 0.012065508987385343 + }, + { + "epoch": 0.08032834945763706, + "grad_norm": 1.5445162116932314, + "learning_rate": 4.015240328253224e-05, + "loss": 9.056979179382324, + "step": 137, + "token_acc": 0.011673870858445332 + }, + { + "epoch": 0.08091468777484609, + "grad_norm": 1.345636253725767, + "learning_rate": 4.0445486518171165e-05, + "loss": 9.058370590209961, + "step": 138, + "token_acc": 0.013804131801278749 + }, + { + "epoch": 0.08150102609205512, + "grad_norm": 1.7977199714744192, + "learning_rate": 4.073856975381008e-05, + "loss": 9.08059310913086, + "step": 139, + "token_acc": 0.013110879996704903 + }, + { + "epoch": 0.08208736440926415, + "grad_norm": 0.6622174014647327, + "learning_rate": 4.1031652989449006e-05, + "loss": 8.98945426940918, + "step": 140, + "token_acc": 0.01424087982832618 + }, + { + "epoch": 0.08267370272647317, + "grad_norm": 27.670994701344057, + "learning_rate": 4.1324736225087926e-05, + "loss": 9.153547286987305, + "step": 141, + "token_acc": 0.013166422844021407 + }, + { + "epoch": 0.0832600410436822, + "grad_norm": 0.9503391387441521, + "learning_rate": 4.1617819460726846e-05, + "loss": 9.067795753479004, + "step": 142, + "token_acc": 0.01426748347364399 + }, + { + "epoch": 0.08384637936089123, + "grad_norm": 0.8234029150083744, + "learning_rate": 4.1910902696365774e-05, + "loss": 9.054396629333496, + "step": 143, + "token_acc": 0.014940769333155864 + }, + { + "epoch": 0.08443271767810026, + "grad_norm": 1.1606118933038128, + "learning_rate": 4.220398593200469e-05, + "loss": 8.98237419128418, + "step": 144, + "token_acc": 0.015627841275948674 + }, + { + "epoch": 0.08501905599530929, + "grad_norm": 0.7555726774032652, + "learning_rate": 4.2497069167643614e-05, + "loss": 9.117998123168945, + "step": 145, + "token_acc": 0.01568815131729457 + }, + { + "epoch": 0.08560539431251832, + "grad_norm": 3.368266363746793, + "learning_rate": 4.279015240328253e-05, + "loss": 9.014965057373047, + "step": 146, + "token_acc": 0.015468345677977644 + }, + { + "epoch": 0.08619173262972735, + "grad_norm": 0.6560979521936137, + "learning_rate": 4.3083235638921455e-05, + "loss": 8.998909950256348, + "step": 147, + "token_acc": 0.013764487051080269 + }, + { + "epoch": 0.08677807094693639, + "grad_norm": 1.0923013975992322, + "learning_rate": 4.3376318874560376e-05, + "loss": 9.04861831665039, + "step": 148, + "token_acc": 0.013260634881591713 + }, + { + "epoch": 0.08736440926414542, + "grad_norm": 0.6015653002957679, + "learning_rate": 4.3669402110199296e-05, + "loss": 9.02303695678711, + "step": 149, + "token_acc": 0.01542785262839678 + }, + { + "epoch": 0.08795074758135445, + "grad_norm": 1.5446096175911532, + "learning_rate": 4.3962485345838216e-05, + "loss": 9.037572860717773, + "step": 150, + "token_acc": 0.01518732292035231 + }, + { + "epoch": 0.08853708589856348, + "grad_norm": 0.6411615461501838, + "learning_rate": 4.4255568581477144e-05, + "loss": 9.064362525939941, + "step": 151, + "token_acc": 0.015274854043740214 + }, + { + "epoch": 0.0891234242157725, + "grad_norm": 1.0535392960185859, + "learning_rate": 4.4548651817116064e-05, + "loss": 9.030740737915039, + "step": 152, + "token_acc": 0.014803713909681665 + }, + { + "epoch": 0.08970976253298153, + "grad_norm": 1.0571200978375455, + "learning_rate": 4.4841735052754984e-05, + "loss": 9.012420654296875, + "step": 153, + "token_acc": 0.015575135558928999 + }, + { + "epoch": 0.09029610085019056, + "grad_norm": 2.2788284348010657, + "learning_rate": 4.5134818288393905e-05, + "loss": 9.037130355834961, + "step": 154, + "token_acc": 0.01606599742626949 + }, + { + "epoch": 0.09088243916739959, + "grad_norm": 0.8269007597807994, + "learning_rate": 4.5427901524032825e-05, + "loss": 9.056532859802246, + "step": 155, + "token_acc": 0.014675794875192046 + }, + { + "epoch": 0.09146877748460862, + "grad_norm": 1.6203791781845984, + "learning_rate": 4.572098475967175e-05, + "loss": 8.994211196899414, + "step": 156, + "token_acc": 0.015230111957664211 + }, + { + "epoch": 0.09205511580181765, + "grad_norm": 3.0794533802847983, + "learning_rate": 4.6014067995310666e-05, + "loss": 9.0416898727417, + "step": 157, + "token_acc": 0.014912312972004267 + }, + { + "epoch": 0.09264145411902668, + "grad_norm": 1.891714166389983, + "learning_rate": 4.630715123094959e-05, + "loss": 8.981319427490234, + "step": 158, + "token_acc": 0.014646135062310542 + }, + { + "epoch": 0.0932277924362357, + "grad_norm": 10.774120420203591, + "learning_rate": 4.660023446658851e-05, + "loss": 9.010435104370117, + "step": 159, + "token_acc": 0.015444536067503721 + }, + { + "epoch": 0.09381413075344473, + "grad_norm": 7.45957162719553, + "learning_rate": 4.6893317702227434e-05, + "loss": 9.080111503601074, + "step": 160, + "token_acc": 0.01499811001489156 + }, + { + "epoch": 0.09440046907065376, + "grad_norm": 0.8937305353273864, + "learning_rate": 4.718640093786636e-05, + "loss": 9.031442642211914, + "step": 161, + "token_acc": 0.015406480668060142 + }, + { + "epoch": 0.09498680738786279, + "grad_norm": 3.5487521889087907, + "learning_rate": 4.7479484173505275e-05, + "loss": 9.011468887329102, + "step": 162, + "token_acc": 0.015158708989063751 + }, + { + "epoch": 0.09557314570507183, + "grad_norm": 2.959279305380981, + "learning_rate": 4.77725674091442e-05, + "loss": 9.005142211914062, + "step": 163, + "token_acc": 0.013494043376438244 + }, + { + "epoch": 0.09615948402228086, + "grad_norm": 3.2622271122520994, + "learning_rate": 4.8065650644783116e-05, + "loss": 8.998067855834961, + "step": 164, + "token_acc": 0.013487252744424283 + }, + { + "epoch": 0.09674582233948989, + "grad_norm": 2.027402533781415, + "learning_rate": 4.835873388042204e-05, + "loss": 9.002195358276367, + "step": 165, + "token_acc": 0.015836285272474035 + }, + { + "epoch": 0.09733216065669892, + "grad_norm": 2.9061635664284533, + "learning_rate": 4.865181711606096e-05, + "loss": 9.032505989074707, + "step": 166, + "token_acc": 0.014648964429502346 + }, + { + "epoch": 0.09791849897390795, + "grad_norm": 1.3054722296281038, + "learning_rate": 4.8944900351699884e-05, + "loss": 8.937246322631836, + "step": 167, + "token_acc": 0.01595735133097593 + }, + { + "epoch": 0.09850483729111698, + "grad_norm": 3.730000570838039, + "learning_rate": 4.9237983587338804e-05, + "loss": 8.91473388671875, + "step": 168, + "token_acc": 0.015526105569324676 + }, + { + "epoch": 0.099091175608326, + "grad_norm": 1.1265081450438386, + "learning_rate": 4.9531066822977724e-05, + "loss": 8.87942886352539, + "step": 169, + "token_acc": 0.01571133998300599 + }, + { + "epoch": 0.09967751392553503, + "grad_norm": 4.572016892486631, + "learning_rate": 4.9824150058616645e-05, + "loss": 8.914380073547363, + "step": 170, + "token_acc": 0.013701915556992253 + }, + { + "epoch": 0.10026385224274406, + "grad_norm": 1.551018304245375, + "learning_rate": 5.011723329425557e-05, + "loss": 8.881754875183105, + "step": 171, + "token_acc": 0.01519993986381227 + }, + { + "epoch": 0.10085019055995309, + "grad_norm": 11.730889156050635, + "learning_rate": 5.041031652989449e-05, + "loss": 8.938714027404785, + "step": 172, + "token_acc": 0.014901633339907332 + }, + { + "epoch": 0.10143652887716212, + "grad_norm": 13.232236209459126, + "learning_rate": 5.070339976553341e-05, + "loss": 8.957929611206055, + "step": 173, + "token_acc": 0.01576526516449063 + }, + { + "epoch": 0.10202286719437115, + "grad_norm": 4.295964381097921, + "learning_rate": 5.099648300117233e-05, + "loss": 8.853195190429688, + "step": 174, + "token_acc": 0.014773312860853744 + }, + { + "epoch": 0.10260920551158018, + "grad_norm": 16.44930152861039, + "learning_rate": 5.1289566236811254e-05, + "loss": 8.970190048217773, + "step": 175, + "token_acc": 0.014638800968162353 + }, + { + "epoch": 0.1031955438287892, + "grad_norm": 18.684173259328407, + "learning_rate": 5.158264947245018e-05, + "loss": 9.034139633178711, + "step": 176, + "token_acc": 0.015680815876515986 + }, + { + "epoch": 0.10378188214599825, + "grad_norm": 12.684678314847286, + "learning_rate": 5.1875732708089094e-05, + "loss": 8.913228988647461, + "step": 177, + "token_acc": 0.014684177351452724 + }, + { + "epoch": 0.10436822046320728, + "grad_norm": 1.8227889064380007, + "learning_rate": 5.216881594372802e-05, + "loss": 8.800104141235352, + "step": 178, + "token_acc": 0.016324880312539435 + }, + { + "epoch": 0.1049545587804163, + "grad_norm": 9.010683810362792, + "learning_rate": 5.2461899179366935e-05, + "loss": 8.902030944824219, + "step": 179, + "token_acc": 0.016581966851631345 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 6.100096475661366, + "learning_rate": 5.275498241500586e-05, + "loss": 8.933889389038086, + "step": 180, + "token_acc": 0.015566585867296236 + }, + { + "epoch": 0.10612723541483436, + "grad_norm": 3.529948177834921, + "learning_rate": 5.304806565064479e-05, + "loss": 8.916126251220703, + "step": 181, + "token_acc": 0.01492761684280152 + }, + { + "epoch": 0.10671357373204339, + "grad_norm": 1.2590959787599525, + "learning_rate": 5.33411488862837e-05, + "loss": 8.729218482971191, + "step": 182, + "token_acc": 0.015315456723284795 + }, + { + "epoch": 0.10729991204925242, + "grad_norm": 3.1380519156783886, + "learning_rate": 5.363423212192263e-05, + "loss": 8.827041625976562, + "step": 183, + "token_acc": 0.0161524568000338 + }, + { + "epoch": 0.10788625036646145, + "grad_norm": 2.763974886897374, + "learning_rate": 5.392731535756155e-05, + "loss": 8.771095275878906, + "step": 184, + "token_acc": 0.016393485567288472 + }, + { + "epoch": 0.10847258868367048, + "grad_norm": 1.9384487957952075, + "learning_rate": 5.422039859320047e-05, + "loss": 8.778203964233398, + "step": 185, + "token_acc": 0.016288290889591273 + }, + { + "epoch": 0.1090589270008795, + "grad_norm": 4.471488914801324, + "learning_rate": 5.451348182883939e-05, + "loss": 8.763311386108398, + "step": 186, + "token_acc": 0.016823097406277413 + }, + { + "epoch": 0.10964526531808853, + "grad_norm": 4.93635206450147, + "learning_rate": 5.480656506447831e-05, + "loss": 8.744799613952637, + "step": 187, + "token_acc": 0.015184437426726559 + }, + { + "epoch": 0.11023160363529756, + "grad_norm": 4.522895153957665, + "learning_rate": 5.509964830011723e-05, + "loss": 8.70407772064209, + "step": 188, + "token_acc": 0.016358883997253115 + }, + { + "epoch": 0.11081794195250659, + "grad_norm": 3.7313033196719037, + "learning_rate": 5.539273153575616e-05, + "loss": 8.730113983154297, + "step": 189, + "token_acc": 0.016440375650633842 + }, + { + "epoch": 0.11140428026971562, + "grad_norm": 5.052261907074615, + "learning_rate": 5.568581477139507e-05, + "loss": 8.744132995605469, + "step": 190, + "token_acc": 0.01658494451496016 + }, + { + "epoch": 0.11199061858692466, + "grad_norm": 3.31067778798652, + "learning_rate": 5.5978898007034e-05, + "loss": 8.700679779052734, + "step": 191, + "token_acc": 0.01586239342257869 + }, + { + "epoch": 0.11257695690413369, + "grad_norm": 1.0260062681026476, + "learning_rate": 5.627198124267292e-05, + "loss": 8.780962944030762, + "step": 192, + "token_acc": 0.016017765772317602 + }, + { + "epoch": 0.11316329522134272, + "grad_norm": 7.458556736196419, + "learning_rate": 5.656506447831184e-05, + "loss": 8.744999885559082, + "step": 193, + "token_acc": 0.014912581848799301 + }, + { + "epoch": 0.11374963353855175, + "grad_norm": 10.079846221465687, + "learning_rate": 5.685814771395077e-05, + "loss": 8.803315162658691, + "step": 194, + "token_acc": 0.014887918854515932 + }, + { + "epoch": 0.11433597185576078, + "grad_norm": 3.477254786052486, + "learning_rate": 5.715123094958968e-05, + "loss": 8.727133750915527, + "step": 195, + "token_acc": 0.015802094702375955 + }, + { + "epoch": 0.1149223101729698, + "grad_norm": 5.0618517385679525, + "learning_rate": 5.744431418522861e-05, + "loss": 8.644700050354004, + "step": 196, + "token_acc": 0.016661720327876602 + }, + { + "epoch": 0.11550864849017883, + "grad_norm": 11.88288450557306, + "learning_rate": 5.773739742086752e-05, + "loss": 8.746366500854492, + "step": 197, + "token_acc": 0.016478598027581125 + }, + { + "epoch": 0.11609498680738786, + "grad_norm": 5.731286437963412, + "learning_rate": 5.803048065650645e-05, + "loss": 8.639501571655273, + "step": 198, + "token_acc": 0.01787763083632972 + }, + { + "epoch": 0.11668132512459689, + "grad_norm": 16.386161788718045, + "learning_rate": 5.832356389214537e-05, + "loss": 8.684814453125, + "step": 199, + "token_acc": 0.017497375004861104 + }, + { + "epoch": 0.11726766344180592, + "grad_norm": 16.38806388393325, + "learning_rate": 5.861664712778429e-05, + "loss": 8.64936637878418, + "step": 200, + "token_acc": 0.016449116853381306 + }, + { + "epoch": 0.11785400175901495, + "grad_norm": 5.212003127584709, + "learning_rate": 5.890973036342322e-05, + "loss": 8.724567413330078, + "step": 201, + "token_acc": 0.01695317162588456 + }, + { + "epoch": 0.11844034007622398, + "grad_norm": 9.156993737651222, + "learning_rate": 5.920281359906213e-05, + "loss": 8.577198028564453, + "step": 202, + "token_acc": 0.01718782633842265 + }, + { + "epoch": 0.119026678393433, + "grad_norm": 9.101926016859572, + "learning_rate": 5.949589683470106e-05, + "loss": 8.692757606506348, + "step": 203, + "token_acc": 0.016880875238676457 + }, + { + "epoch": 0.11961301671064203, + "grad_norm": 3.3144327424040925, + "learning_rate": 5.978898007033998e-05, + "loss": 8.529800415039062, + "step": 204, + "token_acc": 0.017974216518853565 + }, + { + "epoch": 0.12019935502785108, + "grad_norm": 19.142046024227387, + "learning_rate": 6.00820633059789e-05, + "loss": 8.707862854003906, + "step": 205, + "token_acc": 0.01619655515954679 + }, + { + "epoch": 0.1207856933450601, + "grad_norm": 17.117850232171193, + "learning_rate": 6.037514654161782e-05, + "loss": 8.636751174926758, + "step": 206, + "token_acc": 0.017083395959032056 + }, + { + "epoch": 0.12137203166226913, + "grad_norm": 2.5671023748067876, + "learning_rate": 6.066822977725674e-05, + "loss": 8.624223709106445, + "step": 207, + "token_acc": 0.016383109467759928 + }, + { + "epoch": 0.12195836997947816, + "grad_norm": 17.705045927553492, + "learning_rate": 6.096131301289566e-05, + "loss": 8.70902156829834, + "step": 208, + "token_acc": 0.016802597196927706 + }, + { + "epoch": 0.12254470829668719, + "grad_norm": 18.93270872158918, + "learning_rate": 6.125439624853459e-05, + "loss": 8.715855598449707, + "step": 209, + "token_acc": 0.016338521390424136 + }, + { + "epoch": 0.12313104661389622, + "grad_norm": 7.289969427086145, + "learning_rate": 6.15474794841735e-05, + "loss": 8.569097518920898, + "step": 210, + "token_acc": 0.01744045387717773 + }, + { + "epoch": 0.12371738493110525, + "grad_norm": 13.724303951548745, + "learning_rate": 6.184056271981243e-05, + "loss": 8.679459571838379, + "step": 211, + "token_acc": 0.017568939332338336 + }, + { + "epoch": 0.12430372324831428, + "grad_norm": 13.474857729860062, + "learning_rate": 6.213364595545136e-05, + "loss": 8.554481506347656, + "step": 212, + "token_acc": 0.017222620894002175 + }, + { + "epoch": 0.1248900615655233, + "grad_norm": 3.8323785919939715, + "learning_rate": 6.242672919109027e-05, + "loss": 8.478784561157227, + "step": 213, + "token_acc": 0.018962274757509785 + }, + { + "epoch": 0.12547639988273235, + "grad_norm": 3.043740645972869, + "learning_rate": 6.27198124267292e-05, + "loss": 8.605936050415039, + "step": 214, + "token_acc": 0.017647326139754706 + }, + { + "epoch": 0.12606273819994138, + "grad_norm": 23.383653643283452, + "learning_rate": 6.301289566236812e-05, + "loss": 8.636078834533691, + "step": 215, + "token_acc": 0.01640215832670891 + }, + { + "epoch": 0.1266490765171504, + "grad_norm": 21.752726012141736, + "learning_rate": 6.330597889800702e-05, + "loss": 8.577640533447266, + "step": 216, + "token_acc": 0.017832840456819418 + }, + { + "epoch": 0.12723541483435943, + "grad_norm": 4.094310847547717, + "learning_rate": 6.359906213364595e-05, + "loss": 8.488836288452148, + "step": 217, + "token_acc": 0.017452771819831712 + }, + { + "epoch": 0.12782175315156846, + "grad_norm": 3.9601952228084842, + "learning_rate": 6.389214536928488e-05, + "loss": 8.578311920166016, + "step": 218, + "token_acc": 0.016829314478811332 + }, + { + "epoch": 0.1284080914687775, + "grad_norm": 11.737225536193094, + "learning_rate": 6.41852286049238e-05, + "loss": 8.462087631225586, + "step": 219, + "token_acc": 0.017304168067840263 + }, + { + "epoch": 0.12899442978598652, + "grad_norm": 10.142299779472314, + "learning_rate": 6.447831184056272e-05, + "loss": 8.476770401000977, + "step": 220, + "token_acc": 0.017343309836713363 + }, + { + "epoch": 0.12958076810319555, + "grad_norm": 11.20711605649483, + "learning_rate": 6.477139507620163e-05, + "loss": 8.518610954284668, + "step": 221, + "token_acc": 0.01845935963564089 + }, + { + "epoch": 0.13016710642040458, + "grad_norm": 8.319735186212462, + "learning_rate": 6.506447831184056e-05, + "loss": 8.442079544067383, + "step": 222, + "token_acc": 0.018927895621336226 + }, + { + "epoch": 0.1307534447376136, + "grad_norm": 13.346414566879151, + "learning_rate": 6.535756154747949e-05, + "loss": 8.54383659362793, + "step": 223, + "token_acc": 0.017246862225140155 + }, + { + "epoch": 0.13133978305482263, + "grad_norm": 8.198788362666994, + "learning_rate": 6.565064478311841e-05, + "loss": 8.526226043701172, + "step": 224, + "token_acc": 0.017177822512137484 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 12.2174533699686, + "learning_rate": 6.594372801875733e-05, + "loss": 8.415727615356445, + "step": 225, + "token_acc": 0.018047791791998554 + }, + { + "epoch": 0.1325124596892407, + "grad_norm": 9.152416241421493, + "learning_rate": 6.623681125439624e-05, + "loss": 8.444255828857422, + "step": 226, + "token_acc": 0.017586869005900726 + }, + { + "epoch": 0.13309879800644972, + "grad_norm": 10.40590711692462, + "learning_rate": 6.652989449003517e-05, + "loss": 8.471542358398438, + "step": 227, + "token_acc": 0.016466763159575147 + }, + { + "epoch": 0.13368513632365875, + "grad_norm": 11.55576830229065, + "learning_rate": 6.68229777256741e-05, + "loss": 8.469377517700195, + "step": 228, + "token_acc": 0.016281522371486484 + }, + { + "epoch": 0.13427147464086778, + "grad_norm": 4.957683145602286, + "learning_rate": 6.711606096131301e-05, + "loss": 8.450397491455078, + "step": 229, + "token_acc": 0.017184085943190094 + }, + { + "epoch": 0.1348578129580768, + "grad_norm": 4.417961684691017, + "learning_rate": 6.740914419695194e-05, + "loss": 8.386655807495117, + "step": 230, + "token_acc": 0.01818479561630031 + }, + { + "epoch": 0.13544415127528583, + "grad_norm": 10.883855260440926, + "learning_rate": 6.770222743259086e-05, + "loss": 8.469613075256348, + "step": 231, + "token_acc": 0.016472601851611732 + }, + { + "epoch": 0.13603048959249486, + "grad_norm": 8.959714325000101, + "learning_rate": 6.799531066822978e-05, + "loss": 8.390556335449219, + "step": 232, + "token_acc": 0.016688635783613407 + }, + { + "epoch": 0.1366168279097039, + "grad_norm": 5.46317976744313, + "learning_rate": 6.82883939038687e-05, + "loss": 8.366771697998047, + "step": 233, + "token_acc": 0.01893216334795815 + }, + { + "epoch": 0.13720316622691292, + "grad_norm": 5.171782104398142, + "learning_rate": 6.858147713950762e-05, + "loss": 8.392815589904785, + "step": 234, + "token_acc": 0.018358918534072472 + }, + { + "epoch": 0.13778950454412195, + "grad_norm": 7.597631537523283, + "learning_rate": 6.887456037514655e-05, + "loss": 8.311612129211426, + "step": 235, + "token_acc": 0.018695744891112694 + }, + { + "epoch": 0.13837584286133098, + "grad_norm": 4.025975897904792, + "learning_rate": 6.916764361078547e-05, + "loss": 8.252484321594238, + "step": 236, + "token_acc": 0.017489714597891913 + }, + { + "epoch": 0.13896218117854, + "grad_norm": 12.63744324347212, + "learning_rate": 6.946072684642439e-05, + "loss": 8.398983001708984, + "step": 237, + "token_acc": 0.01723557211573884 + }, + { + "epoch": 0.13954851949574903, + "grad_norm": 12.001682900315135, + "learning_rate": 6.97538100820633e-05, + "loss": 8.364505767822266, + "step": 238, + "token_acc": 0.019183627514700537 + }, + { + "epoch": 0.14013485781295806, + "grad_norm": 3.0690319680586584, + "learning_rate": 7.004689331770223e-05, + "loss": 8.331502914428711, + "step": 239, + "token_acc": 0.018484823422380284 + }, + { + "epoch": 0.14072119613016712, + "grad_norm": 12.133847265694815, + "learning_rate": 7.033997655334115e-05, + "loss": 8.34527587890625, + "step": 240, + "token_acc": 0.017558171153323233 + }, + { + "epoch": 0.14130753444737615, + "grad_norm": 12.355574260271677, + "learning_rate": 7.063305978898008e-05, + "loss": 8.383485794067383, + "step": 241, + "token_acc": 0.01869788664355479 + }, + { + "epoch": 0.14189387276458518, + "grad_norm": 2.806155418914126, + "learning_rate": 7.0926143024619e-05, + "loss": 8.341289520263672, + "step": 242, + "token_acc": 0.01852517130434323 + }, + { + "epoch": 0.1424802110817942, + "grad_norm": 15.330216005338592, + "learning_rate": 7.121922626025791e-05, + "loss": 8.337993621826172, + "step": 243, + "token_acc": 0.018042335944819347 + }, + { + "epoch": 0.14306654939900323, + "grad_norm": 19.931118361452103, + "learning_rate": 7.151230949589684e-05, + "loss": 8.49669075012207, + "step": 244, + "token_acc": 0.017280365421119104 + }, + { + "epoch": 0.14365288771621226, + "grad_norm": 10.74510387049553, + "learning_rate": 7.180539273153576e-05, + "loss": 8.496963500976562, + "step": 245, + "token_acc": 0.017016856246551425 + }, + { + "epoch": 0.1442392260334213, + "grad_norm": 7.392613548328144, + "learning_rate": 7.209847596717469e-05, + "loss": 8.245037078857422, + "step": 246, + "token_acc": 0.018232677186264666 + }, + { + "epoch": 0.14482556435063032, + "grad_norm": 14.144941718304963, + "learning_rate": 7.239155920281359e-05, + "loss": 8.253929138183594, + "step": 247, + "token_acc": 0.01806724960295295 + }, + { + "epoch": 0.14541190266783935, + "grad_norm": 18.210836124188965, + "learning_rate": 7.268464243845252e-05, + "loss": 8.385154724121094, + "step": 248, + "token_acc": 0.017011478807361737 + }, + { + "epoch": 0.14599824098504838, + "grad_norm": 8.848093798836455, + "learning_rate": 7.297772567409144e-05, + "loss": 8.348281860351562, + "step": 249, + "token_acc": 0.01845634040755992 + }, + { + "epoch": 0.1465845793022574, + "grad_norm": 17.71281655108162, + "learning_rate": 7.327080890973037e-05, + "loss": 8.335196495056152, + "step": 250, + "token_acc": 0.01910326214137506 + }, + { + "epoch": 0.14717091761946643, + "grad_norm": 21.5179146789294, + "learning_rate": 7.35638921453693e-05, + "loss": 8.454710006713867, + "step": 251, + "token_acc": 0.016284138015118264 + }, + { + "epoch": 0.14775725593667546, + "grad_norm": 13.758264497854526, + "learning_rate": 7.38569753810082e-05, + "loss": 8.32026481628418, + "step": 252, + "token_acc": 0.019163544261402883 + }, + { + "epoch": 0.1483435942538845, + "grad_norm": 2.4753180663240912, + "learning_rate": 7.415005861664713e-05, + "loss": 8.14482307434082, + "step": 253, + "token_acc": 0.019855293699420757 + }, + { + "epoch": 0.14892993257109352, + "grad_norm": 10.851512709145455, + "learning_rate": 7.444314185228605e-05, + "loss": 8.293392181396484, + "step": 254, + "token_acc": 0.01991583835159254 + }, + { + "epoch": 0.14951627088830255, + "grad_norm": 12.717966175460814, + "learning_rate": 7.473622508792498e-05, + "loss": 8.399085998535156, + "step": 255, + "token_acc": 0.019179935154781818 + }, + { + "epoch": 0.15010260920551158, + "grad_norm": 7.183095352986484, + "learning_rate": 7.50293083235639e-05, + "loss": 8.30589485168457, + "step": 256, + "token_acc": 0.017908344007417372 + }, + { + "epoch": 0.1506889475227206, + "grad_norm": 5.023779912680333, + "learning_rate": 7.532239155920281e-05, + "loss": 8.213706970214844, + "step": 257, + "token_acc": 0.018848616466637234 + }, + { + "epoch": 0.15127528583992964, + "grad_norm": 3.621674151339373, + "learning_rate": 7.561547479484174e-05, + "loss": 8.140850067138672, + "step": 258, + "token_acc": 0.020262012018062013 + }, + { + "epoch": 0.15186162415713866, + "grad_norm": 3.6894373755328913, + "learning_rate": 7.590855803048066e-05, + "loss": 8.16871166229248, + "step": 259, + "token_acc": 0.020408534006005258 + }, + { + "epoch": 0.1524479624743477, + "grad_norm": 2.8075728014756134, + "learning_rate": 7.620164126611958e-05, + "loss": 8.118350982666016, + "step": 260, + "token_acc": 0.02136015078361003 + }, + { + "epoch": 0.15303430079155672, + "grad_norm": 7.877261405761018, + "learning_rate": 7.64947245017585e-05, + "loss": 8.198387145996094, + "step": 261, + "token_acc": 0.02049476271997793 + }, + { + "epoch": 0.15362063910876575, + "grad_norm": 3.3493707278793172, + "learning_rate": 7.678780773739742e-05, + "loss": 8.124204635620117, + "step": 262, + "token_acc": 0.021180212958395 + }, + { + "epoch": 0.15420697742597478, + "grad_norm": 11.11112658219854, + "learning_rate": 7.708089097303634e-05, + "loss": 8.171187400817871, + "step": 263, + "token_acc": 0.020974568045039797 + }, + { + "epoch": 0.1547933157431838, + "grad_norm": 10.659399873600039, + "learning_rate": 7.737397420867527e-05, + "loss": 8.115743637084961, + "step": 264, + "token_acc": 0.02047971949956172 + }, + { + "epoch": 0.15537965406039284, + "grad_norm": 6.822965896088382, + "learning_rate": 7.766705744431418e-05, + "loss": 8.126745223999023, + "step": 265, + "token_acc": 0.021024674916348633 + }, + { + "epoch": 0.15596599237760186, + "grad_norm": 8.330780930249595, + "learning_rate": 7.796014067995311e-05, + "loss": 8.091007232666016, + "step": 266, + "token_acc": 0.02025383767213807 + }, + { + "epoch": 0.1565523306948109, + "grad_norm": 8.047149462638302, + "learning_rate": 7.825322391559203e-05, + "loss": 8.087890625, + "step": 267, + "token_acc": 0.021566429280016775 + }, + { + "epoch": 0.15713866901201995, + "grad_norm": 7.575106694004534, + "learning_rate": 7.854630715123095e-05, + "loss": 8.138514518737793, + "step": 268, + "token_acc": 0.020341918429767968 + }, + { + "epoch": 0.15772500732922898, + "grad_norm": 7.071751041121472, + "learning_rate": 7.883939038686987e-05, + "loss": 7.99616813659668, + "step": 269, + "token_acc": 0.021258970001438397 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 7.791014013059152, + "learning_rate": 7.91324736225088e-05, + "loss": 8.087705612182617, + "step": 270, + "token_acc": 0.020033058210134052 + }, + { + "epoch": 0.15889768396364704, + "grad_norm": 4.228416297373905, + "learning_rate": 7.942555685814772e-05, + "loss": 7.922218322753906, + "step": 271, + "token_acc": 0.023088780633423147 + }, + { + "epoch": 0.15948402228085606, + "grad_norm": 2.317191701919628, + "learning_rate": 7.971864009378663e-05, + "loss": 7.988491058349609, + "step": 272, + "token_acc": 0.022586674800372273 + }, + { + "epoch": 0.1600703605980651, + "grad_norm": 3.1131941773042953, + "learning_rate": 8.001172332942556e-05, + "loss": 7.979279041290283, + "step": 273, + "token_acc": 0.022202807978539518 + }, + { + "epoch": 0.16065669891527412, + "grad_norm": 5.1091131725230134, + "learning_rate": 8.030480656506448e-05, + "loss": 8.028542518615723, + "step": 274, + "token_acc": 0.022238672700770037 + }, + { + "epoch": 0.16124303723248315, + "grad_norm": 4.085072181013333, + "learning_rate": 8.05978898007034e-05, + "loss": 7.9908447265625, + "step": 275, + "token_acc": 0.02197045568352529 + }, + { + "epoch": 0.16182937554969218, + "grad_norm": 2.288307429583844, + "learning_rate": 8.089097303634233e-05, + "loss": 7.983156681060791, + "step": 276, + "token_acc": 0.02247299609471938 + }, + { + "epoch": 0.1624157138669012, + "grad_norm": 7.895985902368486, + "learning_rate": 8.118405627198124e-05, + "loss": 7.926133155822754, + "step": 277, + "token_acc": 0.02331120092378753 + }, + { + "epoch": 0.16300205218411024, + "grad_norm": 3.820383773726855, + "learning_rate": 8.147713950762016e-05, + "loss": 7.868941307067871, + "step": 278, + "token_acc": 0.024034693695067878 + }, + { + "epoch": 0.16358839050131926, + "grad_norm": 6.9392128157306265, + "learning_rate": 8.177022274325908e-05, + "loss": 7.8979411125183105, + "step": 279, + "token_acc": 0.02393419479226478 + }, + { + "epoch": 0.1641747288185283, + "grad_norm": 3.815135016073319, + "learning_rate": 8.206330597889801e-05, + "loss": 7.842163562774658, + "step": 280, + "token_acc": 0.024602287452519262 + }, + { + "epoch": 0.16476106713573732, + "grad_norm": 3.6512547425180597, + "learning_rate": 8.235638921453694e-05, + "loss": 7.815593719482422, + "step": 281, + "token_acc": 0.025859780940211397 + }, + { + "epoch": 0.16534740545294635, + "grad_norm": 7.268224507531726, + "learning_rate": 8.264947245017585e-05, + "loss": 7.821690559387207, + "step": 282, + "token_acc": 0.025709757477233232 + }, + { + "epoch": 0.16593374377015538, + "grad_norm": 3.272034002598221, + "learning_rate": 8.294255568581477e-05, + "loss": 7.8845014572143555, + "step": 283, + "token_acc": 0.02380989510235058 + }, + { + "epoch": 0.1665200820873644, + "grad_norm": 4.105038652483503, + "learning_rate": 8.323563892145369e-05, + "loss": 7.85194730758667, + "step": 284, + "token_acc": 0.02499056039968893 + }, + { + "epoch": 0.16710642040457344, + "grad_norm": 7.653802822717913, + "learning_rate": 8.352872215709262e-05, + "loss": 7.733232021331787, + "step": 285, + "token_acc": 0.026103846756814927 + }, + { + "epoch": 0.16769275872178246, + "grad_norm": 1.8435773358723115, + "learning_rate": 8.382180539273155e-05, + "loss": 7.811696529388428, + "step": 286, + "token_acc": 0.026082955468500982 + }, + { + "epoch": 0.1682790970389915, + "grad_norm": 14.044582027839947, + "learning_rate": 8.411488862837045e-05, + "loss": 7.8016557693481445, + "step": 287, + "token_acc": 0.026037299444329053 + }, + { + "epoch": 0.16886543535620052, + "grad_norm": 11.461243976306463, + "learning_rate": 8.440797186400937e-05, + "loss": 7.900874137878418, + "step": 288, + "token_acc": 0.022702794701528854 + }, + { + "epoch": 0.16945177367340955, + "grad_norm": 5.488517541389156, + "learning_rate": 8.47010550996483e-05, + "loss": 7.884481430053711, + "step": 289, + "token_acc": 0.023360143394362504 + }, + { + "epoch": 0.17003811199061858, + "grad_norm": 8.484756413355848, + "learning_rate": 8.499413833528723e-05, + "loss": 7.763012886047363, + "step": 290, + "token_acc": 0.02676448188501531 + }, + { + "epoch": 0.1706244503078276, + "grad_norm": 4.421983111333443, + "learning_rate": 8.528722157092614e-05, + "loss": 7.755222797393799, + "step": 291, + "token_acc": 0.025426050882831206 + }, + { + "epoch": 0.17121078862503664, + "grad_norm": 12.721238963219005, + "learning_rate": 8.558030480656506e-05, + "loss": 7.662722110748291, + "step": 292, + "token_acc": 0.026705650005930004 + }, + { + "epoch": 0.17179712694224566, + "grad_norm": 4.460130019396442, + "learning_rate": 8.587338804220398e-05, + "loss": 7.782173156738281, + "step": 293, + "token_acc": 0.02529513539407869 + }, + { + "epoch": 0.1723834652594547, + "grad_norm": 11.492135230545891, + "learning_rate": 8.616647127784291e-05, + "loss": 7.726858139038086, + "step": 294, + "token_acc": 0.025093573450926356 + }, + { + "epoch": 0.17296980357666372, + "grad_norm": 6.705542705901324, + "learning_rate": 8.645955451348184e-05, + "loss": 7.73845100402832, + "step": 295, + "token_acc": 0.026246969769304625 + }, + { + "epoch": 0.17355614189387278, + "grad_norm": 15.321151135680177, + "learning_rate": 8.675263774912075e-05, + "loss": 7.725770950317383, + "step": 296, + "token_acc": 0.026485684548575326 + }, + { + "epoch": 0.1741424802110818, + "grad_norm": 10.711760137487351, + "learning_rate": 8.704572098475968e-05, + "loss": 7.694210052490234, + "step": 297, + "token_acc": 0.026845782417523133 + }, + { + "epoch": 0.17472881852829084, + "grad_norm": 15.335026968597955, + "learning_rate": 8.733880422039859e-05, + "loss": 7.765105724334717, + "step": 298, + "token_acc": 0.026298105737273206 + }, + { + "epoch": 0.17531515684549986, + "grad_norm": 12.9802950210738, + "learning_rate": 8.763188745603752e-05, + "loss": 7.57515811920166, + "step": 299, + "token_acc": 0.029251924574345965 + }, + { + "epoch": 0.1759014951627089, + "grad_norm": 9.798442465078873, + "learning_rate": 8.792497069167643e-05, + "loss": 7.5946364402771, + "step": 300, + "token_acc": 0.027993413314514232 + }, + { + "epoch": 0.17648783347991792, + "grad_norm": 11.152273327422026, + "learning_rate": 8.821805392731536e-05, + "loss": 7.743631362915039, + "step": 301, + "token_acc": 0.027415174088690656 + }, + { + "epoch": 0.17707417179712695, + "grad_norm": 13.056494508565278, + "learning_rate": 8.851113716295429e-05, + "loss": 7.624645233154297, + "step": 302, + "token_acc": 0.029570342053916298 + }, + { + "epoch": 0.17766051011433598, + "grad_norm": 10.280070828201312, + "learning_rate": 8.88042203985932e-05, + "loss": 7.5809431076049805, + "step": 303, + "token_acc": 0.029406342668429397 + }, + { + "epoch": 0.178246848431545, + "grad_norm": 14.994484623702146, + "learning_rate": 8.909730363423213e-05, + "loss": 7.604696273803711, + "step": 304, + "token_acc": 0.028297901565079473 + }, + { + "epoch": 0.17883318674875404, + "grad_norm": 16.895591916059605, + "learning_rate": 8.939038686987104e-05, + "loss": 7.581826686859131, + "step": 305, + "token_acc": 0.028195779819885084 + }, + { + "epoch": 0.17941952506596306, + "grad_norm": 2.238952058149123, + "learning_rate": 8.968347010550997e-05, + "loss": 7.470364093780518, + "step": 306, + "token_acc": 0.03223189912907838 + }, + { + "epoch": 0.1800058633831721, + "grad_norm": 4.763228035828373, + "learning_rate": 8.99765533411489e-05, + "loss": 7.580644130706787, + "step": 307, + "token_acc": 0.030268167972149696 + }, + { + "epoch": 0.18059220170038112, + "grad_norm": 3.757399084390193, + "learning_rate": 9.026963657678781e-05, + "loss": 7.576322555541992, + "step": 308, + "token_acc": 0.029256197097563497 + }, + { + "epoch": 0.18117854001759015, + "grad_norm": 9.51844999336912, + "learning_rate": 9.056271981242672e-05, + "loss": 7.5692853927612305, + "step": 309, + "token_acc": 0.030617597597288722 + }, + { + "epoch": 0.18176487833479918, + "grad_norm": 3.427442184887727, + "learning_rate": 9.085580304806565e-05, + "loss": 7.4781341552734375, + "step": 310, + "token_acc": 0.03173392227244986 + }, + { + "epoch": 0.1823512166520082, + "grad_norm": 8.371967792200746, + "learning_rate": 9.114888628370458e-05, + "loss": 7.501359462738037, + "step": 311, + "token_acc": 0.03140233413607855 + }, + { + "epoch": 0.18293755496921724, + "grad_norm": 6.312168582950612, + "learning_rate": 9.14419695193435e-05, + "loss": 7.405187129974365, + "step": 312, + "token_acc": 0.03208052461153087 + }, + { + "epoch": 0.18352389328642627, + "grad_norm": 3.7641563791734614, + "learning_rate": 9.173505275498242e-05, + "loss": 7.448599815368652, + "step": 313, + "token_acc": 0.03280546749262446 + }, + { + "epoch": 0.1841102316036353, + "grad_norm": 9.448631141997817, + "learning_rate": 9.202813599062133e-05, + "loss": 7.440299987792969, + "step": 314, + "token_acc": 0.03276126768285996 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 3.90500189305802, + "learning_rate": 9.232121922626026e-05, + "loss": 7.399839878082275, + "step": 315, + "token_acc": 0.03215483883860879 + }, + { + "epoch": 0.18528290823805335, + "grad_norm": 7.098742387259987, + "learning_rate": 9.261430246189919e-05, + "loss": 7.342225074768066, + "step": 316, + "token_acc": 0.03405625994558801 + }, + { + "epoch": 0.18586924655526238, + "grad_norm": 6.579085040673552, + "learning_rate": 9.290738569753811e-05, + "loss": 7.373745918273926, + "step": 317, + "token_acc": 0.0345059090023882 + }, + { + "epoch": 0.1864555848724714, + "grad_norm": 2.9704252232035913, + "learning_rate": 9.320046893317701e-05, + "loss": 7.324457168579102, + "step": 318, + "token_acc": 0.03474796385485953 + }, + { + "epoch": 0.18704192318968044, + "grad_norm": 6.623089072181961, + "learning_rate": 9.349355216881594e-05, + "loss": 7.3864240646362305, + "step": 319, + "token_acc": 0.03265260121973398 + }, + { + "epoch": 0.18762826150688947, + "grad_norm": 5.704289522288452, + "learning_rate": 9.378663540445487e-05, + "loss": 7.358420372009277, + "step": 320, + "token_acc": 0.03454702320598669 + }, + { + "epoch": 0.1882145998240985, + "grad_norm": 3.4302450755989344, + "learning_rate": 9.40797186400938e-05, + "loss": 7.35070276260376, + "step": 321, + "token_acc": 0.03472312289401742 + }, + { + "epoch": 0.18880093814130752, + "grad_norm": 8.555791589944487, + "learning_rate": 9.437280187573272e-05, + "loss": 7.284658908843994, + "step": 322, + "token_acc": 0.03561939072511846 + }, + { + "epoch": 0.18938727645851655, + "grad_norm": 3.9378425757811066, + "learning_rate": 9.466588511137162e-05, + "loss": 7.287924289703369, + "step": 323, + "token_acc": 0.03677401988726328 + }, + { + "epoch": 0.18997361477572558, + "grad_norm": 6.8059336192624915, + "learning_rate": 9.495896834701055e-05, + "loss": 7.250253677368164, + "step": 324, + "token_acc": 0.03618896842636764 + }, + { + "epoch": 0.19055995309293464, + "grad_norm": 3.4663918608481024, + "learning_rate": 9.525205158264948e-05, + "loss": 7.141970157623291, + "step": 325, + "token_acc": 0.03980846498516945 + }, + { + "epoch": 0.19114629141014366, + "grad_norm": 6.494804356916958, + "learning_rate": 9.55451348182884e-05, + "loss": 7.144181251525879, + "step": 326, + "token_acc": 0.03971016146516803 + }, + { + "epoch": 0.1917326297273527, + "grad_norm": 7.4809562091856545, + "learning_rate": 9.583821805392732e-05, + "loss": 7.219866752624512, + "step": 327, + "token_acc": 0.03793140071339942 + }, + { + "epoch": 0.19231896804456172, + "grad_norm": 4.6179864476947525, + "learning_rate": 9.613130128956623e-05, + "loss": 7.127732753753662, + "step": 328, + "token_acc": 0.04009841262331149 + }, + { + "epoch": 0.19290530636177075, + "grad_norm": 3.4159232581083847, + "learning_rate": 9.642438452520516e-05, + "loss": 7.109224319458008, + "step": 329, + "token_acc": 0.039040996668115314 + }, + { + "epoch": 0.19349164467897978, + "grad_norm": 11.101823923649848, + "learning_rate": 9.671746776084409e-05, + "loss": 7.121427536010742, + "step": 330, + "token_acc": 0.04288325556784838 + }, + { + "epoch": 0.1940779829961888, + "grad_norm": 4.4398761624838174, + "learning_rate": 9.7010550996483e-05, + "loss": 7.084875106811523, + "step": 331, + "token_acc": 0.04085425697774236 + }, + { + "epoch": 0.19466432131339784, + "grad_norm": 10.534448841630402, + "learning_rate": 9.730363423212193e-05, + "loss": 7.209066390991211, + "step": 332, + "token_acc": 0.03748962545692137 + }, + { + "epoch": 0.19525065963060687, + "grad_norm": 7.457683337075591, + "learning_rate": 9.759671746776084e-05, + "loss": 7.057285308837891, + "step": 333, + "token_acc": 0.042235192153104534 + }, + { + "epoch": 0.1958369979478159, + "grad_norm": 9.769992005909325, + "learning_rate": 9.788980070339977e-05, + "loss": 7.133115768432617, + "step": 334, + "token_acc": 0.040370114596897703 + }, + { + "epoch": 0.19642333626502492, + "grad_norm": 8.514258762313188, + "learning_rate": 9.81828839390387e-05, + "loss": 7.023362159729004, + "step": 335, + "token_acc": 0.04553765713707722 + }, + { + "epoch": 0.19700967458223395, + "grad_norm": 10.687729322728085, + "learning_rate": 9.847596717467761e-05, + "loss": 7.071666240692139, + "step": 336, + "token_acc": 0.04284351406487803 + }, + { + "epoch": 0.19759601289944298, + "grad_norm": 8.975853867533473, + "learning_rate": 9.876905041031654e-05, + "loss": 6.967942237854004, + "step": 337, + "token_acc": 0.04838330530563111 + }, + { + "epoch": 0.198182351216652, + "grad_norm": 8.085078352833444, + "learning_rate": 9.906213364595545e-05, + "loss": 6.97743558883667, + "step": 338, + "token_acc": 0.046405126285870346 + }, + { + "epoch": 0.19876868953386104, + "grad_norm": 6.473983769929311, + "learning_rate": 9.935521688159438e-05, + "loss": 7.011883735656738, + "step": 339, + "token_acc": 0.04605891806198015 + }, + { + "epoch": 0.19935502785107007, + "grad_norm": 9.353614949786364, + "learning_rate": 9.964830011723329e-05, + "loss": 6.90988302230835, + "step": 340, + "token_acc": 0.049334454030279966 + }, + { + "epoch": 0.1999413661682791, + "grad_norm": 8.191913891200802, + "learning_rate": 9.994138335287222e-05, + "loss": 7.003641128540039, + "step": 341, + "token_acc": 0.04576521646864575 + }, + { + "epoch": 0.20052770448548812, + "grad_norm": 6.53629218369338, + "learning_rate": 0.00010023446658851114, + "loss": 6.881128787994385, + "step": 342, + "token_acc": 0.04841722337155768 + }, + { + "epoch": 0.20111404280269715, + "grad_norm": 5.102785860731727, + "learning_rate": 0.00010052754982415006, + "loss": 6.955008029937744, + "step": 343, + "token_acc": 0.049230626908014434 + }, + { + "epoch": 0.20170038111990618, + "grad_norm": 8.215374373674763, + "learning_rate": 0.00010082063305978898, + "loss": 6.877338409423828, + "step": 344, + "token_acc": 0.051758990749952376 + }, + { + "epoch": 0.2022867194371152, + "grad_norm": 6.619862417211111, + "learning_rate": 0.0001011137162954279, + "loss": 6.9279584884643555, + "step": 345, + "token_acc": 0.04782156210250414 + }, + { + "epoch": 0.20287305775432424, + "grad_norm": 6.871229992803177, + "learning_rate": 0.00010140679953106683, + "loss": 6.894254684448242, + "step": 346, + "token_acc": 0.05075188976926333 + }, + { + "epoch": 0.20345939607153327, + "grad_norm": 7.020359484608488, + "learning_rate": 0.00010169988276670575, + "loss": 6.808967113494873, + "step": 347, + "token_acc": 0.05406310677071067 + }, + { + "epoch": 0.2040457343887423, + "grad_norm": 5.743485111044436, + "learning_rate": 0.00010199296600234467, + "loss": 6.807111740112305, + "step": 348, + "token_acc": 0.05539717470796795 + }, + { + "epoch": 0.20463207270595132, + "grad_norm": 6.647518761433814, + "learning_rate": 0.00010228604923798358, + "loss": 6.821077346801758, + "step": 349, + "token_acc": 0.05351821521958795 + }, + { + "epoch": 0.20521841102316035, + "grad_norm": 12.054921021367194, + "learning_rate": 0.00010257913247362251, + "loss": 6.858293533325195, + "step": 350, + "token_acc": 0.05407264933218066 + }, + { + "epoch": 0.20580474934036938, + "grad_norm": 6.711263415199636, + "learning_rate": 0.00010287221570926143, + "loss": 6.742191314697266, + "step": 351, + "token_acc": 0.05734490517518483 + }, + { + "epoch": 0.2063910876575784, + "grad_norm": 8.223553120178638, + "learning_rate": 0.00010316529894490036, + "loss": 6.734842300415039, + "step": 352, + "token_acc": 0.0565855038670826 + }, + { + "epoch": 0.20697742597478747, + "grad_norm": 8.348030935749714, + "learning_rate": 0.00010345838218053928, + "loss": 6.833942413330078, + "step": 353, + "token_acc": 0.05438101608165877 + }, + { + "epoch": 0.2075637642919965, + "grad_norm": 6.625395594162651, + "learning_rate": 0.00010375146541617819, + "loss": 6.7424821853637695, + "step": 354, + "token_acc": 0.05685099120952248 + }, + { + "epoch": 0.20815010260920552, + "grad_norm": 10.177371253947637, + "learning_rate": 0.00010404454865181712, + "loss": 6.679440975189209, + "step": 355, + "token_acc": 0.060094043887147335 + }, + { + "epoch": 0.20873644092641455, + "grad_norm": 8.4730174808182, + "learning_rate": 0.00010433763188745604, + "loss": 6.7848052978515625, + "step": 356, + "token_acc": 0.056823985378563 + }, + { + "epoch": 0.20932277924362358, + "grad_norm": 7.646075916543815, + "learning_rate": 0.00010463071512309497, + "loss": 6.736171245574951, + "step": 357, + "token_acc": 0.05594884082743405 + }, + { + "epoch": 0.2099091175608326, + "grad_norm": 10.782082900980125, + "learning_rate": 0.00010492379835873387, + "loss": 6.678497791290283, + "step": 358, + "token_acc": 0.06489317333530427 + }, + { + "epoch": 0.21049545587804164, + "grad_norm": 5.034854056114249, + "learning_rate": 0.0001052168815943728, + "loss": 6.620523929595947, + "step": 359, + "token_acc": 0.06533077781094371 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 11.317390689825455, + "learning_rate": 0.00010550996483001172, + "loss": 6.652674674987793, + "step": 360, + "token_acc": 0.059921380688700926 + }, + { + "epoch": 0.2116681325124597, + "grad_norm": 9.08330521548402, + "learning_rate": 0.00010580304806565065, + "loss": 6.76340389251709, + "step": 361, + "token_acc": 0.05683870749766746 + }, + { + "epoch": 0.21225447082966872, + "grad_norm": 4.192956432983848, + "learning_rate": 0.00010609613130128958, + "loss": 6.675985336303711, + "step": 362, + "token_acc": 0.061936511821048 + }, + { + "epoch": 0.21284080914687775, + "grad_norm": 4.929739288408198, + "learning_rate": 0.00010638921453692848, + "loss": 6.55964469909668, + "step": 363, + "token_acc": 0.06749772520473157 + }, + { + "epoch": 0.21342714746408678, + "grad_norm": 5.661717824511602, + "learning_rate": 0.0001066822977725674, + "loss": 6.6542816162109375, + "step": 364, + "token_acc": 0.06293617594069043 + }, + { + "epoch": 0.2140134857812958, + "grad_norm": 9.191242956620586, + "learning_rate": 0.00010697538100820633, + "loss": 6.536946773529053, + "step": 365, + "token_acc": 0.07048085066340454 + }, + { + "epoch": 0.21459982409850484, + "grad_norm": 3.8615696252789444, + "learning_rate": 0.00010726846424384526, + "loss": 6.442357063293457, + "step": 366, + "token_acc": 0.07190959032798268 + }, + { + "epoch": 0.21518616241571387, + "grad_norm": 7.489777371921469, + "learning_rate": 0.00010756154747948417, + "loss": 6.4940104484558105, + "step": 367, + "token_acc": 0.071526452620929 + }, + { + "epoch": 0.2157725007329229, + "grad_norm": 5.125394613804369, + "learning_rate": 0.0001078546307151231, + "loss": 6.479306221008301, + "step": 368, + "token_acc": 0.06973716851955572 + }, + { + "epoch": 0.21635883905013192, + "grad_norm": 7.661570494600948, + "learning_rate": 0.00010814771395076202, + "loss": 6.42294454574585, + "step": 369, + "token_acc": 0.07784033575807714 + }, + { + "epoch": 0.21694517736734095, + "grad_norm": 3.941237775704587, + "learning_rate": 0.00010844079718640094, + "loss": 6.3986897468566895, + "step": 370, + "token_acc": 0.07585258816686935 + }, + { + "epoch": 0.21753151568454998, + "grad_norm": 10.062581230596951, + "learning_rate": 0.00010873388042203986, + "loss": 6.433681964874268, + "step": 371, + "token_acc": 0.0742224188653176 + }, + { + "epoch": 0.218117854001759, + "grad_norm": 6.612360630538697, + "learning_rate": 0.00010902696365767878, + "loss": 6.455165386199951, + "step": 372, + "token_acc": 0.07427043767122381 + }, + { + "epoch": 0.21870419231896804, + "grad_norm": 7.044091251652804, + "learning_rate": 0.00010932004689331771, + "loss": 6.425081729888916, + "step": 373, + "token_acc": 0.0744649742234545 + }, + { + "epoch": 0.21929053063617707, + "grad_norm": 5.932340105583805, + "learning_rate": 0.00010961313012895662, + "loss": 6.407991409301758, + "step": 374, + "token_acc": 0.07746613139274158 + }, + { + "epoch": 0.2198768689533861, + "grad_norm": 5.892266088269297, + "learning_rate": 0.00010990621336459555, + "loss": 6.425785064697266, + "step": 375, + "token_acc": 0.07493806663140087 + }, + { + "epoch": 0.22046320727059512, + "grad_norm": 4.389198671491138, + "learning_rate": 0.00011019929660023446, + "loss": 6.331266403198242, + "step": 376, + "token_acc": 0.08358848984821347 + }, + { + "epoch": 0.22104954558780415, + "grad_norm": 5.671827948944255, + "learning_rate": 0.00011049237983587339, + "loss": 6.38395881652832, + "step": 377, + "token_acc": 0.0792953962153578 + }, + { + "epoch": 0.22163588390501318, + "grad_norm": 8.012472306467243, + "learning_rate": 0.00011078546307151232, + "loss": 6.316666126251221, + "step": 378, + "token_acc": 0.08367280922988721 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 4.066955994948399, + "learning_rate": 0.00011107854630715123, + "loss": 6.273752689361572, + "step": 379, + "token_acc": 0.08612666064057206 + }, + { + "epoch": 0.22280856053943124, + "grad_norm": 9.76137199232559, + "learning_rate": 0.00011137162954279015, + "loss": 6.331414222717285, + "step": 380, + "token_acc": 0.08063501627975812 + }, + { + "epoch": 0.2233948988566403, + "grad_norm": 5.712229411662863, + "learning_rate": 0.00011166471277842907, + "loss": 6.2297797203063965, + "step": 381, + "token_acc": 0.08709685939935022 + }, + { + "epoch": 0.22398123717384932, + "grad_norm": 7.143148053425372, + "learning_rate": 0.000111957796014068, + "loss": 6.208853244781494, + "step": 382, + "token_acc": 0.08924345350364703 + }, + { + "epoch": 0.22456757549105835, + "grad_norm": 4.872547665438867, + "learning_rate": 0.00011225087924970693, + "loss": 6.216014862060547, + "step": 383, + "token_acc": 0.08970910293501103 + }, + { + "epoch": 0.22515391380826738, + "grad_norm": 3.830088671315554, + "learning_rate": 0.00011254396248534584, + "loss": 6.143113136291504, + "step": 384, + "token_acc": 0.09522872445099217 + }, + { + "epoch": 0.2257402521254764, + "grad_norm": 4.079606120494896, + "learning_rate": 0.00011283704572098476, + "loss": 6.1694207191467285, + "step": 385, + "token_acc": 0.09036978551213279 + }, + { + "epoch": 0.22632659044268544, + "grad_norm": 6.619930436486503, + "learning_rate": 0.00011313012895662368, + "loss": 6.212316513061523, + "step": 386, + "token_acc": 0.09043562015199764 + }, + { + "epoch": 0.22691292875989447, + "grad_norm": 3.821554462672562, + "learning_rate": 0.00011342321219226261, + "loss": 6.152814865112305, + "step": 387, + "token_acc": 0.08981522863043875 + }, + { + "epoch": 0.2274992670771035, + "grad_norm": 6.269206135361615, + "learning_rate": 0.00011371629542790154, + "loss": 6.163750648498535, + "step": 388, + "token_acc": 0.09439272740663972 + }, + { + "epoch": 0.22808560539431252, + "grad_norm": 4.347102457246547, + "learning_rate": 0.00011400937866354044, + "loss": 6.199976921081543, + "step": 389, + "token_acc": 0.09205683538266946 + }, + { + "epoch": 0.22867194371152155, + "grad_norm": 5.599273769887971, + "learning_rate": 0.00011430246189917936, + "loss": 6.06398868560791, + "step": 390, + "token_acc": 0.09922983679060975 + }, + { + "epoch": 0.22925828202873058, + "grad_norm": 3.6439645079874134, + "learning_rate": 0.00011459554513481829, + "loss": 6.026261329650879, + "step": 391, + "token_acc": 0.10002185377978236 + }, + { + "epoch": 0.2298446203459396, + "grad_norm": 4.434991454044366, + "learning_rate": 0.00011488862837045722, + "loss": 6.05257511138916, + "step": 392, + "token_acc": 0.0997451107489922 + }, + { + "epoch": 0.23043095866314864, + "grad_norm": 5.0655362788626075, + "learning_rate": 0.00011518171160609615, + "loss": 6.026375770568848, + "step": 393, + "token_acc": 0.10212818069036754 + }, + { + "epoch": 0.23101729698035767, + "grad_norm": 2.665448202349342, + "learning_rate": 0.00011547479484173505, + "loss": 5.91487455368042, + "step": 394, + "token_acc": 0.10893922910333187 + }, + { + "epoch": 0.2316036352975667, + "grad_norm": 9.06084480222526, + "learning_rate": 0.00011576787807737397, + "loss": 6.025001525878906, + "step": 395, + "token_acc": 0.09560397704138593 + }, + { + "epoch": 0.23218997361477572, + "grad_norm": 4.673902059041878, + "learning_rate": 0.0001160609613130129, + "loss": 5.972105979919434, + "step": 396, + "token_acc": 0.10496248382923674 + }, + { + "epoch": 0.23277631193198475, + "grad_norm": 5.941030418706131, + "learning_rate": 0.00011635404454865183, + "loss": 6.013332843780518, + "step": 397, + "token_acc": 0.10335180193018191 + }, + { + "epoch": 0.23336265024919378, + "grad_norm": 6.045884946008961, + "learning_rate": 0.00011664712778429074, + "loss": 6.084906578063965, + "step": 398, + "token_acc": 0.0951407001049074 + }, + { + "epoch": 0.2339489885664028, + "grad_norm": 4.883329485507978, + "learning_rate": 0.00011694021101992965, + "loss": 5.929563522338867, + "step": 399, + "token_acc": 0.10884320825369688 + }, + { + "epoch": 0.23453532688361184, + "grad_norm": 3.717426479172605, + "learning_rate": 0.00011723329425556858, + "loss": 5.903749465942383, + "step": 400, + "token_acc": 0.10962016385501569 + }, + { + "epoch": 0.23512166520082087, + "grad_norm": 5.287308426237174, + "learning_rate": 0.00011752637749120751, + "loss": 5.854625701904297, + "step": 401, + "token_acc": 0.11044980023881325 + }, + { + "epoch": 0.2357080035180299, + "grad_norm": 4.8190976688777925, + "learning_rate": 0.00011781946072684644, + "loss": 5.83117151260376, + "step": 402, + "token_acc": 0.11495696625648988 + }, + { + "epoch": 0.23629434183523892, + "grad_norm": 7.065135868946098, + "learning_rate": 0.00011811254396248535, + "loss": 5.819321632385254, + "step": 403, + "token_acc": 0.11133195636605188 + }, + { + "epoch": 0.23688068015244795, + "grad_norm": 3.253187254276713, + "learning_rate": 0.00011840562719812426, + "loss": 5.825146198272705, + "step": 404, + "token_acc": 0.11440163551033493 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 7.519045532066019, + "learning_rate": 0.00011869871043376319, + "loss": 5.731228351593018, + "step": 405, + "token_acc": 0.12204654047717918 + }, + { + "epoch": 0.238053356786866, + "grad_norm": 4.9885069960570725, + "learning_rate": 0.00011899179366940212, + "loss": 5.820338249206543, + "step": 406, + "token_acc": 0.11491831572603985 + }, + { + "epoch": 0.23863969510407504, + "grad_norm": 4.72737283734716, + "learning_rate": 0.00011928487690504103, + "loss": 5.732571601867676, + "step": 407, + "token_acc": 0.11993047359719426 + }, + { + "epoch": 0.23922603342128407, + "grad_norm": 5.431511393200514, + "learning_rate": 0.00011957796014067996, + "loss": 5.79659366607666, + "step": 408, + "token_acc": 0.11445042926524408 + }, + { + "epoch": 0.23981237173849312, + "grad_norm": 3.4479967425078395, + "learning_rate": 0.00011987104337631887, + "loss": 5.674437522888184, + "step": 409, + "token_acc": 0.12330751174325962 + }, + { + "epoch": 0.24039871005570215, + "grad_norm": 3.5530065378300013, + "learning_rate": 0.0001201641266119578, + "loss": 5.674881458282471, + "step": 410, + "token_acc": 0.12472383795608989 + }, + { + "epoch": 0.24098504837291118, + "grad_norm": 5.900044462285062, + "learning_rate": 0.00012045720984759671, + "loss": 5.541609764099121, + "step": 411, + "token_acc": 0.13444416534092496 + }, + { + "epoch": 0.2415713866901202, + "grad_norm": 3.1343076342691605, + "learning_rate": 0.00012075029308323564, + "loss": 5.634923934936523, + "step": 412, + "token_acc": 0.12509247986210442 + }, + { + "epoch": 0.24215772500732924, + "grad_norm": 5.272992968186741, + "learning_rate": 0.00012104337631887457, + "loss": 5.641106128692627, + "step": 413, + "token_acc": 0.124788313913987 + }, + { + "epoch": 0.24274406332453827, + "grad_norm": 5.1129186705206076, + "learning_rate": 0.00012133645955451348, + "loss": 5.602048873901367, + "step": 414, + "token_acc": 0.12926272099443858 + }, + { + "epoch": 0.2433304016417473, + "grad_norm": 5.17456541351753, + "learning_rate": 0.00012162954279015241, + "loss": 5.635060787200928, + "step": 415, + "token_acc": 0.12416406893501389 + }, + { + "epoch": 0.24391673995895632, + "grad_norm": 6.50594811222227, + "learning_rate": 0.00012192262602579132, + "loss": 5.699517726898193, + "step": 416, + "token_acc": 0.11834356482794695 + }, + { + "epoch": 0.24450307827616535, + "grad_norm": 3.001109057680427, + "learning_rate": 0.00012221570926143025, + "loss": 5.493686199188232, + "step": 417, + "token_acc": 0.13267444406217746 + }, + { + "epoch": 0.24508941659337438, + "grad_norm": 5.560878125283581, + "learning_rate": 0.00012250879249706918, + "loss": 5.516180992126465, + "step": 418, + "token_acc": 0.13398374313032077 + }, + { + "epoch": 0.2456757549105834, + "grad_norm": 3.538854348735928, + "learning_rate": 0.0001228018757327081, + "loss": 5.538155555725098, + "step": 419, + "token_acc": 0.12949818185604256 + }, + { + "epoch": 0.24626209322779244, + "grad_norm": 6.284893238784886, + "learning_rate": 0.000123094958968347, + "loss": 5.549795150756836, + "step": 420, + "token_acc": 0.1259016342565413 + }, + { + "epoch": 0.24684843154500147, + "grad_norm": 4.829788058367907, + "learning_rate": 0.00012338804220398593, + "loss": 5.550993919372559, + "step": 421, + "token_acc": 0.1300300552139439 + }, + { + "epoch": 0.2474347698622105, + "grad_norm": 4.630256565786252, + "learning_rate": 0.00012368112543962486, + "loss": 5.513519763946533, + "step": 422, + "token_acc": 0.13096768376717888 + }, + { + "epoch": 0.24802110817941952, + "grad_norm": 4.508957731839889, + "learning_rate": 0.00012397420867526378, + "loss": 5.520076751708984, + "step": 423, + "token_acc": 0.13224918754586434 + }, + { + "epoch": 0.24860744649662855, + "grad_norm": 3.723587986939637, + "learning_rate": 0.0001242672919109027, + "loss": 5.342349052429199, + "step": 424, + "token_acc": 0.14435009797517961 + }, + { + "epoch": 0.24919378481383758, + "grad_norm": 5.648678919123222, + "learning_rate": 0.0001245603751465416, + "loss": 5.36069393157959, + "step": 425, + "token_acc": 0.14558231572792513 + }, + { + "epoch": 0.2497801231310466, + "grad_norm": 3.801985637338194, + "learning_rate": 0.00012485345838218054, + "loss": 5.387795448303223, + "step": 426, + "token_acc": 0.14029441563859588 + }, + { + "epoch": 0.25036646144825564, + "grad_norm": 3.9776691412757303, + "learning_rate": 0.00012514654161781947, + "loss": 5.389639854431152, + "step": 427, + "token_acc": 0.1379084034812384 + }, + { + "epoch": 0.2509527997654647, + "grad_norm": 5.036749741166907, + "learning_rate": 0.0001254396248534584, + "loss": 5.404719829559326, + "step": 428, + "token_acc": 0.1399757166626547 + }, + { + "epoch": 0.2515391380826737, + "grad_norm": 3.005253859269899, + "learning_rate": 0.00012573270808909732, + "loss": 5.345599174499512, + "step": 429, + "token_acc": 0.14372790345304234 + }, + { + "epoch": 0.25212547639988275, + "grad_norm": 7.125839984359642, + "learning_rate": 0.00012602579132473625, + "loss": 5.4137797355651855, + "step": 430, + "token_acc": 0.13999409500926474 + }, + { + "epoch": 0.25271181471709175, + "grad_norm": 4.434683344779012, + "learning_rate": 0.00012631887456037515, + "loss": 5.269342422485352, + "step": 431, + "token_acc": 0.14908022973169852 + }, + { + "epoch": 0.2532981530343008, + "grad_norm": 7.551460242481809, + "learning_rate": 0.00012661195779601405, + "loss": 5.3578643798828125, + "step": 432, + "token_acc": 0.1418907453417722 + }, + { + "epoch": 0.2538844913515098, + "grad_norm": 3.6294283005438133, + "learning_rate": 0.00012690504103165298, + "loss": 5.3116960525512695, + "step": 433, + "token_acc": 0.14279131091511885 + }, + { + "epoch": 0.25447082966871887, + "grad_norm": 7.059565815092824, + "learning_rate": 0.0001271981242672919, + "loss": 5.324859142303467, + "step": 434, + "token_acc": 0.14112250264249973 + }, + { + "epoch": 0.25505716798592787, + "grad_norm": 4.587438686951819, + "learning_rate": 0.00012749120750293083, + "loss": 5.269500732421875, + "step": 435, + "token_acc": 0.14550385486596573 + }, + { + "epoch": 0.2556435063031369, + "grad_norm": 4.278257181215779, + "learning_rate": 0.00012778429073856976, + "loss": 5.255743026733398, + "step": 436, + "token_acc": 0.1474247030161913 + }, + { + "epoch": 0.2562298446203459, + "grad_norm": 5.488653213285053, + "learning_rate": 0.00012807737397420868, + "loss": 5.346251487731934, + "step": 437, + "token_acc": 0.14210230942724933 + }, + { + "epoch": 0.256816182937555, + "grad_norm": 4.0312762992681845, + "learning_rate": 0.0001283704572098476, + "loss": 5.282540321350098, + "step": 438, + "token_acc": 0.14412451198412093 + }, + { + "epoch": 0.257402521254764, + "grad_norm": 4.6300663375694135, + "learning_rate": 0.00012866354044548654, + "loss": 5.18112850189209, + "step": 439, + "token_acc": 0.15508663307693524 + }, + { + "epoch": 0.25798885957197304, + "grad_norm": 3.7038026719717814, + "learning_rate": 0.00012895662368112544, + "loss": 5.1809892654418945, + "step": 440, + "token_acc": 0.15596789965814592 + }, + { + "epoch": 0.25857519788918204, + "grad_norm": 4.25864334099672, + "learning_rate": 0.00012924970691676437, + "loss": 5.202960968017578, + "step": 441, + "token_acc": 0.15038723513588623 + }, + { + "epoch": 0.2591615362063911, + "grad_norm": 7.2267914422965225, + "learning_rate": 0.00012954279015240327, + "loss": 5.208362579345703, + "step": 442, + "token_acc": 0.1490746610182594 + }, + { + "epoch": 0.2597478745236001, + "grad_norm": 3.4195045685076977, + "learning_rate": 0.0001298358733880422, + "loss": 5.213512897491455, + "step": 443, + "token_acc": 0.14599194512346145 + }, + { + "epoch": 0.26033421284080915, + "grad_norm": 6.422194572580395, + "learning_rate": 0.00013012895662368112, + "loss": 5.196712017059326, + "step": 444, + "token_acc": 0.14994420899885275 + }, + { + "epoch": 0.26092055115801815, + "grad_norm": 3.3713794363129006, + "learning_rate": 0.00013042203985932005, + "loss": 5.15119743347168, + "step": 445, + "token_acc": 0.15283893677732643 + }, + { + "epoch": 0.2615068894752272, + "grad_norm": 6.796911619094949, + "learning_rate": 0.00013071512309495897, + "loss": 5.200589179992676, + "step": 446, + "token_acc": 0.14966023193871295 + }, + { + "epoch": 0.2620932277924362, + "grad_norm": 4.4739001822028035, + "learning_rate": 0.0001310082063305979, + "loss": 5.130384922027588, + "step": 447, + "token_acc": 0.1567592307631606 + }, + { + "epoch": 0.26267956610964527, + "grad_norm": 3.5174103629453457, + "learning_rate": 0.00013130128956623683, + "loss": 5.191955089569092, + "step": 448, + "token_acc": 0.14917900104740844 + }, + { + "epoch": 0.26326590442685427, + "grad_norm": 4.7302865891328505, + "learning_rate": 0.00013159437280187573, + "loss": 5.087374687194824, + "step": 449, + "token_acc": 0.15747861805378852 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 3.42149699764738, + "learning_rate": 0.00013188745603751466, + "loss": 5.169719696044922, + "step": 450, + "token_acc": 0.1484520241550834 + }, + { + "epoch": 0.2644385810612723, + "grad_norm": 4.3774170350702715, + "learning_rate": 0.00013218053927315358, + "loss": 5.158553123474121, + "step": 451, + "token_acc": 0.15196783623354493 + }, + { + "epoch": 0.2650249193784814, + "grad_norm": 5.697101553122713, + "learning_rate": 0.00013247362250879248, + "loss": 5.155243873596191, + "step": 452, + "token_acc": 0.152841854995793 + }, + { + "epoch": 0.26561125769569044, + "grad_norm": 4.18377075223577, + "learning_rate": 0.0001327667057444314, + "loss": 5.048179626464844, + "step": 453, + "token_acc": 0.15982089928738097 + }, + { + "epoch": 0.26619759601289944, + "grad_norm": 4.365075888953617, + "learning_rate": 0.00013305978898007034, + "loss": 4.967833995819092, + "step": 454, + "token_acc": 0.16580578245860272 + }, + { + "epoch": 0.2667839343301085, + "grad_norm": 3.6152317842258745, + "learning_rate": 0.00013335287221570926, + "loss": 5.094614505767822, + "step": 455, + "token_acc": 0.1549353852172373 + }, + { + "epoch": 0.2673702726473175, + "grad_norm": 6.368131028673277, + "learning_rate": 0.0001336459554513482, + "loss": 5.008109092712402, + "step": 456, + "token_acc": 0.16077867009315772 + }, + { + "epoch": 0.26795661096452655, + "grad_norm": 3.1043652406502407, + "learning_rate": 0.00013393903868698712, + "loss": 5.056596755981445, + "step": 457, + "token_acc": 0.159609351413082 + }, + { + "epoch": 0.26854294928173555, + "grad_norm": 5.99537880938208, + "learning_rate": 0.00013423212192262602, + "loss": 5.066808223724365, + "step": 458, + "token_acc": 0.15492473345855712 + }, + { + "epoch": 0.2691292875989446, + "grad_norm": 4.018389706152762, + "learning_rate": 0.00013452520515826495, + "loss": 5.08259916305542, + "step": 459, + "token_acc": 0.15455270631581955 + }, + { + "epoch": 0.2697156259161536, + "grad_norm": 5.212084138386232, + "learning_rate": 0.00013481828839390387, + "loss": 5.064886569976807, + "step": 460, + "token_acc": 0.15543550494113842 + }, + { + "epoch": 0.27030196423336267, + "grad_norm": 3.972706194092152, + "learning_rate": 0.0001351113716295428, + "loss": 5.083024024963379, + "step": 461, + "token_acc": 0.1531443466927338 + }, + { + "epoch": 0.27088830255057167, + "grad_norm": 3.6225981166084416, + "learning_rate": 0.00013540445486518173, + "loss": 4.947565078735352, + "step": 462, + "token_acc": 0.16832664583762866 + }, + { + "epoch": 0.2714746408677807, + "grad_norm": 4.149631590130578, + "learning_rate": 0.00013569753810082063, + "loss": 4.913524627685547, + "step": 463, + "token_acc": 0.1697766859721047 + }, + { + "epoch": 0.2720609791849897, + "grad_norm": 3.171155110106739, + "learning_rate": 0.00013599062133645955, + "loss": 4.927159309387207, + "step": 464, + "token_acc": 0.16713602620664147 + }, + { + "epoch": 0.2726473175021988, + "grad_norm": 4.074171411318694, + "learning_rate": 0.00013628370457209848, + "loss": 5.02904748916626, + "step": 465, + "token_acc": 0.15604169678279445 + }, + { + "epoch": 0.2732336558194078, + "grad_norm": 4.747758175097898, + "learning_rate": 0.0001365767878077374, + "loss": 4.936007022857666, + "step": 466, + "token_acc": 0.1657910727883883 + }, + { + "epoch": 0.27381999413661684, + "grad_norm": 4.091176120075038, + "learning_rate": 0.0001368698710433763, + "loss": 4.963770866394043, + "step": 467, + "token_acc": 0.16243046835206842 + }, + { + "epoch": 0.27440633245382584, + "grad_norm": 4.007489318793903, + "learning_rate": 0.00013716295427901524, + "loss": 4.950592517852783, + "step": 468, + "token_acc": 0.1655808786833166 + }, + { + "epoch": 0.2749926707710349, + "grad_norm": 2.914476230383308, + "learning_rate": 0.00013745603751465416, + "loss": 4.875804424285889, + "step": 469, + "token_acc": 0.16606682310164045 + }, + { + "epoch": 0.2755790090882439, + "grad_norm": 3.9232989048126585, + "learning_rate": 0.0001377491207502931, + "loss": 4.926511764526367, + "step": 470, + "token_acc": 0.16421727409512665 + }, + { + "epoch": 0.27616534740545295, + "grad_norm": 3.6144053806786967, + "learning_rate": 0.00013804220398593202, + "loss": 4.87507438659668, + "step": 471, + "token_acc": 0.1707926773812153 + }, + { + "epoch": 0.27675168572266196, + "grad_norm": 5.396727216600319, + "learning_rate": 0.00013833528722157095, + "loss": 4.926799297332764, + "step": 472, + "token_acc": 0.163388599276186 + }, + { + "epoch": 0.277338024039871, + "grad_norm": 2.987639672318001, + "learning_rate": 0.00013862837045720985, + "loss": 4.893725872039795, + "step": 473, + "token_acc": 0.1669549369529036 + }, + { + "epoch": 0.27792436235708, + "grad_norm": 4.989929850230553, + "learning_rate": 0.00013892145369284877, + "loss": 4.840160846710205, + "step": 474, + "token_acc": 0.17184066359218794 + }, + { + "epoch": 0.27851070067428907, + "grad_norm": 3.6110403848522625, + "learning_rate": 0.0001392145369284877, + "loss": 4.768781661987305, + "step": 475, + "token_acc": 0.17762787195163693 + }, + { + "epoch": 0.27909703899149807, + "grad_norm": 4.650105290437723, + "learning_rate": 0.0001395076201641266, + "loss": 4.8848748207092285, + "step": 476, + "token_acc": 0.1663902528915049 + }, + { + "epoch": 0.2796833773087071, + "grad_norm": 3.3139178413774797, + "learning_rate": 0.00013980070339976553, + "loss": 4.829568862915039, + "step": 477, + "token_acc": 0.169737405414885 + }, + { + "epoch": 0.2802697156259161, + "grad_norm": 3.909762394566097, + "learning_rate": 0.00014009378663540445, + "loss": 4.835160255432129, + "step": 478, + "token_acc": 0.17160266965770152 + }, + { + "epoch": 0.2808560539431252, + "grad_norm": 5.21933032998255, + "learning_rate": 0.00014038686987104338, + "loss": 4.901421070098877, + "step": 479, + "token_acc": 0.16438791649370235 + }, + { + "epoch": 0.28144239226033424, + "grad_norm": 2.863127004079764, + "learning_rate": 0.0001406799531066823, + "loss": 4.756311416625977, + "step": 480, + "token_acc": 0.17576272293721826 + }, + { + "epoch": 0.28202873057754324, + "grad_norm": 6.043684580459418, + "learning_rate": 0.00014097303634232124, + "loss": 4.799533843994141, + "step": 481, + "token_acc": 0.171276678914862 + }, + { + "epoch": 0.2826150688947523, + "grad_norm": 3.4298587559569516, + "learning_rate": 0.00014126611957796016, + "loss": 4.817670822143555, + "step": 482, + "token_acc": 0.16861697080869098 + }, + { + "epoch": 0.2832014072119613, + "grad_norm": 4.7766752965713914, + "learning_rate": 0.00014155920281359906, + "loss": 4.818942070007324, + "step": 483, + "token_acc": 0.17058975480918767 + }, + { + "epoch": 0.28378774552917035, + "grad_norm": 2.9046544792376454, + "learning_rate": 0.000141852286049238, + "loss": 4.799318790435791, + "step": 484, + "token_acc": 0.17104853684090415 + }, + { + "epoch": 0.28437408384637936, + "grad_norm": 4.580249712240795, + "learning_rate": 0.0001421453692848769, + "loss": 4.807289123535156, + "step": 485, + "token_acc": 0.17053504809736608 + }, + { + "epoch": 0.2849604221635884, + "grad_norm": 3.44091745195971, + "learning_rate": 0.00014243845252051582, + "loss": 4.7312235832214355, + "step": 486, + "token_acc": 0.17689699428010078 + }, + { + "epoch": 0.2855467604807974, + "grad_norm": 4.388778649894042, + "learning_rate": 0.00014273153575615474, + "loss": 4.7546892166137695, + "step": 487, + "token_acc": 0.17446729876108408 + }, + { + "epoch": 0.28613309879800647, + "grad_norm": 3.31251169387694, + "learning_rate": 0.00014302461899179367, + "loss": 4.791693687438965, + "step": 488, + "token_acc": 0.17254942921705546 + }, + { + "epoch": 0.28671943711521547, + "grad_norm": 4.17733589463874, + "learning_rate": 0.0001433177022274326, + "loss": 4.675629615783691, + "step": 489, + "token_acc": 0.18031281183909964 + }, + { + "epoch": 0.2873057754324245, + "grad_norm": 2.374452580098422, + "learning_rate": 0.00014361078546307153, + "loss": 4.753551483154297, + "step": 490, + "token_acc": 0.1734588895392462 + }, + { + "epoch": 0.2878921137496335, + "grad_norm": 4.126681562736818, + "learning_rate": 0.00014390386869871045, + "loss": 4.665050029754639, + "step": 491, + "token_acc": 0.1825853501330672 + }, + { + "epoch": 0.2884784520668426, + "grad_norm": 3.2581736524192833, + "learning_rate": 0.00014419695193434938, + "loss": 4.734525203704834, + "step": 492, + "token_acc": 0.17628407539496407 + }, + { + "epoch": 0.2890647903840516, + "grad_norm": 4.195978878494247, + "learning_rate": 0.00014449003516998828, + "loss": 4.720024585723877, + "step": 493, + "token_acc": 0.17701848379319582 + }, + { + "epoch": 0.28965112870126064, + "grad_norm": 3.6177234625347356, + "learning_rate": 0.00014478311840562718, + "loss": 4.724521636962891, + "step": 494, + "token_acc": 0.1753375245325339 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 5.118608365216558, + "learning_rate": 0.0001450762016412661, + "loss": 4.729095458984375, + "step": 495, + "token_acc": 0.17579916941202026 + }, + { + "epoch": 0.2908238053356787, + "grad_norm": 3.2562283272992527, + "learning_rate": 0.00014536928487690504, + "loss": 4.689156532287598, + "step": 496, + "token_acc": 0.17751681708030417 + }, + { + "epoch": 0.2914101436528877, + "grad_norm": 3.780850825032303, + "learning_rate": 0.00014566236811254396, + "loss": 4.734624862670898, + "step": 497, + "token_acc": 0.17406583503173662 + }, + { + "epoch": 0.29199648197009676, + "grad_norm": 2.826405194074065, + "learning_rate": 0.0001459554513481829, + "loss": 4.635134696960449, + "step": 498, + "token_acc": 0.1823283880751012 + }, + { + "epoch": 0.29258282028730576, + "grad_norm": 5.399876723900705, + "learning_rate": 0.00014624853458382182, + "loss": 4.663265705108643, + "step": 499, + "token_acc": 0.18205944086874704 + }, + { + "epoch": 0.2931691586045148, + "grad_norm": 2.2487711399775905, + "learning_rate": 0.00014654161781946074, + "loss": 4.7593278884887695, + "step": 500, + "token_acc": 0.17025784507648223 + }, + { + "epoch": 0.2937554969217238, + "grad_norm": 6.120777372840855, + "learning_rate": 0.00014683470105509967, + "loss": 4.659954071044922, + "step": 501, + "token_acc": 0.18130114849434723 + }, + { + "epoch": 0.29434183523893287, + "grad_norm": 3.507488648517917, + "learning_rate": 0.0001471277842907386, + "loss": 4.660583972930908, + "step": 502, + "token_acc": 0.1801478194306521 + }, + { + "epoch": 0.29492817355614187, + "grad_norm": 4.311213813931897, + "learning_rate": 0.00014742086752637747, + "loss": 4.589162826538086, + "step": 503, + "token_acc": 0.18823028781911766 + }, + { + "epoch": 0.2955145118733509, + "grad_norm": 3.6903797707518136, + "learning_rate": 0.0001477139507620164, + "loss": 4.702823638916016, + "step": 504, + "token_acc": 0.17542404941931272 + }, + { + "epoch": 0.2961008501905599, + "grad_norm": 3.999987914497177, + "learning_rate": 0.00014800703399765533, + "loss": 4.7104949951171875, + "step": 505, + "token_acc": 0.17259399415633728 + }, + { + "epoch": 0.296687188507769, + "grad_norm": 3.5066109164601493, + "learning_rate": 0.00014830011723329425, + "loss": 4.665492057800293, + "step": 506, + "token_acc": 0.17730312938611967 + }, + { + "epoch": 0.297273526824978, + "grad_norm": 4.236748264188352, + "learning_rate": 0.00014859320046893318, + "loss": 4.674029350280762, + "step": 507, + "token_acc": 0.17897982681661173 + }, + { + "epoch": 0.29785986514218704, + "grad_norm": 2.9556318702165614, + "learning_rate": 0.0001488862837045721, + "loss": 4.589700222015381, + "step": 508, + "token_acc": 0.18660020687325846 + }, + { + "epoch": 0.2984462034593961, + "grad_norm": 4.6849527569842255, + "learning_rate": 0.00014917936694021103, + "loss": 4.662505149841309, + "step": 509, + "token_acc": 0.17884270052942447 + }, + { + "epoch": 0.2990325417766051, + "grad_norm": 2.8180560272428927, + "learning_rate": 0.00014947245017584996, + "loss": 4.572309494018555, + "step": 510, + "token_acc": 0.1850498285811631 + }, + { + "epoch": 0.29961888009381415, + "grad_norm": 5.387123123783376, + "learning_rate": 0.00014976553341148886, + "loss": 4.636224746704102, + "step": 511, + "token_acc": 0.18315400590684713 + }, + { + "epoch": 0.30020521841102316, + "grad_norm": 2.9832438924506883, + "learning_rate": 0.0001500586166471278, + "loss": 4.695883750915527, + "step": 512, + "token_acc": 0.173301160852899 + }, + { + "epoch": 0.3007915567282322, + "grad_norm": 3.8761121383422603, + "learning_rate": 0.0001503516998827667, + "loss": 4.663463115692139, + "step": 513, + "token_acc": 0.17561182598916533 + }, + { + "epoch": 0.3013778950454412, + "grad_norm": 3.128858579933917, + "learning_rate": 0.00015064478311840562, + "loss": 4.602659225463867, + "step": 514, + "token_acc": 0.1811925587177552 + }, + { + "epoch": 0.30196423336265027, + "grad_norm": 4.0941899353636, + "learning_rate": 0.00015093786635404454, + "loss": 4.607230186462402, + "step": 515, + "token_acc": 0.18291015931873172 + }, + { + "epoch": 0.30255057167985927, + "grad_norm": 2.796167830470198, + "learning_rate": 0.00015123094958968347, + "loss": 4.67079496383667, + "step": 516, + "token_acc": 0.17750250983016816 + }, + { + "epoch": 0.3031369099970683, + "grad_norm": 2.9856667127973027, + "learning_rate": 0.0001515240328253224, + "loss": 4.563554763793945, + "step": 517, + "token_acc": 0.18527267098680805 + }, + { + "epoch": 0.3037232483142773, + "grad_norm": 3.5670505137011355, + "learning_rate": 0.00015181711606096132, + "loss": 4.562885284423828, + "step": 518, + "token_acc": 0.1843352118555625 + }, + { + "epoch": 0.3043095866314864, + "grad_norm": 4.594787663944211, + "learning_rate": 0.00015211019929660025, + "loss": 4.633196830749512, + "step": 519, + "token_acc": 0.17984546399342943 + }, + { + "epoch": 0.3048959249486954, + "grad_norm": 2.9036673221380154, + "learning_rate": 0.00015240328253223915, + "loss": 4.570888996124268, + "step": 520, + "token_acc": 0.1806706297843537 + }, + { + "epoch": 0.30548226326590444, + "grad_norm": 3.9279495400431843, + "learning_rate": 0.00015269636576787808, + "loss": 4.625429153442383, + "step": 521, + "token_acc": 0.17875706272781958 + }, + { + "epoch": 0.30606860158311344, + "grad_norm": 2.677651938949762, + "learning_rate": 0.000152989449003517, + "loss": 4.521848678588867, + "step": 522, + "token_acc": 0.18934240957410545 + }, + { + "epoch": 0.3066549399003225, + "grad_norm": 2.916817592188344, + "learning_rate": 0.0001532825322391559, + "loss": 4.53693962097168, + "step": 523, + "token_acc": 0.18639868637110016 + }, + { + "epoch": 0.3072412782175315, + "grad_norm": 3.6160515179985886, + "learning_rate": 0.00015357561547479483, + "loss": 4.605867385864258, + "step": 524, + "token_acc": 0.17898636395385836 + }, + { + "epoch": 0.30782761653474056, + "grad_norm": 2.3900981644929695, + "learning_rate": 0.00015386869871043376, + "loss": 4.502622604370117, + "step": 525, + "token_acc": 0.18960608354056294 + }, + { + "epoch": 0.30841395485194956, + "grad_norm": 4.384180131112934, + "learning_rate": 0.0001541617819460727, + "loss": 4.515637397766113, + "step": 526, + "token_acc": 0.18843714218207594 + }, + { + "epoch": 0.3090002931691586, + "grad_norm": 2.223020506544334, + "learning_rate": 0.00015445486518171161, + "loss": 4.538787841796875, + "step": 527, + "token_acc": 0.18645295901879946 + }, + { + "epoch": 0.3095866314863676, + "grad_norm": 3.0828032954448212, + "learning_rate": 0.00015474794841735054, + "loss": 4.515827178955078, + "step": 528, + "token_acc": 0.18863375736725604 + }, + { + "epoch": 0.31017296980357667, + "grad_norm": 3.5244506426862072, + "learning_rate": 0.00015504103165298944, + "loss": 4.562204360961914, + "step": 529, + "token_acc": 0.18436511640640685 + }, + { + "epoch": 0.31075930812078567, + "grad_norm": 3.480444321801198, + "learning_rate": 0.00015533411488862837, + "loss": 4.504839897155762, + "step": 530, + "token_acc": 0.1886315175002635 + }, + { + "epoch": 0.3113456464379947, + "grad_norm": 3.373181923178309, + "learning_rate": 0.0001556271981242673, + "loss": 4.509557247161865, + "step": 531, + "token_acc": 0.1863521138277864 + }, + { + "epoch": 0.31193198475520373, + "grad_norm": 3.4565189514168075, + "learning_rate": 0.00015592028135990622, + "loss": 4.46580696105957, + "step": 532, + "token_acc": 0.1925276481780916 + }, + { + "epoch": 0.3125183230724128, + "grad_norm": 3.6008447696143775, + "learning_rate": 0.00015621336459554515, + "loss": 4.435978889465332, + "step": 533, + "token_acc": 0.19529056900029182 + }, + { + "epoch": 0.3131046613896218, + "grad_norm": 2.6080844785138124, + "learning_rate": 0.00015650644783118405, + "loss": 4.429786205291748, + "step": 534, + "token_acc": 0.195933408321611 + }, + { + "epoch": 0.31369099970683084, + "grad_norm": 3.9659555466611365, + "learning_rate": 0.00015679953106682298, + "loss": 4.470489978790283, + "step": 535, + "token_acc": 0.19054969739230412 + }, + { + "epoch": 0.3142773380240399, + "grad_norm": 3.190122712935279, + "learning_rate": 0.0001570926143024619, + "loss": 4.506174087524414, + "step": 536, + "token_acc": 0.1851689226060097 + }, + { + "epoch": 0.3148636763412489, + "grad_norm": 3.6036123067173675, + "learning_rate": 0.00015738569753810083, + "loss": 4.515082359313965, + "step": 537, + "token_acc": 0.18538086563940442 + }, + { + "epoch": 0.31545001465845796, + "grad_norm": 3.6821928414589697, + "learning_rate": 0.00015767878077373973, + "loss": 4.4421234130859375, + "step": 538, + "token_acc": 0.19192258497952222 + }, + { + "epoch": 0.31603635297566696, + "grad_norm": 4.579216954552315, + "learning_rate": 0.00015797186400937866, + "loss": 4.4645490646362305, + "step": 539, + "token_acc": 0.19084683586215595 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 2.157258419917712, + "learning_rate": 0.0001582649472450176, + "loss": 4.472905158996582, + "step": 540, + "token_acc": 0.19049191918931263 + }, + { + "epoch": 0.317209029610085, + "grad_norm": 5.125895666636859, + "learning_rate": 0.00015855803048065651, + "loss": 4.477203369140625, + "step": 541, + "token_acc": 0.1892764893158076 + }, + { + "epoch": 0.31779536792729407, + "grad_norm": 2.4641923245822013, + "learning_rate": 0.00015885111371629544, + "loss": 4.522824287414551, + "step": 542, + "token_acc": 0.1858015348256725 + }, + { + "epoch": 0.31838170624450307, + "grad_norm": 4.113866472139571, + "learning_rate": 0.00015914419695193437, + "loss": 4.524702072143555, + "step": 543, + "token_acc": 0.18356269733107675 + }, + { + "epoch": 0.3189680445617121, + "grad_norm": 3.2474792562328254, + "learning_rate": 0.00015943728018757327, + "loss": 4.578799247741699, + "step": 544, + "token_acc": 0.17872985772194935 + }, + { + "epoch": 0.31955438287892113, + "grad_norm": 3.0974908436309, + "learning_rate": 0.0001597303634232122, + "loss": 4.3994879722595215, + "step": 545, + "token_acc": 0.19697764501283002 + }, + { + "epoch": 0.3201407211961302, + "grad_norm": 3.0481663775573913, + "learning_rate": 0.00016002344665885112, + "loss": 4.469366550445557, + "step": 546, + "token_acc": 0.1881582026679148 + }, + { + "epoch": 0.3207270595133392, + "grad_norm": 2.4321962734241316, + "learning_rate": 0.00016031652989449002, + "loss": 4.466395854949951, + "step": 547, + "token_acc": 0.1915929492623904 + }, + { + "epoch": 0.32131339783054824, + "grad_norm": 3.7179095396605657, + "learning_rate": 0.00016060961313012895, + "loss": 4.5207414627075195, + "step": 548, + "token_acc": 0.18468413558425256 + }, + { + "epoch": 0.32189973614775724, + "grad_norm": 2.833110587710605, + "learning_rate": 0.00016090269636576788, + "loss": 4.469472885131836, + "step": 549, + "token_acc": 0.18806375661174515 + }, + { + "epoch": 0.3224860744649663, + "grad_norm": 3.7109472767311766, + "learning_rate": 0.0001611957796014068, + "loss": 4.402301788330078, + "step": 550, + "token_acc": 0.19425649739490036 + }, + { + "epoch": 0.3230724127821753, + "grad_norm": 2.6285673973131027, + "learning_rate": 0.00016148886283704573, + "loss": 4.452491760253906, + "step": 551, + "token_acc": 0.18925668332399154 + }, + { + "epoch": 0.32365875109938436, + "grad_norm": 3.959979555121441, + "learning_rate": 0.00016178194607268466, + "loss": 4.421535491943359, + "step": 552, + "token_acc": 0.19194938166322772 + }, + { + "epoch": 0.32424508941659336, + "grad_norm": 3.0536149646835398, + "learning_rate": 0.00016207502930832359, + "loss": 4.468191623687744, + "step": 553, + "token_acc": 0.1887056466466493 + }, + { + "epoch": 0.3248314277338024, + "grad_norm": 1.9387140243529846, + "learning_rate": 0.00016236811254396249, + "loss": 4.35836124420166, + "step": 554, + "token_acc": 0.20117667843532056 + }, + { + "epoch": 0.3254177660510114, + "grad_norm": 3.4994199637137653, + "learning_rate": 0.0001626611957796014, + "loss": 4.385231971740723, + "step": 555, + "token_acc": 0.19869304672345237 + }, + { + "epoch": 0.32600410436822047, + "grad_norm": 3.312716764496414, + "learning_rate": 0.0001629542790152403, + "loss": 4.452750205993652, + "step": 556, + "token_acc": 0.1890747709967253 + }, + { + "epoch": 0.32659044268542947, + "grad_norm": 3.42770904470029, + "learning_rate": 0.00016324736225087924, + "loss": 4.490422248840332, + "step": 557, + "token_acc": 0.1838231117582968 + }, + { + "epoch": 0.32717678100263853, + "grad_norm": 2.7378232095594153, + "learning_rate": 0.00016354044548651817, + "loss": 4.38887882232666, + "step": 558, + "token_acc": 0.1926649779893164 + }, + { + "epoch": 0.32776311931984753, + "grad_norm": 3.151619286716409, + "learning_rate": 0.0001638335287221571, + "loss": 4.399745464324951, + "step": 559, + "token_acc": 0.1940473683399508 + }, + { + "epoch": 0.3283494576370566, + "grad_norm": 2.323076613373229, + "learning_rate": 0.00016412661195779602, + "loss": 4.426732063293457, + "step": 560, + "token_acc": 0.19175233375274034 + }, + { + "epoch": 0.3289357959542656, + "grad_norm": 3.015180056428602, + "learning_rate": 0.00016441969519343495, + "loss": 4.401934623718262, + "step": 561, + "token_acc": 0.19396697702772409 + }, + { + "epoch": 0.32952213427147464, + "grad_norm": 2.9430845511558554, + "learning_rate": 0.00016471277842907388, + "loss": 4.353907585144043, + "step": 562, + "token_acc": 0.19816314268547394 + }, + { + "epoch": 0.33010847258868364, + "grad_norm": 3.1828030565555006, + "learning_rate": 0.0001650058616647128, + "loss": 4.3354268074035645, + "step": 563, + "token_acc": 0.20195604316287127 + }, + { + "epoch": 0.3306948109058927, + "grad_norm": 2.888642548657088, + "learning_rate": 0.0001652989449003517, + "loss": 4.373227119445801, + "step": 564, + "token_acc": 0.19680482395342389 + }, + { + "epoch": 0.33128114922310176, + "grad_norm": 3.727612892595648, + "learning_rate": 0.0001655920281359906, + "loss": 4.3680419921875, + "step": 565, + "token_acc": 0.1959753619077607 + }, + { + "epoch": 0.33186748754031076, + "grad_norm": 2.1111682287067963, + "learning_rate": 0.00016588511137162953, + "loss": 4.369876861572266, + "step": 566, + "token_acc": 0.19430340072089014 + }, + { + "epoch": 0.3324538258575198, + "grad_norm": 4.386398150910521, + "learning_rate": 0.00016617819460726846, + "loss": 4.440284252166748, + "step": 567, + "token_acc": 0.18975706525690428 + }, + { + "epoch": 0.3330401641747288, + "grad_norm": 2.1936625491596784, + "learning_rate": 0.00016647127784290739, + "loss": 4.39591121673584, + "step": 568, + "token_acc": 0.19313192965279477 + }, + { + "epoch": 0.33362650249193787, + "grad_norm": 3.2652331006577655, + "learning_rate": 0.0001667643610785463, + "loss": 4.384404182434082, + "step": 569, + "token_acc": 0.19393213633751824 + }, + { + "epoch": 0.33421284080914687, + "grad_norm": 2.9407242805369176, + "learning_rate": 0.00016705744431418524, + "loss": 4.441839218139648, + "step": 570, + "token_acc": 0.187965974278624 + }, + { + "epoch": 0.33479917912635593, + "grad_norm": 2.020857634519983, + "learning_rate": 0.00016735052754982417, + "loss": 4.312097549438477, + "step": 571, + "token_acc": 0.20122706531290696 + }, + { + "epoch": 0.33538551744356493, + "grad_norm": 3.835885045193653, + "learning_rate": 0.0001676436107854631, + "loss": 4.369815826416016, + "step": 572, + "token_acc": 0.19539235749128023 + }, + { + "epoch": 0.335971855760774, + "grad_norm": 2.347547928381657, + "learning_rate": 0.00016793669402110202, + "loss": 4.357950210571289, + "step": 573, + "token_acc": 0.1969721098618764 + }, + { + "epoch": 0.336558194077983, + "grad_norm": 3.178535706394947, + "learning_rate": 0.0001682297772567409, + "loss": 4.401962757110596, + "step": 574, + "token_acc": 0.19254838608488348 + }, + { + "epoch": 0.33714453239519204, + "grad_norm": 2.9710820442098504, + "learning_rate": 0.00016852286049237982, + "loss": 4.346654891967773, + "step": 575, + "token_acc": 0.19889154413195606 + }, + { + "epoch": 0.33773087071240104, + "grad_norm": 2.453938641919291, + "learning_rate": 0.00016881594372801875, + "loss": 4.352444648742676, + "step": 576, + "token_acc": 0.1979722464027856 + }, + { + "epoch": 0.3383172090296101, + "grad_norm": 3.4726701501672452, + "learning_rate": 0.00016910902696365768, + "loss": 4.365447044372559, + "step": 577, + "token_acc": 0.19561060986647677 + }, + { + "epoch": 0.3389035473468191, + "grad_norm": 2.857005802336948, + "learning_rate": 0.0001694021101992966, + "loss": 4.408263206481934, + "step": 578, + "token_acc": 0.19032571101545198 + }, + { + "epoch": 0.33948988566402816, + "grad_norm": 2.8735270224763916, + "learning_rate": 0.00016969519343493553, + "loss": 4.34467077255249, + "step": 579, + "token_acc": 0.1974039596171496 + }, + { + "epoch": 0.34007622398123716, + "grad_norm": 2.478860741700152, + "learning_rate": 0.00016998827667057446, + "loss": 4.283778190612793, + "step": 580, + "token_acc": 0.20183136785016081 + }, + { + "epoch": 0.3406625622984462, + "grad_norm": 3.2839133701779444, + "learning_rate": 0.00017028135990621338, + "loss": 4.350898742675781, + "step": 581, + "token_acc": 0.19648146771847494 + }, + { + "epoch": 0.3412489006156552, + "grad_norm": 2.3226703308053547, + "learning_rate": 0.00017057444314185228, + "loss": 4.3489179611206055, + "step": 582, + "token_acc": 0.1970700570292471 + }, + { + "epoch": 0.34183523893286427, + "grad_norm": 3.370819504708132, + "learning_rate": 0.0001708675263774912, + "loss": 4.374782562255859, + "step": 583, + "token_acc": 0.19299956099124407 + }, + { + "epoch": 0.3424215772500733, + "grad_norm": 2.1288858852401185, + "learning_rate": 0.0001711606096131301, + "loss": 4.364096641540527, + "step": 584, + "token_acc": 0.19417908127096736 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 4.065434395585224, + "learning_rate": 0.00017145369284876904, + "loss": 4.389575958251953, + "step": 585, + "token_acc": 0.19208265079562506 + }, + { + "epoch": 0.34359425388449133, + "grad_norm": 2.359015143636616, + "learning_rate": 0.00017174677608440797, + "loss": 4.342007160186768, + "step": 586, + "token_acc": 0.19773779136999545 + }, + { + "epoch": 0.3441805922017004, + "grad_norm": 3.283454040141777, + "learning_rate": 0.0001720398593200469, + "loss": 4.32462215423584, + "step": 587, + "token_acc": 0.19869578104930344 + }, + { + "epoch": 0.3447669305189094, + "grad_norm": 2.6925374365327404, + "learning_rate": 0.00017233294255568582, + "loss": 4.30461311340332, + "step": 588, + "token_acc": 0.19965218236661814 + }, + { + "epoch": 0.34535326883611844, + "grad_norm": 3.203168425369097, + "learning_rate": 0.00017262602579132475, + "loss": 4.323164463043213, + "step": 589, + "token_acc": 0.1986490328422959 + }, + { + "epoch": 0.34593960715332744, + "grad_norm": 2.1656274852807917, + "learning_rate": 0.00017291910902696367, + "loss": 4.297807693481445, + "step": 590, + "token_acc": 0.19995629763878714 + }, + { + "epoch": 0.3465259454705365, + "grad_norm": 4.069325046705623, + "learning_rate": 0.00017321219226260257, + "loss": 4.383184909820557, + "step": 591, + "token_acc": 0.19115910507504608 + }, + { + "epoch": 0.34711228378774556, + "grad_norm": 2.2855486485486205, + "learning_rate": 0.0001735052754982415, + "loss": 4.2431535720825195, + "step": 592, + "token_acc": 0.20432408538963573 + }, + { + "epoch": 0.34769862210495456, + "grad_norm": 3.1184100370119503, + "learning_rate": 0.00017379835873388043, + "loss": 4.280085563659668, + "step": 593, + "token_acc": 0.20095056735844813 + }, + { + "epoch": 0.3482849604221636, + "grad_norm": 2.4848425502508524, + "learning_rate": 0.00017409144196951936, + "loss": 4.318235397338867, + "step": 594, + "token_acc": 0.19635401074574188 + }, + { + "epoch": 0.3488712987393726, + "grad_norm": 1.8658468156999546, + "learning_rate": 0.00017438452520515826, + "loss": 4.249464511871338, + "step": 595, + "token_acc": 0.20389160030824557 + }, + { + "epoch": 0.34945763705658167, + "grad_norm": 3.8716497641936853, + "learning_rate": 0.00017467760844079718, + "loss": 4.287879943847656, + "step": 596, + "token_acc": 0.20160418120694523 + }, + { + "epoch": 0.3500439753737907, + "grad_norm": 2.0078757419361355, + "learning_rate": 0.0001749706916764361, + "loss": 4.318107604980469, + "step": 597, + "token_acc": 0.19689672905707276 + }, + { + "epoch": 0.35063031369099973, + "grad_norm": 2.884315560846559, + "learning_rate": 0.00017526377491207504, + "loss": 4.311124801635742, + "step": 598, + "token_acc": 0.19751258977310998 + }, + { + "epoch": 0.35121665200820873, + "grad_norm": 2.7484929202368944, + "learning_rate": 0.00017555685814771397, + "loss": 4.3280205726623535, + "step": 599, + "token_acc": 0.19439567476013922 + }, + { + "epoch": 0.3518029903254178, + "grad_norm": 2.2047357723896215, + "learning_rate": 0.00017584994138335287, + "loss": 4.284904479980469, + "step": 600, + "token_acc": 0.2003533531799161 + }, + { + "epoch": 0.3523893286426268, + "grad_norm": 3.1107430335760817, + "learning_rate": 0.0001761430246189918, + "loss": 4.276244640350342, + "step": 601, + "token_acc": 0.20092996600544433 + }, + { + "epoch": 0.35297566695983584, + "grad_norm": 2.0833190258734384, + "learning_rate": 0.00017643610785463072, + "loss": 4.340874671936035, + "step": 602, + "token_acc": 0.1932120958569206 + }, + { + "epoch": 0.35356200527704484, + "grad_norm": 2.514109913489446, + "learning_rate": 0.00017672919109026965, + "loss": 4.243865013122559, + "step": 603, + "token_acc": 0.2062694576927058 + }, + { + "epoch": 0.3541483435942539, + "grad_norm": 2.103358985280864, + "learning_rate": 0.00017702227432590857, + "loss": 4.242303848266602, + "step": 604, + "token_acc": 0.20248158859914006 + }, + { + "epoch": 0.3547346819114629, + "grad_norm": 2.7013474943179427, + "learning_rate": 0.00017731535756154747, + "loss": 4.238546371459961, + "step": 605, + "token_acc": 0.2064178454785494 + }, + { + "epoch": 0.35532102022867196, + "grad_norm": 2.559143552036905, + "learning_rate": 0.0001776084407971864, + "loss": 4.238079071044922, + "step": 606, + "token_acc": 0.2062076735148546 + }, + { + "epoch": 0.35590735854588096, + "grad_norm": 3.261877485645797, + "learning_rate": 0.00017790152403282533, + "loss": 4.278648376464844, + "step": 607, + "token_acc": 0.20108211677410812 + }, + { + "epoch": 0.35649369686309, + "grad_norm": 2.317306909803825, + "learning_rate": 0.00017819460726846426, + "loss": 4.323906421661377, + "step": 608, + "token_acc": 0.19419433440189857 + }, + { + "epoch": 0.357080035180299, + "grad_norm": 2.779401267702493, + "learning_rate": 0.00017848769050410316, + "loss": 4.259009838104248, + "step": 609, + "token_acc": 0.20273544964205653 + }, + { + "epoch": 0.35766637349750807, + "grad_norm": 2.425582147671826, + "learning_rate": 0.00017878077373974208, + "loss": 4.300567626953125, + "step": 610, + "token_acc": 0.19641605305305818 + }, + { + "epoch": 0.3582527118147171, + "grad_norm": 3.840767861503941, + "learning_rate": 0.000179073856975381, + "loss": 4.302745819091797, + "step": 611, + "token_acc": 0.19933901918976546 + }, + { + "epoch": 0.35883905013192613, + "grad_norm": 2.0539525690315923, + "learning_rate": 0.00017936694021101994, + "loss": 4.270225524902344, + "step": 612, + "token_acc": 0.20239351806149336 + }, + { + "epoch": 0.35942538844913513, + "grad_norm": 2.757055231102099, + "learning_rate": 0.00017966002344665886, + "loss": 4.273060321807861, + "step": 613, + "token_acc": 0.20284586436350519 + }, + { + "epoch": 0.3600117267663442, + "grad_norm": 2.4590772409139188, + "learning_rate": 0.0001799531066822978, + "loss": 4.348200798034668, + "step": 614, + "token_acc": 0.19452398829712866 + }, + { + "epoch": 0.3605980650835532, + "grad_norm": 2.351065205116467, + "learning_rate": 0.0001802461899179367, + "loss": 4.231278896331787, + "step": 615, + "token_acc": 0.20566704885304 + }, + { + "epoch": 0.36118440340076224, + "grad_norm": 2.6967747274489073, + "learning_rate": 0.00018053927315357562, + "loss": 4.202358245849609, + "step": 616, + "token_acc": 0.20790187042558958 + }, + { + "epoch": 0.36177074171797124, + "grad_norm": 2.457169452104043, + "learning_rate": 0.00018083235638921455, + "loss": 4.189783096313477, + "step": 617, + "token_acc": 0.2092393323817407 + }, + { + "epoch": 0.3623570800351803, + "grad_norm": 2.6406543537554494, + "learning_rate": 0.00018112543962485345, + "loss": 4.248398780822754, + "step": 618, + "token_acc": 0.20415585384439128 + }, + { + "epoch": 0.3629434183523893, + "grad_norm": 2.636979858364093, + "learning_rate": 0.00018141852286049237, + "loss": 4.2956647872924805, + "step": 619, + "token_acc": 0.19798279583544948 + }, + { + "epoch": 0.36352975666959836, + "grad_norm": 2.1239145706551477, + "learning_rate": 0.0001817116060961313, + "loss": 4.238051414489746, + "step": 620, + "token_acc": 0.20078229568697775 + }, + { + "epoch": 0.3641160949868074, + "grad_norm": 3.3368681844812147, + "learning_rate": 0.00018200468933177023, + "loss": 4.244442939758301, + "step": 621, + "token_acc": 0.20395911836060496 + }, + { + "epoch": 0.3647024333040164, + "grad_norm": 1.6621377139225848, + "learning_rate": 0.00018229777256740915, + "loss": 4.1781325340271, + "step": 622, + "token_acc": 0.20918979862408785 + }, + { + "epoch": 0.36528877162122547, + "grad_norm": 4.310592781243259, + "learning_rate": 0.00018259085580304808, + "loss": 4.3004279136657715, + "step": 623, + "token_acc": 0.20038858582515773 + }, + { + "epoch": 0.3658751099384345, + "grad_norm": 2.093247029693665, + "learning_rate": 0.000182883939038687, + "loss": 4.260926723480225, + "step": 624, + "token_acc": 0.19990207701121143 + }, + { + "epoch": 0.36646144825564353, + "grad_norm": 2.875707055580328, + "learning_rate": 0.0001831770222743259, + "loss": 4.207454681396484, + "step": 625, + "token_acc": 0.20758294094486066 + }, + { + "epoch": 0.36704778657285253, + "grad_norm": 2.6211863582176185, + "learning_rate": 0.00018347010550996484, + "loss": 4.265021324157715, + "step": 626, + "token_acc": 0.20177751420488696 + }, + { + "epoch": 0.3676341248900616, + "grad_norm": 1.9973754906759469, + "learning_rate": 0.00018376318874560374, + "loss": 4.224093437194824, + "step": 627, + "token_acc": 0.2052583200410933 + }, + { + "epoch": 0.3682204632072706, + "grad_norm": 3.055874514068887, + "learning_rate": 0.00018405627198124266, + "loss": 4.233010292053223, + "step": 628, + "token_acc": 0.20451630232201198 + }, + { + "epoch": 0.36880680152447964, + "grad_norm": 2.4864402857444987, + "learning_rate": 0.0001843493552168816, + "loss": 4.277772426605225, + "step": 629, + "token_acc": 0.1984967330453631 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 2.8483866732175764, + "learning_rate": 0.00018464243845252052, + "loss": 4.214406967163086, + "step": 630, + "token_acc": 0.2045053015060296 + }, + { + "epoch": 0.3699794781588977, + "grad_norm": 2.3936202913166955, + "learning_rate": 0.00018493552168815945, + "loss": 4.237961292266846, + "step": 631, + "token_acc": 0.2022509319896603 + }, + { + "epoch": 0.3705658164761067, + "grad_norm": 3.009788283392778, + "learning_rate": 0.00018522860492379837, + "loss": 4.20882511138916, + "step": 632, + "token_acc": 0.20298626039816806 + }, + { + "epoch": 0.37115215479331576, + "grad_norm": 2.4938104821910785, + "learning_rate": 0.0001855216881594373, + "loss": 4.223635673522949, + "step": 633, + "token_acc": 0.20166257528877618 + }, + { + "epoch": 0.37173849311052476, + "grad_norm": 2.5118341234503103, + "learning_rate": 0.00018581477139507623, + "loss": 4.199278831481934, + "step": 634, + "token_acc": 0.20652520926610862 + }, + { + "epoch": 0.3723248314277338, + "grad_norm": 2.6770372173940804, + "learning_rate": 0.00018610785463071513, + "loss": 4.208103179931641, + "step": 635, + "token_acc": 0.20533533771131682 + }, + { + "epoch": 0.3729111697449428, + "grad_norm": 1.8882324339634855, + "learning_rate": 0.00018640093786635403, + "loss": 4.210339546203613, + "step": 636, + "token_acc": 0.20622000492226916 + }, + { + "epoch": 0.3734975080621519, + "grad_norm": 2.3713912485881776, + "learning_rate": 0.00018669402110199295, + "loss": 4.169193267822266, + "step": 637, + "token_acc": 0.2085651312236603 + }, + { + "epoch": 0.3740838463793609, + "grad_norm": 2.603907089459109, + "learning_rate": 0.00018698710433763188, + "loss": 4.230314254760742, + "step": 638, + "token_acc": 0.20190862220638509 + }, + { + "epoch": 0.37467018469656993, + "grad_norm": 1.5405340251070139, + "learning_rate": 0.0001872801875732708, + "loss": 4.1755571365356445, + "step": 639, + "token_acc": 0.20599684323248932 + }, + { + "epoch": 0.37525652301377893, + "grad_norm": 3.2916341186210762, + "learning_rate": 0.00018757327080890974, + "loss": 4.271075248718262, + "step": 640, + "token_acc": 0.19831048402698068 + }, + { + "epoch": 0.375842861330988, + "grad_norm": 1.811396346941996, + "learning_rate": 0.00018786635404454866, + "loss": 4.204448699951172, + "step": 641, + "token_acc": 0.2036325982459875 + }, + { + "epoch": 0.376429199648197, + "grad_norm": 3.0144909423445023, + "learning_rate": 0.0001881594372801876, + "loss": 4.224557399749756, + "step": 642, + "token_acc": 0.20237541943952156 + }, + { + "epoch": 0.37701553796540604, + "grad_norm": 1.8952182270429252, + "learning_rate": 0.00018845252051582652, + "loss": 4.130260467529297, + "step": 643, + "token_acc": 0.21324102862728378 + }, + { + "epoch": 0.37760187628261505, + "grad_norm": 3.1569638666645696, + "learning_rate": 0.00018874560375146544, + "loss": 4.187472343444824, + "step": 644, + "token_acc": 0.20633202372931822 + }, + { + "epoch": 0.3781882145998241, + "grad_norm": 2.0198294319321546, + "learning_rate": 0.00018903868698710432, + "loss": 4.226292610168457, + "step": 645, + "token_acc": 0.20234970628671417 + }, + { + "epoch": 0.3787745529170331, + "grad_norm": 2.2042089620698837, + "learning_rate": 0.00018933177022274324, + "loss": 4.2882232666015625, + "step": 646, + "token_acc": 0.19653220827330725 + }, + { + "epoch": 0.37936089123424216, + "grad_norm": 2.2112993057427555, + "learning_rate": 0.00018962485345838217, + "loss": 4.209702491760254, + "step": 647, + "token_acc": 0.2061230998192835 + }, + { + "epoch": 0.37994722955145116, + "grad_norm": 2.0707954712151477, + "learning_rate": 0.0001899179366940211, + "loss": 4.175907135009766, + "step": 648, + "token_acc": 0.20961082661613126 + }, + { + "epoch": 0.3805335678686602, + "grad_norm": 2.5413261692069367, + "learning_rate": 0.00019021101992966003, + "loss": 4.276764869689941, + "step": 649, + "token_acc": 0.19810358943200368 + }, + { + "epoch": 0.3811199061858693, + "grad_norm": 1.9520696277506953, + "learning_rate": 0.00019050410316529895, + "loss": 4.206478118896484, + "step": 650, + "token_acc": 0.20120135590426244 + }, + { + "epoch": 0.3817062445030783, + "grad_norm": 2.592106622393218, + "learning_rate": 0.00019079718640093788, + "loss": 4.13508415222168, + "step": 651, + "token_acc": 0.21329034596661586 + }, + { + "epoch": 0.38229258282028733, + "grad_norm": 2.1197731947154654, + "learning_rate": 0.0001910902696365768, + "loss": 4.264552593231201, + "step": 652, + "token_acc": 0.19786214849640735 + }, + { + "epoch": 0.38287892113749633, + "grad_norm": 2.529222559482183, + "learning_rate": 0.00019138335287221573, + "loss": 4.22831392288208, + "step": 653, + "token_acc": 0.20204209385635905 + }, + { + "epoch": 0.3834652594547054, + "grad_norm": 1.9404252012608736, + "learning_rate": 0.00019167643610785463, + "loss": 4.157746315002441, + "step": 654, + "token_acc": 0.20975458975344685 + }, + { + "epoch": 0.3840515977719144, + "grad_norm": 2.512600821555176, + "learning_rate": 0.00019196951934349353, + "loss": 4.170214653015137, + "step": 655, + "token_acc": 0.20631282485127453 + }, + { + "epoch": 0.38463793608912344, + "grad_norm": 2.701434050883918, + "learning_rate": 0.00019226260257913246, + "loss": 4.170098304748535, + "step": 656, + "token_acc": 0.20622947980064404 + }, + { + "epoch": 0.38522427440633245, + "grad_norm": 2.4431766897358385, + "learning_rate": 0.0001925556858147714, + "loss": 4.179371356964111, + "step": 657, + "token_acc": 0.205891707492241 + }, + { + "epoch": 0.3858106127235415, + "grad_norm": 2.0730537353964187, + "learning_rate": 0.00019284876905041032, + "loss": 4.178879737854004, + "step": 658, + "token_acc": 0.2054048947415381 + }, + { + "epoch": 0.3863969510407505, + "grad_norm": 2.6491191267137, + "learning_rate": 0.00019314185228604924, + "loss": 4.181436538696289, + "step": 659, + "token_acc": 0.20544109301439537 + }, + { + "epoch": 0.38698328935795956, + "grad_norm": 1.8885377245486985, + "learning_rate": 0.00019343493552168817, + "loss": 4.13986873626709, + "step": 660, + "token_acc": 0.20963559195035902 + }, + { + "epoch": 0.38756962767516856, + "grad_norm": 2.2991514894392853, + "learning_rate": 0.0001937280187573271, + "loss": 4.190619468688965, + "step": 661, + "token_acc": 0.20524464103072138 + }, + { + "epoch": 0.3881559659923776, + "grad_norm": 2.362925115009617, + "learning_rate": 0.000194021101992966, + "loss": 4.182242393493652, + "step": 662, + "token_acc": 0.20488680661232828 + }, + { + "epoch": 0.3887423043095866, + "grad_norm": 2.640894521065949, + "learning_rate": 0.00019431418522860493, + "loss": 4.087883949279785, + "step": 663, + "token_acc": 0.2144021990511306 + }, + { + "epoch": 0.3893286426267957, + "grad_norm": 1.6606458079428112, + "learning_rate": 0.00019460726846424385, + "loss": 4.110154151916504, + "step": 664, + "token_acc": 0.212002778562323 + }, + { + "epoch": 0.3899149809440047, + "grad_norm": 2.1844881931565054, + "learning_rate": 0.00019490035169988278, + "loss": 4.138330459594727, + "step": 665, + "token_acc": 0.21075519561346842 + }, + { + "epoch": 0.39050131926121373, + "grad_norm": 2.2601177318952486, + "learning_rate": 0.00019519343493552168, + "loss": 4.164782524108887, + "step": 666, + "token_acc": 0.20874460868147818 + }, + { + "epoch": 0.39108765757842273, + "grad_norm": 2.497526862000262, + "learning_rate": 0.0001954865181711606, + "loss": 4.187143325805664, + "step": 667, + "token_acc": 0.20432454563823338 + }, + { + "epoch": 0.3916739958956318, + "grad_norm": 2.557495163505733, + "learning_rate": 0.00019577960140679953, + "loss": 4.090910911560059, + "step": 668, + "token_acc": 0.21460647173811437 + }, + { + "epoch": 0.3922603342128408, + "grad_norm": 2.0140974221078505, + "learning_rate": 0.00019607268464243846, + "loss": 4.16569185256958, + "step": 669, + "token_acc": 0.20671116569120165 + }, + { + "epoch": 0.39284667253004985, + "grad_norm": 2.9724759185931, + "learning_rate": 0.0001963657678780774, + "loss": 4.216405868530273, + "step": 670, + "token_acc": 0.2014534242441915 + }, + { + "epoch": 0.39343301084725885, + "grad_norm": 1.6546005712160807, + "learning_rate": 0.0001966588511137163, + "loss": 4.114252090454102, + "step": 671, + "token_acc": 0.2104150753874002 + }, + { + "epoch": 0.3940193491644679, + "grad_norm": 2.8122543091915144, + "learning_rate": 0.00019695193434935522, + "loss": 4.128007888793945, + "step": 672, + "token_acc": 0.21066403190755523 + }, + { + "epoch": 0.3946056874816769, + "grad_norm": 1.5690590844479788, + "learning_rate": 0.00019724501758499414, + "loss": 4.1066789627075195, + "step": 673, + "token_acc": 0.21355748085737752 + }, + { + "epoch": 0.39519202579888596, + "grad_norm": 2.3403342855611395, + "learning_rate": 0.00019753810082063307, + "loss": 4.1145219802856445, + "step": 674, + "token_acc": 0.21464247988419558 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 2.202178188154737, + "learning_rate": 0.000197831184056272, + "loss": 4.1572265625, + "step": 675, + "token_acc": 0.2079692853238772 + }, + { + "epoch": 0.396364702433304, + "grad_norm": 1.5476916958439135, + "learning_rate": 0.0001981242672919109, + "loss": 4.0672760009765625, + "step": 676, + "token_acc": 0.2172768555814968 + }, + { + "epoch": 0.3969510407505131, + "grad_norm": 2.347658204195443, + "learning_rate": 0.00019841735052754982, + "loss": 4.104122161865234, + "step": 677, + "token_acc": 0.21262940536317215 + }, + { + "epoch": 0.3975373790677221, + "grad_norm": 2.217887236862945, + "learning_rate": 0.00019871043376318875, + "loss": 4.072694778442383, + "step": 678, + "token_acc": 0.21492755265842337 + }, + { + "epoch": 0.39812371738493113, + "grad_norm": 2.5696013306792467, + "learning_rate": 0.00019900351699882768, + "loss": 4.1688995361328125, + "step": 679, + "token_acc": 0.20589306705592084 + }, + { + "epoch": 0.39871005570214013, + "grad_norm": 2.7365388251139335, + "learning_rate": 0.00019929660023446658, + "loss": 4.068920612335205, + "step": 680, + "token_acc": 0.21451251669284382 + }, + { + "epoch": 0.3992963940193492, + "grad_norm": 1.8692370350576855, + "learning_rate": 0.0001995896834701055, + "loss": 4.217334747314453, + "step": 681, + "token_acc": 0.20007284423389154 + }, + { + "epoch": 0.3998827323365582, + "grad_norm": 3.320321895015174, + "learning_rate": 0.00019988276670574443, + "loss": 4.143980979919434, + "step": 682, + "token_acc": 0.20586730825088617 + }, + { + "epoch": 0.40046907065376725, + "grad_norm": 1.7168458465318832, + "learning_rate": 0.00020017584994138336, + "loss": 4.147307395935059, + "step": 683, + "token_acc": 0.20697672622220162 + }, + { + "epoch": 0.40105540897097625, + "grad_norm": 1.9489371333865713, + "learning_rate": 0.0002004689331770223, + "loss": 4.114063262939453, + "step": 684, + "token_acc": 0.21234300050067564 + }, + { + "epoch": 0.4016417472881853, + "grad_norm": 2.113578416771434, + "learning_rate": 0.00020076201641266121, + "loss": 4.083517074584961, + "step": 685, + "token_acc": 0.21295801050374152 + }, + { + "epoch": 0.4022280856053943, + "grad_norm": 1.5800534078436324, + "learning_rate": 0.00020105509964830011, + "loss": 4.111965179443359, + "step": 686, + "token_acc": 0.2108251211817903 + }, + { + "epoch": 0.40281442392260336, + "grad_norm": 1.8988496429416992, + "learning_rate": 0.00020134818288393904, + "loss": 4.109745979309082, + "step": 687, + "token_acc": 0.2105494365060982 + }, + { + "epoch": 0.40340076223981236, + "grad_norm": 2.404042301757026, + "learning_rate": 0.00020164126611957797, + "loss": 4.127511501312256, + "step": 688, + "token_acc": 0.2111118657882225 + }, + { + "epoch": 0.4039871005570214, + "grad_norm": 2.216419345224772, + "learning_rate": 0.00020193434935521687, + "loss": 4.121830940246582, + "step": 689, + "token_acc": 0.21161733556519877 + }, + { + "epoch": 0.4045734388742304, + "grad_norm": 2.032865459940097, + "learning_rate": 0.0002022274325908558, + "loss": 4.079171657562256, + "step": 690, + "token_acc": 0.21245708353035506 + }, + { + "epoch": 0.4051597771914395, + "grad_norm": 2.439975321293924, + "learning_rate": 0.00020252051582649472, + "loss": 4.1306328773498535, + "step": 691, + "token_acc": 0.2077358432999756 + }, + { + "epoch": 0.4057461155086485, + "grad_norm": 1.4006861575532985, + "learning_rate": 0.00020281359906213365, + "loss": 4.150385856628418, + "step": 692, + "token_acc": 0.204880960103795 + }, + { + "epoch": 0.40633245382585753, + "grad_norm": 2.5333265490794483, + "learning_rate": 0.00020310668229777258, + "loss": 4.1458282470703125, + "step": 693, + "token_acc": 0.20784267167283782 + }, + { + "epoch": 0.40691879214306653, + "grad_norm": 1.935447343788751, + "learning_rate": 0.0002033997655334115, + "loss": 4.165396213531494, + "step": 694, + "token_acc": 0.2063111892190331 + }, + { + "epoch": 0.4075051304602756, + "grad_norm": 2.277939455747646, + "learning_rate": 0.00020369284876905043, + "loss": 4.051935195922852, + "step": 695, + "token_acc": 0.21691328870691245 + }, + { + "epoch": 0.4080914687774846, + "grad_norm": 1.9818369549169748, + "learning_rate": 0.00020398593200468933, + "loss": 4.093563079833984, + "step": 696, + "token_acc": 0.2116513357303584 + }, + { + "epoch": 0.40867780709469365, + "grad_norm": 1.75615010635649, + "learning_rate": 0.00020427901524032826, + "loss": 4.061014652252197, + "step": 697, + "token_acc": 0.2163832424849383 + }, + { + "epoch": 0.40926414541190265, + "grad_norm": 2.0761384136039487, + "learning_rate": 0.00020457209847596716, + "loss": 4.1219072341918945, + "step": 698, + "token_acc": 0.2075109321380693 + }, + { + "epoch": 0.4098504837291117, + "grad_norm": 2.084426291121808, + "learning_rate": 0.0002048651817116061, + "loss": 4.014046669006348, + "step": 699, + "token_acc": 0.219447500207435 + }, + { + "epoch": 0.4104368220463207, + "grad_norm": 2.0836725884319014, + "learning_rate": 0.00020515826494724501, + "loss": 4.119354248046875, + "step": 700, + "token_acc": 0.20690766883869174 + }, + { + "epoch": 0.41102316036352976, + "grad_norm": 2.361919740078177, + "learning_rate": 0.00020545134818288394, + "loss": 4.129726409912109, + "step": 701, + "token_acc": 0.20816018930121397 + }, + { + "epoch": 0.41160949868073876, + "grad_norm": 1.7317770889794688, + "learning_rate": 0.00020574443141852287, + "loss": 4.121204376220703, + "step": 702, + "token_acc": 0.21044066880173024 + }, + { + "epoch": 0.4121958369979478, + "grad_norm": 2.2494175244386403, + "learning_rate": 0.0002060375146541618, + "loss": 4.0933942794799805, + "step": 703, + "token_acc": 0.2119768174373189 + }, + { + "epoch": 0.4127821753151568, + "grad_norm": 2.1567664120244676, + "learning_rate": 0.00020633059788980072, + "loss": 4.113862991333008, + "step": 704, + "token_acc": 0.20816850067688936 + }, + { + "epoch": 0.4133685136323659, + "grad_norm": 1.9985908835457784, + "learning_rate": 0.00020662368112543965, + "loss": 4.14725399017334, + "step": 705, + "token_acc": 0.20447180140171065 + }, + { + "epoch": 0.41395485194957493, + "grad_norm": 2.571306310754256, + "learning_rate": 0.00020691676436107855, + "loss": 4.0604705810546875, + "step": 706, + "token_acc": 0.21314476825532225 + }, + { + "epoch": 0.41454119026678393, + "grad_norm": 1.7564910695849216, + "learning_rate": 0.00020720984759671745, + "loss": 4.050766944885254, + "step": 707, + "token_acc": 0.2154999615512778 + }, + { + "epoch": 0.415127528583993, + "grad_norm": 2.731482760416182, + "learning_rate": 0.00020750293083235638, + "loss": 4.046940326690674, + "step": 708, + "token_acc": 0.21555105350520207 + }, + { + "epoch": 0.415713866901202, + "grad_norm": 1.9118279706971355, + "learning_rate": 0.0002077960140679953, + "loss": 4.081516265869141, + "step": 709, + "token_acc": 0.21448180268806766 + }, + { + "epoch": 0.41630020521841105, + "grad_norm": 1.8197667369067003, + "learning_rate": 0.00020808909730363423, + "loss": 4.071291446685791, + "step": 710, + "token_acc": 0.2139945038297782 + }, + { + "epoch": 0.41688654353562005, + "grad_norm": 2.0462060621382965, + "learning_rate": 0.00020838218053927316, + "loss": 4.06553840637207, + "step": 711, + "token_acc": 0.21349837574240058 + }, + { + "epoch": 0.4174728818528291, + "grad_norm": 2.1720148383857003, + "learning_rate": 0.00020867526377491209, + "loss": 4.117403984069824, + "step": 712, + "token_acc": 0.20813482943056724 + }, + { + "epoch": 0.4180592201700381, + "grad_norm": 2.1483037378863634, + "learning_rate": 0.000208968347010551, + "loss": 4.112849712371826, + "step": 713, + "token_acc": 0.21048822886268712 + }, + { + "epoch": 0.41864555848724716, + "grad_norm": 1.8091475844278968, + "learning_rate": 0.00020926143024618994, + "loss": 4.067354202270508, + "step": 714, + "token_acc": 0.21204839866497155 + }, + { + "epoch": 0.41923189680445616, + "grad_norm": 1.9319472139802623, + "learning_rate": 0.00020955451348182887, + "loss": 4.059874534606934, + "step": 715, + "token_acc": 0.21449142188318154 + }, + { + "epoch": 0.4198182351216652, + "grad_norm": 2.4975280097721315, + "learning_rate": 0.00020984759671746774, + "loss": 4.029237270355225, + "step": 716, + "token_acc": 0.21732570020118605 + }, + { + "epoch": 0.4204045734388742, + "grad_norm": 1.9164801344004896, + "learning_rate": 0.00021014067995310667, + "loss": 4.080502510070801, + "step": 717, + "token_acc": 0.21237672406253696 + }, + { + "epoch": 0.4209909117560833, + "grad_norm": 2.1400704047185934, + "learning_rate": 0.0002104337631887456, + "loss": 4.098213195800781, + "step": 718, + "token_acc": 0.2090646176153848 + }, + { + "epoch": 0.4215772500732923, + "grad_norm": 2.1614696940446176, + "learning_rate": 0.00021072684642438452, + "loss": 4.03939151763916, + "step": 719, + "token_acc": 0.2161053818673158 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 1.8201702413374024, + "learning_rate": 0.00021101992966002345, + "loss": 4.116702556610107, + "step": 720, + "token_acc": 0.20861004726205998 + }, + { + "epoch": 0.42274992670771033, + "grad_norm": 2.133483432451525, + "learning_rate": 0.00021131301289566238, + "loss": 3.985278606414795, + "step": 721, + "token_acc": 0.22097369563868727 + }, + { + "epoch": 0.4233362650249194, + "grad_norm": 1.9164653947577601, + "learning_rate": 0.0002116060961313013, + "loss": 4.088165283203125, + "step": 722, + "token_acc": 0.20935368847412422 + }, + { + "epoch": 0.4239226033421284, + "grad_norm": 2.4035328307719053, + "learning_rate": 0.00021189917936694023, + "loss": 4.082188129425049, + "step": 723, + "token_acc": 0.2121592905888603 + }, + { + "epoch": 0.42450894165933745, + "grad_norm": 1.7137457048751554, + "learning_rate": 0.00021219226260257916, + "loss": 3.997396230697632, + "step": 724, + "token_acc": 0.21972841037383326 + }, + { + "epoch": 0.42509527997654645, + "grad_norm": 2.2470061122133567, + "learning_rate": 0.00021248534583821806, + "loss": 4.1175689697265625, + "step": 725, + "token_acc": 0.2067081742268474 + }, + { + "epoch": 0.4256816182937555, + "grad_norm": 2.3229221372335567, + "learning_rate": 0.00021277842907385696, + "loss": 4.1208415031433105, + "step": 726, + "token_acc": 0.20549858835910434 + }, + { + "epoch": 0.4262679566109645, + "grad_norm": 1.8310992649134992, + "learning_rate": 0.00021307151230949589, + "loss": 4.040855884552002, + "step": 727, + "token_acc": 0.21452379133083374 + }, + { + "epoch": 0.42685429492817356, + "grad_norm": 1.7348197747209888, + "learning_rate": 0.0002133645955451348, + "loss": 4.058550834655762, + "step": 728, + "token_acc": 0.21420323472808148 + }, + { + "epoch": 0.42744063324538256, + "grad_norm": 2.0396949777502584, + "learning_rate": 0.00021365767878077374, + "loss": 4.041525363922119, + "step": 729, + "token_acc": 0.21365123140071832 + }, + { + "epoch": 0.4280269715625916, + "grad_norm": 1.9526226295230678, + "learning_rate": 0.00021395076201641267, + "loss": 4.05921745300293, + "step": 730, + "token_acc": 0.21601703895110597 + }, + { + "epoch": 0.4286133098798006, + "grad_norm": 1.8738785478939408, + "learning_rate": 0.0002142438452520516, + "loss": 4.072822570800781, + "step": 731, + "token_acc": 0.2113496143958869 + }, + { + "epoch": 0.4291996481970097, + "grad_norm": 1.6557007612296912, + "learning_rate": 0.00021453692848769052, + "loss": 4.027461051940918, + "step": 732, + "token_acc": 0.21646072492207008 + }, + { + "epoch": 0.42978598651421873, + "grad_norm": 2.3622858426424944, + "learning_rate": 0.00021483001172332945, + "loss": 4.029733657836914, + "step": 733, + "token_acc": 0.2170796281378818 + }, + { + "epoch": 0.43037232483142773, + "grad_norm": 1.5324235551213214, + "learning_rate": 0.00021512309495896835, + "loss": 4.010272979736328, + "step": 734, + "token_acc": 0.21741305979484613 + }, + { + "epoch": 0.4309586631486368, + "grad_norm": 2.3386073862547914, + "learning_rate": 0.00021541617819460728, + "loss": 4.052709579467773, + "step": 735, + "token_acc": 0.2140196321216496 + }, + { + "epoch": 0.4315450014658458, + "grad_norm": 1.4599089111534616, + "learning_rate": 0.0002157092614302462, + "loss": 3.989351511001587, + "step": 736, + "token_acc": 0.2223892787838745 + }, + { + "epoch": 0.43213133978305485, + "grad_norm": 1.9077388670200224, + "learning_rate": 0.0002160023446658851, + "loss": 4.063753128051758, + "step": 737, + "token_acc": 0.2143941338770098 + }, + { + "epoch": 0.43271767810026385, + "grad_norm": 1.6787276252209509, + "learning_rate": 0.00021629542790152403, + "loss": 4.056057453155518, + "step": 738, + "token_acc": 0.2125789944081055 + }, + { + "epoch": 0.4333040164174729, + "grad_norm": 2.19216933069188, + "learning_rate": 0.00021658851113716296, + "loss": 4.056268692016602, + "step": 739, + "token_acc": 0.21214584271107967 + }, + { + "epoch": 0.4338903547346819, + "grad_norm": 1.4126912322007483, + "learning_rate": 0.00021688159437280188, + "loss": 4.005537033081055, + "step": 740, + "token_acc": 0.21921170577379384 + }, + { + "epoch": 0.43447669305189096, + "grad_norm": 2.5008048589762844, + "learning_rate": 0.0002171746776084408, + "loss": 4.111293792724609, + "step": 741, + "token_acc": 0.20807779685783462 + }, + { + "epoch": 0.43506303136909996, + "grad_norm": 1.679513752408952, + "learning_rate": 0.0002174677608440797, + "loss": 4.003476142883301, + "step": 742, + "token_acc": 0.21912503568009417 + }, + { + "epoch": 0.435649369686309, + "grad_norm": 1.9614447718753012, + "learning_rate": 0.00021776084407971864, + "loss": 4.041287422180176, + "step": 743, + "token_acc": 0.21623992270693307 + }, + { + "epoch": 0.436235708003518, + "grad_norm": 1.704727283799899, + "learning_rate": 0.00021805392731535757, + "loss": 4.006258964538574, + "step": 744, + "token_acc": 0.2168786741505862 + }, + { + "epoch": 0.4368220463207271, + "grad_norm": 1.777120123403354, + "learning_rate": 0.0002183470105509965, + "loss": 4.042520523071289, + "step": 745, + "token_acc": 0.21516022325248732 + }, + { + "epoch": 0.4374083846379361, + "grad_norm": 1.7565510746502324, + "learning_rate": 0.00021864009378663542, + "loss": 4.07253360748291, + "step": 746, + "token_acc": 0.21072772067738174 + }, + { + "epoch": 0.43799472295514513, + "grad_norm": 1.900008782040298, + "learning_rate": 0.00021893317702227432, + "loss": 4.014519691467285, + "step": 747, + "token_acc": 0.21650056101907464 + }, + { + "epoch": 0.43858106127235413, + "grad_norm": 1.6277034347422095, + "learning_rate": 0.00021922626025791325, + "loss": 4.0982561111450195, + "step": 748, + "token_acc": 0.2068687987495277 + }, + { + "epoch": 0.4391673995895632, + "grad_norm": 1.774838004056762, + "learning_rate": 0.00021951934349355217, + "loss": 4.032931327819824, + "step": 749, + "token_acc": 0.2158822078242466 + }, + { + "epoch": 0.4397537379067722, + "grad_norm": 2.3355340328229817, + "learning_rate": 0.0002198124267291911, + "loss": 4.0619354248046875, + "step": 750, + "token_acc": 0.21238170097382386 + }, + { + "epoch": 0.44034007622398125, + "grad_norm": 1.7957200090688348, + "learning_rate": 0.00022010550996483, + "loss": 4.074429035186768, + "step": 751, + "token_acc": 0.20913684027135032 + }, + { + "epoch": 0.44092641454119025, + "grad_norm": 1.8291746297036837, + "learning_rate": 0.00022039859320046893, + "loss": 4.036952972412109, + "step": 752, + "token_acc": 0.21266663261657898 + }, + { + "epoch": 0.4415127528583993, + "grad_norm": 1.7351075902549933, + "learning_rate": 0.00022069167643610786, + "loss": 4.024649620056152, + "step": 753, + "token_acc": 0.2182853040713773 + }, + { + "epoch": 0.4420990911756083, + "grad_norm": 1.68521167471349, + "learning_rate": 0.00022098475967174678, + "loss": 3.986966133117676, + "step": 754, + "token_acc": 0.2194137588479565 + }, + { + "epoch": 0.44268542949281736, + "grad_norm": 1.7226967559828836, + "learning_rate": 0.0002212778429073857, + "loss": 4.0455827713012695, + "step": 755, + "token_acc": 0.21069507928671158 + }, + { + "epoch": 0.44327176781002636, + "grad_norm": 1.717343244747412, + "learning_rate": 0.00022157092614302464, + "loss": 4.025557518005371, + "step": 756, + "token_acc": 0.21723849839890502 + }, + { + "epoch": 0.4438581061272354, + "grad_norm": 2.006092086695301, + "learning_rate": 0.00022186400937866354, + "loss": 4.047937870025635, + "step": 757, + "token_acc": 0.21306945846076414 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.4330332716195198, + "learning_rate": 0.00022215709261430247, + "loss": 4.046126365661621, + "step": 758, + "token_acc": 0.211544689119171 + }, + { + "epoch": 0.4450307827616535, + "grad_norm": 2.4784736645670744, + "learning_rate": 0.0002224501758499414, + "loss": 4.0084381103515625, + "step": 759, + "token_acc": 0.21681487845085792 + }, + { + "epoch": 0.4456171210788625, + "grad_norm": 1.5606073263465277, + "learning_rate": 0.0002227432590855803, + "loss": 4.0421671867370605, + "step": 760, + "token_acc": 0.21304208308964717 + }, + { + "epoch": 0.44620345939607153, + "grad_norm": 1.9342978153963148, + "learning_rate": 0.00022303634232121922, + "loss": 4.01188850402832, + "step": 761, + "token_acc": 0.21594870238676395 + }, + { + "epoch": 0.4467897977132806, + "grad_norm": 1.8247554518599955, + "learning_rate": 0.00022332942555685815, + "loss": 4.003640651702881, + "step": 762, + "token_acc": 0.21659994232522856 + }, + { + "epoch": 0.4473761360304896, + "grad_norm": 1.4198825641000967, + "learning_rate": 0.00022362250879249707, + "loss": 3.9982781410217285, + "step": 763, + "token_acc": 0.21839969279488272 + }, + { + "epoch": 0.44796247434769865, + "grad_norm": 2.2293971828528427, + "learning_rate": 0.000223915592028136, + "loss": 3.9771862030029297, + "step": 764, + "token_acc": 0.22031586030372197 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 1.3034303537416057, + "learning_rate": 0.00022420867526377493, + "loss": 4.025883197784424, + "step": 765, + "token_acc": 0.21496806316009734 + }, + { + "epoch": 0.4491351509821167, + "grad_norm": 2.2050447448546726, + "learning_rate": 0.00022450175849941386, + "loss": 3.986133098602295, + "step": 766, + "token_acc": 0.22015319052101326 + }, + { + "epoch": 0.4497214892993257, + "grad_norm": 1.3875255518866465, + "learning_rate": 0.00022479484173505276, + "loss": 4.023779392242432, + "step": 767, + "token_acc": 0.2153660157407185 + }, + { + "epoch": 0.45030782761653476, + "grad_norm": 1.831923129459966, + "learning_rate": 0.00022508792497069168, + "loss": 4.017499923706055, + "step": 768, + "token_acc": 0.21423496601380657 + }, + { + "epoch": 0.45089416593374376, + "grad_norm": 1.5963945653154585, + "learning_rate": 0.00022538100820633058, + "loss": 4.066218376159668, + "step": 769, + "token_acc": 0.20941322234455975 + }, + { + "epoch": 0.4514805042509528, + "grad_norm": 1.9272891640295549, + "learning_rate": 0.0002256740914419695, + "loss": 3.9980664253234863, + "step": 770, + "token_acc": 0.2180391748009331 + }, + { + "epoch": 0.4520668425681618, + "grad_norm": 1.612232335848192, + "learning_rate": 0.00022596717467760844, + "loss": 3.987384557723999, + "step": 771, + "token_acc": 0.21802794300306277 + }, + { + "epoch": 0.4526531808853709, + "grad_norm": 1.5710388494163687, + "learning_rate": 0.00022626025791324736, + "loss": 3.9475903511047363, + "step": 772, + "token_acc": 0.22294018774578206 + }, + { + "epoch": 0.4532395192025799, + "grad_norm": 1.9439372962106505, + "learning_rate": 0.0002265533411488863, + "loss": 4.011725902557373, + "step": 773, + "token_acc": 0.2174230031999703 + }, + { + "epoch": 0.45382585751978893, + "grad_norm": 1.490390064617704, + "learning_rate": 0.00022684642438452522, + "loss": 4.037602424621582, + "step": 774, + "token_acc": 0.21332604707443695 + }, + { + "epoch": 0.45441219583699793, + "grad_norm": 1.5222723737978927, + "learning_rate": 0.00022713950762016415, + "loss": 4.036128044128418, + "step": 775, + "token_acc": 0.21326276709190725 + }, + { + "epoch": 0.454998534154207, + "grad_norm": 1.901559354248794, + "learning_rate": 0.00022743259085580307, + "loss": 3.994436502456665, + "step": 776, + "token_acc": 0.21889277748371924 + }, + { + "epoch": 0.455584872471416, + "grad_norm": 1.599926779973861, + "learning_rate": 0.00022772567409144197, + "loss": 3.9802608489990234, + "step": 777, + "token_acc": 0.21836872811511085 + }, + { + "epoch": 0.45617121078862505, + "grad_norm": 1.7743235548118104, + "learning_rate": 0.00022801875732708087, + "loss": 3.98239803314209, + "step": 778, + "token_acc": 0.21905449393850912 + }, + { + "epoch": 0.45675754910583405, + "grad_norm": 1.8566778758906217, + "learning_rate": 0.0002283118405627198, + "loss": 3.983452558517456, + "step": 779, + "token_acc": 0.21680983706370532 + }, + { + "epoch": 0.4573438874230431, + "grad_norm": 1.6718892396047285, + "learning_rate": 0.00022860492379835873, + "loss": 3.9802134037017822, + "step": 780, + "token_acc": 0.2166123987447817 + }, + { + "epoch": 0.4579302257402521, + "grad_norm": 1.76356672238489, + "learning_rate": 0.00022889800703399765, + "loss": 3.9591665267944336, + "step": 781, + "token_acc": 0.22130066367825002 + }, + { + "epoch": 0.45851656405746116, + "grad_norm": 1.5392791281243248, + "learning_rate": 0.00022919109026963658, + "loss": 3.9620537757873535, + "step": 782, + "token_acc": 0.21998108722876786 + }, + { + "epoch": 0.45910290237467016, + "grad_norm": 2.0898358779682837, + "learning_rate": 0.0002294841735052755, + "loss": 3.993950128555298, + "step": 783, + "token_acc": 0.21714388040206564 + }, + { + "epoch": 0.4596892406918792, + "grad_norm": 1.5344549833791856, + "learning_rate": 0.00022977725674091444, + "loss": 3.9980597496032715, + "step": 784, + "token_acc": 0.21616525084778657 + }, + { + "epoch": 0.4602755790090882, + "grad_norm": 2.2583607519702027, + "learning_rate": 0.00023007033997655336, + "loss": 3.998781204223633, + "step": 785, + "token_acc": 0.217110075847996 + }, + { + "epoch": 0.4608619173262973, + "grad_norm": 1.9138605630757042, + "learning_rate": 0.0002303634232121923, + "loss": 3.9461002349853516, + "step": 786, + "token_acc": 0.22020585189694916 + }, + { + "epoch": 0.4614482556435063, + "grad_norm": 1.8192137921741556, + "learning_rate": 0.00023065650644783116, + "loss": 4.0256547927856445, + "step": 787, + "token_acc": 0.21172509765751066 + }, + { + "epoch": 0.46203459396071533, + "grad_norm": 1.7106583109607434, + "learning_rate": 0.0002309495896834701, + "loss": 3.969985008239746, + "step": 788, + "token_acc": 0.21994648976431475 + }, + { + "epoch": 0.46262093227792433, + "grad_norm": 1.6847870238014038, + "learning_rate": 0.00023124267291910902, + "loss": 3.9602386951446533, + "step": 789, + "token_acc": 0.219969565847498 + }, + { + "epoch": 0.4632072705951334, + "grad_norm": 1.902475063581863, + "learning_rate": 0.00023153575615474795, + "loss": 3.966522216796875, + "step": 790, + "token_acc": 0.21975405667791595 + }, + { + "epoch": 0.46379360891234245, + "grad_norm": 1.5687939762527154, + "learning_rate": 0.00023182883939038687, + "loss": 3.9939188957214355, + "step": 791, + "token_acc": 0.21603237381806753 + }, + { + "epoch": 0.46437994722955145, + "grad_norm": 1.7354660867276823, + "learning_rate": 0.0002321219226260258, + "loss": 3.9329535961151123, + "step": 792, + "token_acc": 0.2234515050167224 + }, + { + "epoch": 0.4649662855467605, + "grad_norm": 1.7651701514741607, + "learning_rate": 0.00023241500586166473, + "loss": 3.980254650115967, + "step": 793, + "token_acc": 0.21843814153295277 + }, + { + "epoch": 0.4655526238639695, + "grad_norm": 1.6692683821353422, + "learning_rate": 0.00023270808909730365, + "loss": 3.9503610134124756, + "step": 794, + "token_acc": 0.2194911304946864 + }, + { + "epoch": 0.46613896218117856, + "grad_norm": 2.1222261682151147, + "learning_rate": 0.00023300117233294258, + "loss": 3.946131706237793, + "step": 795, + "token_acc": 0.22192083007888044 + }, + { + "epoch": 0.46672530049838756, + "grad_norm": 1.3760324566699202, + "learning_rate": 0.00023329425556858148, + "loss": 3.954542875289917, + "step": 796, + "token_acc": 0.22213777783986924 + }, + { + "epoch": 0.4673116388155966, + "grad_norm": 1.9735904335451047, + "learning_rate": 0.0002335873388042204, + "loss": 3.9499893188476562, + "step": 797, + "token_acc": 0.22057887172770108 + }, + { + "epoch": 0.4678979771328056, + "grad_norm": 1.7351168327461521, + "learning_rate": 0.0002338804220398593, + "loss": 3.87284517288208, + "step": 798, + "token_acc": 0.22793353027300806 + }, + { + "epoch": 0.4684843154500147, + "grad_norm": 1.7946512194609563, + "learning_rate": 0.00023417350527549824, + "loss": 3.956754684448242, + "step": 799, + "token_acc": 0.21745916082619007 + }, + { + "epoch": 0.4690706537672237, + "grad_norm": 1.6625602665169559, + "learning_rate": 0.00023446658851113716, + "loss": 3.9647860527038574, + "step": 800, + "token_acc": 0.21907062507725905 + }, + { + "epoch": 0.46965699208443273, + "grad_norm": 1.551781501356406, + "learning_rate": 0.0002347596717467761, + "loss": 3.965974807739258, + "step": 801, + "token_acc": 0.21881128516581275 + }, + { + "epoch": 0.47024333040164173, + "grad_norm": 1.5915130492622156, + "learning_rate": 0.00023505275498241502, + "loss": 3.9539031982421875, + "step": 802, + "token_acc": 0.2180815172321433 + }, + { + "epoch": 0.4708296687188508, + "grad_norm": 2.0533221002606616, + "learning_rate": 0.00023534583821805394, + "loss": 3.974269151687622, + "step": 803, + "token_acc": 0.2189483978327563 + }, + { + "epoch": 0.4714160070360598, + "grad_norm": 1.4006372625876125, + "learning_rate": 0.00023563892145369287, + "loss": 3.9977452754974365, + "step": 804, + "token_acc": 0.21441884660729177 + }, + { + "epoch": 0.47200234535326885, + "grad_norm": 1.8883413465404273, + "learning_rate": 0.00023593200468933177, + "loss": 3.9808130264282227, + "step": 805, + "token_acc": 0.21612854984348975 + }, + { + "epoch": 0.47258868367047785, + "grad_norm": 1.2265562261984315, + "learning_rate": 0.0002362250879249707, + "loss": 3.887144088745117, + "step": 806, + "token_acc": 0.22653741712375927 + }, + { + "epoch": 0.4731750219876869, + "grad_norm": 2.158122965318434, + "learning_rate": 0.00023651817116060963, + "loss": 3.995875597000122, + "step": 807, + "token_acc": 0.21453683908816715 + }, + { + "epoch": 0.4737613603048959, + "grad_norm": 1.47491822181735, + "learning_rate": 0.00023681125439624853, + "loss": 3.9565086364746094, + "step": 808, + "token_acc": 0.21757540663226793 + }, + { + "epoch": 0.47434769862210496, + "grad_norm": 1.8730675623460245, + "learning_rate": 0.00023710433763188745, + "loss": 3.9426002502441406, + "step": 809, + "token_acc": 0.22001132613018065 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 1.3421298458653779, + "learning_rate": 0.00023739742086752638, + "loss": 3.9120216369628906, + "step": 810, + "token_acc": 0.22239072462580084 + }, + { + "epoch": 0.475520375256523, + "grad_norm": 1.8665485111656386, + "learning_rate": 0.0002376905041031653, + "loss": 3.9659433364868164, + "step": 811, + "token_acc": 0.21813315041919312 + }, + { + "epoch": 0.476106713573732, + "grad_norm": 1.6895152632784898, + "learning_rate": 0.00023798358733880423, + "loss": 3.9220194816589355, + "step": 812, + "token_acc": 0.22356798395696392 + }, + { + "epoch": 0.4766930518909411, + "grad_norm": 1.4356603748541694, + "learning_rate": 0.00023827667057444316, + "loss": 3.895996332168579, + "step": 813, + "token_acc": 0.22298496449163932 + }, + { + "epoch": 0.4772793902081501, + "grad_norm": 1.416091288295409, + "learning_rate": 0.00023856975381008206, + "loss": 3.922710418701172, + "step": 814, + "token_acc": 0.22113981734748728 + }, + { + "epoch": 0.47786572852535913, + "grad_norm": 1.5985097604893062, + "learning_rate": 0.000238862837045721, + "loss": 3.9446113109588623, + "step": 815, + "token_acc": 0.21899810989139165 + }, + { + "epoch": 0.47845206684256814, + "grad_norm": 1.3560245718360042, + "learning_rate": 0.00023915592028135992, + "loss": 3.944176197052002, + "step": 816, + "token_acc": 0.21776866388977892 + }, + { + "epoch": 0.4790384051597772, + "grad_norm": 1.9297407873813672, + "learning_rate": 0.00023944900351699884, + "loss": 3.918590545654297, + "step": 817, + "token_acc": 0.22297865869113398 + }, + { + "epoch": 0.47962474347698625, + "grad_norm": 1.8250733812697824, + "learning_rate": 0.00023974208675263774, + "loss": 3.912874937057495, + "step": 818, + "token_acc": 0.22163575675182112 + }, + { + "epoch": 0.48021108179419525, + "grad_norm": 1.445252455662415, + "learning_rate": 0.00024003516998827667, + "loss": 3.8873467445373535, + "step": 819, + "token_acc": 0.22527255594569412 + }, + { + "epoch": 0.4807974201114043, + "grad_norm": 1.7365774074738167, + "learning_rate": 0.0002403282532239156, + "loss": 3.89288592338562, + "step": 820, + "token_acc": 0.22349862639470816 + }, + { + "epoch": 0.4813837584286133, + "grad_norm": 1.4258480441439452, + "learning_rate": 0.00024062133645955453, + "loss": 3.9203014373779297, + "step": 821, + "token_acc": 0.22084931774179467 + }, + { + "epoch": 0.48197009674582236, + "grad_norm": 2.0842359087282265, + "learning_rate": 0.00024091441969519343, + "loss": 3.91632342338562, + "step": 822, + "token_acc": 0.22314360307824171 + }, + { + "epoch": 0.48255643506303136, + "grad_norm": 1.3275376084312145, + "learning_rate": 0.00024120750293083235, + "loss": 3.857473134994507, + "step": 823, + "token_acc": 0.22700043894755867 + }, + { + "epoch": 0.4831427733802404, + "grad_norm": 1.5158122755988426, + "learning_rate": 0.00024150058616647128, + "loss": 3.886094570159912, + "step": 824, + "token_acc": 0.22330989837846926 + }, + { + "epoch": 0.4837291116974494, + "grad_norm": 1.7691343421846069, + "learning_rate": 0.0002417936694021102, + "loss": 3.9427223205566406, + "step": 825, + "token_acc": 0.21862984653970963 + }, + { + "epoch": 0.4843154500146585, + "grad_norm": 1.9605147878845215, + "learning_rate": 0.00024208675263774913, + "loss": 3.846086025238037, + "step": 826, + "token_acc": 0.23053430145353646 + }, + { + "epoch": 0.4849017883318675, + "grad_norm": 1.596179867223669, + "learning_rate": 0.00024237983587338806, + "loss": 3.8211421966552734, + "step": 827, + "token_acc": 0.23085348986982274 + }, + { + "epoch": 0.48548812664907653, + "grad_norm": 1.4227708355316044, + "learning_rate": 0.00024267291910902696, + "loss": 3.9240598678588867, + "step": 828, + "token_acc": 0.21963006295038515 + }, + { + "epoch": 0.48607446496628554, + "grad_norm": 2.304916818681686, + "learning_rate": 0.0002429660023446659, + "loss": 3.882449150085449, + "step": 829, + "token_acc": 0.22366936808673749 + }, + { + "epoch": 0.4866608032834946, + "grad_norm": 1.508748373161343, + "learning_rate": 0.00024325908558030482, + "loss": 3.860668182373047, + "step": 830, + "token_acc": 0.22611897418455504 + }, + { + "epoch": 0.4872471416007036, + "grad_norm": 1.9435105690103587, + "learning_rate": 0.00024355216881594372, + "loss": 3.8318357467651367, + "step": 831, + "token_acc": 0.22927656509113284 + }, + { + "epoch": 0.48783347991791265, + "grad_norm": 1.3717838639501128, + "learning_rate": 0.00024384525205158264, + "loss": 3.9031076431274414, + "step": 832, + "token_acc": 0.2213116243571805 + }, + { + "epoch": 0.48841981823512165, + "grad_norm": 2.3623056127812823, + "learning_rate": 0.00024413833528722157, + "loss": 3.8353021144866943, + "step": 833, + "token_acc": 0.22938385142725162 + }, + { + "epoch": 0.4890061565523307, + "grad_norm": 1.2101893519203448, + "learning_rate": 0.0002444314185228605, + "loss": 3.825613498687744, + "step": 834, + "token_acc": 0.22759554228708115 + }, + { + "epoch": 0.4895924948695397, + "grad_norm": 1.7127815553304444, + "learning_rate": 0.0002447245017584994, + "loss": 3.840912342071533, + "step": 835, + "token_acc": 0.22741469209953138 + }, + { + "epoch": 0.49017883318674876, + "grad_norm": 1.7684884931962814, + "learning_rate": 0.00024501758499413835, + "loss": 3.800657033920288, + "step": 836, + "token_acc": 0.2280093758783257 + }, + { + "epoch": 0.49076517150395776, + "grad_norm": 1.601524331679, + "learning_rate": 0.0002453106682297773, + "loss": 3.8888421058654785, + "step": 837, + "token_acc": 0.22171272451092972 + }, + { + "epoch": 0.4913515098211668, + "grad_norm": 1.7350966009248292, + "learning_rate": 0.0002456037514654162, + "loss": 3.8140242099761963, + "step": 838, + "token_acc": 0.2305299539170507 + }, + { + "epoch": 0.4919378481383758, + "grad_norm": 1.53146327976739, + "learning_rate": 0.00024589683470105513, + "loss": 3.8475823402404785, + "step": 839, + "token_acc": 0.22582238700472188 + }, + { + "epoch": 0.4925241864555849, + "grad_norm": 1.5708135632039935, + "learning_rate": 0.000246189917936694, + "loss": 3.800248861312866, + "step": 840, + "token_acc": 0.23169208328811924 + }, + { + "epoch": 0.4931105247727939, + "grad_norm": 1.5902371303967509, + "learning_rate": 0.00024648300117233293, + "loss": 3.8462116718292236, + "step": 841, + "token_acc": 0.22476964881434236 + }, + { + "epoch": 0.49369686309000294, + "grad_norm": 1.8709222520264444, + "learning_rate": 0.00024677608440797186, + "loss": 3.811887741088867, + "step": 842, + "token_acc": 0.22873590316872675 + }, + { + "epoch": 0.49428320140721194, + "grad_norm": 1.9788407246933075, + "learning_rate": 0.0002470691676436108, + "loss": 3.852499485015869, + "step": 843, + "token_acc": 0.2244373753807716 + }, + { + "epoch": 0.494869539724421, + "grad_norm": 1.347167371088646, + "learning_rate": 0.0002473622508792497, + "loss": 3.7867794036865234, + "step": 844, + "token_acc": 0.2342614632924079 + }, + { + "epoch": 0.49545587804163, + "grad_norm": 1.8687765602264133, + "learning_rate": 0.00024765533411488864, + "loss": 3.760441303253174, + "step": 845, + "token_acc": 0.23309817149673995 + }, + { + "epoch": 0.49604221635883905, + "grad_norm": 1.6938706310063778, + "learning_rate": 0.00024794841735052757, + "loss": 3.78448224067688, + "step": 846, + "token_acc": 0.22955764840907303 + }, + { + "epoch": 0.4966285546760481, + "grad_norm": 1.8302013611347514, + "learning_rate": 0.0002482415005861665, + "loss": 3.832221269607544, + "step": 847, + "token_acc": 0.2283948759617603 + }, + { + "epoch": 0.4972148929932571, + "grad_norm": 1.7567761809256568, + "learning_rate": 0.0002485345838218054, + "loss": 3.822584629058838, + "step": 848, + "token_acc": 0.22859576240415216 + }, + { + "epoch": 0.49780123131046616, + "grad_norm": 1.55317348636794, + "learning_rate": 0.0002488276670574443, + "loss": 3.8465898036956787, + "step": 849, + "token_acc": 0.22345761820214374 + }, + { + "epoch": 0.49838756962767516, + "grad_norm": 1.876278955854456, + "learning_rate": 0.0002491207502930832, + "loss": 3.8260867595672607, + "step": 850, + "token_acc": 0.22560403620414268 + }, + { + "epoch": 0.4989739079448842, + "grad_norm": 1.703511239070497, + "learning_rate": 0.00024941383352872215, + "loss": 3.7570641040802, + "step": 851, + "token_acc": 0.23538907273974283 + }, + { + "epoch": 0.4995602462620932, + "grad_norm": 1.744215941748723, + "learning_rate": 0.0002497069167643611, + "loss": 3.81729793548584, + "step": 852, + "token_acc": 0.2290196278488439 + }, + { + "epoch": 0.5001465845793023, + "grad_norm": 1.3282473726456636, + "learning_rate": 0.00025, + "loss": 3.78011417388916, + "step": 853, + "token_acc": 0.23217940344001742 + }, + { + "epoch": 0.5007329228965113, + "grad_norm": 2.0818591381285407, + "learning_rate": 0.00025029308323563893, + "loss": 3.7965335845947266, + "step": 854, + "token_acc": 0.22880796281741916 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 1.3255993760528444, + "learning_rate": 0.00025058616647127786, + "loss": 3.796429395675659, + "step": 855, + "token_acc": 0.2300318642577323 + }, + { + "epoch": 0.5019055995309294, + "grad_norm": 1.9816862652096225, + "learning_rate": 0.0002508792497069168, + "loss": 3.7942514419555664, + "step": 856, + "token_acc": 0.2281402858147639 + }, + { + "epoch": 0.5024919378481384, + "grad_norm": 1.483853262794814, + "learning_rate": 0.0002511723329425557, + "loss": 3.7643816471099854, + "step": 857, + "token_acc": 0.23288964975361512 + }, + { + "epoch": 0.5030782761653474, + "grad_norm": 1.4459776318361894, + "learning_rate": 0.00025146541617819464, + "loss": 3.771902561187744, + "step": 858, + "token_acc": 0.2300484127587409 + }, + { + "epoch": 0.5036646144825564, + "grad_norm": 1.9185294373549824, + "learning_rate": 0.00025175849941383357, + "loss": 3.8050169944763184, + "step": 859, + "token_acc": 0.22899281416895148 + }, + { + "epoch": 0.5042509527997655, + "grad_norm": 1.6138231773015839, + "learning_rate": 0.0002520515826494725, + "loss": 3.7790579795837402, + "step": 860, + "token_acc": 0.2292011251461742 + }, + { + "epoch": 0.5048372911169745, + "grad_norm": 1.674408509483965, + "learning_rate": 0.0002523446658851114, + "loss": 3.7430124282836914, + "step": 861, + "token_acc": 0.23371507881739023 + }, + { + "epoch": 0.5054236294341835, + "grad_norm": 1.4529751246816234, + "learning_rate": 0.0002526377491207503, + "loss": 3.7706427574157715, + "step": 862, + "token_acc": 0.2313683163497455 + }, + { + "epoch": 0.5060099677513925, + "grad_norm": 1.581861246698084, + "learning_rate": 0.00025293083235638917, + "loss": 3.7948055267333984, + "step": 863, + "token_acc": 0.22711628395775313 + }, + { + "epoch": 0.5065963060686016, + "grad_norm": 1.555470453769317, + "learning_rate": 0.0002532239155920281, + "loss": 3.814864158630371, + "step": 864, + "token_acc": 0.22548447593248594 + }, + { + "epoch": 0.5071826443858106, + "grad_norm": 1.452257538138237, + "learning_rate": 0.000253516998827667, + "loss": 3.745765209197998, + "step": 865, + "token_acc": 0.2342552713981896 + }, + { + "epoch": 0.5077689827030196, + "grad_norm": 1.4935048903499673, + "learning_rate": 0.00025381008206330595, + "loss": 3.7806010246276855, + "step": 866, + "token_acc": 0.23029535710455343 + }, + { + "epoch": 0.5083553210202286, + "grad_norm": 1.6269784268180973, + "learning_rate": 0.0002541031652989449, + "loss": 3.7190489768981934, + "step": 867, + "token_acc": 0.23618227994683721 + }, + { + "epoch": 0.5089416593374377, + "grad_norm": 1.7379111019279003, + "learning_rate": 0.0002543962485345838, + "loss": 3.6765201091766357, + "step": 868, + "token_acc": 0.24139835188174327 + }, + { + "epoch": 0.5095279976546467, + "grad_norm": 1.6690603724861948, + "learning_rate": 0.00025468933177022273, + "loss": 3.788986921310425, + "step": 869, + "token_acc": 0.2293322703228632 + }, + { + "epoch": 0.5101143359718557, + "grad_norm": 1.4586683935680724, + "learning_rate": 0.00025498241500586166, + "loss": 3.7371885776519775, + "step": 870, + "token_acc": 0.2350118615310204 + }, + { + "epoch": 0.5107006742890647, + "grad_norm": 1.703814884868689, + "learning_rate": 0.0002552754982415006, + "loss": 3.7847187519073486, + "step": 871, + "token_acc": 0.22848820905709494 + }, + { + "epoch": 0.5112870126062738, + "grad_norm": 1.5102894058540888, + "learning_rate": 0.0002555685814771395, + "loss": 3.671969413757324, + "step": 872, + "token_acc": 0.24356745967937476 + }, + { + "epoch": 0.5118733509234829, + "grad_norm": 1.4421153486857088, + "learning_rate": 0.00025586166471277844, + "loss": 3.783766269683838, + "step": 873, + "token_acc": 0.228521708494698 + }, + { + "epoch": 0.5124596892406919, + "grad_norm": 1.7147928289594974, + "learning_rate": 0.00025615474794841737, + "loss": 3.7101402282714844, + "step": 874, + "token_acc": 0.23715260361820079 + }, + { + "epoch": 0.513046027557901, + "grad_norm": 1.6473349422670045, + "learning_rate": 0.0002564478311840563, + "loss": 3.6850099563598633, + "step": 875, + "token_acc": 0.23835259512577417 + }, + { + "epoch": 0.51363236587511, + "grad_norm": 1.435763796706261, + "learning_rate": 0.0002567409144196952, + "loss": 3.693279266357422, + "step": 876, + "token_acc": 0.2382124459953473 + }, + { + "epoch": 0.514218704192319, + "grad_norm": 1.2554811401319679, + "learning_rate": 0.00025703399765533415, + "loss": 3.714064598083496, + "step": 877, + "token_acc": 0.23659434687011577 + }, + { + "epoch": 0.514805042509528, + "grad_norm": 1.846739856148871, + "learning_rate": 0.0002573270808909731, + "loss": 3.7165207862854004, + "step": 878, + "token_acc": 0.23597578492118504 + }, + { + "epoch": 0.5153913808267371, + "grad_norm": 1.7343840701434636, + "learning_rate": 0.000257620164126612, + "loss": 3.7642359733581543, + "step": 879, + "token_acc": 0.23114824560483913 + }, + { + "epoch": 0.5159777191439461, + "grad_norm": 1.5355658305727153, + "learning_rate": 0.0002579132473622509, + "loss": 3.677816867828369, + "step": 880, + "token_acc": 0.2423380585903915 + }, + { + "epoch": 0.5165640574611551, + "grad_norm": 1.6734766948831334, + "learning_rate": 0.0002582063305978898, + "loss": 3.7375717163085938, + "step": 881, + "token_acc": 0.23248602364693166 + }, + { + "epoch": 0.5171503957783641, + "grad_norm": 1.3877352433001913, + "learning_rate": 0.00025849941383352873, + "loss": 3.6727499961853027, + "step": 882, + "token_acc": 0.2387079859200266 + }, + { + "epoch": 0.5177367340955732, + "grad_norm": 1.7168686981707963, + "learning_rate": 0.00025879249706916766, + "loss": 3.7357640266418457, + "step": 883, + "token_acc": 0.23206524505063872 + }, + { + "epoch": 0.5183230724127822, + "grad_norm": 1.3815512820815554, + "learning_rate": 0.00025908558030480653, + "loss": 3.677196502685547, + "step": 884, + "token_acc": 0.2419070071900044 + }, + { + "epoch": 0.5189094107299912, + "grad_norm": 1.8921751982261423, + "learning_rate": 0.00025937866354044546, + "loss": 3.728128433227539, + "step": 885, + "token_acc": 0.23318721618903462 + }, + { + "epoch": 0.5194957490472002, + "grad_norm": 1.0594416564256517, + "learning_rate": 0.0002596717467760844, + "loss": 3.7119522094726562, + "step": 886, + "token_acc": 0.23479180756245246 + }, + { + "epoch": 0.5200820873644093, + "grad_norm": 1.9636985293337739, + "learning_rate": 0.0002599648300117233, + "loss": 3.738323450088501, + "step": 887, + "token_acc": 0.2331964429729615 + }, + { + "epoch": 0.5206684256816183, + "grad_norm": 1.13755919877721, + "learning_rate": 0.00026025791324736224, + "loss": 3.650294303894043, + "step": 888, + "token_acc": 0.2440574718483676 + }, + { + "epoch": 0.5212547639988273, + "grad_norm": 1.7955209398286704, + "learning_rate": 0.00026055099648300117, + "loss": 3.6420676708221436, + "step": 889, + "token_acc": 0.24424107048336896 + }, + { + "epoch": 0.5218411023160363, + "grad_norm": 1.649808238434703, + "learning_rate": 0.0002608440797186401, + "loss": 3.843682289123535, + "step": 890, + "token_acc": 0.2229359111333766 + }, + { + "epoch": 0.5224274406332454, + "grad_norm": 1.7562781272965513, + "learning_rate": 0.000261137162954279, + "loss": 3.748120069503784, + "step": 891, + "token_acc": 0.23115108186285035 + }, + { + "epoch": 0.5230137789504544, + "grad_norm": 1.315032986081824, + "learning_rate": 0.00026143024618991795, + "loss": 3.6962013244628906, + "step": 892, + "token_acc": 0.23633885874278351 + }, + { + "epoch": 0.5236001172676634, + "grad_norm": 1.4538060588276225, + "learning_rate": 0.0002617233294255569, + "loss": 3.750308036804199, + "step": 893, + "token_acc": 0.23088465815458406 + }, + { + "epoch": 0.5241864555848724, + "grad_norm": 1.4433750899747162, + "learning_rate": 0.0002620164126611958, + "loss": 3.770920753479004, + "step": 894, + "token_acc": 0.22776220795825852 + }, + { + "epoch": 0.5247727939020815, + "grad_norm": 1.5994260113138028, + "learning_rate": 0.00026230949589683473, + "loss": 3.672922134399414, + "step": 895, + "token_acc": 0.23874237670715165 + }, + { + "epoch": 0.5253591322192905, + "grad_norm": 1.9710930933372963, + "learning_rate": 0.00026260257913247366, + "loss": 3.6778249740600586, + "step": 896, + "token_acc": 0.240377017576823 + }, + { + "epoch": 0.5259454705364995, + "grad_norm": 1.447868446174383, + "learning_rate": 0.0002628956623681126, + "loss": 3.7156810760498047, + "step": 897, + "token_acc": 0.2359344003489785 + }, + { + "epoch": 0.5265318088537085, + "grad_norm": 1.7703622428736265, + "learning_rate": 0.00026318874560375146, + "loss": 3.7524983882904053, + "step": 898, + "token_acc": 0.23044549694042538 + }, + { + "epoch": 0.5271181471709177, + "grad_norm": 1.2780049250273275, + "learning_rate": 0.0002634818288393904, + "loss": 3.697524070739746, + "step": 899, + "token_acc": 0.23730320662988993 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 1.7118367502194758, + "learning_rate": 0.0002637749120750293, + "loss": 3.673832416534424, + "step": 900, + "token_acc": 0.24014515073940343 + }, + { + "epoch": 0.5282908238053357, + "grad_norm": 1.2900152989045572, + "learning_rate": 0.00026406799531066824, + "loss": 3.6508431434631348, + "step": 901, + "token_acc": 0.24179517180949583 + }, + { + "epoch": 0.5288771621225447, + "grad_norm": 1.4960768995430467, + "learning_rate": 0.00026436107854630717, + "loss": 3.70963716506958, + "step": 902, + "token_acc": 0.2366144450345194 + }, + { + "epoch": 0.5294635004397538, + "grad_norm": 1.407016462770081, + "learning_rate": 0.0002646541617819461, + "loss": 3.7448062896728516, + "step": 903, + "token_acc": 0.23096928969644098 + }, + { + "epoch": 0.5300498387569628, + "grad_norm": 1.312805594654422, + "learning_rate": 0.00026494724501758497, + "loss": 3.7442402839660645, + "step": 904, + "token_acc": 0.23011229131929434 + }, + { + "epoch": 0.5306361770741718, + "grad_norm": 1.6507352724637836, + "learning_rate": 0.0002652403282532239, + "loss": 3.7229902744293213, + "step": 905, + "token_acc": 0.23319511328598064 + }, + { + "epoch": 0.5312225153913809, + "grad_norm": 1.5986603900172769, + "learning_rate": 0.0002655334114888628, + "loss": 3.7134289741516113, + "step": 906, + "token_acc": 0.23611300525581444 + }, + { + "epoch": 0.5318088537085899, + "grad_norm": 1.5587459312244494, + "learning_rate": 0.00026582649472450175, + "loss": 3.6575586795806885, + "step": 907, + "token_acc": 0.2424574160640324 + }, + { + "epoch": 0.5323951920257989, + "grad_norm": 1.2896327052242644, + "learning_rate": 0.0002661195779601407, + "loss": 3.682987928390503, + "step": 908, + "token_acc": 0.23803892891039505 + }, + { + "epoch": 0.5329815303430079, + "grad_norm": 1.5910701578817552, + "learning_rate": 0.0002664126611957796, + "loss": 3.6545238494873047, + "step": 909, + "token_acc": 0.24120787162125207 + }, + { + "epoch": 0.533567868660217, + "grad_norm": 1.322561032431526, + "learning_rate": 0.00026670574443141853, + "loss": 3.709458351135254, + "step": 910, + "token_acc": 0.23508372747027995 + }, + { + "epoch": 0.534154206977426, + "grad_norm": 1.6789275610902028, + "learning_rate": 0.00026699882766705746, + "loss": 3.7739665508270264, + "step": 911, + "token_acc": 0.22829186440329005 + }, + { + "epoch": 0.534740545294635, + "grad_norm": 1.4693994047753915, + "learning_rate": 0.0002672919109026964, + "loss": 3.7147202491760254, + "step": 912, + "token_acc": 0.23320617369758645 + }, + { + "epoch": 0.535326883611844, + "grad_norm": 1.7096437235989137, + "learning_rate": 0.0002675849941383353, + "loss": 3.700660228729248, + "step": 913, + "token_acc": 0.23683841041006037 + }, + { + "epoch": 0.5359132219290531, + "grad_norm": 1.3740634061333472, + "learning_rate": 0.00026787807737397424, + "loss": 3.664520263671875, + "step": 914, + "token_acc": 0.23984174095248884 + }, + { + "epoch": 0.5364995602462621, + "grad_norm": 1.5306148097802394, + "learning_rate": 0.00026817116060961317, + "loss": 3.6581850051879883, + "step": 915, + "token_acc": 0.2397526761866706 + }, + { + "epoch": 0.5370858985634711, + "grad_norm": 1.4313735092079882, + "learning_rate": 0.00026846424384525204, + "loss": 3.647153615951538, + "step": 916, + "token_acc": 0.24048748839161907 + }, + { + "epoch": 0.5376722368806801, + "grad_norm": 1.7056871215337246, + "learning_rate": 0.00026875732708089097, + "loss": 3.672374725341797, + "step": 917, + "token_acc": 0.23911971464420934 + }, + { + "epoch": 0.5382585751978892, + "grad_norm": 1.5769228393630546, + "learning_rate": 0.0002690504103165299, + "loss": 3.637816905975342, + "step": 918, + "token_acc": 0.24200835525297848 + }, + { + "epoch": 0.5388449135150982, + "grad_norm": 1.6065656587295334, + "learning_rate": 0.0002693434935521688, + "loss": 3.709822177886963, + "step": 919, + "token_acc": 0.235122625250856 + }, + { + "epoch": 0.5394312518323072, + "grad_norm": 1.3937238481058214, + "learning_rate": 0.00026963657678780775, + "loss": 3.666848659515381, + "step": 920, + "token_acc": 0.23990388988266684 + }, + { + "epoch": 0.5400175901495162, + "grad_norm": 1.5515486711662791, + "learning_rate": 0.0002699296600234467, + "loss": 3.7059433460235596, + "step": 921, + "token_acc": 0.23402069638649073 + }, + { + "epoch": 0.5406039284667253, + "grad_norm": 1.280275575463355, + "learning_rate": 0.0002702227432590856, + "loss": 3.6682474613189697, + "step": 922, + "token_acc": 0.2378662725654388 + }, + { + "epoch": 0.5411902667839343, + "grad_norm": 1.7040837876578285, + "learning_rate": 0.00027051582649472453, + "loss": 3.67686128616333, + "step": 923, + "token_acc": 0.23788330022863857 + }, + { + "epoch": 0.5417766051011433, + "grad_norm": 1.2242242493961457, + "learning_rate": 0.00027080890973036346, + "loss": 3.7185275554656982, + "step": 924, + "token_acc": 0.23372108136210035 + }, + { + "epoch": 0.5423629434183523, + "grad_norm": 1.5593007597611654, + "learning_rate": 0.00027110199296600233, + "loss": 3.680393695831299, + "step": 925, + "token_acc": 0.23830905479225295 + }, + { + "epoch": 0.5429492817355615, + "grad_norm": 1.566231047067355, + "learning_rate": 0.00027139507620164126, + "loss": 3.6662802696228027, + "step": 926, + "token_acc": 0.23795519659334297 + }, + { + "epoch": 0.5435356200527705, + "grad_norm": 1.424326972095286, + "learning_rate": 0.0002716881594372802, + "loss": 3.6961522102355957, + "step": 927, + "token_acc": 0.23542440697342099 + }, + { + "epoch": 0.5441219583699795, + "grad_norm": 1.75356765742314, + "learning_rate": 0.0002719812426729191, + "loss": 3.708113670349121, + "step": 928, + "token_acc": 0.23260580716264792 + }, + { + "epoch": 0.5447082966871885, + "grad_norm": 1.2510373264903487, + "learning_rate": 0.00027227432590855804, + "loss": 3.6477930545806885, + "step": 929, + "token_acc": 0.24326106690183644 + }, + { + "epoch": 0.5452946350043976, + "grad_norm": 1.4548803766196858, + "learning_rate": 0.00027256740914419696, + "loss": 3.632298231124878, + "step": 930, + "token_acc": 0.24162094216807142 + }, + { + "epoch": 0.5458809733216066, + "grad_norm": 1.418298393172357, + "learning_rate": 0.0002728604923798359, + "loss": 3.699294328689575, + "step": 931, + "token_acc": 0.23485171363527688 + }, + { + "epoch": 0.5464673116388156, + "grad_norm": 1.1949458175856844, + "learning_rate": 0.0002731535756154748, + "loss": 3.630730152130127, + "step": 932, + "token_acc": 0.24417622305312223 + }, + { + "epoch": 0.5470536499560247, + "grad_norm": 1.663725947025361, + "learning_rate": 0.0002734466588511137, + "loss": 3.6516690254211426, + "step": 933, + "token_acc": 0.24071523505327017 + }, + { + "epoch": 0.5476399882732337, + "grad_norm": 1.473832392460202, + "learning_rate": 0.0002737397420867526, + "loss": 3.6903762817382812, + "step": 934, + "token_acc": 0.23811021688928444 + }, + { + "epoch": 0.5482263265904427, + "grad_norm": 1.5020590984664082, + "learning_rate": 0.00027403282532239155, + "loss": 3.593172788619995, + "step": 935, + "token_acc": 0.24806205590138178 + }, + { + "epoch": 0.5488126649076517, + "grad_norm": 1.410606778961064, + "learning_rate": 0.0002743259085580305, + "loss": 3.674931526184082, + "step": 936, + "token_acc": 0.2378261254551042 + }, + { + "epoch": 0.5493990032248608, + "grad_norm": 1.6590628344169205, + "learning_rate": 0.0002746189917936694, + "loss": 3.6757731437683105, + "step": 937, + "token_acc": 0.2382436082008386 + }, + { + "epoch": 0.5499853415420698, + "grad_norm": 1.430446721186781, + "learning_rate": 0.00027491207502930833, + "loss": 3.668175220489502, + "step": 938, + "token_acc": 0.23955169994520437 + }, + { + "epoch": 0.5505716798592788, + "grad_norm": 1.48518844546317, + "learning_rate": 0.00027520515826494725, + "loss": 3.6634273529052734, + "step": 939, + "token_acc": 0.23976885603546919 + }, + { + "epoch": 0.5511580181764878, + "grad_norm": 1.3094292865272106, + "learning_rate": 0.0002754982415005862, + "loss": 3.704200506210327, + "step": 940, + "token_acc": 0.232224367161714 + }, + { + "epoch": 0.5517443564936969, + "grad_norm": 1.4434376016126482, + "learning_rate": 0.0002757913247362251, + "loss": 3.637700319290161, + "step": 941, + "token_acc": 0.2427770989102637 + }, + { + "epoch": 0.5523306948109059, + "grad_norm": 1.4333197053663898, + "learning_rate": 0.00027608440797186404, + "loss": 3.693472385406494, + "step": 942, + "token_acc": 0.23614983477901694 + }, + { + "epoch": 0.5529170331281149, + "grad_norm": 1.336124291058086, + "learning_rate": 0.00027637749120750296, + "loss": 3.6455936431884766, + "step": 943, + "token_acc": 0.24077342725687537 + }, + { + "epoch": 0.5535033714453239, + "grad_norm": 1.4236844191835567, + "learning_rate": 0.0002766705744431419, + "loss": 3.6451268196105957, + "step": 944, + "token_acc": 0.23986033415081134 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 1.5151071706187003, + "learning_rate": 0.00027696365767878076, + "loss": 3.671522617340088, + "step": 945, + "token_acc": 0.23706088334592965 + }, + { + "epoch": 0.554676048079742, + "grad_norm": 1.4474331432616678, + "learning_rate": 0.0002772567409144197, + "loss": 3.653430938720703, + "step": 946, + "token_acc": 0.23992236921559681 + }, + { + "epoch": 0.555262386396951, + "grad_norm": 1.3880910943751068, + "learning_rate": 0.0002775498241500586, + "loss": 3.6660728454589844, + "step": 947, + "token_acc": 0.23714635963752614 + }, + { + "epoch": 0.55584872471416, + "grad_norm": 1.282567661556668, + "learning_rate": 0.00027784290738569755, + "loss": 3.675222873687744, + "step": 948, + "token_acc": 0.23885116046013283 + }, + { + "epoch": 0.5564350630313691, + "grad_norm": 1.5024143213464343, + "learning_rate": 0.00027813599062133647, + "loss": 3.6648471355438232, + "step": 949, + "token_acc": 0.23879919635013444 + }, + { + "epoch": 0.5570214013485781, + "grad_norm": 1.3100281801484799, + "learning_rate": 0.0002784290738569754, + "loss": 3.688194751739502, + "step": 950, + "token_acc": 0.2354916704306932 + }, + { + "epoch": 0.5576077396657871, + "grad_norm": 1.2605884090101194, + "learning_rate": 0.00027872215709261427, + "loss": 3.662505626678467, + "step": 951, + "token_acc": 0.23758602096001794 + }, + { + "epoch": 0.5581940779829961, + "grad_norm": 1.5417644358255755, + "learning_rate": 0.0002790152403282532, + "loss": 3.6443071365356445, + "step": 952, + "token_acc": 0.24169462607079295 + }, + { + "epoch": 0.5587804163002053, + "grad_norm": 1.378770453888231, + "learning_rate": 0.0002793083235638921, + "loss": 3.699918270111084, + "step": 953, + "token_acc": 0.23420569438205788 + }, + { + "epoch": 0.5593667546174143, + "grad_norm": 1.3985727205672815, + "learning_rate": 0.00027960140679953105, + "loss": 3.6232728958129883, + "step": 954, + "token_acc": 0.24246628371303938 + }, + { + "epoch": 0.5599530929346233, + "grad_norm": 1.17551988270458, + "learning_rate": 0.00027989449003517, + "loss": 3.6305136680603027, + "step": 955, + "token_acc": 0.24157700707518454 + }, + { + "epoch": 0.5605394312518323, + "grad_norm": 1.464389733669569, + "learning_rate": 0.0002801875732708089, + "loss": 3.5816683769226074, + "step": 956, + "token_acc": 0.24643953445489353 + }, + { + "epoch": 0.5611257695690414, + "grad_norm": 1.5202380201888184, + "learning_rate": 0.00028048065650644784, + "loss": 3.6169071197509766, + "step": 957, + "token_acc": 0.24440837359098228 + }, + { + "epoch": 0.5617121078862504, + "grad_norm": 1.553775257267738, + "learning_rate": 0.00028077373974208676, + "loss": 3.608323574066162, + "step": 958, + "token_acc": 0.24402427292237105 + }, + { + "epoch": 0.5622984462034594, + "grad_norm": 1.0718120225455532, + "learning_rate": 0.0002810668229777257, + "loss": 3.6268668174743652, + "step": 959, + "token_acc": 0.24203052317852286 + }, + { + "epoch": 0.5628847845206685, + "grad_norm": 1.567008922012089, + "learning_rate": 0.0002813599062133646, + "loss": 3.6444454193115234, + "step": 960, + "token_acc": 0.239551687011117 + }, + { + "epoch": 0.5634711228378775, + "grad_norm": 1.260261647985568, + "learning_rate": 0.00028165298944900354, + "loss": 3.624262809753418, + "step": 961, + "token_acc": 0.24357578631944296 + }, + { + "epoch": 0.5640574611550865, + "grad_norm": 1.301844245572885, + "learning_rate": 0.00028194607268464247, + "loss": 3.635347843170166, + "step": 962, + "token_acc": 0.24149458839974922 + }, + { + "epoch": 0.5646437994722955, + "grad_norm": 1.7375882907047981, + "learning_rate": 0.0002822391559202814, + "loss": 3.6010549068450928, + "step": 963, + "token_acc": 0.24749788435865844 + }, + { + "epoch": 0.5652301377895046, + "grad_norm": 1.5143098884869093, + "learning_rate": 0.0002825322391559203, + "loss": 3.5999910831451416, + "step": 964, + "token_acc": 0.24292920407429094 + }, + { + "epoch": 0.5658164761067136, + "grad_norm": 1.295096613319315, + "learning_rate": 0.0002828253223915592, + "loss": 3.627565860748291, + "step": 965, + "token_acc": 0.24175109307249915 + }, + { + "epoch": 0.5664028144239226, + "grad_norm": 1.3590009676842254, + "learning_rate": 0.0002831184056271981, + "loss": 3.6357483863830566, + "step": 966, + "token_acc": 0.24200606940371194 + }, + { + "epoch": 0.5669891527411316, + "grad_norm": 1.5019272884970047, + "learning_rate": 0.00028341148886283705, + "loss": 3.6971030235290527, + "step": 967, + "token_acc": 0.23295798743184884 + }, + { + "epoch": 0.5675754910583407, + "grad_norm": 1.4407196964135756, + "learning_rate": 0.000283704572098476, + "loss": 3.6472368240356445, + "step": 968, + "token_acc": 0.24040210601558706 + }, + { + "epoch": 0.5681618293755497, + "grad_norm": 1.102711073391483, + "learning_rate": 0.00028399765533411485, + "loss": 3.6212730407714844, + "step": 969, + "token_acc": 0.2446720908191644 + }, + { + "epoch": 0.5687481676927587, + "grad_norm": 1.5965234085692896, + "learning_rate": 0.0002842907385697538, + "loss": 3.677675247192383, + "step": 970, + "token_acc": 0.2334926658302352 + }, + { + "epoch": 0.5693345060099677, + "grad_norm": 1.4657220461830711, + "learning_rate": 0.0002845838218053927, + "loss": 3.610607862472534, + "step": 971, + "token_acc": 0.24342608969382665 + }, + { + "epoch": 0.5699208443271768, + "grad_norm": 1.5361650544965009, + "learning_rate": 0.00028487690504103163, + "loss": 3.604387044906616, + "step": 972, + "token_acc": 0.2446919024348877 + }, + { + "epoch": 0.5705071826443858, + "grad_norm": 1.241086038496879, + "learning_rate": 0.00028516998827667056, + "loss": 3.6427831649780273, + "step": 973, + "token_acc": 0.23833738550493097 + }, + { + "epoch": 0.5710935209615948, + "grad_norm": 1.4342048818985964, + "learning_rate": 0.0002854630715123095, + "loss": 3.6039347648620605, + "step": 974, + "token_acc": 0.2419299107061669 + }, + { + "epoch": 0.5716798592788038, + "grad_norm": 1.3527565134234767, + "learning_rate": 0.0002857561547479484, + "loss": 3.618162155151367, + "step": 975, + "token_acc": 0.2438772973084972 + }, + { + "epoch": 0.5722661975960129, + "grad_norm": 1.2848849752898708, + "learning_rate": 0.00028604923798358734, + "loss": 3.5735626220703125, + "step": 976, + "token_acc": 0.24879073269314594 + }, + { + "epoch": 0.5728525359132219, + "grad_norm": 1.5040951386383123, + "learning_rate": 0.00028634232121922627, + "loss": 3.585871696472168, + "step": 977, + "token_acc": 0.24762311690126462 + }, + { + "epoch": 0.5734388742304309, + "grad_norm": 1.1852199729733248, + "learning_rate": 0.0002866354044548652, + "loss": 3.607529640197754, + "step": 978, + "token_acc": 0.2434763883259019 + }, + { + "epoch": 0.5740252125476399, + "grad_norm": 1.6717518342340965, + "learning_rate": 0.0002869284876905041, + "loss": 3.673367977142334, + "step": 979, + "token_acc": 0.23545167774700376 + }, + { + "epoch": 0.574611550864849, + "grad_norm": 1.289435702282436, + "learning_rate": 0.00028722157092614305, + "loss": 3.6096255779266357, + "step": 980, + "token_acc": 0.2433342050209205 + }, + { + "epoch": 0.575197889182058, + "grad_norm": 1.485572914756259, + "learning_rate": 0.000287514654161782, + "loss": 3.6879472732543945, + "step": 981, + "token_acc": 0.23460435601747376 + }, + { + "epoch": 0.575784227499267, + "grad_norm": 1.5299409048590933, + "learning_rate": 0.0002878077373974209, + "loss": 3.6253550052642822, + "step": 982, + "token_acc": 0.2424067395094425 + }, + { + "epoch": 0.576370565816476, + "grad_norm": 1.107850913781708, + "learning_rate": 0.00028810082063305983, + "loss": 3.6505818367004395, + "step": 983, + "token_acc": 0.23898196207410452 + }, + { + "epoch": 0.5769569041336852, + "grad_norm": 1.4180933459067868, + "learning_rate": 0.00028839390386869876, + "loss": 3.57523250579834, + "step": 984, + "token_acc": 0.24687898621944335 + }, + { + "epoch": 0.5775432424508942, + "grad_norm": 1.328159485655566, + "learning_rate": 0.0002886869871043377, + "loss": 3.5903732776641846, + "step": 985, + "token_acc": 0.24551621031945187 + }, + { + "epoch": 0.5781295807681032, + "grad_norm": 1.3772252299142345, + "learning_rate": 0.00028898007033997656, + "loss": 3.5875444412231445, + "step": 986, + "token_acc": 0.24669940046571906 + }, + { + "epoch": 0.5787159190853123, + "grad_norm": 1.4681394726941328, + "learning_rate": 0.00028927315357561543, + "loss": 3.664885997772217, + "step": 987, + "token_acc": 0.23568614704543978 + }, + { + "epoch": 0.5793022574025213, + "grad_norm": 1.3501584209081006, + "learning_rate": 0.00028956623681125436, + "loss": 3.636493682861328, + "step": 988, + "token_acc": 0.2403348935192476 + }, + { + "epoch": 0.5798885957197303, + "grad_norm": 1.3162626910672084, + "learning_rate": 0.0002898593200468933, + "loss": 3.5964999198913574, + "step": 989, + "token_acc": 0.24443516081328026 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 1.548417274062198, + "learning_rate": 0.0002901524032825322, + "loss": 3.646665573120117, + "step": 990, + "token_acc": 0.23912856608314878 + }, + { + "epoch": 0.5810612723541484, + "grad_norm": 1.3329807138893148, + "learning_rate": 0.00029044548651817114, + "loss": 3.604050636291504, + "step": 991, + "token_acc": 0.24390972794723825 + }, + { + "epoch": 0.5816476106713574, + "grad_norm": 1.8329805774599375, + "learning_rate": 0.00029073856975381007, + "loss": 3.6503264904022217, + "step": 992, + "token_acc": 0.23781426640291864 + }, + { + "epoch": 0.5822339489885664, + "grad_norm": 1.0386239214358395, + "learning_rate": 0.000291031652989449, + "loss": 3.671471118927002, + "step": 993, + "token_acc": 0.23513708251906298 + }, + { + "epoch": 0.5828202873057754, + "grad_norm": 1.3632384025221655, + "learning_rate": 0.0002913247362250879, + "loss": 3.6130118370056152, + "step": 994, + "token_acc": 0.24241149915641103 + }, + { + "epoch": 0.5834066256229845, + "grad_norm": 1.175448756516594, + "learning_rate": 0.00029161781946072685, + "loss": 3.623880386352539, + "step": 995, + "token_acc": 0.24121099575430469 + }, + { + "epoch": 0.5839929639401935, + "grad_norm": 1.6118297069228227, + "learning_rate": 0.0002919109026963658, + "loss": 3.6532297134399414, + "step": 996, + "token_acc": 0.23697712004469987 + }, + { + "epoch": 0.5845793022574025, + "grad_norm": 1.4563665658704563, + "learning_rate": 0.0002922039859320047, + "loss": 3.6464314460754395, + "step": 997, + "token_acc": 0.23914868288513783 + }, + { + "epoch": 0.5851656405746115, + "grad_norm": 1.268823400552579, + "learning_rate": 0.00029249706916764363, + "loss": 3.6455702781677246, + "step": 998, + "token_acc": 0.23681017530647253 + }, + { + "epoch": 0.5857519788918206, + "grad_norm": 1.3353034791218754, + "learning_rate": 0.00029279015240328256, + "loss": 3.631204128265381, + "step": 999, + "token_acc": 0.24132621422490358 + }, + { + "epoch": 0.5863383172090296, + "grad_norm": 1.1626901276860175, + "learning_rate": 0.0002930832356389215, + "loss": 3.6033883094787598, + "step": 1000, + "token_acc": 0.2434216214658328 + }, + { + "epoch": 0.5869246555262386, + "grad_norm": 1.5052685338613823, + "learning_rate": 0.0002933763188745604, + "loss": 3.6411616802215576, + "step": 1001, + "token_acc": 0.23954394649533942 + }, + { + "epoch": 0.5875109938434476, + "grad_norm": 1.2259943633896444, + "learning_rate": 0.00029366940211019934, + "loss": 3.672987461090088, + "step": 1002, + "token_acc": 0.2349683564484737 + }, + { + "epoch": 0.5880973321606567, + "grad_norm": 1.3537330602649946, + "learning_rate": 0.00029396248534583827, + "loss": 3.655560255050659, + "step": 1003, + "token_acc": 0.23706484605595948 + }, + { + "epoch": 0.5886836704778657, + "grad_norm": 1.23563878812983, + "learning_rate": 0.0002942555685814772, + "loss": 3.6128153800964355, + "step": 1004, + "token_acc": 0.2416935006812226 + }, + { + "epoch": 0.5892700087950747, + "grad_norm": 1.5330607884715752, + "learning_rate": 0.00029454865181711607, + "loss": 3.631728410720825, + "step": 1005, + "token_acc": 0.23961668672118777 + }, + { + "epoch": 0.5898563471122837, + "grad_norm": 1.4332882088741332, + "learning_rate": 0.00029484173505275494, + "loss": 3.6275744438171387, + "step": 1006, + "token_acc": 0.23903324113443747 + }, + { + "epoch": 0.5904426854294929, + "grad_norm": 1.2481874169431866, + "learning_rate": 0.00029513481828839387, + "loss": 3.638216972351074, + "step": 1007, + "token_acc": 0.24026861684156908 + }, + { + "epoch": 0.5910290237467019, + "grad_norm": 1.2322664946771773, + "learning_rate": 0.0002954279015240328, + "loss": 3.552248954772949, + "step": 1008, + "token_acc": 0.2494416131873663 + }, + { + "epoch": 0.5916153620639109, + "grad_norm": 1.2124431570188956, + "learning_rate": 0.0002957209847596717, + "loss": 3.5769190788269043, + "step": 1009, + "token_acc": 0.24815042255168582 + }, + { + "epoch": 0.5922017003811199, + "grad_norm": 1.3448800910646759, + "learning_rate": 0.00029601406799531065, + "loss": 3.6008925437927246, + "step": 1010, + "token_acc": 0.24129498862818036 + }, + { + "epoch": 0.592788038698329, + "grad_norm": 1.2132735883613457, + "learning_rate": 0.0002963071512309496, + "loss": 3.638364315032959, + "step": 1011, + "token_acc": 0.23899496705235304 + }, + { + "epoch": 0.593374377015538, + "grad_norm": 1.15419606938618, + "learning_rate": 0.0002966002344665885, + "loss": 3.5545496940612793, + "step": 1012, + "token_acc": 0.24795645118342452 + }, + { + "epoch": 0.593960715332747, + "grad_norm": 1.49230703988686, + "learning_rate": 0.00029689331770222743, + "loss": 3.59454607963562, + "step": 1013, + "token_acc": 0.2436325937764915 + }, + { + "epoch": 0.594547053649956, + "grad_norm": 1.3133019455039763, + "learning_rate": 0.00029718640093786636, + "loss": 3.631288528442383, + "step": 1014, + "token_acc": 0.24004319396530466 + }, + { + "epoch": 0.5951333919671651, + "grad_norm": 1.4435558317562465, + "learning_rate": 0.0002974794841735053, + "loss": 3.5768966674804688, + "step": 1015, + "token_acc": 0.24592537317354507 + }, + { + "epoch": 0.5957197302843741, + "grad_norm": 1.244061776445077, + "learning_rate": 0.0002977725674091442, + "loss": 3.611121654510498, + "step": 1016, + "token_acc": 0.24127637227609697 + }, + { + "epoch": 0.5963060686015831, + "grad_norm": 1.1682934186019207, + "learning_rate": 0.00029806565064478314, + "loss": 3.6214759349823, + "step": 1017, + "token_acc": 0.23975738695364865 + }, + { + "epoch": 0.5968924069187922, + "grad_norm": 1.2933443579843602, + "learning_rate": 0.00029835873388042207, + "loss": 3.6063427925109863, + "step": 1018, + "token_acc": 0.24503447758956678 + }, + { + "epoch": 0.5974787452360012, + "grad_norm": 1.426534387630876, + "learning_rate": 0.000298651817116061, + "loss": 3.57485294342041, + "step": 1019, + "token_acc": 0.24551953414727806 + }, + { + "epoch": 0.5980650835532102, + "grad_norm": 1.319177337623613, + "learning_rate": 0.0002989449003516999, + "loss": 3.580430746078491, + "step": 1020, + "token_acc": 0.24619251384600047 + }, + { + "epoch": 0.5986514218704192, + "grad_norm": 1.229497624194284, + "learning_rate": 0.00029923798358733885, + "loss": 3.6006674766540527, + "step": 1021, + "token_acc": 0.2422664194763761 + }, + { + "epoch": 0.5992377601876283, + "grad_norm": 1.2176666824970224, + "learning_rate": 0.0002995310668229777, + "loss": 3.566042900085449, + "step": 1022, + "token_acc": 0.24857474139833916 + }, + { + "epoch": 0.5998240985048373, + "grad_norm": 1.3094918895241563, + "learning_rate": 0.00029982415005861665, + "loss": 3.5706920623779297, + "step": 1023, + "token_acc": 0.2467744899834589 + }, + { + "epoch": 0.6004104368220463, + "grad_norm": 1.248359085046116, + "learning_rate": 0.0003001172332942556, + "loss": 3.5613551139831543, + "step": 1024, + "token_acc": 0.24592954107369888 + }, + { + "epoch": 0.6009967751392553, + "grad_norm": 1.2605542190605061, + "learning_rate": 0.0003004103165298945, + "loss": 3.55257248878479, + "step": 1025, + "token_acc": 0.24668932748729588 + }, + { + "epoch": 0.6015831134564644, + "grad_norm": 1.618452542926587, + "learning_rate": 0.0003007033997655334, + "loss": 3.5784997940063477, + "step": 1026, + "token_acc": 0.24533656817193472 + }, + { + "epoch": 0.6021694517736734, + "grad_norm": 1.4087004271954864, + "learning_rate": 0.0003009964830011723, + "loss": 3.5917067527770996, + "step": 1027, + "token_acc": 0.2432783162334239 + }, + { + "epoch": 0.6027557900908824, + "grad_norm": 1.3518881039942305, + "learning_rate": 0.00030128956623681123, + "loss": 3.5749573707580566, + "step": 1028, + "token_acc": 0.24643084430725787 + }, + { + "epoch": 0.6033421284080914, + "grad_norm": 1.2800365906757063, + "learning_rate": 0.00030158264947245016, + "loss": 3.6161842346191406, + "step": 1029, + "token_acc": 0.2406215310420285 + }, + { + "epoch": 0.6039284667253005, + "grad_norm": 1.1862867108440904, + "learning_rate": 0.0003018757327080891, + "loss": 3.5706331729888916, + "step": 1030, + "token_acc": 0.24555600950341286 + }, + { + "epoch": 0.6045148050425095, + "grad_norm": 1.5320263128554652, + "learning_rate": 0.000302168815943728, + "loss": 3.64438796043396, + "step": 1031, + "token_acc": 0.2397131655243398 + }, + { + "epoch": 0.6051011433597185, + "grad_norm": 1.0996446651716916, + "learning_rate": 0.00030246189917936694, + "loss": 3.57479190826416, + "step": 1032, + "token_acc": 0.2466358914257987 + }, + { + "epoch": 0.6056874816769275, + "grad_norm": 1.4535221697194116, + "learning_rate": 0.00030275498241500587, + "loss": 3.603222370147705, + "step": 1033, + "token_acc": 0.24240436696005416 + }, + { + "epoch": 0.6062738199941367, + "grad_norm": 1.0934213206078802, + "learning_rate": 0.0003030480656506448, + "loss": 3.628729820251465, + "step": 1034, + "token_acc": 0.24039238315060588 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 1.4052113790723477, + "learning_rate": 0.0003033411488862837, + "loss": 3.557115077972412, + "step": 1035, + "token_acc": 0.2477239581875148 + }, + { + "epoch": 0.6074464966285547, + "grad_norm": 1.1027017557717609, + "learning_rate": 0.00030363423212192265, + "loss": 3.584075450897217, + "step": 1036, + "token_acc": 0.24446978095039984 + }, + { + "epoch": 0.6080328349457637, + "grad_norm": 1.2720714744698813, + "learning_rate": 0.0003039273153575616, + "loss": 3.596144676208496, + "step": 1037, + "token_acc": 0.24484441749064262 + }, + { + "epoch": 0.6086191732629728, + "grad_norm": 1.0908590856950309, + "learning_rate": 0.0003042203985932005, + "loss": 3.5665292739868164, + "step": 1038, + "token_acc": 0.24901609830643304 + }, + { + "epoch": 0.6092055115801818, + "grad_norm": 1.3540384838745612, + "learning_rate": 0.00030451348182883943, + "loss": 3.5630507469177246, + "step": 1039, + "token_acc": 0.24783621165417077 + }, + { + "epoch": 0.6097918498973908, + "grad_norm": 1.3358213164083288, + "learning_rate": 0.0003048065650644783, + "loss": 3.559636116027832, + "step": 1040, + "token_acc": 0.24888909163115697 + }, + { + "epoch": 0.6103781882145998, + "grad_norm": 1.321331455081741, + "learning_rate": 0.00030509964830011723, + "loss": 3.58921480178833, + "step": 1041, + "token_acc": 0.246624477205616 + }, + { + "epoch": 0.6109645265318089, + "grad_norm": 1.288596534247709, + "learning_rate": 0.00030539273153575616, + "loss": 3.559378147125244, + "step": 1042, + "token_acc": 0.24742083393036252 + }, + { + "epoch": 0.6115508648490179, + "grad_norm": 1.1938219310801197, + "learning_rate": 0.0003056858147713951, + "loss": 3.587851047515869, + "step": 1043, + "token_acc": 0.24487342887044464 + }, + { + "epoch": 0.6121372031662269, + "grad_norm": 1.2576937115774205, + "learning_rate": 0.000305978898007034, + "loss": 3.512676239013672, + "step": 1044, + "token_acc": 0.2536537277933675 + }, + { + "epoch": 0.612723541483436, + "grad_norm": 1.2264604190519774, + "learning_rate": 0.00030627198124267294, + "loss": 3.5607190132141113, + "step": 1045, + "token_acc": 0.24816403080845864 + }, + { + "epoch": 0.613309879800645, + "grad_norm": 1.3621433500744178, + "learning_rate": 0.0003065650644783118, + "loss": 3.553762197494507, + "step": 1046, + "token_acc": 0.2483781865764765 + }, + { + "epoch": 0.613896218117854, + "grad_norm": 1.2562294334755622, + "learning_rate": 0.00030685814771395074, + "loss": 3.584376096725464, + "step": 1047, + "token_acc": 0.24362378351095432 + }, + { + "epoch": 0.614482556435063, + "grad_norm": 1.0916962712069853, + "learning_rate": 0.00030715123094958967, + "loss": 3.518728256225586, + "step": 1048, + "token_acc": 0.25117837440044777 + }, + { + "epoch": 0.6150688947522721, + "grad_norm": 1.2879650968830731, + "learning_rate": 0.0003074443141852286, + "loss": 3.5597729682922363, + "step": 1049, + "token_acc": 0.24674009172991093 + }, + { + "epoch": 0.6156552330694811, + "grad_norm": 1.306969513994692, + "learning_rate": 0.0003077373974208675, + "loss": 3.6184351444244385, + "step": 1050, + "token_acc": 0.2398520965389995 + }, + { + "epoch": 0.6162415713866901, + "grad_norm": 1.0379125959396636, + "learning_rate": 0.00030803048065650645, + "loss": 3.5165929794311523, + "step": 1051, + "token_acc": 0.251242698251751 + }, + { + "epoch": 0.6168279097038991, + "grad_norm": 1.2578212169091842, + "learning_rate": 0.0003083235638921454, + "loss": 3.611384630203247, + "step": 1052, + "token_acc": 0.24111120393901744 + }, + { + "epoch": 0.6174142480211082, + "grad_norm": 1.1708280867492544, + "learning_rate": 0.0003086166471277843, + "loss": 3.5429625511169434, + "step": 1053, + "token_acc": 0.24778100628575972 + }, + { + "epoch": 0.6180005863383172, + "grad_norm": 1.2800884252386273, + "learning_rate": 0.00030890973036342323, + "loss": 3.5960659980773926, + "step": 1054, + "token_acc": 0.24389746994951972 + }, + { + "epoch": 0.6185869246555262, + "grad_norm": 1.2309028839167542, + "learning_rate": 0.00030920281359906216, + "loss": 3.5429959297180176, + "step": 1055, + "token_acc": 0.24793755337290654 + }, + { + "epoch": 0.6191732629727352, + "grad_norm": 1.2592940658145133, + "learning_rate": 0.0003094958968347011, + "loss": 3.577817916870117, + "step": 1056, + "token_acc": 0.245884408491202 + }, + { + "epoch": 0.6197596012899443, + "grad_norm": 1.3332887358166907, + "learning_rate": 0.00030978898007034, + "loss": 3.568650960922241, + "step": 1057, + "token_acc": 0.24795525384537268 + }, + { + "epoch": 0.6203459396071533, + "grad_norm": 1.2369373345427892, + "learning_rate": 0.0003100820633059789, + "loss": 3.558310031890869, + "step": 1058, + "token_acc": 0.248153233703751 + }, + { + "epoch": 0.6209322779243623, + "grad_norm": 1.2807449244278, + "learning_rate": 0.0003103751465416178, + "loss": 3.57058048248291, + "step": 1059, + "token_acc": 0.2465741393764388 + }, + { + "epoch": 0.6215186162415713, + "grad_norm": 1.7017458386965152, + "learning_rate": 0.00031066822977725674, + "loss": 3.581871509552002, + "step": 1060, + "token_acc": 0.24369723233250434 + }, + { + "epoch": 0.6221049545587805, + "grad_norm": 1.008323281954998, + "learning_rate": 0.00031096131301289567, + "loss": 3.576396942138672, + "step": 1061, + "token_acc": 0.24595551379233238 + }, + { + "epoch": 0.6226912928759895, + "grad_norm": 1.6023932079371275, + "learning_rate": 0.0003112543962485346, + "loss": 3.582634449005127, + "step": 1062, + "token_acc": 0.24497768890370122 + }, + { + "epoch": 0.6232776311931985, + "grad_norm": 1.034828430848396, + "learning_rate": 0.0003115474794841735, + "loss": 3.5482912063598633, + "step": 1063, + "token_acc": 0.24882604664078267 + }, + { + "epoch": 0.6238639695104075, + "grad_norm": 1.3511534247738706, + "learning_rate": 0.00031184056271981245, + "loss": 3.564587354660034, + "step": 1064, + "token_acc": 0.2474659930453532 + }, + { + "epoch": 0.6244503078276166, + "grad_norm": 1.1506197615092917, + "learning_rate": 0.0003121336459554514, + "loss": 3.621370792388916, + "step": 1065, + "token_acc": 0.24299295756183764 + }, + { + "epoch": 0.6250366461448256, + "grad_norm": 1.2004968585508264, + "learning_rate": 0.0003124267291910903, + "loss": 3.62357759475708, + "step": 1066, + "token_acc": 0.23938016145495036 + }, + { + "epoch": 0.6256229844620346, + "grad_norm": 1.3657658591337762, + "learning_rate": 0.0003127198124267292, + "loss": 3.601881265640259, + "step": 1067, + "token_acc": 0.24152505961459966 + }, + { + "epoch": 0.6262093227792436, + "grad_norm": 1.4428572892473865, + "learning_rate": 0.0003130128956623681, + "loss": 3.605905771255493, + "step": 1068, + "token_acc": 0.24115721980531626 + }, + { + "epoch": 0.6267956610964527, + "grad_norm": 1.3483259583455922, + "learning_rate": 0.00031330597889800703, + "loss": 3.50486421585083, + "step": 1069, + "token_acc": 0.2549713510991744 + }, + { + "epoch": 0.6273819994136617, + "grad_norm": 1.4080052386525508, + "learning_rate": 0.00031359906213364596, + "loss": 3.589035749435425, + "step": 1070, + "token_acc": 0.24182433311646065 + }, + { + "epoch": 0.6279683377308707, + "grad_norm": 1.3657795830421322, + "learning_rate": 0.0003138921453692849, + "loss": 3.6037979125976562, + "step": 1071, + "token_acc": 0.24357838795394154 + }, + { + "epoch": 0.6285546760480798, + "grad_norm": 1.2079770564130006, + "learning_rate": 0.0003141852286049238, + "loss": 3.6032891273498535, + "step": 1072, + "token_acc": 0.24244832217638151 + }, + { + "epoch": 0.6291410143652888, + "grad_norm": 1.3856147410341002, + "learning_rate": 0.00031447831184056274, + "loss": 3.5857534408569336, + "step": 1073, + "token_acc": 0.244249200856987 + }, + { + "epoch": 0.6297273526824978, + "grad_norm": 1.1243418437968338, + "learning_rate": 0.00031477139507620167, + "loss": 3.6087419986724854, + "step": 1074, + "token_acc": 0.24339176787382535 + }, + { + "epoch": 0.6303136909997068, + "grad_norm": 1.0467605121947294, + "learning_rate": 0.0003150644783118406, + "loss": 3.5274486541748047, + "step": 1075, + "token_acc": 0.2508644671502483 + }, + { + "epoch": 0.6309000293169159, + "grad_norm": 1.1430397492857838, + "learning_rate": 0.00031535756154747947, + "loss": 3.5465409755706787, + "step": 1076, + "token_acc": 0.24883430799220274 + }, + { + "epoch": 0.6314863676341249, + "grad_norm": 1.5479730623570227, + "learning_rate": 0.0003156506447831184, + "loss": 3.5545127391815186, + "step": 1077, + "token_acc": 0.24666378667085576 + }, + { + "epoch": 0.6320727059513339, + "grad_norm": 1.0795272060536478, + "learning_rate": 0.0003159437280187573, + "loss": 3.5707156658172607, + "step": 1078, + "token_acc": 0.24650693047260813 + }, + { + "epoch": 0.6326590442685429, + "grad_norm": 1.5190832449251364, + "learning_rate": 0.00031623681125439625, + "loss": 3.605468273162842, + "step": 1079, + "token_acc": 0.23916483205063277 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 0.8291199410879295, + "learning_rate": 0.0003165298944900352, + "loss": 3.560580253601074, + "step": 1080, + "token_acc": 0.2454103310975121 + }, + { + "epoch": 0.633831720902961, + "grad_norm": 1.420828667300388, + "learning_rate": 0.0003168229777256741, + "loss": 3.5293874740600586, + "step": 1081, + "token_acc": 0.2519151152193725 + }, + { + "epoch": 0.63441805922017, + "grad_norm": 1.1572233907469214, + "learning_rate": 0.00031711606096131303, + "loss": 3.5554580688476562, + "step": 1082, + "token_acc": 0.24748734706890368 + }, + { + "epoch": 0.635004397537379, + "grad_norm": 1.3992772126364095, + "learning_rate": 0.00031740914419695196, + "loss": 3.634899377822876, + "step": 1083, + "token_acc": 0.23648715806553 + }, + { + "epoch": 0.6355907358545881, + "grad_norm": 1.2881920686561694, + "learning_rate": 0.0003177022274325909, + "loss": 3.572756290435791, + "step": 1084, + "token_acc": 0.24625218019043404 + }, + { + "epoch": 0.6361770741717971, + "grad_norm": 1.1330656735565585, + "learning_rate": 0.0003179953106682298, + "loss": 3.590031147003174, + "step": 1085, + "token_acc": 0.24307775597211884 + }, + { + "epoch": 0.6367634124890061, + "grad_norm": 1.1416541572544145, + "learning_rate": 0.00031828839390386874, + "loss": 3.591721534729004, + "step": 1086, + "token_acc": 0.2436965605587694 + }, + { + "epoch": 0.6373497508062151, + "grad_norm": 1.2962455590491624, + "learning_rate": 0.0003185814771395076, + "loss": 3.5729780197143555, + "step": 1087, + "token_acc": 0.24474010473529356 + }, + { + "epoch": 0.6379360891234243, + "grad_norm": 1.2000039526613873, + "learning_rate": 0.00031887456037514654, + "loss": 3.5554540157318115, + "step": 1088, + "token_acc": 0.24678570804315852 + }, + { + "epoch": 0.6385224274406333, + "grad_norm": 1.1573919067125542, + "learning_rate": 0.00031916764361078546, + "loss": 3.5619325637817383, + "step": 1089, + "token_acc": 0.2472662643207856 + }, + { + "epoch": 0.6391087657578423, + "grad_norm": 1.1822609910139814, + "learning_rate": 0.0003194607268464244, + "loss": 3.5487148761749268, + "step": 1090, + "token_acc": 0.2490390021223998 + }, + { + "epoch": 0.6396951040750513, + "grad_norm": 1.2179345246746551, + "learning_rate": 0.0003197538100820633, + "loss": 3.5300283432006836, + "step": 1091, + "token_acc": 0.2503283397503825 + }, + { + "epoch": 0.6402814423922604, + "grad_norm": 1.1379827928680268, + "learning_rate": 0.00032004689331770225, + "loss": 3.539045572280884, + "step": 1092, + "token_acc": 0.24947478721819574 + }, + { + "epoch": 0.6408677807094694, + "grad_norm": 1.1383140911851763, + "learning_rate": 0.0003203399765533411, + "loss": 3.6093735694885254, + "step": 1093, + "token_acc": 0.24054456046624575 + }, + { + "epoch": 0.6414541190266784, + "grad_norm": 1.3386410166576508, + "learning_rate": 0.00032063305978898005, + "loss": 3.5443878173828125, + "step": 1094, + "token_acc": 0.24872283931852482 + }, + { + "epoch": 0.6420404573438874, + "grad_norm": 0.9958463126199089, + "learning_rate": 0.000320926143024619, + "loss": 3.5027291774749756, + "step": 1095, + "token_acc": 0.252696881462228 + }, + { + "epoch": 0.6426267956610965, + "grad_norm": 1.2637195292149674, + "learning_rate": 0.0003212192262602579, + "loss": 3.5294947624206543, + "step": 1096, + "token_acc": 0.25016069846893585 + }, + { + "epoch": 0.6432131339783055, + "grad_norm": 1.0775112577219639, + "learning_rate": 0.00032151230949589683, + "loss": 3.524590015411377, + "step": 1097, + "token_acc": 0.2503629842108192 + }, + { + "epoch": 0.6437994722955145, + "grad_norm": 1.3906939994597405, + "learning_rate": 0.00032180539273153575, + "loss": 3.562063694000244, + "step": 1098, + "token_acc": 0.24525273863208116 + }, + { + "epoch": 0.6443858106127235, + "grad_norm": 1.260113265740618, + "learning_rate": 0.0003220984759671747, + "loss": 3.5748047828674316, + "step": 1099, + "token_acc": 0.2450031146418931 + }, + { + "epoch": 0.6449721489299326, + "grad_norm": 1.230003679861811, + "learning_rate": 0.0003223915592028136, + "loss": 3.595684051513672, + "step": 1100, + "token_acc": 0.24075744832683438 + }, + { + "epoch": 0.6455584872471416, + "grad_norm": 1.3402025501774848, + "learning_rate": 0.00032268464243845254, + "loss": 3.593212604522705, + "step": 1101, + "token_acc": 0.2434796958300396 + }, + { + "epoch": 0.6461448255643506, + "grad_norm": 0.9260537558080999, + "learning_rate": 0.00032297772567409146, + "loss": 3.5308728218078613, + "step": 1102, + "token_acc": 0.24810283374200562 + }, + { + "epoch": 0.6467311638815597, + "grad_norm": 1.3607546331294096, + "learning_rate": 0.0003232708089097304, + "loss": 3.5628058910369873, + "step": 1103, + "token_acc": 0.2468668613154561 + }, + { + "epoch": 0.6473175021987687, + "grad_norm": 1.1217365511168185, + "learning_rate": 0.0003235638921453693, + "loss": 3.540827751159668, + "step": 1104, + "token_acc": 0.2516629336829591 + }, + { + "epoch": 0.6479038405159777, + "grad_norm": 1.351973406383005, + "learning_rate": 0.00032385697538100824, + "loss": 3.59983491897583, + "step": 1105, + "token_acc": 0.2413246809454221 + }, + { + "epoch": 0.6484901788331867, + "grad_norm": 1.355231792514714, + "learning_rate": 0.00032415005861664717, + "loss": 3.5499110221862793, + "step": 1106, + "token_acc": 0.24766060323300657 + }, + { + "epoch": 0.6490765171503958, + "grad_norm": 1.1929701896496483, + "learning_rate": 0.0003244431418522861, + "loss": 3.5612032413482666, + "step": 1107, + "token_acc": 0.2467023210035984 + }, + { + "epoch": 0.6496628554676048, + "grad_norm": 1.3111405815678407, + "learning_rate": 0.00032473622508792497, + "loss": 3.5945374965667725, + "step": 1108, + "token_acc": 0.2418001062457312 + }, + { + "epoch": 0.6502491937848138, + "grad_norm": 1.0737005914840283, + "learning_rate": 0.0003250293083235639, + "loss": 3.4876513481140137, + "step": 1109, + "token_acc": 0.25558413787262874 + }, + { + "epoch": 0.6508355321020228, + "grad_norm": 1.307575869342278, + "learning_rate": 0.0003253223915592028, + "loss": 3.5603158473968506, + "step": 1110, + "token_acc": 0.24741451659155544 + }, + { + "epoch": 0.6514218704192319, + "grad_norm": 1.209003447570316, + "learning_rate": 0.0003256154747948417, + "loss": 3.5082802772521973, + "step": 1111, + "token_acc": 0.25135685656087786 + }, + { + "epoch": 0.6520082087364409, + "grad_norm": 1.0395884785351583, + "learning_rate": 0.0003259085580304806, + "loss": 3.5476131439208984, + "step": 1112, + "token_acc": 0.24823504280184025 + }, + { + "epoch": 0.6525945470536499, + "grad_norm": 1.2087237450963717, + "learning_rate": 0.00032620164126611955, + "loss": 3.5242671966552734, + "step": 1113, + "token_acc": 0.25044261411746593 + }, + { + "epoch": 0.6531808853708589, + "grad_norm": 1.1238498900421816, + "learning_rate": 0.0003264947245017585, + "loss": 3.57403564453125, + "step": 1114, + "token_acc": 0.24646323511065646 + }, + { + "epoch": 0.653767223688068, + "grad_norm": 0.9858208484784472, + "learning_rate": 0.0003267878077373974, + "loss": 3.553864002227783, + "step": 1115, + "token_acc": 0.24866785546395923 + }, + { + "epoch": 0.6543535620052771, + "grad_norm": 1.4029590370655733, + "learning_rate": 0.00032708089097303634, + "loss": 3.559656858444214, + "step": 1116, + "token_acc": 0.24425655346921143 + }, + { + "epoch": 0.6549399003224861, + "grad_norm": 1.3028626023006458, + "learning_rate": 0.00032737397420867526, + "loss": 3.5004982948303223, + "step": 1117, + "token_acc": 0.2532436141346724 + }, + { + "epoch": 0.6555262386396951, + "grad_norm": 0.9296701964983376, + "learning_rate": 0.0003276670574443142, + "loss": 3.5941061973571777, + "step": 1118, + "token_acc": 0.2403735487907555 + }, + { + "epoch": 0.6561125769569042, + "grad_norm": 1.3879578424444867, + "learning_rate": 0.0003279601406799531, + "loss": 3.5592808723449707, + "step": 1119, + "token_acc": 0.24724811460503343 + }, + { + "epoch": 0.6566989152741132, + "grad_norm": 1.1693239724211568, + "learning_rate": 0.00032825322391559204, + "loss": 3.5292327404022217, + "step": 1120, + "token_acc": 0.2498458529695373 + }, + { + "epoch": 0.6572852535913222, + "grad_norm": 1.0471179945909954, + "learning_rate": 0.00032854630715123097, + "loss": 3.552354335784912, + "step": 1121, + "token_acc": 0.2473074620115077 + }, + { + "epoch": 0.6578715919085312, + "grad_norm": 1.3109248459522524, + "learning_rate": 0.0003288393903868699, + "loss": 3.5934853553771973, + "step": 1122, + "token_acc": 0.24050911031246566 + }, + { + "epoch": 0.6584579302257403, + "grad_norm": 0.969650259836423, + "learning_rate": 0.0003291324736225088, + "loss": 3.5656421184539795, + "step": 1123, + "token_acc": 0.24619973492344135 + }, + { + "epoch": 0.6590442685429493, + "grad_norm": 1.220902322112274, + "learning_rate": 0.00032942555685814775, + "loss": 3.538597583770752, + "step": 1124, + "token_acc": 0.24896147309072797 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 1.3066563924582757, + "learning_rate": 0.0003297186400937867, + "loss": 3.481600761413574, + "step": 1125, + "token_acc": 0.256214083676146 + }, + { + "epoch": 0.6602169451773673, + "grad_norm": 0.9900540592016093, + "learning_rate": 0.0003300117233294256, + "loss": 3.5342531204223633, + "step": 1126, + "token_acc": 0.2467207591884573 + }, + { + "epoch": 0.6608032834945764, + "grad_norm": 1.052127838539941, + "learning_rate": 0.00033030480656506453, + "loss": 3.5289125442504883, + "step": 1127, + "token_acc": 0.24936321294159233 + }, + { + "epoch": 0.6613896218117854, + "grad_norm": 1.332404081317421, + "learning_rate": 0.0003305978898007034, + "loss": 3.5762147903442383, + "step": 1128, + "token_acc": 0.24507660283721616 + }, + { + "epoch": 0.6619759601289944, + "grad_norm": 1.136520272589364, + "learning_rate": 0.0003308909730363423, + "loss": 3.5230135917663574, + "step": 1129, + "token_acc": 0.25160075329566856 + }, + { + "epoch": 0.6625622984462035, + "grad_norm": 1.1348348005989932, + "learning_rate": 0.0003311840562719812, + "loss": 3.5148396492004395, + "step": 1130, + "token_acc": 0.25060369346420647 + }, + { + "epoch": 0.6631486367634125, + "grad_norm": 1.440027444073951, + "learning_rate": 0.00033147713950762013, + "loss": 3.5625317096710205, + "step": 1131, + "token_acc": 0.24465781356872765 + }, + { + "epoch": 0.6637349750806215, + "grad_norm": 0.9401583073871229, + "learning_rate": 0.00033177022274325906, + "loss": 3.5676870346069336, + "step": 1132, + "token_acc": 0.24446914448993518 + }, + { + "epoch": 0.6643213133978305, + "grad_norm": 1.248781771466693, + "learning_rate": 0.000332063305978898, + "loss": 3.5910234451293945, + "step": 1133, + "token_acc": 0.24314042086156315 + }, + { + "epoch": 0.6649076517150396, + "grad_norm": 1.390313023548334, + "learning_rate": 0.0003323563892145369, + "loss": 3.5809402465820312, + "step": 1134, + "token_acc": 0.24386106704183755 + }, + { + "epoch": 0.6654939900322486, + "grad_norm": 1.1535699229215877, + "learning_rate": 0.00033264947245017584, + "loss": 3.533020496368408, + "step": 1135, + "token_acc": 0.249339550348733 + }, + { + "epoch": 0.6660803283494576, + "grad_norm": 0.957687999299108, + "learning_rate": 0.00033294255568581477, + "loss": 3.498629570007324, + "step": 1136, + "token_acc": 0.2547027831028947 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0455585881163558, + "learning_rate": 0.0003332356389214537, + "loss": 3.547109603881836, + "step": 1137, + "token_acc": 0.24668252939684915 + }, + { + "epoch": 0.6672530049838757, + "grad_norm": 1.1612803316441251, + "learning_rate": 0.0003335287221570926, + "loss": 3.5291762351989746, + "step": 1138, + "token_acc": 0.25067939617429447 + }, + { + "epoch": 0.6678393433010847, + "grad_norm": 1.108069522393023, + "learning_rate": 0.00033382180539273155, + "loss": 3.547048568725586, + "step": 1139, + "token_acc": 0.24649685396759177 + }, + { + "epoch": 0.6684256816182937, + "grad_norm": 1.2785390987270575, + "learning_rate": 0.0003341148886283705, + "loss": 3.5433878898620605, + "step": 1140, + "token_acc": 0.25009580343389565 + }, + { + "epoch": 0.6690120199355027, + "grad_norm": 1.10542642193351, + "learning_rate": 0.0003344079718640094, + "loss": 3.540172576904297, + "step": 1141, + "token_acc": 0.2467566933861023 + }, + { + "epoch": 0.6695983582527119, + "grad_norm": 1.2953390295739424, + "learning_rate": 0.00033470105509964833, + "loss": 3.5272157192230225, + "step": 1142, + "token_acc": 0.2501110803925828 + }, + { + "epoch": 0.6701846965699209, + "grad_norm": 1.1108067743533743, + "learning_rate": 0.00033499413833528726, + "loss": 3.528212785720825, + "step": 1143, + "token_acc": 0.25052162007209744 + }, + { + "epoch": 0.6707710348871299, + "grad_norm": 1.3463178539451273, + "learning_rate": 0.0003352872215709262, + "loss": 3.5627002716064453, + "step": 1144, + "token_acc": 0.245476117451452 + }, + { + "epoch": 0.6713573732043389, + "grad_norm": 1.195757802747223, + "learning_rate": 0.0003355803048065651, + "loss": 3.5535452365875244, + "step": 1145, + "token_acc": 0.24571338058844494 + }, + { + "epoch": 0.671943711521548, + "grad_norm": 1.182634878953239, + "learning_rate": 0.00033587338804220404, + "loss": 3.495540142059326, + "step": 1146, + "token_acc": 0.25310745607774027 + }, + { + "epoch": 0.672530049838757, + "grad_norm": 1.1596468853060404, + "learning_rate": 0.0003361664712778429, + "loss": 3.508944034576416, + "step": 1147, + "token_acc": 0.25265622418635386 + }, + { + "epoch": 0.673116388155966, + "grad_norm": 1.1390830936271432, + "learning_rate": 0.0003364595545134818, + "loss": 3.5513880252838135, + "step": 1148, + "token_acc": 0.2457098802355412 + }, + { + "epoch": 0.673702726473175, + "grad_norm": 1.1886038458253203, + "learning_rate": 0.0003367526377491207, + "loss": 3.540112018585205, + "step": 1149, + "token_acc": 0.24757359557783776 + }, + { + "epoch": 0.6742890647903841, + "grad_norm": 1.0771840986201147, + "learning_rate": 0.00033704572098475964, + "loss": 3.5456600189208984, + "step": 1150, + "token_acc": 0.24660703310977458 + }, + { + "epoch": 0.6748754031075931, + "grad_norm": 1.1187039781642754, + "learning_rate": 0.00033733880422039857, + "loss": 3.5277175903320312, + "step": 1151, + "token_acc": 0.25042674309148166 + }, + { + "epoch": 0.6754617414248021, + "grad_norm": 1.3258653333578712, + "learning_rate": 0.0003376318874560375, + "loss": 3.5155279636383057, + "step": 1152, + "token_acc": 0.2514580254093943 + }, + { + "epoch": 0.6760480797420111, + "grad_norm": 1.063979292920111, + "learning_rate": 0.0003379249706916764, + "loss": 3.5286808013916016, + "step": 1153, + "token_acc": 0.24790684954560754 + }, + { + "epoch": 0.6766344180592202, + "grad_norm": 1.1978726028697113, + "learning_rate": 0.00033821805392731535, + "loss": 3.562964916229248, + "step": 1154, + "token_acc": 0.2453826068848415 + }, + { + "epoch": 0.6772207563764292, + "grad_norm": 1.1271359028397923, + "learning_rate": 0.0003385111371629543, + "loss": 3.519597053527832, + "step": 1155, + "token_acc": 0.25035711435090513 + }, + { + "epoch": 0.6778070946936382, + "grad_norm": 1.1437329064594568, + "learning_rate": 0.0003388042203985932, + "loss": 3.5524344444274902, + "step": 1156, + "token_acc": 0.2450053735998502 + }, + { + "epoch": 0.6783934330108473, + "grad_norm": 1.0928591410622746, + "learning_rate": 0.00033909730363423213, + "loss": 3.4724159240722656, + "step": 1157, + "token_acc": 0.2541115970065948 + }, + { + "epoch": 0.6789797713280563, + "grad_norm": 1.115896710197314, + "learning_rate": 0.00033939038686987106, + "loss": 3.559156894683838, + "step": 1158, + "token_acc": 0.2444567147924417 + }, + { + "epoch": 0.6795661096452653, + "grad_norm": 1.4392482058325582, + "learning_rate": 0.00033968347010551, + "loss": 3.5400705337524414, + "step": 1159, + "token_acc": 0.24868272388414306 + }, + { + "epoch": 0.6801524479624743, + "grad_norm": 1.0183113818136642, + "learning_rate": 0.0003399765533411489, + "loss": 3.4402782917022705, + "step": 1160, + "token_acc": 0.25780541620752456 + }, + { + "epoch": 0.6807387862796834, + "grad_norm": 1.3486267132712528, + "learning_rate": 0.00034026963657678784, + "loss": 3.5807957649230957, + "step": 1161, + "token_acc": 0.24274357981675054 + }, + { + "epoch": 0.6813251245968924, + "grad_norm": 1.0751325789540667, + "learning_rate": 0.00034056271981242677, + "loss": 3.4894602298736572, + "step": 1162, + "token_acc": 0.254426472023464 + }, + { + "epoch": 0.6819114629141014, + "grad_norm": 1.3859886314991148, + "learning_rate": 0.0003408558030480657, + "loss": 3.5615060329437256, + "step": 1163, + "token_acc": 0.24496862605191916 + }, + { + "epoch": 0.6824978012313104, + "grad_norm": 1.0178750970828456, + "learning_rate": 0.00034114888628370457, + "loss": 3.52013897895813, + "step": 1164, + "token_acc": 0.25224347216670956 + }, + { + "epoch": 0.6830841395485195, + "grad_norm": 1.2366225371445583, + "learning_rate": 0.0003414419695193435, + "loss": 3.5442519187927246, + "step": 1165, + "token_acc": 0.24564135368430173 + }, + { + "epoch": 0.6836704778657285, + "grad_norm": 1.2195981142350034, + "learning_rate": 0.0003417350527549824, + "loss": 3.535538673400879, + "step": 1166, + "token_acc": 0.24952392748215696 + }, + { + "epoch": 0.6842568161829375, + "grad_norm": 0.8817938506181477, + "learning_rate": 0.00034202813599062135, + "loss": 3.5537772178649902, + "step": 1167, + "token_acc": 0.24662827622408515 + }, + { + "epoch": 0.6848431545001465, + "grad_norm": 1.0967838727921415, + "learning_rate": 0.0003423212192262602, + "loss": 3.5506629943847656, + "step": 1168, + "token_acc": 0.2473518822966308 + }, + { + "epoch": 0.6854294928173557, + "grad_norm": 1.2437757554668571, + "learning_rate": 0.00034261430246189915, + "loss": 3.5429272651672363, + "step": 1169, + "token_acc": 0.24572574739724282 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 1.1782248783121134, + "learning_rate": 0.0003429073856975381, + "loss": 3.5021414756774902, + "step": 1170, + "token_acc": 0.2516976942342947 + }, + { + "epoch": 0.6866021694517737, + "grad_norm": 1.2704713431524937, + "learning_rate": 0.000343200468933177, + "loss": 3.528017520904541, + "step": 1171, + "token_acc": 0.2482961285870467 + }, + { + "epoch": 0.6871885077689827, + "grad_norm": 1.0816706899115751, + "learning_rate": 0.00034349355216881593, + "loss": 3.5300867557525635, + "step": 1172, + "token_acc": 0.24826474707530072 + }, + { + "epoch": 0.6877748460861918, + "grad_norm": 1.3348431394423883, + "learning_rate": 0.00034378663540445486, + "loss": 3.5262646675109863, + "step": 1173, + "token_acc": 0.248 + }, + { + "epoch": 0.6883611844034008, + "grad_norm": 1.006175191731131, + "learning_rate": 0.0003440797186400938, + "loss": 3.5259833335876465, + "step": 1174, + "token_acc": 0.25009552144520125 + }, + { + "epoch": 0.6889475227206098, + "grad_norm": 1.360901272289494, + "learning_rate": 0.0003443728018757327, + "loss": 3.5509562492370605, + "step": 1175, + "token_acc": 0.2466580212615457 + }, + { + "epoch": 0.6895338610378188, + "grad_norm": 0.9082580758852431, + "learning_rate": 0.00034466588511137164, + "loss": 3.540731906890869, + "step": 1176, + "token_acc": 0.2474872649419408 + }, + { + "epoch": 0.6901201993550279, + "grad_norm": 1.2298691569647349, + "learning_rate": 0.00034495896834701057, + "loss": 3.522731304168701, + "step": 1177, + "token_acc": 0.24689660611477693 + }, + { + "epoch": 0.6907065376722369, + "grad_norm": 1.2562276728572417, + "learning_rate": 0.0003452520515826495, + "loss": 3.563549518585205, + "step": 1178, + "token_acc": 0.24390001103096473 + }, + { + "epoch": 0.6912928759894459, + "grad_norm": 1.4389558089534102, + "learning_rate": 0.0003455451348182884, + "loss": 3.5353622436523438, + "step": 1179, + "token_acc": 0.24938515029220187 + }, + { + "epoch": 0.6918792143066549, + "grad_norm": 1.0543070598216913, + "learning_rate": 0.00034583821805392735, + "loss": 3.512922525405884, + "step": 1180, + "token_acc": 0.24991110162861815 + }, + { + "epoch": 0.692465552623864, + "grad_norm": 1.2160004716382278, + "learning_rate": 0.0003461313012895663, + "loss": 3.561436176300049, + "step": 1181, + "token_acc": 0.24524189579432804 + }, + { + "epoch": 0.693051890941073, + "grad_norm": 1.018136713771027, + "learning_rate": 0.00034642438452520515, + "loss": 3.464406728744507, + "step": 1182, + "token_acc": 0.25382326395121596 + }, + { + "epoch": 0.693638229258282, + "grad_norm": 1.431307631593445, + "learning_rate": 0.0003467174677608441, + "loss": 3.4845755100250244, + "step": 1183, + "token_acc": 0.2531205169628433 + }, + { + "epoch": 0.6942245675754911, + "grad_norm": 0.9004627438402442, + "learning_rate": 0.000347010550996483, + "loss": 3.472142219543457, + "step": 1184, + "token_acc": 0.25498336935561106 + }, + { + "epoch": 0.6948109058927001, + "grad_norm": 1.158960208969192, + "learning_rate": 0.00034730363423212193, + "loss": 3.5171923637390137, + "step": 1185, + "token_acc": 0.2506687934253579 + }, + { + "epoch": 0.6953972442099091, + "grad_norm": 0.9875161955788607, + "learning_rate": 0.00034759671746776086, + "loss": 3.5057528018951416, + "step": 1186, + "token_acc": 0.25150209945947616 + }, + { + "epoch": 0.6959835825271181, + "grad_norm": 1.2011554338477437, + "learning_rate": 0.0003478898007033998, + "loss": 3.5390753746032715, + "step": 1187, + "token_acc": 0.2474407327586207 + }, + { + "epoch": 0.6965699208443272, + "grad_norm": 1.2559757321722238, + "learning_rate": 0.0003481828839390387, + "loss": 3.507009506225586, + "step": 1188, + "token_acc": 0.24938997554693867 + }, + { + "epoch": 0.6971562591615362, + "grad_norm": 1.0191285004044885, + "learning_rate": 0.0003484759671746776, + "loss": 3.506450891494751, + "step": 1189, + "token_acc": 0.2503491822546896 + }, + { + "epoch": 0.6977425974787452, + "grad_norm": 0.9548017028787985, + "learning_rate": 0.0003487690504103165, + "loss": 3.5579071044921875, + "step": 1190, + "token_acc": 0.24633295462139945 + }, + { + "epoch": 0.6983289357959542, + "grad_norm": 0.981484851692534, + "learning_rate": 0.00034906213364595544, + "loss": 3.527952194213867, + "step": 1191, + "token_acc": 0.2501665828265616 + }, + { + "epoch": 0.6989152741131633, + "grad_norm": 1.3016208447900965, + "learning_rate": 0.00034935521688159437, + "loss": 3.5000200271606445, + "step": 1192, + "token_acc": 0.25106593727059723 + }, + { + "epoch": 0.6995016124303723, + "grad_norm": 1.011815590310952, + "learning_rate": 0.0003496483001172333, + "loss": 3.537379264831543, + "step": 1193, + "token_acc": 0.24750544787246243 + }, + { + "epoch": 0.7000879507475813, + "grad_norm": 1.0429483729395566, + "learning_rate": 0.0003499413833528722, + "loss": 3.484342575073242, + "step": 1194, + "token_acc": 0.2543926294995961 + }, + { + "epoch": 0.7006742890647903, + "grad_norm": 0.918082263696897, + "learning_rate": 0.00035023446658851115, + "loss": 3.5429186820983887, + "step": 1195, + "token_acc": 0.24734160243144343 + }, + { + "epoch": 0.7012606273819995, + "grad_norm": 1.0569709727769927, + "learning_rate": 0.0003505275498241501, + "loss": 3.5127363204956055, + "step": 1196, + "token_acc": 0.25067248601867015 + }, + { + "epoch": 0.7018469656992085, + "grad_norm": 1.049521149314088, + "learning_rate": 0.000350820633059789, + "loss": 3.5081734657287598, + "step": 1197, + "token_acc": 0.250620198828065 + }, + { + "epoch": 0.7024333040164175, + "grad_norm": 1.3525048616174489, + "learning_rate": 0.00035111371629542793, + "loss": 3.577838897705078, + "step": 1198, + "token_acc": 0.2428012769976505 + }, + { + "epoch": 0.7030196423336265, + "grad_norm": 1.0552220222608861, + "learning_rate": 0.00035140679953106686, + "loss": 3.5475518703460693, + "step": 1199, + "token_acc": 0.24710181093179073 + }, + { + "epoch": 0.7036059806508356, + "grad_norm": 1.4598665176116388, + "learning_rate": 0.00035169988276670573, + "loss": 3.554635524749756, + "step": 1200, + "token_acc": 0.2472148744909943 + }, + { + "epoch": 0.7041923189680446, + "grad_norm": 1.0895906890988776, + "learning_rate": 0.00035199296600234466, + "loss": 3.549295425415039, + "step": 1201, + "token_acc": 0.24648935709490916 + }, + { + "epoch": 0.7047786572852536, + "grad_norm": 1.1261405686585428, + "learning_rate": 0.0003522860492379836, + "loss": 3.535259962081909, + "step": 1202, + "token_acc": 0.2483663353489217 + }, + { + "epoch": 0.7053649956024626, + "grad_norm": 1.0943018129831263, + "learning_rate": 0.0003525791324736225, + "loss": 3.5624148845672607, + "step": 1203, + "token_acc": 0.24475923222521112 + }, + { + "epoch": 0.7059513339196717, + "grad_norm": 1.0284215128615037, + "learning_rate": 0.00035287221570926144, + "loss": 3.4818179607391357, + "step": 1204, + "token_acc": 0.2532396268625216 + }, + { + "epoch": 0.7065376722368807, + "grad_norm": 1.0716823631593215, + "learning_rate": 0.00035316529894490037, + "loss": 3.515615463256836, + "step": 1205, + "token_acc": 0.2520421197817211 + }, + { + "epoch": 0.7071240105540897, + "grad_norm": 1.2324743349430587, + "learning_rate": 0.0003534583821805393, + "loss": 3.5280814170837402, + "step": 1206, + "token_acc": 0.24831418331101615 + }, + { + "epoch": 0.7077103488712987, + "grad_norm": 1.0825953154193289, + "learning_rate": 0.0003537514654161782, + "loss": 3.5493626594543457, + "step": 1207, + "token_acc": 0.2467819801348404 + }, + { + "epoch": 0.7082966871885078, + "grad_norm": 1.0841989989331555, + "learning_rate": 0.00035404454865181715, + "loss": 3.5270919799804688, + "step": 1208, + "token_acc": 0.25028672577494515 + }, + { + "epoch": 0.7088830255057168, + "grad_norm": 0.9459441428851809, + "learning_rate": 0.000354337631887456, + "loss": 3.489910125732422, + "step": 1209, + "token_acc": 0.2526744881714106 + }, + { + "epoch": 0.7094693638229258, + "grad_norm": 0.9354617600278087, + "learning_rate": 0.00035463071512309495, + "loss": 3.5249242782592773, + "step": 1210, + "token_acc": 0.2486229410576238 + }, + { + "epoch": 0.7100557021401348, + "grad_norm": 1.0326240429524933, + "learning_rate": 0.0003549237983587339, + "loss": 3.5006346702575684, + "step": 1211, + "token_acc": 0.2512047879260994 + }, + { + "epoch": 0.7106420404573439, + "grad_norm": 1.3338598333550424, + "learning_rate": 0.0003552168815943728, + "loss": 3.52524733543396, + "step": 1212, + "token_acc": 0.250570148812942 + }, + { + "epoch": 0.7112283787745529, + "grad_norm": 0.9497428972354047, + "learning_rate": 0.00035550996483001173, + "loss": 3.491339921951294, + "step": 1213, + "token_acc": 0.2525678764914039 + }, + { + "epoch": 0.7118147170917619, + "grad_norm": 1.356692229245831, + "learning_rate": 0.00035580304806565066, + "loss": 3.5024614334106445, + "step": 1214, + "token_acc": 0.2491468965341877 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 1.0586248772881608, + "learning_rate": 0.0003560961313012896, + "loss": 3.4918408393859863, + "step": 1215, + "token_acc": 0.2523862523144468 + }, + { + "epoch": 0.71298739372618, + "grad_norm": 0.8235559547246575, + "learning_rate": 0.0003563892145369285, + "loss": 3.4780097007751465, + "step": 1216, + "token_acc": 0.25423402465111006 + }, + { + "epoch": 0.713573732043389, + "grad_norm": 0.8871343746621152, + "learning_rate": 0.00035668229777256744, + "loss": 3.515353202819824, + "step": 1217, + "token_acc": 0.2500533748131882 + }, + { + "epoch": 0.714160070360598, + "grad_norm": 1.1428938447313297, + "learning_rate": 0.0003569753810082063, + "loss": 3.5247015953063965, + "step": 1218, + "token_acc": 0.24737294472340415 + }, + { + "epoch": 0.7147464086778071, + "grad_norm": 1.0848607754382695, + "learning_rate": 0.00035726846424384524, + "loss": 3.4805145263671875, + "step": 1219, + "token_acc": 0.25320163928068157 + }, + { + "epoch": 0.7153327469950161, + "grad_norm": 1.2135675779157435, + "learning_rate": 0.00035756154747948417, + "loss": 3.523923397064209, + "step": 1220, + "token_acc": 0.24836656520671696 + }, + { + "epoch": 0.7159190853122251, + "grad_norm": 0.8789548797979777, + "learning_rate": 0.0003578546307151231, + "loss": 3.498096466064453, + "step": 1221, + "token_acc": 0.2510642049318657 + }, + { + "epoch": 0.7165054236294341, + "grad_norm": 0.7578662696495625, + "learning_rate": 0.000358147713950762, + "loss": 3.4700088500976562, + "step": 1222, + "token_acc": 0.25496737451534607 + }, + { + "epoch": 0.7170917619466433, + "grad_norm": 1.0228189917114539, + "learning_rate": 0.00035844079718640095, + "loss": 3.5253689289093018, + "step": 1223, + "token_acc": 0.2494752038706153 + }, + { + "epoch": 0.7176781002638523, + "grad_norm": 1.2213198960682865, + "learning_rate": 0.0003587338804220399, + "loss": 3.456556797027588, + "step": 1224, + "token_acc": 0.25771414530093273 + }, + { + "epoch": 0.7182644385810613, + "grad_norm": 1.0631151053663197, + "learning_rate": 0.0003590269636576788, + "loss": 3.5372540950775146, + "step": 1225, + "token_acc": 0.2492148667712632 + }, + { + "epoch": 0.7188507768982703, + "grad_norm": 1.2113437554086248, + "learning_rate": 0.00035932004689331773, + "loss": 3.5380754470825195, + "step": 1226, + "token_acc": 0.24649231626117116 + }, + { + "epoch": 0.7194371152154794, + "grad_norm": 0.9555943062405109, + "learning_rate": 0.00035961313012895666, + "loss": 3.5020651817321777, + "step": 1227, + "token_acc": 0.25017010831489445 + }, + { + "epoch": 0.7200234535326884, + "grad_norm": 1.125559620095925, + "learning_rate": 0.0003599062133645956, + "loss": 3.5488991737365723, + "step": 1228, + "token_acc": 0.24545114072594487 + }, + { + "epoch": 0.7206097918498974, + "grad_norm": 1.0992764462400064, + "learning_rate": 0.00036019929660023446, + "loss": 3.450565814971924, + "step": 1229, + "token_acc": 0.25807414076485796 + }, + { + "epoch": 0.7211961301671064, + "grad_norm": 0.9545388846500981, + "learning_rate": 0.0003604923798358734, + "loss": 3.5097219944000244, + "step": 1230, + "token_acc": 0.25133427522476376 + }, + { + "epoch": 0.7217824684843155, + "grad_norm": 1.0387227315444867, + "learning_rate": 0.0003607854630715123, + "loss": 3.524634838104248, + "step": 1231, + "token_acc": 0.24821223670190967 + }, + { + "epoch": 0.7223688068015245, + "grad_norm": 0.8514007532483667, + "learning_rate": 0.00036107854630715124, + "loss": 3.5065577030181885, + "step": 1232, + "token_acc": 0.2523945202022857 + }, + { + "epoch": 0.7229551451187335, + "grad_norm": 0.9994290359397887, + "learning_rate": 0.00036137162954279017, + "loss": 3.536440849304199, + "step": 1233, + "token_acc": 0.24660880829015544 + }, + { + "epoch": 0.7235414834359425, + "grad_norm": 1.178655552475769, + "learning_rate": 0.0003616647127784291, + "loss": 3.4668939113616943, + "step": 1234, + "token_acc": 0.25552713294940016 + }, + { + "epoch": 0.7241278217531516, + "grad_norm": 0.9964407980041861, + "learning_rate": 0.000361957796014068, + "loss": 3.471785306930542, + "step": 1235, + "token_acc": 0.25725433020990324 + }, + { + "epoch": 0.7247141600703606, + "grad_norm": 1.4628672089350807, + "learning_rate": 0.0003622508792497069, + "loss": 3.534055709838867, + "step": 1236, + "token_acc": 0.24756990398580359 + }, + { + "epoch": 0.7253004983875696, + "grad_norm": 0.936330363919642, + "learning_rate": 0.0003625439624853458, + "loss": 3.4989395141601562, + "step": 1237, + "token_acc": 0.25195990974445087 + }, + { + "epoch": 0.7258868367047786, + "grad_norm": 0.9241122426506498, + "learning_rate": 0.00036283704572098475, + "loss": 3.4999499320983887, + "step": 1238, + "token_acc": 0.2507256837926323 + }, + { + "epoch": 0.7264731750219877, + "grad_norm": 0.9099503060430971, + "learning_rate": 0.0003631301289566237, + "loss": 3.531130313873291, + "step": 1239, + "token_acc": 0.24756478876550658 + }, + { + "epoch": 0.7270595133391967, + "grad_norm": 0.8444130566891199, + "learning_rate": 0.0003634232121922626, + "loss": 3.4660487174987793, + "step": 1240, + "token_acc": 0.2547390233330162 + }, + { + "epoch": 0.7276458516564057, + "grad_norm": 0.9945255179413414, + "learning_rate": 0.00036371629542790153, + "loss": 3.488422155380249, + "step": 1241, + "token_acc": 0.2546753879862216 + }, + { + "epoch": 0.7282321899736148, + "grad_norm": 1.0246236548373886, + "learning_rate": 0.00036400937866354046, + "loss": 3.466963291168213, + "step": 1242, + "token_acc": 0.2553291695433211 + }, + { + "epoch": 0.7288185282908238, + "grad_norm": 1.1706729509074931, + "learning_rate": 0.0003643024618991794, + "loss": 3.558746814727783, + "step": 1243, + "token_acc": 0.24539399728732697 + }, + { + "epoch": 0.7294048666080328, + "grad_norm": 1.09724316464441, + "learning_rate": 0.0003645955451348183, + "loss": 3.4664244651794434, + "step": 1244, + "token_acc": 0.2568734561082883 + }, + { + "epoch": 0.7299912049252418, + "grad_norm": 0.7928447267564022, + "learning_rate": 0.00036488862837045724, + "loss": 3.554063320159912, + "step": 1245, + "token_acc": 0.2435292165098495 + }, + { + "epoch": 0.7305775432424509, + "grad_norm": 0.9070691599138223, + "learning_rate": 0.00036518171160609616, + "loss": 3.5058159828186035, + "step": 1246, + "token_acc": 0.2518943413847264 + }, + { + "epoch": 0.73116388155966, + "grad_norm": 1.1480665913191976, + "learning_rate": 0.0003654747948417351, + "loss": 3.5055370330810547, + "step": 1247, + "token_acc": 0.2504925904240956 + }, + { + "epoch": 0.731750219876869, + "grad_norm": 1.0917878330671396, + "learning_rate": 0.000365767878077374, + "loss": 3.4870190620422363, + "step": 1248, + "token_acc": 0.25269090964089275 + }, + { + "epoch": 0.732336558194078, + "grad_norm": 1.2122370406200964, + "learning_rate": 0.00036606096131301295, + "loss": 3.5182714462280273, + "step": 1249, + "token_acc": 0.2504461826315001 + }, + { + "epoch": 0.7329228965112871, + "grad_norm": 1.0992772061454728, + "learning_rate": 0.0003663540445486518, + "loss": 3.5024056434631348, + "step": 1250, + "token_acc": 0.250882372065804 + }, + { + "epoch": 0.7335092348284961, + "grad_norm": 1.0945904717565895, + "learning_rate": 0.00036664712778429075, + "loss": 3.4923954010009766, + "step": 1251, + "token_acc": 0.2549660129076592 + }, + { + "epoch": 0.7340955731457051, + "grad_norm": 1.217726506041747, + "learning_rate": 0.0003669402110199297, + "loss": 3.4819135665893555, + "step": 1252, + "token_acc": 0.2544570544435777 + }, + { + "epoch": 0.7346819114629141, + "grad_norm": 0.8045037059017836, + "learning_rate": 0.00036723329425556855, + "loss": 3.496025562286377, + "step": 1253, + "token_acc": 0.2513662789162702 + }, + { + "epoch": 0.7352682497801232, + "grad_norm": 1.0533464337294605, + "learning_rate": 0.0003675263774912075, + "loss": 3.570786952972412, + "step": 1254, + "token_acc": 0.24342378010861904 + }, + { + "epoch": 0.7358545880973322, + "grad_norm": 1.098386702161414, + "learning_rate": 0.0003678194607268464, + "loss": 3.5218682289123535, + "step": 1255, + "token_acc": 0.24898910165003055 + }, + { + "epoch": 0.7364409264145412, + "grad_norm": 1.096858754177103, + "learning_rate": 0.00036811254396248533, + "loss": 3.474954128265381, + "step": 1256, + "token_acc": 0.2530937376510472 + }, + { + "epoch": 0.7370272647317502, + "grad_norm": 1.0361171354256042, + "learning_rate": 0.00036840562719812425, + "loss": 3.563544750213623, + "step": 1257, + "token_acc": 0.24390654370148665 + }, + { + "epoch": 0.7376136030489593, + "grad_norm": 0.964936736934839, + "learning_rate": 0.0003686987104337632, + "loss": 3.524034261703491, + "step": 1258, + "token_acc": 0.24788131229920093 + }, + { + "epoch": 0.7381999413661683, + "grad_norm": 1.0730831406859893, + "learning_rate": 0.0003689917936694021, + "loss": 3.473174571990967, + "step": 1259, + "token_acc": 0.25377940307403235 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 1.2270883194190767, + "learning_rate": 0.00036928487690504104, + "loss": 3.4905569553375244, + "step": 1260, + "token_acc": 0.25130399079476273 + }, + { + "epoch": 0.7393726180005863, + "grad_norm": 0.9903905620973006, + "learning_rate": 0.00036957796014067996, + "loss": 3.526292562484741, + "step": 1261, + "token_acc": 0.2472362765060365 + }, + { + "epoch": 0.7399589563177954, + "grad_norm": 1.3057551281395756, + "learning_rate": 0.0003698710433763189, + "loss": 3.4739603996276855, + "step": 1262, + "token_acc": 0.2543431986943893 + }, + { + "epoch": 0.7405452946350044, + "grad_norm": 1.0857925859584348, + "learning_rate": 0.0003701641266119578, + "loss": 3.4918572902679443, + "step": 1263, + "token_acc": 0.2496519247619918 + }, + { + "epoch": 0.7411316329522134, + "grad_norm": 1.1067318405815867, + "learning_rate": 0.00037045720984759674, + "loss": 3.4486000537872314, + "step": 1264, + "token_acc": 0.2564126836374346 + }, + { + "epoch": 0.7417179712694224, + "grad_norm": 1.0311115412015102, + "learning_rate": 0.00037075029308323567, + "loss": 3.5447144508361816, + "step": 1265, + "token_acc": 0.24598300512124774 + }, + { + "epoch": 0.7423043095866315, + "grad_norm": 0.9604109003891884, + "learning_rate": 0.0003710433763188746, + "loss": 3.4529128074645996, + "step": 1266, + "token_acc": 0.25577408900847837 + }, + { + "epoch": 0.7428906479038405, + "grad_norm": 1.1410367723596473, + "learning_rate": 0.0003713364595545135, + "loss": 3.443695545196533, + "step": 1267, + "token_acc": 0.2578074183007376 + }, + { + "epoch": 0.7434769862210495, + "grad_norm": 1.0710813989193992, + "learning_rate": 0.00037162954279015245, + "loss": 3.470284938812256, + "step": 1268, + "token_acc": 0.25557474959946813 + }, + { + "epoch": 0.7440633245382586, + "grad_norm": 1.003934501544495, + "learning_rate": 0.0003719226260257914, + "loss": 3.515272617340088, + "step": 1269, + "token_acc": 0.25064775378091797 + }, + { + "epoch": 0.7446496628554676, + "grad_norm": 1.2399214122389657, + "learning_rate": 0.00037221570926143025, + "loss": 3.5106518268585205, + "step": 1270, + "token_acc": 0.2490702668896131 + }, + { + "epoch": 0.7452360011726766, + "grad_norm": 1.0755101054033664, + "learning_rate": 0.0003725087924970691, + "loss": 3.461902379989624, + "step": 1271, + "token_acc": 0.25596038619869915 + }, + { + "epoch": 0.7458223394898856, + "grad_norm": 1.152119971935176, + "learning_rate": 0.00037280187573270805, + "loss": 3.4664485454559326, + "step": 1272, + "token_acc": 0.2557741616105202 + }, + { + "epoch": 0.7464086778070947, + "grad_norm": 1.0472432895044648, + "learning_rate": 0.000373094958968347, + "loss": 3.483273506164551, + "step": 1273, + "token_acc": 0.25200952204291105 + }, + { + "epoch": 0.7469950161243037, + "grad_norm": 0.8790006673685149, + "learning_rate": 0.0003733880422039859, + "loss": 3.4371213912963867, + "step": 1274, + "token_acc": 0.2581320134323977 + }, + { + "epoch": 0.7475813544415127, + "grad_norm": 0.886736452804881, + "learning_rate": 0.00037368112543962484, + "loss": 3.4758927822113037, + "step": 1275, + "token_acc": 0.2527080694618192 + }, + { + "epoch": 0.7481676927587217, + "grad_norm": 1.1449442477726546, + "learning_rate": 0.00037397420867526376, + "loss": 3.5382113456726074, + "step": 1276, + "token_acc": 0.24422432713411113 + }, + { + "epoch": 0.7487540310759309, + "grad_norm": 1.097091454594263, + "learning_rate": 0.0003742672919109027, + "loss": 3.4651803970336914, + "step": 1277, + "token_acc": 0.25596913992811243 + }, + { + "epoch": 0.7493403693931399, + "grad_norm": 1.138410631270296, + "learning_rate": 0.0003745603751465416, + "loss": 3.5188822746276855, + "step": 1278, + "token_acc": 0.24840670304609871 + }, + { + "epoch": 0.7499267077103489, + "grad_norm": 0.9394261071605571, + "learning_rate": 0.00037485345838218054, + "loss": 3.5016980171203613, + "step": 1279, + "token_acc": 0.2501159551792046 + }, + { + "epoch": 0.7505130460275579, + "grad_norm": 1.182035471455298, + "learning_rate": 0.00037514654161781947, + "loss": 3.487163543701172, + "step": 1280, + "token_acc": 0.25179838940700894 + }, + { + "epoch": 0.751099384344767, + "grad_norm": 1.01190698276881, + "learning_rate": 0.0003754396248534584, + "loss": 3.4713244438171387, + "step": 1281, + "token_acc": 0.2530424675354666 + }, + { + "epoch": 0.751685722661976, + "grad_norm": 1.1464516562070586, + "learning_rate": 0.0003757327080890973, + "loss": 3.442625045776367, + "step": 1282, + "token_acc": 0.2588711967438974 + }, + { + "epoch": 0.752272060979185, + "grad_norm": 1.0431492011488195, + "learning_rate": 0.00037602579132473625, + "loss": 3.4936037063598633, + "step": 1283, + "token_acc": 0.25121045106907675 + }, + { + "epoch": 0.752858399296394, + "grad_norm": 1.050580213235949, + "learning_rate": 0.0003763188745603752, + "loss": 3.534257411956787, + "step": 1284, + "token_acc": 0.24597087918194954 + }, + { + "epoch": 0.7534447376136031, + "grad_norm": 1.0272016491610667, + "learning_rate": 0.0003766119577960141, + "loss": 3.5080385208129883, + "step": 1285, + "token_acc": 0.2503993049232866 + }, + { + "epoch": 0.7540310759308121, + "grad_norm": 0.8400726262906948, + "learning_rate": 0.00037690504103165303, + "loss": 3.4579713344573975, + "step": 1286, + "token_acc": 0.25657550247714184 + }, + { + "epoch": 0.7546174142480211, + "grad_norm": 0.9676103867597385, + "learning_rate": 0.00037719812426729196, + "loss": 3.551908493041992, + "step": 1287, + "token_acc": 0.2439862816612781 + }, + { + "epoch": 0.7552037525652301, + "grad_norm": 1.030458951125422, + "learning_rate": 0.0003774912075029309, + "loss": 3.482819080352783, + "step": 1288, + "token_acc": 0.25361263141315904 + }, + { + "epoch": 0.7557900908824392, + "grad_norm": 1.0138518380325043, + "learning_rate": 0.00037778429073856976, + "loss": 3.5801682472229004, + "step": 1289, + "token_acc": 0.24201541083316241 + }, + { + "epoch": 0.7563764291996482, + "grad_norm": 1.0795402076389047, + "learning_rate": 0.00037807737397420863, + "loss": 3.4637675285339355, + "step": 1290, + "token_acc": 0.25344788268771257 + }, + { + "epoch": 0.7569627675168572, + "grad_norm": 0.9483087373746847, + "learning_rate": 0.00037837045720984756, + "loss": 3.4972915649414062, + "step": 1291, + "token_acc": 0.2523110899921164 + }, + { + "epoch": 0.7575491058340662, + "grad_norm": 0.9929928646355264, + "learning_rate": 0.0003786635404454865, + "loss": 3.499781608581543, + "step": 1292, + "token_acc": 0.24930556275328822 + }, + { + "epoch": 0.7581354441512753, + "grad_norm": 0.8805073109554905, + "learning_rate": 0.0003789566236811254, + "loss": 3.5048651695251465, + "step": 1293, + "token_acc": 0.2518881285489272 + }, + { + "epoch": 0.7587217824684843, + "grad_norm": 0.8408700573444599, + "learning_rate": 0.00037924970691676434, + "loss": 3.495518207550049, + "step": 1294, + "token_acc": 0.24965957039980777 + }, + { + "epoch": 0.7593081207856933, + "grad_norm": 0.9853569578582353, + "learning_rate": 0.00037954279015240327, + "loss": 3.495433807373047, + "step": 1295, + "token_acc": 0.2517029212946871 + }, + { + "epoch": 0.7598944591029023, + "grad_norm": 1.081447091992857, + "learning_rate": 0.0003798358733880422, + "loss": 3.449873924255371, + "step": 1296, + "token_acc": 0.25396492790444886 + }, + { + "epoch": 0.7604807974201114, + "grad_norm": 1.002555529161158, + "learning_rate": 0.0003801289566236811, + "loss": 3.4970436096191406, + "step": 1297, + "token_acc": 0.2514988088069744 + }, + { + "epoch": 0.7610671357373204, + "grad_norm": 0.9577475569141637, + "learning_rate": 0.00038042203985932005, + "loss": 3.5140223503112793, + "step": 1298, + "token_acc": 0.24797525601705048 + }, + { + "epoch": 0.7616534740545294, + "grad_norm": 1.1279460541170863, + "learning_rate": 0.000380715123094959, + "loss": 3.473954677581787, + "step": 1299, + "token_acc": 0.2529052675501462 + }, + { + "epoch": 0.7622398123717385, + "grad_norm": 0.9474834382608195, + "learning_rate": 0.0003810082063305979, + "loss": 3.4546966552734375, + "step": 1300, + "token_acc": 0.2558521285847343 + }, + { + "epoch": 0.7628261506889475, + "grad_norm": 1.1003938100742867, + "learning_rate": 0.00038130128956623683, + "loss": 3.4985344409942627, + "step": 1301, + "token_acc": 0.25046651724592556 + }, + { + "epoch": 0.7634124890061565, + "grad_norm": 1.1268289807504321, + "learning_rate": 0.00038159437280187576, + "loss": 3.464116096496582, + "step": 1302, + "token_acc": 0.25468803448479843 + }, + { + "epoch": 0.7639988273233655, + "grad_norm": 1.0437345553194601, + "learning_rate": 0.0003818874560375147, + "loss": 3.4308760166168213, + "step": 1303, + "token_acc": 0.2579341929083774 + }, + { + "epoch": 0.7645851656405747, + "grad_norm": 1.0168721838168475, + "learning_rate": 0.0003821805392731536, + "loss": 3.499641180038452, + "step": 1304, + "token_acc": 0.25185373259615834 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 0.9809467629589137, + "learning_rate": 0.00038247362250879254, + "loss": 3.53367280960083, + "step": 1305, + "token_acc": 0.24565776529901218 + }, + { + "epoch": 0.7657578422749927, + "grad_norm": 0.9777913711597344, + "learning_rate": 0.00038276670574443147, + "loss": 3.5120949745178223, + "step": 1306, + "token_acc": 0.24813353581449063 + }, + { + "epoch": 0.7663441805922017, + "grad_norm": 0.9721165069007411, + "learning_rate": 0.00038305978898007034, + "loss": 3.50046706199646, + "step": 1307, + "token_acc": 0.2501995672406845 + }, + { + "epoch": 0.7669305189094108, + "grad_norm": 1.1242031761992586, + "learning_rate": 0.00038335287221570927, + "loss": 3.432429790496826, + "step": 1308, + "token_acc": 0.2584317354360933 + }, + { + "epoch": 0.7675168572266198, + "grad_norm": 0.9033200657614953, + "learning_rate": 0.0003836459554513482, + "loss": 3.450303554534912, + "step": 1309, + "token_acc": 0.2545959443750807 + }, + { + "epoch": 0.7681031955438288, + "grad_norm": 0.8387323915753694, + "learning_rate": 0.00038393903868698707, + "loss": 3.427049160003662, + "step": 1310, + "token_acc": 0.26161080958375466 + }, + { + "epoch": 0.7686895338610378, + "grad_norm": 0.9207370783235117, + "learning_rate": 0.000384232121922626, + "loss": 3.512810230255127, + "step": 1311, + "token_acc": 0.2500586039871782 + }, + { + "epoch": 0.7692758721782469, + "grad_norm": 0.9286609876884838, + "learning_rate": 0.0003845252051582649, + "loss": 3.4888782501220703, + "step": 1312, + "token_acc": 0.24952270965661105 + }, + { + "epoch": 0.7698622104954559, + "grad_norm": 0.8513281468983792, + "learning_rate": 0.00038481828839390385, + "loss": 3.4999337196350098, + "step": 1313, + "token_acc": 0.2507118114171627 + }, + { + "epoch": 0.7704485488126649, + "grad_norm": 1.0558274250444264, + "learning_rate": 0.0003851113716295428, + "loss": 3.49381160736084, + "step": 1314, + "token_acc": 0.2524881784594836 + }, + { + "epoch": 0.7710348871298739, + "grad_norm": 1.2848425923343314, + "learning_rate": 0.0003854044548651817, + "loss": 3.476231575012207, + "step": 1315, + "token_acc": 0.2532608206872654 + }, + { + "epoch": 0.771621225447083, + "grad_norm": 0.9618716502650655, + "learning_rate": 0.00038569753810082063, + "loss": 3.4800541400909424, + "step": 1316, + "token_acc": 0.25382663269231265 + }, + { + "epoch": 0.772207563764292, + "grad_norm": 1.3026909008185643, + "learning_rate": 0.00038599062133645956, + "loss": 3.4334373474121094, + "step": 1317, + "token_acc": 0.25964722030133486 + }, + { + "epoch": 0.772793902081501, + "grad_norm": 0.8279225536912802, + "learning_rate": 0.0003862837045720985, + "loss": 3.4873340129852295, + "step": 1318, + "token_acc": 0.2507323710680078 + }, + { + "epoch": 0.77338024039871, + "grad_norm": 1.1738047788359856, + "learning_rate": 0.0003865767878077374, + "loss": 3.5061237812042236, + "step": 1319, + "token_acc": 0.24763991996363882 + }, + { + "epoch": 0.7739665787159191, + "grad_norm": 0.9826258911228014, + "learning_rate": 0.00038686987104337634, + "loss": 3.472834348678589, + "step": 1320, + "token_acc": 0.2559736420741471 + }, + { + "epoch": 0.7745529170331281, + "grad_norm": 0.9680921098301252, + "learning_rate": 0.00038716295427901527, + "loss": 3.4527368545532227, + "step": 1321, + "token_acc": 0.25550909827847346 + }, + { + "epoch": 0.7751392553503371, + "grad_norm": 0.9599234070876219, + "learning_rate": 0.0003874560375146542, + "loss": 3.490530014038086, + "step": 1322, + "token_acc": 0.25294146038011067 + }, + { + "epoch": 0.7757255936675461, + "grad_norm": 0.8859629164520336, + "learning_rate": 0.0003877491207502931, + "loss": 3.4869954586029053, + "step": 1323, + "token_acc": 0.25216506749512 + }, + { + "epoch": 0.7763119319847552, + "grad_norm": 0.8370047156045757, + "learning_rate": 0.000388042203985932, + "loss": 3.4701340198516846, + "step": 1324, + "token_acc": 0.253744326179443 + }, + { + "epoch": 0.7768982703019642, + "grad_norm": 0.809264841428513, + "learning_rate": 0.0003883352872215709, + "loss": 3.455357074737549, + "step": 1325, + "token_acc": 0.2543043215956026 + }, + { + "epoch": 0.7774846086191732, + "grad_norm": 0.8866636587753497, + "learning_rate": 0.00038862837045720985, + "loss": 3.4878439903259277, + "step": 1326, + "token_acc": 0.25365168614321604 + }, + { + "epoch": 0.7780709469363823, + "grad_norm": 1.3516164428753499, + "learning_rate": 0.0003889214536928488, + "loss": 3.512117385864258, + "step": 1327, + "token_acc": 0.24848324910604722 + }, + { + "epoch": 0.7786572852535913, + "grad_norm": 1.0427806512913531, + "learning_rate": 0.0003892145369284877, + "loss": 3.4557085037231445, + "step": 1328, + "token_acc": 0.255793352978092 + }, + { + "epoch": 0.7792436235708003, + "grad_norm": 0.7482853663299864, + "learning_rate": 0.00038950762016412663, + "loss": 3.4501166343688965, + "step": 1329, + "token_acc": 0.255271152831596 + }, + { + "epoch": 0.7798299618880093, + "grad_norm": 0.8558108600578263, + "learning_rate": 0.00038980070339976556, + "loss": 3.510521411895752, + "step": 1330, + "token_acc": 0.2493889149300728 + }, + { + "epoch": 0.7804163002052185, + "grad_norm": 0.8919451203223152, + "learning_rate": 0.00039009378663540443, + "loss": 3.446441888809204, + "step": 1331, + "token_acc": 0.2563788324277697 + }, + { + "epoch": 0.7810026385224275, + "grad_norm": 0.9448816313122309, + "learning_rate": 0.00039038686987104336, + "loss": 3.485175132751465, + "step": 1332, + "token_acc": 0.25235118904892073 + }, + { + "epoch": 0.7815889768396365, + "grad_norm": 1.172294510352743, + "learning_rate": 0.0003906799531066823, + "loss": 3.505655288696289, + "step": 1333, + "token_acc": 0.24697578067057108 + }, + { + "epoch": 0.7821753151568455, + "grad_norm": 1.1952119988026573, + "learning_rate": 0.0003909730363423212, + "loss": 3.543708324432373, + "step": 1334, + "token_acc": 0.24501603072534195 + }, + { + "epoch": 0.7827616534740546, + "grad_norm": 0.8748477516495661, + "learning_rate": 0.00039126611957796014, + "loss": 3.4726717472076416, + "step": 1335, + "token_acc": 0.2526044792701584 + }, + { + "epoch": 0.7833479917912636, + "grad_norm": 0.7946392964916559, + "learning_rate": 0.00039155920281359907, + "loss": 3.4403886795043945, + "step": 1336, + "token_acc": 0.2559458652602249 + }, + { + "epoch": 0.7839343301084726, + "grad_norm": 1.103170369075015, + "learning_rate": 0.000391852286049238, + "loss": 3.428464412689209, + "step": 1337, + "token_acc": 0.2595619474794555 + }, + { + "epoch": 0.7845206684256816, + "grad_norm": 1.194272859704073, + "learning_rate": 0.0003921453692848769, + "loss": 3.503781318664551, + "step": 1338, + "token_acc": 0.25076877320261093 + }, + { + "epoch": 0.7851070067428907, + "grad_norm": 0.8001982418328981, + "learning_rate": 0.00039243845252051585, + "loss": 3.4887337684631348, + "step": 1339, + "token_acc": 0.25095999273663355 + }, + { + "epoch": 0.7856933450600997, + "grad_norm": 1.0203876461531287, + "learning_rate": 0.0003927315357561548, + "loss": 3.4944047927856445, + "step": 1340, + "token_acc": 0.25051094130358376 + }, + { + "epoch": 0.7862796833773087, + "grad_norm": 1.1035535984900533, + "learning_rate": 0.0003930246189917937, + "loss": 3.5003156661987305, + "step": 1341, + "token_acc": 0.2486349386223504 + }, + { + "epoch": 0.7868660216945177, + "grad_norm": 0.8397372201335467, + "learning_rate": 0.0003933177022274326, + "loss": 3.48427152633667, + "step": 1342, + "token_acc": 0.2521773918184088 + }, + { + "epoch": 0.7874523600117268, + "grad_norm": 1.0557629358776, + "learning_rate": 0.0003936107854630715, + "loss": 3.4606246948242188, + "step": 1343, + "token_acc": 0.2532410076816997 + }, + { + "epoch": 0.7880386983289358, + "grad_norm": 1.3835518556883315, + "learning_rate": 0.00039390386869871043, + "loss": 3.527385711669922, + "step": 1344, + "token_acc": 0.24679863927573348 + }, + { + "epoch": 0.7886250366461448, + "grad_norm": 0.7060137875705657, + "learning_rate": 0.00039419695193434936, + "loss": 3.4654507637023926, + "step": 1345, + "token_acc": 0.2550812122556437 + }, + { + "epoch": 0.7892113749633538, + "grad_norm": 0.9843111851801306, + "learning_rate": 0.0003944900351699883, + "loss": 3.464249610900879, + "step": 1346, + "token_acc": 0.25332529093045747 + }, + { + "epoch": 0.7897977132805629, + "grad_norm": 1.1098217910051005, + "learning_rate": 0.0003947831184056272, + "loss": 3.478701114654541, + "step": 1347, + "token_acc": 0.2545163459732028 + }, + { + "epoch": 0.7903840515977719, + "grad_norm": 0.8101602358852716, + "learning_rate": 0.00039507620164126614, + "loss": 3.5002989768981934, + "step": 1348, + "token_acc": 0.24926323643049375 + }, + { + "epoch": 0.7909703899149809, + "grad_norm": 0.9472157344819309, + "learning_rate": 0.00039536928487690507, + "loss": 3.493964672088623, + "step": 1349, + "token_acc": 0.25211640488473575 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 0.8253257653614141, + "learning_rate": 0.000395662368112544, + "loss": 3.4287829399108887, + "step": 1350, + "token_acc": 0.2595739271649969 + }, + { + "epoch": 0.792143066549399, + "grad_norm": 0.9959955467271533, + "learning_rate": 0.00039595545134818287, + "loss": 3.513934850692749, + "step": 1351, + "token_acc": 0.2474573231204237 + }, + { + "epoch": 0.792729404866608, + "grad_norm": 1.3208406456319923, + "learning_rate": 0.0003962485345838218, + "loss": 3.4896159172058105, + "step": 1352, + "token_acc": 0.25308993726811474 + }, + { + "epoch": 0.793315743183817, + "grad_norm": 0.8886095797458569, + "learning_rate": 0.0003965416178194607, + "loss": 3.5137810707092285, + "step": 1353, + "token_acc": 0.2456682044389553 + }, + { + "epoch": 0.7939020815010261, + "grad_norm": 0.9601854114348637, + "learning_rate": 0.00039683470105509965, + "loss": 3.541001081466675, + "step": 1354, + "token_acc": 0.2445393757304402 + }, + { + "epoch": 0.7944884198182351, + "grad_norm": 1.1096799473887897, + "learning_rate": 0.0003971277842907386, + "loss": 3.463557720184326, + "step": 1355, + "token_acc": 0.2540497488688722 + }, + { + "epoch": 0.7950747581354441, + "grad_norm": 1.1615019689632204, + "learning_rate": 0.0003974208675263775, + "loss": 3.458465099334717, + "step": 1356, + "token_acc": 0.2565389893434878 + }, + { + "epoch": 0.7956610964526531, + "grad_norm": 0.8567150855293202, + "learning_rate": 0.00039771395076201643, + "loss": 3.3802685737609863, + "step": 1357, + "token_acc": 0.26693192402170957 + }, + { + "epoch": 0.7962474347698623, + "grad_norm": 0.8819941643727316, + "learning_rate": 0.00039800703399765536, + "loss": 3.535536050796509, + "step": 1358, + "token_acc": 0.24338467976044145 + }, + { + "epoch": 0.7968337730870713, + "grad_norm": 0.8674153197792284, + "learning_rate": 0.0003983001172332943, + "loss": 3.4684572219848633, + "step": 1359, + "token_acc": 0.25408120339091084 + }, + { + "epoch": 0.7974201114042803, + "grad_norm": 0.9884503667344221, + "learning_rate": 0.00039859320046893316, + "loss": 3.4723939895629883, + "step": 1360, + "token_acc": 0.2526795454372077 + }, + { + "epoch": 0.7980064497214893, + "grad_norm": 1.2055938752026887, + "learning_rate": 0.0003988862837045721, + "loss": 3.4520530700683594, + "step": 1361, + "token_acc": 0.25510560691292317 + }, + { + "epoch": 0.7985927880386984, + "grad_norm": 0.8980384725050254, + "learning_rate": 0.000399179366940211, + "loss": 3.3998494148254395, + "step": 1362, + "token_acc": 0.2651210777998559 + }, + { + "epoch": 0.7991791263559074, + "grad_norm": 0.9689317948322111, + "learning_rate": 0.00039947245017584994, + "loss": 3.4836082458496094, + "step": 1363, + "token_acc": 0.25241331719804044 + }, + { + "epoch": 0.7997654646731164, + "grad_norm": 0.913257659928347, + "learning_rate": 0.00039976553341148887, + "loss": 3.4455323219299316, + "step": 1364, + "token_acc": 0.2559823218270327 + }, + { + "epoch": 0.8003518029903254, + "grad_norm": 0.8098367705266915, + "learning_rate": 0.0004000586166471278, + "loss": 3.461963176727295, + "step": 1365, + "token_acc": 0.2544156604817866 + }, + { + "epoch": 0.8009381413075345, + "grad_norm": 0.842508688137061, + "learning_rate": 0.0004003516998827667, + "loss": 3.472806930541992, + "step": 1366, + "token_acc": 0.2523059396304877 + }, + { + "epoch": 0.8015244796247435, + "grad_norm": 0.9386681215326841, + "learning_rate": 0.00040064478311840565, + "loss": 3.4677324295043945, + "step": 1367, + "token_acc": 0.25425463395509756 + }, + { + "epoch": 0.8021108179419525, + "grad_norm": 1.1981374414805317, + "learning_rate": 0.0004009378663540446, + "loss": 3.4789719581604004, + "step": 1368, + "token_acc": 0.2501066035961829 + }, + { + "epoch": 0.8026971562591615, + "grad_norm": 1.0594723685365937, + "learning_rate": 0.0004012309495896835, + "loss": 3.4348092079162598, + "step": 1369, + "token_acc": 0.2579946893043786 + }, + { + "epoch": 0.8032834945763706, + "grad_norm": 1.0776821177607765, + "learning_rate": 0.00040152403282532243, + "loss": 3.445159912109375, + "step": 1370, + "token_acc": 0.2551865284974093 + }, + { + "epoch": 0.8038698328935796, + "grad_norm": 0.9232657095786017, + "learning_rate": 0.0004018171160609613, + "loss": 3.475691318511963, + "step": 1371, + "token_acc": 0.25365587060339106 + }, + { + "epoch": 0.8044561712107886, + "grad_norm": 0.7796945633236816, + "learning_rate": 0.00040211019929660023, + "loss": 3.472425937652588, + "step": 1372, + "token_acc": 0.25509974501274935 + }, + { + "epoch": 0.8050425095279976, + "grad_norm": 0.8640813817278077, + "learning_rate": 0.00040240328253223916, + "loss": 3.4655838012695312, + "step": 1373, + "token_acc": 0.2542636992101896 + }, + { + "epoch": 0.8056288478452067, + "grad_norm": 0.8537053087070122, + "learning_rate": 0.0004026963657678781, + "loss": 3.4734272956848145, + "step": 1374, + "token_acc": 0.251195033521703 + }, + { + "epoch": 0.8062151861624157, + "grad_norm": 1.0435184273580218, + "learning_rate": 0.000402989449003517, + "loss": 3.4575772285461426, + "step": 1375, + "token_acc": 0.25561145250419226 + }, + { + "epoch": 0.8068015244796247, + "grad_norm": 1.336303247311034, + "learning_rate": 0.00040328253223915594, + "loss": 3.4576003551483154, + "step": 1376, + "token_acc": 0.253703814041205 + }, + { + "epoch": 0.8073878627968337, + "grad_norm": 0.8379482737762568, + "learning_rate": 0.00040357561547479487, + "loss": 3.4817376136779785, + "step": 1377, + "token_acc": 0.2514983959729793 + }, + { + "epoch": 0.8079742011140428, + "grad_norm": 1.0769148677468892, + "learning_rate": 0.00040386869871043374, + "loss": 3.484868288040161, + "step": 1378, + "token_acc": 0.2522101894004133 + }, + { + "epoch": 0.8085605394312518, + "grad_norm": 1.1363569453563487, + "learning_rate": 0.00040416178194607267, + "loss": 3.4506750106811523, + "step": 1379, + "token_acc": 0.2532601789190225 + }, + { + "epoch": 0.8091468777484608, + "grad_norm": 1.2384835183434617, + "learning_rate": 0.0004044548651817116, + "loss": 3.5003504753112793, + "step": 1380, + "token_acc": 0.2497182533496546 + }, + { + "epoch": 0.8097332160656698, + "grad_norm": 0.8918144598700374, + "learning_rate": 0.0004047479484173505, + "loss": 3.481995105743408, + "step": 1381, + "token_acc": 0.25035134403870957 + }, + { + "epoch": 0.810319554382879, + "grad_norm": 0.8508959591167338, + "learning_rate": 0.00040504103165298945, + "loss": 3.4822134971618652, + "step": 1382, + "token_acc": 0.25215677456051006 + }, + { + "epoch": 0.810905892700088, + "grad_norm": 0.9730378953300497, + "learning_rate": 0.0004053341148886284, + "loss": 3.4935007095336914, + "step": 1383, + "token_acc": 0.2493882045317992 + }, + { + "epoch": 0.811492231017297, + "grad_norm": 1.205260522557191, + "learning_rate": 0.0004056271981242673, + "loss": 3.460737705230713, + "step": 1384, + "token_acc": 0.2547436127439082 + }, + { + "epoch": 0.8120785693345061, + "grad_norm": 1.0319250304632726, + "learning_rate": 0.00040592028135990623, + "loss": 3.44663667678833, + "step": 1385, + "token_acc": 0.25536034870685265 + }, + { + "epoch": 0.8126649076517151, + "grad_norm": 1.178763445900931, + "learning_rate": 0.00040621336459554516, + "loss": 3.4629967212677, + "step": 1386, + "token_acc": 0.2543632846032621 + }, + { + "epoch": 0.8132512459689241, + "grad_norm": 0.777232254189139, + "learning_rate": 0.0004065064478311841, + "loss": 3.436275005340576, + "step": 1387, + "token_acc": 0.25696680590106463 + }, + { + "epoch": 0.8138375842861331, + "grad_norm": 0.8379848816516873, + "learning_rate": 0.000406799531066823, + "loss": 3.513934850692749, + "step": 1388, + "token_acc": 0.2472319473971217 + }, + { + "epoch": 0.8144239226033422, + "grad_norm": 0.8496106230172547, + "learning_rate": 0.00040709261430246194, + "loss": 3.455428123474121, + "step": 1389, + "token_acc": 0.25344041250539323 + }, + { + "epoch": 0.8150102609205512, + "grad_norm": 0.7953816851023453, + "learning_rate": 0.00040738569753810086, + "loss": 3.4240341186523438, + "step": 1390, + "token_acc": 0.2573587462056567 + }, + { + "epoch": 0.8155965992377602, + "grad_norm": 0.7818021752142441, + "learning_rate": 0.0004076787807737398, + "loss": 3.4149389266967773, + "step": 1391, + "token_acc": 0.2580034849312809 + }, + { + "epoch": 0.8161829375549692, + "grad_norm": 0.9059756382908705, + "learning_rate": 0.00040797186400937866, + "loss": 3.4758903980255127, + "step": 1392, + "token_acc": 0.2547208728458244 + }, + { + "epoch": 0.8167692758721783, + "grad_norm": 1.0849325275453237, + "learning_rate": 0.0004082649472450176, + "loss": 3.4370362758636475, + "step": 1393, + "token_acc": 0.25648111728076584 + }, + { + "epoch": 0.8173556141893873, + "grad_norm": 0.9861873957805206, + "learning_rate": 0.0004085580304806565, + "loss": 3.4439001083374023, + "step": 1394, + "token_acc": 0.25485562267322204 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 0.9493362416674663, + "learning_rate": 0.00040885111371629545, + "loss": 3.4527602195739746, + "step": 1395, + "token_acc": 0.2546407189513649 + }, + { + "epoch": 0.8185282908238053, + "grad_norm": 0.890556405471747, + "learning_rate": 0.0004091441969519343, + "loss": 3.433462142944336, + "step": 1396, + "token_acc": 0.2568332558827257 + }, + { + "epoch": 0.8191146291410144, + "grad_norm": 0.8517890219455814, + "learning_rate": 0.00040943728018757325, + "loss": 3.479552745819092, + "step": 1397, + "token_acc": 0.25350896654219446 + }, + { + "epoch": 0.8197009674582234, + "grad_norm": 0.9247260836531138, + "learning_rate": 0.0004097303634232122, + "loss": 3.5245985984802246, + "step": 1398, + "token_acc": 0.24757986321755046 + }, + { + "epoch": 0.8202873057754324, + "grad_norm": 1.0050053466281912, + "learning_rate": 0.0004100234466588511, + "loss": 3.5012197494506836, + "step": 1399, + "token_acc": 0.24878283652233152 + }, + { + "epoch": 0.8208736440926414, + "grad_norm": 0.8551374936045113, + "learning_rate": 0.00041031652989449003, + "loss": 3.4459376335144043, + "step": 1400, + "token_acc": 0.25510156577334303 + }, + { + "epoch": 0.8214599824098505, + "grad_norm": 0.7644142277675621, + "learning_rate": 0.00041060961313012896, + "loss": 3.483003854751587, + "step": 1401, + "token_acc": 0.25132728991888276 + }, + { + "epoch": 0.8220463207270595, + "grad_norm": 0.9172063868011725, + "learning_rate": 0.0004109026963657679, + "loss": 3.4474453926086426, + "step": 1402, + "token_acc": 0.25700021818861885 + }, + { + "epoch": 0.8226326590442685, + "grad_norm": 1.1086408235294434, + "learning_rate": 0.0004111957796014068, + "loss": 3.4551424980163574, + "step": 1403, + "token_acc": 0.2570417779255457 + }, + { + "epoch": 0.8232189973614775, + "grad_norm": 1.0763783036603938, + "learning_rate": 0.00041148886283704574, + "loss": 3.4139785766601562, + "step": 1404, + "token_acc": 0.25992755109015103 + }, + { + "epoch": 0.8238053356786866, + "grad_norm": 0.8374865732826589, + "learning_rate": 0.00041178194607268466, + "loss": 3.4263217449188232, + "step": 1405, + "token_acc": 0.25560226060393637 + }, + { + "epoch": 0.8243916739958956, + "grad_norm": 0.8004204572117098, + "learning_rate": 0.0004120750293083236, + "loss": 3.4471328258514404, + "step": 1406, + "token_acc": 0.2545264302493768 + }, + { + "epoch": 0.8249780123131046, + "grad_norm": 0.9693946951942966, + "learning_rate": 0.0004123681125439625, + "loss": 3.5001187324523926, + "step": 1407, + "token_acc": 0.24848862909995456 + }, + { + "epoch": 0.8255643506303136, + "grad_norm": 0.8964247682417612, + "learning_rate": 0.00041266119577960145, + "loss": 3.394019842147827, + "step": 1408, + "token_acc": 0.26270580001520316 + }, + { + "epoch": 0.8261506889475227, + "grad_norm": 0.8471161278870991, + "learning_rate": 0.00041295427901524037, + "loss": 3.511037588119507, + "step": 1409, + "token_acc": 0.24715160264121688 + }, + { + "epoch": 0.8267370272647317, + "grad_norm": 0.7692320627526633, + "learning_rate": 0.0004132473622508793, + "loss": 3.4227454662323, + "step": 1410, + "token_acc": 0.2576579684690336 + }, + { + "epoch": 0.8273233655819408, + "grad_norm": 0.7436822861973503, + "learning_rate": 0.00041354044548651823, + "loss": 3.4588029384613037, + "step": 1411, + "token_acc": 0.2553178530643319 + }, + { + "epoch": 0.8279097038991499, + "grad_norm": 0.7737460051288806, + "learning_rate": 0.0004138335287221571, + "loss": 3.4498367309570312, + "step": 1412, + "token_acc": 0.25420237332214984 + }, + { + "epoch": 0.8284960422163589, + "grad_norm": 0.8057163774560374, + "learning_rate": 0.000414126611957796, + "loss": 3.406430721282959, + "step": 1413, + "token_acc": 0.260003003469798 + }, + { + "epoch": 0.8290823805335679, + "grad_norm": 0.9261067619289706, + "learning_rate": 0.0004144196951934349, + "loss": 3.4880783557891846, + "step": 1414, + "token_acc": 0.2502901614401702 + }, + { + "epoch": 0.8296687188507769, + "grad_norm": 1.1313316390815347, + "learning_rate": 0.00041471277842907383, + "loss": 3.4647789001464844, + "step": 1415, + "token_acc": 0.25555520575904966 + }, + { + "epoch": 0.830255057167986, + "grad_norm": 0.9775831090958634, + "learning_rate": 0.00041500586166471275, + "loss": 3.4898481369018555, + "step": 1416, + "token_acc": 0.2508996303594658 + }, + { + "epoch": 0.830841395485195, + "grad_norm": 0.9163661926362927, + "learning_rate": 0.0004152989449003517, + "loss": 3.4043221473693848, + "step": 1417, + "token_acc": 0.2600599554364189 + }, + { + "epoch": 0.831427733802404, + "grad_norm": 1.0774927102610308, + "learning_rate": 0.0004155920281359906, + "loss": 3.4807019233703613, + "step": 1418, + "token_acc": 0.25223451491461474 + }, + { + "epoch": 0.832014072119613, + "grad_norm": 1.0666783787457559, + "learning_rate": 0.00041588511137162954, + "loss": 3.4704935550689697, + "step": 1419, + "token_acc": 0.25268332900743545 + }, + { + "epoch": 0.8326004104368221, + "grad_norm": 0.9716982065304188, + "learning_rate": 0.00041617819460726846, + "loss": 3.437676429748535, + "step": 1420, + "token_acc": 0.25658662707352037 + }, + { + "epoch": 0.8331867487540311, + "grad_norm": 0.9443100783668076, + "learning_rate": 0.0004164712778429074, + "loss": 3.4950509071350098, + "step": 1421, + "token_acc": 0.24932606709207758 + }, + { + "epoch": 0.8337730870712401, + "grad_norm": 0.9652461415355784, + "learning_rate": 0.0004167643610785463, + "loss": 3.466236114501953, + "step": 1422, + "token_acc": 0.2525144463949375 + }, + { + "epoch": 0.8343594253884491, + "grad_norm": 0.9398557077942571, + "learning_rate": 0.00041705744431418524, + "loss": 3.4050326347351074, + "step": 1423, + "token_acc": 0.26099609275891816 + }, + { + "epoch": 0.8349457637056582, + "grad_norm": 0.8597226819661934, + "learning_rate": 0.00041735052754982417, + "loss": 3.4048702716827393, + "step": 1424, + "token_acc": 0.2592719197900545 + }, + { + "epoch": 0.8355321020228672, + "grad_norm": 0.914149612924782, + "learning_rate": 0.0004176436107854631, + "loss": 3.408811092376709, + "step": 1425, + "token_acc": 0.26108501467252854 + }, + { + "epoch": 0.8361184403400762, + "grad_norm": 0.891830093648117, + "learning_rate": 0.000417936694021102, + "loss": 3.439952850341797, + "step": 1426, + "token_acc": 0.25521750828760137 + }, + { + "epoch": 0.8367047786572852, + "grad_norm": 0.8167885744609052, + "learning_rate": 0.00041822977725674095, + "loss": 3.4615821838378906, + "step": 1427, + "token_acc": 0.2547976621265537 + }, + { + "epoch": 0.8372911169744943, + "grad_norm": 0.8148129062872108, + "learning_rate": 0.0004185228604923799, + "loss": 3.4843926429748535, + "step": 1428, + "token_acc": 0.2515890409532684 + }, + { + "epoch": 0.8378774552917033, + "grad_norm": 0.7574575552111641, + "learning_rate": 0.0004188159437280188, + "loss": 3.4420738220214844, + "step": 1429, + "token_acc": 0.25465257953143644 + }, + { + "epoch": 0.8384637936089123, + "grad_norm": 0.782761945837116, + "learning_rate": 0.00041910902696365774, + "loss": 3.455498218536377, + "step": 1430, + "token_acc": 0.25486719734471996 + }, + { + "epoch": 0.8390501319261213, + "grad_norm": 0.7577469802258135, + "learning_rate": 0.0004194021101992966, + "loss": 3.459378719329834, + "step": 1431, + "token_acc": 0.2553597919079843 + }, + { + "epoch": 0.8396364702433304, + "grad_norm": 0.790007110800162, + "learning_rate": 0.0004196951934349355, + "loss": 3.434330940246582, + "step": 1432, + "token_acc": 0.25602041218944405 + }, + { + "epoch": 0.8402228085605394, + "grad_norm": 0.88987461672636, + "learning_rate": 0.0004199882766705744, + "loss": 3.4827966690063477, + "step": 1433, + "token_acc": 0.2518430599838126 + }, + { + "epoch": 0.8408091468777484, + "grad_norm": 1.1223352167923297, + "learning_rate": 0.00042028135990621334, + "loss": 3.441190481185913, + "step": 1434, + "token_acc": 0.2541853464425524 + }, + { + "epoch": 0.8413954851949574, + "grad_norm": 1.026808984090443, + "learning_rate": 0.00042057444314185226, + "loss": 3.4398837089538574, + "step": 1435, + "token_acc": 0.25734161951581486 + }, + { + "epoch": 0.8419818235121665, + "grad_norm": 0.9312421803437729, + "learning_rate": 0.0004208675263774912, + "loss": 3.4850120544433594, + "step": 1436, + "token_acc": 0.2504329044141798 + }, + { + "epoch": 0.8425681618293756, + "grad_norm": 1.039285703844888, + "learning_rate": 0.0004211606096131301, + "loss": 3.5089030265808105, + "step": 1437, + "token_acc": 0.24912117602626305 + }, + { + "epoch": 0.8431545001465846, + "grad_norm": 0.9521912847470324, + "learning_rate": 0.00042145369284876904, + "loss": 3.4704322814941406, + "step": 1438, + "token_acc": 0.253545115953876 + }, + { + "epoch": 0.8437408384637937, + "grad_norm": 0.8965030271138373, + "learning_rate": 0.00042174677608440797, + "loss": 3.447204113006592, + "step": 1439, + "token_acc": 0.2533925639064364 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 1.00228246741743, + "learning_rate": 0.0004220398593200469, + "loss": 3.4474263191223145, + "step": 1440, + "token_acc": 0.2554929491203626 + }, + { + "epoch": 0.8449135150982117, + "grad_norm": 1.0662324263822087, + "learning_rate": 0.0004223329425556858, + "loss": 3.463426113128662, + "step": 1441, + "token_acc": 0.2536346183780418 + }, + { + "epoch": 0.8454998534154207, + "grad_norm": 0.6862167973664497, + "learning_rate": 0.00042262602579132475, + "loss": 3.4336538314819336, + "step": 1442, + "token_acc": 0.25574753182869603 + }, + { + "epoch": 0.8460861917326298, + "grad_norm": 0.816118152117597, + "learning_rate": 0.0004229191090269637, + "loss": 3.4890904426574707, + "step": 1443, + "token_acc": 0.24978243118378923 + }, + { + "epoch": 0.8466725300498388, + "grad_norm": 0.7534709349554227, + "learning_rate": 0.0004232121922626026, + "loss": 3.459723472595215, + "step": 1444, + "token_acc": 0.25407042786823175 + }, + { + "epoch": 0.8472588683670478, + "grad_norm": 0.8106012433488445, + "learning_rate": 0.00042350527549824153, + "loss": 3.433746337890625, + "step": 1445, + "token_acc": 0.2556739497379244 + }, + { + "epoch": 0.8478452066842568, + "grad_norm": 0.860812528294037, + "learning_rate": 0.00042379835873388046, + "loss": 3.457742691040039, + "step": 1446, + "token_acc": 0.2537492721769646 + }, + { + "epoch": 0.8484315450014659, + "grad_norm": 0.8339282560506189, + "learning_rate": 0.0004240914419695194, + "loss": 3.4138875007629395, + "step": 1447, + "token_acc": 0.26043569847172365 + }, + { + "epoch": 0.8490178833186749, + "grad_norm": 0.8167831666660748, + "learning_rate": 0.0004243845252051583, + "loss": 3.47696590423584, + "step": 1448, + "token_acc": 0.2512036727484278 + }, + { + "epoch": 0.8496042216358839, + "grad_norm": 0.8640974255530178, + "learning_rate": 0.0004246776084407972, + "loss": 3.447117805480957, + "step": 1449, + "token_acc": 0.25497912729706673 + }, + { + "epoch": 0.8501905599530929, + "grad_norm": 0.9085842442267588, + "learning_rate": 0.0004249706916764361, + "loss": 3.4408185482025146, + "step": 1450, + "token_acc": 0.25575547593737286 + }, + { + "epoch": 0.850776898270302, + "grad_norm": 1.0624063708683804, + "learning_rate": 0.00042526377491207504, + "loss": 3.4095301628112793, + "step": 1451, + "token_acc": 0.259989085806351 + }, + { + "epoch": 0.851363236587511, + "grad_norm": 1.1174616615503816, + "learning_rate": 0.0004255568581477139, + "loss": 3.4574272632598877, + "step": 1452, + "token_acc": 0.2539443758860185 + }, + { + "epoch": 0.85194957490472, + "grad_norm": 0.8601468245880046, + "learning_rate": 0.00042584994138335284, + "loss": 3.4638266563415527, + "step": 1453, + "token_acc": 0.25422001905772484 + }, + { + "epoch": 0.852535913221929, + "grad_norm": 0.7607748812315341, + "learning_rate": 0.00042614302461899177, + "loss": 3.4572696685791016, + "step": 1454, + "token_acc": 0.25261450946577263 + }, + { + "epoch": 0.8531222515391381, + "grad_norm": 0.8406494619883967, + "learning_rate": 0.0004264361078546307, + "loss": 3.4227023124694824, + "step": 1455, + "token_acc": 0.2592655428636565 + }, + { + "epoch": 0.8537085898563471, + "grad_norm": 0.9024436803761858, + "learning_rate": 0.0004267291910902696, + "loss": 3.3479840755462646, + "step": 1456, + "token_acc": 0.26578968745557563 + }, + { + "epoch": 0.8542949281735561, + "grad_norm": 0.8822254387646381, + "learning_rate": 0.00042702227432590855, + "loss": 3.4071271419525146, + "step": 1457, + "token_acc": 0.2608795010722058 + }, + { + "epoch": 0.8548812664907651, + "grad_norm": 0.8309948721357688, + "learning_rate": 0.0004273153575615475, + "loss": 3.4616141319274902, + "step": 1458, + "token_acc": 0.25356811624072545 + }, + { + "epoch": 0.8554676048079742, + "grad_norm": 0.836223798076072, + "learning_rate": 0.0004276084407971864, + "loss": 3.3768367767333984, + "step": 1459, + "token_acc": 0.26454960467478883 + }, + { + "epoch": 0.8560539431251832, + "grad_norm": 0.86669354929888, + "learning_rate": 0.00042790152403282533, + "loss": 3.4410605430603027, + "step": 1460, + "token_acc": 0.25572766977909717 + }, + { + "epoch": 0.8566402814423922, + "grad_norm": 0.9473105411947157, + "learning_rate": 0.00042819460726846426, + "loss": 3.4127588272094727, + "step": 1461, + "token_acc": 0.2586588625850053 + }, + { + "epoch": 0.8572266197596012, + "grad_norm": 0.940028395892319, + "learning_rate": 0.0004284876905041032, + "loss": 3.4587528705596924, + "step": 1462, + "token_acc": 0.2546515619636114 + }, + { + "epoch": 0.8578129580768104, + "grad_norm": 1.0154059126279888, + "learning_rate": 0.0004287807737397421, + "loss": 3.462338447570801, + "step": 1463, + "token_acc": 0.25339021947759327 + }, + { + "epoch": 0.8583992963940194, + "grad_norm": 0.9312404537777714, + "learning_rate": 0.00042907385697538104, + "loss": 3.4368348121643066, + "step": 1464, + "token_acc": 0.25753379824274314 + }, + { + "epoch": 0.8589856347112284, + "grad_norm": 0.7195493711637365, + "learning_rate": 0.00042936694021101997, + "loss": 3.420156478881836, + "step": 1465, + "token_acc": 0.25829490319783993 + }, + { + "epoch": 0.8595719730284375, + "grad_norm": 0.7055490871307102, + "learning_rate": 0.0004296600234466589, + "loss": 3.400454521179199, + "step": 1466, + "token_acc": 0.2606092502516162 + }, + { + "epoch": 0.8601583113456465, + "grad_norm": 0.7829869333068712, + "learning_rate": 0.00042995310668229777, + "loss": 3.456515312194824, + "step": 1467, + "token_acc": 0.2539611791560821 + }, + { + "epoch": 0.8607446496628555, + "grad_norm": 0.8827962992302656, + "learning_rate": 0.0004302461899179367, + "loss": 3.4581494331359863, + "step": 1468, + "token_acc": 0.2515526966220927 + }, + { + "epoch": 0.8613309879800645, + "grad_norm": 0.9745171058837616, + "learning_rate": 0.0004305392731535756, + "loss": 3.4407095909118652, + "step": 1469, + "token_acc": 0.2564492051436503 + }, + { + "epoch": 0.8619173262972736, + "grad_norm": 0.9109063054679837, + "learning_rate": 0.00043083235638921455, + "loss": 3.4063034057617188, + "step": 1470, + "token_acc": 0.25997696356606104 + }, + { + "epoch": 0.8625036646144826, + "grad_norm": 0.9629509262400516, + "learning_rate": 0.0004311254396248535, + "loss": 3.4884450435638428, + "step": 1471, + "token_acc": 0.24953335243505184 + }, + { + "epoch": 0.8630900029316916, + "grad_norm": 0.8275062747254033, + "learning_rate": 0.0004314185228604924, + "loss": 3.4839179515838623, + "step": 1472, + "token_acc": 0.25182774238914624 + }, + { + "epoch": 0.8636763412489006, + "grad_norm": 0.8412194139229128, + "learning_rate": 0.0004317116060961313, + "loss": 3.41888165473938, + "step": 1473, + "token_acc": 0.25964413677764897 + }, + { + "epoch": 0.8642626795661097, + "grad_norm": 0.8805580040254419, + "learning_rate": 0.0004320046893317702, + "loss": 3.4754481315612793, + "step": 1474, + "token_acc": 0.25245035436290786 + }, + { + "epoch": 0.8648490178833187, + "grad_norm": 0.9185386368495354, + "learning_rate": 0.00043229777256740913, + "loss": 3.4386610984802246, + "step": 1475, + "token_acc": 0.254469358034086 + }, + { + "epoch": 0.8654353562005277, + "grad_norm": 0.9583935089343669, + "learning_rate": 0.00043259085580304806, + "loss": 3.4110469818115234, + "step": 1476, + "token_acc": 0.26002842786560226 + }, + { + "epoch": 0.8660216945177367, + "grad_norm": 0.9505998025954556, + "learning_rate": 0.000432883939038687, + "loss": 3.4340872764587402, + "step": 1477, + "token_acc": 0.2556921305440022 + }, + { + "epoch": 0.8666080328349458, + "grad_norm": 0.9200141999600606, + "learning_rate": 0.0004331770222743259, + "loss": 3.4843838214874268, + "step": 1478, + "token_acc": 0.25116328651760256 + }, + { + "epoch": 0.8671943711521548, + "grad_norm": 0.9547306496408687, + "learning_rate": 0.00043347010550996484, + "loss": 3.4228217601776123, + "step": 1479, + "token_acc": 0.2578297792429996 + }, + { + "epoch": 0.8677807094693638, + "grad_norm": 1.05516449975363, + "learning_rate": 0.00043376318874560377, + "loss": 3.4603793621063232, + "step": 1480, + "token_acc": 0.25362284511119887 + }, + { + "epoch": 0.8683670477865728, + "grad_norm": 1.2468264497910304, + "learning_rate": 0.0004340562719812427, + "loss": 3.4523167610168457, + "step": 1481, + "token_acc": 0.2544499288532605 + }, + { + "epoch": 0.8689533861037819, + "grad_norm": 0.8341177163474129, + "learning_rate": 0.0004343493552168816, + "loss": 3.374677896499634, + "step": 1482, + "token_acc": 0.2639230366575195 + }, + { + "epoch": 0.8695397244209909, + "grad_norm": 0.7183986127931309, + "learning_rate": 0.00043464243845252055, + "loss": 3.4278154373168945, + "step": 1483, + "token_acc": 0.2578605254622121 + }, + { + "epoch": 0.8701260627381999, + "grad_norm": 0.7075598297851543, + "learning_rate": 0.0004349355216881594, + "loss": 3.4355833530426025, + "step": 1484, + "token_acc": 0.25609448554135844 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.7943205193572124, + "learning_rate": 0.00043522860492379835, + "loss": 3.4582817554473877, + "step": 1485, + "token_acc": 0.2526775595766982 + }, + { + "epoch": 0.871298739372618, + "grad_norm": 0.6936701984103647, + "learning_rate": 0.0004355216881594373, + "loss": 3.4594743251800537, + "step": 1486, + "token_acc": 0.25172484675349316 + }, + { + "epoch": 0.871885077689827, + "grad_norm": 0.7843735283362575, + "learning_rate": 0.0004358147713950762, + "loss": 3.4104268550872803, + "step": 1487, + "token_acc": 0.2591858239498279 + }, + { + "epoch": 0.872471416007036, + "grad_norm": 0.8385010528910897, + "learning_rate": 0.00043610785463071513, + "loss": 3.4594948291778564, + "step": 1488, + "token_acc": 0.2539525229035631 + }, + { + "epoch": 0.873057754324245, + "grad_norm": 0.8521190273723712, + "learning_rate": 0.00043640093786635406, + "loss": 3.45263409614563, + "step": 1489, + "token_acc": 0.253604147803329 + }, + { + "epoch": 0.8736440926414542, + "grad_norm": 0.7969212495836359, + "learning_rate": 0.000436694021101993, + "loss": 3.451590061187744, + "step": 1490, + "token_acc": 0.25418343173015456 + }, + { + "epoch": 0.8742304309586632, + "grad_norm": 0.7939133493957939, + "learning_rate": 0.0004369871043376319, + "loss": 3.436300754547119, + "step": 1491, + "token_acc": 0.25612652254320517 + }, + { + "epoch": 0.8748167692758722, + "grad_norm": 0.8659457667904533, + "learning_rate": 0.00043728018757327084, + "loss": 3.40575909614563, + "step": 1492, + "token_acc": 0.25953701488782954 + }, + { + "epoch": 0.8754031075930812, + "grad_norm": 1.0481689355984503, + "learning_rate": 0.0004375732708089097, + "loss": 3.437127113342285, + "step": 1493, + "token_acc": 0.25430251710448515 + }, + { + "epoch": 0.8759894459102903, + "grad_norm": 1.1270808248628945, + "learning_rate": 0.00043786635404454864, + "loss": 3.4267630577087402, + "step": 1494, + "token_acc": 0.25789778726985463 + }, + { + "epoch": 0.8765757842274993, + "grad_norm": 0.8821833823317513, + "learning_rate": 0.00043815943728018757, + "loss": 3.4602396488189697, + "step": 1495, + "token_acc": 0.2535968699826976 + }, + { + "epoch": 0.8771621225447083, + "grad_norm": 0.8636231313320814, + "learning_rate": 0.0004384525205158265, + "loss": 3.44230318069458, + "step": 1496, + "token_acc": 0.25648189029124735 + }, + { + "epoch": 0.8777484608619174, + "grad_norm": 0.727782951489715, + "learning_rate": 0.0004387456037514654, + "loss": 3.5044708251953125, + "step": 1497, + "token_acc": 0.2473282688038731 + }, + { + "epoch": 0.8783347991791264, + "grad_norm": 0.7491378264358077, + "learning_rate": 0.00043903868698710435, + "loss": 3.436920642852783, + "step": 1498, + "token_acc": 0.25539466062871885 + }, + { + "epoch": 0.8789211374963354, + "grad_norm": 0.7251735352081701, + "learning_rate": 0.0004393317702227433, + "loss": 3.4543185234069824, + "step": 1499, + "token_acc": 0.2535053640072425 + }, + { + "epoch": 0.8795074758135444, + "grad_norm": 0.694502544977926, + "learning_rate": 0.0004396248534583822, + "loss": 3.4446935653686523, + "step": 1500, + "token_acc": 0.2561802556526069 + }, + { + "epoch": 0.8800938141307535, + "grad_norm": 0.844119397593279, + "learning_rate": 0.00043991793669402113, + "loss": 3.4321775436401367, + "step": 1501, + "token_acc": 0.25758230363777757 + }, + { + "epoch": 0.8806801524479625, + "grad_norm": 0.9564906810529307, + "learning_rate": 0.00044021101992966, + "loss": 3.4222607612609863, + "step": 1502, + "token_acc": 0.2599147049056446 + }, + { + "epoch": 0.8812664907651715, + "grad_norm": 1.064010489593906, + "learning_rate": 0.00044050410316529893, + "loss": 3.4573817253112793, + "step": 1503, + "token_acc": 0.2537067968649621 + }, + { + "epoch": 0.8818528290823805, + "grad_norm": 0.9666213167522902, + "learning_rate": 0.00044079718640093786, + "loss": 3.463282585144043, + "step": 1504, + "token_acc": 0.251973219374716 + }, + { + "epoch": 0.8824391673995896, + "grad_norm": 0.832827453958595, + "learning_rate": 0.0004410902696365768, + "loss": 3.395981788635254, + "step": 1505, + "token_acc": 0.2627781102947247 + }, + { + "epoch": 0.8830255057167986, + "grad_norm": 0.7249847479709868, + "learning_rate": 0.0004413833528722157, + "loss": 3.3865585327148438, + "step": 1506, + "token_acc": 0.26271599965920267 + }, + { + "epoch": 0.8836118440340076, + "grad_norm": 0.895989554590513, + "learning_rate": 0.00044167643610785464, + "loss": 3.464002847671509, + "step": 1507, + "token_acc": 0.25222992929263066 + }, + { + "epoch": 0.8841981823512166, + "grad_norm": 0.9866100257618226, + "learning_rate": 0.00044196951934349357, + "loss": 3.44522762298584, + "step": 1508, + "token_acc": 0.257718978295675 + }, + { + "epoch": 0.8847845206684257, + "grad_norm": 1.1298580544241126, + "learning_rate": 0.0004422626025791325, + "loss": 3.4064228534698486, + "step": 1509, + "token_acc": 0.2585145752112565 + }, + { + "epoch": 0.8853708589856347, + "grad_norm": 1.0633836265481191, + "learning_rate": 0.0004425556858147714, + "loss": 3.380262613296509, + "step": 1510, + "token_acc": 0.26509709960872224 + }, + { + "epoch": 0.8859571973028437, + "grad_norm": 0.967645857665294, + "learning_rate": 0.00044284876905041035, + "loss": 3.468302011489868, + "step": 1511, + "token_acc": 0.25397846532655205 + }, + { + "epoch": 0.8865435356200527, + "grad_norm": 0.7681624013133922, + "learning_rate": 0.0004431418522860493, + "loss": 3.4191842079162598, + "step": 1512, + "token_acc": 0.2582953125921478 + }, + { + "epoch": 0.8871298739372618, + "grad_norm": 0.7604443185612104, + "learning_rate": 0.0004434349355216882, + "loss": 3.4150850772857666, + "step": 1513, + "token_acc": 0.2591268902038133 + }, + { + "epoch": 0.8877162122544708, + "grad_norm": 0.7256366834177113, + "learning_rate": 0.0004437280187573271, + "loss": 3.4862349033355713, + "step": 1514, + "token_acc": 0.24925985120159877 + }, + { + "epoch": 0.8883025505716798, + "grad_norm": 0.7450095216733653, + "learning_rate": 0.000444021101992966, + "loss": 3.471071243286133, + "step": 1515, + "token_acc": 0.25373223246128307 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7552527488203084, + "learning_rate": 0.00044431418522860493, + "loss": 3.425395965576172, + "step": 1516, + "token_acc": 0.2553320911036851 + }, + { + "epoch": 0.889475227206098, + "grad_norm": 0.7276074726692404, + "learning_rate": 0.00044460726846424386, + "loss": 3.427705764770508, + "step": 1517, + "token_acc": 0.25619508471430325 + }, + { + "epoch": 0.890061565523307, + "grad_norm": 0.6958205881846867, + "learning_rate": 0.0004449003516998828, + "loss": 3.4386587142944336, + "step": 1518, + "token_acc": 0.2563137110845808 + }, + { + "epoch": 0.890647903840516, + "grad_norm": 0.6143305518967327, + "learning_rate": 0.0004451934349355217, + "loss": 3.4120535850524902, + "step": 1519, + "token_acc": 0.2598844662414826 + }, + { + "epoch": 0.891234242157725, + "grad_norm": 0.6187609831929747, + "learning_rate": 0.0004454865181711606, + "loss": 3.3950066566467285, + "step": 1520, + "token_acc": 0.2598706229483616 + }, + { + "epoch": 0.8918205804749341, + "grad_norm": 0.6690485881618454, + "learning_rate": 0.0004457796014067995, + "loss": 3.4305474758148193, + "step": 1521, + "token_acc": 0.2563496843329457 + }, + { + "epoch": 0.8924069187921431, + "grad_norm": 0.8080564417157619, + "learning_rate": 0.00044607268464243844, + "loss": 3.457669258117676, + "step": 1522, + "token_acc": 0.2535174039800916 + }, + { + "epoch": 0.8929932571093521, + "grad_norm": 1.0022077962969926, + "learning_rate": 0.00044636576787807737, + "loss": 3.4499759674072266, + "step": 1523, + "token_acc": 0.2530148405089499 + }, + { + "epoch": 0.8935795954265612, + "grad_norm": 1.0991819722787757, + "learning_rate": 0.0004466588511137163, + "loss": 3.4505443572998047, + "step": 1524, + "token_acc": 0.25348414355120064 + }, + { + "epoch": 0.8941659337437702, + "grad_norm": 0.8229506536416142, + "learning_rate": 0.0004469519343493552, + "loss": 3.4218320846557617, + "step": 1525, + "token_acc": 0.2562675979369103 + }, + { + "epoch": 0.8947522720609792, + "grad_norm": 0.9723971810192745, + "learning_rate": 0.00044724501758499415, + "loss": 3.4341561794281006, + "step": 1526, + "token_acc": 0.25721039341525004 + }, + { + "epoch": 0.8953386103781882, + "grad_norm": 1.2789149366603556, + "learning_rate": 0.0004475381008206331, + "loss": 3.3914241790771484, + "step": 1527, + "token_acc": 0.26124540922705675 + }, + { + "epoch": 0.8959249486953973, + "grad_norm": 0.8915035534052372, + "learning_rate": 0.000447831184056272, + "loss": 3.4878382682800293, + "step": 1528, + "token_acc": 0.25029549612514645 + }, + { + "epoch": 0.8965112870126063, + "grad_norm": 1.0395847724431233, + "learning_rate": 0.00044812426729191093, + "loss": 3.4202322959899902, + "step": 1529, + "token_acc": 0.2591754538007044 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.8809901991927357, + "learning_rate": 0.00044841735052754986, + "loss": 3.3865814208984375, + "step": 1530, + "token_acc": 0.2627483866065274 + }, + { + "epoch": 0.8976839636470243, + "grad_norm": 0.8507205426019562, + "learning_rate": 0.0004487104337631888, + "loss": 3.45574951171875, + "step": 1531, + "token_acc": 0.2545419027153741 + }, + { + "epoch": 0.8982703019642334, + "grad_norm": 1.0113115296818116, + "learning_rate": 0.0004490035169988277, + "loss": 3.4781317710876465, + "step": 1532, + "token_acc": 0.25018436530797356 + }, + { + "epoch": 0.8988566402814424, + "grad_norm": 0.9702843157526281, + "learning_rate": 0.00044929660023446664, + "loss": 3.406515598297119, + "step": 1533, + "token_acc": 0.25830583286276015 + }, + { + "epoch": 0.8994429785986514, + "grad_norm": 0.8307863112443473, + "learning_rate": 0.0004495896834701055, + "loss": 3.4074923992156982, + "step": 1534, + "token_acc": 0.26047814003682834 + }, + { + "epoch": 0.9000293169158604, + "grad_norm": 0.6881073759023008, + "learning_rate": 0.00044988276670574444, + "loss": 3.410942792892456, + "step": 1535, + "token_acc": 0.25954212619061484 + }, + { + "epoch": 0.9006156552330695, + "grad_norm": 0.7094587050482819, + "learning_rate": 0.00045017584994138337, + "loss": 3.4270544052124023, + "step": 1536, + "token_acc": 0.25797216805295764 + }, + { + "epoch": 0.9012019935502785, + "grad_norm": 0.6896778192969361, + "learning_rate": 0.0004504689331770223, + "loss": 3.396339178085327, + "step": 1537, + "token_acc": 0.26028072170111394 + }, + { + "epoch": 0.9017883318674875, + "grad_norm": 0.6997777555512175, + "learning_rate": 0.00045076201641266117, + "loss": 3.4065232276916504, + "step": 1538, + "token_acc": 0.2586796314979729 + }, + { + "epoch": 0.9023746701846965, + "grad_norm": 0.7851855432610988, + "learning_rate": 0.0004510550996483001, + "loss": 3.456482410430908, + "step": 1539, + "token_acc": 0.25301366793636565 + }, + { + "epoch": 0.9029610085019056, + "grad_norm": 0.8142782169840991, + "learning_rate": 0.000451348182883939, + "loss": 3.40527081489563, + "step": 1540, + "token_acc": 0.2588093084075203 + }, + { + "epoch": 0.9035473468191146, + "grad_norm": 1.0882979760075386, + "learning_rate": 0.00045164126611957795, + "loss": 3.402641773223877, + "step": 1541, + "token_acc": 0.26020041644976577 + }, + { + "epoch": 0.9041336851363236, + "grad_norm": 0.9696739952497907, + "learning_rate": 0.0004519343493552169, + "loss": 3.4109444618225098, + "step": 1542, + "token_acc": 0.2592243617104094 + }, + { + "epoch": 0.9047200234535326, + "grad_norm": 0.8256579594392981, + "learning_rate": 0.0004522274325908558, + "loss": 3.4110050201416016, + "step": 1543, + "token_acc": 0.2589638542557138 + }, + { + "epoch": 0.9053063617707418, + "grad_norm": 0.9063908896488115, + "learning_rate": 0.00045252051582649473, + "loss": 3.4508819580078125, + "step": 1544, + "token_acc": 0.254906256604824 + }, + { + "epoch": 0.9058927000879508, + "grad_norm": 0.8362884002696215, + "learning_rate": 0.00045281359906213366, + "loss": 3.3710813522338867, + "step": 1545, + "token_acc": 0.26476358848185666 + }, + { + "epoch": 0.9064790384051598, + "grad_norm": 0.8237916840830777, + "learning_rate": 0.0004531066822977726, + "loss": 3.344583034515381, + "step": 1546, + "token_acc": 0.2665647351508241 + }, + { + "epoch": 0.9070653767223688, + "grad_norm": 0.8644669591830477, + "learning_rate": 0.0004533997655334115, + "loss": 3.382384777069092, + "step": 1547, + "token_acc": 0.26273714607242576 + }, + { + "epoch": 0.9076517150395779, + "grad_norm": 0.8158798497352968, + "learning_rate": 0.00045369284876905044, + "loss": 3.448951244354248, + "step": 1548, + "token_acc": 0.25178438082974697 + }, + { + "epoch": 0.9082380533567869, + "grad_norm": 0.82736387667415, + "learning_rate": 0.00045398593200468936, + "loss": 3.3965988159179688, + "step": 1549, + "token_acc": 0.2596030213273226 + }, + { + "epoch": 0.9088243916739959, + "grad_norm": 0.8628189460305075, + "learning_rate": 0.0004542790152403283, + "loss": 3.384303569793701, + "step": 1550, + "token_acc": 0.2631956587598127 + }, + { + "epoch": 0.909410729991205, + "grad_norm": 0.7776277001445487, + "learning_rate": 0.0004545720984759672, + "loss": 3.416213274002075, + "step": 1551, + "token_acc": 0.2589604935669291 + }, + { + "epoch": 0.909997068308414, + "grad_norm": 0.7525587208813943, + "learning_rate": 0.00045486518171160615, + "loss": 3.440732002258301, + "step": 1552, + "token_acc": 0.25442421869902787 + }, + { + "epoch": 0.910583406625623, + "grad_norm": 0.7461404215388849, + "learning_rate": 0.0004551582649472451, + "loss": 3.401362180709839, + "step": 1553, + "token_acc": 0.2611922234960221 + }, + { + "epoch": 0.911169744942832, + "grad_norm": 0.7976664744244445, + "learning_rate": 0.00045545134818288395, + "loss": 3.444603443145752, + "step": 1554, + "token_acc": 0.255203014958997 + }, + { + "epoch": 0.9117560832600411, + "grad_norm": 0.934012421168276, + "learning_rate": 0.0004557444314185228, + "loss": 3.4650862216949463, + "step": 1555, + "token_acc": 0.25218694743818426 + }, + { + "epoch": 0.9123424215772501, + "grad_norm": 1.0056039007466018, + "learning_rate": 0.00045603751465416175, + "loss": 3.4694719314575195, + "step": 1556, + "token_acc": 0.2504398919535937 + }, + { + "epoch": 0.9129287598944591, + "grad_norm": 0.8584646325625831, + "learning_rate": 0.0004563305978898007, + "loss": 3.490917682647705, + "step": 1557, + "token_acc": 0.24916871012055308 + }, + { + "epoch": 0.9135150982116681, + "grad_norm": 0.6892864356013069, + "learning_rate": 0.0004566236811254396, + "loss": 3.4395768642425537, + "step": 1558, + "token_acc": 0.2560468170521397 + }, + { + "epoch": 0.9141014365288772, + "grad_norm": 0.5715343669688692, + "learning_rate": 0.00045691676436107853, + "loss": 3.4453001022338867, + "step": 1559, + "token_acc": 0.2535166352725612 + }, + { + "epoch": 0.9146877748460862, + "grad_norm": 0.5783989184446671, + "learning_rate": 0.00045720984759671746, + "loss": 3.424844741821289, + "step": 1560, + "token_acc": 0.25697604381254147 + }, + { + "epoch": 0.9152741131632952, + "grad_norm": 0.6417535487109237, + "learning_rate": 0.0004575029308323564, + "loss": 3.3383431434631348, + "step": 1561, + "token_acc": 0.26958431265802246 + }, + { + "epoch": 0.9158604514805042, + "grad_norm": 0.7253250977890331, + "learning_rate": 0.0004577960140679953, + "loss": 3.410315752029419, + "step": 1562, + "token_acc": 0.2582153056753575 + }, + { + "epoch": 0.9164467897977133, + "grad_norm": 0.833096932275878, + "learning_rate": 0.00045808909730363424, + "loss": 3.379018545150757, + "step": 1563, + "token_acc": 0.26390519489729186 + }, + { + "epoch": 0.9170331281149223, + "grad_norm": 1.0364196269763921, + "learning_rate": 0.00045838218053927316, + "loss": 3.3636443614959717, + "step": 1564, + "token_acc": 0.26538452806486035 + }, + { + "epoch": 0.9176194664321313, + "grad_norm": 0.9223858474968215, + "learning_rate": 0.0004586752637749121, + "loss": 3.4284346103668213, + "step": 1565, + "token_acc": 0.2564395985746259 + }, + { + "epoch": 0.9182058047493403, + "grad_norm": 0.7095741677287191, + "learning_rate": 0.000458968347010551, + "loss": 3.386415958404541, + "step": 1566, + "token_acc": 0.26226823136235 + }, + { + "epoch": 0.9187921430665494, + "grad_norm": 1.0024116063579434, + "learning_rate": 0.00045926143024618995, + "loss": 3.469064474105835, + "step": 1567, + "token_acc": 0.2506314933995212 + }, + { + "epoch": 0.9193784813837584, + "grad_norm": 1.0187843948557882, + "learning_rate": 0.00045955451348182887, + "loss": 3.4033994674682617, + "step": 1568, + "token_acc": 0.26132159563307067 + }, + { + "epoch": 0.9199648197009674, + "grad_norm": 1.0015692640637548, + "learning_rate": 0.0004598475967174678, + "loss": 3.384068727493286, + "step": 1569, + "token_acc": 0.2616135022319235 + }, + { + "epoch": 0.9205511580181764, + "grad_norm": 0.9776105114346834, + "learning_rate": 0.00046014067995310673, + "loss": 3.4020183086395264, + "step": 1570, + "token_acc": 0.26139094616528225 + }, + { + "epoch": 0.9211374963353856, + "grad_norm": 1.0423668930768366, + "learning_rate": 0.00046043376318874565, + "loss": 3.4187798500061035, + "step": 1571, + "token_acc": 0.2581928344199305 + }, + { + "epoch": 0.9217238346525946, + "grad_norm": 1.095887504601878, + "learning_rate": 0.0004607268464243846, + "loss": 3.4371373653411865, + "step": 1572, + "token_acc": 0.25752831300325135 + }, + { + "epoch": 0.9223101729698036, + "grad_norm": 0.7967860483310408, + "learning_rate": 0.00046101992966002345, + "loss": 3.405745029449463, + "step": 1573, + "token_acc": 0.2594555011050765 + }, + { + "epoch": 0.9228965112870126, + "grad_norm": 0.6371829929420623, + "learning_rate": 0.00046131301289566233, + "loss": 3.423616886138916, + "step": 1574, + "token_acc": 0.25608867302081106 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 0.802053166181579, + "learning_rate": 0.00046160609613130125, + "loss": 3.3841419219970703, + "step": 1575, + "token_acc": 0.2621176312052327 + }, + { + "epoch": 0.9240691879214307, + "grad_norm": 0.7545336693719783, + "learning_rate": 0.0004618991793669402, + "loss": 3.413292407989502, + "step": 1576, + "token_acc": 0.2594243066979091 + }, + { + "epoch": 0.9246555262386397, + "grad_norm": 0.6806272522945446, + "learning_rate": 0.0004621922626025791, + "loss": 3.3866686820983887, + "step": 1577, + "token_acc": 0.26241214668686225 + }, + { + "epoch": 0.9252418645558487, + "grad_norm": 0.6601874188764207, + "learning_rate": 0.00046248534583821804, + "loss": 3.497767448425293, + "step": 1578, + "token_acc": 0.24897872902629145 + }, + { + "epoch": 0.9258282028730578, + "grad_norm": 0.6994873009410468, + "learning_rate": 0.00046277842907385696, + "loss": 3.3819782733917236, + "step": 1579, + "token_acc": 0.2612785056598792 + }, + { + "epoch": 0.9264145411902668, + "grad_norm": 0.9191257119463673, + "learning_rate": 0.0004630715123094959, + "loss": 3.42500901222229, + "step": 1580, + "token_acc": 0.25566361935595155 + }, + { + "epoch": 0.9270008795074758, + "grad_norm": 0.8625670309788941, + "learning_rate": 0.0004633645955451348, + "loss": 3.388427734375, + "step": 1581, + "token_acc": 0.26102203893579606 + }, + { + "epoch": 0.9275872178246849, + "grad_norm": 0.6473263962471194, + "learning_rate": 0.00046365767878077374, + "loss": 3.4127166271209717, + "step": 1582, + "token_acc": 0.2586886488535153 + }, + { + "epoch": 0.9281735561418939, + "grad_norm": 0.6452112309743301, + "learning_rate": 0.00046395076201641267, + "loss": 3.398099422454834, + "step": 1583, + "token_acc": 0.2612349063310302 + }, + { + "epoch": 0.9287598944591029, + "grad_norm": 0.773040487725159, + "learning_rate": 0.0004642438452520516, + "loss": 3.428492546081543, + "step": 1584, + "token_acc": 0.2571181127708332 + }, + { + "epoch": 0.9293462327763119, + "grad_norm": 0.6020016419547088, + "learning_rate": 0.0004645369284876905, + "loss": 3.407637119293213, + "step": 1585, + "token_acc": 0.2573565891472868 + }, + { + "epoch": 0.929932571093521, + "grad_norm": 0.7526683456667859, + "learning_rate": 0.00046483001172332945, + "loss": 3.420018196105957, + "step": 1586, + "token_acc": 0.25794642122337585 + }, + { + "epoch": 0.93051890941073, + "grad_norm": 0.8750191173531543, + "learning_rate": 0.0004651230949589684, + "loss": 3.4087538719177246, + "step": 1587, + "token_acc": 0.2581692218205991 + }, + { + "epoch": 0.931105247727939, + "grad_norm": 0.8565740622312593, + "learning_rate": 0.0004654161781946073, + "loss": 3.459507465362549, + "step": 1588, + "token_acc": 0.25244922372834533 + }, + { + "epoch": 0.931691586045148, + "grad_norm": 0.8451456813762578, + "learning_rate": 0.00046570926143024624, + "loss": 3.4358930587768555, + "step": 1589, + "token_acc": 0.25554337812509925 + }, + { + "epoch": 0.9322779243623571, + "grad_norm": 0.8084681051793079, + "learning_rate": 0.00046600234466588516, + "loss": 3.3934881687164307, + "step": 1590, + "token_acc": 0.26135432532561215 + }, + { + "epoch": 0.9328642626795661, + "grad_norm": 0.8093867823702288, + "learning_rate": 0.00046629542790152404, + "loss": 3.4133834838867188, + "step": 1591, + "token_acc": 0.25900633085604186 + }, + { + "epoch": 0.9334506009967751, + "grad_norm": 0.7827236144484844, + "learning_rate": 0.00046658851113716296, + "loss": 3.4193758964538574, + "step": 1592, + "token_acc": 0.2549593554643174 + }, + { + "epoch": 0.9340369393139841, + "grad_norm": 0.8614505004918908, + "learning_rate": 0.0004668815943728019, + "loss": 3.3967010974884033, + "step": 1593, + "token_acc": 0.2623547238108266 + }, + { + "epoch": 0.9346232776311932, + "grad_norm": 0.838653188362184, + "learning_rate": 0.0004671746776084408, + "loss": 3.364828586578369, + "step": 1594, + "token_acc": 0.26203081095253355 + }, + { + "epoch": 0.9352096159484022, + "grad_norm": 0.8650187103391519, + "learning_rate": 0.0004674677608440797, + "loss": 3.4230520725250244, + "step": 1595, + "token_acc": 0.2580488773476161 + }, + { + "epoch": 0.9357959542656112, + "grad_norm": 0.8085338404694958, + "learning_rate": 0.0004677608440797186, + "loss": 3.353464126586914, + "step": 1596, + "token_acc": 0.2656242771892492 + }, + { + "epoch": 0.9363822925828202, + "grad_norm": 0.6541015660443199, + "learning_rate": 0.00046805392731535754, + "loss": 3.4348366260528564, + "step": 1597, + "token_acc": 0.25665783815454934 + }, + { + "epoch": 0.9369686309000294, + "grad_norm": 0.7302554032103659, + "learning_rate": 0.00046834701055099647, + "loss": 3.3766796588897705, + "step": 1598, + "token_acc": 0.2622136767327506 + }, + { + "epoch": 0.9375549692172384, + "grad_norm": 0.8012532300872414, + "learning_rate": 0.0004686400937866354, + "loss": 3.395601511001587, + "step": 1599, + "token_acc": 0.26080943990602806 + }, + { + "epoch": 0.9381413075344474, + "grad_norm": 0.7580375437116129, + "learning_rate": 0.0004689331770222743, + "loss": 3.411672353744507, + "step": 1600, + "token_acc": 0.25875652011185996 + }, + { + "epoch": 0.9387276458516564, + "grad_norm": 0.8228498393194645, + "learning_rate": 0.00046922626025791325, + "loss": 3.418936252593994, + "step": 1601, + "token_acc": 0.25610453881558143 + }, + { + "epoch": 0.9393139841688655, + "grad_norm": 0.9847939664936951, + "learning_rate": 0.0004695193434935522, + "loss": 3.364391326904297, + "step": 1602, + "token_acc": 0.2644745540385086 + }, + { + "epoch": 0.9399003224860745, + "grad_norm": 0.8533649591447857, + "learning_rate": 0.0004698124267291911, + "loss": 3.429565668106079, + "step": 1603, + "token_acc": 0.2551882863876285 + }, + { + "epoch": 0.9404866608032835, + "grad_norm": 0.785523459566351, + "learning_rate": 0.00047010550996483003, + "loss": 3.4098591804504395, + "step": 1604, + "token_acc": 0.2579425172409315 + }, + { + "epoch": 0.9410729991204925, + "grad_norm": 0.8813742367554283, + "learning_rate": 0.00047039859320046896, + "loss": 3.455411672592163, + "step": 1605, + "token_acc": 0.25353182237238275 + }, + { + "epoch": 0.9416593374377016, + "grad_norm": 0.8617999188506723, + "learning_rate": 0.0004706916764361079, + "loss": 3.4443020820617676, + "step": 1606, + "token_acc": 0.25336424378668665 + }, + { + "epoch": 0.9422456757549106, + "grad_norm": 0.7926410203151111, + "learning_rate": 0.0004709847596717468, + "loss": 3.3781723976135254, + "step": 1607, + "token_acc": 0.2607925445787291 + }, + { + "epoch": 0.9428320140721196, + "grad_norm": 0.6707672444765512, + "learning_rate": 0.00047127784290738574, + "loss": 3.386352777481079, + "step": 1608, + "token_acc": 0.26079590279484227 + }, + { + "epoch": 0.9434183523893287, + "grad_norm": 0.716145505301023, + "learning_rate": 0.0004715709261430246, + "loss": 3.419933795928955, + "step": 1609, + "token_acc": 0.2577940441651602 + }, + { + "epoch": 0.9440046907065377, + "grad_norm": 0.8430247330819329, + "learning_rate": 0.00047186400937866354, + "loss": 3.432588577270508, + "step": 1610, + "token_acc": 0.25608389217120386 + }, + { + "epoch": 0.9445910290237467, + "grad_norm": 0.9469236448094432, + "learning_rate": 0.00047215709261430247, + "loss": 3.398334503173828, + "step": 1611, + "token_acc": 0.2584604188031361 + }, + { + "epoch": 0.9451773673409557, + "grad_norm": 0.9579249439622376, + "learning_rate": 0.0004724501758499414, + "loss": 3.426422357559204, + "step": 1612, + "token_acc": 0.25801911469080036 + }, + { + "epoch": 0.9457637056581648, + "grad_norm": 0.7695586715347654, + "learning_rate": 0.0004727432590855803, + "loss": 3.4373886585235596, + "step": 1613, + "token_acc": 0.2563144605301693 + }, + { + "epoch": 0.9463500439753738, + "grad_norm": 0.8098633244935107, + "learning_rate": 0.00047303634232121925, + "loss": 3.3671505451202393, + "step": 1614, + "token_acc": 0.26410754023702326 + }, + { + "epoch": 0.9469363822925828, + "grad_norm": 0.6710145475248834, + "learning_rate": 0.0004733294255568581, + "loss": 3.400850772857666, + "step": 1615, + "token_acc": 0.2604837949137091 + }, + { + "epoch": 0.9475227206097918, + "grad_norm": 0.5800965944556575, + "learning_rate": 0.00047362250879249705, + "loss": 3.3928072452545166, + "step": 1616, + "token_acc": 0.262466446691352 + }, + { + "epoch": 0.9481090589270009, + "grad_norm": 0.6754966391847237, + "learning_rate": 0.000473915592028136, + "loss": 3.4092674255371094, + "step": 1617, + "token_acc": 0.25772788868852636 + }, + { + "epoch": 0.9486953972442099, + "grad_norm": 0.6339734969168743, + "learning_rate": 0.0004742086752637749, + "loss": 3.4248099327087402, + "step": 1618, + "token_acc": 0.25686503674689914 + }, + { + "epoch": 0.9492817355614189, + "grad_norm": 0.5920743661390458, + "learning_rate": 0.00047450175849941383, + "loss": 3.396275520324707, + "step": 1619, + "token_acc": 0.2600200771384794 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 0.5970969865219967, + "learning_rate": 0.00047479484173505276, + "loss": 3.44085955619812, + "step": 1620, + "token_acc": 0.25566467214899774 + }, + { + "epoch": 0.950454412195837, + "grad_norm": 0.6481590670443838, + "learning_rate": 0.0004750879249706917, + "loss": 3.433203935623169, + "step": 1621, + "token_acc": 0.2551482914916075 + }, + { + "epoch": 0.951040750513046, + "grad_norm": 0.7167417366934338, + "learning_rate": 0.0004753810082063306, + "loss": 3.3610758781433105, + "step": 1622, + "token_acc": 0.26279211393652363 + }, + { + "epoch": 0.951627088830255, + "grad_norm": 0.7010329218212996, + "learning_rate": 0.00047567409144196954, + "loss": 3.3953728675842285, + "step": 1623, + "token_acc": 0.2593054258914925 + }, + { + "epoch": 0.952213427147464, + "grad_norm": 0.7630775961357231, + "learning_rate": 0.00047596717467760847, + "loss": 3.417132616043091, + "step": 1624, + "token_acc": 0.25767371840583153 + }, + { + "epoch": 0.9527997654646732, + "grad_norm": 0.7898798427939647, + "learning_rate": 0.0004762602579132474, + "loss": 3.4229321479797363, + "step": 1625, + "token_acc": 0.25517000107042065 + }, + { + "epoch": 0.9533861037818822, + "grad_norm": 0.8658687412395234, + "learning_rate": 0.0004765533411488863, + "loss": 3.3890128135681152, + "step": 1626, + "token_acc": 0.2610182408920889 + }, + { + "epoch": 0.9539724420990912, + "grad_norm": 0.954383604278532, + "learning_rate": 0.0004768464243845252, + "loss": 3.296166181564331, + "step": 1627, + "token_acc": 0.2732230694201181 + }, + { + "epoch": 0.9545587804163002, + "grad_norm": 0.871037437062733, + "learning_rate": 0.0004771395076201641, + "loss": 3.403876304626465, + "step": 1628, + "token_acc": 0.2604372744867815 + }, + { + "epoch": 0.9551451187335093, + "grad_norm": 0.6887515036189064, + "learning_rate": 0.00047743259085580305, + "loss": 3.3721160888671875, + "step": 1629, + "token_acc": 0.2629014008691085 + }, + { + "epoch": 0.9557314570507183, + "grad_norm": 0.6502597623294148, + "learning_rate": 0.000477725674091442, + "loss": 3.4297432899475098, + "step": 1630, + "token_acc": 0.2571084158991657 + }, + { + "epoch": 0.9563177953679273, + "grad_norm": 0.736743551503019, + "learning_rate": 0.0004780187573270809, + "loss": 3.3795535564422607, + "step": 1631, + "token_acc": 0.26382471816745073 + }, + { + "epoch": 0.9569041336851363, + "grad_norm": 0.7216480645634892, + "learning_rate": 0.00047831184056271983, + "loss": 3.4060494899749756, + "step": 1632, + "token_acc": 0.25803727355662714 + }, + { + "epoch": 0.9574904720023454, + "grad_norm": 0.688184405778739, + "learning_rate": 0.00047860492379835876, + "loss": 3.405019998550415, + "step": 1633, + "token_acc": 0.2596055868475178 + }, + { + "epoch": 0.9580768103195544, + "grad_norm": 0.7190072608485956, + "learning_rate": 0.0004788980070339977, + "loss": 3.447218894958496, + "step": 1634, + "token_acc": 0.25326800728269544 + }, + { + "epoch": 0.9586631486367634, + "grad_norm": 0.7239985510905795, + "learning_rate": 0.00047919109026963656, + "loss": 3.4083261489868164, + "step": 1635, + "token_acc": 0.2599820172091127 + }, + { + "epoch": 0.9592494869539725, + "grad_norm": 0.8339365726306746, + "learning_rate": 0.0004794841735052755, + "loss": 3.41027569770813, + "step": 1636, + "token_acc": 0.2577975645569752 + }, + { + "epoch": 0.9598358252711815, + "grad_norm": 1.0206311370767058, + "learning_rate": 0.0004797772567409144, + "loss": 3.389923095703125, + "step": 1637, + "token_acc": 0.2631311200951761 + }, + { + "epoch": 0.9604221635883905, + "grad_norm": 1.1153323497999141, + "learning_rate": 0.00048007033997655334, + "loss": 3.37899112701416, + "step": 1638, + "token_acc": 0.2600477920887954 + }, + { + "epoch": 0.9610085019055995, + "grad_norm": 0.9268614356177819, + "learning_rate": 0.00048036342321219227, + "loss": 3.452530860900879, + "step": 1639, + "token_acc": 0.25428895726922934 + }, + { + "epoch": 0.9615948402228086, + "grad_norm": 0.8818451293908526, + "learning_rate": 0.0004806565064478312, + "loss": 3.402853488922119, + "step": 1640, + "token_acc": 0.25927329611212147 + }, + { + "epoch": 0.9621811785400176, + "grad_norm": 0.8675753557854642, + "learning_rate": 0.0004809495896834701, + "loss": 3.4168453216552734, + "step": 1641, + "token_acc": 0.2570313335472142 + }, + { + "epoch": 0.9627675168572266, + "grad_norm": 0.9406236909494683, + "learning_rate": 0.00048124267291910905, + "loss": 3.468832015991211, + "step": 1642, + "token_acc": 0.24978093661766138 + }, + { + "epoch": 0.9633538551744356, + "grad_norm": 0.7194484908773244, + "learning_rate": 0.000481535756154748, + "loss": 3.477181911468506, + "step": 1643, + "token_acc": 0.2500675574631976 + }, + { + "epoch": 0.9639401934916447, + "grad_norm": 0.7158929938451088, + "learning_rate": 0.00048182883939038685, + "loss": 3.4275970458984375, + "step": 1644, + "token_acc": 0.25635618338903954 + }, + { + "epoch": 0.9645265318088537, + "grad_norm": 0.7264616014841921, + "learning_rate": 0.0004821219226260258, + "loss": 3.381517171859741, + "step": 1645, + "token_acc": 0.2620527127574895 + }, + { + "epoch": 0.9651128701260627, + "grad_norm": 0.7003298576904327, + "learning_rate": 0.0004824150058616647, + "loss": 3.4201953411102295, + "step": 1646, + "token_acc": 0.2557165427934221 + }, + { + "epoch": 0.9656992084432717, + "grad_norm": 0.6600357159929129, + "learning_rate": 0.00048270808909730363, + "loss": 3.344597816467285, + "step": 1647, + "token_acc": 0.26677482325630475 + }, + { + "epoch": 0.9662855467604808, + "grad_norm": 0.7301278050131998, + "learning_rate": 0.00048300117233294256, + "loss": 3.395505905151367, + "step": 1648, + "token_acc": 0.2616742383786475 + }, + { + "epoch": 0.9668718850776898, + "grad_norm": 0.7758075543448638, + "learning_rate": 0.0004832942555685815, + "loss": 3.427757501602173, + "step": 1649, + "token_acc": 0.2550704389335196 + }, + { + "epoch": 0.9674582233948988, + "grad_norm": 0.8507861698669322, + "learning_rate": 0.0004835873388042204, + "loss": 3.3657493591308594, + "step": 1650, + "token_acc": 0.2646349945811294 + }, + { + "epoch": 0.9680445617121078, + "grad_norm": 0.8676422858402819, + "learning_rate": 0.00048388042203985934, + "loss": 3.436417579650879, + "step": 1651, + "token_acc": 0.2544621086511265 + }, + { + "epoch": 0.968630900029317, + "grad_norm": 0.809804243949132, + "learning_rate": 0.00048417350527549827, + "loss": 3.365145683288574, + "step": 1652, + "token_acc": 0.2659257237326631 + }, + { + "epoch": 0.969217238346526, + "grad_norm": 0.8750343611634581, + "learning_rate": 0.0004844665885111372, + "loss": 3.426231622695923, + "step": 1653, + "token_acc": 0.25500453264919865 + }, + { + "epoch": 0.969803576663735, + "grad_norm": 0.9551722112847345, + "learning_rate": 0.0004847596717467761, + "loss": 3.3726325035095215, + "step": 1654, + "token_acc": 0.26359752614834325 + }, + { + "epoch": 0.970389914980944, + "grad_norm": 0.6610719693143526, + "learning_rate": 0.00048505275498241505, + "loss": 3.3770828247070312, + "step": 1655, + "token_acc": 0.2626288159179623 + }, + { + "epoch": 0.9709762532981531, + "grad_norm": 0.5935609038606166, + "learning_rate": 0.0004853458382180539, + "loss": 3.385439872741699, + "step": 1656, + "token_acc": 0.26044661184452567 + }, + { + "epoch": 0.9715625916153621, + "grad_norm": 0.7036883488546932, + "learning_rate": 0.00048563892145369285, + "loss": 3.3625690937042236, + "step": 1657, + "token_acc": 0.26399290706164596 + }, + { + "epoch": 0.9721489299325711, + "grad_norm": 0.8974794960322794, + "learning_rate": 0.0004859320046893318, + "loss": 3.41567325592041, + "step": 1658, + "token_acc": 0.25691336367859746 + }, + { + "epoch": 0.9727352682497801, + "grad_norm": 0.8198680084145463, + "learning_rate": 0.0004862250879249707, + "loss": 3.3812437057495117, + "step": 1659, + "token_acc": 0.26185687643982886 + }, + { + "epoch": 0.9733216065669892, + "grad_norm": 0.6649985622623101, + "learning_rate": 0.00048651817116060963, + "loss": 3.3992867469787598, + "step": 1660, + "token_acc": 0.2576827028526882 + }, + { + "epoch": 0.9739079448841982, + "grad_norm": 0.7137361713751447, + "learning_rate": 0.00048681125439624856, + "loss": 3.3787105083465576, + "step": 1661, + "token_acc": 0.26169331429557685 + }, + { + "epoch": 0.9744942832014072, + "grad_norm": 0.637235866239349, + "learning_rate": 0.00048710433763188743, + "loss": 3.364614963531494, + "step": 1662, + "token_acc": 0.2627127032066321 + }, + { + "epoch": 0.9750806215186162, + "grad_norm": 0.5648236779574606, + "learning_rate": 0.00048739742086752636, + "loss": 3.395259380340576, + "step": 1663, + "token_acc": 0.2592245025289073 + }, + { + "epoch": 0.9756669598358253, + "grad_norm": 0.6271310964666041, + "learning_rate": 0.0004876905041031653, + "loss": 3.4439775943756104, + "step": 1664, + "token_acc": 0.25464983302440103 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 0.6723891038383868, + "learning_rate": 0.0004879835873388042, + "loss": 3.3858578205108643, + "step": 1665, + "token_acc": 0.259938133388556 + }, + { + "epoch": 0.9768396364702433, + "grad_norm": 0.6157248971507705, + "learning_rate": 0.00048827667057444314, + "loss": 3.4156439304351807, + "step": 1666, + "token_acc": 0.2571010195871461 + }, + { + "epoch": 0.9774259747874524, + "grad_norm": 0.6192836367967911, + "learning_rate": 0.0004885697538100821, + "loss": 3.386714458465576, + "step": 1667, + "token_acc": 0.26158883688592266 + }, + { + "epoch": 0.9780123131046614, + "grad_norm": 0.7146332695280808, + "learning_rate": 0.000488862837045721, + "loss": 3.3835902214050293, + "step": 1668, + "token_acc": 0.26070445542856086 + }, + { + "epoch": 0.9785986514218704, + "grad_norm": 0.6593723124701049, + "learning_rate": 0.0004891559202813599, + "loss": 3.419379234313965, + "step": 1669, + "token_acc": 0.2575162235246998 + }, + { + "epoch": 0.9791849897390794, + "grad_norm": 0.6899664470717639, + "learning_rate": 0.0004894490035169988, + "loss": 3.4234187602996826, + "step": 1670, + "token_acc": 0.25621284276434536 + }, + { + "epoch": 0.9797713280562885, + "grad_norm": 0.7132113193640718, + "learning_rate": 0.0004897420867526378, + "loss": 3.325669050216675, + "step": 1671, + "token_acc": 0.26915408184724654 + }, + { + "epoch": 0.9803576663734975, + "grad_norm": 0.7689165626854885, + "learning_rate": 0.0004900351699882767, + "loss": 3.434572219848633, + "step": 1672, + "token_acc": 0.25396825396825395 + }, + { + "epoch": 0.9809440046907065, + "grad_norm": 0.7958390660551526, + "learning_rate": 0.0004903282532239156, + "loss": 3.408353805541992, + "step": 1673, + "token_acc": 0.2560165497617441 + }, + { + "epoch": 0.9815303430079155, + "grad_norm": 0.8642436370261324, + "learning_rate": 0.0004906213364595546, + "loss": 3.4102115631103516, + "step": 1674, + "token_acc": 0.2596058029532425 + }, + { + "epoch": 0.9821166813251246, + "grad_norm": 0.8545379790654077, + "learning_rate": 0.0004909144196951935, + "loss": 3.4542784690856934, + "step": 1675, + "token_acc": 0.25245472795165436 + }, + { + "epoch": 0.9827030196423336, + "grad_norm": 0.6886905204726381, + "learning_rate": 0.0004912075029308324, + "loss": 3.367812395095825, + "step": 1676, + "token_acc": 0.26391417341987683 + }, + { + "epoch": 0.9832893579595426, + "grad_norm": 0.7189864992837707, + "learning_rate": 0.0004915005861664713, + "loss": 3.4203402996063232, + "step": 1677, + "token_acc": 0.25541509248683375 + }, + { + "epoch": 0.9838756962767516, + "grad_norm": 0.8062820180700397, + "learning_rate": 0.0004917936694021103, + "loss": 3.4165563583374023, + "step": 1678, + "token_acc": 0.2579511418581393 + }, + { + "epoch": 0.9844620345939608, + "grad_norm": 0.7328879590792131, + "learning_rate": 0.0004920867526377492, + "loss": 3.4020133018493652, + "step": 1679, + "token_acc": 0.25841710397401657 + }, + { + "epoch": 0.9850483729111698, + "grad_norm": 0.744101512580751, + "learning_rate": 0.000492379835873388, + "loss": 3.381073236465454, + "step": 1680, + "token_acc": 0.2624654101185193 + }, + { + "epoch": 0.9856347112283788, + "grad_norm": 0.8031669352623175, + "learning_rate": 0.0004926729191090269, + "loss": 3.4260525703430176, + "step": 1681, + "token_acc": 0.2569296923622934 + }, + { + "epoch": 0.9862210495455878, + "grad_norm": 0.7796885413825525, + "learning_rate": 0.0004929660023446659, + "loss": 3.376763343811035, + "step": 1682, + "token_acc": 0.26230354358909536 + }, + { + "epoch": 0.9868073878627969, + "grad_norm": 0.765091477223617, + "learning_rate": 0.0004932590855803048, + "loss": 3.430267333984375, + "step": 1683, + "token_acc": 0.2569303674269375 + }, + { + "epoch": 0.9873937261800059, + "grad_norm": 0.8131342256736485, + "learning_rate": 0.0004935521688159437, + "loss": 3.404228925704956, + "step": 1684, + "token_acc": 0.2591612865042625 + }, + { + "epoch": 0.9879800644972149, + "grad_norm": 0.6887164853520207, + "learning_rate": 0.0004938452520515826, + "loss": 3.395916223526001, + "step": 1685, + "token_acc": 0.2582110421430905 + }, + { + "epoch": 0.9885664028144239, + "grad_norm": 0.683908218051115, + "learning_rate": 0.0004941383352872216, + "loss": 3.386446952819824, + "step": 1686, + "token_acc": 0.2630170465612051 + }, + { + "epoch": 0.989152741131633, + "grad_norm": 0.82379347470404, + "learning_rate": 0.0004944314185228605, + "loss": 3.3611955642700195, + "step": 1687, + "token_acc": 0.262778975790798 + }, + { + "epoch": 0.989739079448842, + "grad_norm": 0.7358622119024149, + "learning_rate": 0.0004947245017584994, + "loss": 3.413051128387451, + "step": 1688, + "token_acc": 0.25815074871289884 + }, + { + "epoch": 0.990325417766051, + "grad_norm": 0.5960150507073257, + "learning_rate": 0.0004950175849941384, + "loss": 3.4064674377441406, + "step": 1689, + "token_acc": 0.2595074463454207 + }, + { + "epoch": 0.99091175608326, + "grad_norm": 0.5853059987316303, + "learning_rate": 0.0004953106682297773, + "loss": 3.3748228549957275, + "step": 1690, + "token_acc": 0.26273726540838127 + }, + { + "epoch": 0.9914980944004691, + "grad_norm": 0.708395783955718, + "learning_rate": 0.0004956037514654162, + "loss": 3.39245867729187, + "step": 1691, + "token_acc": 0.26029490391772464 + }, + { + "epoch": 0.9920844327176781, + "grad_norm": 0.7771210564653088, + "learning_rate": 0.0004958968347010551, + "loss": 3.4332962036132812, + "step": 1692, + "token_acc": 0.2545539516316423 + }, + { + "epoch": 0.9926707710348871, + "grad_norm": 0.7487859833065365, + "learning_rate": 0.0004961899179366941, + "loss": 3.3970022201538086, + "step": 1693, + "token_acc": 0.2584394604345602 + }, + { + "epoch": 0.9932571093520962, + "grad_norm": 0.7307276119763373, + "learning_rate": 0.000496483001172333, + "loss": 3.402738094329834, + "step": 1694, + "token_acc": 0.2589121818800597 + }, + { + "epoch": 0.9938434476693052, + "grad_norm": 0.6769403752420412, + "learning_rate": 0.0004967760844079719, + "loss": 3.3789596557617188, + "step": 1695, + "token_acc": 0.2631026635090789 + }, + { + "epoch": 0.9944297859865142, + "grad_norm": 0.7209543302853496, + "learning_rate": 0.0004970691676436108, + "loss": 3.346503734588623, + "step": 1696, + "token_acc": 0.2656507929735107 + }, + { + "epoch": 0.9950161243037232, + "grad_norm": 0.7222250086788583, + "learning_rate": 0.0004973622508792498, + "loss": 3.394810676574707, + "step": 1697, + "token_acc": 0.25921117592203186 + }, + { + "epoch": 0.9956024626209323, + "grad_norm": 0.7763937070218895, + "learning_rate": 0.0004976553341148886, + "loss": 3.4150874614715576, + "step": 1698, + "token_acc": 0.25711403527255106 + }, + { + "epoch": 0.9961888009381413, + "grad_norm": 0.8701727333777808, + "learning_rate": 0.0004979484173505275, + "loss": 3.4142632484436035, + "step": 1699, + "token_acc": 0.2567015464493374 + }, + { + "epoch": 0.9967751392553503, + "grad_norm": 0.8057586007862209, + "learning_rate": 0.0004982415005861664, + "loss": 3.3474903106689453, + "step": 1700, + "token_acc": 0.2655471289274106 + }, + { + "epoch": 0.9973614775725593, + "grad_norm": 0.8036537891855855, + "learning_rate": 0.0004985345838218054, + "loss": 3.364363670349121, + "step": 1701, + "token_acc": 0.2631264529032091 + }, + { + "epoch": 0.9979478158897684, + "grad_norm": 0.7338803456436668, + "learning_rate": 0.0004988276670574443, + "loss": 3.3639016151428223, + "step": 1702, + "token_acc": 0.26256075722691224 + }, + { + "epoch": 0.9985341542069774, + "grad_norm": 0.7603604772565937, + "learning_rate": 0.0004991207502930832, + "loss": 3.407386064529419, + "step": 1703, + "token_acc": 0.26093307558615664 + }, + { + "epoch": 0.9991204925241864, + "grad_norm": 0.670563794799557, + "learning_rate": 0.0004994138335287222, + "loss": 3.3578433990478516, + "step": 1704, + "token_acc": 0.2657139079783387 + }, + { + "epoch": 0.9997068308413954, + "grad_norm": 0.7924571457431127, + "learning_rate": 0.0004997069167643611, + "loss": 3.4050941467285156, + "step": 1705, + "token_acc": 0.2587272240085745 + }, + { + "epoch": 1.0, + "grad_norm": 0.7857934700198668, + "learning_rate": 0.0005, + "loss": 3.3149847984313965, + "step": 1706, + "token_acc": 0.27139554777083036 + }, + { + "epoch": 1.0, + "eval_loss": 3.363898515701294, + "eval_runtime": 6.3304, + "eval_samples_per_second": 40.44, + "eval_steps_per_second": 5.055, + "eval_token_acc": 0.26314017610325324, + "step": 1706 + }, + { + "epoch": 1.0005863383172091, + "grad_norm": 0.6748829955942852, + "learning_rate": 0.0004999999988257934, + "loss": 3.3930788040161133, + "step": 1707, + "token_acc": 0.2604787022109171 + }, + { + "epoch": 1.001172676634418, + "grad_norm": 0.6337504746544477, + "learning_rate": 0.0004999999953031737, + "loss": 3.3502395153045654, + "step": 1708, + "token_acc": 0.2652086723321074 + }, + { + "epoch": 1.0017590149516271, + "grad_norm": 0.6047408353877396, + "learning_rate": 0.000499999989432141, + "loss": 3.381061553955078, + "step": 1709, + "token_acc": 0.2618050757872131 + }, + { + "epoch": 1.0023453532688362, + "grad_norm": 0.7720425176929202, + "learning_rate": 0.0004999999812126953, + "loss": 3.3666725158691406, + "step": 1710, + "token_acc": 0.263822899024266 + }, + { + "epoch": 1.0029316915860451, + "grad_norm": 0.7813815613616496, + "learning_rate": 0.0004999999706448365, + "loss": 3.3831634521484375, + "step": 1711, + "token_acc": 0.2616015886255239 + }, + { + "epoch": 1.0035180299032542, + "grad_norm": 0.8105809824926559, + "learning_rate": 0.000499999957728565, + "loss": 3.475907802581787, + "step": 1712, + "token_acc": 0.2525038961038961 + }, + { + "epoch": 1.0041043682204631, + "grad_norm": 0.749777924398785, + "learning_rate": 0.0004999999424638807, + "loss": 3.3729724884033203, + "step": 1713, + "token_acc": 0.26232628973919664 + }, + { + "epoch": 1.0046907065376722, + "grad_norm": 0.7139398521142178, + "learning_rate": 0.0004999999248507838, + "loss": 3.368234634399414, + "step": 1714, + "token_acc": 0.26018118033426374 + }, + { + "epoch": 1.0052770448548813, + "grad_norm": 0.7397609215524301, + "learning_rate": 0.0004999999048892746, + "loss": 3.3365745544433594, + "step": 1715, + "token_acc": 0.2675504364707075 + }, + { + "epoch": 1.0058633831720902, + "grad_norm": 0.6976681068583725, + "learning_rate": 0.0004999998825793531, + "loss": 3.3686835765838623, + "step": 1716, + "token_acc": 0.26274320375664184 + }, + { + "epoch": 1.0064497214892993, + "grad_norm": 0.7475038689031207, + "learning_rate": 0.0004999998579210196, + "loss": 3.388590097427368, + "step": 1717, + "token_acc": 0.26067491111944596 + }, + { + "epoch": 1.0070360598065085, + "grad_norm": 0.7938229829548396, + "learning_rate": 0.0004999998309142742, + "loss": 3.3868818283081055, + "step": 1718, + "token_acc": 0.2588959239199532 + }, + { + "epoch": 1.0076223981237173, + "grad_norm": 0.7568816544778835, + "learning_rate": 0.0004999998015591174, + "loss": 3.3844709396362305, + "step": 1719, + "token_acc": 0.2609687949898292 + }, + { + "epoch": 1.0082087364409265, + "grad_norm": 0.8196030834242516, + "learning_rate": 0.0004999997698555493, + "loss": 3.3978371620178223, + "step": 1720, + "token_acc": 0.2597024001621623 + }, + { + "epoch": 1.0087950747581353, + "grad_norm": 0.767341805870375, + "learning_rate": 0.0004999997358035703, + "loss": 3.399653911590576, + "step": 1721, + "token_acc": 0.2593581171436213 + }, + { + "epoch": 1.0093814130753445, + "grad_norm": 0.6071678324713047, + "learning_rate": 0.0004999996994031805, + "loss": 3.3867225646972656, + "step": 1722, + "token_acc": 0.26138374753513005 + }, + { + "epoch": 1.0099677513925536, + "grad_norm": 0.6668094674908489, + "learning_rate": 0.0004999996606543806, + "loss": 3.330446720123291, + "step": 1723, + "token_acc": 0.2656608654059413 + }, + { + "epoch": 1.0105540897097625, + "grad_norm": 0.7300729028364705, + "learning_rate": 0.0004999996195571706, + "loss": 3.4130914211273193, + "step": 1724, + "token_acc": 0.25771057544115755 + }, + { + "epoch": 1.0111404280269716, + "grad_norm": 0.7227177106444123, + "learning_rate": 0.0004999995761115511, + "loss": 3.379822254180908, + "step": 1725, + "token_acc": 0.26179503766392764 + }, + { + "epoch": 1.0117267663441807, + "grad_norm": 0.7187443263441782, + "learning_rate": 0.0004999995303175225, + "loss": 3.3946266174316406, + "step": 1726, + "token_acc": 0.25713018837152407 + }, + { + "epoch": 1.0123131046613896, + "grad_norm": 0.6451969883365569, + "learning_rate": 0.0004999994821750852, + "loss": 3.3877639770507812, + "step": 1727, + "token_acc": 0.25795602563434583 + }, + { + "epoch": 1.0128994429785987, + "grad_norm": 0.6142002800653262, + "learning_rate": 0.0004999994316842397, + "loss": 3.412135124206543, + "step": 1728, + "token_acc": 0.25619754965946645 + }, + { + "epoch": 1.0134857812958076, + "grad_norm": 0.5043742488623404, + "learning_rate": 0.0004999993788449863, + "loss": 3.360666275024414, + "step": 1729, + "token_acc": 0.2632796036634787 + }, + { + "epoch": 1.0140721196130167, + "grad_norm": 0.5947335928966146, + "learning_rate": 0.0004999993236573257, + "loss": 3.393095016479492, + "step": 1730, + "token_acc": 0.25890385388924964 + }, + { + "epoch": 1.0146584579302258, + "grad_norm": 0.6515982104663784, + "learning_rate": 0.0004999992661212583, + "loss": 3.325471878051758, + "step": 1731, + "token_acc": 0.26492156221913554 + }, + { + "epoch": 1.0152447962474347, + "grad_norm": 0.5735870934063698, + "learning_rate": 0.0004999992062367846, + "loss": 3.331839084625244, + "step": 1732, + "token_acc": 0.2659748888405879 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.6803176211640519, + "learning_rate": 0.0004999991440039054, + "loss": 3.292379856109619, + "step": 1733, + "token_acc": 0.271922122043643 + }, + { + "epoch": 1.016417472881853, + "grad_norm": 0.7130285588326162, + "learning_rate": 0.0004999990794226209, + "loss": 3.376286506652832, + "step": 1734, + "token_acc": 0.2602660452968364 + }, + { + "epoch": 1.0170038111990618, + "grad_norm": 0.7155190872456177, + "learning_rate": 0.000499999012492932, + "loss": 3.3886499404907227, + "step": 1735, + "token_acc": 0.2611530302206128 + }, + { + "epoch": 1.017590149516271, + "grad_norm": 0.667956458513744, + "learning_rate": 0.0004999989432148393, + "loss": 3.348695755004883, + "step": 1736, + "token_acc": 0.2645891064131046 + }, + { + "epoch": 1.01817648783348, + "grad_norm": 0.6093231373645646, + "learning_rate": 0.0004999988715883435, + "loss": 3.400038242340088, + "step": 1737, + "token_acc": 0.2591881639870949 + }, + { + "epoch": 1.018762826150689, + "grad_norm": 0.5906252342828342, + "learning_rate": 0.000499998797613445, + "loss": 3.351621150970459, + "step": 1738, + "token_acc": 0.26448133972273813 + }, + { + "epoch": 1.019349164467898, + "grad_norm": 0.7274545561441506, + "learning_rate": 0.0004999987212901448, + "loss": 3.3263134956359863, + "step": 1739, + "token_acc": 0.2667792889674464 + }, + { + "epoch": 1.019935502785107, + "grad_norm": 0.701253329179986, + "learning_rate": 0.0004999986426184435, + "loss": 3.3744101524353027, + "step": 1740, + "token_acc": 0.26211195604908416 + }, + { + "epoch": 1.020521841102316, + "grad_norm": 0.6769895553211768, + "learning_rate": 0.0004999985615983418, + "loss": 3.354360818862915, + "step": 1741, + "token_acc": 0.26484586120949755 + }, + { + "epoch": 1.0211081794195251, + "grad_norm": 0.76944864621591, + "learning_rate": 0.0004999984782298404, + "loss": 3.368668556213379, + "step": 1742, + "token_acc": 0.2641847250700224 + }, + { + "epoch": 1.021694517736734, + "grad_norm": 0.7947936730781323, + "learning_rate": 0.0004999983925129403, + "loss": 3.432093620300293, + "step": 1743, + "token_acc": 0.2538197183851945 + }, + { + "epoch": 1.0222808560539431, + "grad_norm": 0.6766084274368989, + "learning_rate": 0.0004999983044476421, + "loss": 3.357813835144043, + "step": 1744, + "token_acc": 0.26502710428389326 + }, + { + "epoch": 1.0228671943711523, + "grad_norm": 0.666759444314078, + "learning_rate": 0.0004999982140339468, + "loss": 3.4198641777038574, + "step": 1745, + "token_acc": 0.25719278747779845 + }, + { + "epoch": 1.0234535326883611, + "grad_norm": 0.6746485385187634, + "learning_rate": 0.0004999981212718551, + "loss": 3.354325771331787, + "step": 1746, + "token_acc": 0.26598047905777833 + }, + { + "epoch": 1.0240398710055703, + "grad_norm": 0.8935724371046717, + "learning_rate": 0.0004999980261613678, + "loss": 3.3523499965667725, + "step": 1747, + "token_acc": 0.2646181423802162 + }, + { + "epoch": 1.0246262093227791, + "grad_norm": 0.9342917892161134, + "learning_rate": 0.0004999979287024861, + "loss": 3.362705707550049, + "step": 1748, + "token_acc": 0.2627683119527913 + }, + { + "epoch": 1.0252125476399883, + "grad_norm": 0.777772479521273, + "learning_rate": 0.0004999978288952106, + "loss": 3.3829448223114014, + "step": 1749, + "token_acc": 0.2608665723354622 + }, + { + "epoch": 1.0257988859571974, + "grad_norm": 0.5551377629841278, + "learning_rate": 0.0004999977267395424, + "loss": 3.368907928466797, + "step": 1750, + "token_acc": 0.26152975499835907 + }, + { + "epoch": 1.0263852242744063, + "grad_norm": 0.559240511469829, + "learning_rate": 0.0004999976222354825, + "loss": 3.3463454246520996, + "step": 1751, + "token_acc": 0.2652679739964385 + }, + { + "epoch": 1.0269715625916154, + "grad_norm": 0.6133721785415677, + "learning_rate": 0.0004999975153830319, + "loss": 3.345900535583496, + "step": 1752, + "token_acc": 0.26380465151198274 + }, + { + "epoch": 1.0275579009088245, + "grad_norm": 0.6599106867148942, + "learning_rate": 0.0004999974061821914, + "loss": 3.3614983558654785, + "step": 1753, + "token_acc": 0.26276796224642235 + }, + { + "epoch": 1.0281442392260334, + "grad_norm": 0.6571596715306742, + "learning_rate": 0.0004999972946329621, + "loss": 3.3581602573394775, + "step": 1754, + "token_acc": 0.2650858522505024 + }, + { + "epoch": 1.0287305775432425, + "grad_norm": 0.709919111647143, + "learning_rate": 0.0004999971807353452, + "loss": 3.3908140659332275, + "step": 1755, + "token_acc": 0.2572123102325368 + }, + { + "epoch": 1.0293169158604514, + "grad_norm": 0.569630564184072, + "learning_rate": 0.0004999970644893416, + "loss": 3.356912612915039, + "step": 1756, + "token_acc": 0.26457645032386196 + }, + { + "epoch": 1.0299032541776605, + "grad_norm": 0.607527295907633, + "learning_rate": 0.0004999969458949524, + "loss": 3.3633108139038086, + "step": 1757, + "token_acc": 0.26201357811674375 + }, + { + "epoch": 1.0304895924948696, + "grad_norm": 0.6859989890742458, + "learning_rate": 0.0004999968249521789, + "loss": 3.333763599395752, + "step": 1758, + "token_acc": 0.26654568139284296 + }, + { + "epoch": 1.0310759308120785, + "grad_norm": 0.6080341757915679, + "learning_rate": 0.000499996701661022, + "loss": 3.3805861473083496, + "step": 1759, + "token_acc": 0.26147884947073846 + }, + { + "epoch": 1.0316622691292876, + "grad_norm": 0.7174563110958333, + "learning_rate": 0.0004999965760214831, + "loss": 3.369626998901367, + "step": 1760, + "token_acc": 0.26097056705461474 + }, + { + "epoch": 1.0322486074464967, + "grad_norm": 0.8150131382092486, + "learning_rate": 0.000499996448033563, + "loss": 3.377744197845459, + "step": 1761, + "token_acc": 0.26109926365987923 + }, + { + "epoch": 1.0328349457637056, + "grad_norm": 1.0471528401807257, + "learning_rate": 0.0004999963176972634, + "loss": 3.3738291263580322, + "step": 1762, + "token_acc": 0.26174884027748224 + }, + { + "epoch": 1.0334212840809147, + "grad_norm": 0.9328608970303516, + "learning_rate": 0.0004999961850125852, + "loss": 3.3788812160491943, + "step": 1763, + "token_acc": 0.2606408762285867 + }, + { + "epoch": 1.0340076223981236, + "grad_norm": 0.7065766123746435, + "learning_rate": 0.0004999960499795296, + "loss": 3.376729965209961, + "step": 1764, + "token_acc": 0.2613145549608382 + }, + { + "epoch": 1.0345939607153327, + "grad_norm": 0.5407566341838578, + "learning_rate": 0.000499995912598098, + "loss": 3.3315343856811523, + "step": 1765, + "token_acc": 0.2660919420627653 + }, + { + "epoch": 1.0351802990325418, + "grad_norm": 0.621189182172432, + "learning_rate": 0.0004999957728682918, + "loss": 3.323065757751465, + "step": 1766, + "token_acc": 0.2681113218883895 + }, + { + "epoch": 1.0357666373497507, + "grad_norm": 0.5422461458584718, + "learning_rate": 0.0004999956307901121, + "loss": 3.3605518341064453, + "step": 1767, + "token_acc": 0.2646861190383962 + }, + { + "epoch": 1.0363529756669598, + "grad_norm": 0.6210377597270093, + "learning_rate": 0.0004999954863635604, + "loss": 3.394392490386963, + "step": 1768, + "token_acc": 0.25997867092162785 + }, + { + "epoch": 1.036939313984169, + "grad_norm": 0.6425091949025237, + "learning_rate": 0.0004999953395886378, + "loss": 3.328765869140625, + "step": 1769, + "token_acc": 0.2654887864050384 + }, + { + "epoch": 1.0375256523013778, + "grad_norm": 0.6271117122961921, + "learning_rate": 0.000499995190465346, + "loss": 3.3462252616882324, + "step": 1770, + "token_acc": 0.2639479486828194 + }, + { + "epoch": 1.038111990618587, + "grad_norm": 0.7426235276624706, + "learning_rate": 0.0004999950389936862, + "loss": 3.375129461288452, + "step": 1771, + "token_acc": 0.26430534033049885 + }, + { + "epoch": 1.038698328935796, + "grad_norm": 0.7971650133320137, + "learning_rate": 0.00049999488517366, + "loss": 3.4017906188964844, + "step": 1772, + "token_acc": 0.2572413774625774 + }, + { + "epoch": 1.039284667253005, + "grad_norm": 0.7147371173369438, + "learning_rate": 0.0004999947290052686, + "loss": 3.390611171722412, + "step": 1773, + "token_acc": 0.25935235920248884 + }, + { + "epoch": 1.039871005570214, + "grad_norm": 0.6193729362901955, + "learning_rate": 0.0004999945704885137, + "loss": 3.3345460891723633, + "step": 1774, + "token_acc": 0.2665105683493924 + }, + { + "epoch": 1.040457343887423, + "grad_norm": 0.7206153699453736, + "learning_rate": 0.0004999944096233966, + "loss": 3.367556095123291, + "step": 1775, + "token_acc": 0.2627952338019445 + }, + { + "epoch": 1.041043682204632, + "grad_norm": 0.7897275617631395, + "learning_rate": 0.000499994246409919, + "loss": 3.400113821029663, + "step": 1776, + "token_acc": 0.25870588298139435 + }, + { + "epoch": 1.0416300205218412, + "grad_norm": 0.6176379273788828, + "learning_rate": 0.0004999940808480822, + "loss": 3.366276502609253, + "step": 1777, + "token_acc": 0.26104859825020754 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.530185348259988, + "learning_rate": 0.0004999939129378878, + "loss": 3.3744144439697266, + "step": 1778, + "token_acc": 0.2620523319404325 + }, + { + "epoch": 1.0428026971562592, + "grad_norm": 0.6424307231782191, + "learning_rate": 0.0004999937426793376, + "loss": 3.3443009853363037, + "step": 1779, + "token_acc": 0.2643695140060487 + }, + { + "epoch": 1.0433890354734683, + "grad_norm": 0.787044190861387, + "learning_rate": 0.0004999935700724332, + "loss": 3.3889405727386475, + "step": 1780, + "token_acc": 0.2584142390748931 + }, + { + "epoch": 1.0439753737906772, + "grad_norm": 0.8824838370481669, + "learning_rate": 0.0004999933951171759, + "loss": 3.375654697418213, + "step": 1781, + "token_acc": 0.259952274823826 + }, + { + "epoch": 1.0445617121078863, + "grad_norm": 0.8935646686320523, + "learning_rate": 0.0004999932178135675, + "loss": 3.3010785579681396, + "step": 1782, + "token_acc": 0.2697119190866822 + }, + { + "epoch": 1.0451480504250952, + "grad_norm": 0.8487326770320669, + "learning_rate": 0.0004999930381616097, + "loss": 3.3687915802001953, + "step": 1783, + "token_acc": 0.2626485251181288 + }, + { + "epoch": 1.0457343887423043, + "grad_norm": 0.6954601614095431, + "learning_rate": 0.0004999928561613042, + "loss": 3.323179006576538, + "step": 1784, + "token_acc": 0.2681812605673993 + }, + { + "epoch": 1.0463207270595134, + "grad_norm": 0.5976900116670606, + "learning_rate": 0.0004999926718126527, + "loss": 3.3723034858703613, + "step": 1785, + "token_acc": 0.26233275576805326 + }, + { + "epoch": 1.0469070653767223, + "grad_norm": 0.6694147867330238, + "learning_rate": 0.000499992485115657, + "loss": 3.3868842124938965, + "step": 1786, + "token_acc": 0.2607573843631575 + }, + { + "epoch": 1.0474934036939314, + "grad_norm": 0.5973223226217586, + "learning_rate": 0.0004999922960703186, + "loss": 3.4020771980285645, + "step": 1787, + "token_acc": 0.2592431174104918 + }, + { + "epoch": 1.0480797420111405, + "grad_norm": 0.6068263756477659, + "learning_rate": 0.0004999921046766395, + "loss": 3.367609977722168, + "step": 1788, + "token_acc": 0.26299552035317797 + }, + { + "epoch": 1.0486660803283494, + "grad_norm": 0.6786263449527892, + "learning_rate": 0.0004999919109346214, + "loss": 3.3985230922698975, + "step": 1789, + "token_acc": 0.25843748705271447 + }, + { + "epoch": 1.0492524186455585, + "grad_norm": 0.5951488449789288, + "learning_rate": 0.0004999917148442663, + "loss": 3.3365986347198486, + "step": 1790, + "token_acc": 0.26665637597139524 + }, + { + "epoch": 1.0498387569627674, + "grad_norm": 0.7109334805720633, + "learning_rate": 0.0004999915164055759, + "loss": 3.3945555686950684, + "step": 1791, + "token_acc": 0.2583650645719611 + }, + { + "epoch": 1.0504250952799765, + "grad_norm": 0.7271284886524519, + "learning_rate": 0.000499991315618552, + "loss": 3.371025562286377, + "step": 1792, + "token_acc": 0.2617175269309693 + }, + { + "epoch": 1.0510114335971856, + "grad_norm": 0.5688838521807622, + "learning_rate": 0.0004999911124831967, + "loss": 3.319046974182129, + "step": 1793, + "token_acc": 0.2682160262215897 + }, + { + "epoch": 1.0515977719143945, + "grad_norm": 0.641690404391398, + "learning_rate": 0.0004999909069995116, + "loss": 3.379556179046631, + "step": 1794, + "token_acc": 0.26156531428810914 + }, + { + "epoch": 1.0521841102316036, + "grad_norm": 0.6572637441109637, + "learning_rate": 0.0004999906991674988, + "loss": 3.3643383979797363, + "step": 1795, + "token_acc": 0.26225160674648995 + }, + { + "epoch": 1.0527704485488127, + "grad_norm": 0.549759502075547, + "learning_rate": 0.0004999904889871603, + "loss": 3.34519100189209, + "step": 1796, + "token_acc": 0.26352962639596555 + }, + { + "epoch": 1.0533567868660216, + "grad_norm": 0.5434664415730925, + "learning_rate": 0.000499990276458498, + "loss": 3.3356029987335205, + "step": 1797, + "token_acc": 0.26516858043306696 + }, + { + "epoch": 1.0539431251832307, + "grad_norm": 0.5761478692168411, + "learning_rate": 0.0004999900615815139, + "loss": 3.383047103881836, + "step": 1798, + "token_acc": 0.26080178173719376 + }, + { + "epoch": 1.0545294635004399, + "grad_norm": 0.5726446208156213, + "learning_rate": 0.0004999898443562101, + "loss": 3.344855308532715, + "step": 1799, + "token_acc": 0.2655866823643122 + }, + { + "epoch": 1.0551158018176487, + "grad_norm": 0.5781232852428487, + "learning_rate": 0.0004999896247825885, + "loss": 3.331421375274658, + "step": 1800, + "token_acc": 0.2655137388647701 + }, + { + "epoch": 1.0557021401348579, + "grad_norm": 0.6298368392090262, + "learning_rate": 0.0004999894028606514, + "loss": 3.390155553817749, + "step": 1801, + "token_acc": 0.2595282528063859 + }, + { + "epoch": 1.0562884784520667, + "grad_norm": 0.574812584160153, + "learning_rate": 0.0004999891785904004, + "loss": 3.3871545791625977, + "step": 1802, + "token_acc": 0.25976305514689135 + }, + { + "epoch": 1.0568748167692759, + "grad_norm": 0.6317646322221564, + "learning_rate": 0.0004999889519718382, + "loss": 3.388237476348877, + "step": 1803, + "token_acc": 0.2582999890411545 + }, + { + "epoch": 1.057461155086485, + "grad_norm": 0.6900534822495569, + "learning_rate": 0.0004999887230049667, + "loss": 3.375983953475952, + "step": 1804, + "token_acc": 0.26143238539886926 + }, + { + "epoch": 1.0580474934036939, + "grad_norm": 0.8767784815629192, + "learning_rate": 0.0004999884916897879, + "loss": 3.3584787845611572, + "step": 1805, + "token_acc": 0.26349800873603574 + }, + { + "epoch": 1.058633831720903, + "grad_norm": 1.007099225192306, + "learning_rate": 0.000499988258026304, + "loss": 3.3546409606933594, + "step": 1806, + "token_acc": 0.262421111459455 + }, + { + "epoch": 1.059220170038112, + "grad_norm": 0.8832702286904426, + "learning_rate": 0.0004999880220145174, + "loss": 3.3483357429504395, + "step": 1807, + "token_acc": 0.2629402200508295 + }, + { + "epoch": 1.059806508355321, + "grad_norm": 0.6009907201765745, + "learning_rate": 0.0004999877836544302, + "loss": 3.338658571243286, + "step": 1808, + "token_acc": 0.266400339185691 + }, + { + "epoch": 1.06039284667253, + "grad_norm": 0.654329565064158, + "learning_rate": 0.0004999875429460446, + "loss": 3.372558116912842, + "step": 1809, + "token_acc": 0.2592413487245147 + }, + { + "epoch": 1.060979184989739, + "grad_norm": 0.6799726831288975, + "learning_rate": 0.0004999872998893628, + "loss": 3.3641517162323, + "step": 1810, + "token_acc": 0.26096974502615644 + }, + { + "epoch": 1.061565523306948, + "grad_norm": 0.6776127141225181, + "learning_rate": 0.0004999870544843874, + "loss": 3.367945671081543, + "step": 1811, + "token_acc": 0.26141845321862356 + }, + { + "epoch": 1.0621518616241572, + "grad_norm": 0.6440756717176906, + "learning_rate": 0.0004999868067311204, + "loss": 3.348313331604004, + "step": 1812, + "token_acc": 0.26333283761606957 + }, + { + "epoch": 1.062738199941366, + "grad_norm": 0.6499229999172275, + "learning_rate": 0.0004999865566295642, + "loss": 3.3412609100341797, + "step": 1813, + "token_acc": 0.2657629102257766 + }, + { + "epoch": 1.0633245382585752, + "grad_norm": 0.5763340880916704, + "learning_rate": 0.000499986304179721, + "loss": 3.3790879249572754, + "step": 1814, + "token_acc": 0.2619082930343173 + }, + { + "epoch": 1.0639108765757843, + "grad_norm": 0.5513246032993949, + "learning_rate": 0.0004999860493815935, + "loss": 3.362651824951172, + "step": 1815, + "token_acc": 0.263337948139556 + }, + { + "epoch": 1.0644972148929932, + "grad_norm": 0.5268377542252404, + "learning_rate": 0.0004999857922351839, + "loss": 3.3605196475982666, + "step": 1816, + "token_acc": 0.26262203413911384 + }, + { + "epoch": 1.0650835532102023, + "grad_norm": 0.6251928919287948, + "learning_rate": 0.0004999855327404947, + "loss": 3.3256547451019287, + "step": 1817, + "token_acc": 0.2667386778127397 + }, + { + "epoch": 1.0656698915274112, + "grad_norm": 0.7228491988503415, + "learning_rate": 0.0004999852708975283, + "loss": 3.36371111869812, + "step": 1818, + "token_acc": 0.2629876962719242 + }, + { + "epoch": 1.0662562298446203, + "grad_norm": 0.5326920518090941, + "learning_rate": 0.000499985006706287, + "loss": 3.371650218963623, + "step": 1819, + "token_acc": 0.2576653202271892 + }, + { + "epoch": 1.0668425681618294, + "grad_norm": 0.5529386787284536, + "learning_rate": 0.0004999847401667734, + "loss": 3.371994972229004, + "step": 1820, + "token_acc": 0.2615316914843285 + }, + { + "epoch": 1.0674289064790383, + "grad_norm": 0.5786759343693647, + "learning_rate": 0.0004999844712789902, + "loss": 3.367558479309082, + "step": 1821, + "token_acc": 0.26153073724166576 + }, + { + "epoch": 1.0680152447962474, + "grad_norm": 0.5424333065781486, + "learning_rate": 0.0004999842000429395, + "loss": 3.367112874984741, + "step": 1822, + "token_acc": 0.2620458120805028 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.5202329588101864, + "learning_rate": 0.0004999839264586243, + "loss": 3.329667091369629, + "step": 1823, + "token_acc": 0.26598960158743123 + }, + { + "epoch": 1.0691879214306654, + "grad_norm": 0.5689297057917363, + "learning_rate": 0.0004999836505260469, + "loss": 3.3664710521698, + "step": 1824, + "token_acc": 0.26191230074395677 + }, + { + "epoch": 1.0697742597478745, + "grad_norm": 0.6323169839214782, + "learning_rate": 0.0004999833722452101, + "loss": 3.3379154205322266, + "step": 1825, + "token_acc": 0.2655530891697525 + }, + { + "epoch": 1.0703605980650837, + "grad_norm": 0.5124151991894181, + "learning_rate": 0.0004999830916161162, + "loss": 3.316084146499634, + "step": 1826, + "token_acc": 0.2683226797016983 + }, + { + "epoch": 1.0709469363822925, + "grad_norm": 0.5598974165907181, + "learning_rate": 0.0004999828086387681, + "loss": 3.2998197078704834, + "step": 1827, + "token_acc": 0.270402033210165 + }, + { + "epoch": 1.0715332746995017, + "grad_norm": 0.6750139041590645, + "learning_rate": 0.0004999825233131684, + "loss": 3.3208088874816895, + "step": 1828, + "token_acc": 0.265082214812915 + }, + { + "epoch": 1.0721196130167105, + "grad_norm": 0.7644314555676089, + "learning_rate": 0.0004999822356393196, + "loss": 3.369450569152832, + "step": 1829, + "token_acc": 0.25960608910278193 + }, + { + "epoch": 1.0727059513339197, + "grad_norm": 0.8627228572789439, + "learning_rate": 0.0004999819456172246, + "loss": 3.406212329864502, + "step": 1830, + "token_acc": 0.25401668360281865 + }, + { + "epoch": 1.0732922896511288, + "grad_norm": 0.7150873148017909, + "learning_rate": 0.0004999816532468862, + "loss": 3.3242688179016113, + "step": 1831, + "token_acc": 0.2687652418103312 + }, + { + "epoch": 1.0738786279683377, + "grad_norm": 0.5205099399792226, + "learning_rate": 0.0004999813585283069, + "loss": 3.2824041843414307, + "step": 1832, + "token_acc": 0.27185962055895924 + }, + { + "epoch": 1.0744649662855468, + "grad_norm": 0.48907368406473145, + "learning_rate": 0.0004999810614614897, + "loss": 3.332336902618408, + "step": 1833, + "token_acc": 0.265657668903105 + }, + { + "epoch": 1.0750513046027559, + "grad_norm": 0.5768644799840614, + "learning_rate": 0.0004999807620464371, + "loss": 3.312974691390991, + "step": 1834, + "token_acc": 0.2683395486341777 + }, + { + "epoch": 1.0756376429199648, + "grad_norm": 0.6864426625179267, + "learning_rate": 0.0004999804602831522, + "loss": 3.3786778450012207, + "step": 1835, + "token_acc": 0.2609175577785443 + }, + { + "epoch": 1.0762239812371739, + "grad_norm": 0.6407044114194256, + "learning_rate": 0.0004999801561716378, + "loss": 3.327113151550293, + "step": 1836, + "token_acc": 0.26692839676485003 + }, + { + "epoch": 1.0768103195543828, + "grad_norm": 0.6681801364819084, + "learning_rate": 0.0004999798497118966, + "loss": 3.3837780952453613, + "step": 1837, + "token_acc": 0.26060003444004376 + }, + { + "epoch": 1.077396657871592, + "grad_norm": 0.6652683721978996, + "learning_rate": 0.0004999795409039316, + "loss": 3.3408524990081787, + "step": 1838, + "token_acc": 0.26348956495779846 + }, + { + "epoch": 1.077982996188801, + "grad_norm": 0.6039194938169614, + "learning_rate": 0.0004999792297477457, + "loss": 3.3619892597198486, + "step": 1839, + "token_acc": 0.2623747790218032 + }, + { + "epoch": 1.07856933450601, + "grad_norm": 0.547788902364413, + "learning_rate": 0.0004999789162433417, + "loss": 3.3704473972320557, + "step": 1840, + "token_acc": 0.26020085208409227 + }, + { + "epoch": 1.079155672823219, + "grad_norm": 0.6651554678105396, + "learning_rate": 0.0004999786003907226, + "loss": 3.3197226524353027, + "step": 1841, + "token_acc": 0.2675078050416699 + }, + { + "epoch": 1.0797420111404281, + "grad_norm": 0.7083009286521162, + "learning_rate": 0.0004999782821898915, + "loss": 3.333406925201416, + "step": 1842, + "token_acc": 0.2653315556874858 + }, + { + "epoch": 1.080328349457637, + "grad_norm": 0.5973029569690527, + "learning_rate": 0.0004999779616408513, + "loss": 3.379715919494629, + "step": 1843, + "token_acc": 0.2600905616372774 + }, + { + "epoch": 1.0809146877748461, + "grad_norm": 0.5147418838862436, + "learning_rate": 0.000499977638743605, + "loss": 3.347729206085205, + "step": 1844, + "token_acc": 0.2622559923610282 + }, + { + "epoch": 1.081501026092055, + "grad_norm": 0.6218856387475501, + "learning_rate": 0.0004999773134981555, + "loss": 3.33701753616333, + "step": 1845, + "token_acc": 0.26537342896624333 + }, + { + "epoch": 1.0820873644092641, + "grad_norm": 0.6766762559448689, + "learning_rate": 0.0004999769859045061, + "loss": 3.293914794921875, + "step": 1846, + "token_acc": 0.27081108281301347 + }, + { + "epoch": 1.0826737027264732, + "grad_norm": 0.6075565064024652, + "learning_rate": 0.0004999766559626597, + "loss": 3.3183226585388184, + "step": 1847, + "token_acc": 0.2680923613695326 + }, + { + "epoch": 1.0832600410436821, + "grad_norm": 0.5895830466848409, + "learning_rate": 0.0004999763236726196, + "loss": 3.292226791381836, + "step": 1848, + "token_acc": 0.2695374329179644 + }, + { + "epoch": 1.0838463793608912, + "grad_norm": 0.4498300994789812, + "learning_rate": 0.0004999759890343886, + "loss": 3.255733013153076, + "step": 1849, + "token_acc": 0.2766067382638417 + }, + { + "epoch": 1.0844327176781003, + "grad_norm": 0.5709093043660186, + "learning_rate": 0.0004999756520479701, + "loss": 3.348172426223755, + "step": 1850, + "token_acc": 0.2636486823806298 + }, + { + "epoch": 1.0850190559953092, + "grad_norm": 0.5794314239898045, + "learning_rate": 0.0004999753127133673, + "loss": 3.3362908363342285, + "step": 1851, + "token_acc": 0.2637277857505098 + }, + { + "epoch": 1.0856053943125183, + "grad_norm": 0.6140688818797233, + "learning_rate": 0.0004999749710305832, + "loss": 3.3446223735809326, + "step": 1852, + "token_acc": 0.26468104167060175 + }, + { + "epoch": 1.0861917326297275, + "grad_norm": 0.5738251792865984, + "learning_rate": 0.0004999746269996211, + "loss": 3.3634095191955566, + "step": 1853, + "token_acc": 0.2618142664594124 + }, + { + "epoch": 1.0867780709469363, + "grad_norm": 0.6388997649358731, + "learning_rate": 0.0004999742806204842, + "loss": 3.3830952644348145, + "step": 1854, + "token_acc": 0.26032733984677164 + }, + { + "epoch": 1.0873644092641455, + "grad_norm": 0.9324259492460218, + "learning_rate": 0.0004999739318931758, + "loss": 3.3028879165649414, + "step": 1855, + "token_acc": 0.2693825183778949 + }, + { + "epoch": 1.0879507475813543, + "grad_norm": 1.0466821770471486, + "learning_rate": 0.0004999735808176992, + "loss": 3.3617472648620605, + "step": 1856, + "token_acc": 0.2610474782686597 + }, + { + "epoch": 1.0885370858985635, + "grad_norm": 0.6822823226594966, + "learning_rate": 0.0004999732273940575, + "loss": 3.410457134246826, + "step": 1857, + "token_acc": 0.2556543242541889 + }, + { + "epoch": 1.0891234242157726, + "grad_norm": 0.5957904986127947, + "learning_rate": 0.0004999728716222543, + "loss": 3.3506553173065186, + "step": 1858, + "token_acc": 0.26247800134259847 + }, + { + "epoch": 1.0897097625329815, + "grad_norm": 0.6466425149601915, + "learning_rate": 0.0004999725135022928, + "loss": 3.353973150253296, + "step": 1859, + "token_acc": 0.2618879293331768 + }, + { + "epoch": 1.0902961008501906, + "grad_norm": 0.5617310524495857, + "learning_rate": 0.0004999721530341764, + "loss": 3.3509185314178467, + "step": 1860, + "token_acc": 0.2619573456493217 + }, + { + "epoch": 1.0908824391673997, + "grad_norm": 0.5686563991207016, + "learning_rate": 0.0004999717902179083, + "loss": 3.406980037689209, + "step": 1861, + "token_acc": 0.2577925979174264 + }, + { + "epoch": 1.0914687774846086, + "grad_norm": 0.5780438723543403, + "learning_rate": 0.0004999714250534923, + "loss": 3.3280835151672363, + "step": 1862, + "token_acc": 0.26801147419097987 + }, + { + "epoch": 1.0920551158018177, + "grad_norm": 0.5615025690471361, + "learning_rate": 0.0004999710575409315, + "loss": 3.3526835441589355, + "step": 1863, + "token_acc": 0.2635024850555529 + }, + { + "epoch": 1.0926414541190266, + "grad_norm": 0.6735715924777766, + "learning_rate": 0.0004999706876802295, + "loss": 3.2892231941223145, + "step": 1864, + "token_acc": 0.2718837628343871 + }, + { + "epoch": 1.0932277924362357, + "grad_norm": 0.7136312296739743, + "learning_rate": 0.0004999703154713897, + "loss": 3.304704427719116, + "step": 1865, + "token_acc": 0.26911732136025934 + }, + { + "epoch": 1.0938141307534448, + "grad_norm": 0.6296857619462491, + "learning_rate": 0.0004999699409144156, + "loss": 3.2942373752593994, + "step": 1866, + "token_acc": 0.2705142660277491 + }, + { + "epoch": 1.0944004690706537, + "grad_norm": 0.5584824313959695, + "learning_rate": 0.0004999695640093107, + "loss": 3.3270745277404785, + "step": 1867, + "token_acc": 0.2684870080474036 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.43891433906760874, + "learning_rate": 0.0004999691847560787, + "loss": 3.360311508178711, + "step": 1868, + "token_acc": 0.2626777038438418 + }, + { + "epoch": 1.095573145705072, + "grad_norm": 0.5333554592436626, + "learning_rate": 0.0004999688031547229, + "loss": 3.3480441570281982, + "step": 1869, + "token_acc": 0.26455563992981673 + }, + { + "epoch": 1.0961594840222808, + "grad_norm": 0.6366161160603616, + "learning_rate": 0.0004999684192052472, + "loss": 3.366748571395874, + "step": 1870, + "token_acc": 0.2623923753882953 + }, + { + "epoch": 1.09674582233949, + "grad_norm": 0.6133990830848296, + "learning_rate": 0.000499968032907655, + "loss": 3.32450532913208, + "step": 1871, + "token_acc": 0.2667108760268192 + }, + { + "epoch": 1.0973321606566988, + "grad_norm": 0.5519151161152714, + "learning_rate": 0.0004999676442619498, + "loss": 3.3259434700012207, + "step": 1872, + "token_acc": 0.266930347960166 + }, + { + "epoch": 1.097918498973908, + "grad_norm": 0.6297742425965059, + "learning_rate": 0.0004999672532681357, + "loss": 3.309068202972412, + "step": 1873, + "token_acc": 0.2677486830489108 + }, + { + "epoch": 1.098504837291117, + "grad_norm": 0.5514107391268099, + "learning_rate": 0.0004999668599262159, + "loss": 3.3154306411743164, + "step": 1874, + "token_acc": 0.27020064833872764 + }, + { + "epoch": 1.099091175608326, + "grad_norm": 0.4772557412534295, + "learning_rate": 0.0004999664642361943, + "loss": 3.3363611698150635, + "step": 1875, + "token_acc": 0.263571445319522 + }, + { + "epoch": 1.099677513925535, + "grad_norm": 0.5123964763096657, + "learning_rate": 0.0004999660661980746, + "loss": 3.3685688972473145, + "step": 1876, + "token_acc": 0.260429188084458 + }, + { + "epoch": 1.1002638522427441, + "grad_norm": 0.5863914556804758, + "learning_rate": 0.0004999656658118605, + "loss": 3.2840476036071777, + "step": 1877, + "token_acc": 0.2736421947713758 + }, + { + "epoch": 1.100850190559953, + "grad_norm": 0.5891678446157472, + "learning_rate": 0.0004999652630775559, + "loss": 3.325604200363159, + "step": 1878, + "token_acc": 0.2674743370235364 + }, + { + "epoch": 1.1014365288771621, + "grad_norm": 0.5704012617130357, + "learning_rate": 0.0004999648579951645, + "loss": 3.423212766647339, + "step": 1879, + "token_acc": 0.2529669515451143 + }, + { + "epoch": 1.1020228671943713, + "grad_norm": 0.4671860749231948, + "learning_rate": 0.0004999644505646899, + "loss": 3.292140245437622, + "step": 1880, + "token_acc": 0.27090793847766725 + }, + { + "epoch": 1.1026092055115801, + "grad_norm": 0.5532526950883575, + "learning_rate": 0.0004999640407861364, + "loss": 3.4039430618286133, + "step": 1881, + "token_acc": 0.2583785466981929 + }, + { + "epoch": 1.1031955438287893, + "grad_norm": 0.7362505440893078, + "learning_rate": 0.0004999636286595075, + "loss": 3.3160059452056885, + "step": 1882, + "token_acc": 0.2665891730322152 + }, + { + "epoch": 1.1037818821459981, + "grad_norm": 0.8003015890089482, + "learning_rate": 0.0004999632141848069, + "loss": 3.3513550758361816, + "step": 1883, + "token_acc": 0.26512444616860975 + }, + { + "epoch": 1.1043682204632073, + "grad_norm": 0.7668373017060995, + "learning_rate": 0.000499962797362039, + "loss": 3.393906354904175, + "step": 1884, + "token_acc": 0.25884256099530584 + }, + { + "epoch": 1.1049545587804164, + "grad_norm": 0.690665700931511, + "learning_rate": 0.0004999623781912074, + "loss": 3.346266746520996, + "step": 1885, + "token_acc": 0.26421327937376304 + }, + { + "epoch": 1.1055408970976253, + "grad_norm": 0.6975919825549942, + "learning_rate": 0.000499961956672316, + "loss": 3.3419108390808105, + "step": 1886, + "token_acc": 0.2651025889385862 + }, + { + "epoch": 1.1061272354148344, + "grad_norm": 0.6737058837854076, + "learning_rate": 0.0004999615328053688, + "loss": 3.3344521522521973, + "step": 1887, + "token_acc": 0.2672704262373105 + }, + { + "epoch": 1.1067135737320435, + "grad_norm": 0.6398945191049568, + "learning_rate": 0.0004999611065903699, + "loss": 3.3697056770324707, + "step": 1888, + "token_acc": 0.2620787687918469 + }, + { + "epoch": 1.1072999120492524, + "grad_norm": 0.6698940892123584, + "learning_rate": 0.0004999606780273232, + "loss": 3.36635160446167, + "step": 1889, + "token_acc": 0.26058160720010637 + }, + { + "epoch": 1.1078862503664615, + "grad_norm": 0.592741016520332, + "learning_rate": 0.0004999602471162329, + "loss": 3.3192174434661865, + "step": 1890, + "token_acc": 0.2680414890439599 + }, + { + "epoch": 1.1084725886836704, + "grad_norm": 0.5222829067995212, + "learning_rate": 0.0004999598138571027, + "loss": 3.3659634590148926, + "step": 1891, + "token_acc": 0.26195347190771423 + }, + { + "epoch": 1.1090589270008795, + "grad_norm": 0.5941686575947177, + "learning_rate": 0.000499959378249937, + "loss": 3.3365256786346436, + "step": 1892, + "token_acc": 0.2647146916419458 + }, + { + "epoch": 1.1096452653180886, + "grad_norm": 0.5645723417676275, + "learning_rate": 0.0004999589402947397, + "loss": 3.327695846557617, + "step": 1893, + "token_acc": 0.2664400041845381 + }, + { + "epoch": 1.1102316036352975, + "grad_norm": 0.4825747104994076, + "learning_rate": 0.0004999584999915151, + "loss": 3.331441879272461, + "step": 1894, + "token_acc": 0.26707964100093023 + }, + { + "epoch": 1.1108179419525066, + "grad_norm": 0.532321136983003, + "learning_rate": 0.0004999580573402671, + "loss": 3.3440840244293213, + "step": 1895, + "token_acc": 0.26270350772859197 + }, + { + "epoch": 1.1114042802697157, + "grad_norm": 0.5461527069716696, + "learning_rate": 0.000499957612341, + "loss": 3.3447318077087402, + "step": 1896, + "token_acc": 0.26430457046531597 + }, + { + "epoch": 1.1119906185869246, + "grad_norm": 0.5639976855880591, + "learning_rate": 0.000499957164993718, + "loss": 3.3305516242980957, + "step": 1897, + "token_acc": 0.26659962087404737 + }, + { + "epoch": 1.1125769569041337, + "grad_norm": 0.6025613452212991, + "learning_rate": 0.0004999567152984253, + "loss": 3.3148622512817383, + "step": 1898, + "token_acc": 0.2698700237590978 + }, + { + "epoch": 1.1131632952213426, + "grad_norm": 0.6505915523023115, + "learning_rate": 0.0004999562632551259, + "loss": 3.366891384124756, + "step": 1899, + "token_acc": 0.26387677236943063 + }, + { + "epoch": 1.1137496335385517, + "grad_norm": 0.5666705951303488, + "learning_rate": 0.0004999558088638243, + "loss": 3.278472900390625, + "step": 1900, + "token_acc": 0.27175363999484603 + }, + { + "epoch": 1.1143359718557608, + "grad_norm": 0.47537337236353827, + "learning_rate": 0.0004999553521245247, + "loss": 3.3631415367126465, + "step": 1901, + "token_acc": 0.26305859480138705 + }, + { + "epoch": 1.1149223101729697, + "grad_norm": 0.6173271370983678, + "learning_rate": 0.0004999548930372314, + "loss": 3.3516504764556885, + "step": 1902, + "token_acc": 0.2643220486268732 + }, + { + "epoch": 1.1155086484901788, + "grad_norm": 0.8013410822645078, + "learning_rate": 0.0004999544316019488, + "loss": 3.30094051361084, + "step": 1903, + "token_acc": 0.27045977385544673 + }, + { + "epoch": 1.116094986807388, + "grad_norm": 0.6951029556078197, + "learning_rate": 0.0004999539678186809, + "loss": 3.376927137374878, + "step": 1904, + "token_acc": 0.2605311514522544 + }, + { + "epoch": 1.1166813251245968, + "grad_norm": 0.4459216631307039, + "learning_rate": 0.0004999535016874325, + "loss": 3.2677159309387207, + "step": 1905, + "token_acc": 0.2740752526470845 + }, + { + "epoch": 1.117267663441806, + "grad_norm": 0.6874695730036623, + "learning_rate": 0.0004999530332082077, + "loss": 3.3685755729675293, + "step": 1906, + "token_acc": 0.2614025943320186 + }, + { + "epoch": 1.117854001759015, + "grad_norm": 1.0556619445726723, + "learning_rate": 0.0004999525623810109, + "loss": 3.399080276489258, + "step": 1907, + "token_acc": 0.25721220401435585 + }, + { + "epoch": 1.118440340076224, + "grad_norm": 0.8531936734435709, + "learning_rate": 0.0004999520892058467, + "loss": 3.3394012451171875, + "step": 1908, + "token_acc": 0.26410148472021716 + }, + { + "epoch": 1.119026678393433, + "grad_norm": 0.6192289943454509, + "learning_rate": 0.0004999516136827194, + "loss": 3.2935938835144043, + "step": 1909, + "token_acc": 0.2682424447070741 + }, + { + "epoch": 1.119613016710642, + "grad_norm": 0.572382491857413, + "learning_rate": 0.0004999511358116335, + "loss": 3.3077783584594727, + "step": 1910, + "token_acc": 0.2691926921677334 + }, + { + "epoch": 1.120199355027851, + "grad_norm": 0.5894869325414929, + "learning_rate": 0.0004999506555925934, + "loss": 3.3611085414886475, + "step": 1911, + "token_acc": 0.26167399275030206 + }, + { + "epoch": 1.1207856933450602, + "grad_norm": 0.5529352808543856, + "learning_rate": 0.0004999501730256038, + "loss": 3.3481454849243164, + "step": 1912, + "token_acc": 0.26226218509962673 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.503615500815024, + "learning_rate": 0.0004999496881106692, + "loss": 3.3238844871520996, + "step": 1913, + "token_acc": 0.2676295536172396 + }, + { + "epoch": 1.1219583699794782, + "grad_norm": 0.592746502645452, + "learning_rate": 0.0004999492008477941, + "loss": 3.3782854080200195, + "step": 1914, + "token_acc": 0.26008726398937654 + }, + { + "epoch": 1.1225447082966873, + "grad_norm": 0.5593075911734255, + "learning_rate": 0.0004999487112369829, + "loss": 3.340618133544922, + "step": 1915, + "token_acc": 0.26365046021027705 + }, + { + "epoch": 1.1231310466138962, + "grad_norm": 0.6167612992835367, + "learning_rate": 0.0004999482192782405, + "loss": 3.3364768028259277, + "step": 1916, + "token_acc": 0.2636481941992759 + }, + { + "epoch": 1.1237173849311053, + "grad_norm": 0.6646407615293147, + "learning_rate": 0.0004999477249715713, + "loss": 3.3232669830322266, + "step": 1917, + "token_acc": 0.26646408302424435 + }, + { + "epoch": 1.1243037232483142, + "grad_norm": 0.5352018803183902, + "learning_rate": 0.0004999472283169801, + "loss": 3.3411102294921875, + "step": 1918, + "token_acc": 0.265033784237244 + }, + { + "epoch": 1.1248900615655233, + "grad_norm": 0.6226158618441165, + "learning_rate": 0.0004999467293144715, + "loss": 3.365734577178955, + "step": 1919, + "token_acc": 0.2614583032874363 + }, + { + "epoch": 1.1254763998827324, + "grad_norm": 0.6410674399817448, + "learning_rate": 0.0004999462279640501, + "loss": 3.3511767387390137, + "step": 1920, + "token_acc": 0.26362318879547003 + }, + { + "epoch": 1.1260627381999413, + "grad_norm": 0.5671011036793839, + "learning_rate": 0.0004999457242657209, + "loss": 3.3141424655914307, + "step": 1921, + "token_acc": 0.26580010373607027 + }, + { + "epoch": 1.1266490765171504, + "grad_norm": 0.4911033285680153, + "learning_rate": 0.0004999452182194882, + "loss": 3.345874309539795, + "step": 1922, + "token_acc": 0.2666244865730989 + }, + { + "epoch": 1.1272354148343595, + "grad_norm": 0.49938218798965683, + "learning_rate": 0.000499944709825357, + "loss": 3.3207216262817383, + "step": 1923, + "token_acc": 0.26815428941024283 + }, + { + "epoch": 1.1278217531515684, + "grad_norm": 0.4872232565360168, + "learning_rate": 0.0004999441990833321, + "loss": 3.3287510871887207, + "step": 1924, + "token_acc": 0.26651184030269426 + }, + { + "epoch": 1.1284080914687775, + "grad_norm": 0.6002348136143194, + "learning_rate": 0.0004999436859934183, + "loss": 3.328094959259033, + "step": 1925, + "token_acc": 0.26770943040196593 + }, + { + "epoch": 1.1289944297859864, + "grad_norm": 0.6679810165256036, + "learning_rate": 0.0004999431705556203, + "loss": 3.3474817276000977, + "step": 1926, + "token_acc": 0.26200465022240194 + }, + { + "epoch": 1.1295807681031955, + "grad_norm": 0.6669379184892937, + "learning_rate": 0.0004999426527699431, + "loss": 3.3377952575683594, + "step": 1927, + "token_acc": 0.26330037694986236 + }, + { + "epoch": 1.1301671064204046, + "grad_norm": 0.6382859696933323, + "learning_rate": 0.0004999421326363914, + "loss": 3.348890781402588, + "step": 1928, + "token_acc": 0.2635761179877766 + }, + { + "epoch": 1.1307534447376135, + "grad_norm": 0.5763306563268678, + "learning_rate": 0.0004999416101549703, + "loss": 3.2861194610595703, + "step": 1929, + "token_acc": 0.2722027875968424 + }, + { + "epoch": 1.1313397830548226, + "grad_norm": 0.6358846500409145, + "learning_rate": 0.0004999410853256844, + "loss": 3.3237404823303223, + "step": 1930, + "token_acc": 0.2668121316779103 + }, + { + "epoch": 1.1319261213720317, + "grad_norm": 0.545416214518993, + "learning_rate": 0.0004999405581485389, + "loss": 3.3272385597229004, + "step": 1931, + "token_acc": 0.26671641272108243 + }, + { + "epoch": 1.1325124596892406, + "grad_norm": 0.5281814969688989, + "learning_rate": 0.0004999400286235387, + "loss": 3.3143866062164307, + "step": 1932, + "token_acc": 0.26942847342040455 + }, + { + "epoch": 1.1330987980064497, + "grad_norm": 0.5410201660790362, + "learning_rate": 0.0004999394967506886, + "loss": 3.3470473289489746, + "step": 1933, + "token_acc": 0.26370858566918537 + }, + { + "epoch": 1.1336851363236589, + "grad_norm": 0.4977793139044205, + "learning_rate": 0.0004999389625299939, + "loss": 3.3750386238098145, + "step": 1934, + "token_acc": 0.2593037903774528 + }, + { + "epoch": 1.1342714746408677, + "grad_norm": 0.4570057985395873, + "learning_rate": 0.0004999384259614593, + "loss": 3.3094420433044434, + "step": 1935, + "token_acc": 0.268883009806734 + }, + { + "epoch": 1.1348578129580769, + "grad_norm": 0.45848330350191957, + "learning_rate": 0.00049993788704509, + "loss": 3.294532299041748, + "step": 1936, + "token_acc": 0.2720610655462359 + }, + { + "epoch": 1.1354441512752858, + "grad_norm": 0.5463846156054845, + "learning_rate": 0.0004999373457808911, + "loss": 3.334261417388916, + "step": 1937, + "token_acc": 0.26462669401470224 + }, + { + "epoch": 1.1360304895924949, + "grad_norm": 0.5651418954888727, + "learning_rate": 0.0004999368021688676, + "loss": 3.330078601837158, + "step": 1938, + "token_acc": 0.26544012309651405 + }, + { + "epoch": 1.136616827909704, + "grad_norm": 0.549910936933279, + "learning_rate": 0.0004999362562090246, + "loss": 3.366504192352295, + "step": 1939, + "token_acc": 0.260170889867471 + }, + { + "epoch": 1.1372031662269129, + "grad_norm": 0.5246737950631898, + "learning_rate": 0.0004999357079013674, + "loss": 3.3516793251037598, + "step": 1940, + "token_acc": 0.26385470257022425 + }, + { + "epoch": 1.137789504544122, + "grad_norm": 0.6110038833137961, + "learning_rate": 0.0004999351572459007, + "loss": 3.3069732189178467, + "step": 1941, + "token_acc": 0.26802802845200246 + }, + { + "epoch": 1.1383758428613309, + "grad_norm": 0.5901884805530477, + "learning_rate": 0.0004999346042426303, + "loss": 3.32161283493042, + "step": 1942, + "token_acc": 0.2678407694645193 + }, + { + "epoch": 1.13896218117854, + "grad_norm": 0.5773580003365661, + "learning_rate": 0.000499934048891561, + "loss": 3.363138198852539, + "step": 1943, + "token_acc": 0.26155261862773244 + }, + { + "epoch": 1.139548519495749, + "grad_norm": 0.5687998878253862, + "learning_rate": 0.0004999334911926981, + "loss": 3.3818531036376953, + "step": 1944, + "token_acc": 0.2597661416453294 + }, + { + "epoch": 1.140134857812958, + "grad_norm": 0.5312921631730979, + "learning_rate": 0.0004999329311460469, + "loss": 3.29067325592041, + "step": 1945, + "token_acc": 0.2702794951555724 + }, + { + "epoch": 1.140721196130167, + "grad_norm": 0.6673850120935002, + "learning_rate": 0.0004999323687516125, + "loss": 3.3104586601257324, + "step": 1946, + "token_acc": 0.26910676506018 + }, + { + "epoch": 1.1413075344473762, + "grad_norm": 0.7037292888804452, + "learning_rate": 0.0004999318040094003, + "loss": 3.378039598464966, + "step": 1947, + "token_acc": 0.2598437712884327 + }, + { + "epoch": 1.141893872764585, + "grad_norm": 0.6704269240044067, + "learning_rate": 0.0004999312369194156, + "loss": 3.296900749206543, + "step": 1948, + "token_acc": 0.2706122418396477 + }, + { + "epoch": 1.1424802110817942, + "grad_norm": 0.5668683720984143, + "learning_rate": 0.0004999306674816637, + "loss": 3.3289437294006348, + "step": 1949, + "token_acc": 0.2674995401975528 + }, + { + "epoch": 1.1430665493990033, + "grad_norm": 0.4909697172829595, + "learning_rate": 0.00049993009569615, + "loss": 3.3821184635162354, + "step": 1950, + "token_acc": 0.2589990517114412 + }, + { + "epoch": 1.1436528877162122, + "grad_norm": 0.7546160577081155, + "learning_rate": 0.0004999295215628799, + "loss": 3.361788749694824, + "step": 1951, + "token_acc": 0.26076422734953486 + }, + { + "epoch": 1.1442392260334213, + "grad_norm": 0.7942132246360624, + "learning_rate": 0.0004999289450818587, + "loss": 3.3548426628112793, + "step": 1952, + "token_acc": 0.26182707993474713 + }, + { + "epoch": 1.1448255643506302, + "grad_norm": 0.7251923793813759, + "learning_rate": 0.0004999283662530917, + "loss": 3.322568893432617, + "step": 1953, + "token_acc": 0.26760266891207724 + }, + { + "epoch": 1.1454119026678393, + "grad_norm": 0.5260972020853423, + "learning_rate": 0.0004999277850765845, + "loss": 3.3087902069091797, + "step": 1954, + "token_acc": 0.2683812284737829 + }, + { + "epoch": 1.1459982409850484, + "grad_norm": 0.6211888120211889, + "learning_rate": 0.0004999272015523427, + "loss": 3.3634419441223145, + "step": 1955, + "token_acc": 0.2624973219540108 + }, + { + "epoch": 1.1465845793022573, + "grad_norm": 0.6463600187087004, + "learning_rate": 0.0004999266156803715, + "loss": 3.369861364364624, + "step": 1956, + "token_acc": 0.25803941571267597 + }, + { + "epoch": 1.1471709176194664, + "grad_norm": 0.5679036380986741, + "learning_rate": 0.0004999260274606766, + "loss": 3.337130546569824, + "step": 1957, + "token_acc": 0.26478252672412234 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.47638931566558745, + "learning_rate": 0.0004999254368932635, + "loss": 3.300784111022949, + "step": 1958, + "token_acc": 0.2696280270779538 + }, + { + "epoch": 1.1483435942538844, + "grad_norm": 0.4778844444996467, + "learning_rate": 0.0004999248439781375, + "loss": 3.331264019012451, + "step": 1959, + "token_acc": 0.26758135526101234 + }, + { + "epoch": 1.1489299325710935, + "grad_norm": 0.4466850287285112, + "learning_rate": 0.0004999242487153045, + "loss": 3.333454132080078, + "step": 1960, + "token_acc": 0.2660251451944885 + }, + { + "epoch": 1.1495162708883027, + "grad_norm": 0.45498073276311674, + "learning_rate": 0.00049992365110477, + "loss": 3.2760415077209473, + "step": 1961, + "token_acc": 0.27177966482454957 + }, + { + "epoch": 1.1501026092055116, + "grad_norm": 0.48689519718408053, + "learning_rate": 0.0004999230511465395, + "loss": 3.349882125854492, + "step": 1962, + "token_acc": 0.26373021335168617 + }, + { + "epoch": 1.1506889475227207, + "grad_norm": 0.5021402612562448, + "learning_rate": 0.0004999224488406187, + "loss": 3.3303253650665283, + "step": 1963, + "token_acc": 0.2658426835894433 + }, + { + "epoch": 1.1512752858399296, + "grad_norm": 0.4930948466902079, + "learning_rate": 0.0004999218441870133, + "loss": 3.2795398235321045, + "step": 1964, + "token_acc": 0.27140242489964134 + }, + { + "epoch": 1.1518616241571387, + "grad_norm": 0.4089300845606071, + "learning_rate": 0.0004999212371857289, + "loss": 3.3557021617889404, + "step": 1965, + "token_acc": 0.2634841368707698 + }, + { + "epoch": 1.1524479624743478, + "grad_norm": 0.46428686106387707, + "learning_rate": 0.0004999206278367713, + "loss": 3.3432679176330566, + "step": 1966, + "token_acc": 0.262956020114102 + }, + { + "epoch": 1.1530343007915567, + "grad_norm": 0.4827792193358561, + "learning_rate": 0.0004999200161401462, + "loss": 3.3332736492156982, + "step": 1967, + "token_acc": 0.26565287818896655 + }, + { + "epoch": 1.1536206391087658, + "grad_norm": 0.4955232974253411, + "learning_rate": 0.0004999194020958594, + "loss": 3.3183631896972656, + "step": 1968, + "token_acc": 0.266028720458497 + }, + { + "epoch": 1.1542069774259747, + "grad_norm": 0.5387027148881859, + "learning_rate": 0.0004999187857039164, + "loss": 3.27771258354187, + "step": 1969, + "token_acc": 0.27252718111906654 + }, + { + "epoch": 1.1547933157431838, + "grad_norm": 0.5097067438738796, + "learning_rate": 0.0004999181669643232, + "loss": 3.284806728363037, + "step": 1970, + "token_acc": 0.27260333575054385 + }, + { + "epoch": 1.155379654060393, + "grad_norm": 0.5376775667793493, + "learning_rate": 0.0004999175458770857, + "loss": 3.367516040802002, + "step": 1971, + "token_acc": 0.2622917731908824 + }, + { + "epoch": 1.1559659923776018, + "grad_norm": 0.6902034020546416, + "learning_rate": 0.0004999169224422096, + "loss": 3.2984836101531982, + "step": 1972, + "token_acc": 0.27052429375085757 + }, + { + "epoch": 1.156552330694811, + "grad_norm": 0.7599261340523564, + "learning_rate": 0.0004999162966597007, + "loss": 3.289255142211914, + "step": 1973, + "token_acc": 0.2711683833943151 + }, + { + "epoch": 1.15713866901202, + "grad_norm": 0.7883124066125632, + "learning_rate": 0.000499915668529565, + "loss": 3.368717670440674, + "step": 1974, + "token_acc": 0.2599040848881709 + }, + { + "epoch": 1.157725007329229, + "grad_norm": 0.708867332249208, + "learning_rate": 0.0004999150380518084, + "loss": 3.324082374572754, + "step": 1975, + "token_acc": 0.2677142063459402 + }, + { + "epoch": 1.158311345646438, + "grad_norm": 0.6819957599965444, + "learning_rate": 0.0004999144052264368, + "loss": 3.280559778213501, + "step": 1976, + "token_acc": 0.27372438667951865 + }, + { + "epoch": 1.1588976839636471, + "grad_norm": 0.6193460370668442, + "learning_rate": 0.0004999137700534561, + "loss": 3.3649582862854004, + "step": 1977, + "token_acc": 0.26222782048207693 + }, + { + "epoch": 1.159484022280856, + "grad_norm": 0.6925984551908982, + "learning_rate": 0.0004999131325328722, + "loss": 3.358515501022339, + "step": 1978, + "token_acc": 0.2620345398383878 + }, + { + "epoch": 1.1600703605980651, + "grad_norm": 0.6343771196827566, + "learning_rate": 0.0004999124926646913, + "loss": 3.3289148807525635, + "step": 1979, + "token_acc": 0.267540932722899 + }, + { + "epoch": 1.160656698915274, + "grad_norm": 0.5391867951746058, + "learning_rate": 0.0004999118504489192, + "loss": 3.3240575790405273, + "step": 1980, + "token_acc": 0.2653484716698198 + }, + { + "epoch": 1.1612430372324831, + "grad_norm": 0.5134899132205668, + "learning_rate": 0.0004999112058855622, + "loss": 3.3434672355651855, + "step": 1981, + "token_acc": 0.2659219217965709 + }, + { + "epoch": 1.1618293755496922, + "grad_norm": 0.49861589487998115, + "learning_rate": 0.000499910558974626, + "loss": 3.3181262016296387, + "step": 1982, + "token_acc": 0.26775059905758997 + }, + { + "epoch": 1.1624157138669011, + "grad_norm": 0.5340075308783796, + "learning_rate": 0.0004999099097161169, + "loss": 3.3119540214538574, + "step": 1983, + "token_acc": 0.2680917750317468 + }, + { + "epoch": 1.1630020521841102, + "grad_norm": 0.5318725733219032, + "learning_rate": 0.0004999092581100409, + "loss": 3.316734552383423, + "step": 1984, + "token_acc": 0.2680798699421582 + }, + { + "epoch": 1.1635883905013193, + "grad_norm": 0.5011606879683798, + "learning_rate": 0.0004999086041564042, + "loss": 3.3209729194641113, + "step": 1985, + "token_acc": 0.265934351641209 + }, + { + "epoch": 1.1641747288185282, + "grad_norm": 0.541041190243588, + "learning_rate": 0.0004999079478552131, + "loss": 3.341071128845215, + "step": 1986, + "token_acc": 0.2649006797333861 + }, + { + "epoch": 1.1647610671357373, + "grad_norm": 0.48645049943106616, + "learning_rate": 0.0004999072892064734, + "loss": 3.2668838500976562, + "step": 1987, + "token_acc": 0.27502875405864813 + }, + { + "epoch": 1.1653474054529465, + "grad_norm": 0.466205060143751, + "learning_rate": 0.0004999066282101915, + "loss": 3.3229565620422363, + "step": 1988, + "token_acc": 0.2666699778326153 + }, + { + "epoch": 1.1659337437701554, + "grad_norm": 0.5275081305958017, + "learning_rate": 0.0004999059648663737, + "loss": 3.2949752807617188, + "step": 1989, + "token_acc": 0.2690598363427695 + }, + { + "epoch": 1.1665200820873645, + "grad_norm": 0.46418700701633336, + "learning_rate": 0.0004999052991750259, + "loss": 3.350149154663086, + "step": 1990, + "token_acc": 0.26396976033456654 + }, + { + "epoch": 1.1671064204045734, + "grad_norm": 0.4142658577256744, + "learning_rate": 0.0004999046311361547, + "loss": 3.3271024227142334, + "step": 1991, + "token_acc": 0.2648428450575909 + }, + { + "epoch": 1.1676927587217825, + "grad_norm": 0.48063041404868223, + "learning_rate": 0.0004999039607497663, + "loss": 3.3046584129333496, + "step": 1992, + "token_acc": 0.2703142290964305 + }, + { + "epoch": 1.1682790970389916, + "grad_norm": 0.4561730771656601, + "learning_rate": 0.0004999032880158668, + "loss": 3.357513904571533, + "step": 1993, + "token_acc": 0.26053569328772935 + }, + { + "epoch": 1.1688654353562005, + "grad_norm": 0.559179657980282, + "learning_rate": 0.0004999026129344627, + "loss": 3.360114574432373, + "step": 1994, + "token_acc": 0.2621969072612128 + }, + { + "epoch": 1.1694517736734096, + "grad_norm": 0.6341301938141158, + "learning_rate": 0.0004999019355055604, + "loss": 3.3208227157592773, + "step": 1995, + "token_acc": 0.26457756005305944 + }, + { + "epoch": 1.1700381119906185, + "grad_norm": 0.8285580891347714, + "learning_rate": 0.0004999012557291661, + "loss": 3.38165283203125, + "step": 1996, + "token_acc": 0.2595735683371358 + }, + { + "epoch": 1.1706244503078276, + "grad_norm": 0.9027796847159149, + "learning_rate": 0.0004999005736052862, + "loss": 3.3802688121795654, + "step": 1997, + "token_acc": 0.2578397299475314 + }, + { + "epoch": 1.1712107886250367, + "grad_norm": 0.724674578428661, + "learning_rate": 0.0004998998891339271, + "loss": 3.3230371475219727, + "step": 1998, + "token_acc": 0.2671989337570965 + }, + { + "epoch": 1.1717971269422456, + "grad_norm": 0.5813224091701463, + "learning_rate": 0.0004998992023150955, + "loss": 3.3323206901550293, + "step": 1999, + "token_acc": 0.2647921108742004 + }, + { + "epoch": 1.1723834652594547, + "grad_norm": 0.8201661939092288, + "learning_rate": 0.0004998985131487975, + "loss": 3.343085289001465, + "step": 2000, + "token_acc": 0.26475652556830914 + }, + { + "epoch": 1.1729698035766638, + "grad_norm": 0.797415017670064, + "learning_rate": 0.0004998978216350398, + "loss": 3.3278908729553223, + "step": 2001, + "token_acc": 0.2651759870178098 + }, + { + "epoch": 1.1735561418938727, + "grad_norm": 0.4927705292189527, + "learning_rate": 0.0004998971277738286, + "loss": 3.292464256286621, + "step": 2002, + "token_acc": 0.2699018290896971 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.5610036063882936, + "learning_rate": 0.0004998964315651708, + "loss": 3.3572187423706055, + "step": 2003, + "token_acc": 0.26312081448195646 + }, + { + "epoch": 1.174728818528291, + "grad_norm": 0.6355199880436163, + "learning_rate": 0.0004998957330090727, + "loss": 3.3247575759887695, + "step": 2004, + "token_acc": 0.26561243987612837 + }, + { + "epoch": 1.1753151568454998, + "grad_norm": 0.5248953516221727, + "learning_rate": 0.000499895032105541, + "loss": 3.3342525959014893, + "step": 2005, + "token_acc": 0.2655909725931709 + }, + { + "epoch": 1.175901495162709, + "grad_norm": 0.47433152121930633, + "learning_rate": 0.0004998943288545821, + "loss": 3.299567461013794, + "step": 2006, + "token_acc": 0.2696991600278686 + }, + { + "epoch": 1.1764878334799178, + "grad_norm": 0.5432641689028098, + "learning_rate": 0.0004998936232562028, + "loss": 3.363025665283203, + "step": 2007, + "token_acc": 0.2599285984153667 + }, + { + "epoch": 1.177074171797127, + "grad_norm": 0.5067973984940682, + "learning_rate": 0.0004998929153104095, + "loss": 3.347628116607666, + "step": 2008, + "token_acc": 0.26132443971502356 + }, + { + "epoch": 1.177660510114336, + "grad_norm": 0.5082029427180463, + "learning_rate": 0.0004998922050172092, + "loss": 3.303496837615967, + "step": 2009, + "token_acc": 0.2681056016121417 + }, + { + "epoch": 1.178246848431545, + "grad_norm": 0.4029167060868648, + "learning_rate": 0.0004998914923766083, + "loss": 3.3124451637268066, + "step": 2010, + "token_acc": 0.26707596454138616 + }, + { + "epoch": 1.178833186748754, + "grad_norm": 0.41997556626733346, + "learning_rate": 0.0004998907773886136, + "loss": 3.258422374725342, + "step": 2011, + "token_acc": 0.27420642739368045 + }, + { + "epoch": 1.1794195250659631, + "grad_norm": 0.43706206607002823, + "learning_rate": 0.0004998900600532317, + "loss": 3.3531785011291504, + "step": 2012, + "token_acc": 0.26127253539479356 + }, + { + "epoch": 1.180005863383172, + "grad_norm": 0.4151720452922065, + "learning_rate": 0.0004998893403704694, + "loss": 3.3099682331085205, + "step": 2013, + "token_acc": 0.26929813654522944 + }, + { + "epoch": 1.1805922017003811, + "grad_norm": 0.5140371857316828, + "learning_rate": 0.0004998886183403335, + "loss": 3.309345006942749, + "step": 2014, + "token_acc": 0.26859707413574935 + }, + { + "epoch": 1.1811785400175903, + "grad_norm": 0.5869395215828356, + "learning_rate": 0.0004998878939628308, + "loss": 3.2969727516174316, + "step": 2015, + "token_acc": 0.2695838018453392 + }, + { + "epoch": 1.1817648783347992, + "grad_norm": 0.59960934027287, + "learning_rate": 0.000499887167237968, + "loss": 3.3286962509155273, + "step": 2016, + "token_acc": 0.26622080472114645 + }, + { + "epoch": 1.1823512166520083, + "grad_norm": 0.5920231438797806, + "learning_rate": 0.0004998864381657521, + "loss": 3.28824782371521, + "step": 2017, + "token_acc": 0.2701760379133556 + }, + { + "epoch": 1.1829375549692172, + "grad_norm": 0.5002944280232379, + "learning_rate": 0.0004998857067461897, + "loss": 3.321464776992798, + "step": 2018, + "token_acc": 0.26673569321074636 + }, + { + "epoch": 1.1835238932864263, + "grad_norm": 0.41974701657187496, + "learning_rate": 0.0004998849729792879, + "loss": 3.290827989578247, + "step": 2019, + "token_acc": 0.2691389995457702 + }, + { + "epoch": 1.1841102316036354, + "grad_norm": 0.507182419312586, + "learning_rate": 0.0004998842368650535, + "loss": 3.348458766937256, + "step": 2020, + "token_acc": 0.2634811592388583 + }, + { + "epoch": 1.1846965699208443, + "grad_norm": 0.534986168661541, + "learning_rate": 0.0004998834984034934, + "loss": 3.3211283683776855, + "step": 2021, + "token_acc": 0.2668648255962674 + }, + { + "epoch": 1.1852829082380534, + "grad_norm": 0.4254196401649543, + "learning_rate": 0.0004998827575946146, + "loss": 3.316277027130127, + "step": 2022, + "token_acc": 0.26781443242801495 + }, + { + "epoch": 1.1858692465552623, + "grad_norm": 0.4535068332589637, + "learning_rate": 0.000499882014438424, + "loss": 3.2859256267547607, + "step": 2023, + "token_acc": 0.27251482077146083 + }, + { + "epoch": 1.1864555848724714, + "grad_norm": 0.4942597283648693, + "learning_rate": 0.0004998812689349286, + "loss": 3.3470168113708496, + "step": 2024, + "token_acc": 0.2666321005924868 + }, + { + "epoch": 1.1870419231896805, + "grad_norm": 0.5022677914197416, + "learning_rate": 0.0004998805210841353, + "loss": 3.3218302726745605, + "step": 2025, + "token_acc": 0.2659985117663473 + }, + { + "epoch": 1.1876282615068894, + "grad_norm": 0.44267495812071506, + "learning_rate": 0.0004998797708860513, + "loss": 3.329369068145752, + "step": 2026, + "token_acc": 0.2633799117325032 + }, + { + "epoch": 1.1882145998240985, + "grad_norm": 0.5023398808775057, + "learning_rate": 0.0004998790183406835, + "loss": 3.3358349800109863, + "step": 2027, + "token_acc": 0.2649848562607861 + }, + { + "epoch": 1.1888009381413076, + "grad_norm": 0.6651362621601263, + "learning_rate": 0.0004998782634480391, + "loss": 3.3057913780212402, + "step": 2028, + "token_acc": 0.26732774779369434 + }, + { + "epoch": 1.1893872764585165, + "grad_norm": 0.7907328585268072, + "learning_rate": 0.0004998775062081251, + "loss": 3.323847770690918, + "step": 2029, + "token_acc": 0.2657837384744342 + }, + { + "epoch": 1.1899736147757256, + "grad_norm": 0.687031894732298, + "learning_rate": 0.0004998767466209488, + "loss": 3.331947088241577, + "step": 2030, + "token_acc": 0.26395333117368425 + }, + { + "epoch": 1.1905599530929347, + "grad_norm": 0.5672892746009975, + "learning_rate": 0.000499875984686517, + "loss": 3.291057586669922, + "step": 2031, + "token_acc": 0.2687209329319724 + }, + { + "epoch": 1.1911462914101436, + "grad_norm": 0.5686607015619654, + "learning_rate": 0.0004998752204048371, + "loss": 3.2978549003601074, + "step": 2032, + "token_acc": 0.27016012991284327 + }, + { + "epoch": 1.1917326297273527, + "grad_norm": 0.5832588955863457, + "learning_rate": 0.0004998744537759161, + "loss": 3.3795387744903564, + "step": 2033, + "token_acc": 0.2593577911917112 + }, + { + "epoch": 1.1923189680445616, + "grad_norm": 0.6169656167197718, + "learning_rate": 0.0004998736847997615, + "loss": 3.2823522090911865, + "step": 2034, + "token_acc": 0.27315726309349186 + }, + { + "epoch": 1.1929053063617707, + "grad_norm": 0.46286552827349114, + "learning_rate": 0.0004998729134763802, + "loss": 3.355337619781494, + "step": 2035, + "token_acc": 0.26245440306117085 + }, + { + "epoch": 1.1934916446789798, + "grad_norm": 0.44136780811216014, + "learning_rate": 0.0004998721398057797, + "loss": 3.2957332134246826, + "step": 2036, + "token_acc": 0.27055936738028835 + }, + { + "epoch": 1.1940779829961887, + "grad_norm": 0.4413040541938301, + "learning_rate": 0.000499871363787967, + "loss": 3.3235702514648438, + "step": 2037, + "token_acc": 0.26550061061581887 + }, + { + "epoch": 1.1946643213133978, + "grad_norm": 0.5388273982131709, + "learning_rate": 0.0004998705854229497, + "loss": 3.275033473968506, + "step": 2038, + "token_acc": 0.27238143882863397 + }, + { + "epoch": 1.195250659630607, + "grad_norm": 0.5449101509333466, + "learning_rate": 0.0004998698047107349, + "loss": 3.311260461807251, + "step": 2039, + "token_acc": 0.268357345969066 + }, + { + "epoch": 1.1958369979478158, + "grad_norm": 0.49513579424566173, + "learning_rate": 0.0004998690216513299, + "loss": 3.304572105407715, + "step": 2040, + "token_acc": 0.2675595750340317 + }, + { + "epoch": 1.196423336265025, + "grad_norm": 0.45019171030117655, + "learning_rate": 0.0004998682362447423, + "loss": 3.3287153244018555, + "step": 2041, + "token_acc": 0.26570323725696066 + }, + { + "epoch": 1.197009674582234, + "grad_norm": 0.4619610040695394, + "learning_rate": 0.0004998674484909794, + "loss": 3.3112473487854004, + "step": 2042, + "token_acc": 0.26907846887374404 + }, + { + "epoch": 1.197596012899443, + "grad_norm": 0.5159074386659498, + "learning_rate": 0.0004998666583900483, + "loss": 3.353604555130005, + "step": 2043, + "token_acc": 0.2624648839870981 + }, + { + "epoch": 1.198182351216652, + "grad_norm": 0.5278940524411757, + "learning_rate": 0.0004998658659419568, + "loss": 3.2849254608154297, + "step": 2044, + "token_acc": 0.27182364120902425 + }, + { + "epoch": 1.198768689533861, + "grad_norm": 0.4853549663762368, + "learning_rate": 0.000499865071146712, + "loss": 3.33880615234375, + "step": 2045, + "token_acc": 0.26337082955400865 + }, + { + "epoch": 1.19935502785107, + "grad_norm": 0.4967489983274647, + "learning_rate": 0.0004998642740043217, + "loss": 3.300752878189087, + "step": 2046, + "token_acc": 0.2708160560623555 + }, + { + "epoch": 1.1999413661682792, + "grad_norm": 0.5064700710445951, + "learning_rate": 0.0004998634745147931, + "loss": 3.255930185317993, + "step": 2047, + "token_acc": 0.2761124152152204 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.43717318165201213, + "learning_rate": 0.000499862672678134, + "loss": 3.3873236179351807, + "step": 2048, + "token_acc": 0.2602438504951549 + }, + { + "epoch": 1.2011140428026972, + "grad_norm": 0.5229383969261472, + "learning_rate": 0.0004998618684943517, + "loss": 3.313478946685791, + "step": 2049, + "token_acc": 0.2663258537185066 + }, + { + "epoch": 1.201700381119906, + "grad_norm": 0.5486282738089729, + "learning_rate": 0.0004998610619634539, + "loss": 3.333477735519409, + "step": 2050, + "token_acc": 0.26640099588201765 + }, + { + "epoch": 1.2022867194371152, + "grad_norm": 0.6476656598009072, + "learning_rate": 0.0004998602530854481, + "loss": 3.2968034744262695, + "step": 2051, + "token_acc": 0.27081136142650625 + }, + { + "epoch": 1.2028730577543243, + "grad_norm": 0.6190001487911446, + "learning_rate": 0.0004998594418603419, + "loss": 3.30812668800354, + "step": 2052, + "token_acc": 0.26787274997398813 + }, + { + "epoch": 1.2034593960715332, + "grad_norm": 0.5754424924334861, + "learning_rate": 0.0004998586282881429, + "loss": 3.2833471298217773, + "step": 2053, + "token_acc": 0.27251918888630455 + }, + { + "epoch": 1.2040457343887423, + "grad_norm": 0.43768417940362286, + "learning_rate": 0.0004998578123688589, + "loss": 3.3144237995147705, + "step": 2054, + "token_acc": 0.26716080534090314 + }, + { + "epoch": 1.2046320727059514, + "grad_norm": 0.44906101688761907, + "learning_rate": 0.0004998569941024973, + "loss": 3.249899387359619, + "step": 2055, + "token_acc": 0.27483905096651223 + }, + { + "epoch": 1.2052184110231603, + "grad_norm": 0.4423444422510579, + "learning_rate": 0.000499856173489066, + "loss": 3.330540180206299, + "step": 2056, + "token_acc": 0.2658774678582021 + }, + { + "epoch": 1.2058047493403694, + "grad_norm": 0.5284401008979768, + "learning_rate": 0.0004998553505285725, + "loss": 3.298649311065674, + "step": 2057, + "token_acc": 0.26917473693036514 + }, + { + "epoch": 1.2063910876575785, + "grad_norm": 0.697519914652804, + "learning_rate": 0.0004998545252210249, + "loss": 3.3638739585876465, + "step": 2058, + "token_acc": 0.26099888266984206 + }, + { + "epoch": 1.2069774259747874, + "grad_norm": 0.6063135151966607, + "learning_rate": 0.0004998536975664306, + "loss": 3.3142952919006348, + "step": 2059, + "token_acc": 0.26800522025803425 + }, + { + "epoch": 1.2075637642919965, + "grad_norm": 0.5454980827361674, + "learning_rate": 0.0004998528675647974, + "loss": 3.302705764770508, + "step": 2060, + "token_acc": 0.26873167715042007 + }, + { + "epoch": 1.2081501026092054, + "grad_norm": 0.5978719029382357, + "learning_rate": 0.0004998520352161334, + "loss": 3.3411378860473633, + "step": 2061, + "token_acc": 0.2640342038340417 + }, + { + "epoch": 1.2087364409264145, + "grad_norm": 0.5450771873655044, + "learning_rate": 0.000499851200520446, + "loss": 3.320988893508911, + "step": 2062, + "token_acc": 0.2677429261267367 + }, + { + "epoch": 1.2093227792436236, + "grad_norm": 0.48164115859238466, + "learning_rate": 0.0004998503634777434, + "loss": 3.3001298904418945, + "step": 2063, + "token_acc": 0.2694761614890696 + }, + { + "epoch": 1.2099091175608325, + "grad_norm": 0.5311665158489401, + "learning_rate": 0.0004998495240880333, + "loss": 3.2426795959472656, + "step": 2064, + "token_acc": 0.2765058602603121 + }, + { + "epoch": 1.2104954558780416, + "grad_norm": 0.5144423843601922, + "learning_rate": 0.0004998486823513236, + "loss": 3.3316783905029297, + "step": 2065, + "token_acc": 0.26528989157233374 + }, + { + "epoch": 1.2110817941952507, + "grad_norm": 0.4624926794183494, + "learning_rate": 0.0004998478382676221, + "loss": 3.338933229446411, + "step": 2066, + "token_acc": 0.2627043226262826 + }, + { + "epoch": 1.2116681325124596, + "grad_norm": 0.4484913570209329, + "learning_rate": 0.0004998469918369369, + "loss": 3.2829103469848633, + "step": 2067, + "token_acc": 0.2709052799136505 + }, + { + "epoch": 1.2122544708296688, + "grad_norm": 0.4221430128613876, + "learning_rate": 0.0004998461430592758, + "loss": 3.3139305114746094, + "step": 2068, + "token_acc": 0.2679495963691252 + }, + { + "epoch": 1.2128408091468779, + "grad_norm": 0.4490465945185443, + "learning_rate": 0.000499845291934647, + "loss": 3.26668643951416, + "step": 2069, + "token_acc": 0.27291121594981615 + }, + { + "epoch": 1.2134271474640868, + "grad_norm": 0.4291015903904159, + "learning_rate": 0.0004998444384630582, + "loss": 3.27933931350708, + "step": 2070, + "token_acc": 0.2720215409030009 + }, + { + "epoch": 1.2140134857812959, + "grad_norm": 0.4649709197684189, + "learning_rate": 0.0004998435826445177, + "loss": 3.3592960834503174, + "step": 2071, + "token_acc": 0.26242346537582933 + }, + { + "epoch": 1.2145998240985048, + "grad_norm": 0.5130252359673275, + "learning_rate": 0.0004998427244790333, + "loss": 3.3406920433044434, + "step": 2072, + "token_acc": 0.2639213790714686 + }, + { + "epoch": 1.2151861624157139, + "grad_norm": 0.5197240919438159, + "learning_rate": 0.0004998418639666133, + "loss": 3.3269901275634766, + "step": 2073, + "token_acc": 0.2651729202136647 + }, + { + "epoch": 1.215772500732923, + "grad_norm": 0.5873915998181899, + "learning_rate": 0.0004998410011072656, + "loss": 3.31498384475708, + "step": 2074, + "token_acc": 0.26813710894586074 + }, + { + "epoch": 1.2163588390501319, + "grad_norm": 0.5929268493264973, + "learning_rate": 0.0004998401359009983, + "loss": 3.306807518005371, + "step": 2075, + "token_acc": 0.2696637558755186 + }, + { + "epoch": 1.216945177367341, + "grad_norm": 0.5658766916562017, + "learning_rate": 0.0004998392683478196, + "loss": 3.266423225402832, + "step": 2076, + "token_acc": 0.27343681169184525 + }, + { + "epoch": 1.2175315156845499, + "grad_norm": 0.590874438978482, + "learning_rate": 0.0004998383984477378, + "loss": 3.2856032848358154, + "step": 2077, + "token_acc": 0.2710271632523156 + }, + { + "epoch": 1.218117854001759, + "grad_norm": 0.5590312966286752, + "learning_rate": 0.0004998375262007607, + "loss": 3.2995848655700684, + "step": 2078, + "token_acc": 0.26880451380800785 + }, + { + "epoch": 1.218704192318968, + "grad_norm": 0.5657210950305598, + "learning_rate": 0.0004998366516068968, + "loss": 3.287076711654663, + "step": 2079, + "token_acc": 0.27062500641005915 + }, + { + "epoch": 1.219290530636177, + "grad_norm": 0.6294065023184018, + "learning_rate": 0.0004998357746661542, + "loss": 3.3046817779541016, + "step": 2080, + "token_acc": 0.26814411578118874 + }, + { + "epoch": 1.219876868953386, + "grad_norm": 0.5717863710469163, + "learning_rate": 0.0004998348953785412, + "loss": 3.2898173332214355, + "step": 2081, + "token_acc": 0.27171867972781294 + }, + { + "epoch": 1.2204632072705952, + "grad_norm": 0.5591427533700243, + "learning_rate": 0.000499834013744066, + "loss": 3.3410966396331787, + "step": 2082, + "token_acc": 0.2637180668862226 + }, + { + "epoch": 1.221049545587804, + "grad_norm": 0.5732897584919422, + "learning_rate": 0.0004998331297627368, + "loss": 3.3258461952209473, + "step": 2083, + "token_acc": 0.265448460822719 + }, + { + "epoch": 1.2216358839050132, + "grad_norm": 0.5453331744023011, + "learning_rate": 0.0004998322434345621, + "loss": 3.301168441772461, + "step": 2084, + "token_acc": 0.26884017989131564 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.5236881772813752, + "learning_rate": 0.0004998313547595501, + "loss": 3.3436872959136963, + "step": 2085, + "token_acc": 0.26186578100069496 + }, + { + "epoch": 1.2228085605394312, + "grad_norm": 0.5535504529491678, + "learning_rate": 0.0004998304637377091, + "loss": 3.3017029762268066, + "step": 2086, + "token_acc": 0.26890689984453053 + }, + { + "epoch": 1.2233948988566403, + "grad_norm": 0.5235068998876725, + "learning_rate": 0.0004998295703690476, + "loss": 3.330578327178955, + "step": 2087, + "token_acc": 0.2669153282634098 + }, + { + "epoch": 1.2239812371738492, + "grad_norm": 0.4616826402754192, + "learning_rate": 0.000499828674653574, + "loss": 3.28640079498291, + "step": 2088, + "token_acc": 0.27065902187996704 + }, + { + "epoch": 1.2245675754910583, + "grad_norm": 0.48382335926060116, + "learning_rate": 0.0004998277765912966, + "loss": 3.3055338859558105, + "step": 2089, + "token_acc": 0.26869214531536195 + }, + { + "epoch": 1.2251539138082674, + "grad_norm": 0.5245911342083793, + "learning_rate": 0.0004998268761822239, + "loss": 3.306577682495117, + "step": 2090, + "token_acc": 0.2673301839673687 + }, + { + "epoch": 1.2257402521254763, + "grad_norm": 0.5675000733721677, + "learning_rate": 0.0004998259734263643, + "loss": 3.313633441925049, + "step": 2091, + "token_acc": 0.2680983913723904 + }, + { + "epoch": 1.2263265904426854, + "grad_norm": 0.44073167989521644, + "learning_rate": 0.0004998250683237264, + "loss": 3.2866435050964355, + "step": 2092, + "token_acc": 0.27021551311475833 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.47749182330955664, + "learning_rate": 0.0004998241608743185, + "loss": 3.322282314300537, + "step": 2093, + "token_acc": 0.26587579589019494 + }, + { + "epoch": 1.2274992670771034, + "grad_norm": 0.5065970450133327, + "learning_rate": 0.0004998232510781494, + "loss": 3.3330321311950684, + "step": 2094, + "token_acc": 0.2635963820851256 + }, + { + "epoch": 1.2280856053943126, + "grad_norm": 0.407741121959121, + "learning_rate": 0.0004998223389352275, + "loss": 3.3167710304260254, + "step": 2095, + "token_acc": 0.26751700454780497 + }, + { + "epoch": 1.2286719437115217, + "grad_norm": 0.46011877455774647, + "learning_rate": 0.0004998214244455612, + "loss": 3.2706167697906494, + "step": 2096, + "token_acc": 0.27219761310604523 + }, + { + "epoch": 1.2292582820287306, + "grad_norm": 0.46780702669159663, + "learning_rate": 0.0004998205076091593, + "loss": 3.3330459594726562, + "step": 2097, + "token_acc": 0.2654161433645289 + }, + { + "epoch": 1.2298446203459397, + "grad_norm": 0.4105843333920586, + "learning_rate": 0.0004998195884260304, + "loss": 3.30873966217041, + "step": 2098, + "token_acc": 0.2671012527392933 + }, + { + "epoch": 1.2304309586631486, + "grad_norm": 0.4387013374158282, + "learning_rate": 0.0004998186668961832, + "loss": 3.2975692749023438, + "step": 2099, + "token_acc": 0.2701121130568983 + }, + { + "epoch": 1.2310172969803577, + "grad_norm": 0.4198984972799839, + "learning_rate": 0.0004998177430196261, + "loss": 3.34445858001709, + "step": 2100, + "token_acc": 0.2638136752181451 + }, + { + "epoch": 1.2316036352975668, + "grad_norm": 0.4457632350662402, + "learning_rate": 0.000499816816796368, + "loss": 3.328608512878418, + "step": 2101, + "token_acc": 0.2658296100345539 + }, + { + "epoch": 1.2321899736147757, + "grad_norm": 0.47800049833001435, + "learning_rate": 0.0004998158882264177, + "loss": 3.398808479309082, + "step": 2102, + "token_acc": 0.25723926445465445 + }, + { + "epoch": 1.2327763119319848, + "grad_norm": 0.531390804076189, + "learning_rate": 0.0004998149573097835, + "loss": 3.3021271228790283, + "step": 2103, + "token_acc": 0.2702262069113705 + }, + { + "epoch": 1.2333626502491937, + "grad_norm": 0.5415415266687958, + "learning_rate": 0.0004998140240464746, + "loss": 3.3238556385040283, + "step": 2104, + "token_acc": 0.2662548984127927 + }, + { + "epoch": 1.2339489885664028, + "grad_norm": 0.5520885664623458, + "learning_rate": 0.0004998130884364994, + "loss": 3.3366012573242188, + "step": 2105, + "token_acc": 0.2627775608087584 + }, + { + "epoch": 1.234535326883612, + "grad_norm": 0.6038190730308144, + "learning_rate": 0.000499812150479867, + "loss": 3.3262522220611572, + "step": 2106, + "token_acc": 0.26526967337440255 + }, + { + "epoch": 1.2351216652008208, + "grad_norm": 0.6860717149168699, + "learning_rate": 0.0004998112101765861, + "loss": 3.3490939140319824, + "step": 2107, + "token_acc": 0.26322550628739805 + }, + { + "epoch": 1.23570800351803, + "grad_norm": 0.6127259214853676, + "learning_rate": 0.0004998102675266654, + "loss": 3.362032413482666, + "step": 2108, + "token_acc": 0.2611543173846971 + }, + { + "epoch": 1.236294341835239, + "grad_norm": 0.5163032196851721, + "learning_rate": 0.0004998093225301139, + "loss": 3.2941439151763916, + "step": 2109, + "token_acc": 0.26956833872212715 + }, + { + "epoch": 1.236880680152448, + "grad_norm": 0.5091333054474084, + "learning_rate": 0.0004998083751869405, + "loss": 3.3019800186157227, + "step": 2110, + "token_acc": 0.26967103552774285 + }, + { + "epoch": 1.237467018469657, + "grad_norm": 0.4794440213897465, + "learning_rate": 0.0004998074254971539, + "loss": 3.3275718688964844, + "step": 2111, + "token_acc": 0.26466550531707445 + }, + { + "epoch": 1.2380533567868661, + "grad_norm": 0.5205151099699346, + "learning_rate": 0.0004998064734607632, + "loss": 3.2655792236328125, + "step": 2112, + "token_acc": 0.27337097576150426 + }, + { + "epoch": 1.238639695104075, + "grad_norm": 0.4936815382841612, + "learning_rate": 0.0004998055190777774, + "loss": 3.3203606605529785, + "step": 2113, + "token_acc": 0.2668731624807244 + }, + { + "epoch": 1.2392260334212841, + "grad_norm": 0.5191613155982038, + "learning_rate": 0.0004998045623482053, + "loss": 3.2999205589294434, + "step": 2114, + "token_acc": 0.26844001467003215 + }, + { + "epoch": 1.239812371738493, + "grad_norm": 0.6418739712426673, + "learning_rate": 0.0004998036032720558, + "loss": 3.320920467376709, + "step": 2115, + "token_acc": 0.26616907268486417 + }, + { + "epoch": 1.2403987100557021, + "grad_norm": 0.6718805892842361, + "learning_rate": 0.0004998026418493383, + "loss": 3.326570987701416, + "step": 2116, + "token_acc": 0.26596548444082074 + }, + { + "epoch": 1.2409850483729112, + "grad_norm": 0.5820633407029908, + "learning_rate": 0.0004998016780800615, + "loss": 3.286497116088867, + "step": 2117, + "token_acc": 0.27011828233479046 + }, + { + "epoch": 1.2415713866901201, + "grad_norm": 0.4704664072097044, + "learning_rate": 0.0004998007119642345, + "loss": 3.347278118133545, + "step": 2118, + "token_acc": 0.262891077998943 + }, + { + "epoch": 1.2421577250073292, + "grad_norm": 0.49139195510202993, + "learning_rate": 0.0004997997435018665, + "loss": 3.3077545166015625, + "step": 2119, + "token_acc": 0.26853775102858224 + }, + { + "epoch": 1.2427440633245384, + "grad_norm": 0.5887750309598477, + "learning_rate": 0.0004997987726929664, + "loss": 3.336679458618164, + "step": 2120, + "token_acc": 0.2630566379258954 + }, + { + "epoch": 1.2433304016417472, + "grad_norm": 0.5823319978781171, + "learning_rate": 0.0004997977995375436, + "loss": 3.3195934295654297, + "step": 2121, + "token_acc": 0.26626570901608426 + }, + { + "epoch": 1.2439167399589564, + "grad_norm": 0.5267169948019839, + "learning_rate": 0.000499796824035607, + "loss": 3.3138718605041504, + "step": 2122, + "token_acc": 0.2679616646298321 + }, + { + "epoch": 1.2445030782761655, + "grad_norm": 0.4571683439465644, + "learning_rate": 0.0004997958461871658, + "loss": 3.2633495330810547, + "step": 2123, + "token_acc": 0.2727988990018367 + }, + { + "epoch": 1.2450894165933744, + "grad_norm": 0.46302262742243044, + "learning_rate": 0.0004997948659922293, + "loss": 3.3327198028564453, + "step": 2124, + "token_acc": 0.2654166890699931 + }, + { + "epoch": 1.2456757549105835, + "grad_norm": 0.4175375683138514, + "learning_rate": 0.0004997938834508067, + "loss": 3.3322274684906006, + "step": 2125, + "token_acc": 0.26659637800390845 + }, + { + "epoch": 1.2462620932277924, + "grad_norm": 0.40134172357393805, + "learning_rate": 0.000499792898562907, + "loss": 3.36427640914917, + "step": 2126, + "token_acc": 0.25978651339423176 + }, + { + "epoch": 1.2468484315450015, + "grad_norm": 0.40208596100362987, + "learning_rate": 0.0004997919113285397, + "loss": 3.345595121383667, + "step": 2127, + "token_acc": 0.2607705988188507 + }, + { + "epoch": 1.2474347698622106, + "grad_norm": 0.4513734282016161, + "learning_rate": 0.000499790921747714, + "loss": 3.3036911487579346, + "step": 2128, + "token_acc": 0.2693736034325399 + }, + { + "epoch": 1.2480211081794195, + "grad_norm": 0.47616020535154974, + "learning_rate": 0.0004997899298204391, + "loss": 3.2801637649536133, + "step": 2129, + "token_acc": 0.27253698856088465 + }, + { + "epoch": 1.2486074464966286, + "grad_norm": 0.5012025818456276, + "learning_rate": 0.0004997889355467245, + "loss": 3.3106446266174316, + "step": 2130, + "token_acc": 0.2670502049868943 + }, + { + "epoch": 1.2491937848138375, + "grad_norm": 0.4650654240770107, + "learning_rate": 0.0004997879389265795, + "loss": 3.3095996379852295, + "step": 2131, + "token_acc": 0.2683281280551284 + }, + { + "epoch": 1.2497801231310466, + "grad_norm": 0.48766268190844136, + "learning_rate": 0.0004997869399600134, + "loss": 3.319863796234131, + "step": 2132, + "token_acc": 0.2664853453523462 + }, + { + "epoch": 1.2503664614482557, + "grad_norm": 0.6404063018357209, + "learning_rate": 0.0004997859386470355, + "loss": 3.3398847579956055, + "step": 2133, + "token_acc": 0.2614067022768643 + }, + { + "epoch": 1.2509527997654648, + "grad_norm": 0.6695813267925528, + "learning_rate": 0.0004997849349876553, + "loss": 3.349310874938965, + "step": 2134, + "token_acc": 0.26202523178187753 + }, + { + "epoch": 1.2515391380826737, + "grad_norm": 0.6495189887764161, + "learning_rate": 0.0004997839289818823, + "loss": 3.3054168224334717, + "step": 2135, + "token_acc": 0.2681828124920513 + }, + { + "epoch": 1.2521254763998828, + "grad_norm": 0.5852846148002033, + "learning_rate": 0.0004997829206297257, + "loss": 3.3067233562469482, + "step": 2136, + "token_acc": 0.2691477116460054 + }, + { + "epoch": 1.2527118147170917, + "grad_norm": 0.4558384604605822, + "learning_rate": 0.0004997819099311953, + "loss": 3.291916847229004, + "step": 2137, + "token_acc": 0.2710718645635584 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.4837480430820536, + "learning_rate": 0.0004997808968863005, + "loss": 3.2962217330932617, + "step": 2138, + "token_acc": 0.268129293252959 + }, + { + "epoch": 1.25388449135151, + "grad_norm": 0.5853034581896214, + "learning_rate": 0.0004997798814950506, + "loss": 3.2862343788146973, + "step": 2139, + "token_acc": 0.2698820319396517 + }, + { + "epoch": 1.2544708296687188, + "grad_norm": 0.4442622420681341, + "learning_rate": 0.0004997788637574554, + "loss": 3.352389335632324, + "step": 2140, + "token_acc": 0.2634235055776913 + }, + { + "epoch": 1.255057167985928, + "grad_norm": 0.47521460412611005, + "learning_rate": 0.0004997778436735243, + "loss": 3.2933759689331055, + "step": 2141, + "token_acc": 0.27030047410064184 + }, + { + "epoch": 1.2556435063031368, + "grad_norm": 0.4943872005971034, + "learning_rate": 0.0004997768212432669, + "loss": 3.2984228134155273, + "step": 2142, + "token_acc": 0.26990768306485785 + }, + { + "epoch": 1.256229844620346, + "grad_norm": 0.4525343170053042, + "learning_rate": 0.000499775796466693, + "loss": 3.297734260559082, + "step": 2143, + "token_acc": 0.26891125437655844 + }, + { + "epoch": 1.256816182937555, + "grad_norm": 0.5259778817212585, + "learning_rate": 0.000499774769343812, + "loss": 3.3735077381134033, + "step": 2144, + "token_acc": 0.26067582715786375 + }, + { + "epoch": 1.257402521254764, + "grad_norm": 0.591554418910151, + "learning_rate": 0.0004997737398746336, + "loss": 3.3246655464172363, + "step": 2145, + "token_acc": 0.26484505129592495 + }, + { + "epoch": 1.257988859571973, + "grad_norm": 0.5442635455552316, + "learning_rate": 0.0004997727080591674, + "loss": 3.3255317211151123, + "step": 2146, + "token_acc": 0.26505455770676206 + }, + { + "epoch": 1.258575197889182, + "grad_norm": 0.4990162605324992, + "learning_rate": 0.0004997716738974233, + "loss": 3.2527995109558105, + "step": 2147, + "token_acc": 0.2746930182999369 + }, + { + "epoch": 1.259161536206391, + "grad_norm": 0.500296782374342, + "learning_rate": 0.0004997706373894109, + "loss": 3.3446402549743652, + "step": 2148, + "token_acc": 0.2630214093401774 + }, + { + "epoch": 1.2597478745236002, + "grad_norm": 0.45665751983693237, + "learning_rate": 0.0004997695985351398, + "loss": 3.256342887878418, + "step": 2149, + "token_acc": 0.27315003671371463 + }, + { + "epoch": 1.2603342128408093, + "grad_norm": 0.3918018089168429, + "learning_rate": 0.00049976855733462, + "loss": 3.2846970558166504, + "step": 2150, + "token_acc": 0.27114200481390005 + }, + { + "epoch": 1.2609205511580182, + "grad_norm": 0.41335462526156974, + "learning_rate": 0.0004997675137878611, + "loss": 3.288651943206787, + "step": 2151, + "token_acc": 0.2700356643268516 + }, + { + "epoch": 1.2615068894752273, + "grad_norm": 0.40587142651340774, + "learning_rate": 0.000499766467894873, + "loss": 3.2813515663146973, + "step": 2152, + "token_acc": 0.27146011808742226 + }, + { + "epoch": 1.2620932277924362, + "grad_norm": 0.39450349765745246, + "learning_rate": 0.0004997654196556656, + "loss": 3.286311388015747, + "step": 2153, + "token_acc": 0.2697991785183897 + }, + { + "epoch": 1.2626795661096453, + "grad_norm": 0.44544252814626434, + "learning_rate": 0.0004997643690702486, + "loss": 3.301556348800659, + "step": 2154, + "token_acc": 0.26922668727600274 + }, + { + "epoch": 1.2632659044268544, + "grad_norm": 0.5963410556025828, + "learning_rate": 0.0004997633161386318, + "loss": 3.2753562927246094, + "step": 2155, + "token_acc": 0.27290407705502046 + }, + { + "epoch": 1.2638522427440633, + "grad_norm": 0.44640056642794085, + "learning_rate": 0.0004997622608608253, + "loss": 3.2942678928375244, + "step": 2156, + "token_acc": 0.26906797266136234 + }, + { + "epoch": 1.2644385810612724, + "grad_norm": 0.48046694174735866, + "learning_rate": 0.000499761203236839, + "loss": 3.3624701499938965, + "step": 2157, + "token_acc": 0.2598998013347575 + }, + { + "epoch": 1.2650249193784813, + "grad_norm": 0.5122597343770945, + "learning_rate": 0.0004997601432666826, + "loss": 3.335897922515869, + "step": 2158, + "token_acc": 0.2632955307042328 + }, + { + "epoch": 1.2656112576956904, + "grad_norm": 0.5243895145402654, + "learning_rate": 0.0004997590809503662, + "loss": 3.283290386199951, + "step": 2159, + "token_acc": 0.2707971217154701 + }, + { + "epoch": 1.2661975960128995, + "grad_norm": 0.4555532403765818, + "learning_rate": 0.0004997580162879, + "loss": 3.280341386795044, + "step": 2160, + "token_acc": 0.27032714112894957 + }, + { + "epoch": 1.2667839343301086, + "grad_norm": 0.4809312677690126, + "learning_rate": 0.0004997569492792936, + "loss": 3.299182891845703, + "step": 2161, + "token_acc": 0.26816311087431566 + }, + { + "epoch": 1.2673702726473175, + "grad_norm": 0.5189217449957898, + "learning_rate": 0.0004997558799245572, + "loss": 3.269404649734497, + "step": 2162, + "token_acc": 0.2733627899637052 + }, + { + "epoch": 1.2679566109645266, + "grad_norm": 0.47306107009117765, + "learning_rate": 0.000499754808223701, + "loss": 3.3310341835021973, + "step": 2163, + "token_acc": 0.26330186040937 + }, + { + "epoch": 1.2685429492817355, + "grad_norm": 0.5576696509813547, + "learning_rate": 0.0004997537341767348, + "loss": 3.2655444145202637, + "step": 2164, + "token_acc": 0.27113668662488644 + }, + { + "epoch": 1.2691292875989446, + "grad_norm": 0.5634013829567075, + "learning_rate": 0.0004997526577836689, + "loss": 3.2641098499298096, + "step": 2165, + "token_acc": 0.27565313904434147 + }, + { + "epoch": 1.2697156259161537, + "grad_norm": 0.5006032753112797, + "learning_rate": 0.0004997515790445133, + "loss": 3.29668927192688, + "step": 2166, + "token_acc": 0.26810630976886207 + }, + { + "epoch": 1.2703019642333626, + "grad_norm": 0.5463232411828371, + "learning_rate": 0.0004997504979592781, + "loss": 3.3136603832244873, + "step": 2167, + "token_acc": 0.26847550675675674 + }, + { + "epoch": 1.2708883025505717, + "grad_norm": 0.6108799668056303, + "learning_rate": 0.0004997494145279735, + "loss": 3.3317110538482666, + "step": 2168, + "token_acc": 0.26250752845369424 + }, + { + "epoch": 1.2714746408677806, + "grad_norm": 0.5068173858876764, + "learning_rate": 0.0004997483287506098, + "loss": 3.325237274169922, + "step": 2169, + "token_acc": 0.2638804811796663 + }, + { + "epoch": 1.2720609791849897, + "grad_norm": 0.4936194502911628, + "learning_rate": 0.000499747240627197, + "loss": 3.3076202869415283, + "step": 2170, + "token_acc": 0.2673911492882544 + }, + { + "epoch": 1.2726473175021988, + "grad_norm": 0.5033630673852391, + "learning_rate": 0.0004997461501577455, + "loss": 3.3397345542907715, + "step": 2171, + "token_acc": 0.2633098611537578 + }, + { + "epoch": 1.2732336558194077, + "grad_norm": 0.4178462490460003, + "learning_rate": 0.0004997450573422654, + "loss": 3.327944755554199, + "step": 2172, + "token_acc": 0.2650836931585574 + }, + { + "epoch": 1.2738199941366168, + "grad_norm": 0.452308088754542, + "learning_rate": 0.0004997439621807671, + "loss": 3.3097381591796875, + "step": 2173, + "token_acc": 0.267644548518405 + }, + { + "epoch": 1.2744063324538257, + "grad_norm": 0.4931493824872053, + "learning_rate": 0.0004997428646732607, + "loss": 3.303246259689331, + "step": 2174, + "token_acc": 0.26850617613080957 + }, + { + "epoch": 1.2749926707710348, + "grad_norm": 0.43023303548659314, + "learning_rate": 0.0004997417648197566, + "loss": 3.3206167221069336, + "step": 2175, + "token_acc": 0.2657444993148383 + }, + { + "epoch": 1.275579009088244, + "grad_norm": 0.41911010236334423, + "learning_rate": 0.0004997406626202653, + "loss": 3.3061723709106445, + "step": 2176, + "token_acc": 0.26769735288852453 + }, + { + "epoch": 1.276165347405453, + "grad_norm": 0.4209302323210968, + "learning_rate": 0.0004997395580747969, + "loss": 3.298943519592285, + "step": 2177, + "token_acc": 0.2690348106492787 + }, + { + "epoch": 1.276751685722662, + "grad_norm": 0.4617857219841965, + "learning_rate": 0.0004997384511833619, + "loss": 3.3021743297576904, + "step": 2178, + "token_acc": 0.26724531749657376 + }, + { + "epoch": 1.277338024039871, + "grad_norm": 0.41443708513598293, + "learning_rate": 0.0004997373419459707, + "loss": 3.288419485092163, + "step": 2179, + "token_acc": 0.26922466332118017 + }, + { + "epoch": 1.27792436235708, + "grad_norm": 0.5068318801664615, + "learning_rate": 0.0004997362303626337, + "loss": 3.3216655254364014, + "step": 2180, + "token_acc": 0.2640912369749846 + }, + { + "epoch": 1.278510700674289, + "grad_norm": 0.5615175594461506, + "learning_rate": 0.0004997351164333612, + "loss": 3.3075475692749023, + "step": 2181, + "token_acc": 0.2668439366562251 + }, + { + "epoch": 1.2790970389914982, + "grad_norm": 0.6616765764690321, + "learning_rate": 0.0004997340001581639, + "loss": 3.342186689376831, + "step": 2182, + "token_acc": 0.26381982130697645 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.5792090207835741, + "learning_rate": 0.0004997328815370524, + "loss": 3.299501657485962, + "step": 2183, + "token_acc": 0.26922838469685617 + }, + { + "epoch": 1.2802697156259162, + "grad_norm": 0.5036005331718914, + "learning_rate": 0.0004997317605700366, + "loss": 3.2714381217956543, + "step": 2184, + "token_acc": 0.27095021571896166 + }, + { + "epoch": 1.280856053943125, + "grad_norm": 0.5290923695339709, + "learning_rate": 0.0004997306372571278, + "loss": 3.3222498893737793, + "step": 2185, + "token_acc": 0.26354727080282764 + }, + { + "epoch": 1.2814423922603342, + "grad_norm": 0.5654401740763333, + "learning_rate": 0.0004997295115983359, + "loss": 3.2666594982147217, + "step": 2186, + "token_acc": 0.2723970572424703 + }, + { + "epoch": 1.2820287305775433, + "grad_norm": 0.4596166410423911, + "learning_rate": 0.0004997283835936719, + "loss": 3.336409091949463, + "step": 2187, + "token_acc": 0.2625966165499362 + }, + { + "epoch": 1.2826150688947524, + "grad_norm": 0.5382056117059197, + "learning_rate": 0.0004997272532431462, + "loss": 3.325645923614502, + "step": 2188, + "token_acc": 0.264492223561876 + }, + { + "epoch": 1.2832014072119613, + "grad_norm": 0.5291570545348604, + "learning_rate": 0.0004997261205467694, + "loss": 3.27892804145813, + "step": 2189, + "token_acc": 0.27071375428567046 + }, + { + "epoch": 1.2837877455291704, + "grad_norm": 0.4862945935625387, + "learning_rate": 0.0004997249855045523, + "loss": 3.2991814613342285, + "step": 2190, + "token_acc": 0.2692623266410517 + }, + { + "epoch": 1.2843740838463793, + "grad_norm": 0.49025649432895674, + "learning_rate": 0.0004997238481165055, + "loss": 3.299318790435791, + "step": 2191, + "token_acc": 0.26810181380684595 + }, + { + "epoch": 1.2849604221635884, + "grad_norm": 0.47720798043497037, + "learning_rate": 0.0004997227083826396, + "loss": 3.3071675300598145, + "step": 2192, + "token_acc": 0.26961167415106335 + }, + { + "epoch": 1.2855467604807975, + "grad_norm": 0.46856350221331255, + "learning_rate": 0.0004997215663029654, + "loss": 3.3346195220947266, + "step": 2193, + "token_acc": 0.262274133468022 + }, + { + "epoch": 1.2861330987980064, + "grad_norm": 0.444839551729711, + "learning_rate": 0.0004997204218774936, + "loss": 3.270496368408203, + "step": 2194, + "token_acc": 0.27275033785707997 + }, + { + "epoch": 1.2867194371152155, + "grad_norm": 0.5146535595751268, + "learning_rate": 0.0004997192751062349, + "loss": 3.30252742767334, + "step": 2195, + "token_acc": 0.2686239244619755 + }, + { + "epoch": 1.2873057754324244, + "grad_norm": 0.5829457858013953, + "learning_rate": 0.0004997181259892001, + "loss": 3.376220226287842, + "step": 2196, + "token_acc": 0.258965831284834 + }, + { + "epoch": 1.2878921137496335, + "grad_norm": 0.5547005144293703, + "learning_rate": 0.0004997169745264, + "loss": 3.288825750350952, + "step": 2197, + "token_acc": 0.2712078192662799 + }, + { + "epoch": 1.2884784520668426, + "grad_norm": 0.4379835936649123, + "learning_rate": 0.0004997158207178454, + "loss": 3.3246407508850098, + "step": 2198, + "token_acc": 0.26629575356329593 + }, + { + "epoch": 1.2890647903840515, + "grad_norm": 0.48753978266355136, + "learning_rate": 0.0004997146645635473, + "loss": 3.3396501541137695, + "step": 2199, + "token_acc": 0.2652016704013037 + }, + { + "epoch": 1.2896511287012606, + "grad_norm": 0.6188543684437843, + "learning_rate": 0.0004997135060635163, + "loss": 3.2410683631896973, + "step": 2200, + "token_acc": 0.2769529270604421 + }, + { + "epoch": 1.2902374670184695, + "grad_norm": 0.5105522784304846, + "learning_rate": 0.0004997123452177635, + "loss": 3.3327670097351074, + "step": 2201, + "token_acc": 0.2650352519103809 + }, + { + "epoch": 1.2908238053356786, + "grad_norm": 0.49442579958638455, + "learning_rate": 0.0004997111820262995, + "loss": 3.271937608718872, + "step": 2202, + "token_acc": 0.27052095629759754 + }, + { + "epoch": 1.2914101436528878, + "grad_norm": 0.5361497376013924, + "learning_rate": 0.0004997100164891356, + "loss": 3.231008291244507, + "step": 2203, + "token_acc": 0.27780742507952455 + }, + { + "epoch": 1.2919964819700969, + "grad_norm": 0.4377373573893023, + "learning_rate": 0.0004997088486062825, + "loss": 3.3321752548217773, + "step": 2204, + "token_acc": 0.26604221105527637 + }, + { + "epoch": 1.2925828202873058, + "grad_norm": 0.46926770006513724, + "learning_rate": 0.0004997076783777513, + "loss": 3.2693886756896973, + "step": 2205, + "token_acc": 0.2738393943343761 + }, + { + "epoch": 1.2931691586045149, + "grad_norm": 0.4960038827735289, + "learning_rate": 0.0004997065058035531, + "loss": 3.2350211143493652, + "step": 2206, + "token_acc": 0.27611864695971167 + }, + { + "epoch": 1.2937554969217238, + "grad_norm": 0.5083223173561959, + "learning_rate": 0.0004997053308836985, + "loss": 3.3048973083496094, + "step": 2207, + "token_acc": 0.26771323662834756 + }, + { + "epoch": 1.2943418352389329, + "grad_norm": 0.4522092033218727, + "learning_rate": 0.0004997041536181989, + "loss": 3.2984819412231445, + "step": 2208, + "token_acc": 0.2691531380152969 + }, + { + "epoch": 1.294928173556142, + "grad_norm": 0.48258725121531354, + "learning_rate": 0.0004997029740070653, + "loss": 3.3176679611206055, + "step": 2209, + "token_acc": 0.26765786536963343 + }, + { + "epoch": 1.2955145118733509, + "grad_norm": 0.42293475133752956, + "learning_rate": 0.0004997017920503088, + "loss": 3.3052749633789062, + "step": 2210, + "token_acc": 0.2670055158665964 + }, + { + "epoch": 1.29610085019056, + "grad_norm": 0.44979381404927116, + "learning_rate": 0.0004997006077479402, + "loss": 3.3270926475524902, + "step": 2211, + "token_acc": 0.26639968670536357 + }, + { + "epoch": 1.2966871885077689, + "grad_norm": 0.450920512801608, + "learning_rate": 0.0004996994210999711, + "loss": 3.2868075370788574, + "step": 2212, + "token_acc": 0.27022886119658035 + }, + { + "epoch": 1.297273526824978, + "grad_norm": 0.4264177799939463, + "learning_rate": 0.0004996982321064123, + "loss": 3.282705307006836, + "step": 2213, + "token_acc": 0.2722188641424239 + }, + { + "epoch": 1.297859865142187, + "grad_norm": 0.4003885566222595, + "learning_rate": 0.0004996970407672751, + "loss": 3.299790620803833, + "step": 2214, + "token_acc": 0.2674795253347281 + }, + { + "epoch": 1.2984462034593962, + "grad_norm": 0.3771369900651132, + "learning_rate": 0.0004996958470825706, + "loss": 3.3144707679748535, + "step": 2215, + "token_acc": 0.26723066436790827 + }, + { + "epoch": 1.299032541776605, + "grad_norm": 0.5038678633378196, + "learning_rate": 0.0004996946510523102, + "loss": 3.293795585632324, + "step": 2216, + "token_acc": 0.2704631703250542 + }, + { + "epoch": 1.2996188800938142, + "grad_norm": 0.4888576301119778, + "learning_rate": 0.0004996934526765051, + "loss": 3.316286087036133, + "step": 2217, + "token_acc": 0.26507311601002703 + }, + { + "epoch": 1.300205218411023, + "grad_norm": 0.49161036521898716, + "learning_rate": 0.0004996922519551663, + "loss": 3.3045406341552734, + "step": 2218, + "token_acc": 0.2670406980069572 + }, + { + "epoch": 1.3007915567282322, + "grad_norm": 0.49902279908074704, + "learning_rate": 0.0004996910488883053, + "loss": 3.2669148445129395, + "step": 2219, + "token_acc": 0.2716930840395937 + }, + { + "epoch": 1.3013778950454413, + "grad_norm": 0.4951241312204706, + "learning_rate": 0.0004996898434759334, + "loss": 3.271190643310547, + "step": 2220, + "token_acc": 0.2707422658084419 + }, + { + "epoch": 1.3019642333626502, + "grad_norm": 0.47242105791113687, + "learning_rate": 0.0004996886357180619, + "loss": 3.3263590335845947, + "step": 2221, + "token_acc": 0.2663918028399651 + }, + { + "epoch": 1.3025505716798593, + "grad_norm": 0.4470259326086328, + "learning_rate": 0.0004996874256147021, + "loss": 3.2767844200134277, + "step": 2222, + "token_acc": 0.2722967517545494 + }, + { + "epoch": 1.3031369099970682, + "grad_norm": 0.4701214307860167, + "learning_rate": 0.0004996862131658653, + "loss": 3.268568277359009, + "step": 2223, + "token_acc": 0.27282175690822885 + }, + { + "epoch": 1.3037232483142773, + "grad_norm": 0.45613686172146267, + "learning_rate": 0.0004996849983715631, + "loss": 3.2823128700256348, + "step": 2224, + "token_acc": 0.2709843818683501 + }, + { + "epoch": 1.3043095866314864, + "grad_norm": 0.421122404917672, + "learning_rate": 0.0004996837812318068, + "loss": 3.2828450202941895, + "step": 2225, + "token_acc": 0.2702336235453052 + }, + { + "epoch": 1.3048959249486953, + "grad_norm": 0.45499029353645193, + "learning_rate": 0.0004996825617466078, + "loss": 3.3510751724243164, + "step": 2226, + "token_acc": 0.26293417441912076 + }, + { + "epoch": 1.3054822632659044, + "grad_norm": 0.49747402890776315, + "learning_rate": 0.0004996813399159776, + "loss": 3.3093278408050537, + "step": 2227, + "token_acc": 0.2676202149372541 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.4645288054654548, + "learning_rate": 0.0004996801157399277, + "loss": 3.2721948623657227, + "step": 2228, + "token_acc": 0.27333101538600074 + }, + { + "epoch": 1.3066549399003224, + "grad_norm": 0.4128075605110155, + "learning_rate": 0.0004996788892184694, + "loss": 3.2565155029296875, + "step": 2229, + "token_acc": 0.2749232853374932 + }, + { + "epoch": 1.3072412782175316, + "grad_norm": 0.38658078362225445, + "learning_rate": 0.0004996776603516146, + "loss": 3.2716047763824463, + "step": 2230, + "token_acc": 0.27161358250796197 + }, + { + "epoch": 1.3078276165347407, + "grad_norm": 0.39847331921011103, + "learning_rate": 0.0004996764291393744, + "loss": 3.283726453781128, + "step": 2231, + "token_acc": 0.271840397646942 + }, + { + "epoch": 1.3084139548519496, + "grad_norm": 0.4277870017027911, + "learning_rate": 0.0004996751955817607, + "loss": 3.3273580074310303, + "step": 2232, + "token_acc": 0.26410638319578056 + }, + { + "epoch": 1.3090002931691587, + "grad_norm": 0.4483263796533055, + "learning_rate": 0.0004996739596787851, + "loss": 3.295280694961548, + "step": 2233, + "token_acc": 0.268735041314783 + }, + { + "epoch": 1.3095866314863676, + "grad_norm": 0.4308812073059536, + "learning_rate": 0.0004996727214304588, + "loss": 3.275662422180176, + "step": 2234, + "token_acc": 0.2710301776487783 + }, + { + "epoch": 1.3101729698035767, + "grad_norm": 0.5177150331109224, + "learning_rate": 0.0004996714808367939, + "loss": 3.3067426681518555, + "step": 2235, + "token_acc": 0.2681522664155393 + }, + { + "epoch": 1.3107593081207858, + "grad_norm": 0.6754298317599252, + "learning_rate": 0.000499670237897802, + "loss": 3.338925361633301, + "step": 2236, + "token_acc": 0.26438869278413474 + }, + { + "epoch": 1.3113456464379947, + "grad_norm": 0.6639512376373334, + "learning_rate": 0.0004996689926134944, + "loss": 3.334921360015869, + "step": 2237, + "token_acc": 0.26212603537283896 + }, + { + "epoch": 1.3119319847552038, + "grad_norm": 0.5436504678072306, + "learning_rate": 0.0004996677449838833, + "loss": 3.310713052749634, + "step": 2238, + "token_acc": 0.26560021293585306 + }, + { + "epoch": 1.3125183230724127, + "grad_norm": 0.44233419944564484, + "learning_rate": 0.0004996664950089799, + "loss": 3.325927495956421, + "step": 2239, + "token_acc": 0.26413510715070126 + }, + { + "epoch": 1.3131046613896218, + "grad_norm": 0.4728014412617763, + "learning_rate": 0.0004996652426887964, + "loss": 3.269191265106201, + "step": 2240, + "token_acc": 0.2717898837587282 + }, + { + "epoch": 1.313690999706831, + "grad_norm": 0.4594942925677933, + "learning_rate": 0.0004996639880233443, + "loss": 3.245236396789551, + "step": 2241, + "token_acc": 0.27534943081346774 + }, + { + "epoch": 1.31427733802404, + "grad_norm": 0.4047002324070713, + "learning_rate": 0.0004996627310126354, + "loss": 3.3222436904907227, + "step": 2242, + "token_acc": 0.26485410842931845 + }, + { + "epoch": 1.314863676341249, + "grad_norm": 0.43657293632927474, + "learning_rate": 0.0004996614716566817, + "loss": 3.2813055515289307, + "step": 2243, + "token_acc": 0.27142814740460713 + }, + { + "epoch": 1.315450014658458, + "grad_norm": 0.4510626863038871, + "learning_rate": 0.0004996602099554948, + "loss": 3.2938055992126465, + "step": 2244, + "token_acc": 0.26900252424179355 + }, + { + "epoch": 1.316036352975667, + "grad_norm": 0.5363536105881868, + "learning_rate": 0.0004996589459090867, + "loss": 3.26090669631958, + "step": 2245, + "token_acc": 0.2737723614673299 + }, + { + "epoch": 1.316622691292876, + "grad_norm": 0.5140672080798115, + "learning_rate": 0.0004996576795174692, + "loss": 3.293609142303467, + "step": 2246, + "token_acc": 0.26938525775755917 + }, + { + "epoch": 1.3172090296100851, + "grad_norm": 0.5974246786366461, + "learning_rate": 0.0004996564107806542, + "loss": 3.295177459716797, + "step": 2247, + "token_acc": 0.26836824134277004 + }, + { + "epoch": 1.317795367927294, + "grad_norm": 0.5758398581578759, + "learning_rate": 0.0004996551396986537, + "loss": 3.301840305328369, + "step": 2248, + "token_acc": 0.2677889599226528 + }, + { + "epoch": 1.3183817062445031, + "grad_norm": 0.4620066722988521, + "learning_rate": 0.0004996538662714795, + "loss": 3.261955499649048, + "step": 2249, + "token_acc": 0.2727928594311194 + }, + { + "epoch": 1.318968044561712, + "grad_norm": 0.5337207741945489, + "learning_rate": 0.0004996525904991437, + "loss": 3.300217628479004, + "step": 2250, + "token_acc": 0.26733015965676765 + }, + { + "epoch": 1.3195543828789211, + "grad_norm": 0.5346345116942756, + "learning_rate": 0.0004996513123816581, + "loss": 3.229556083679199, + "step": 2251, + "token_acc": 0.2780900241338538 + }, + { + "epoch": 1.3201407211961302, + "grad_norm": 0.47281419659908236, + "learning_rate": 0.000499650031919035, + "loss": 3.346834421157837, + "step": 2252, + "token_acc": 0.2626302328413734 + }, + { + "epoch": 1.3207270595133391, + "grad_norm": 0.41286466468372207, + "learning_rate": 0.0004996487491112862, + "loss": 3.3068716526031494, + "step": 2253, + "token_acc": 0.2681019883830737 + }, + { + "epoch": 1.3213133978305482, + "grad_norm": 0.3552851497697773, + "learning_rate": 0.0004996474639584239, + "loss": 3.301759719848633, + "step": 2254, + "token_acc": 0.2693165916568965 + }, + { + "epoch": 1.3218997361477571, + "grad_norm": 0.40354750288115926, + "learning_rate": 0.0004996461764604598, + "loss": 3.307332992553711, + "step": 2255, + "token_acc": 0.26676426377360957 + }, + { + "epoch": 1.3224860744649662, + "grad_norm": 0.4415890002563942, + "learning_rate": 0.0004996448866174065, + "loss": 3.3165102005004883, + "step": 2256, + "token_acc": 0.2678464511927396 + }, + { + "epoch": 1.3230724127821754, + "grad_norm": 0.4280525055945892, + "learning_rate": 0.0004996435944292759, + "loss": 3.23978328704834, + "step": 2257, + "token_acc": 0.27339656430324333 + }, + { + "epoch": 1.3236587510993845, + "grad_norm": 0.475638259301756, + "learning_rate": 0.00049964229989608, + "loss": 3.2623562812805176, + "step": 2258, + "token_acc": 0.2739527337308803 + }, + { + "epoch": 1.3242450894165934, + "grad_norm": 0.4595647603294476, + "learning_rate": 0.0004996410030178312, + "loss": 3.3211758136749268, + "step": 2259, + "token_acc": 0.266445414396033 + }, + { + "epoch": 1.3248314277338025, + "grad_norm": 0.5053541195600768, + "learning_rate": 0.0004996397037945415, + "loss": 3.2797911167144775, + "step": 2260, + "token_acc": 0.2707438290639041 + }, + { + "epoch": 1.3254177660510114, + "grad_norm": 0.48673634177852815, + "learning_rate": 0.0004996384022262233, + "loss": 3.323958396911621, + "step": 2261, + "token_acc": 0.2645838231370508 + }, + { + "epoch": 1.3260041043682205, + "grad_norm": 0.41554370340870184, + "learning_rate": 0.0004996370983128885, + "loss": 3.2735414505004883, + "step": 2262, + "token_acc": 0.2699962467673356 + }, + { + "epoch": 1.3265904426854296, + "grad_norm": 0.5160196316368776, + "learning_rate": 0.0004996357920545497, + "loss": 3.2709333896636963, + "step": 2263, + "token_acc": 0.2716877261049651 + }, + { + "epoch": 1.3271767810026385, + "grad_norm": 0.4527932131855336, + "learning_rate": 0.0004996344834512189, + "loss": 3.258944034576416, + "step": 2264, + "token_acc": 0.27525545611971075 + }, + { + "epoch": 1.3277631193198476, + "grad_norm": 0.4675045985524346, + "learning_rate": 0.0004996331725029086, + "loss": 3.305373191833496, + "step": 2265, + "token_acc": 0.26737845288553125 + }, + { + "epoch": 1.3283494576370565, + "grad_norm": 0.4629519694568142, + "learning_rate": 0.0004996318592096311, + "loss": 3.296513319015503, + "step": 2266, + "token_acc": 0.2673530185100566 + }, + { + "epoch": 1.3289357959542656, + "grad_norm": 0.5105431300050036, + "learning_rate": 0.0004996305435713985, + "loss": 3.27852725982666, + "step": 2267, + "token_acc": 0.2728006061605707 + }, + { + "epoch": 1.3295221342714747, + "grad_norm": 0.47340393460504554, + "learning_rate": 0.0004996292255882236, + "loss": 3.2675280570983887, + "step": 2268, + "token_acc": 0.2701743534232806 + }, + { + "epoch": 1.3301084725886836, + "grad_norm": 0.5127267217317832, + "learning_rate": 0.0004996279052601183, + "loss": 3.330883502960205, + "step": 2269, + "token_acc": 0.2640624258310874 + }, + { + "epoch": 1.3306948109058927, + "grad_norm": 0.43257039177282136, + "learning_rate": 0.0004996265825870952, + "loss": 3.2582836151123047, + "step": 2270, + "token_acc": 0.2736420648906396 + }, + { + "epoch": 1.3312811492231018, + "grad_norm": 0.44015672619747376, + "learning_rate": 0.0004996252575691668, + "loss": 3.279317617416382, + "step": 2271, + "token_acc": 0.2696132804421801 + }, + { + "epoch": 1.3318674875403107, + "grad_norm": 0.42535056449967384, + "learning_rate": 0.0004996239302063454, + "loss": 3.2693228721618652, + "step": 2272, + "token_acc": 0.2711299573822672 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.3789332170904118, + "learning_rate": 0.0004996226004986436, + "loss": 3.3017120361328125, + "step": 2273, + "token_acc": 0.2673867070668089 + }, + { + "epoch": 1.333040164174729, + "grad_norm": 0.38054651342068413, + "learning_rate": 0.0004996212684460738, + "loss": 3.308743953704834, + "step": 2274, + "token_acc": 0.2680272828213398 + }, + { + "epoch": 1.3336265024919378, + "grad_norm": 0.47217391077928006, + "learning_rate": 0.0004996199340486486, + "loss": 3.3293802738189697, + "step": 2275, + "token_acc": 0.26308100618925445 + }, + { + "epoch": 1.334212840809147, + "grad_norm": 0.45629975872730644, + "learning_rate": 0.0004996185973063805, + "loss": 3.3250808715820312, + "step": 2276, + "token_acc": 0.26548479225069427 + }, + { + "epoch": 1.3347991791263558, + "grad_norm": 0.4577758222155869, + "learning_rate": 0.000499617258219282, + "loss": 3.2994184494018555, + "step": 2277, + "token_acc": 0.2672851523301088 + }, + { + "epoch": 1.335385517443565, + "grad_norm": 0.4226715538233513, + "learning_rate": 0.0004996159167873658, + "loss": 3.2190985679626465, + "step": 2278, + "token_acc": 0.2786667582983987 + }, + { + "epoch": 1.335971855760774, + "grad_norm": 0.44332977385741973, + "learning_rate": 0.0004996145730106443, + "loss": 3.303474187850952, + "step": 2279, + "token_acc": 0.2657028668566365 + }, + { + "epoch": 1.336558194077983, + "grad_norm": 0.4584662784562282, + "learning_rate": 0.0004996132268891303, + "loss": 3.287426710128784, + "step": 2280, + "token_acc": 0.2704973815030438 + }, + { + "epoch": 1.337144532395192, + "grad_norm": 0.42453976240035424, + "learning_rate": 0.0004996118784228364, + "loss": 3.30275821685791, + "step": 2281, + "token_acc": 0.26814612387662456 + }, + { + "epoch": 1.337730870712401, + "grad_norm": 0.4110682757956792, + "learning_rate": 0.0004996105276117753, + "loss": 3.2236876487731934, + "step": 2282, + "token_acc": 0.2783025933040254 + }, + { + "epoch": 1.33831720902961, + "grad_norm": 0.586991189478905, + "learning_rate": 0.0004996091744559596, + "loss": 3.2606353759765625, + "step": 2283, + "token_acc": 0.2726600103637461 + }, + { + "epoch": 1.3389035473468192, + "grad_norm": 0.6280868758459562, + "learning_rate": 0.000499607818955402, + "loss": 3.308624744415283, + "step": 2284, + "token_acc": 0.2663674410458432 + }, + { + "epoch": 1.3394898856640283, + "grad_norm": 0.5655773034573501, + "learning_rate": 0.0004996064611101154, + "loss": 3.3003199100494385, + "step": 2285, + "token_acc": 0.26738473743630425 + }, + { + "epoch": 1.3400762239812372, + "grad_norm": 0.5286060506208545, + "learning_rate": 0.0004996051009201124, + "loss": 3.3941969871520996, + "step": 2286, + "token_acc": 0.25605105916673454 + }, + { + "epoch": 1.3406625622984463, + "grad_norm": 0.4661800299005168, + "learning_rate": 0.0004996037383854058, + "loss": 3.249300241470337, + "step": 2287, + "token_acc": 0.2745747967330554 + }, + { + "epoch": 1.3412489006156552, + "grad_norm": 0.5251298017337019, + "learning_rate": 0.0004996023735060085, + "loss": 3.324374198913574, + "step": 2288, + "token_acc": 0.26312886577630695 + }, + { + "epoch": 1.3418352389328643, + "grad_norm": 0.550042515084715, + "learning_rate": 0.0004996010062819332, + "loss": 3.313662052154541, + "step": 2289, + "token_acc": 0.2677766580803709 + }, + { + "epoch": 1.3424215772500734, + "grad_norm": 0.5165924590640047, + "learning_rate": 0.0004995996367131927, + "loss": 3.2825393676757812, + "step": 2290, + "token_acc": 0.26989245332148015 + }, + { + "epoch": 1.3430079155672823, + "grad_norm": 0.44091511025345437, + "learning_rate": 0.0004995982647998, + "loss": 3.2825889587402344, + "step": 2291, + "token_acc": 0.2711889338944139 + }, + { + "epoch": 1.3435942538844914, + "grad_norm": 0.4724264065802057, + "learning_rate": 0.0004995968905417681, + "loss": 3.2880802154541016, + "step": 2292, + "token_acc": 0.26786588225697916 + }, + { + "epoch": 1.3441805922017003, + "grad_norm": 0.5183179885282589, + "learning_rate": 0.0004995955139391095, + "loss": 3.313777446746826, + "step": 2293, + "token_acc": 0.2675678611329499 + }, + { + "epoch": 1.3447669305189094, + "grad_norm": 0.4970497442722069, + "learning_rate": 0.0004995941349918375, + "loss": 3.3378379344940186, + "step": 2294, + "token_acc": 0.2633278464962806 + }, + { + "epoch": 1.3453532688361185, + "grad_norm": 0.4361746270254895, + "learning_rate": 0.0004995927536999649, + "loss": 3.228069543838501, + "step": 2295, + "token_acc": 0.27746875521643843 + }, + { + "epoch": 1.3459396071533274, + "grad_norm": 0.4867497115962423, + "learning_rate": 0.0004995913700635049, + "loss": 3.2935121059417725, + "step": 2296, + "token_acc": 0.26780899203780323 + }, + { + "epoch": 1.3465259454705365, + "grad_norm": 0.38188670290680665, + "learning_rate": 0.0004995899840824701, + "loss": 3.2966370582580566, + "step": 2297, + "token_acc": 0.2684492923783913 + }, + { + "epoch": 1.3471122837877456, + "grad_norm": 0.3774444653300807, + "learning_rate": 0.0004995885957568738, + "loss": 3.2749247550964355, + "step": 2298, + "token_acc": 0.27121060549967313 + }, + { + "epoch": 1.3476986221049545, + "grad_norm": 0.38471692605844315, + "learning_rate": 0.0004995872050867289, + "loss": 3.338170051574707, + "step": 2299, + "token_acc": 0.2624641109445819 + }, + { + "epoch": 1.3482849604221636, + "grad_norm": 0.46373422006992837, + "learning_rate": 0.0004995858120720486, + "loss": 3.276038646697998, + "step": 2300, + "token_acc": 0.27177721396641297 + }, + { + "epoch": 1.3488712987393727, + "grad_norm": 0.3931626896196003, + "learning_rate": 0.0004995844167128458, + "loss": 3.2484288215637207, + "step": 2301, + "token_acc": 0.27562802021468885 + }, + { + "epoch": 1.3494576370565816, + "grad_norm": 0.3881206083640519, + "learning_rate": 0.0004995830190091338, + "loss": 3.2473719120025635, + "step": 2302, + "token_acc": 0.273347781013177 + }, + { + "epoch": 1.3500439753737907, + "grad_norm": 0.395370304110828, + "learning_rate": 0.0004995816189609258, + "loss": 3.2789053916931152, + "step": 2303, + "token_acc": 0.2699337430857536 + }, + { + "epoch": 1.3506303136909996, + "grad_norm": 0.4289588746290127, + "learning_rate": 0.0004995802165682346, + "loss": 3.2638158798217773, + "step": 2304, + "token_acc": 0.2714800278416625 + }, + { + "epoch": 1.3512166520082087, + "grad_norm": 0.4488728805776394, + "learning_rate": 0.0004995788118310737, + "loss": 3.324845790863037, + "step": 2305, + "token_acc": 0.2650002583578773 + }, + { + "epoch": 1.3518029903254178, + "grad_norm": 0.43634082580398587, + "learning_rate": 0.0004995774047494561, + "loss": 3.2578701972961426, + "step": 2306, + "token_acc": 0.2740929941618015 + }, + { + "epoch": 1.3523893286426267, + "grad_norm": 0.4289543631479029, + "learning_rate": 0.0004995759953233951, + "loss": 3.2873027324676514, + "step": 2307, + "token_acc": 0.2705378412590828 + }, + { + "epoch": 1.3529756669598358, + "grad_norm": 0.43759863031655477, + "learning_rate": 0.0004995745835529039, + "loss": 3.3710827827453613, + "step": 2308, + "token_acc": 0.25938648875741843 + }, + { + "epoch": 1.3535620052770447, + "grad_norm": 0.42301958076543245, + "learning_rate": 0.0004995731694379959, + "loss": 3.342059850692749, + "step": 2309, + "token_acc": 0.2628744530461124 + }, + { + "epoch": 1.3541483435942538, + "grad_norm": 0.4192929394802892, + "learning_rate": 0.0004995717529786843, + "loss": 3.268679618835449, + "step": 2310, + "token_acc": 0.27265579027703185 + }, + { + "epoch": 1.354734681911463, + "grad_norm": 0.4133365334774994, + "learning_rate": 0.0004995703341749824, + "loss": 3.279419422149658, + "step": 2311, + "token_acc": 0.2715547606015928 + }, + { + "epoch": 1.355321020228672, + "grad_norm": 0.39031094425635504, + "learning_rate": 0.0004995689130269034, + "loss": 3.2595338821411133, + "step": 2312, + "token_acc": 0.2730151104341982 + }, + { + "epoch": 1.355907358545881, + "grad_norm": 0.388538711182958, + "learning_rate": 0.0004995674895344607, + "loss": 3.2669413089752197, + "step": 2313, + "token_acc": 0.2695651244105602 + }, + { + "epoch": 1.35649369686309, + "grad_norm": 0.43827821792690513, + "learning_rate": 0.0004995660636976678, + "loss": 3.3141121864318848, + "step": 2314, + "token_acc": 0.2647407037946106 + }, + { + "epoch": 1.357080035180299, + "grad_norm": 0.403572987995223, + "learning_rate": 0.000499564635516538, + "loss": 3.2570347785949707, + "step": 2315, + "token_acc": 0.2736133926288906 + }, + { + "epoch": 1.357666373497508, + "grad_norm": 0.38716576904109085, + "learning_rate": 0.0004995632049910848, + "loss": 3.2765936851501465, + "step": 2316, + "token_acc": 0.26916978230318456 + }, + { + "epoch": 1.3582527118147172, + "grad_norm": 0.35697105799472595, + "learning_rate": 0.0004995617721213216, + "loss": 3.264394521713257, + "step": 2317, + "token_acc": 0.2710391629471061 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.3989666368614984, + "learning_rate": 0.0004995603369072618, + "loss": 3.278810977935791, + "step": 2318, + "token_acc": 0.2686326379286158 + }, + { + "epoch": 1.3594253884491352, + "grad_norm": 0.449243980301992, + "learning_rate": 0.0004995588993489189, + "loss": 3.260861396789551, + "step": 2319, + "token_acc": 0.27392802695466134 + }, + { + "epoch": 1.360011726766344, + "grad_norm": 0.4864857035347878, + "learning_rate": 0.0004995574594463064, + "loss": 3.23079514503479, + "step": 2320, + "token_acc": 0.27777124997062486 + }, + { + "epoch": 1.3605980650835532, + "grad_norm": 0.4936721820918022, + "learning_rate": 0.0004995560171994379, + "loss": 3.3202638626098633, + "step": 2321, + "token_acc": 0.26419777882493506 + }, + { + "epoch": 1.3611844034007623, + "grad_norm": 0.4254987751710783, + "learning_rate": 0.0004995545726083269, + "loss": 3.2858686447143555, + "step": 2322, + "token_acc": 0.2692841411428301 + }, + { + "epoch": 1.3617707417179712, + "grad_norm": 0.44402627792273136, + "learning_rate": 0.000499553125672987, + "loss": 3.297593593597412, + "step": 2323, + "token_acc": 0.2687846671060291 + }, + { + "epoch": 1.3623570800351803, + "grad_norm": 0.4434392199715045, + "learning_rate": 0.0004995516763934317, + "loss": 3.288408041000366, + "step": 2324, + "token_acc": 0.26962536631729017 + }, + { + "epoch": 1.3629434183523892, + "grad_norm": 0.4126780463451966, + "learning_rate": 0.0004995502247696747, + "loss": 3.310178756713867, + "step": 2325, + "token_acc": 0.2682862812208002 + }, + { + "epoch": 1.3635297566695983, + "grad_norm": 0.41784952101832085, + "learning_rate": 0.0004995487708017297, + "loss": 3.311824321746826, + "step": 2326, + "token_acc": 0.26665920548600264 + }, + { + "epoch": 1.3641160949868074, + "grad_norm": 0.4295150243882106, + "learning_rate": 0.0004995473144896101, + "loss": 3.2648866176605225, + "step": 2327, + "token_acc": 0.2720135374510627 + }, + { + "epoch": 1.3647024333040165, + "grad_norm": 0.41243937113242696, + "learning_rate": 0.0004995458558333299, + "loss": 3.3204498291015625, + "step": 2328, + "token_acc": 0.26296529835290516 + }, + { + "epoch": 1.3652887716212254, + "grad_norm": 0.39407214413357855, + "learning_rate": 0.0004995443948329026, + "loss": 3.254815101623535, + "step": 2329, + "token_acc": 0.2735143806105975 + }, + { + "epoch": 1.3658751099384345, + "grad_norm": 0.38047983844665134, + "learning_rate": 0.0004995429314883419, + "loss": 3.3011932373046875, + "step": 2330, + "token_acc": 0.2685477973230064 + }, + { + "epoch": 1.3664614482556434, + "grad_norm": 0.40643402206745066, + "learning_rate": 0.0004995414657996617, + "loss": 3.2614247798919678, + "step": 2331, + "token_acc": 0.27164901088919635 + }, + { + "epoch": 1.3670477865728525, + "grad_norm": 0.4471689434073979, + "learning_rate": 0.0004995399977668756, + "loss": 3.3060593605041504, + "step": 2332, + "token_acc": 0.26743122032307376 + }, + { + "epoch": 1.3676341248900616, + "grad_norm": 0.5243078654852139, + "learning_rate": 0.0004995385273899976, + "loss": 3.308076858520508, + "step": 2333, + "token_acc": 0.2677434292834062 + }, + { + "epoch": 1.3682204632072705, + "grad_norm": 0.5245984129948157, + "learning_rate": 0.0004995370546690414, + "loss": 3.2810144424438477, + "step": 2334, + "token_acc": 0.26962385997337646 + }, + { + "epoch": 1.3688068015244796, + "grad_norm": 0.5182262354992822, + "learning_rate": 0.0004995355796040208, + "loss": 3.255418300628662, + "step": 2335, + "token_acc": 0.2734937784445481 + }, + { + "epoch": 1.3693931398416885, + "grad_norm": 0.5616814596444533, + "learning_rate": 0.0004995341021949496, + "loss": 3.2526869773864746, + "step": 2336, + "token_acc": 0.27381642302755227 + }, + { + "epoch": 1.3699794781588976, + "grad_norm": 0.4682025185648729, + "learning_rate": 0.0004995326224418418, + "loss": 3.2829577922821045, + "step": 2337, + "token_acc": 0.27092491207156566 + }, + { + "epoch": 1.3705658164761068, + "grad_norm": 0.48052958982843025, + "learning_rate": 0.0004995311403447112, + "loss": 3.2882399559020996, + "step": 2338, + "token_acc": 0.2693247454068776 + }, + { + "epoch": 1.3711521547933159, + "grad_norm": 0.41281302953097315, + "learning_rate": 0.0004995296559035719, + "loss": 3.2644882202148438, + "step": 2339, + "token_acc": 0.27280497978566387 + }, + { + "epoch": 1.3717384931105248, + "grad_norm": 0.4153637311465592, + "learning_rate": 0.0004995281691184377, + "loss": 3.276925802230835, + "step": 2340, + "token_acc": 0.27157355089729424 + }, + { + "epoch": 1.3723248314277339, + "grad_norm": 0.4207738598214789, + "learning_rate": 0.0004995266799893227, + "loss": 3.311645746231079, + "step": 2341, + "token_acc": 0.26852677696318655 + }, + { + "epoch": 1.3729111697449428, + "grad_norm": 0.45763595935927626, + "learning_rate": 0.0004995251885162406, + "loss": 3.304471492767334, + "step": 2342, + "token_acc": 0.26985741787050227 + }, + { + "epoch": 1.3734975080621519, + "grad_norm": 0.4457647483233761, + "learning_rate": 0.0004995236946992057, + "loss": 3.270564079284668, + "step": 2343, + "token_acc": 0.27217660851032394 + }, + { + "epoch": 1.374083846379361, + "grad_norm": 0.503161367700388, + "learning_rate": 0.0004995221985382318, + "loss": 3.269784450531006, + "step": 2344, + "token_acc": 0.2721498195917928 + }, + { + "epoch": 1.3746701846965699, + "grad_norm": 0.5223022170174856, + "learning_rate": 0.0004995207000333332, + "loss": 3.3418283462524414, + "step": 2345, + "token_acc": 0.26224315851173097 + }, + { + "epoch": 1.375256523013779, + "grad_norm": 0.5800666288440978, + "learning_rate": 0.0004995191991845238, + "loss": 3.2474100589752197, + "step": 2346, + "token_acc": 0.27470501278144527 + }, + { + "epoch": 1.3758428613309879, + "grad_norm": 0.5152070546247417, + "learning_rate": 0.0004995176959918179, + "loss": 3.2900550365448, + "step": 2347, + "token_acc": 0.2713010399244459 + }, + { + "epoch": 1.376429199648197, + "grad_norm": 0.42299369180228974, + "learning_rate": 0.0004995161904552293, + "loss": 3.281383991241455, + "step": 2348, + "token_acc": 0.269463508425241 + }, + { + "epoch": 1.377015537965406, + "grad_norm": 0.45482496073797835, + "learning_rate": 0.0004995146825747724, + "loss": 3.294569969177246, + "step": 2349, + "token_acc": 0.2676048570655062 + }, + { + "epoch": 1.377601876282615, + "grad_norm": 0.4782786042192761, + "learning_rate": 0.0004995131723504612, + "loss": 3.2235167026519775, + "step": 2350, + "token_acc": 0.27888023149315744 + }, + { + "epoch": 1.378188214599824, + "grad_norm": 0.43237970628425565, + "learning_rate": 0.0004995116597823101, + "loss": 3.3272016048431396, + "step": 2351, + "token_acc": 0.2652090954964047 + }, + { + "epoch": 1.378774552917033, + "grad_norm": 0.3637994416873148, + "learning_rate": 0.0004995101448703331, + "loss": 3.293396234512329, + "step": 2352, + "token_acc": 0.26739950719180505 + }, + { + "epoch": 1.379360891234242, + "grad_norm": 0.36937918942289955, + "learning_rate": 0.0004995086276145444, + "loss": 3.31223726272583, + "step": 2353, + "token_acc": 0.267059812359802 + }, + { + "epoch": 1.3799472295514512, + "grad_norm": 0.41105314800442505, + "learning_rate": 0.0004995071080149585, + "loss": 3.3013970851898193, + "step": 2354, + "token_acc": 0.2671748681778832 + }, + { + "epoch": 1.3805335678686603, + "grad_norm": 0.3686336889621483, + "learning_rate": 0.0004995055860715895, + "loss": 3.28922700881958, + "step": 2355, + "token_acc": 0.27046273575167445 + }, + { + "epoch": 1.3811199061858692, + "grad_norm": 0.37225358441161743, + "learning_rate": 0.0004995040617844517, + "loss": 3.2997946739196777, + "step": 2356, + "token_acc": 0.26778001504338544 + }, + { + "epoch": 1.3817062445030783, + "grad_norm": 0.36752493475498094, + "learning_rate": 0.0004995025351535596, + "loss": 3.278599262237549, + "step": 2357, + "token_acc": 0.2707419137712428 + }, + { + "epoch": 1.3822925828202872, + "grad_norm": 0.37544761967342893, + "learning_rate": 0.0004995010061789272, + "loss": 3.2309446334838867, + "step": 2358, + "token_acc": 0.2764470685880567 + }, + { + "epoch": 1.3828789211374963, + "grad_norm": 0.4248999973595112, + "learning_rate": 0.0004994994748605691, + "loss": 3.2970943450927734, + "step": 2359, + "token_acc": 0.26786099322110557 + }, + { + "epoch": 1.3834652594547054, + "grad_norm": 0.38624077122980804, + "learning_rate": 0.0004994979411984997, + "loss": 3.2732152938842773, + "step": 2360, + "token_acc": 0.2685163721046728 + }, + { + "epoch": 1.3840515977719143, + "grad_norm": 0.41785692725258833, + "learning_rate": 0.0004994964051927333, + "loss": 3.324655771255493, + "step": 2361, + "token_acc": 0.26495019561477845 + }, + { + "epoch": 1.3846379360891234, + "grad_norm": 0.39420040668470063, + "learning_rate": 0.0004994948668432844, + "loss": 3.300771713256836, + "step": 2362, + "token_acc": 0.2674537960657567 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.4647716754742074, + "learning_rate": 0.0004994933261501674, + "loss": 3.2652158737182617, + "step": 2363, + "token_acc": 0.2705149971703452 + }, + { + "epoch": 1.3858106127235414, + "grad_norm": 0.4829481963299405, + "learning_rate": 0.0004994917831133968, + "loss": 3.2998428344726562, + "step": 2364, + "token_acc": 0.26695146904921824 + }, + { + "epoch": 1.3863969510407506, + "grad_norm": 0.49291837622427154, + "learning_rate": 0.0004994902377329872, + "loss": 3.2784509658813477, + "step": 2365, + "token_acc": 0.26893013020738277 + }, + { + "epoch": 1.3869832893579597, + "grad_norm": 0.48098713613840594, + "learning_rate": 0.0004994886900089528, + "loss": 3.314309597015381, + "step": 2366, + "token_acc": 0.2659981682831177 + }, + { + "epoch": 1.3875696276751686, + "grad_norm": 0.4231703034287215, + "learning_rate": 0.0004994871399413085, + "loss": 3.2711880207061768, + "step": 2367, + "token_acc": 0.2706885543471952 + }, + { + "epoch": 1.3881559659923777, + "grad_norm": 0.4364650107952657, + "learning_rate": 0.0004994855875300687, + "loss": 3.315361499786377, + "step": 2368, + "token_acc": 0.2669514764887567 + }, + { + "epoch": 1.3887423043095866, + "grad_norm": 0.4183616060764355, + "learning_rate": 0.0004994840327752479, + "loss": 3.3013198375701904, + "step": 2369, + "token_acc": 0.2665252477696106 + }, + { + "epoch": 1.3893286426267957, + "grad_norm": 0.40496518381769436, + "learning_rate": 0.0004994824756768608, + "loss": 3.289513349533081, + "step": 2370, + "token_acc": 0.26910134548039705 + }, + { + "epoch": 1.3899149809440048, + "grad_norm": 0.45478998416071864, + "learning_rate": 0.0004994809162349222, + "loss": 3.2043423652648926, + "step": 2371, + "token_acc": 0.28000396109561765 + }, + { + "epoch": 1.3905013192612137, + "grad_norm": 0.4449421931056942, + "learning_rate": 0.0004994793544494464, + "loss": 3.2940096855163574, + "step": 2372, + "token_acc": 0.2672462263742918 + }, + { + "epoch": 1.3910876575784228, + "grad_norm": 0.4158157231666166, + "learning_rate": 0.0004994777903204483, + "loss": 3.23759126663208, + "step": 2373, + "token_acc": 0.27440058125454103 + }, + { + "epoch": 1.3916739958956317, + "grad_norm": 0.361738811253168, + "learning_rate": 0.0004994762238479426, + "loss": 3.2651798725128174, + "step": 2374, + "token_acc": 0.26993871260936014 + }, + { + "epoch": 1.3922603342128408, + "grad_norm": 0.4403909832363792, + "learning_rate": 0.0004994746550319438, + "loss": 3.2999963760375977, + "step": 2375, + "token_acc": 0.26933310283165907 + }, + { + "epoch": 1.39284667253005, + "grad_norm": 0.4768469028439963, + "learning_rate": 0.000499473083872467, + "loss": 3.269237518310547, + "step": 2376, + "token_acc": 0.2718961361228744 + }, + { + "epoch": 1.3934330108472588, + "grad_norm": 0.43807070948026056, + "learning_rate": 0.0004994715103695265, + "loss": 3.3203930854797363, + "step": 2377, + "token_acc": 0.2640108122023426 + }, + { + "epoch": 1.394019349164468, + "grad_norm": 0.38994892486738564, + "learning_rate": 0.0004994699345231375, + "loss": 3.292973756790161, + "step": 2378, + "token_acc": 0.26975914223153824 + }, + { + "epoch": 1.3946056874816768, + "grad_norm": 0.4880771768020366, + "learning_rate": 0.0004994683563333145, + "loss": 3.267620801925659, + "step": 2379, + "token_acc": 0.27325219680492785 + }, + { + "epoch": 1.395192025798886, + "grad_norm": 0.5543662540661931, + "learning_rate": 0.0004994667758000726, + "loss": 3.333808422088623, + "step": 2380, + "token_acc": 0.2645320626895587 + }, + { + "epoch": 1.395778364116095, + "grad_norm": 0.5592990187397917, + "learning_rate": 0.0004994651929234264, + "loss": 3.345183849334717, + "step": 2381, + "token_acc": 0.26065891219643494 + }, + { + "epoch": 1.3963647024333041, + "grad_norm": 0.4286386821841761, + "learning_rate": 0.0004994636077033909, + "loss": 3.270714521408081, + "step": 2382, + "token_acc": 0.27153173437668904 + }, + { + "epoch": 1.396951040750513, + "grad_norm": 0.42763080389096125, + "learning_rate": 0.0004994620201399809, + "loss": 3.2965826988220215, + "step": 2383, + "token_acc": 0.2668332589003364 + }, + { + "epoch": 1.3975373790677221, + "grad_norm": 0.4663006871980911, + "learning_rate": 0.0004994604302332114, + "loss": 3.318204402923584, + "step": 2384, + "token_acc": 0.2645549366981836 + }, + { + "epoch": 1.398123717384931, + "grad_norm": 0.44304979692414576, + "learning_rate": 0.0004994588379830975, + "loss": 3.2938432693481445, + "step": 2385, + "token_acc": 0.26944825765575503 + }, + { + "epoch": 1.3987100557021401, + "grad_norm": 0.4313757948865563, + "learning_rate": 0.0004994572433896537, + "loss": 3.3045687675476074, + "step": 2386, + "token_acc": 0.2670202272523475 + }, + { + "epoch": 1.3992963940193492, + "grad_norm": 0.45052168335132975, + "learning_rate": 0.0004994556464528953, + "loss": 3.3109493255615234, + "step": 2387, + "token_acc": 0.2658245599975959 + }, + { + "epoch": 1.3998827323365581, + "grad_norm": 0.4278445495586633, + "learning_rate": 0.0004994540471728373, + "loss": 3.2941904067993164, + "step": 2388, + "token_acc": 0.27051228599425864 + }, + { + "epoch": 1.4004690706537672, + "grad_norm": 0.36429457213458977, + "learning_rate": 0.0004994524455494947, + "loss": 3.2693257331848145, + "step": 2389, + "token_acc": 0.27019142830111326 + }, + { + "epoch": 1.4010554089709761, + "grad_norm": 0.4745060657255523, + "learning_rate": 0.0004994508415828826, + "loss": 3.272376775741577, + "step": 2390, + "token_acc": 0.2712222062684608 + }, + { + "epoch": 1.4016417472881852, + "grad_norm": 0.4275148747201497, + "learning_rate": 0.0004994492352730158, + "loss": 3.275245189666748, + "step": 2391, + "token_acc": 0.2693135361804894 + }, + { + "epoch": 1.4022280856053944, + "grad_norm": 0.4060235336079872, + "learning_rate": 0.0004994476266199097, + "loss": 3.303415298461914, + "step": 2392, + "token_acc": 0.26930292285172247 + }, + { + "epoch": 1.4028144239226035, + "grad_norm": 0.4483337946757233, + "learning_rate": 0.0004994460156235792, + "loss": 3.2669177055358887, + "step": 2393, + "token_acc": 0.270843876271965 + }, + { + "epoch": 1.4034007622398124, + "grad_norm": 0.3906221992316279, + "learning_rate": 0.0004994444022840396, + "loss": 3.283518075942993, + "step": 2394, + "token_acc": 0.27247602816011274 + }, + { + "epoch": 1.4039871005570215, + "grad_norm": 0.43270813155875, + "learning_rate": 0.0004994427866013058, + "loss": 3.249089002609253, + "step": 2395, + "token_acc": 0.2728800588226467 + }, + { + "epoch": 1.4045734388742304, + "grad_norm": 0.40722232658592983, + "learning_rate": 0.0004994411685753933, + "loss": 3.275559186935425, + "step": 2396, + "token_acc": 0.26993604655789366 + }, + { + "epoch": 1.4051597771914395, + "grad_norm": 0.388132141574891, + "learning_rate": 0.0004994395482063171, + "loss": 3.2582221031188965, + "step": 2397, + "token_acc": 0.2734531995054263 + }, + { + "epoch": 1.4057461155086486, + "grad_norm": 0.3784741615173718, + "learning_rate": 0.0004994379254940925, + "loss": 3.256376266479492, + "step": 2398, + "token_acc": 0.27358913016048947 + }, + { + "epoch": 1.4063324538258575, + "grad_norm": 0.4498414540179996, + "learning_rate": 0.0004994363004387347, + "loss": 3.3234100341796875, + "step": 2399, + "token_acc": 0.2639112560905962 + }, + { + "epoch": 1.4069187921430666, + "grad_norm": 0.6080359310461864, + "learning_rate": 0.0004994346730402589, + "loss": 3.2941641807556152, + "step": 2400, + "token_acc": 0.26837289785896473 + }, + { + "epoch": 1.4075051304602755, + "grad_norm": 0.6877505090956137, + "learning_rate": 0.0004994330432986806, + "loss": 3.3409247398376465, + "step": 2401, + "token_acc": 0.26172977221158844 + }, + { + "epoch": 1.4080914687774846, + "grad_norm": 0.5597915550915546, + "learning_rate": 0.0004994314112140149, + "loss": 3.326296806335449, + "step": 2402, + "token_acc": 0.2638461885679608 + }, + { + "epoch": 1.4086778070946937, + "grad_norm": 0.49333852787937604, + "learning_rate": 0.0004994297767862772, + "loss": 3.2763686180114746, + "step": 2403, + "token_acc": 0.270786991929702 + }, + { + "epoch": 1.4092641454119026, + "grad_norm": 0.5410737702556037, + "learning_rate": 0.0004994281400154828, + "loss": 3.2548184394836426, + "step": 2404, + "token_acc": 0.27506715925446734 + }, + { + "epoch": 1.4098504837291117, + "grad_norm": 0.3837311574502268, + "learning_rate": 0.0004994265009016473, + "loss": 3.266517162322998, + "step": 2405, + "token_acc": 0.26983625660936283 + }, + { + "epoch": 1.4104368220463206, + "grad_norm": 0.46608704494965136, + "learning_rate": 0.0004994248594447858, + "loss": 3.290151357650757, + "step": 2406, + "token_acc": 0.2697969477159868 + }, + { + "epoch": 1.4110231603635297, + "grad_norm": 0.45837358308905085, + "learning_rate": 0.0004994232156449139, + "loss": 3.3085005283355713, + "step": 2407, + "token_acc": 0.2664963659349978 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.3732815571359981, + "learning_rate": 0.000499421569502047, + "loss": 3.2698757648468018, + "step": 2408, + "token_acc": 0.2725692626641258 + }, + { + "epoch": 1.412195836997948, + "grad_norm": 0.4438848972434387, + "learning_rate": 0.0004994199210162004, + "loss": 3.3200173377990723, + "step": 2409, + "token_acc": 0.26541926364469753 + }, + { + "epoch": 1.4127821753151568, + "grad_norm": 0.45497076739662157, + "learning_rate": 0.00049941827018739, + "loss": 3.285585880279541, + "step": 2410, + "token_acc": 0.270281418152502 + }, + { + "epoch": 1.413368513632366, + "grad_norm": 0.4449436022453608, + "learning_rate": 0.0004994166170156309, + "loss": 3.235724449157715, + "step": 2411, + "token_acc": 0.275249774347163 + }, + { + "epoch": 1.4139548519495748, + "grad_norm": 0.39202483593487275, + "learning_rate": 0.0004994149615009388, + "loss": 3.293283462524414, + "step": 2412, + "token_acc": 0.2682607661194385 + }, + { + "epoch": 1.414541190266784, + "grad_norm": 0.3698871196713657, + "learning_rate": 0.0004994133036433292, + "loss": 3.2842347621917725, + "step": 2413, + "token_acc": 0.270109723595409 + }, + { + "epoch": 1.415127528583993, + "grad_norm": 0.3930784277401731, + "learning_rate": 0.0004994116434428178, + "loss": 3.243622064590454, + "step": 2414, + "token_acc": 0.27539536669132547 + }, + { + "epoch": 1.415713866901202, + "grad_norm": 0.36735858394106163, + "learning_rate": 0.0004994099808994199, + "loss": 3.302119255065918, + "step": 2415, + "token_acc": 0.266067863230932 + }, + { + "epoch": 1.416300205218411, + "grad_norm": 0.35977657501291754, + "learning_rate": 0.0004994083160131514, + "loss": 3.2615537643432617, + "step": 2416, + "token_acc": 0.2721136938578218 + }, + { + "epoch": 1.41688654353562, + "grad_norm": 0.41114291143599424, + "learning_rate": 0.000499406648784028, + "loss": 3.2673168182373047, + "step": 2417, + "token_acc": 0.2721787332571744 + }, + { + "epoch": 1.417472881852829, + "grad_norm": 0.45010157542119594, + "learning_rate": 0.0004994049792120651, + "loss": 3.278304100036621, + "step": 2418, + "token_acc": 0.2707497179857113 + }, + { + "epoch": 1.4180592201700382, + "grad_norm": 0.44480154864729593, + "learning_rate": 0.0004994033072972785, + "loss": 3.309415340423584, + "step": 2419, + "token_acc": 0.2653199143290844 + }, + { + "epoch": 1.4186455584872473, + "grad_norm": 0.39311308357056846, + "learning_rate": 0.0004994016330396838, + "loss": 3.301295280456543, + "step": 2420, + "token_acc": 0.26797842748262124 + }, + { + "epoch": 1.4192318968044562, + "grad_norm": 0.42357246587152864, + "learning_rate": 0.0004993999564392969, + "loss": 3.302398204803467, + "step": 2421, + "token_acc": 0.2656842115760501 + }, + { + "epoch": 1.4198182351216653, + "grad_norm": 0.4386433829983227, + "learning_rate": 0.0004993982774961336, + "loss": 3.261169672012329, + "step": 2422, + "token_acc": 0.2721297522224324 + }, + { + "epoch": 1.4204045734388742, + "grad_norm": 0.442379950669629, + "learning_rate": 0.0004993965962102094, + "loss": 3.263920307159424, + "step": 2423, + "token_acc": 0.2712250122837415 + }, + { + "epoch": 1.4209909117560833, + "grad_norm": 0.43185003691438056, + "learning_rate": 0.0004993949125815404, + "loss": 3.2329049110412598, + "step": 2424, + "token_acc": 0.27541785401840757 + }, + { + "epoch": 1.4215772500732924, + "grad_norm": 0.42617143291573495, + "learning_rate": 0.0004993932266101421, + "loss": 3.242340564727783, + "step": 2425, + "token_acc": 0.27574158133642984 + }, + { + "epoch": 1.4221635883905013, + "grad_norm": 0.41735024874572524, + "learning_rate": 0.0004993915382960305, + "loss": 3.2836499214172363, + "step": 2426, + "token_acc": 0.27084416865438765 + }, + { + "epoch": 1.4227499267077104, + "grad_norm": 0.4658328761863938, + "learning_rate": 0.0004993898476392215, + "loss": 3.2996325492858887, + "step": 2427, + "token_acc": 0.2684273916485308 + }, + { + "epoch": 1.4233362650249193, + "grad_norm": 0.4735765097561413, + "learning_rate": 0.0004993881546397311, + "loss": 3.339658737182617, + "step": 2428, + "token_acc": 0.2627487185696412 + }, + { + "epoch": 1.4239226033421284, + "grad_norm": 0.5020945496923419, + "learning_rate": 0.0004993864592975748, + "loss": 3.262714385986328, + "step": 2429, + "token_acc": 0.27243149127480154 + }, + { + "epoch": 1.4245089416593375, + "grad_norm": 0.47196321610361586, + "learning_rate": 0.0004993847616127689, + "loss": 3.2826731204986572, + "step": 2430, + "token_acc": 0.2680442607652313 + }, + { + "epoch": 1.4250952799765464, + "grad_norm": 0.4224639972912335, + "learning_rate": 0.0004993830615853292, + "loss": 3.251896858215332, + "step": 2431, + "token_acc": 0.27464382980991564 + }, + { + "epoch": 1.4256816182937555, + "grad_norm": 0.4995368108113014, + "learning_rate": 0.0004993813592152716, + "loss": 3.245842456817627, + "step": 2432, + "token_acc": 0.27499923898815865 + }, + { + "epoch": 1.4262679566109644, + "grad_norm": 0.5317309844684932, + "learning_rate": 0.0004993796545026123, + "loss": 3.318612813949585, + "step": 2433, + "token_acc": 0.26326512799411467 + }, + { + "epoch": 1.4268542949281735, + "grad_norm": 0.48420607871156013, + "learning_rate": 0.000499377947447367, + "loss": 3.2774157524108887, + "step": 2434, + "token_acc": 0.27021338805227096 + }, + { + "epoch": 1.4274406332453826, + "grad_norm": 0.3958515984187191, + "learning_rate": 0.0004993762380495521, + "loss": 3.2691493034362793, + "step": 2435, + "token_acc": 0.273173824313031 + }, + { + "epoch": 1.4280269715625917, + "grad_norm": 0.45421733969826983, + "learning_rate": 0.0004993745263091835, + "loss": 3.280101776123047, + "step": 2436, + "token_acc": 0.2707349903871836 + }, + { + "epoch": 1.4286133098798006, + "grad_norm": 0.3955083766544465, + "learning_rate": 0.0004993728122262772, + "loss": 3.2648422718048096, + "step": 2437, + "token_acc": 0.2720867881435877 + }, + { + "epoch": 1.4291996481970097, + "grad_norm": 0.4069911842909755, + "learning_rate": 0.0004993710958008494, + "loss": 3.2858872413635254, + "step": 2438, + "token_acc": 0.2692404690849481 + }, + { + "epoch": 1.4297859865142186, + "grad_norm": 0.371692531984093, + "learning_rate": 0.0004993693770329161, + "loss": 3.2725448608398438, + "step": 2439, + "token_acc": 0.27045549738219893 + }, + { + "epoch": 1.4303723248314277, + "grad_norm": 0.34977394631058667, + "learning_rate": 0.0004993676559224935, + "loss": 3.2855043411254883, + "step": 2440, + "token_acc": 0.26936375100875365 + }, + { + "epoch": 1.4309586631486368, + "grad_norm": 0.3791605862301265, + "learning_rate": 0.0004993659324695979, + "loss": 3.2726244926452637, + "step": 2441, + "token_acc": 0.2693274852259728 + }, + { + "epoch": 1.4315450014658457, + "grad_norm": 0.44948477554412364, + "learning_rate": 0.0004993642066742454, + "loss": 3.2436182498931885, + "step": 2442, + "token_acc": 0.27496982712948104 + }, + { + "epoch": 1.4321313397830548, + "grad_norm": 0.49342786573180974, + "learning_rate": 0.0004993624785364523, + "loss": 3.269402503967285, + "step": 2443, + "token_acc": 0.2718052738336714 + }, + { + "epoch": 1.4327176781002637, + "grad_norm": 0.5020917570218351, + "learning_rate": 0.0004993607480562346, + "loss": 3.2819466590881348, + "step": 2444, + "token_acc": 0.26872154591594055 + }, + { + "epoch": 1.4333040164174728, + "grad_norm": 0.46251155185863657, + "learning_rate": 0.0004993590152336086, + "loss": 3.267725944519043, + "step": 2445, + "token_acc": 0.27092517382332454 + }, + { + "epoch": 1.433890354734682, + "grad_norm": 0.40637519977868697, + "learning_rate": 0.0004993572800685908, + "loss": 3.2750463485717773, + "step": 2446, + "token_acc": 0.2708327037182548 + }, + { + "epoch": 1.434476693051891, + "grad_norm": 0.3349490013743137, + "learning_rate": 0.0004993555425611973, + "loss": 3.29209041595459, + "step": 2447, + "token_acc": 0.269240022384586 + }, + { + "epoch": 1.4350630313691, + "grad_norm": 0.3914006868213982, + "learning_rate": 0.0004993538027114445, + "loss": 3.2564802169799805, + "step": 2448, + "token_acc": 0.2723435324659011 + }, + { + "epoch": 1.435649369686309, + "grad_norm": 0.3795655466018845, + "learning_rate": 0.0004993520605193488, + "loss": 3.299866199493408, + "step": 2449, + "token_acc": 0.26683006665976994 + }, + { + "epoch": 1.436235708003518, + "grad_norm": 0.39320495729549493, + "learning_rate": 0.0004993503159849265, + "loss": 3.2362561225891113, + "step": 2450, + "token_acc": 0.277570010658319 + }, + { + "epoch": 1.436822046320727, + "grad_norm": 0.3855139263559048, + "learning_rate": 0.0004993485691081938, + "loss": 3.2747299671173096, + "step": 2451, + "token_acc": 0.2712236518508354 + }, + { + "epoch": 1.4374083846379362, + "grad_norm": 0.3171450553692962, + "learning_rate": 0.0004993468198891674, + "loss": 3.2112998962402344, + "step": 2452, + "token_acc": 0.2782831704248524 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.343709236642012, + "learning_rate": 0.0004993450683278638, + "loss": 3.2441320419311523, + "step": 2453, + "token_acc": 0.275946520190088 + }, + { + "epoch": 1.4385810612723542, + "grad_norm": 0.4122018025254066, + "learning_rate": 0.0004993433144242991, + "loss": 3.3050241470336914, + "step": 2454, + "token_acc": 0.2652505405302887 + }, + { + "epoch": 1.439167399589563, + "grad_norm": 0.4482920361446149, + "learning_rate": 0.0004993415581784899, + "loss": 3.2310972213745117, + "step": 2455, + "token_acc": 0.278045961471679 + }, + { + "epoch": 1.4397537379067722, + "grad_norm": 0.42084961180158714, + "learning_rate": 0.0004993397995904529, + "loss": 3.282933473587036, + "step": 2456, + "token_acc": 0.268900026237022 + }, + { + "epoch": 1.4403400762239813, + "grad_norm": 0.4193544433482455, + "learning_rate": 0.0004993380386602044, + "loss": 3.254255533218384, + "step": 2457, + "token_acc": 0.27231084229109864 + }, + { + "epoch": 1.4409264145411902, + "grad_norm": 0.42854566305269237, + "learning_rate": 0.0004993362753877611, + "loss": 3.3185832500457764, + "step": 2458, + "token_acc": 0.2659728533633944 + }, + { + "epoch": 1.4415127528583993, + "grad_norm": 0.46673963600122176, + "learning_rate": 0.0004993345097731393, + "loss": 3.273925542831421, + "step": 2459, + "token_acc": 0.26833342992564074 + }, + { + "epoch": 1.4420990911756082, + "grad_norm": 0.5317865469910968, + "learning_rate": 0.0004993327418163559, + "loss": 3.2802789211273193, + "step": 2460, + "token_acc": 0.26909991724496035 + }, + { + "epoch": 1.4426854294928173, + "grad_norm": 0.4691112477627041, + "learning_rate": 0.0004993309715174274, + "loss": 3.2661590576171875, + "step": 2461, + "token_acc": 0.2712857671484171 + }, + { + "epoch": 1.4432717678100264, + "grad_norm": 0.3974951175482918, + "learning_rate": 0.0004993291988763703, + "loss": 3.2456650733947754, + "step": 2462, + "token_acc": 0.2727835286441477 + }, + { + "epoch": 1.4438581061272355, + "grad_norm": 0.40054505264099066, + "learning_rate": 0.0004993274238932014, + "loss": 3.2457351684570312, + "step": 2463, + "token_acc": 0.27604568545297015 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.47178570681883564, + "learning_rate": 0.0004993256465679373, + "loss": 3.2745256423950195, + "step": 2464, + "token_acc": 0.2701804417444937 + }, + { + "epoch": 1.4450307827616535, + "grad_norm": 0.5517408575569929, + "learning_rate": 0.0004993238669005947, + "loss": 3.3075637817382812, + "step": 2465, + "token_acc": 0.26637777147318975 + }, + { + "epoch": 1.4456171210788624, + "grad_norm": 0.44406072492792575, + "learning_rate": 0.0004993220848911904, + "loss": 3.31062912940979, + "step": 2466, + "token_acc": 0.2655964311183318 + }, + { + "epoch": 1.4462034593960715, + "grad_norm": 0.4131787890915645, + "learning_rate": 0.000499320300539741, + "loss": 3.2557196617126465, + "step": 2467, + "token_acc": 0.2733014187750637 + }, + { + "epoch": 1.4467897977132806, + "grad_norm": 0.5057645729067284, + "learning_rate": 0.0004993185138462634, + "loss": 3.273193120956421, + "step": 2468, + "token_acc": 0.27069373145076525 + }, + { + "epoch": 1.4473761360304895, + "grad_norm": 0.37248904123597704, + "learning_rate": 0.0004993167248107744, + "loss": 3.324291706085205, + "step": 2469, + "token_acc": 0.26502664699706285 + }, + { + "epoch": 1.4479624743476986, + "grad_norm": 0.4182448605169437, + "learning_rate": 0.0004993149334332906, + "loss": 3.2704477310180664, + "step": 2470, + "token_acc": 0.26992079544226405 + }, + { + "epoch": 1.4485488126649075, + "grad_norm": 0.41411552948298735, + "learning_rate": 0.000499313139713829, + "loss": 3.242617130279541, + "step": 2471, + "token_acc": 0.27521945603683984 + }, + { + "epoch": 1.4491351509821166, + "grad_norm": 0.42963211641251975, + "learning_rate": 0.0004993113436524063, + "loss": 3.315239906311035, + "step": 2472, + "token_acc": 0.26580921480899783 + }, + { + "epoch": 1.4497214892993258, + "grad_norm": 0.4286416688189302, + "learning_rate": 0.0004993095452490397, + "loss": 3.2252345085144043, + "step": 2473, + "token_acc": 0.2757206792505042 + }, + { + "epoch": 1.4503078276165349, + "grad_norm": 0.47164884619082065, + "learning_rate": 0.0004993077445037457, + "loss": 3.2793056964874268, + "step": 2474, + "token_acc": 0.26860372268262617 + }, + { + "epoch": 1.4508941659337438, + "grad_norm": 0.5175823965836549, + "learning_rate": 0.0004993059414165415, + "loss": 3.3221418857574463, + "step": 2475, + "token_acc": 0.2621854570975704 + }, + { + "epoch": 1.4514805042509529, + "grad_norm": 0.5342560600401027, + "learning_rate": 0.0004993041359874439, + "loss": 3.337784767150879, + "step": 2476, + "token_acc": 0.25989950375756127 + }, + { + "epoch": 1.4520668425681618, + "grad_norm": 0.458328714064463, + "learning_rate": 0.0004993023282164698, + "loss": 3.2408292293548584, + "step": 2477, + "token_acc": 0.2749948885708444 + }, + { + "epoch": 1.4526531808853709, + "grad_norm": 0.39698258685234666, + "learning_rate": 0.0004993005181036363, + "loss": 3.320158004760742, + "step": 2478, + "token_acc": 0.2643404055094533 + }, + { + "epoch": 1.45323951920258, + "grad_norm": 0.3765396597582679, + "learning_rate": 0.0004992987056489604, + "loss": 3.238387107849121, + "step": 2479, + "token_acc": 0.2754056384956155 + }, + { + "epoch": 1.4538258575197889, + "grad_norm": 0.42077219527452947, + "learning_rate": 0.0004992968908524591, + "loss": 3.274458169937134, + "step": 2480, + "token_acc": 0.2701064778888132 + }, + { + "epoch": 1.454412195836998, + "grad_norm": 0.41389339023504246, + "learning_rate": 0.0004992950737141494, + "loss": 3.2594246864318848, + "step": 2481, + "token_acc": 0.27272703628471184 + }, + { + "epoch": 1.4549985341542069, + "grad_norm": 0.3405634738360139, + "learning_rate": 0.0004992932542340485, + "loss": 3.2280430793762207, + "step": 2482, + "token_acc": 0.2774763759120692 + }, + { + "epoch": 1.455584872471416, + "grad_norm": 0.4297230054259209, + "learning_rate": 0.0004992914324121732, + "loss": 3.2634799480438232, + "step": 2483, + "token_acc": 0.27272418788582764 + }, + { + "epoch": 1.456171210788625, + "grad_norm": 0.4059970885983026, + "learning_rate": 0.000499289608248541, + "loss": 3.259629726409912, + "step": 2484, + "token_acc": 0.27341594141371256 + }, + { + "epoch": 1.456757549105834, + "grad_norm": 0.4114114119208386, + "learning_rate": 0.0004992877817431688, + "loss": 3.2147793769836426, + "step": 2485, + "token_acc": 0.2775868208772663 + }, + { + "epoch": 1.457343887423043, + "grad_norm": 0.3621918317386891, + "learning_rate": 0.0004992859528960738, + "loss": 3.2161388397216797, + "step": 2486, + "token_acc": 0.27861182671522905 + }, + { + "epoch": 1.457930225740252, + "grad_norm": 0.33401741015434977, + "learning_rate": 0.0004992841217072733, + "loss": 3.24991512298584, + "step": 2487, + "token_acc": 0.2740352855054979 + }, + { + "epoch": 1.458516564057461, + "grad_norm": 0.34819618250013823, + "learning_rate": 0.0004992822881767843, + "loss": 3.252859354019165, + "step": 2488, + "token_acc": 0.27454571706649267 + }, + { + "epoch": 1.4591029023746702, + "grad_norm": 0.3434519470462143, + "learning_rate": 0.0004992804523046242, + "loss": 3.252744436264038, + "step": 2489, + "token_acc": 0.27188809202584674 + }, + { + "epoch": 1.4596892406918793, + "grad_norm": 0.3676748689804412, + "learning_rate": 0.00049927861409081, + "loss": 3.218554973602295, + "step": 2490, + "token_acc": 0.27627457235345493 + }, + { + "epoch": 1.4602755790090882, + "grad_norm": 0.4093801308459334, + "learning_rate": 0.0004992767735353591, + "loss": 3.230775833129883, + "step": 2491, + "token_acc": 0.2751333000295656 + }, + { + "epoch": 1.4608619173262973, + "grad_norm": 0.47916584493949715, + "learning_rate": 0.0004992749306382889, + "loss": 3.272024154663086, + "step": 2492, + "token_acc": 0.2706898827036384 + }, + { + "epoch": 1.4614482556435062, + "grad_norm": 0.49056228612230235, + "learning_rate": 0.0004992730853996168, + "loss": 3.292412519454956, + "step": 2493, + "token_acc": 0.2688209593017831 + }, + { + "epoch": 1.4620345939607153, + "grad_norm": 0.4262599691382597, + "learning_rate": 0.0004992712378193598, + "loss": 3.2419066429138184, + "step": 2494, + "token_acc": 0.27397899465375974 + }, + { + "epoch": 1.4626209322779244, + "grad_norm": 0.37095447757174566, + "learning_rate": 0.0004992693878975354, + "loss": 3.273664951324463, + "step": 2495, + "token_acc": 0.27020208628838516 + }, + { + "epoch": 1.4632072705951333, + "grad_norm": 0.38679784066929224, + "learning_rate": 0.000499267535634161, + "loss": 3.290173292160034, + "step": 2496, + "token_acc": 0.26697974727456775 + }, + { + "epoch": 1.4637936089123424, + "grad_norm": 0.3810999761910147, + "learning_rate": 0.000499265681029254, + "loss": 3.292463541030884, + "step": 2497, + "token_acc": 0.2668369650759247 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.36770983731819634, + "learning_rate": 0.0004992638240828319, + "loss": 3.300384521484375, + "step": 2498, + "token_acc": 0.26715834547625683 + }, + { + "epoch": 1.4649662855467604, + "grad_norm": 0.32980598892475077, + "learning_rate": 0.0004992619647949119, + "loss": 3.2862884998321533, + "step": 2499, + "token_acc": 0.26934831671995485 + }, + { + "epoch": 1.4655526238639696, + "grad_norm": 0.39278428036680624, + "learning_rate": 0.0004992601031655117, + "loss": 3.3008973598480225, + "step": 2500, + "token_acc": 0.2664430445194996 + }, + { + "epoch": 1.4661389621811787, + "grad_norm": 0.44455873849917044, + "learning_rate": 0.0004992582391946488, + "loss": 3.2516372203826904, + "step": 2501, + "token_acc": 0.2744417779632721 + }, + { + "epoch": 1.4667253004983876, + "grad_norm": 0.427252733739739, + "learning_rate": 0.0004992563728823406, + "loss": 3.2190747261047363, + "step": 2502, + "token_acc": 0.2800599233550749 + }, + { + "epoch": 1.4673116388155967, + "grad_norm": 0.4090301357351065, + "learning_rate": 0.0004992545042286046, + "loss": 3.302269220352173, + "step": 2503, + "token_acc": 0.2660768422355733 + }, + { + "epoch": 1.4678979771328056, + "grad_norm": 0.39886847311459095, + "learning_rate": 0.0004992526332334583, + "loss": 3.252699613571167, + "step": 2504, + "token_acc": 0.271145524819142 + }, + { + "epoch": 1.4684843154500147, + "grad_norm": 0.4078953172404111, + "learning_rate": 0.0004992507598969196, + "loss": 3.2592248916625977, + "step": 2505, + "token_acc": 0.2716038202090196 + }, + { + "epoch": 1.4690706537672238, + "grad_norm": 0.43734613408854095, + "learning_rate": 0.0004992488842190057, + "loss": 3.270545721054077, + "step": 2506, + "token_acc": 0.27089965351989276 + }, + { + "epoch": 1.4696569920844327, + "grad_norm": 0.48848822685530585, + "learning_rate": 0.0004992470061997345, + "loss": 3.2691125869750977, + "step": 2507, + "token_acc": 0.27078320308340964 + }, + { + "epoch": 1.4702433304016418, + "grad_norm": 0.5420366381466455, + "learning_rate": 0.0004992451258391236, + "loss": 3.287473678588867, + "step": 2508, + "token_acc": 0.26925375976639987 + }, + { + "epoch": 1.4708296687188507, + "grad_norm": 0.49461804919858526, + "learning_rate": 0.0004992432431371905, + "loss": 3.262782096862793, + "step": 2509, + "token_acc": 0.27267549764820787 + }, + { + "epoch": 1.4714160070360598, + "grad_norm": 0.4094541211173672, + "learning_rate": 0.000499241358093953, + "loss": 3.2442383766174316, + "step": 2510, + "token_acc": 0.27638559380009864 + }, + { + "epoch": 1.472002345353269, + "grad_norm": 0.3658091359430242, + "learning_rate": 0.0004992394707094289, + "loss": 3.259352684020996, + "step": 2511, + "token_acc": 0.27263883730155725 + }, + { + "epoch": 1.4725886836704778, + "grad_norm": 0.4121571918774716, + "learning_rate": 0.0004992375809836357, + "loss": 3.3123250007629395, + "step": 2512, + "token_acc": 0.2660374538013083 + }, + { + "epoch": 1.473175021987687, + "grad_norm": 0.4282535189365276, + "learning_rate": 0.0004992356889165913, + "loss": 3.2643938064575195, + "step": 2513, + "token_acc": 0.2737601321002591 + }, + { + "epoch": 1.4737613603048958, + "grad_norm": 0.4411158385694457, + "learning_rate": 0.0004992337945083134, + "loss": 3.2541675567626953, + "step": 2514, + "token_acc": 0.2731821033310357 + }, + { + "epoch": 1.474347698622105, + "grad_norm": 0.39037770660671206, + "learning_rate": 0.0004992318977588199, + "loss": 3.266140937805176, + "step": 2515, + "token_acc": 0.2711929502069963 + }, + { + "epoch": 1.474934036939314, + "grad_norm": 0.36433353861365153, + "learning_rate": 0.0004992299986681287, + "loss": 3.271409034729004, + "step": 2516, + "token_acc": 0.2706185635718494 + }, + { + "epoch": 1.4755203752565231, + "grad_norm": 0.34204519996515037, + "learning_rate": 0.0004992280972362573, + "loss": 3.2876739501953125, + "step": 2517, + "token_acc": 0.26861219999690117 + }, + { + "epoch": 1.476106713573732, + "grad_norm": 0.3998745624048512, + "learning_rate": 0.0004992261934632239, + "loss": 3.2723703384399414, + "step": 2518, + "token_acc": 0.2713411164971211 + }, + { + "epoch": 1.4766930518909411, + "grad_norm": 0.3720749801473729, + "learning_rate": 0.0004992242873490462, + "loss": 3.216571092605591, + "step": 2519, + "token_acc": 0.27758389054465155 + }, + { + "epoch": 1.47727939020815, + "grad_norm": 0.4728686180618035, + "learning_rate": 0.0004992223788937421, + "loss": 3.281606674194336, + "step": 2520, + "token_acc": 0.26887030001885287 + }, + { + "epoch": 1.4778657285253591, + "grad_norm": 0.5673286721023328, + "learning_rate": 0.0004992204680973297, + "loss": 3.2853360176086426, + "step": 2521, + "token_acc": 0.26759280641620764 + }, + { + "epoch": 1.4784520668425682, + "grad_norm": 0.4467303957471683, + "learning_rate": 0.0004992185549598267, + "loss": 3.2037525177001953, + "step": 2522, + "token_acc": 0.2789331040054205 + }, + { + "epoch": 1.4790384051597771, + "grad_norm": 0.4201350550856369, + "learning_rate": 0.0004992166394812513, + "loss": 3.320878505706787, + "step": 2523, + "token_acc": 0.26204837693495775 + }, + { + "epoch": 1.4796247434769862, + "grad_norm": 0.37489777082272246, + "learning_rate": 0.0004992147216616214, + "loss": 3.279184341430664, + "step": 2524, + "token_acc": 0.2688679245283019 + }, + { + "epoch": 1.4802110817941951, + "grad_norm": 0.3445151654199398, + "learning_rate": 0.000499212801500955, + "loss": 3.2414145469665527, + "step": 2525, + "token_acc": 0.2742514907526343 + }, + { + "epoch": 1.4807974201114043, + "grad_norm": 0.4057399445518621, + "learning_rate": 0.0004992108789992701, + "loss": 3.288478136062622, + "step": 2526, + "token_acc": 0.2704312093789201 + }, + { + "epoch": 1.4813837584286134, + "grad_norm": 0.46686801053346844, + "learning_rate": 0.0004992089541565848, + "loss": 3.2893803119659424, + "step": 2527, + "token_acc": 0.2675993793804505 + }, + { + "epoch": 1.4819700967458225, + "grad_norm": 0.4408039850018635, + "learning_rate": 0.0004992070269729173, + "loss": 3.2801411151885986, + "step": 2528, + "token_acc": 0.2698625561978163 + }, + { + "epoch": 1.4825564350630314, + "grad_norm": 0.431038949250353, + "learning_rate": 0.0004992050974482855, + "loss": 3.286717176437378, + "step": 2529, + "token_acc": 0.26773762792559697 + }, + { + "epoch": 1.4831427733802405, + "grad_norm": 0.3576258072283714, + "learning_rate": 0.0004992031655827076, + "loss": 3.2499890327453613, + "step": 2530, + "token_acc": 0.2736560310486608 + }, + { + "epoch": 1.4837291116974494, + "grad_norm": 0.4057627696885976, + "learning_rate": 0.0004992012313762017, + "loss": 3.2701375484466553, + "step": 2531, + "token_acc": 0.2716592678376118 + }, + { + "epoch": 1.4843154500146585, + "grad_norm": 0.39603020271321543, + "learning_rate": 0.0004991992948287863, + "loss": 3.2669310569763184, + "step": 2532, + "token_acc": 0.2701963593784906 + }, + { + "epoch": 1.4849017883318676, + "grad_norm": 0.38895173695480456, + "learning_rate": 0.0004991973559404791, + "loss": 3.25887393951416, + "step": 2533, + "token_acc": 0.27116314402802866 + }, + { + "epoch": 1.4854881266490765, + "grad_norm": 0.43479576544405085, + "learning_rate": 0.0004991954147112986, + "loss": 3.2750566005706787, + "step": 2534, + "token_acc": 0.2705086670277928 + }, + { + "epoch": 1.4860744649662856, + "grad_norm": 0.38123934526684516, + "learning_rate": 0.0004991934711412629, + "loss": 3.253174304962158, + "step": 2535, + "token_acc": 0.27202693987218346 + }, + { + "epoch": 1.4866608032834945, + "grad_norm": 0.371384013724577, + "learning_rate": 0.0004991915252303905, + "loss": 3.285108804702759, + "step": 2536, + "token_acc": 0.268993812393922 + }, + { + "epoch": 1.4872471416007036, + "grad_norm": 0.36054233736719843, + "learning_rate": 0.0004991895769786993, + "loss": 3.26125431060791, + "step": 2537, + "token_acc": 0.27457485201299425 + }, + { + "epoch": 1.4878334799179127, + "grad_norm": 0.3459758782275917, + "learning_rate": 0.000499187626386208, + "loss": 3.2527108192443848, + "step": 2538, + "token_acc": 0.27326024872370946 + }, + { + "epoch": 1.4884198182351216, + "grad_norm": 0.38426839468638996, + "learning_rate": 0.0004991856734529347, + "loss": 3.2523908615112305, + "step": 2539, + "token_acc": 0.27540276577580874 + }, + { + "epoch": 1.4890061565523307, + "grad_norm": 0.36496578914511524, + "learning_rate": 0.0004991837181788977, + "loss": 3.3056159019470215, + "step": 2540, + "token_acc": 0.2653258627926925 + }, + { + "epoch": 1.4895924948695396, + "grad_norm": 0.35812148403270494, + "learning_rate": 0.0004991817605641155, + "loss": 3.2906641960144043, + "step": 2541, + "token_acc": 0.268473345334239 + }, + { + "epoch": 1.4901788331867487, + "grad_norm": 0.393978117896672, + "learning_rate": 0.0004991798006086063, + "loss": 3.243887424468994, + "step": 2542, + "token_acc": 0.27553603882801586 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.3454127374929976, + "learning_rate": 0.0004991778383123889, + "loss": 3.2879605293273926, + "step": 2543, + "token_acc": 0.2684119714778691 + }, + { + "epoch": 1.491351509821167, + "grad_norm": 0.34927344356638507, + "learning_rate": 0.0004991758736754814, + "loss": 3.2568609714508057, + "step": 2544, + "token_acc": 0.2726638246009693 + }, + { + "epoch": 1.4919378481383758, + "grad_norm": 0.30426900052763595, + "learning_rate": 0.0004991739066979022, + "loss": 3.2414450645446777, + "step": 2545, + "token_acc": 0.2743923916872138 + }, + { + "epoch": 1.492524186455585, + "grad_norm": 0.367144824183199, + "learning_rate": 0.00049917193737967, + "loss": 3.2983627319335938, + "step": 2546, + "token_acc": 0.2667659673743338 + }, + { + "epoch": 1.4931105247727938, + "grad_norm": 0.4167837345012218, + "learning_rate": 0.0004991699657208032, + "loss": 3.2750966548919678, + "step": 2547, + "token_acc": 0.2689232817083218 + }, + { + "epoch": 1.493696863090003, + "grad_norm": 0.38857331549122276, + "learning_rate": 0.0004991679917213203, + "loss": 3.2825686931610107, + "step": 2548, + "token_acc": 0.268260187053966 + }, + { + "epoch": 1.494283201407212, + "grad_norm": 0.44692810512357894, + "learning_rate": 0.00049916601538124, + "loss": 3.2607336044311523, + "step": 2549, + "token_acc": 0.2724953679370995 + }, + { + "epoch": 1.494869539724421, + "grad_norm": 0.5810174130057221, + "learning_rate": 0.0004991640367005806, + "loss": 3.252685308456421, + "step": 2550, + "token_acc": 0.26933843367237076 + }, + { + "epoch": 1.49545587804163, + "grad_norm": 0.6489466846847916, + "learning_rate": 0.0004991620556793609, + "loss": 3.2388062477111816, + "step": 2551, + "token_acc": 0.2745364469032945 + }, + { + "epoch": 1.496042216358839, + "grad_norm": 0.5430146023631852, + "learning_rate": 0.0004991600723175994, + "loss": 3.3004467487335205, + "step": 2552, + "token_acc": 0.26614094360679574 + }, + { + "epoch": 1.496628554676048, + "grad_norm": 0.4988531391079722, + "learning_rate": 0.0004991580866153148, + "loss": 3.2348427772521973, + "step": 2553, + "token_acc": 0.27469680556923287 + }, + { + "epoch": 1.4972148929932572, + "grad_norm": 0.5388112096588197, + "learning_rate": 0.0004991560985725257, + "loss": 3.233935832977295, + "step": 2554, + "token_acc": 0.27664990529141625 + }, + { + "epoch": 1.4978012313104663, + "grad_norm": 0.4819893143576697, + "learning_rate": 0.0004991541081892508, + "loss": 3.2734124660491943, + "step": 2555, + "token_acc": 0.2714117184427841 + }, + { + "epoch": 1.4983875696276752, + "grad_norm": 0.4377724039456534, + "learning_rate": 0.0004991521154655088, + "loss": 3.2604904174804688, + "step": 2556, + "token_acc": 0.272985248498017 + }, + { + "epoch": 1.4989739079448843, + "grad_norm": 0.4873880855879326, + "learning_rate": 0.0004991501204013184, + "loss": 3.314098596572876, + "step": 2557, + "token_acc": 0.265938466946088 + }, + { + "epoch": 1.4995602462620932, + "grad_norm": 0.39701673753830447, + "learning_rate": 0.0004991481229966985, + "loss": 3.2702064514160156, + "step": 2558, + "token_acc": 0.27099487517905446 + }, + { + "epoch": 1.5001465845793023, + "grad_norm": 0.374282002245181, + "learning_rate": 0.0004991461232516675, + "loss": 3.2452337741851807, + "step": 2559, + "token_acc": 0.2735383869460141 + }, + { + "epoch": 1.5007329228965114, + "grad_norm": 0.34440348717032154, + "learning_rate": 0.0004991441211662444, + "loss": 3.24863862991333, + "step": 2560, + "token_acc": 0.27339911646917964 + }, + { + "epoch": 1.5013192612137203, + "grad_norm": 0.3448228171012622, + "learning_rate": 0.0004991421167404482, + "loss": 3.243621587753296, + "step": 2561, + "token_acc": 0.27413607873412116 + }, + { + "epoch": 1.5019055995309294, + "grad_norm": 0.3850168564546283, + "learning_rate": 0.0004991401099742974, + "loss": 3.319021224975586, + "step": 2562, + "token_acc": 0.26408151786817213 + }, + { + "epoch": 1.5024919378481383, + "grad_norm": 0.3385927206564174, + "learning_rate": 0.000499138100867811, + "loss": 3.2335896492004395, + "step": 2563, + "token_acc": 0.27524289628341597 + }, + { + "epoch": 1.5030782761653474, + "grad_norm": 0.362794282544457, + "learning_rate": 0.0004991360894210079, + "loss": 3.262423515319824, + "step": 2564, + "token_acc": 0.27464957787621574 + }, + { + "epoch": 1.5036646144825565, + "grad_norm": 0.4273206532090922, + "learning_rate": 0.0004991340756339069, + "loss": 3.2533841133117676, + "step": 2565, + "token_acc": 0.2719156340067206 + }, + { + "epoch": 1.5042509527997656, + "grad_norm": 0.4682393230211044, + "learning_rate": 0.000499132059506527, + "loss": 3.2540810108184814, + "step": 2566, + "token_acc": 0.27131067377240375 + }, + { + "epoch": 1.5048372911169745, + "grad_norm": 0.3674835501977162, + "learning_rate": 0.0004991300410388871, + "loss": 3.2532877922058105, + "step": 2567, + "token_acc": 0.2734548079228056 + }, + { + "epoch": 1.5054236294341834, + "grad_norm": 0.36979177350080983, + "learning_rate": 0.0004991280202310062, + "loss": 3.263533592224121, + "step": 2568, + "token_acc": 0.27035336847680785 + }, + { + "epoch": 1.5060099677513925, + "grad_norm": 0.3643822037132622, + "learning_rate": 0.0004991259970829032, + "loss": 3.2781331539154053, + "step": 2569, + "token_acc": 0.26862745098039215 + }, + { + "epoch": 1.5065963060686016, + "grad_norm": 0.4476152904867317, + "learning_rate": 0.0004991239715945972, + "loss": 3.23483943939209, + "step": 2570, + "token_acc": 0.2737736123824582 + }, + { + "epoch": 1.5071826443858107, + "grad_norm": 0.5344595048723654, + "learning_rate": 0.0004991219437661072, + "loss": 3.2789430618286133, + "step": 2571, + "token_acc": 0.26990771932746394 + }, + { + "epoch": 1.5077689827030196, + "grad_norm": 0.4592028198727494, + "learning_rate": 0.0004991199135974522, + "loss": 3.2721993923187256, + "step": 2572, + "token_acc": 0.27100821431785166 + }, + { + "epoch": 1.5083553210202285, + "grad_norm": 0.37682639727390393, + "learning_rate": 0.0004991178810886514, + "loss": 3.248673439025879, + "step": 2573, + "token_acc": 0.27404235118766573 + }, + { + "epoch": 1.5089416593374376, + "grad_norm": 0.40615964776211, + "learning_rate": 0.0004991158462397236, + "loss": 3.289583683013916, + "step": 2574, + "token_acc": 0.268455007796814 + }, + { + "epoch": 1.5095279976546467, + "grad_norm": 0.3691587872426139, + "learning_rate": 0.0004991138090506882, + "loss": 3.28464937210083, + "step": 2575, + "token_acc": 0.2700779782571575 + }, + { + "epoch": 1.5101143359718558, + "grad_norm": 0.35995416533838553, + "learning_rate": 0.0004991117695215643, + "loss": 3.2437875270843506, + "step": 2576, + "token_acc": 0.2777024414138029 + }, + { + "epoch": 1.5107006742890647, + "grad_norm": 0.4020323682739989, + "learning_rate": 0.000499109727652371, + "loss": 3.2557735443115234, + "step": 2577, + "token_acc": 0.27351970942303905 + }, + { + "epoch": 1.5112870126062738, + "grad_norm": 0.40012308842668476, + "learning_rate": 0.0004991076834431275, + "loss": 3.312852382659912, + "step": 2578, + "token_acc": 0.2641404185260604 + }, + { + "epoch": 1.5118733509234827, + "grad_norm": 0.38016669868628233, + "learning_rate": 0.000499105636893853, + "loss": 3.2731266021728516, + "step": 2579, + "token_acc": 0.27011038843863516 + }, + { + "epoch": 1.5124596892406919, + "grad_norm": 0.3536723847367996, + "learning_rate": 0.0004991035880045667, + "loss": 3.257784366607666, + "step": 2580, + "token_acc": 0.27153840979764166 + }, + { + "epoch": 1.513046027557901, + "grad_norm": 0.34200990305898926, + "learning_rate": 0.0004991015367752878, + "loss": 3.3083670139312744, + "step": 2581, + "token_acc": 0.2667002219467596 + }, + { + "epoch": 1.51363236587511, + "grad_norm": 0.4032844574981755, + "learning_rate": 0.0004990994832060356, + "loss": 3.3118104934692383, + "step": 2582, + "token_acc": 0.2641626375675668 + }, + { + "epoch": 1.514218704192319, + "grad_norm": 0.4728202127626407, + "learning_rate": 0.0004990974272968295, + "loss": 3.2963128089904785, + "step": 2583, + "token_acc": 0.26776409346843894 + }, + { + "epoch": 1.5148050425095279, + "grad_norm": 0.45323267559923375, + "learning_rate": 0.0004990953690476887, + "loss": 3.2584738731384277, + "step": 2584, + "token_acc": 0.2733442967647285 + }, + { + "epoch": 1.515391380826737, + "grad_norm": 0.41946261004931057, + "learning_rate": 0.0004990933084586327, + "loss": 3.2761378288269043, + "step": 2585, + "token_acc": 0.27130436160185034 + }, + { + "epoch": 1.515977719143946, + "grad_norm": 0.475321890835984, + "learning_rate": 0.0004990912455296806, + "loss": 3.2937917709350586, + "step": 2586, + "token_acc": 0.26510702183784696 + }, + { + "epoch": 1.5165640574611552, + "grad_norm": 0.41653501663177966, + "learning_rate": 0.0004990891802608519, + "loss": 3.2514376640319824, + "step": 2587, + "token_acc": 0.2732557036153398 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.40933631326340525, + "learning_rate": 0.000499087112652166, + "loss": 3.2598648071289062, + "step": 2588, + "token_acc": 0.2699195041631175 + }, + { + "epoch": 1.5177367340955732, + "grad_norm": 0.43205113531703115, + "learning_rate": 0.0004990850427036424, + "loss": 3.259249448776245, + "step": 2589, + "token_acc": 0.2716156976668106 + }, + { + "epoch": 1.518323072412782, + "grad_norm": 0.34824819515019817, + "learning_rate": 0.0004990829704153004, + "loss": 3.2567336559295654, + "step": 2590, + "token_acc": 0.273110723354454 + }, + { + "epoch": 1.5189094107299912, + "grad_norm": 0.3757132671888345, + "learning_rate": 0.0004990808957871596, + "loss": 3.251190662384033, + "step": 2591, + "token_acc": 0.27156952068289575 + }, + { + "epoch": 1.5194957490472003, + "grad_norm": 0.3800848425365529, + "learning_rate": 0.0004990788188192393, + "loss": 3.2771501541137695, + "step": 2592, + "token_acc": 0.2690651998853508 + }, + { + "epoch": 1.5200820873644094, + "grad_norm": 0.40526590172302934, + "learning_rate": 0.0004990767395115593, + "loss": 3.2796850204467773, + "step": 2593, + "token_acc": 0.269747386489365 + }, + { + "epoch": 1.5206684256816183, + "grad_norm": 0.4112730647827353, + "learning_rate": 0.0004990746578641389, + "loss": 3.26863169670105, + "step": 2594, + "token_acc": 0.2725010098904106 + }, + { + "epoch": 1.5212547639988272, + "grad_norm": 0.3656664415754131, + "learning_rate": 0.0004990725738769977, + "loss": 3.2911901473999023, + "step": 2595, + "token_acc": 0.27179157626952116 + }, + { + "epoch": 1.5218411023160363, + "grad_norm": 0.36439374169814104, + "learning_rate": 0.0004990704875501553, + "loss": 3.2122573852539062, + "step": 2596, + "token_acc": 0.27790043551899346 + }, + { + "epoch": 1.5224274406332454, + "grad_norm": 0.4497139664839275, + "learning_rate": 0.0004990683988836313, + "loss": 3.280503749847412, + "step": 2597, + "token_acc": 0.2700311026926968 + }, + { + "epoch": 1.5230137789504545, + "grad_norm": 0.4490913930307062, + "learning_rate": 0.0004990663078774453, + "loss": 3.254506826400757, + "step": 2598, + "token_acc": 0.27037192228849005 + }, + { + "epoch": 1.5236001172676634, + "grad_norm": 0.45112041187206764, + "learning_rate": 0.000499064214531617, + "loss": 3.2496957778930664, + "step": 2599, + "token_acc": 0.272075400642249 + }, + { + "epoch": 1.5241864555848723, + "grad_norm": 0.4492685229374902, + "learning_rate": 0.000499062118846166, + "loss": 3.301370143890381, + "step": 2600, + "token_acc": 0.26543296642634984 + }, + { + "epoch": 1.5247727939020814, + "grad_norm": 0.3904869025112164, + "learning_rate": 0.000499060020821112, + "loss": 3.2064359188079834, + "step": 2601, + "token_acc": 0.27939304968878625 + }, + { + "epoch": 1.5253591322192905, + "grad_norm": 0.3499647392180847, + "learning_rate": 0.0004990579204564747, + "loss": 3.21273136138916, + "step": 2602, + "token_acc": 0.2794000506661026 + }, + { + "epoch": 1.5259454705364996, + "grad_norm": 0.32554426093460603, + "learning_rate": 0.0004990558177522739, + "loss": 3.23994779586792, + "step": 2603, + "token_acc": 0.2735700223299816 + }, + { + "epoch": 1.5265318088537085, + "grad_norm": 0.3338394247619128, + "learning_rate": 0.0004990537127085292, + "loss": 3.250572919845581, + "step": 2604, + "token_acc": 0.2724101746695889 + }, + { + "epoch": 1.5271181471709177, + "grad_norm": 0.312988608011108, + "learning_rate": 0.0004990516053252606, + "loss": 3.2569456100463867, + "step": 2605, + "token_acc": 0.2736684404684555 + }, + { + "epoch": 1.5277044854881265, + "grad_norm": 0.3299606179099335, + "learning_rate": 0.0004990494956024877, + "loss": 3.246515989303589, + "step": 2606, + "token_acc": 0.27203160816807215 + }, + { + "epoch": 1.5282908238053357, + "grad_norm": 0.35524688646385216, + "learning_rate": 0.0004990473835402304, + "loss": 3.288687229156494, + "step": 2607, + "token_acc": 0.2672541716687477 + }, + { + "epoch": 1.5288771621225448, + "grad_norm": 0.3938440317275084, + "learning_rate": 0.0004990452691385086, + "loss": 3.254031181335449, + "step": 2608, + "token_acc": 0.27265844660661687 + }, + { + "epoch": 1.5294635004397539, + "grad_norm": 0.377281942458023, + "learning_rate": 0.0004990431523973419, + "loss": 3.2196178436279297, + "step": 2609, + "token_acc": 0.27718507618259297 + }, + { + "epoch": 1.5300498387569628, + "grad_norm": 0.3156695530810945, + "learning_rate": 0.0004990410333167506, + "loss": 3.2919702529907227, + "step": 2610, + "token_acc": 0.2675626432002836 + }, + { + "epoch": 1.5306361770741717, + "grad_norm": 0.4069045520187358, + "learning_rate": 0.0004990389118967542, + "loss": 3.269207715988159, + "step": 2611, + "token_acc": 0.27138172380029735 + }, + { + "epoch": 1.5312225153913808, + "grad_norm": 0.36361656979757545, + "learning_rate": 0.0004990367881373729, + "loss": 3.267333745956421, + "step": 2612, + "token_acc": 0.2701189304329584 + }, + { + "epoch": 1.5318088537085899, + "grad_norm": 0.34883269174316006, + "learning_rate": 0.0004990346620386265, + "loss": 3.2118911743164062, + "step": 2613, + "token_acc": 0.27726341705631646 + }, + { + "epoch": 1.532395192025799, + "grad_norm": 0.3691702258654956, + "learning_rate": 0.0004990325336005351, + "loss": 3.2957334518432617, + "step": 2614, + "token_acc": 0.2689772037284975 + }, + { + "epoch": 1.5329815303430079, + "grad_norm": 0.5018513952070723, + "learning_rate": 0.0004990304028231185, + "loss": 3.213566780090332, + "step": 2615, + "token_acc": 0.2770729532700477 + }, + { + "epoch": 1.533567868660217, + "grad_norm": 0.5738740323802886, + "learning_rate": 0.000499028269706397, + "loss": 3.246070146560669, + "step": 2616, + "token_acc": 0.2735984004509367 + }, + { + "epoch": 1.5341542069774259, + "grad_norm": 0.49099425737924635, + "learning_rate": 0.0004990261342503904, + "loss": 3.2986602783203125, + "step": 2617, + "token_acc": 0.2683126997153783 + }, + { + "epoch": 1.534740545294635, + "grad_norm": 0.42632125699657863, + "learning_rate": 0.0004990239964551189, + "loss": 3.301894426345825, + "step": 2618, + "token_acc": 0.26685611879160265 + }, + { + "epoch": 1.535326883611844, + "grad_norm": 0.41861919970031386, + "learning_rate": 0.0004990218563206024, + "loss": 3.2553257942199707, + "step": 2619, + "token_acc": 0.2733750003264756 + }, + { + "epoch": 1.5359132219290532, + "grad_norm": 0.4023218351430975, + "learning_rate": 0.0004990197138468611, + "loss": 3.239457130432129, + "step": 2620, + "token_acc": 0.2743222015433615 + }, + { + "epoch": 1.536499560246262, + "grad_norm": 0.41425083272992663, + "learning_rate": 0.0004990175690339153, + "loss": 3.2661590576171875, + "step": 2621, + "token_acc": 0.2712879760376551 + }, + { + "epoch": 1.537085898563471, + "grad_norm": 0.38063663540170106, + "learning_rate": 0.0004990154218817848, + "loss": 3.2548868656158447, + "step": 2622, + "token_acc": 0.2719864788389261 + }, + { + "epoch": 1.53767223688068, + "grad_norm": 0.4004431423583892, + "learning_rate": 0.0004990132723904901, + "loss": 3.282156229019165, + "step": 2623, + "token_acc": 0.27060004344341493 + }, + { + "epoch": 1.5382585751978892, + "grad_norm": 0.4125367425518996, + "learning_rate": 0.0004990111205600513, + "loss": 3.320084571838379, + "step": 2624, + "token_acc": 0.2650111486539911 + }, + { + "epoch": 1.5388449135150983, + "grad_norm": 0.31626041136147365, + "learning_rate": 0.0004990089663904885, + "loss": 3.254483938217163, + "step": 2625, + "token_acc": 0.2709560046683319 + }, + { + "epoch": 1.5394312518323072, + "grad_norm": 0.42809743032355846, + "learning_rate": 0.0004990068098818221, + "loss": 3.2869207859039307, + "step": 2626, + "token_acc": 0.2681024355436903 + }, + { + "epoch": 1.5400175901495161, + "grad_norm": 0.3893687379457032, + "learning_rate": 0.0004990046510340721, + "loss": 3.240131139755249, + "step": 2627, + "token_acc": 0.2725396923457095 + }, + { + "epoch": 1.5406039284667252, + "grad_norm": 0.42652289423361567, + "learning_rate": 0.0004990024898472589, + "loss": 3.2188005447387695, + "step": 2628, + "token_acc": 0.27687323768171684 + }, + { + "epoch": 1.5411902667839343, + "grad_norm": 0.4646790209668725, + "learning_rate": 0.000499000326321403, + "loss": 3.2517547607421875, + "step": 2629, + "token_acc": 0.27302447524499224 + }, + { + "epoch": 1.5417766051011434, + "grad_norm": 0.3876984049842635, + "learning_rate": 0.0004989981604565245, + "loss": 3.2105846405029297, + "step": 2630, + "token_acc": 0.2779264921080737 + }, + { + "epoch": 1.5423629434183523, + "grad_norm": 0.3619991010331251, + "learning_rate": 0.0004989959922526439, + "loss": 3.2419073581695557, + "step": 2631, + "token_acc": 0.27580839208071256 + }, + { + "epoch": 1.5429492817355615, + "grad_norm": 0.3797860874387506, + "learning_rate": 0.0004989938217097814, + "loss": 3.2080209255218506, + "step": 2632, + "token_acc": 0.279506476037134 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.34234491666687183, + "learning_rate": 0.0004989916488279575, + "loss": 3.2258660793304443, + "step": 2633, + "token_acc": 0.27551688777455047 + }, + { + "epoch": 1.5441219583699795, + "grad_norm": 0.37945457042923514, + "learning_rate": 0.0004989894736071924, + "loss": 3.2722809314727783, + "step": 2634, + "token_acc": 0.2710098930110495 + }, + { + "epoch": 1.5447082966871886, + "grad_norm": 0.46145950945105835, + "learning_rate": 0.0004989872960475069, + "loss": 3.2760467529296875, + "step": 2635, + "token_acc": 0.27053734844731175 + }, + { + "epoch": 1.5452946350043977, + "grad_norm": 0.4835128210450808, + "learning_rate": 0.0004989851161489213, + "loss": 3.214796304702759, + "step": 2636, + "token_acc": 0.27867832096539663 + }, + { + "epoch": 1.5458809733216066, + "grad_norm": 0.42130780625521774, + "learning_rate": 0.000498982933911456, + "loss": 3.2687814235687256, + "step": 2637, + "token_acc": 0.2706117797914617 + }, + { + "epoch": 1.5464673116388155, + "grad_norm": 0.4222852347317549, + "learning_rate": 0.0004989807493351315, + "loss": 3.2881879806518555, + "step": 2638, + "token_acc": 0.2678677588556217 + }, + { + "epoch": 1.5470536499560246, + "grad_norm": 0.3856519286332186, + "learning_rate": 0.0004989785624199684, + "loss": 3.2356760501861572, + "step": 2639, + "token_acc": 0.2743609357908349 + }, + { + "epoch": 1.5476399882732337, + "grad_norm": 0.37731430873539445, + "learning_rate": 0.0004989763731659872, + "loss": 3.2439823150634766, + "step": 2640, + "token_acc": 0.27453879995563607 + }, + { + "epoch": 1.5482263265904428, + "grad_norm": 0.3596363141636464, + "learning_rate": 0.0004989741815732085, + "loss": 3.3185830116271973, + "step": 2641, + "token_acc": 0.2656785517042737 + }, + { + "epoch": 1.5488126649076517, + "grad_norm": 0.3487001734746019, + "learning_rate": 0.0004989719876416529, + "loss": 3.2326765060424805, + "step": 2642, + "token_acc": 0.275534034491769 + }, + { + "epoch": 1.5493990032248608, + "grad_norm": 0.3300309678475686, + "learning_rate": 0.0004989697913713409, + "loss": 3.257556676864624, + "step": 2643, + "token_acc": 0.27281272866138623 + }, + { + "epoch": 1.5499853415420697, + "grad_norm": 0.3838529287168988, + "learning_rate": 0.0004989675927622931, + "loss": 3.249099016189575, + "step": 2644, + "token_acc": 0.273306884335702 + }, + { + "epoch": 1.5505716798592788, + "grad_norm": 0.4143647705928201, + "learning_rate": 0.0004989653918145305, + "loss": 3.2324624061584473, + "step": 2645, + "token_acc": 0.27393156542676983 + }, + { + "epoch": 1.551158018176488, + "grad_norm": 0.49725611499121625, + "learning_rate": 0.0004989631885280733, + "loss": 3.285338878631592, + "step": 2646, + "token_acc": 0.2691531522135892 + }, + { + "epoch": 1.551744356493697, + "grad_norm": 0.4105517857631105, + "learning_rate": 0.0004989609829029425, + "loss": 3.234142303466797, + "step": 2647, + "token_acc": 0.2753726028725965 + }, + { + "epoch": 1.552330694810906, + "grad_norm": 0.39416348603598894, + "learning_rate": 0.0004989587749391587, + "loss": 3.231597661972046, + "step": 2648, + "token_acc": 0.27749008038784956 + }, + { + "epoch": 1.5529170331281148, + "grad_norm": 0.36722345168408743, + "learning_rate": 0.0004989565646367429, + "loss": 3.2738986015319824, + "step": 2649, + "token_acc": 0.2704959227015547 + }, + { + "epoch": 1.553503371445324, + "grad_norm": 0.354552313381656, + "learning_rate": 0.0004989543519957155, + "loss": 3.2706987857818604, + "step": 2650, + "token_acc": 0.2703417754396329 + }, + { + "epoch": 1.554089709762533, + "grad_norm": 0.40294080366790597, + "learning_rate": 0.0004989521370160974, + "loss": 3.3017120361328125, + "step": 2651, + "token_acc": 0.2673277202602611 + }, + { + "epoch": 1.5546760480797421, + "grad_norm": 0.4025411834888025, + "learning_rate": 0.0004989499196979095, + "loss": 3.239588499069214, + "step": 2652, + "token_acc": 0.27497293263115846 + }, + { + "epoch": 1.555262386396951, + "grad_norm": 0.3589102076574963, + "learning_rate": 0.0004989477000411725, + "loss": 3.27361798286438, + "step": 2653, + "token_acc": 0.2719704171062665 + }, + { + "epoch": 1.55584872471416, + "grad_norm": 0.36026880012163814, + "learning_rate": 0.0004989454780459073, + "loss": 3.2707529067993164, + "step": 2654, + "token_acc": 0.2706217817986251 + }, + { + "epoch": 1.556435063031369, + "grad_norm": 0.3769613499464902, + "learning_rate": 0.0004989432537121349, + "loss": 3.3027758598327637, + "step": 2655, + "token_acc": 0.26701522582039744 + }, + { + "epoch": 1.5570214013485781, + "grad_norm": 0.342672851494411, + "learning_rate": 0.000498941027039876, + "loss": 3.2814202308654785, + "step": 2656, + "token_acc": 0.2692951669510899 + }, + { + "epoch": 1.5576077396657872, + "grad_norm": 0.3574098534377979, + "learning_rate": 0.0004989387980291516, + "loss": 3.284370183944702, + "step": 2657, + "token_acc": 0.2699602483481128 + }, + { + "epoch": 1.5581940779829961, + "grad_norm": 0.3514926295971717, + "learning_rate": 0.0004989365666799827, + "loss": 3.252963066101074, + "step": 2658, + "token_acc": 0.2714648020371283 + }, + { + "epoch": 1.5587804163002053, + "grad_norm": 0.3417768728615734, + "learning_rate": 0.0004989343329923902, + "loss": 3.240387439727783, + "step": 2659, + "token_acc": 0.2740051459534683 + }, + { + "epoch": 1.5593667546174141, + "grad_norm": 0.3844651396566197, + "learning_rate": 0.0004989320969663951, + "loss": 3.254425525665283, + "step": 2660, + "token_acc": 0.27187097959933837 + }, + { + "epoch": 1.5599530929346233, + "grad_norm": 0.3846254546361974, + "learning_rate": 0.0004989298586020183, + "loss": 3.2569668292999268, + "step": 2661, + "token_acc": 0.2706875966362179 + }, + { + "epoch": 1.5605394312518324, + "grad_norm": 0.37931114930598936, + "learning_rate": 0.000498927617899281, + "loss": 3.238466262817383, + "step": 2662, + "token_acc": 0.27485740144387616 + }, + { + "epoch": 1.5611257695690415, + "grad_norm": 0.4426821670641154, + "learning_rate": 0.0004989253748582042, + "loss": 3.2260851860046387, + "step": 2663, + "token_acc": 0.2744047588311534 + }, + { + "epoch": 1.5617121078862504, + "grad_norm": 0.5233817670180233, + "learning_rate": 0.0004989231294788088, + "loss": 3.240323305130005, + "step": 2664, + "token_acc": 0.27272104719850054 + }, + { + "epoch": 1.5622984462034593, + "grad_norm": 0.4995519929215957, + "learning_rate": 0.0004989208817611162, + "loss": 3.2973456382751465, + "step": 2665, + "token_acc": 0.2683416875085024 + }, + { + "epoch": 1.5628847845206684, + "grad_norm": 0.42641943666751, + "learning_rate": 0.0004989186317051472, + "loss": 3.241018295288086, + "step": 2666, + "token_acc": 0.2734630072488855 + }, + { + "epoch": 1.5634711228378775, + "grad_norm": 0.39996113411284107, + "learning_rate": 0.0004989163793109231, + "loss": 3.2816903591156006, + "step": 2667, + "token_acc": 0.2705359529357132 + }, + { + "epoch": 1.5640574611550866, + "grad_norm": 0.5530718271321012, + "learning_rate": 0.0004989141245784651, + "loss": 3.33862566947937, + "step": 2668, + "token_acc": 0.26283321746168453 + }, + { + "epoch": 1.5646437994722955, + "grad_norm": 0.5325390424675902, + "learning_rate": 0.0004989118675077942, + "loss": 3.280628204345703, + "step": 2669, + "token_acc": 0.2695132902781736 + }, + { + "epoch": 1.5652301377895046, + "grad_norm": 0.3847569319623661, + "learning_rate": 0.0004989096080989318, + "loss": 3.2433745861053467, + "step": 2670, + "token_acc": 0.2722702003980999 + }, + { + "epoch": 1.5658164761067135, + "grad_norm": 0.4375873969562279, + "learning_rate": 0.000498907346351899, + "loss": 3.3132450580596924, + "step": 2671, + "token_acc": 0.26583004508834446 + }, + { + "epoch": 1.5664028144239226, + "grad_norm": 0.45026986045211215, + "learning_rate": 0.0004989050822667172, + "loss": 3.226424217224121, + "step": 2672, + "token_acc": 0.2761323136224173 + }, + { + "epoch": 1.5669891527411317, + "grad_norm": 0.39761124993926966, + "learning_rate": 0.0004989028158434074, + "loss": 3.2660775184631348, + "step": 2673, + "token_acc": 0.2715718426388341 + }, + { + "epoch": 1.5675754910583408, + "grad_norm": 0.4366372416828852, + "learning_rate": 0.0004989005470819912, + "loss": 3.227046012878418, + "step": 2674, + "token_acc": 0.27720152228072237 + }, + { + "epoch": 1.5681618293755497, + "grad_norm": 0.39021958849509253, + "learning_rate": 0.0004988982759824896, + "loss": 3.2357258796691895, + "step": 2675, + "token_acc": 0.2747005064381357 + }, + { + "epoch": 1.5687481676927586, + "grad_norm": 0.3262171587001197, + "learning_rate": 0.0004988960025449242, + "loss": 3.249258041381836, + "step": 2676, + "token_acc": 0.27259877481365447 + }, + { + "epoch": 1.5693345060099677, + "grad_norm": 0.34448535523871937, + "learning_rate": 0.0004988937267693162, + "loss": 3.2756948471069336, + "step": 2677, + "token_acc": 0.2692577021559959 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.37032248885205776, + "learning_rate": 0.000498891448655687, + "loss": 3.2353405952453613, + "step": 2678, + "token_acc": 0.27351220045178387 + }, + { + "epoch": 1.570507182644386, + "grad_norm": 0.35453190717742655, + "learning_rate": 0.0004988891682040581, + "loss": 3.2334859371185303, + "step": 2679, + "token_acc": 0.2767216447768983 + }, + { + "epoch": 1.5710935209615948, + "grad_norm": 0.318483802055312, + "learning_rate": 0.0004988868854144508, + "loss": 3.2155065536499023, + "step": 2680, + "token_acc": 0.2768662561793556 + }, + { + "epoch": 1.5716798592788037, + "grad_norm": 0.32327002418678236, + "learning_rate": 0.0004988846002868866, + "loss": 3.2319586277008057, + "step": 2681, + "token_acc": 0.2771636545383973 + }, + { + "epoch": 1.5722661975960128, + "grad_norm": 0.3441250186577668, + "learning_rate": 0.0004988823128213869, + "loss": 3.2407686710357666, + "step": 2682, + "token_acc": 0.27392532821678733 + }, + { + "epoch": 1.572852535913222, + "grad_norm": 0.3601102533657783, + "learning_rate": 0.0004988800230179733, + "loss": 3.2567479610443115, + "step": 2683, + "token_acc": 0.27164362550407833 + }, + { + "epoch": 1.573438874230431, + "grad_norm": 0.33662251526197795, + "learning_rate": 0.0004988777308766673, + "loss": 3.2499845027923584, + "step": 2684, + "token_acc": 0.27264931329736003 + }, + { + "epoch": 1.57402521254764, + "grad_norm": 0.3247345631168005, + "learning_rate": 0.0004988754363974904, + "loss": 3.2635385990142822, + "step": 2685, + "token_acc": 0.2718904268455264 + }, + { + "epoch": 1.574611550864849, + "grad_norm": 0.3777905980359913, + "learning_rate": 0.0004988731395804641, + "loss": 3.2611401081085205, + "step": 2686, + "token_acc": 0.27217976724684084 + }, + { + "epoch": 1.575197889182058, + "grad_norm": 0.33663718256597874, + "learning_rate": 0.00049887084042561, + "loss": 3.279417037963867, + "step": 2687, + "token_acc": 0.2686788139760509 + }, + { + "epoch": 1.575784227499267, + "grad_norm": 0.3362316721317457, + "learning_rate": 0.0004988685389329497, + "loss": 3.1999971866607666, + "step": 2688, + "token_acc": 0.2798969002604483 + }, + { + "epoch": 1.5763705658164762, + "grad_norm": 0.3399648099293411, + "learning_rate": 0.0004988662351025048, + "loss": 3.2987563610076904, + "step": 2689, + "token_acc": 0.26550862017274235 + }, + { + "epoch": 1.5769569041336853, + "grad_norm": 0.422309117473261, + "learning_rate": 0.000498863928934297, + "loss": 3.259262800216675, + "step": 2690, + "token_acc": 0.27012342520943916 + }, + { + "epoch": 1.5775432424508942, + "grad_norm": 0.3905374891803225, + "learning_rate": 0.0004988616204283479, + "loss": 3.2702717781066895, + "step": 2691, + "token_acc": 0.27083267394951027 + }, + { + "epoch": 1.578129580768103, + "grad_norm": 0.3503140145756829, + "learning_rate": 0.0004988593095846793, + "loss": 3.215207815170288, + "step": 2692, + "token_acc": 0.276515849039701 + }, + { + "epoch": 1.5787159190853122, + "grad_norm": 0.377567862143921, + "learning_rate": 0.0004988569964033128, + "loss": 3.2278919219970703, + "step": 2693, + "token_acc": 0.27492802039028946 + }, + { + "epoch": 1.5793022574025213, + "grad_norm": 0.4117813332740536, + "learning_rate": 0.0004988546808842703, + "loss": 3.2277774810791016, + "step": 2694, + "token_acc": 0.2758324741994112 + }, + { + "epoch": 1.5798885957197304, + "grad_norm": 0.3942180648872112, + "learning_rate": 0.0004988523630275731, + "loss": 3.2668137550354004, + "step": 2695, + "token_acc": 0.2705968817260749 + }, + { + "epoch": 1.5804749340369393, + "grad_norm": 0.31324215549712725, + "learning_rate": 0.0004988500428332435, + "loss": 3.226727247238159, + "step": 2696, + "token_acc": 0.2740891668032819 + }, + { + "epoch": 1.5810612723541484, + "grad_norm": 0.38645085509564414, + "learning_rate": 0.0004988477203013029, + "loss": 3.2715234756469727, + "step": 2697, + "token_acc": 0.26994020646340916 + }, + { + "epoch": 1.5816476106713573, + "grad_norm": 0.4015884618559776, + "learning_rate": 0.0004988453954317735, + "loss": 3.248443126678467, + "step": 2698, + "token_acc": 0.27439067448138754 + }, + { + "epoch": 1.5822339489885664, + "grad_norm": 0.4374823655905406, + "learning_rate": 0.0004988430682246769, + "loss": 3.2270560264587402, + "step": 2699, + "token_acc": 0.27576205715656893 + }, + { + "epoch": 1.5828202873057755, + "grad_norm": 0.4099083312504755, + "learning_rate": 0.000498840738680035, + "loss": 3.27923846244812, + "step": 2700, + "token_acc": 0.26916692589407315 + }, + { + "epoch": 1.5834066256229846, + "grad_norm": 0.35450487951341336, + "learning_rate": 0.0004988384067978695, + "loss": 3.264312505722046, + "step": 2701, + "token_acc": 0.27171067775107194 + }, + { + "epoch": 1.5839929639401935, + "grad_norm": 0.3603862045876232, + "learning_rate": 0.0004988360725782027, + "loss": 3.249986171722412, + "step": 2702, + "token_acc": 0.27329845143373055 + }, + { + "epoch": 1.5845793022574024, + "grad_norm": 0.38290899483458135, + "learning_rate": 0.0004988337360210562, + "loss": 3.2628514766693115, + "step": 2703, + "token_acc": 0.2723530338529093 + }, + { + "epoch": 1.5851656405746115, + "grad_norm": 0.34659374894191103, + "learning_rate": 0.000498831397126452, + "loss": 3.2192413806915283, + "step": 2704, + "token_acc": 0.2758640701247975 + }, + { + "epoch": 1.5857519788918206, + "grad_norm": 0.3868393656960508, + "learning_rate": 0.0004988290558944123, + "loss": 3.2475297451019287, + "step": 2705, + "token_acc": 0.27241009689148576 + }, + { + "epoch": 1.5863383172090297, + "grad_norm": 0.38793105455210936, + "learning_rate": 0.0004988267123249588, + "loss": 3.228393316268921, + "step": 2706, + "token_acc": 0.2742931815943769 + }, + { + "epoch": 1.5869246555262386, + "grad_norm": 0.40405986606024225, + "learning_rate": 0.0004988243664181137, + "loss": 3.2677111625671387, + "step": 2707, + "token_acc": 0.27193307879780915 + }, + { + "epoch": 1.5875109938434475, + "grad_norm": 0.4490691144683981, + "learning_rate": 0.0004988220181738988, + "loss": 3.2014970779418945, + "step": 2708, + "token_acc": 0.2791681670867843 + }, + { + "epoch": 1.5880973321606566, + "grad_norm": 0.37508884935344244, + "learning_rate": 0.0004988196675923365, + "loss": 3.2355289459228516, + "step": 2709, + "token_acc": 0.2740727508675516 + }, + { + "epoch": 1.5886836704778657, + "grad_norm": 0.3778067038215156, + "learning_rate": 0.0004988173146734487, + "loss": 3.240598678588867, + "step": 2710, + "token_acc": 0.27501648344255014 + }, + { + "epoch": 1.5892700087950749, + "grad_norm": 0.41313150686608957, + "learning_rate": 0.0004988149594172574, + "loss": 3.2623720169067383, + "step": 2711, + "token_acc": 0.2718314091170674 + }, + { + "epoch": 1.5898563471122837, + "grad_norm": 0.3556962425691541, + "learning_rate": 0.000498812601823785, + "loss": 3.250446319580078, + "step": 2712, + "token_acc": 0.27298510394994685 + }, + { + "epoch": 1.5904426854294929, + "grad_norm": 0.3608077178922655, + "learning_rate": 0.0004988102418930534, + "loss": 3.193519115447998, + "step": 2713, + "token_acc": 0.27913297807178733 + }, + { + "epoch": 1.5910290237467017, + "grad_norm": 0.3297838350918935, + "learning_rate": 0.0004988078796250849, + "loss": 3.2485578060150146, + "step": 2714, + "token_acc": 0.27300142067209937 + }, + { + "epoch": 1.5916153620639109, + "grad_norm": 0.411991775655341, + "learning_rate": 0.0004988055150199016, + "loss": 3.302521228790283, + "step": 2715, + "token_acc": 0.26687487740446464 + }, + { + "epoch": 1.59220170038112, + "grad_norm": 0.43107094714592736, + "learning_rate": 0.0004988031480775257, + "loss": 3.2885003089904785, + "step": 2716, + "token_acc": 0.26872974385781495 + }, + { + "epoch": 1.592788038698329, + "grad_norm": 0.45608371945595877, + "learning_rate": 0.0004988007787979795, + "loss": 3.2895407676696777, + "step": 2717, + "token_acc": 0.2688575466809904 + }, + { + "epoch": 1.593374377015538, + "grad_norm": 0.4670488762531189, + "learning_rate": 0.0004987984071812854, + "loss": 3.234917640686035, + "step": 2718, + "token_acc": 0.27583820488186156 + }, + { + "epoch": 1.5939607153327469, + "grad_norm": 0.45755183631185503, + "learning_rate": 0.0004987960332274654, + "loss": 3.232421398162842, + "step": 2719, + "token_acc": 0.2742818600222187 + }, + { + "epoch": 1.594547053649956, + "grad_norm": 0.4227019373376549, + "learning_rate": 0.000498793656936542, + "loss": 3.2531185150146484, + "step": 2720, + "token_acc": 0.2722035918647155 + }, + { + "epoch": 1.595133391967165, + "grad_norm": 0.4215712774910508, + "learning_rate": 0.0004987912783085374, + "loss": 3.2416720390319824, + "step": 2721, + "token_acc": 0.2717643540053012 + }, + { + "epoch": 1.5957197302843742, + "grad_norm": 0.3689758269420926, + "learning_rate": 0.000498788897343474, + "loss": 3.2418856620788574, + "step": 2722, + "token_acc": 0.27507259976920245 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.39639354177563874, + "learning_rate": 0.0004987865140413741, + "loss": 3.255746603012085, + "step": 2723, + "token_acc": 0.2711629428472031 + }, + { + "epoch": 1.5968924069187922, + "grad_norm": 0.38263030782121593, + "learning_rate": 0.0004987841284022601, + "loss": 3.267214775085449, + "step": 2724, + "token_acc": 0.26980066605531156 + }, + { + "epoch": 1.597478745236001, + "grad_norm": 0.36578567798962536, + "learning_rate": 0.0004987817404261546, + "loss": 3.220471143722534, + "step": 2725, + "token_acc": 0.2765835089783396 + }, + { + "epoch": 1.5980650835532102, + "grad_norm": 0.3485440905052892, + "learning_rate": 0.0004987793501130799, + "loss": 3.2583389282226562, + "step": 2726, + "token_acc": 0.27033725767902983 + }, + { + "epoch": 1.5986514218704193, + "grad_norm": 0.35846008610783775, + "learning_rate": 0.0004987769574630583, + "loss": 3.2682206630706787, + "step": 2727, + "token_acc": 0.27180805795416807 + }, + { + "epoch": 1.5992377601876284, + "grad_norm": 0.36281373255912586, + "learning_rate": 0.0004987745624761126, + "loss": 3.234234094619751, + "step": 2728, + "token_acc": 0.27501035558068104 + }, + { + "epoch": 1.5998240985048373, + "grad_norm": 0.3523650325238099, + "learning_rate": 0.0004987721651522649, + "loss": 3.348665714263916, + "step": 2729, + "token_acc": 0.25943302682742536 + }, + { + "epoch": 1.6004104368220462, + "grad_norm": 0.3648963457155009, + "learning_rate": 0.000498769765491538, + "loss": 3.20805287361145, + "step": 2730, + "token_acc": 0.2779875951854581 + }, + { + "epoch": 1.6009967751392553, + "grad_norm": 0.377782425628979, + "learning_rate": 0.0004987673634939544, + "loss": 3.2456960678100586, + "step": 2731, + "token_acc": 0.2727531354251376 + }, + { + "epoch": 1.6015831134564644, + "grad_norm": 0.45546514642939634, + "learning_rate": 0.0004987649591595367, + "loss": 3.280743360519409, + "step": 2732, + "token_acc": 0.26956021593291946 + }, + { + "epoch": 1.6021694517736735, + "grad_norm": 0.35707631745038704, + "learning_rate": 0.0004987625524883074, + "loss": 3.235963821411133, + "step": 2733, + "token_acc": 0.2742844321606503 + }, + { + "epoch": 1.6027557900908824, + "grad_norm": 0.3895146299921404, + "learning_rate": 0.0004987601434802891, + "loss": 3.269639492034912, + "step": 2734, + "token_acc": 0.27049018039007217 + }, + { + "epoch": 1.6033421284080913, + "grad_norm": 0.4028401675283581, + "learning_rate": 0.0004987577321355044, + "loss": 3.288886547088623, + "step": 2735, + "token_acc": 0.2656581424415137 + }, + { + "epoch": 1.6039284667253004, + "grad_norm": 0.42007054227776724, + "learning_rate": 0.0004987553184539761, + "loss": 3.2761054039001465, + "step": 2736, + "token_acc": 0.2700375653310105 + }, + { + "epoch": 1.6045148050425095, + "grad_norm": 0.37746187433885026, + "learning_rate": 0.0004987529024357267, + "loss": 3.3051414489746094, + "step": 2737, + "token_acc": 0.2657674825487854 + }, + { + "epoch": 1.6051011433597187, + "grad_norm": 0.3651236854749246, + "learning_rate": 0.0004987504840807791, + "loss": 3.233194589614868, + "step": 2738, + "token_acc": 0.27516466075915885 + }, + { + "epoch": 1.6056874816769275, + "grad_norm": 0.3821426670517067, + "learning_rate": 0.0004987480633891558, + "loss": 3.2217910289764404, + "step": 2739, + "token_acc": 0.2749110320284697 + }, + { + "epoch": 1.6062738199941367, + "grad_norm": 0.49580605035281067, + "learning_rate": 0.0004987456403608796, + "loss": 3.2270307540893555, + "step": 2740, + "token_acc": 0.274558846799757 + }, + { + "epoch": 1.6068601583113455, + "grad_norm": 0.4330358816269615, + "learning_rate": 0.0004987432149959734, + "loss": 3.235473155975342, + "step": 2741, + "token_acc": 0.2763001323584215 + }, + { + "epoch": 1.6074464966285547, + "grad_norm": 0.347032543621862, + "learning_rate": 0.0004987407872944599, + "loss": 3.2172203063964844, + "step": 2742, + "token_acc": 0.27568816703393334 + }, + { + "epoch": 1.6080328349457638, + "grad_norm": 0.31771705082755164, + "learning_rate": 0.0004987383572563618, + "loss": 3.2275846004486084, + "step": 2743, + "token_acc": 0.27543459854124636 + }, + { + "epoch": 1.6086191732629729, + "grad_norm": 0.35758684373540345, + "learning_rate": 0.0004987359248817021, + "loss": 3.177854299545288, + "step": 2744, + "token_acc": 0.2824852786943278 + }, + { + "epoch": 1.6092055115801818, + "grad_norm": 0.35291716826327824, + "learning_rate": 0.0004987334901705035, + "loss": 3.2149596214294434, + "step": 2745, + "token_acc": 0.2776302978572342 + }, + { + "epoch": 1.6097918498973907, + "grad_norm": 0.3538669298207957, + "learning_rate": 0.000498731053122789, + "loss": 3.253016471862793, + "step": 2746, + "token_acc": 0.27146569784401603 + }, + { + "epoch": 1.6103781882145998, + "grad_norm": 0.3451118309842075, + "learning_rate": 0.0004987286137385813, + "loss": 3.2773189544677734, + "step": 2747, + "token_acc": 0.26837553141363346 + }, + { + "epoch": 1.6109645265318089, + "grad_norm": 0.3179749163014257, + "learning_rate": 0.0004987261720179035, + "loss": 3.2556238174438477, + "step": 2748, + "token_acc": 0.27126350917934017 + }, + { + "epoch": 1.611550864849018, + "grad_norm": 0.32816561709366504, + "learning_rate": 0.0004987237279607784, + "loss": 3.279360771179199, + "step": 2749, + "token_acc": 0.2685902832967442 + }, + { + "epoch": 1.6121372031662269, + "grad_norm": 0.32513850302848574, + "learning_rate": 0.0004987212815672292, + "loss": 3.2462167739868164, + "step": 2750, + "token_acc": 0.27311525000713455 + }, + { + "epoch": 1.612723541483436, + "grad_norm": 0.3411678929209327, + "learning_rate": 0.0004987188328372787, + "loss": 3.206583261489868, + "step": 2751, + "token_acc": 0.27804876753504354 + }, + { + "epoch": 1.6133098798006449, + "grad_norm": 0.3983071952813173, + "learning_rate": 0.0004987163817709498, + "loss": 3.218691349029541, + "step": 2752, + "token_acc": 0.2780238851862712 + }, + { + "epoch": 1.613896218117854, + "grad_norm": 0.3862351281946069, + "learning_rate": 0.0004987139283682656, + "loss": 3.2317378520965576, + "step": 2753, + "token_acc": 0.2751936244543791 + }, + { + "epoch": 1.614482556435063, + "grad_norm": 0.348458635267046, + "learning_rate": 0.0004987114726292494, + "loss": 3.221101760864258, + "step": 2754, + "token_acc": 0.27583929116207506 + }, + { + "epoch": 1.6150688947522722, + "grad_norm": 0.3118101786582252, + "learning_rate": 0.0004987090145539239, + "loss": 3.2600440979003906, + "step": 2755, + "token_acc": 0.27148637727068087 + }, + { + "epoch": 1.6156552330694811, + "grad_norm": 0.4016303275087847, + "learning_rate": 0.0004987065541423125, + "loss": 3.255066394805908, + "step": 2756, + "token_acc": 0.27424011040025165 + }, + { + "epoch": 1.61624157138669, + "grad_norm": 0.44224944029872704, + "learning_rate": 0.0004987040913944379, + "loss": 3.2731754779815674, + "step": 2757, + "token_acc": 0.2719338951054114 + }, + { + "epoch": 1.6168279097038991, + "grad_norm": 0.4151930126060737, + "learning_rate": 0.0004987016263103237, + "loss": 3.2479019165039062, + "step": 2758, + "token_acc": 0.2716210286573963 + }, + { + "epoch": 1.6174142480211082, + "grad_norm": 0.382939023349043, + "learning_rate": 0.0004986991588899929, + "loss": 3.286262035369873, + "step": 2759, + "token_acc": 0.26529028844417196 + }, + { + "epoch": 1.6180005863383173, + "grad_norm": 0.3490235209365785, + "learning_rate": 0.0004986966891334685, + "loss": 3.2657227516174316, + "step": 2760, + "token_acc": 0.2724756398994954 + }, + { + "epoch": 1.6185869246555262, + "grad_norm": 0.3520265759538571, + "learning_rate": 0.0004986942170407737, + "loss": 3.2647459506988525, + "step": 2761, + "token_acc": 0.2707282224129629 + }, + { + "epoch": 1.6191732629727351, + "grad_norm": 0.36733780809683125, + "learning_rate": 0.000498691742611932, + "loss": 3.1752400398254395, + "step": 2762, + "token_acc": 0.2830303266243625 + }, + { + "epoch": 1.6197596012899442, + "grad_norm": 0.41888606951507035, + "learning_rate": 0.0004986892658469666, + "loss": 3.212538003921509, + "step": 2763, + "token_acc": 0.27658080325125506 + }, + { + "epoch": 1.6203459396071533, + "grad_norm": 0.39855172916085413, + "learning_rate": 0.0004986867867459006, + "loss": 3.2517294883728027, + "step": 2764, + "token_acc": 0.2713157017161327 + }, + { + "epoch": 1.6209322779243625, + "grad_norm": 0.41027245762118103, + "learning_rate": 0.0004986843053087573, + "loss": 3.1951522827148438, + "step": 2765, + "token_acc": 0.27912448728309114 + }, + { + "epoch": 1.6215186162415713, + "grad_norm": 0.4367539404422904, + "learning_rate": 0.00049868182153556, + "loss": 3.228646755218506, + "step": 2766, + "token_acc": 0.27499007051904223 + }, + { + "epoch": 1.6221049545587805, + "grad_norm": 0.37799934714026406, + "learning_rate": 0.0004986793354263321, + "loss": 3.238917350769043, + "step": 2767, + "token_acc": 0.2760871295534743 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.3280972892430306, + "learning_rate": 0.000498676846981097, + "loss": 3.210571765899658, + "step": 2768, + "token_acc": 0.2788220950092525 + }, + { + "epoch": 1.6232776311931985, + "grad_norm": 0.40600094055703184, + "learning_rate": 0.0004986743561998781, + "loss": 3.27011775970459, + "step": 2769, + "token_acc": 0.27040760195165886 + }, + { + "epoch": 1.6238639695104076, + "grad_norm": 0.41423823374294383, + "learning_rate": 0.0004986718630826986, + "loss": 3.23716402053833, + "step": 2770, + "token_acc": 0.274798722353706 + }, + { + "epoch": 1.6244503078276167, + "grad_norm": 0.3864477557920523, + "learning_rate": 0.0004986693676295821, + "loss": 3.2373876571655273, + "step": 2771, + "token_acc": 0.27323684067870113 + }, + { + "epoch": 1.6250366461448256, + "grad_norm": 0.3541719735878081, + "learning_rate": 0.000498666869840552, + "loss": 3.240128755569458, + "step": 2772, + "token_acc": 0.27542786247654444 + }, + { + "epoch": 1.6256229844620345, + "grad_norm": 0.3871081603780341, + "learning_rate": 0.0004986643697156317, + "loss": 3.269397497177124, + "step": 2773, + "token_acc": 0.26959298371580914 + }, + { + "epoch": 1.6262093227792436, + "grad_norm": 0.36603598396570886, + "learning_rate": 0.0004986618672548446, + "loss": 3.3137803077697754, + "step": 2774, + "token_acc": 0.26363959837659817 + }, + { + "epoch": 1.6267956610964527, + "grad_norm": 0.36939061875417595, + "learning_rate": 0.0004986593624582145, + "loss": 3.262991428375244, + "step": 2775, + "token_acc": 0.27099446291297113 + }, + { + "epoch": 1.6273819994136618, + "grad_norm": 0.32479646239203375, + "learning_rate": 0.0004986568553257646, + "loss": 3.2764062881469727, + "step": 2776, + "token_acc": 0.2709072985998944 + }, + { + "epoch": 1.6279683377308707, + "grad_norm": 0.32947646142705234, + "learning_rate": 0.0004986543458575188, + "loss": 3.2155604362487793, + "step": 2777, + "token_acc": 0.27678465421921217 + }, + { + "epoch": 1.6285546760480798, + "grad_norm": 0.4162345838071443, + "learning_rate": 0.0004986518340535004, + "loss": 3.2170333862304688, + "step": 2778, + "token_acc": 0.2774165684506852 + }, + { + "epoch": 1.6291410143652887, + "grad_norm": 0.41759363110660364, + "learning_rate": 0.0004986493199137331, + "loss": 3.2335448265075684, + "step": 2779, + "token_acc": 0.2744798825195583 + }, + { + "epoch": 1.6297273526824978, + "grad_norm": 0.397037175693642, + "learning_rate": 0.0004986468034382406, + "loss": 3.2837014198303223, + "step": 2780, + "token_acc": 0.2685570906166738 + }, + { + "epoch": 1.630313690999707, + "grad_norm": 0.355904633245635, + "learning_rate": 0.0004986442846270462, + "loss": 3.2292261123657227, + "step": 2781, + "token_acc": 0.2747922522693035 + }, + { + "epoch": 1.630900029316916, + "grad_norm": 0.3392622183786451, + "learning_rate": 0.000498641763480174, + "loss": 3.243370294570923, + "step": 2782, + "token_acc": 0.27472683999388803 + }, + { + "epoch": 1.631486367634125, + "grad_norm": 0.34813422302345937, + "learning_rate": 0.0004986392399976473, + "loss": 3.199794292449951, + "step": 2783, + "token_acc": 0.2797840826805345 + }, + { + "epoch": 1.6320727059513338, + "grad_norm": 0.35138959495079725, + "learning_rate": 0.0004986367141794902, + "loss": 3.2339978218078613, + "step": 2784, + "token_acc": 0.27494765303401164 + }, + { + "epoch": 1.632659044268543, + "grad_norm": 0.3719351952658503, + "learning_rate": 0.000498634186025726, + "loss": 3.2448782920837402, + "step": 2785, + "token_acc": 0.27485918010954674 + }, + { + "epoch": 1.633245382585752, + "grad_norm": 0.40982051502724537, + "learning_rate": 0.0004986316555363788, + "loss": 3.20831298828125, + "step": 2786, + "token_acc": 0.2756497096163377 + }, + { + "epoch": 1.6338317209029611, + "grad_norm": 0.4679634136903241, + "learning_rate": 0.0004986291227114722, + "loss": 3.227557897567749, + "step": 2787, + "token_acc": 0.276445668895063 + }, + { + "epoch": 1.63441805922017, + "grad_norm": 0.4286460626684321, + "learning_rate": 0.00049862658755103, + "loss": 3.2810144424438477, + "step": 2788, + "token_acc": 0.2693951901613107 + }, + { + "epoch": 1.635004397537379, + "grad_norm": 0.34445129120444495, + "learning_rate": 0.0004986240500550761, + "loss": 3.2818071842193604, + "step": 2789, + "token_acc": 0.269373519689753 + }, + { + "epoch": 1.635590735854588, + "grad_norm": 0.3267231925692841, + "learning_rate": 0.0004986215102236343, + "loss": 3.261897087097168, + "step": 2790, + "token_acc": 0.27020795172933953 + }, + { + "epoch": 1.6361770741717971, + "grad_norm": 0.3392819320475654, + "learning_rate": 0.0004986189680567282, + "loss": 3.2720112800598145, + "step": 2791, + "token_acc": 0.2692128260127343 + }, + { + "epoch": 1.6367634124890063, + "grad_norm": 0.3615260712749753, + "learning_rate": 0.0004986164235543822, + "loss": 3.2113118171691895, + "step": 2792, + "token_acc": 0.27501873744852096 + }, + { + "epoch": 1.6373497508062151, + "grad_norm": 0.3677819809166616, + "learning_rate": 0.0004986138767166197, + "loss": 3.328761577606201, + "step": 2793, + "token_acc": 0.26214651052439786 + }, + { + "epoch": 1.6379360891234243, + "grad_norm": 0.4351661442243823, + "learning_rate": 0.000498611327543465, + "loss": 3.2705330848693848, + "step": 2794, + "token_acc": 0.2708117461631617 + }, + { + "epoch": 1.6385224274406331, + "grad_norm": 0.39804657522399217, + "learning_rate": 0.0004986087760349418, + "loss": 3.2207818031311035, + "step": 2795, + "token_acc": 0.2756710054281475 + }, + { + "epoch": 1.6391087657578423, + "grad_norm": 0.4218043965329263, + "learning_rate": 0.0004986062221910742, + "loss": 3.2590789794921875, + "step": 2796, + "token_acc": 0.27254624435274205 + }, + { + "epoch": 1.6396951040750514, + "grad_norm": 0.4540068057156381, + "learning_rate": 0.0004986036660118861, + "loss": 3.297433853149414, + "step": 2797, + "token_acc": 0.266402219283918 + }, + { + "epoch": 1.6402814423922605, + "grad_norm": 0.4469006842733171, + "learning_rate": 0.0004986011074974016, + "loss": 3.2957658767700195, + "step": 2798, + "token_acc": 0.26661584984184655 + }, + { + "epoch": 1.6408677807094694, + "grad_norm": 0.41208797298991495, + "learning_rate": 0.0004985985466476446, + "loss": 3.2869277000427246, + "step": 2799, + "token_acc": 0.26692779521131194 + }, + { + "epoch": 1.6414541190266783, + "grad_norm": 0.3633945274867081, + "learning_rate": 0.0004985959834626393, + "loss": 3.3038723468780518, + "step": 2800, + "token_acc": 0.2656008985786308 + }, + { + "epoch": 1.6420404573438874, + "grad_norm": 0.3462139546365612, + "learning_rate": 0.0004985934179424097, + "loss": 3.236398696899414, + "step": 2801, + "token_acc": 0.273619067934341 + }, + { + "epoch": 1.6426267956610965, + "grad_norm": 0.3487087292807739, + "learning_rate": 0.00049859085008698, + "loss": 3.1826324462890625, + "step": 2802, + "token_acc": 0.2807146708697997 + }, + { + "epoch": 1.6432131339783056, + "grad_norm": 0.36891589999050367, + "learning_rate": 0.0004985882798963742, + "loss": 3.2357091903686523, + "step": 2803, + "token_acc": 0.2752817395002323 + }, + { + "epoch": 1.6437994722955145, + "grad_norm": 0.3732750470174466, + "learning_rate": 0.0004985857073706165, + "loss": 3.257347583770752, + "step": 2804, + "token_acc": 0.27214495466133143 + }, + { + "epoch": 1.6443858106127234, + "grad_norm": 0.3239709473222964, + "learning_rate": 0.0004985831325097311, + "loss": 3.2477762699127197, + "step": 2805, + "token_acc": 0.27246390782320606 + }, + { + "epoch": 1.6449721489299325, + "grad_norm": 0.35725519351593177, + "learning_rate": 0.000498580555313742, + "loss": 3.2600955963134766, + "step": 2806, + "token_acc": 0.27240646532294416 + }, + { + "epoch": 1.6455584872471416, + "grad_norm": 0.3540916620430091, + "learning_rate": 0.0004985779757826737, + "loss": 3.236013650894165, + "step": 2807, + "token_acc": 0.27411103375364226 + }, + { + "epoch": 1.6461448255643507, + "grad_norm": 0.37591701849603665, + "learning_rate": 0.0004985753939165501, + "loss": 3.245718240737915, + "step": 2808, + "token_acc": 0.27278551349937896 + }, + { + "epoch": 1.6467311638815598, + "grad_norm": 0.39541450209991336, + "learning_rate": 0.0004985728097153958, + "loss": 3.2262916564941406, + "step": 2809, + "token_acc": 0.2763185232124492 + }, + { + "epoch": 1.6473175021987687, + "grad_norm": 0.40034914685238143, + "learning_rate": 0.0004985702231792348, + "loss": 3.2422690391540527, + "step": 2810, + "token_acc": 0.27297902778743655 + }, + { + "epoch": 1.6479038405159776, + "grad_norm": 0.36400651519381383, + "learning_rate": 0.0004985676343080916, + "loss": 3.1679704189300537, + "step": 2811, + "token_acc": 0.2842027605263511 + }, + { + "epoch": 1.6484901788331867, + "grad_norm": 0.2991063437106015, + "learning_rate": 0.0004985650431019904, + "loss": 3.206399440765381, + "step": 2812, + "token_acc": 0.27931457435191465 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.3490048272867048, + "learning_rate": 0.0004985624495609555, + "loss": 3.2703442573547363, + "step": 2813, + "token_acc": 0.2727640631323351 + }, + { + "epoch": 1.649662855467605, + "grad_norm": 0.32726425220451977, + "learning_rate": 0.0004985598536850114, + "loss": 3.2183618545532227, + "step": 2814, + "token_acc": 0.27437523027862126 + }, + { + "epoch": 1.6502491937848138, + "grad_norm": 0.3625428919090589, + "learning_rate": 0.0004985572554741824, + "loss": 3.267106771469116, + "step": 2815, + "token_acc": 0.27121543310952506 + }, + { + "epoch": 1.6508355321020227, + "grad_norm": 0.3722076074199333, + "learning_rate": 0.0004985546549284929, + "loss": 3.2349417209625244, + "step": 2816, + "token_acc": 0.27441717213625916 + }, + { + "epoch": 1.6514218704192318, + "grad_norm": 0.4025791854353653, + "learning_rate": 0.0004985520520479674, + "loss": 3.1869289875030518, + "step": 2817, + "token_acc": 0.28000790019083577 + }, + { + "epoch": 1.652008208736441, + "grad_norm": 0.4031437414739885, + "learning_rate": 0.0004985494468326302, + "loss": 3.2438113689422607, + "step": 2818, + "token_acc": 0.27342399566820086 + }, + { + "epoch": 1.65259454705365, + "grad_norm": 0.3459326240470866, + "learning_rate": 0.0004985468392825059, + "loss": 3.1798017024993896, + "step": 2819, + "token_acc": 0.28222709959958675 + }, + { + "epoch": 1.653180885370859, + "grad_norm": 0.3680654138185959, + "learning_rate": 0.000498544229397619, + "loss": 3.2882046699523926, + "step": 2820, + "token_acc": 0.26705050590576185 + }, + { + "epoch": 1.653767223688068, + "grad_norm": 0.42710970747055693, + "learning_rate": 0.0004985416171779941, + "loss": 3.250947952270508, + "step": 2821, + "token_acc": 0.27014820080492796 + }, + { + "epoch": 1.654353562005277, + "grad_norm": 0.36030137173458404, + "learning_rate": 0.0004985390026236554, + "loss": 3.1986308097839355, + "step": 2822, + "token_acc": 0.27964537168792286 + }, + { + "epoch": 1.654939900322486, + "grad_norm": 0.3300176141682994, + "learning_rate": 0.0004985363857346278, + "loss": 3.215203046798706, + "step": 2823, + "token_acc": 0.27901802559024896 + }, + { + "epoch": 1.6555262386396952, + "grad_norm": 0.38077522645917034, + "learning_rate": 0.0004985337665109358, + "loss": 3.2363369464874268, + "step": 2824, + "token_acc": 0.27359952461573533 + }, + { + "epoch": 1.6561125769569043, + "grad_norm": 0.3954765443524495, + "learning_rate": 0.0004985311449526038, + "loss": 3.253981828689575, + "step": 2825, + "token_acc": 0.27064094382709863 + }, + { + "epoch": 1.6566989152741132, + "grad_norm": 0.36542662594144765, + "learning_rate": 0.0004985285210596567, + "loss": 3.22945499420166, + "step": 2826, + "token_acc": 0.27326493819794817 + }, + { + "epoch": 1.657285253591322, + "grad_norm": 0.37026509008563846, + "learning_rate": 0.0004985258948321191, + "loss": 3.240661859512329, + "step": 2827, + "token_acc": 0.27355614204986856 + }, + { + "epoch": 1.6578715919085312, + "grad_norm": 0.4419391598562821, + "learning_rate": 0.0004985232662700155, + "loss": 3.2287344932556152, + "step": 2828, + "token_acc": 0.27558495220315404 + }, + { + "epoch": 1.6584579302257403, + "grad_norm": 0.48779050943975094, + "learning_rate": 0.0004985206353733708, + "loss": 3.234668493270874, + "step": 2829, + "token_acc": 0.27590292314394704 + }, + { + "epoch": 1.6590442685429494, + "grad_norm": 0.4094397266582616, + "learning_rate": 0.0004985180021422096, + "loss": 3.264362096786499, + "step": 2830, + "token_acc": 0.2688967548144556 + }, + { + "epoch": 1.6596306068601583, + "grad_norm": 0.30673402009883666, + "learning_rate": 0.0004985153665765566, + "loss": 3.1701488494873047, + "step": 2831, + "token_acc": 0.28299677380810134 + }, + { + "epoch": 1.6602169451773672, + "grad_norm": 0.38757061478866, + "learning_rate": 0.0004985127286764366, + "loss": 3.2231578826904297, + "step": 2832, + "token_acc": 0.27622282088275474 + }, + { + "epoch": 1.6608032834945763, + "grad_norm": 0.33669139817449567, + "learning_rate": 0.0004985100884418745, + "loss": 3.231654167175293, + "step": 2833, + "token_acc": 0.2753767747767405 + }, + { + "epoch": 1.6613896218117854, + "grad_norm": 0.3440032753977391, + "learning_rate": 0.0004985074458728948, + "loss": 3.224477767944336, + "step": 2834, + "token_acc": 0.2746972328475345 + }, + { + "epoch": 1.6619759601289945, + "grad_norm": 0.3616120900262278, + "learning_rate": 0.0004985048009695227, + "loss": 3.253448486328125, + "step": 2835, + "token_acc": 0.2716456210077358 + }, + { + "epoch": 1.6625622984462036, + "grad_norm": 0.4099868481129341, + "learning_rate": 0.0004985021537317828, + "loss": 3.2238569259643555, + "step": 2836, + "token_acc": 0.2756548516019131 + }, + { + "epoch": 1.6631486367634125, + "grad_norm": 0.3803184341745219, + "learning_rate": 0.0004984995041596999, + "loss": 3.222700357437134, + "step": 2837, + "token_acc": 0.27487946610154124 + }, + { + "epoch": 1.6637349750806214, + "grad_norm": 0.3184454224279368, + "learning_rate": 0.0004984968522532991, + "loss": 3.2895102500915527, + "step": 2838, + "token_acc": 0.2685154473400007 + }, + { + "epoch": 1.6643213133978305, + "grad_norm": 0.3601900947363005, + "learning_rate": 0.0004984941980126053, + "loss": 3.2527172565460205, + "step": 2839, + "token_acc": 0.2745359943668937 + }, + { + "epoch": 1.6649076517150396, + "grad_norm": 0.45627374355477374, + "learning_rate": 0.0004984915414376433, + "loss": 3.236271381378174, + "step": 2840, + "token_acc": 0.27445176567486873 + }, + { + "epoch": 1.6654939900322487, + "grad_norm": 0.40581273245134936, + "learning_rate": 0.0004984888825284382, + "loss": 3.1966490745544434, + "step": 2841, + "token_acc": 0.2786169680233678 + }, + { + "epoch": 1.6660803283494576, + "grad_norm": 0.4110783404223848, + "learning_rate": 0.0004984862212850148, + "loss": 3.243350028991699, + "step": 2842, + "token_acc": 0.27296050339432204 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5153383702538445, + "learning_rate": 0.0004984835577073981, + "loss": 3.256732940673828, + "step": 2843, + "token_acc": 0.2729319812124157 + }, + { + "epoch": 1.6672530049838756, + "grad_norm": 0.44880704978525005, + "learning_rate": 0.0004984808917956134, + "loss": 3.260125160217285, + "step": 2844, + "token_acc": 0.271819140508182 + }, + { + "epoch": 1.6678393433010847, + "grad_norm": 0.3197319716077354, + "learning_rate": 0.0004984782235496855, + "loss": 3.2926900386810303, + "step": 2845, + "token_acc": 0.26584011908777144 + }, + { + "epoch": 1.6684256816182939, + "grad_norm": 0.3943195370218462, + "learning_rate": 0.0004984755529696394, + "loss": 3.2412266731262207, + "step": 2846, + "token_acc": 0.27476188352687814 + }, + { + "epoch": 1.6690120199355027, + "grad_norm": 0.3860838342042475, + "learning_rate": 0.0004984728800555005, + "loss": 3.2655344009399414, + "step": 2847, + "token_acc": 0.27119640914182447 + }, + { + "epoch": 1.6695983582527119, + "grad_norm": 0.34465659528537185, + "learning_rate": 0.0004984702048072935, + "loss": 3.213186264038086, + "step": 2848, + "token_acc": 0.2785841205256216 + }, + { + "epoch": 1.6701846965699207, + "grad_norm": 0.35769725979815153, + "learning_rate": 0.000498467527225044, + "loss": 3.217963218688965, + "step": 2849, + "token_acc": 0.27643066235131625 + }, + { + "epoch": 1.6707710348871299, + "grad_norm": 0.36363495994466405, + "learning_rate": 0.0004984648473087767, + "loss": 3.209224224090576, + "step": 2850, + "token_acc": 0.2760898827728124 + }, + { + "epoch": 1.671357373204339, + "grad_norm": 0.31830103344088206, + "learning_rate": 0.0004984621650585171, + "loss": 3.242711067199707, + "step": 2851, + "token_acc": 0.2721768417792575 + }, + { + "epoch": 1.671943711521548, + "grad_norm": 0.3471702983054354, + "learning_rate": 0.0004984594804742902, + "loss": 3.3064746856689453, + "step": 2852, + "token_acc": 0.2639758738372062 + }, + { + "epoch": 1.672530049838757, + "grad_norm": 0.34798627988646263, + "learning_rate": 0.0004984567935561212, + "loss": 3.2588958740234375, + "step": 2853, + "token_acc": 0.27207120831281906 + }, + { + "epoch": 1.6731163881559659, + "grad_norm": 0.3607793185234463, + "learning_rate": 0.0004984541043040355, + "loss": 3.2446255683898926, + "step": 2854, + "token_acc": 0.2737564517630509 + }, + { + "epoch": 1.673702726473175, + "grad_norm": 0.32926655624091755, + "learning_rate": 0.0004984514127180583, + "loss": 3.2151713371276855, + "step": 2855, + "token_acc": 0.27725342696058114 + }, + { + "epoch": 1.674289064790384, + "grad_norm": 0.37611181097852886, + "learning_rate": 0.0004984487187982148, + "loss": 3.2627339363098145, + "step": 2856, + "token_acc": 0.2722053333118302 + }, + { + "epoch": 1.6748754031075932, + "grad_norm": 0.40896488647581897, + "learning_rate": 0.0004984460225445304, + "loss": 3.279529333114624, + "step": 2857, + "token_acc": 0.26890280522682836 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.39281409288986535, + "learning_rate": 0.0004984433239570305, + "loss": 3.2456796169281006, + "step": 2858, + "token_acc": 0.27267336915456303 + }, + { + "epoch": 1.676048079742011, + "grad_norm": 0.34413726305781994, + "learning_rate": 0.0004984406230357402, + "loss": 3.2264256477355957, + "step": 2859, + "token_acc": 0.27582265936361117 + }, + { + "epoch": 1.67663441805922, + "grad_norm": 0.3467504554488161, + "learning_rate": 0.0004984379197806851, + "loss": 3.2215394973754883, + "step": 2860, + "token_acc": 0.2748563608605428 + }, + { + "epoch": 1.6772207563764292, + "grad_norm": 0.33984180174820466, + "learning_rate": 0.0004984352141918906, + "loss": 3.214142084121704, + "step": 2861, + "token_acc": 0.2768340173021261 + }, + { + "epoch": 1.6778070946936383, + "grad_norm": 0.3615594616040768, + "learning_rate": 0.0004984325062693819, + "loss": 3.2550578117370605, + "step": 2862, + "token_acc": 0.27170446199161835 + }, + { + "epoch": 1.6783934330108474, + "grad_norm": 0.35220268672586635, + "learning_rate": 0.0004984297960131846, + "loss": 3.2225451469421387, + "step": 2863, + "token_acc": 0.27553557369576925 + }, + { + "epoch": 1.6789797713280563, + "grad_norm": 0.33769575037759203, + "learning_rate": 0.0004984270834233242, + "loss": 3.267636299133301, + "step": 2864, + "token_acc": 0.2687341807786704 + }, + { + "epoch": 1.6795661096452652, + "grad_norm": 0.35845810691304725, + "learning_rate": 0.0004984243684998261, + "loss": 3.2455272674560547, + "step": 2865, + "token_acc": 0.2753273940566806 + }, + { + "epoch": 1.6801524479624743, + "grad_norm": 0.38971212381741577, + "learning_rate": 0.0004984216512427158, + "loss": 3.2231504917144775, + "step": 2866, + "token_acc": 0.27462370305421596 + }, + { + "epoch": 1.6807387862796834, + "grad_norm": 0.39681164654289813, + "learning_rate": 0.0004984189316520188, + "loss": 3.2895255088806152, + "step": 2867, + "token_acc": 0.2676954160060288 + }, + { + "epoch": 1.6813251245968925, + "grad_norm": 0.374844709777505, + "learning_rate": 0.0004984162097277607, + "loss": 3.2543487548828125, + "step": 2868, + "token_acc": 0.27301719430143845 + }, + { + "epoch": 1.6819114629141014, + "grad_norm": 0.29416803244766027, + "learning_rate": 0.0004984134854699671, + "loss": 3.1950864791870117, + "step": 2869, + "token_acc": 0.2782429499955767 + }, + { + "epoch": 1.6824978012313103, + "grad_norm": 0.3436114704202493, + "learning_rate": 0.0004984107588786635, + "loss": 3.2870800495147705, + "step": 2870, + "token_acc": 0.2666457753926907 + }, + { + "epoch": 1.6830841395485194, + "grad_norm": 0.4192258914010484, + "learning_rate": 0.0004984080299538755, + "loss": 3.2338109016418457, + "step": 2871, + "token_acc": 0.27302191221024447 + }, + { + "epoch": 1.6836704778657285, + "grad_norm": 0.4003790402565155, + "learning_rate": 0.0004984052986956289, + "loss": 3.2391676902770996, + "step": 2872, + "token_acc": 0.2728091141692902 + }, + { + "epoch": 1.6842568161829377, + "grad_norm": 0.33582620360864984, + "learning_rate": 0.0004984025651039492, + "loss": 3.2198894023895264, + "step": 2873, + "token_acc": 0.2753551616138871 + }, + { + "epoch": 1.6848431545001465, + "grad_norm": 0.33429753447902155, + "learning_rate": 0.0004983998291788621, + "loss": 3.2407405376434326, + "step": 2874, + "token_acc": 0.2734838889221222 + }, + { + "epoch": 1.6854294928173557, + "grad_norm": 0.35421971099328325, + "learning_rate": 0.0004983970909203934, + "loss": 3.2357354164123535, + "step": 2875, + "token_acc": 0.2748795792433671 + }, + { + "epoch": 1.6860158311345645, + "grad_norm": 0.3741384256079026, + "learning_rate": 0.0004983943503285688, + "loss": 3.2567801475524902, + "step": 2876, + "token_acc": 0.2721441445223035 + }, + { + "epoch": 1.6866021694517737, + "grad_norm": 0.345836872200717, + "learning_rate": 0.0004983916074034139, + "loss": 3.2966365814208984, + "step": 2877, + "token_acc": 0.2660323042888673 + }, + { + "epoch": 1.6871885077689828, + "grad_norm": 0.39063554246123605, + "learning_rate": 0.0004983888621449545, + "loss": 3.21757173538208, + "step": 2878, + "token_acc": 0.27558786163104726 + }, + { + "epoch": 1.6877748460861919, + "grad_norm": 0.3869949530362038, + "learning_rate": 0.0004983861145532166, + "loss": 3.2078969478607178, + "step": 2879, + "token_acc": 0.2780600263619055 + }, + { + "epoch": 1.6883611844034008, + "grad_norm": 0.3434902100940193, + "learning_rate": 0.0004983833646282258, + "loss": 3.2380006313323975, + "step": 2880, + "token_acc": 0.27356789356629063 + }, + { + "epoch": 1.6889475227206097, + "grad_norm": 0.34636998172161854, + "learning_rate": 0.0004983806123700079, + "loss": 3.211334466934204, + "step": 2881, + "token_acc": 0.2796932242270688 + }, + { + "epoch": 1.6895338610378188, + "grad_norm": 0.3428730656032943, + "learning_rate": 0.0004983778577785889, + "loss": 3.261359691619873, + "step": 2882, + "token_acc": 0.269963419803545 + }, + { + "epoch": 1.6901201993550279, + "grad_norm": 0.36185207007541964, + "learning_rate": 0.0004983751008539947, + "loss": 3.2325234413146973, + "step": 2883, + "token_acc": 0.27488725337473763 + }, + { + "epoch": 1.690706537672237, + "grad_norm": 0.3608162432825475, + "learning_rate": 0.000498372341596251, + "loss": 3.224308490753174, + "step": 2884, + "token_acc": 0.27561507073902775 + }, + { + "epoch": 1.6912928759894459, + "grad_norm": 0.35638662663854925, + "learning_rate": 0.0004983695800053839, + "loss": 3.2613353729248047, + "step": 2885, + "token_acc": 0.2692588136643698 + }, + { + "epoch": 1.6918792143066548, + "grad_norm": 0.34155490699521035, + "learning_rate": 0.0004983668160814192, + "loss": 3.2296810150146484, + "step": 2886, + "token_acc": 0.2742266999571903 + }, + { + "epoch": 1.692465552623864, + "grad_norm": 0.30582027308419557, + "learning_rate": 0.0004983640498243831, + "loss": 3.2086734771728516, + "step": 2887, + "token_acc": 0.2779119010037504 + }, + { + "epoch": 1.693051890941073, + "grad_norm": 0.32889832737058994, + "learning_rate": 0.0004983612812343013, + "loss": 3.1901867389678955, + "step": 2888, + "token_acc": 0.28109710358217244 + }, + { + "epoch": 1.6936382292582821, + "grad_norm": 0.33600176577089796, + "learning_rate": 0.0004983585103111999, + "loss": 3.1642379760742188, + "step": 2889, + "token_acc": 0.28245248825745367 + }, + { + "epoch": 1.6942245675754912, + "grad_norm": 0.32020247114454153, + "learning_rate": 0.000498355737055105, + "loss": 3.222043991088867, + "step": 2890, + "token_acc": 0.27524997846105503 + }, + { + "epoch": 1.6948109058927001, + "grad_norm": 0.32750785473876326, + "learning_rate": 0.0004983529614660427, + "loss": 3.2486190795898438, + "step": 2891, + "token_acc": 0.2723835636022016 + }, + { + "epoch": 1.695397244209909, + "grad_norm": 0.34933460689597867, + "learning_rate": 0.000498350183544039, + "loss": 3.2640299797058105, + "step": 2892, + "token_acc": 0.26989428365454904 + }, + { + "epoch": 1.6959835825271181, + "grad_norm": 0.3347309150446478, + "learning_rate": 0.0004983474032891199, + "loss": 3.252535581588745, + "step": 2893, + "token_acc": 0.2716911890707674 + }, + { + "epoch": 1.6965699208443272, + "grad_norm": 0.34390645727999336, + "learning_rate": 0.0004983446207013115, + "loss": 3.2394938468933105, + "step": 2894, + "token_acc": 0.2718806748513868 + }, + { + "epoch": 1.6971562591615363, + "grad_norm": 0.34901035942175995, + "learning_rate": 0.0004983418357806401, + "loss": 3.2158308029174805, + "step": 2895, + "token_acc": 0.2752899284739736 + }, + { + "epoch": 1.6977425974787452, + "grad_norm": 0.3581554556331172, + "learning_rate": 0.000498339048527132, + "loss": 3.2208313941955566, + "step": 2896, + "token_acc": 0.2750399093671147 + }, + { + "epoch": 1.6983289357959541, + "grad_norm": 0.34400339725625534, + "learning_rate": 0.000498336258940813, + "loss": 3.24052095413208, + "step": 2897, + "token_acc": 0.27316235772398834 + }, + { + "epoch": 1.6989152741131632, + "grad_norm": 0.36663961983743665, + "learning_rate": 0.0004983334670217095, + "loss": 3.206697940826416, + "step": 2898, + "token_acc": 0.2770434728223196 + }, + { + "epoch": 1.6995016124303723, + "grad_norm": 0.3349017606432719, + "learning_rate": 0.0004983306727698477, + "loss": 3.2583377361297607, + "step": 2899, + "token_acc": 0.27023326572008116 + }, + { + "epoch": 1.7000879507475815, + "grad_norm": 0.32712479401379124, + "learning_rate": 0.0004983278761852539, + "loss": 3.256592273712158, + "step": 2900, + "token_acc": 0.2711067633303405 + }, + { + "epoch": 1.7006742890647903, + "grad_norm": 0.3252553592693444, + "learning_rate": 0.0004983250772679543, + "loss": 3.2417705059051514, + "step": 2901, + "token_acc": 0.2739805809960307 + }, + { + "epoch": 1.7012606273819995, + "grad_norm": 0.3632553249171078, + "learning_rate": 0.0004983222760179752, + "loss": 3.2737178802490234, + "step": 2902, + "token_acc": 0.26884821130056835 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.36038204512469485, + "learning_rate": 0.0004983194724353429, + "loss": 3.238574743270874, + "step": 2903, + "token_acc": 0.2736429770565193 + }, + { + "epoch": 1.7024333040164175, + "grad_norm": 0.39232407339359915, + "learning_rate": 0.0004983166665200839, + "loss": 3.2138049602508545, + "step": 2904, + "token_acc": 0.2777297283087317 + }, + { + "epoch": 1.7030196423336266, + "grad_norm": 0.4113127586839885, + "learning_rate": 0.0004983138582722244, + "loss": 3.244720220565796, + "step": 2905, + "token_acc": 0.2715484521717429 + }, + { + "epoch": 1.7036059806508357, + "grad_norm": 0.4365462595276291, + "learning_rate": 0.0004983110476917907, + "loss": 3.21602725982666, + "step": 2906, + "token_acc": 0.2763317254484328 + }, + { + "epoch": 1.7041923189680446, + "grad_norm": 0.4333985172809448, + "learning_rate": 0.0004983082347788094, + "loss": 3.2125048637390137, + "step": 2907, + "token_acc": 0.27890444074534293 + }, + { + "epoch": 1.7047786572852535, + "grad_norm": 0.40934218359409935, + "learning_rate": 0.0004983054195333069, + "loss": 3.1973676681518555, + "step": 2908, + "token_acc": 0.2788486970873581 + }, + { + "epoch": 1.7053649956024626, + "grad_norm": 0.40428112562326424, + "learning_rate": 0.0004983026019553094, + "loss": 3.2668399810791016, + "step": 2909, + "token_acc": 0.2690622508338653 + }, + { + "epoch": 1.7059513339196717, + "grad_norm": 0.4082594051392893, + "learning_rate": 0.0004982997820448437, + "loss": 3.2588162422180176, + "step": 2910, + "token_acc": 0.2698203913761608 + }, + { + "epoch": 1.7065376722368808, + "grad_norm": 0.4212207923262204, + "learning_rate": 0.000498296959801936, + "loss": 3.293275833129883, + "step": 2911, + "token_acc": 0.26636096986153984 + }, + { + "epoch": 1.7071240105540897, + "grad_norm": 0.4365212188736934, + "learning_rate": 0.000498294135226613, + "loss": 3.215772867202759, + "step": 2912, + "token_acc": 0.27624439479481155 + }, + { + "epoch": 1.7077103488712986, + "grad_norm": 0.47002232476399813, + "learning_rate": 0.0004982913083189012, + "loss": 3.2346603870391846, + "step": 2913, + "token_acc": 0.2745080654694737 + }, + { + "epoch": 1.7082966871885077, + "grad_norm": 0.31572224687400335, + "learning_rate": 0.0004982884790788272, + "loss": 3.2538657188415527, + "step": 2914, + "token_acc": 0.27191922587965833 + }, + { + "epoch": 1.7088830255057168, + "grad_norm": 0.3306971530735801, + "learning_rate": 0.0004982856475064175, + "loss": 3.21561598777771, + "step": 2915, + "token_acc": 0.2763440403318328 + }, + { + "epoch": 1.709469363822926, + "grad_norm": 0.3174128371929221, + "learning_rate": 0.0004982828136016986, + "loss": 3.207119941711426, + "step": 2916, + "token_acc": 0.27671113243503315 + }, + { + "epoch": 1.7100557021401348, + "grad_norm": 0.3066164469585881, + "learning_rate": 0.0004982799773646973, + "loss": 3.215477705001831, + "step": 2917, + "token_acc": 0.27665513150855575 + }, + { + "epoch": 1.710642040457344, + "grad_norm": 0.3430531821664205, + "learning_rate": 0.0004982771387954402, + "loss": 3.231034278869629, + "step": 2918, + "token_acc": 0.2755734179162055 + }, + { + "epoch": 1.7112283787745528, + "grad_norm": 0.3402987833114422, + "learning_rate": 0.0004982742978939538, + "loss": 3.2693495750427246, + "step": 2919, + "token_acc": 0.26985224112613027 + }, + { + "epoch": 1.711814717091762, + "grad_norm": 0.3591023537472934, + "learning_rate": 0.0004982714546602652, + "loss": 3.2044625282287598, + "step": 2920, + "token_acc": 0.2784588620548506 + }, + { + "epoch": 1.712401055408971, + "grad_norm": 0.36471490933823236, + "learning_rate": 0.0004982686090944006, + "loss": 3.245622158050537, + "step": 2921, + "token_acc": 0.2715842654949556 + }, + { + "epoch": 1.7129873937261801, + "grad_norm": 0.3080611201451875, + "learning_rate": 0.000498265761196387, + "loss": 3.2527220249176025, + "step": 2922, + "token_acc": 0.27139268046310444 + }, + { + "epoch": 1.713573732043389, + "grad_norm": 0.30586315159749394, + "learning_rate": 0.0004982629109662512, + "loss": 3.2204384803771973, + "step": 2923, + "token_acc": 0.2754876591049403 + }, + { + "epoch": 1.714160070360598, + "grad_norm": 0.3648844362231272, + "learning_rate": 0.0004982600584040197, + "loss": 3.247915744781494, + "step": 2924, + "token_acc": 0.2734142984110468 + }, + { + "epoch": 1.714746408677807, + "grad_norm": 0.36170905249156393, + "learning_rate": 0.0004982572035097197, + "loss": 3.240469455718994, + "step": 2925, + "token_acc": 0.276552030747295 + }, + { + "epoch": 1.7153327469950161, + "grad_norm": 0.297762001161725, + "learning_rate": 0.0004982543462833777, + "loss": 3.193209648132324, + "step": 2926, + "token_acc": 0.2809385873822667 + }, + { + "epoch": 1.7159190853122253, + "grad_norm": 0.30717979976635335, + "learning_rate": 0.0004982514867250206, + "loss": 3.195533275604248, + "step": 2927, + "token_acc": 0.27891429301315623 + }, + { + "epoch": 1.7165054236294341, + "grad_norm": 0.31617972081210305, + "learning_rate": 0.0004982486248346755, + "loss": 3.278057098388672, + "step": 2928, + "token_acc": 0.26764444491188955 + }, + { + "epoch": 1.7170917619466433, + "grad_norm": 0.3616189517132931, + "learning_rate": 0.0004982457606123689, + "loss": 3.2243564128875732, + "step": 2929, + "token_acc": 0.2732860823090627 + }, + { + "epoch": 1.7176781002638521, + "grad_norm": 0.33887948444764937, + "learning_rate": 0.000498242894058128, + "loss": 3.2172412872314453, + "step": 2930, + "token_acc": 0.277730658318812 + }, + { + "epoch": 1.7182644385810613, + "grad_norm": 0.3084703335401283, + "learning_rate": 0.0004982400251719796, + "loss": 3.211422920227051, + "step": 2931, + "token_acc": 0.2781537131683846 + }, + { + "epoch": 1.7188507768982704, + "grad_norm": 0.3502064730269053, + "learning_rate": 0.0004982371539539506, + "loss": 3.256725788116455, + "step": 2932, + "token_acc": 0.2708511389632389 + }, + { + "epoch": 1.7194371152154795, + "grad_norm": 0.3503760175185083, + "learning_rate": 0.0004982342804040681, + "loss": 3.260897159576416, + "step": 2933, + "token_acc": 0.27022671050236136 + }, + { + "epoch": 1.7200234535326884, + "grad_norm": 0.2953753550243247, + "learning_rate": 0.000498231404522359, + "loss": 3.278717517852783, + "step": 2934, + "token_acc": 0.26924270599551137 + }, + { + "epoch": 1.7206097918498973, + "grad_norm": 0.3074354124223109, + "learning_rate": 0.0004982285263088504, + "loss": 3.2559940814971924, + "step": 2935, + "token_acc": 0.27261939233489835 + }, + { + "epoch": 1.7211961301671064, + "grad_norm": 0.29029054434071005, + "learning_rate": 0.0004982256457635693, + "loss": 3.1864242553710938, + "step": 2936, + "token_acc": 0.2801218977187265 + }, + { + "epoch": 1.7217824684843155, + "grad_norm": 0.31093496511605156, + "learning_rate": 0.0004982227628865427, + "loss": 3.1821141242980957, + "step": 2937, + "token_acc": 0.2805667303566554 + }, + { + "epoch": 1.7223688068015246, + "grad_norm": 0.35310803622015535, + "learning_rate": 0.0004982198776777978, + "loss": 3.2258875370025635, + "step": 2938, + "token_acc": 0.2781626981789755 + }, + { + "epoch": 1.7229551451187335, + "grad_norm": 0.396046557807798, + "learning_rate": 0.0004982169901373615, + "loss": 3.237213611602783, + "step": 2939, + "token_acc": 0.27376153710996043 + }, + { + "epoch": 1.7235414834359424, + "grad_norm": 0.45557161486600006, + "learning_rate": 0.0004982141002652611, + "loss": 3.2402782440185547, + "step": 2940, + "token_acc": 0.2728206814352061 + }, + { + "epoch": 1.7241278217531515, + "grad_norm": 0.43608211406194136, + "learning_rate": 0.0004982112080615238, + "loss": 3.2449405193328857, + "step": 2941, + "token_acc": 0.2716437274809101 + }, + { + "epoch": 1.7247141600703606, + "grad_norm": 0.40080783312829255, + "learning_rate": 0.0004982083135261765, + "loss": 3.2063965797424316, + "step": 2942, + "token_acc": 0.27758305941375455 + }, + { + "epoch": 1.7253004983875697, + "grad_norm": 0.374390420989693, + "learning_rate": 0.0004982054166592466, + "loss": 3.284343719482422, + "step": 2943, + "token_acc": 0.2683870367546922 + }, + { + "epoch": 1.7258868367047786, + "grad_norm": 0.39281511774799016, + "learning_rate": 0.0004982025174607614, + "loss": 3.2126054763793945, + "step": 2944, + "token_acc": 0.27706377895746587 + }, + { + "epoch": 1.7264731750219877, + "grad_norm": 0.37342900560415854, + "learning_rate": 0.0004981996159307479, + "loss": 3.201507091522217, + "step": 2945, + "token_acc": 0.27888671336022813 + }, + { + "epoch": 1.7270595133391966, + "grad_norm": 0.36786987732230925, + "learning_rate": 0.0004981967120692335, + "loss": 3.1971688270568848, + "step": 2946, + "token_acc": 0.2782797977162704 + }, + { + "epoch": 1.7276458516564057, + "grad_norm": 0.35235560316059217, + "learning_rate": 0.0004981938058762454, + "loss": 3.272752046585083, + "step": 2947, + "token_acc": 0.27078533756658185 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.38353194333142954, + "learning_rate": 0.000498190897351811, + "loss": 3.263674259185791, + "step": 2948, + "token_acc": 0.27082498008237826 + }, + { + "epoch": 1.728818528290824, + "grad_norm": 0.37371185207299473, + "learning_rate": 0.0004981879864959575, + "loss": 3.252302646636963, + "step": 2949, + "token_acc": 0.27062768789678476 + }, + { + "epoch": 1.7294048666080328, + "grad_norm": 0.3575118930210213, + "learning_rate": 0.0004981850733087123, + "loss": 3.224151134490967, + "step": 2950, + "token_acc": 0.27643006497388256 + }, + { + "epoch": 1.7299912049252417, + "grad_norm": 0.4125809343804859, + "learning_rate": 0.0004981821577901029, + "loss": 3.2398011684417725, + "step": 2951, + "token_acc": 0.27276146131805157 + }, + { + "epoch": 1.7305775432424508, + "grad_norm": 0.3574589938657644, + "learning_rate": 0.0004981792399401565, + "loss": 3.2361512184143066, + "step": 2952, + "token_acc": 0.2736039976340966 + }, + { + "epoch": 1.73116388155966, + "grad_norm": 0.387037460078306, + "learning_rate": 0.0004981763197589005, + "loss": 3.2521414756774902, + "step": 2953, + "token_acc": 0.2719322226369663 + }, + { + "epoch": 1.731750219876869, + "grad_norm": 0.4017064388643273, + "learning_rate": 0.0004981733972463624, + "loss": 3.2117247581481934, + "step": 2954, + "token_acc": 0.27719028239444254 + }, + { + "epoch": 1.732336558194078, + "grad_norm": 0.4020798108864234, + "learning_rate": 0.0004981704724025697, + "loss": 3.240279197692871, + "step": 2955, + "token_acc": 0.273643557138758 + }, + { + "epoch": 1.732922896511287, + "grad_norm": 0.3243453294327697, + "learning_rate": 0.0004981675452275497, + "loss": 3.2274320125579834, + "step": 2956, + "token_acc": 0.2757550009965279 + }, + { + "epoch": 1.733509234828496, + "grad_norm": 0.3814305758346807, + "learning_rate": 0.0004981646157213302, + "loss": 3.219722270965576, + "step": 2957, + "token_acc": 0.276751677137685 + }, + { + "epoch": 1.734095573145705, + "grad_norm": 0.37190985656625297, + "learning_rate": 0.0004981616838839384, + "loss": 3.2010815143585205, + "step": 2958, + "token_acc": 0.276917454184723 + }, + { + "epoch": 1.7346819114629142, + "grad_norm": 0.40406374133025846, + "learning_rate": 0.0004981587497154021, + "loss": 3.27656888961792, + "step": 2959, + "token_acc": 0.2693105400443379 + }, + { + "epoch": 1.7352682497801233, + "grad_norm": 0.39288688110412295, + "learning_rate": 0.0004981558132157487, + "loss": 3.265738010406494, + "step": 2960, + "token_acc": 0.2691838116513033 + }, + { + "epoch": 1.7358545880973322, + "grad_norm": 0.33378428899074625, + "learning_rate": 0.0004981528743850058, + "loss": 3.243781805038452, + "step": 2961, + "token_acc": 0.27264371766531537 + }, + { + "epoch": 1.736440926414541, + "grad_norm": 0.31061086929213516, + "learning_rate": 0.0004981499332232011, + "loss": 3.236665725708008, + "step": 2962, + "token_acc": 0.27372398743945925 + }, + { + "epoch": 1.7370272647317502, + "grad_norm": 0.3939558479365216, + "learning_rate": 0.0004981469897303621, + "loss": 3.209806442260742, + "step": 2963, + "token_acc": 0.27713399578282877 + }, + { + "epoch": 1.7376136030489593, + "grad_norm": 0.3813587229703237, + "learning_rate": 0.0004981440439065165, + "loss": 3.2112414836883545, + "step": 2964, + "token_acc": 0.2780221216954737 + }, + { + "epoch": 1.7381999413661684, + "grad_norm": 0.4030733387875492, + "learning_rate": 0.0004981410957516921, + "loss": 3.304614543914795, + "step": 2965, + "token_acc": 0.2643179317492161 + }, + { + "epoch": 1.7387862796833773, + "grad_norm": 0.4816950232488938, + "learning_rate": 0.0004981381452659163, + "loss": 3.291616678237915, + "step": 2966, + "token_acc": 0.2653495496010455 + }, + { + "epoch": 1.7393726180005862, + "grad_norm": 0.41934844139271665, + "learning_rate": 0.0004981351924492171, + "loss": 3.292178153991699, + "step": 2967, + "token_acc": 0.26680586422438224 + }, + { + "epoch": 1.7399589563177953, + "grad_norm": 0.38681918418696587, + "learning_rate": 0.0004981322373016221, + "loss": 3.202153205871582, + "step": 2968, + "token_acc": 0.27886285302054664 + }, + { + "epoch": 1.7405452946350044, + "grad_norm": 0.422892783665272, + "learning_rate": 0.0004981292798231592, + "loss": 3.2289347648620605, + "step": 2969, + "token_acc": 0.27432715331463364 + }, + { + "epoch": 1.7411316329522135, + "grad_norm": 0.3980869388031106, + "learning_rate": 0.000498126320013856, + "loss": 3.238144874572754, + "step": 2970, + "token_acc": 0.2726922395925552 + }, + { + "epoch": 1.7417179712694224, + "grad_norm": 0.3626910506076907, + "learning_rate": 0.0004981233578737404, + "loss": 3.2238550186157227, + "step": 2971, + "token_acc": 0.27277440188178975 + }, + { + "epoch": 1.7423043095866315, + "grad_norm": 0.3875155033807427, + "learning_rate": 0.0004981203934028402, + "loss": 3.222712278366089, + "step": 2972, + "token_acc": 0.27390961846493184 + }, + { + "epoch": 1.7428906479038404, + "grad_norm": 0.3623234780438531, + "learning_rate": 0.0004981174266011832, + "loss": 3.2737631797790527, + "step": 2973, + "token_acc": 0.26902172447818656 + }, + { + "epoch": 1.7434769862210495, + "grad_norm": 0.3612056195844169, + "learning_rate": 0.0004981144574687973, + "loss": 3.1621432304382324, + "step": 2974, + "token_acc": 0.28211334396376647 + }, + { + "epoch": 1.7440633245382586, + "grad_norm": 0.3463241596483429, + "learning_rate": 0.0004981114860057105, + "loss": 3.1605350971221924, + "step": 2975, + "token_acc": 0.2859252504002777 + }, + { + "epoch": 1.7446496628554677, + "grad_norm": 0.3170790506923805, + "learning_rate": 0.0004981085122119505, + "loss": 3.2323050498962402, + "step": 2976, + "token_acc": 0.27584826644184657 + }, + { + "epoch": 1.7452360011726766, + "grad_norm": 0.3320462012680574, + "learning_rate": 0.0004981055360875455, + "loss": 3.220576763153076, + "step": 2977, + "token_acc": 0.27720228954348736 + }, + { + "epoch": 1.7458223394898855, + "grad_norm": 0.2688471085552832, + "learning_rate": 0.0004981025576325232, + "loss": 3.2152085304260254, + "step": 2978, + "token_acc": 0.27587546946946434 + }, + { + "epoch": 1.7464086778070946, + "grad_norm": 0.31908683841767665, + "learning_rate": 0.0004980995768469117, + "loss": 3.237415313720703, + "step": 2979, + "token_acc": 0.274964384913529 + }, + { + "epoch": 1.7469950161243037, + "grad_norm": 0.3256698652534432, + "learning_rate": 0.0004980965937307391, + "loss": 3.2539477348327637, + "step": 2980, + "token_acc": 0.27067472111391583 + }, + { + "epoch": 1.7475813544415129, + "grad_norm": 0.29343056246799526, + "learning_rate": 0.0004980936082840333, + "loss": 3.243265390396118, + "step": 2981, + "token_acc": 0.27319400426922785 + }, + { + "epoch": 1.7481676927587217, + "grad_norm": 0.3086384803980287, + "learning_rate": 0.0004980906205068223, + "loss": 3.183326244354248, + "step": 2982, + "token_acc": 0.2786131631103447 + }, + { + "epoch": 1.7487540310759309, + "grad_norm": 0.280483968244348, + "learning_rate": 0.0004980876303991343, + "loss": 3.205498218536377, + "step": 2983, + "token_acc": 0.27788064416294034 + }, + { + "epoch": 1.7493403693931397, + "grad_norm": 0.313437214406856, + "learning_rate": 0.0004980846379609972, + "loss": 3.239558219909668, + "step": 2984, + "token_acc": 0.2710759877775039 + }, + { + "epoch": 1.7499267077103489, + "grad_norm": 0.2894197397514832, + "learning_rate": 0.0004980816431924392, + "loss": 3.2059102058410645, + "step": 2985, + "token_acc": 0.2779870493742606 + }, + { + "epoch": 1.750513046027558, + "grad_norm": 0.3634947237804513, + "learning_rate": 0.0004980786460934886, + "loss": 3.2123279571533203, + "step": 2986, + "token_acc": 0.2768251674401801 + }, + { + "epoch": 1.751099384344767, + "grad_norm": 0.41487629488616956, + "learning_rate": 0.0004980756466641733, + "loss": 3.17323637008667, + "step": 2987, + "token_acc": 0.2805083987858191 + }, + { + "epoch": 1.751685722661976, + "grad_norm": 0.37602651674794163, + "learning_rate": 0.0004980726449045217, + "loss": 3.246419668197632, + "step": 2988, + "token_acc": 0.27217933467983363 + }, + { + "epoch": 1.7522720609791849, + "grad_norm": 0.3649613881184053, + "learning_rate": 0.0004980696408145619, + "loss": 3.2143547534942627, + "step": 2989, + "token_acc": 0.27642830410262365 + }, + { + "epoch": 1.752858399296394, + "grad_norm": 0.40451866823687505, + "learning_rate": 0.0004980666343943219, + "loss": 3.160172939300537, + "step": 2990, + "token_acc": 0.28651241209950623 + }, + { + "epoch": 1.753444737613603, + "grad_norm": 0.3543439709519719, + "learning_rate": 0.0004980636256438303, + "loss": 3.200169324874878, + "step": 2991, + "token_acc": 0.27783205782423115 + }, + { + "epoch": 1.7540310759308122, + "grad_norm": 0.34117552345337554, + "learning_rate": 0.0004980606145631152, + "loss": 3.196709632873535, + "step": 2992, + "token_acc": 0.27759981695458186 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.3493708947884874, + "learning_rate": 0.0004980576011522048, + "loss": 3.2362403869628906, + "step": 2993, + "token_acc": 0.2749046391684164 + }, + { + "epoch": 1.75520375256523, + "grad_norm": 0.3373444048842763, + "learning_rate": 0.0004980545854111276, + "loss": 3.245910167694092, + "step": 2994, + "token_acc": 0.27103813615159583 + }, + { + "epoch": 1.755790090882439, + "grad_norm": 0.3148879311900756, + "learning_rate": 0.0004980515673399117, + "loss": 3.1721384525299072, + "step": 2995, + "token_acc": 0.28068246088104065 + }, + { + "epoch": 1.7563764291996482, + "grad_norm": 0.3455036237755556, + "learning_rate": 0.0004980485469385857, + "loss": 3.218928575515747, + "step": 2996, + "token_acc": 0.27588237101051577 + }, + { + "epoch": 1.7569627675168573, + "grad_norm": 0.33901707011842386, + "learning_rate": 0.0004980455242071779, + "loss": 3.2165982723236084, + "step": 2997, + "token_acc": 0.2759785650705049 + }, + { + "epoch": 1.7575491058340662, + "grad_norm": 0.3530099508895064, + "learning_rate": 0.0004980424991457165, + "loss": 3.284792423248291, + "step": 2998, + "token_acc": 0.2666016133229248 + }, + { + "epoch": 1.7581354441512753, + "grad_norm": 0.3011842054682417, + "learning_rate": 0.0004980394717542301, + "loss": 3.198941707611084, + "step": 2999, + "token_acc": 0.2764731844491242 + }, + { + "epoch": 1.7587217824684842, + "grad_norm": 0.32958241487595, + "learning_rate": 0.0004980364420327472, + "loss": 3.2457873821258545, + "step": 3000, + "token_acc": 0.2729881117425108 + }, + { + "epoch": 1.7593081207856933, + "grad_norm": 0.3342327695110001, + "learning_rate": 0.0004980334099812961, + "loss": 3.2389214038848877, + "step": 3001, + "token_acc": 0.2731218617613377 + }, + { + "epoch": 1.7598944591029024, + "grad_norm": 0.4149505862224289, + "learning_rate": 0.0004980303755999053, + "loss": 3.22603440284729, + "step": 3002, + "token_acc": 0.2764770088497469 + }, + { + "epoch": 1.7604807974201115, + "grad_norm": 0.5174176115005527, + "learning_rate": 0.0004980273388886034, + "loss": 3.2259535789489746, + "step": 3003, + "token_acc": 0.27584888683155934 + }, + { + "epoch": 1.7610671357373204, + "grad_norm": 0.5027779996337693, + "learning_rate": 0.0004980242998474188, + "loss": 3.2213692665100098, + "step": 3004, + "token_acc": 0.2749432845990985 + }, + { + "epoch": 1.7616534740545293, + "grad_norm": 0.3558314802372436, + "learning_rate": 0.0004980212584763802, + "loss": 3.221322536468506, + "step": 3005, + "token_acc": 0.27349832889563763 + }, + { + "epoch": 1.7622398123717384, + "grad_norm": 0.2938893391343505, + "learning_rate": 0.0004980182147755161, + "loss": 3.193880558013916, + "step": 3006, + "token_acc": 0.2796691907772814 + }, + { + "epoch": 1.7628261506889475, + "grad_norm": 0.37599480835141597, + "learning_rate": 0.000498015168744855, + "loss": 3.234499931335449, + "step": 3007, + "token_acc": 0.2731092993617072 + }, + { + "epoch": 1.7634124890061567, + "grad_norm": 0.32374380043964024, + "learning_rate": 0.0004980121203844257, + "loss": 3.1899890899658203, + "step": 3008, + "token_acc": 0.279889653576397 + }, + { + "epoch": 1.7639988273233655, + "grad_norm": 0.2894269404528567, + "learning_rate": 0.0004980090696942567, + "loss": 3.2286553382873535, + "step": 3009, + "token_acc": 0.2744495114006515 + }, + { + "epoch": 1.7645851656405747, + "grad_norm": 0.27614261824945624, + "learning_rate": 0.0004980060166743766, + "loss": 3.2384986877441406, + "step": 3010, + "token_acc": 0.27345347194266184 + }, + { + "epoch": 1.7651715039577835, + "grad_norm": 0.36059398725152203, + "learning_rate": 0.0004980029613248141, + "loss": 3.2385268211364746, + "step": 3011, + "token_acc": 0.27411050991965896 + }, + { + "epoch": 1.7657578422749927, + "grad_norm": 0.3075915475900472, + "learning_rate": 0.0004979999036455982, + "loss": 3.2044291496276855, + "step": 3012, + "token_acc": 0.2773535847599434 + }, + { + "epoch": 1.7663441805922018, + "grad_norm": 0.2954681841014092, + "learning_rate": 0.0004979968436367572, + "loss": 3.2330384254455566, + "step": 3013, + "token_acc": 0.2748736885571433 + }, + { + "epoch": 1.7669305189094109, + "grad_norm": 0.2984382566031914, + "learning_rate": 0.0004979937812983202, + "loss": 3.2375760078430176, + "step": 3014, + "token_acc": 0.27324742537508495 + }, + { + "epoch": 1.7675168572266198, + "grad_norm": 0.31673112672210957, + "learning_rate": 0.0004979907166303157, + "loss": 3.211059331893921, + "step": 3015, + "token_acc": 0.2781554879241415 + }, + { + "epoch": 1.7681031955438287, + "grad_norm": 0.41317290345659796, + "learning_rate": 0.0004979876496327726, + "loss": 3.2291154861450195, + "step": 3016, + "token_acc": 0.2761533581074957 + }, + { + "epoch": 1.7686895338610378, + "grad_norm": 0.39161828254612224, + "learning_rate": 0.0004979845803057196, + "loss": 3.2371697425842285, + "step": 3017, + "token_acc": 0.27153889764463024 + }, + { + "epoch": 1.7692758721782469, + "grad_norm": 0.32476241901676794, + "learning_rate": 0.0004979815086491858, + "loss": 3.21490478515625, + "step": 3018, + "token_acc": 0.27660157132791835 + }, + { + "epoch": 1.769862210495456, + "grad_norm": 0.34729100067838015, + "learning_rate": 0.0004979784346631998, + "loss": 3.2134461402893066, + "step": 3019, + "token_acc": 0.2780134174637004 + }, + { + "epoch": 1.770448548812665, + "grad_norm": 0.39552937848305697, + "learning_rate": 0.0004979753583477905, + "loss": 3.262941837310791, + "step": 3020, + "token_acc": 0.26955587835331934 + }, + { + "epoch": 1.7710348871298738, + "grad_norm": 0.47807493147682856, + "learning_rate": 0.000497972279702987, + "loss": 3.257229804992676, + "step": 3021, + "token_acc": 0.2712900262533068 + }, + { + "epoch": 1.771621225447083, + "grad_norm": 0.41934966795898143, + "learning_rate": 0.0004979691987288179, + "loss": 3.2018563747406006, + "step": 3022, + "token_acc": 0.2767373146928282 + }, + { + "epoch": 1.772207563764292, + "grad_norm": 0.3828129179671491, + "learning_rate": 0.0004979661154253125, + "loss": 3.2349252700805664, + "step": 3023, + "token_acc": 0.2732313073422437 + }, + { + "epoch": 1.7727939020815011, + "grad_norm": 0.38065321810916636, + "learning_rate": 0.0004979630297924996, + "loss": 3.2119650840759277, + "step": 3024, + "token_acc": 0.2760106165781952 + }, + { + "epoch": 1.77338024039871, + "grad_norm": 0.3645631535819651, + "learning_rate": 0.000497959941830408, + "loss": 3.207237958908081, + "step": 3025, + "token_acc": 0.2778072800396645 + }, + { + "epoch": 1.7739665787159191, + "grad_norm": 0.35671996851641835, + "learning_rate": 0.000497956851539067, + "loss": 3.261927843093872, + "step": 3026, + "token_acc": 0.2695882606904834 + }, + { + "epoch": 1.774552917033128, + "grad_norm": 0.3504982534444864, + "learning_rate": 0.0004979537589185055, + "loss": 3.21873140335083, + "step": 3027, + "token_acc": 0.27596293852068343 + }, + { + "epoch": 1.7751392553503371, + "grad_norm": 0.37743710258961743, + "learning_rate": 0.0004979506639687524, + "loss": 3.236753463745117, + "step": 3028, + "token_acc": 0.27391937578539466 + }, + { + "epoch": 1.7757255936675462, + "grad_norm": 0.3644401203938981, + "learning_rate": 0.0004979475666898371, + "loss": 3.169583320617676, + "step": 3029, + "token_acc": 0.2842915085699333 + }, + { + "epoch": 1.7763119319847553, + "grad_norm": 0.347407136563413, + "learning_rate": 0.0004979444670817886, + "loss": 3.225637435913086, + "step": 3030, + "token_acc": 0.27409282119569967 + }, + { + "epoch": 1.7768982703019642, + "grad_norm": 0.38292462153665086, + "learning_rate": 0.0004979413651446357, + "loss": 3.220942497253418, + "step": 3031, + "token_acc": 0.2744512871971779 + }, + { + "epoch": 1.7774846086191731, + "grad_norm": 0.34667861122472304, + "learning_rate": 0.0004979382608784079, + "loss": 3.2380077838897705, + "step": 3032, + "token_acc": 0.27279422405060677 + }, + { + "epoch": 1.7780709469363822, + "grad_norm": 0.38666134115973627, + "learning_rate": 0.0004979351542831343, + "loss": 3.1864466667175293, + "step": 3033, + "token_acc": 0.2786576478558085 + }, + { + "epoch": 1.7786572852535913, + "grad_norm": 0.42196850420933596, + "learning_rate": 0.0004979320453588439, + "loss": 3.2147109508514404, + "step": 3034, + "token_acc": 0.27672610711062673 + }, + { + "epoch": 1.7792436235708005, + "grad_norm": 0.4313685385812127, + "learning_rate": 0.0004979289341055661, + "loss": 3.253695011138916, + "step": 3035, + "token_acc": 0.2722565425652201 + }, + { + "epoch": 1.7798299618880093, + "grad_norm": 0.3222914467112649, + "learning_rate": 0.00049792582052333, + "loss": 3.211650848388672, + "step": 3036, + "token_acc": 0.27660722683504096 + }, + { + "epoch": 1.7804163002052185, + "grad_norm": 0.34281298595103366, + "learning_rate": 0.000497922704612165, + "loss": 3.200486421585083, + "step": 3037, + "token_acc": 0.27955332169890557 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.4052020324178503, + "learning_rate": 0.0004979195863721002, + "loss": 3.238013505935669, + "step": 3038, + "token_acc": 0.2738013424344672 + }, + { + "epoch": 1.7815889768396365, + "grad_norm": 0.35748763093628994, + "learning_rate": 0.000497916465803165, + "loss": 3.2003164291381836, + "step": 3039, + "token_acc": 0.2768867172283616 + }, + { + "epoch": 1.7821753151568456, + "grad_norm": 0.3014288961182022, + "learning_rate": 0.0004979133429053885, + "loss": 3.2241785526275635, + "step": 3040, + "token_acc": 0.27481671377993505 + }, + { + "epoch": 1.7827616534740547, + "grad_norm": 0.3653977046753094, + "learning_rate": 0.0004979102176788005, + "loss": 3.227954864501953, + "step": 3041, + "token_acc": 0.2754926751156627 + }, + { + "epoch": 1.7833479917912636, + "grad_norm": 0.35387790311194056, + "learning_rate": 0.0004979070901234299, + "loss": 3.2072231769561768, + "step": 3042, + "token_acc": 0.27862468550612235 + }, + { + "epoch": 1.7839343301084725, + "grad_norm": 0.3134796758440765, + "learning_rate": 0.0004979039602393063, + "loss": 3.264094591140747, + "step": 3043, + "token_acc": 0.27295852928601966 + }, + { + "epoch": 1.7845206684256816, + "grad_norm": 0.3143294474850163, + "learning_rate": 0.0004979008280264589, + "loss": 3.2129220962524414, + "step": 3044, + "token_acc": 0.2758149070902107 + }, + { + "epoch": 1.7851070067428907, + "grad_norm": 0.35473493844366777, + "learning_rate": 0.0004978976934849176, + "loss": 3.2005584239959717, + "step": 3045, + "token_acc": 0.2785615246340467 + }, + { + "epoch": 1.7856933450600998, + "grad_norm": 0.4002865901052259, + "learning_rate": 0.0004978945566147112, + "loss": 3.233079433441162, + "step": 3046, + "token_acc": 0.27495956751226297 + }, + { + "epoch": 1.7862796833773087, + "grad_norm": 0.30202097569773967, + "learning_rate": 0.0004978914174158696, + "loss": 3.2649312019348145, + "step": 3047, + "token_acc": 0.27042915309661697 + }, + { + "epoch": 1.7868660216945176, + "grad_norm": 0.38121081176551214, + "learning_rate": 0.0004978882758884222, + "loss": 3.256307601928711, + "step": 3048, + "token_acc": 0.2716579923283834 + }, + { + "epoch": 1.7874523600117267, + "grad_norm": 0.4238585904359199, + "learning_rate": 0.0004978851320323984, + "loss": 3.185225009918213, + "step": 3049, + "token_acc": 0.28061716981084023 + }, + { + "epoch": 1.7880386983289358, + "grad_norm": 0.3923459511441045, + "learning_rate": 0.0004978819858478279, + "loss": 3.2211811542510986, + "step": 3050, + "token_acc": 0.2754796617163871 + }, + { + "epoch": 1.788625036646145, + "grad_norm": 0.3425226437079169, + "learning_rate": 0.0004978788373347401, + "loss": 3.2133331298828125, + "step": 3051, + "token_acc": 0.277002285614862 + }, + { + "epoch": 1.7892113749633538, + "grad_norm": 0.3931444647559363, + "learning_rate": 0.0004978756864931647, + "loss": 3.2010340690612793, + "step": 3052, + "token_acc": 0.2777529413047508 + }, + { + "epoch": 1.789797713280563, + "grad_norm": 0.3919612233516763, + "learning_rate": 0.0004978725333231312, + "loss": 3.1898512840270996, + "step": 3053, + "token_acc": 0.28037795831861534 + }, + { + "epoch": 1.7903840515977718, + "grad_norm": 0.29107398335687057, + "learning_rate": 0.0004978693778246692, + "loss": 3.254596710205078, + "step": 3054, + "token_acc": 0.26990290753697266 + }, + { + "epoch": 1.790970389914981, + "grad_norm": 0.37187333483605906, + "learning_rate": 0.0004978662199978086, + "loss": 3.1810340881347656, + "step": 3055, + "token_acc": 0.28070395173389684 + }, + { + "epoch": 1.79155672823219, + "grad_norm": 0.35712438029223637, + "learning_rate": 0.0004978630598425787, + "loss": 3.241107225418091, + "step": 3056, + "token_acc": 0.27391042928287107 + }, + { + "epoch": 1.7921430665493991, + "grad_norm": 0.332295773026044, + "learning_rate": 0.0004978598973590094, + "loss": 3.180257797241211, + "step": 3057, + "token_acc": 0.28189725880132616 + }, + { + "epoch": 1.792729404866608, + "grad_norm": 0.3312439495567886, + "learning_rate": 0.0004978567325471303, + "loss": 3.1745986938476562, + "step": 3058, + "token_acc": 0.28338573932590283 + }, + { + "epoch": 1.793315743183817, + "grad_norm": 0.3739503612058253, + "learning_rate": 0.0004978535654069713, + "loss": 3.2107272148132324, + "step": 3059, + "token_acc": 0.27865870035625423 + }, + { + "epoch": 1.793902081501026, + "grad_norm": 0.36391710654284615, + "learning_rate": 0.000497850395938562, + "loss": 3.268104076385498, + "step": 3060, + "token_acc": 0.26943851676041575 + }, + { + "epoch": 1.7944884198182351, + "grad_norm": 0.3584543997716403, + "learning_rate": 0.0004978472241419322, + "loss": 3.2370619773864746, + "step": 3061, + "token_acc": 0.273876679695154 + }, + { + "epoch": 1.7950747581354443, + "grad_norm": 0.36340880898925376, + "learning_rate": 0.0004978440500171117, + "loss": 3.2221646308898926, + "step": 3062, + "token_acc": 0.2748442656311163 + }, + { + "epoch": 1.7956610964526531, + "grad_norm": 0.3873339414519027, + "learning_rate": 0.0004978408735641304, + "loss": 3.2930383682250977, + "step": 3063, + "token_acc": 0.263939074022676 + }, + { + "epoch": 1.7962474347698623, + "grad_norm": 0.41482617303419167, + "learning_rate": 0.0004978376947830179, + "loss": 3.2683162689208984, + "step": 3064, + "token_acc": 0.271600435604504 + }, + { + "epoch": 1.7968337730870712, + "grad_norm": 0.34954121883263045, + "learning_rate": 0.0004978345136738043, + "loss": 3.1729214191436768, + "step": 3065, + "token_acc": 0.2827161301293707 + }, + { + "epoch": 1.7974201114042803, + "grad_norm": 0.3103495483398344, + "learning_rate": 0.0004978313302365195, + "loss": 3.2378652095794678, + "step": 3066, + "token_acc": 0.2730559608617286 + }, + { + "epoch": 1.7980064497214894, + "grad_norm": 0.3955304867874588, + "learning_rate": 0.0004978281444711932, + "loss": 3.225979804992676, + "step": 3067, + "token_acc": 0.27478845509383376 + }, + { + "epoch": 1.7985927880386985, + "grad_norm": 0.38769879984135847, + "learning_rate": 0.0004978249563778555, + "loss": 3.263969898223877, + "step": 3068, + "token_acc": 0.27092229488245784 + }, + { + "epoch": 1.7991791263559074, + "grad_norm": 0.3427366015288133, + "learning_rate": 0.0004978217659565362, + "loss": 3.2790961265563965, + "step": 3069, + "token_acc": 0.2684529919147174 + }, + { + "epoch": 1.7997654646731163, + "grad_norm": 0.3509933173153281, + "learning_rate": 0.0004978185732072654, + "loss": 3.202017307281494, + "step": 3070, + "token_acc": 0.2771232904632819 + }, + { + "epoch": 1.8003518029903254, + "grad_norm": 0.36994098563812267, + "learning_rate": 0.000497815378130073, + "loss": 3.286817789077759, + "step": 3071, + "token_acc": 0.26782564353396016 + }, + { + "epoch": 1.8009381413075345, + "grad_norm": 0.3485177362244053, + "learning_rate": 0.0004978121807249892, + "loss": 3.222672700881958, + "step": 3072, + "token_acc": 0.2750010011880765 + }, + { + "epoch": 1.8015244796247436, + "grad_norm": 0.3779147757934785, + "learning_rate": 0.0004978089809920438, + "loss": 3.2124924659729004, + "step": 3073, + "token_acc": 0.2767968756168104 + }, + { + "epoch": 1.8021108179419525, + "grad_norm": 0.3769631519507005, + "learning_rate": 0.0004978057789312669, + "loss": 3.2211883068084717, + "step": 3074, + "token_acc": 0.27653093732759715 + }, + { + "epoch": 1.8026971562591614, + "grad_norm": 0.32871657841052576, + "learning_rate": 0.0004978025745426887, + "loss": 3.204349994659424, + "step": 3075, + "token_acc": 0.2775408474106896 + }, + { + "epoch": 1.8032834945763705, + "grad_norm": 0.3693189279777911, + "learning_rate": 0.0004977993678263392, + "loss": 3.2038631439208984, + "step": 3076, + "token_acc": 0.2768474685782168 + }, + { + "epoch": 1.8038698328935796, + "grad_norm": 0.3616413701092867, + "learning_rate": 0.0004977961587822486, + "loss": 3.261770248413086, + "step": 3077, + "token_acc": 0.26977999392347146 + }, + { + "epoch": 1.8044561712107887, + "grad_norm": 0.37639487686938766, + "learning_rate": 0.0004977929474104469, + "loss": 3.207293748855591, + "step": 3078, + "token_acc": 0.27639445032550386 + }, + { + "epoch": 1.8050425095279976, + "grad_norm": 0.38854222186616394, + "learning_rate": 0.0004977897337109645, + "loss": 3.2256155014038086, + "step": 3079, + "token_acc": 0.2758401944304665 + }, + { + "epoch": 1.8056288478452067, + "grad_norm": 0.3590370406285352, + "learning_rate": 0.0004977865176838313, + "loss": 3.2482213973999023, + "step": 3080, + "token_acc": 0.27238748885571246 + }, + { + "epoch": 1.8062151861624156, + "grad_norm": 0.3213123242580486, + "learning_rate": 0.0004977832993290777, + "loss": 3.232754707336426, + "step": 3081, + "token_acc": 0.27341134579097937 + }, + { + "epoch": 1.8068015244796247, + "grad_norm": 0.33814738594025034, + "learning_rate": 0.0004977800786467341, + "loss": 3.2238521575927734, + "step": 3082, + "token_acc": 0.27429833512005475 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.32725126925089004, + "learning_rate": 0.0004977768556368303, + "loss": 3.2200348377227783, + "step": 3083, + "token_acc": 0.275874671539662 + }, + { + "epoch": 1.807974201114043, + "grad_norm": 0.31065859550059544, + "learning_rate": 0.000497773630299397, + "loss": 3.1588001251220703, + "step": 3084, + "token_acc": 0.2832840854511663 + }, + { + "epoch": 1.8085605394312518, + "grad_norm": 0.29210071321745845, + "learning_rate": 0.0004977704026344642, + "loss": 3.2345504760742188, + "step": 3085, + "token_acc": 0.27349048414023375 + }, + { + "epoch": 1.8091468777484607, + "grad_norm": 0.3601742266796131, + "learning_rate": 0.0004977671726420623, + "loss": 3.2008392810821533, + "step": 3086, + "token_acc": 0.27847882000944835 + }, + { + "epoch": 1.8097332160656698, + "grad_norm": 0.359623803216631, + "learning_rate": 0.0004977639403222217, + "loss": 3.1807808876037598, + "step": 3087, + "token_acc": 0.2804770665482459 + }, + { + "epoch": 1.810319554382879, + "grad_norm": 0.30111071184533417, + "learning_rate": 0.0004977607056749729, + "loss": 3.2510619163513184, + "step": 3088, + "token_acc": 0.27215787192977325 + }, + { + "epoch": 1.810905892700088, + "grad_norm": 0.3122434450489425, + "learning_rate": 0.000497757468700346, + "loss": 3.160186767578125, + "step": 3089, + "token_acc": 0.283549929843818 + }, + { + "epoch": 1.811492231017297, + "grad_norm": 0.33903808851879147, + "learning_rate": 0.0004977542293983716, + "loss": 3.199387788772583, + "step": 3090, + "token_acc": 0.2782477213610506 + }, + { + "epoch": 1.812078569334506, + "grad_norm": 0.3210425491487077, + "learning_rate": 0.00049775098776908, + "loss": 3.266819953918457, + "step": 3091, + "token_acc": 0.26917340043320925 + }, + { + "epoch": 1.812664907651715, + "grad_norm": 0.28165244100059494, + "learning_rate": 0.0004977477438125018, + "loss": 3.2057886123657227, + "step": 3092, + "token_acc": 0.27883147912303047 + }, + { + "epoch": 1.813251245968924, + "grad_norm": 0.28346057851784295, + "learning_rate": 0.0004977444975286674, + "loss": 3.156052589416504, + "step": 3093, + "token_acc": 0.2830984019928746 + }, + { + "epoch": 1.8138375842861332, + "grad_norm": 0.2946439922573313, + "learning_rate": 0.0004977412489176072, + "loss": 3.204800844192505, + "step": 3094, + "token_acc": 0.278671615486449 + }, + { + "epoch": 1.8144239226033423, + "grad_norm": 0.3002826924190573, + "learning_rate": 0.0004977379979793518, + "loss": 3.2153425216674805, + "step": 3095, + "token_acc": 0.2761555647718303 + }, + { + "epoch": 1.8150102609205512, + "grad_norm": 0.28175738293552754, + "learning_rate": 0.0004977347447139318, + "loss": 3.2161903381347656, + "step": 3096, + "token_acc": 0.27703055419427697 + }, + { + "epoch": 1.81559659923776, + "grad_norm": 0.3002849633352072, + "learning_rate": 0.0004977314891213777, + "loss": 3.184248685836792, + "step": 3097, + "token_acc": 0.28247722675375586 + }, + { + "epoch": 1.8161829375549692, + "grad_norm": 0.32384403742729295, + "learning_rate": 0.00049772823120172, + "loss": 3.185739278793335, + "step": 3098, + "token_acc": 0.2805444039329762 + }, + { + "epoch": 1.8167692758721783, + "grad_norm": 0.36971157761720524, + "learning_rate": 0.0004977249709549894, + "loss": 3.251883029937744, + "step": 3099, + "token_acc": 0.2707999269026544 + }, + { + "epoch": 1.8173556141893874, + "grad_norm": 0.4353494644545547, + "learning_rate": 0.0004977217083812167, + "loss": 3.2433829307556152, + "step": 3100, + "token_acc": 0.2732780866571816 + }, + { + "epoch": 1.8179419525065963, + "grad_norm": 0.49033673586899357, + "learning_rate": 0.0004977184434804321, + "loss": 3.263723611831665, + "step": 3101, + "token_acc": 0.2704251105476036 + }, + { + "epoch": 1.8185282908238052, + "grad_norm": 0.454816597428356, + "learning_rate": 0.0004977151762526667, + "loss": 3.226797342300415, + "step": 3102, + "token_acc": 0.2740893792649163 + }, + { + "epoch": 1.8191146291410143, + "grad_norm": 0.39294971706698373, + "learning_rate": 0.000497711906697951, + "loss": 3.2254514694213867, + "step": 3103, + "token_acc": 0.27457616346505237 + }, + { + "epoch": 1.8197009674582234, + "grad_norm": 0.37092518686667825, + "learning_rate": 0.0004977086348163156, + "loss": 3.216805934906006, + "step": 3104, + "token_acc": 0.2755943793147852 + }, + { + "epoch": 1.8202873057754325, + "grad_norm": 0.3251317748581824, + "learning_rate": 0.0004977053606077914, + "loss": 3.179370164871216, + "step": 3105, + "token_acc": 0.2816592108406263 + }, + { + "epoch": 1.8208736440926414, + "grad_norm": 0.3048378568310232, + "learning_rate": 0.0004977020840724093, + "loss": 3.2183425426483154, + "step": 3106, + "token_acc": 0.2763232101813229 + }, + { + "epoch": 1.8214599824098505, + "grad_norm": 0.32228526010036945, + "learning_rate": 0.0004976988052101998, + "loss": 3.226909875869751, + "step": 3107, + "token_acc": 0.27474221079802924 + }, + { + "epoch": 1.8220463207270594, + "grad_norm": 0.36196240028810733, + "learning_rate": 0.0004976955240211938, + "loss": 3.168827772140503, + "step": 3108, + "token_acc": 0.28185230126703725 + }, + { + "epoch": 1.8226326590442685, + "grad_norm": 0.2974113209380026, + "learning_rate": 0.0004976922405054221, + "loss": 3.19718861579895, + "step": 3109, + "token_acc": 0.27764419006333185 + }, + { + "epoch": 1.8232189973614776, + "grad_norm": 0.3524302770314917, + "learning_rate": 0.0004976889546629156, + "loss": 3.2423512935638428, + "step": 3110, + "token_acc": 0.27441172564816285 + }, + { + "epoch": 1.8238053356786867, + "grad_norm": 0.40329508811407877, + "learning_rate": 0.0004976856664937052, + "loss": 3.1913347244262695, + "step": 3111, + "token_acc": 0.2788196788052154 + }, + { + "epoch": 1.8243916739958956, + "grad_norm": 0.4083376644570348, + "learning_rate": 0.0004976823759978216, + "loss": 3.2298972606658936, + "step": 3112, + "token_acc": 0.27421645931270333 + }, + { + "epoch": 1.8249780123131045, + "grad_norm": 0.33399227257877573, + "learning_rate": 0.0004976790831752959, + "loss": 3.223026752471924, + "step": 3113, + "token_acc": 0.2787739365326194 + }, + { + "epoch": 1.8255643506303136, + "grad_norm": 0.3135540725941951, + "learning_rate": 0.0004976757880261589, + "loss": 3.1948225498199463, + "step": 3114, + "token_acc": 0.27814315253120386 + }, + { + "epoch": 1.8261506889475227, + "grad_norm": 0.3157399998244445, + "learning_rate": 0.0004976724905504417, + "loss": 3.209144353866577, + "step": 3115, + "token_acc": 0.2771497244097653 + }, + { + "epoch": 1.8267370272647319, + "grad_norm": 0.37116510360062466, + "learning_rate": 0.0004976691907481751, + "loss": 3.211979389190674, + "step": 3116, + "token_acc": 0.27812151386191797 + }, + { + "epoch": 1.8273233655819408, + "grad_norm": 0.33255809348491844, + "learning_rate": 0.0004976658886193903, + "loss": 3.221564292907715, + "step": 3117, + "token_acc": 0.27540251236204577 + }, + { + "epoch": 1.8279097038991499, + "grad_norm": 0.3577638023527115, + "learning_rate": 0.0004976625841641182, + "loss": 3.183070182800293, + "step": 3118, + "token_acc": 0.2795035128068625 + }, + { + "epoch": 1.8284960422163588, + "grad_norm": 0.32327820005131086, + "learning_rate": 0.0004976592773823899, + "loss": 3.1644632816314697, + "step": 3119, + "token_acc": 0.28111199613491245 + }, + { + "epoch": 1.8290823805335679, + "grad_norm": 0.31164091685798206, + "learning_rate": 0.0004976559682742362, + "loss": 3.2378828525543213, + "step": 3120, + "token_acc": 0.27225829698367626 + }, + { + "epoch": 1.829668718850777, + "grad_norm": 0.3598986517359024, + "learning_rate": 0.0004976526568396886, + "loss": 3.190952777862549, + "step": 3121, + "token_acc": 0.2796077458581615 + }, + { + "epoch": 1.830255057167986, + "grad_norm": 0.3565411530991297, + "learning_rate": 0.0004976493430787778, + "loss": 3.2498016357421875, + "step": 3122, + "token_acc": 0.2700043661766846 + }, + { + "epoch": 1.830841395485195, + "grad_norm": 0.34174331138259756, + "learning_rate": 0.0004976460269915353, + "loss": 3.2153537273406982, + "step": 3123, + "token_acc": 0.27680852064082406 + }, + { + "epoch": 1.8314277338024039, + "grad_norm": 0.3360686714554351, + "learning_rate": 0.0004976427085779921, + "loss": 3.138901710510254, + "step": 3124, + "token_acc": 0.2855392757387171 + }, + { + "epoch": 1.832014072119613, + "grad_norm": 0.3742577982378195, + "learning_rate": 0.0004976393878381793, + "loss": 3.2174904346466064, + "step": 3125, + "token_acc": 0.27462398884525874 + }, + { + "epoch": 1.832600410436822, + "grad_norm": 0.3871094526962964, + "learning_rate": 0.0004976360647721282, + "loss": 3.229483127593994, + "step": 3126, + "token_acc": 0.2748356186667926 + }, + { + "epoch": 1.8331867487540312, + "grad_norm": 0.3828959085048492, + "learning_rate": 0.0004976327393798699, + "loss": 3.2417168617248535, + "step": 3127, + "token_acc": 0.27355023194734757 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.3533370353116019, + "learning_rate": 0.0004976294116614357, + "loss": 3.26116943359375, + "step": 3128, + "token_acc": 0.26840127678014647 + }, + { + "epoch": 1.834359425388449, + "grad_norm": 0.3433605238772967, + "learning_rate": 0.0004976260816168569, + "loss": 3.19572377204895, + "step": 3129, + "token_acc": 0.27880398424143316 + }, + { + "epoch": 1.834945763705658, + "grad_norm": 0.36283551681434834, + "learning_rate": 0.0004976227492461648, + "loss": 3.1815576553344727, + "step": 3130, + "token_acc": 0.278300563063353 + }, + { + "epoch": 1.8355321020228672, + "grad_norm": 0.36029785390642, + "learning_rate": 0.0004976194145493905, + "loss": 3.2210793495178223, + "step": 3131, + "token_acc": 0.2748400469320961 + }, + { + "epoch": 1.8361184403400763, + "grad_norm": 0.329458886445964, + "learning_rate": 0.0004976160775265656, + "loss": 3.2189884185791016, + "step": 3132, + "token_acc": 0.2786865449084976 + }, + { + "epoch": 1.8367047786572852, + "grad_norm": 0.29352646232593516, + "learning_rate": 0.0004976127381777212, + "loss": 3.2372422218322754, + "step": 3133, + "token_acc": 0.2733540275459099 + }, + { + "epoch": 1.8372911169744943, + "grad_norm": 0.394873313097862, + "learning_rate": 0.0004976093965028889, + "loss": 3.2360429763793945, + "step": 3134, + "token_acc": 0.2746922700686788 + }, + { + "epoch": 1.8378774552917032, + "grad_norm": 0.345928833865739, + "learning_rate": 0.0004976060525020999, + "loss": 3.2294840812683105, + "step": 3135, + "token_acc": 0.2746953815746591 + }, + { + "epoch": 1.8384637936089123, + "grad_norm": 0.40877328884882363, + "learning_rate": 0.0004976027061753857, + "loss": 3.214244842529297, + "step": 3136, + "token_acc": 0.27532783852667925 + }, + { + "epoch": 1.8390501319261214, + "grad_norm": 0.39616700982536857, + "learning_rate": 0.0004975993575227777, + "loss": 3.2372498512268066, + "step": 3137, + "token_acc": 0.273489540568023 + }, + { + "epoch": 1.8396364702433305, + "grad_norm": 0.36848350483044756, + "learning_rate": 0.0004975960065443075, + "loss": 3.203279495239258, + "step": 3138, + "token_acc": 0.2778041495546429 + }, + { + "epoch": 1.8402228085605394, + "grad_norm": 0.336260306085189, + "learning_rate": 0.0004975926532400064, + "loss": 3.210517406463623, + "step": 3139, + "token_acc": 0.277127392560726 + }, + { + "epoch": 1.8408091468777483, + "grad_norm": 0.36663515411519815, + "learning_rate": 0.0004975892976099059, + "loss": 3.1943471431732178, + "step": 3140, + "token_acc": 0.28032813416145114 + }, + { + "epoch": 1.8413954851949574, + "grad_norm": 0.35390681764015564, + "learning_rate": 0.0004975859396540377, + "loss": 3.2373294830322266, + "step": 3141, + "token_acc": 0.27316110367576485 + }, + { + "epoch": 1.8419818235121665, + "grad_norm": 0.32382782853981074, + "learning_rate": 0.0004975825793724332, + "loss": 3.2416231632232666, + "step": 3142, + "token_acc": 0.27347788607452 + }, + { + "epoch": 1.8425681618293757, + "grad_norm": 0.3183885213508255, + "learning_rate": 0.0004975792167651238, + "loss": 3.177006244659424, + "step": 3143, + "token_acc": 0.28109130970125074 + }, + { + "epoch": 1.8431545001465846, + "grad_norm": 0.3500263407698211, + "learning_rate": 0.0004975758518321414, + "loss": 3.2232465744018555, + "step": 3144, + "token_acc": 0.27532448967165857 + }, + { + "epoch": 1.8437408384637937, + "grad_norm": 0.33021603173329733, + "learning_rate": 0.0004975724845735175, + "loss": 3.2225494384765625, + "step": 3145, + "token_acc": 0.2751031157309819 + }, + { + "epoch": 1.8443271767810026, + "grad_norm": 0.3278126409064888, + "learning_rate": 0.0004975691149892837, + "loss": 3.2059528827667236, + "step": 3146, + "token_acc": 0.27730124548344015 + }, + { + "epoch": 1.8449135150982117, + "grad_norm": 0.28326440291802535, + "learning_rate": 0.0004975657430794717, + "loss": 3.2390079498291016, + "step": 3147, + "token_acc": 0.2727793448265328 + }, + { + "epoch": 1.8454998534154208, + "grad_norm": 0.357141450133721, + "learning_rate": 0.0004975623688441131, + "loss": 3.197248935699463, + "step": 3148, + "token_acc": 0.27968242796062653 + }, + { + "epoch": 1.8460861917326299, + "grad_norm": 0.33195813811078523, + "learning_rate": 0.0004975589922832398, + "loss": 3.212507963180542, + "step": 3149, + "token_acc": 0.27692869693173255 + }, + { + "epoch": 1.8466725300498388, + "grad_norm": 0.3166989707665105, + "learning_rate": 0.0004975556133968832, + "loss": 3.1847949028015137, + "step": 3150, + "token_acc": 0.2811813914505676 + }, + { + "epoch": 1.8472588683670477, + "grad_norm": 0.2924435637348427, + "learning_rate": 0.0004975522321850752, + "loss": 3.214052438735962, + "step": 3151, + "token_acc": 0.2773872670273416 + }, + { + "epoch": 1.8478452066842568, + "grad_norm": 0.3119761471269352, + "learning_rate": 0.0004975488486478475, + "loss": 3.2143592834472656, + "step": 3152, + "token_acc": 0.27675271861595807 + }, + { + "epoch": 1.848431545001466, + "grad_norm": 0.3519916400630215, + "learning_rate": 0.0004975454627852321, + "loss": 3.23665714263916, + "step": 3153, + "token_acc": 0.27315646328321447 + }, + { + "epoch": 1.849017883318675, + "grad_norm": 0.41923734578491395, + "learning_rate": 0.0004975420745972606, + "loss": 3.219377040863037, + "step": 3154, + "token_acc": 0.27474468570948635 + }, + { + "epoch": 1.849604221635884, + "grad_norm": 0.3769761861283405, + "learning_rate": 0.0004975386840839648, + "loss": 3.2280220985412598, + "step": 3155, + "token_acc": 0.27319603624477634 + }, + { + "epoch": 1.8501905599530928, + "grad_norm": 0.3133650837281849, + "learning_rate": 0.0004975352912453766, + "loss": 3.2121810913085938, + "step": 3156, + "token_acc": 0.2747660690521095 + }, + { + "epoch": 1.850776898270302, + "grad_norm": 0.3138653934763373, + "learning_rate": 0.0004975318960815279, + "loss": 3.2204790115356445, + "step": 3157, + "token_acc": 0.27674664134870053 + }, + { + "epoch": 1.851363236587511, + "grad_norm": 0.3314318357157845, + "learning_rate": 0.0004975284985924508, + "loss": 3.187467098236084, + "step": 3158, + "token_acc": 0.27793669414669947 + }, + { + "epoch": 1.8519495749047201, + "grad_norm": 0.3029753657116342, + "learning_rate": 0.0004975250987781768, + "loss": 3.230639696121216, + "step": 3159, + "token_acc": 0.2733948934060486 + }, + { + "epoch": 1.852535913221929, + "grad_norm": 0.3524581173414202, + "learning_rate": 0.0004975216966387381, + "loss": 3.194685459136963, + "step": 3160, + "token_acc": 0.28047108193543413 + }, + { + "epoch": 1.8531222515391381, + "grad_norm": 0.3390248238269578, + "learning_rate": 0.0004975182921741667, + "loss": 3.239321708679199, + "step": 3161, + "token_acc": 0.27347459941002317 + }, + { + "epoch": 1.853708589856347, + "grad_norm": 0.3091468364062924, + "learning_rate": 0.0004975148853844944, + "loss": 3.18222713470459, + "step": 3162, + "token_acc": 0.2815449366550463 + }, + { + "epoch": 1.8542949281735561, + "grad_norm": 0.29815243038850664, + "learning_rate": 0.0004975114762697531, + "loss": 3.23945689201355, + "step": 3163, + "token_acc": 0.27493044969260483 + }, + { + "epoch": 1.8548812664907652, + "grad_norm": 0.3215585847225152, + "learning_rate": 0.0004975080648299753, + "loss": 3.2652814388275146, + "step": 3164, + "token_acc": 0.2698445759784858 + }, + { + "epoch": 1.8554676048079743, + "grad_norm": 0.34649803599119156, + "learning_rate": 0.0004975046510651926, + "loss": 3.2273406982421875, + "step": 3165, + "token_acc": 0.2739656338930178 + }, + { + "epoch": 1.8560539431251832, + "grad_norm": 0.31690377964329614, + "learning_rate": 0.0004975012349754372, + "loss": 3.184079170227051, + "step": 3166, + "token_acc": 0.2797838662513508 + }, + { + "epoch": 1.8566402814423921, + "grad_norm": 0.3347235605681407, + "learning_rate": 0.0004974978165607412, + "loss": 3.1979713439941406, + "step": 3167, + "token_acc": 0.2777996040057344 + }, + { + "epoch": 1.8572266197596012, + "grad_norm": 0.31187408164306907, + "learning_rate": 0.0004974943958211368, + "loss": 3.190084457397461, + "step": 3168, + "token_acc": 0.2802953650082883 + }, + { + "epoch": 1.8578129580768104, + "grad_norm": 0.30808743707477093, + "learning_rate": 0.000497490972756656, + "loss": 3.1951777935028076, + "step": 3169, + "token_acc": 0.27846077751488946 + }, + { + "epoch": 1.8583992963940195, + "grad_norm": 0.28781709449685455, + "learning_rate": 0.0004974875473673311, + "loss": 3.2095983028411865, + "step": 3170, + "token_acc": 0.2769773038972405 + }, + { + "epoch": 1.8589856347112284, + "grad_norm": 0.2667868718073247, + "learning_rate": 0.0004974841196531941, + "loss": 3.1729869842529297, + "step": 3171, + "token_acc": 0.28227593877273416 + }, + { + "epoch": 1.8595719730284375, + "grad_norm": 0.36169384867439086, + "learning_rate": 0.0004974806896142773, + "loss": 3.21612548828125, + "step": 3172, + "token_acc": 0.27613407537776086 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.3787721236086635, + "learning_rate": 0.0004974772572506129, + "loss": 3.2117819786071777, + "step": 3173, + "token_acc": 0.2760167685996841 + }, + { + "epoch": 1.8607446496628555, + "grad_norm": 0.36750435669555614, + "learning_rate": 0.0004974738225622332, + "loss": 3.2138185501098633, + "step": 3174, + "token_acc": 0.2761070628911411 + }, + { + "epoch": 1.8613309879800646, + "grad_norm": 0.44782320750606014, + "learning_rate": 0.0004974703855491704, + "loss": 3.2162137031555176, + "step": 3175, + "token_acc": 0.27624253149886846 + }, + { + "epoch": 1.8619173262972737, + "grad_norm": 0.3611677020076803, + "learning_rate": 0.0004974669462114567, + "loss": 3.1626176834106445, + "step": 3176, + "token_acc": 0.2838756976887305 + }, + { + "epoch": 1.8625036646144826, + "grad_norm": 0.34042634394152166, + "learning_rate": 0.0004974635045491246, + "loss": 3.243594169616699, + "step": 3177, + "token_acc": 0.27258156248384263 + }, + { + "epoch": 1.8630900029316915, + "grad_norm": 0.2742665316962135, + "learning_rate": 0.0004974600605622063, + "loss": 3.2122983932495117, + "step": 3178, + "token_acc": 0.27575242160903757 + }, + { + "epoch": 1.8636763412489006, + "grad_norm": 0.3877599185233168, + "learning_rate": 0.0004974566142507342, + "loss": 3.249803066253662, + "step": 3179, + "token_acc": 0.2721244068759397 + }, + { + "epoch": 1.8642626795661097, + "grad_norm": 0.40425426730030195, + "learning_rate": 0.0004974531656147406, + "loss": 3.194822311401367, + "step": 3180, + "token_acc": 0.2794471414467894 + }, + { + "epoch": 1.8648490178833188, + "grad_norm": 0.45105800018787545, + "learning_rate": 0.000497449714654258, + "loss": 3.2328460216522217, + "step": 3181, + "token_acc": 0.2743173524600478 + }, + { + "epoch": 1.8654353562005277, + "grad_norm": 0.31503863563968004, + "learning_rate": 0.0004974462613693189, + "loss": 3.232696056365967, + "step": 3182, + "token_acc": 0.274692856059457 + }, + { + "epoch": 1.8660216945177366, + "grad_norm": 0.33764223870327553, + "learning_rate": 0.0004974428057599555, + "loss": 3.212827205657959, + "step": 3183, + "token_acc": 0.2741012803053698 + }, + { + "epoch": 1.8666080328349457, + "grad_norm": 0.3808911403127586, + "learning_rate": 0.0004974393478262004, + "loss": 3.1923816204071045, + "step": 3184, + "token_acc": 0.27918689940539987 + }, + { + "epoch": 1.8671943711521548, + "grad_norm": 0.3344430633864189, + "learning_rate": 0.000497435887568086, + "loss": 3.2020010948181152, + "step": 3185, + "token_acc": 0.27797538781707565 + }, + { + "epoch": 1.867780709469364, + "grad_norm": 0.3690211481526404, + "learning_rate": 0.0004974324249856449, + "loss": 3.244870901107788, + "step": 3186, + "token_acc": 0.27205094433717375 + }, + { + "epoch": 1.8683670477865728, + "grad_norm": 0.37740308352187113, + "learning_rate": 0.0004974289600789096, + "loss": 3.2352359294891357, + "step": 3187, + "token_acc": 0.27430062305693204 + }, + { + "epoch": 1.868953386103782, + "grad_norm": 0.3459160998398935, + "learning_rate": 0.0004974254928479126, + "loss": 3.1817288398742676, + "step": 3188, + "token_acc": 0.2797147966916836 + }, + { + "epoch": 1.8695397244209908, + "grad_norm": 0.3007210912152274, + "learning_rate": 0.0004974220232926865, + "loss": 3.1853928565979004, + "step": 3189, + "token_acc": 0.27906011749844395 + }, + { + "epoch": 1.8701260627382, + "grad_norm": 0.3257078146011132, + "learning_rate": 0.0004974185514132639, + "loss": 3.2011704444885254, + "step": 3190, + "token_acc": 0.2757239764904288 + }, + { + "epoch": 1.870712401055409, + "grad_norm": 0.33274978817006384, + "learning_rate": 0.0004974150772096774, + "loss": 3.2101292610168457, + "step": 3191, + "token_acc": 0.27879314271188055 + }, + { + "epoch": 1.8712987393726181, + "grad_norm": 0.32871039262285545, + "learning_rate": 0.0004974116006819597, + "loss": 3.1871213912963867, + "step": 3192, + "token_acc": 0.2789420494135794 + }, + { + "epoch": 1.871885077689827, + "grad_norm": 0.2959164747183507, + "learning_rate": 0.0004974081218301434, + "loss": 3.2192976474761963, + "step": 3193, + "token_acc": 0.27461728446524036 + }, + { + "epoch": 1.872471416007036, + "grad_norm": 0.2838727741450811, + "learning_rate": 0.0004974046406542612, + "loss": 3.1955108642578125, + "step": 3194, + "token_acc": 0.27964859349599985 + }, + { + "epoch": 1.873057754324245, + "grad_norm": 0.2558016685443632, + "learning_rate": 0.0004974011571543456, + "loss": 3.1855757236480713, + "step": 3195, + "token_acc": 0.2788962641944155 + }, + { + "epoch": 1.8736440926414542, + "grad_norm": 0.27230629958789326, + "learning_rate": 0.0004973976713304297, + "loss": 3.18350887298584, + "step": 3196, + "token_acc": 0.27939244068743596 + }, + { + "epoch": 1.8742304309586633, + "grad_norm": 0.3165362476257985, + "learning_rate": 0.000497394183182546, + "loss": 3.2158000469207764, + "step": 3197, + "token_acc": 0.27685085548262134 + }, + { + "epoch": 1.8748167692758722, + "grad_norm": 0.33673595995617445, + "learning_rate": 0.0004973906927107273, + "loss": 3.213892936706543, + "step": 3198, + "token_acc": 0.2749061278843845 + }, + { + "epoch": 1.875403107593081, + "grad_norm": 0.34970305432716003, + "learning_rate": 0.0004973871999150065, + "loss": 3.1813442707061768, + "step": 3199, + "token_acc": 0.2808978977790095 + }, + { + "epoch": 1.8759894459102902, + "grad_norm": 0.4032671833699118, + "learning_rate": 0.0004973837047954162, + "loss": 3.203171968460083, + "step": 3200, + "token_acc": 0.27973987536124834 + }, + { + "epoch": 1.8765757842274993, + "grad_norm": 0.3873400966019496, + "learning_rate": 0.0004973802073519894, + "loss": 3.1737403869628906, + "step": 3201, + "token_acc": 0.28056242474461723 + }, + { + "epoch": 1.8771621225447084, + "grad_norm": 0.37122743823033044, + "learning_rate": 0.0004973767075847588, + "loss": 3.1454849243164062, + "step": 3202, + "token_acc": 0.2847357167988308 + }, + { + "epoch": 1.8777484608619175, + "grad_norm": 0.3255156860130465, + "learning_rate": 0.0004973732054937575, + "loss": 3.223222494125366, + "step": 3203, + "token_acc": 0.2750273043872483 + }, + { + "epoch": 1.8783347991791264, + "grad_norm": 0.35948473187109026, + "learning_rate": 0.0004973697010790182, + "loss": 3.2170510292053223, + "step": 3204, + "token_acc": 0.277822365923446 + }, + { + "epoch": 1.8789211374963353, + "grad_norm": 0.329513443380268, + "learning_rate": 0.000497366194340574, + "loss": 3.220018148422241, + "step": 3205, + "token_acc": 0.27413369127231924 + }, + { + "epoch": 1.8795074758135444, + "grad_norm": 0.3450147472311175, + "learning_rate": 0.0004973626852784577, + "loss": 3.2421979904174805, + "step": 3206, + "token_acc": 0.2731162222133642 + }, + { + "epoch": 1.8800938141307535, + "grad_norm": 0.31337793915466255, + "learning_rate": 0.0004973591738927022, + "loss": 3.1819725036621094, + "step": 3207, + "token_acc": 0.2789770562973863 + }, + { + "epoch": 1.8806801524479626, + "grad_norm": 0.34821985399251515, + "learning_rate": 0.0004973556601833406, + "loss": 3.2090983390808105, + "step": 3208, + "token_acc": 0.2780263876633472 + }, + { + "epoch": 1.8812664907651715, + "grad_norm": 0.29332944312052356, + "learning_rate": 0.000497352144150406, + "loss": 3.2027201652526855, + "step": 3209, + "token_acc": 0.27690631062721943 + }, + { + "epoch": 1.8818528290823804, + "grad_norm": 0.30646524117442714, + "learning_rate": 0.0004973486257939311, + "loss": 3.225149631500244, + "step": 3210, + "token_acc": 0.2753865841661483 + }, + { + "epoch": 1.8824391673995895, + "grad_norm": 0.32700920072564643, + "learning_rate": 0.0004973451051139494, + "loss": 3.206990957260132, + "step": 3211, + "token_acc": 0.2763532188754615 + }, + { + "epoch": 1.8830255057167986, + "grad_norm": 0.3401542223780962, + "learning_rate": 0.0004973415821104936, + "loss": 3.228464126586914, + "step": 3212, + "token_acc": 0.2735100734284262 + }, + { + "epoch": 1.8836118440340077, + "grad_norm": 0.37492279320205024, + "learning_rate": 0.000497338056783597, + "loss": 3.226731300354004, + "step": 3213, + "token_acc": 0.2744836233716495 + }, + { + "epoch": 1.8841981823512166, + "grad_norm": 0.42391757459530943, + "learning_rate": 0.0004973345291332927, + "loss": 3.204246997833252, + "step": 3214, + "token_acc": 0.27811981942921266 + }, + { + "epoch": 1.8847845206684257, + "grad_norm": 0.36725681907257646, + "learning_rate": 0.0004973309991596137, + "loss": 3.256497383117676, + "step": 3215, + "token_acc": 0.2720441001034257 + }, + { + "epoch": 1.8853708589856346, + "grad_norm": 0.33358954202990587, + "learning_rate": 0.0004973274668625932, + "loss": 3.202805757522583, + "step": 3216, + "token_acc": 0.27606244402833185 + }, + { + "epoch": 1.8859571973028437, + "grad_norm": 0.39563119239048544, + "learning_rate": 0.0004973239322422645, + "loss": 3.2335045337677, + "step": 3217, + "token_acc": 0.27338657546312156 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.34174037428925336, + "learning_rate": 0.0004973203952986608, + "loss": 3.2048444747924805, + "step": 3218, + "token_acc": 0.27848251208899394 + }, + { + "epoch": 1.887129873937262, + "grad_norm": 0.3648846754232908, + "learning_rate": 0.0004973168560318151, + "loss": 3.245723247528076, + "step": 3219, + "token_acc": 0.27322401558691584 + }, + { + "epoch": 1.8877162122544708, + "grad_norm": 0.3434491445171286, + "learning_rate": 0.0004973133144417609, + "loss": 3.218109130859375, + "step": 3220, + "token_acc": 0.2754695998650757 + }, + { + "epoch": 1.8883025505716797, + "grad_norm": 0.378815037406825, + "learning_rate": 0.0004973097705285313, + "loss": 3.229806900024414, + "step": 3221, + "token_acc": 0.2746115091655921 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.35516890329298767, + "learning_rate": 0.0004973062242921598, + "loss": 3.2406787872314453, + "step": 3222, + "token_acc": 0.2732929266052464 + }, + { + "epoch": 1.889475227206098, + "grad_norm": 0.39570064936772026, + "learning_rate": 0.0004973026757326794, + "loss": 3.237842559814453, + "step": 3223, + "token_acc": 0.27185950679286586 + }, + { + "epoch": 1.890061565523307, + "grad_norm": 0.3752993251096688, + "learning_rate": 0.0004972991248501237, + "loss": 3.2135162353515625, + "step": 3224, + "token_acc": 0.2756776768790754 + }, + { + "epoch": 1.890647903840516, + "grad_norm": 0.3156212542143818, + "learning_rate": 0.000497295571644526, + "loss": 3.210658073425293, + "step": 3225, + "token_acc": 0.27758197703775866 + }, + { + "epoch": 1.8912342421577248, + "grad_norm": 0.3732560978121744, + "learning_rate": 0.0004972920161159196, + "loss": 3.2549266815185547, + "step": 3226, + "token_acc": 0.26956761110488053 + }, + { + "epoch": 1.891820580474934, + "grad_norm": 0.36570661407445226, + "learning_rate": 0.0004972884582643379, + "loss": 3.251070976257324, + "step": 3227, + "token_acc": 0.2708862146418071 + }, + { + "epoch": 1.892406918792143, + "grad_norm": 0.34338731470130374, + "learning_rate": 0.0004972848980898144, + "loss": 3.188694477081299, + "step": 3228, + "token_acc": 0.27937241484961445 + }, + { + "epoch": 1.8929932571093522, + "grad_norm": 0.33420559243857445, + "learning_rate": 0.0004972813355923825, + "loss": 3.1797680854797363, + "step": 3229, + "token_acc": 0.28137867357640156 + }, + { + "epoch": 1.8935795954265613, + "grad_norm": 0.2910719112880023, + "learning_rate": 0.0004972777707720756, + "loss": 3.139526844024658, + "step": 3230, + "token_acc": 0.2837067861715749 + }, + { + "epoch": 1.8941659337437702, + "grad_norm": 0.3307947480190128, + "learning_rate": 0.0004972742036289273, + "loss": 3.2095818519592285, + "step": 3231, + "token_acc": 0.27667477021534936 + }, + { + "epoch": 1.894752272060979, + "grad_norm": 0.32942657068370057, + "learning_rate": 0.000497270634162971, + "loss": 3.247345447540283, + "step": 3232, + "token_acc": 0.2726510367961248 + }, + { + "epoch": 1.8953386103781882, + "grad_norm": 0.3419887301613843, + "learning_rate": 0.0004972670623742405, + "loss": 3.243971347808838, + "step": 3233, + "token_acc": 0.27222487690357966 + }, + { + "epoch": 1.8959249486953973, + "grad_norm": 0.3217314069421129, + "learning_rate": 0.0004972634882627689, + "loss": 3.2153892517089844, + "step": 3234, + "token_acc": 0.2768847131248331 + }, + { + "epoch": 1.8965112870126064, + "grad_norm": 0.32432811028489134, + "learning_rate": 0.0004972599118285902, + "loss": 3.1712770462036133, + "step": 3235, + "token_acc": 0.28314001978895237 + }, + { + "epoch": 1.8970976253298153, + "grad_norm": 0.30291471600971015, + "learning_rate": 0.0004972563330717377, + "loss": 3.21274471282959, + "step": 3236, + "token_acc": 0.27548518686390044 + }, + { + "epoch": 1.8976839636470242, + "grad_norm": 0.2946931355686893, + "learning_rate": 0.0004972527519922452, + "loss": 3.2140865325927734, + "step": 3237, + "token_acc": 0.2753327224800286 + }, + { + "epoch": 1.8982703019642333, + "grad_norm": 0.31330103454974073, + "learning_rate": 0.0004972491685901462, + "loss": 3.201774835586548, + "step": 3238, + "token_acc": 0.2768683562792459 + }, + { + "epoch": 1.8988566402814424, + "grad_norm": 0.29552366313157014, + "learning_rate": 0.0004972455828654745, + "loss": 3.167361259460449, + "step": 3239, + "token_acc": 0.2820283043231238 + }, + { + "epoch": 1.8994429785986515, + "grad_norm": 0.33191923339303425, + "learning_rate": 0.0004972419948182637, + "loss": 3.2070162296295166, + "step": 3240, + "token_acc": 0.27776553576381957 + }, + { + "epoch": 1.9000293169158604, + "grad_norm": 0.3490752267672838, + "learning_rate": 0.0004972384044485475, + "loss": 3.1854982376098633, + "step": 3241, + "token_acc": 0.2796276728491098 + }, + { + "epoch": 1.9006156552330695, + "grad_norm": 0.3036467827456306, + "learning_rate": 0.0004972348117563598, + "loss": 3.1936397552490234, + "step": 3242, + "token_acc": 0.27794686407676666 + }, + { + "epoch": 1.9012019935502784, + "grad_norm": 0.35404544231620155, + "learning_rate": 0.000497231216741734, + "loss": 3.2193100452423096, + "step": 3243, + "token_acc": 0.27459732401568093 + }, + { + "epoch": 1.9017883318674875, + "grad_norm": 0.34335495681318157, + "learning_rate": 0.0004972276194047041, + "loss": 3.215498447418213, + "step": 3244, + "token_acc": 0.27312107941768254 + }, + { + "epoch": 1.9023746701846966, + "grad_norm": 0.29721588043420866, + "learning_rate": 0.0004972240197453039, + "loss": 3.2404961585998535, + "step": 3245, + "token_acc": 0.2731248089447533 + }, + { + "epoch": 1.9029610085019057, + "grad_norm": 0.36648993350705167, + "learning_rate": 0.0004972204177635672, + "loss": 3.181131362915039, + "step": 3246, + "token_acc": 0.2821712364766063 + }, + { + "epoch": 1.9035473468191146, + "grad_norm": 0.3764259706821276, + "learning_rate": 0.0004972168134595277, + "loss": 3.1723313331604004, + "step": 3247, + "token_acc": 0.27984802577305284 + }, + { + "epoch": 1.9041336851363235, + "grad_norm": 0.39420930892712136, + "learning_rate": 0.0004972132068332194, + "loss": 3.2050790786743164, + "step": 3248, + "token_acc": 0.2761245563603077 + }, + { + "epoch": 1.9047200234535326, + "grad_norm": 0.3217579056309557, + "learning_rate": 0.0004972095978846763, + "loss": 3.2331573963165283, + "step": 3249, + "token_acc": 0.272179769959927 + }, + { + "epoch": 1.9053063617707418, + "grad_norm": 0.31963980779116213, + "learning_rate": 0.0004972059866139321, + "loss": 3.2373714447021484, + "step": 3250, + "token_acc": 0.2723248508751326 + }, + { + "epoch": 1.9058927000879509, + "grad_norm": 0.332174970218064, + "learning_rate": 0.0004972023730210206, + "loss": 3.166506767272949, + "step": 3251, + "token_acc": 0.28296525791759347 + }, + { + "epoch": 1.9064790384051598, + "grad_norm": 0.33054715221277986, + "learning_rate": 0.000497198757105976, + "loss": 3.251945734024048, + "step": 3252, + "token_acc": 0.2723264506296041 + }, + { + "epoch": 1.9070653767223686, + "grad_norm": 0.3239524684449019, + "learning_rate": 0.0004971951388688323, + "loss": 3.227743625640869, + "step": 3253, + "token_acc": 0.27277543871380916 + }, + { + "epoch": 1.9076517150395778, + "grad_norm": 0.3758479253861497, + "learning_rate": 0.0004971915183096232, + "loss": 3.2090821266174316, + "step": 3254, + "token_acc": 0.27621573462072396 + }, + { + "epoch": 1.9082380533567869, + "grad_norm": 0.37812488498080704, + "learning_rate": 0.000497187895428383, + "loss": 3.1961262226104736, + "step": 3255, + "token_acc": 0.2774504498835966 + }, + { + "epoch": 1.908824391673996, + "grad_norm": 0.3202552281551747, + "learning_rate": 0.0004971842702251456, + "loss": 3.231288433074951, + "step": 3256, + "token_acc": 0.27512210851270774 + }, + { + "epoch": 1.909410729991205, + "grad_norm": 0.3131574933199557, + "learning_rate": 0.000497180642699945, + "loss": 3.2010931968688965, + "step": 3257, + "token_acc": 0.2771954840014046 + }, + { + "epoch": 1.909997068308414, + "grad_norm": 0.31415192537406683, + "learning_rate": 0.0004971770128528154, + "loss": 3.158799171447754, + "step": 3258, + "token_acc": 0.28352145574387433 + }, + { + "epoch": 1.9105834066256229, + "grad_norm": 0.3324151332274796, + "learning_rate": 0.0004971733806837906, + "loss": 3.1905250549316406, + "step": 3259, + "token_acc": 0.27821866867888645 + }, + { + "epoch": 1.911169744942832, + "grad_norm": 0.3606384351245888, + "learning_rate": 0.0004971697461929053, + "loss": 3.1511802673339844, + "step": 3260, + "token_acc": 0.28438557747603205 + }, + { + "epoch": 1.911756083260041, + "grad_norm": 0.40778017979110603, + "learning_rate": 0.0004971661093801932, + "loss": 3.2007057666778564, + "step": 3261, + "token_acc": 0.27763172965219335 + }, + { + "epoch": 1.9123424215772502, + "grad_norm": 0.3793518726382328, + "learning_rate": 0.0004971624702456885, + "loss": 3.2358381748199463, + "step": 3262, + "token_acc": 0.2739803409102742 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.3489883936747871, + "learning_rate": 0.0004971588287894255, + "loss": 3.1804826259613037, + "step": 3263, + "token_acc": 0.2797250123717683 + }, + { + "epoch": 1.913515098211668, + "grad_norm": 0.3549713026391409, + "learning_rate": 0.0004971551850114383, + "loss": 3.1808762550354004, + "step": 3264, + "token_acc": 0.2816051259716073 + }, + { + "epoch": 1.914101436528877, + "grad_norm": 0.29182578457399544, + "learning_rate": 0.0004971515389117613, + "loss": 3.1633729934692383, + "step": 3265, + "token_acc": 0.2836968366179136 + }, + { + "epoch": 1.9146877748460862, + "grad_norm": 0.379642310327676, + "learning_rate": 0.0004971478904904285, + "loss": 3.217869520187378, + "step": 3266, + "token_acc": 0.274332662782335 + }, + { + "epoch": 1.9152741131632953, + "grad_norm": 0.3581704996924199, + "learning_rate": 0.0004971442397474744, + "loss": 3.216370105743408, + "step": 3267, + "token_acc": 0.2767854025498269 + }, + { + "epoch": 1.9158604514805042, + "grad_norm": 0.3671525794093656, + "learning_rate": 0.0004971405866829331, + "loss": 3.2096691131591797, + "step": 3268, + "token_acc": 0.27631683884242597 + }, + { + "epoch": 1.9164467897977133, + "grad_norm": 0.35404654971841504, + "learning_rate": 0.000497136931296839, + "loss": 3.2011871337890625, + "step": 3269, + "token_acc": 0.2787005107453266 + }, + { + "epoch": 1.9170331281149222, + "grad_norm": 0.35297867377126746, + "learning_rate": 0.0004971332735892265, + "loss": 3.1925864219665527, + "step": 3270, + "token_acc": 0.2781865039170182 + }, + { + "epoch": 1.9176194664321313, + "grad_norm": 0.3049843173725241, + "learning_rate": 0.00049712961356013, + "loss": 3.219371795654297, + "step": 3271, + "token_acc": 0.2756746343723805 + }, + { + "epoch": 1.9182058047493404, + "grad_norm": 0.3311590804879516, + "learning_rate": 0.0004971259512095837, + "loss": 3.1935601234436035, + "step": 3272, + "token_acc": 0.279451019425098 + }, + { + "epoch": 1.9187921430665495, + "grad_norm": 0.28150351331393936, + "learning_rate": 0.0004971222865376221, + "loss": 3.171679973602295, + "step": 3273, + "token_acc": 0.2822164027146326 + }, + { + "epoch": 1.9193784813837584, + "grad_norm": 0.3163953727918806, + "learning_rate": 0.0004971186195442797, + "loss": 3.163278102874756, + "step": 3274, + "token_acc": 0.28308570541157824 + }, + { + "epoch": 1.9199648197009673, + "grad_norm": 0.3270020703602421, + "learning_rate": 0.0004971149502295908, + "loss": 3.1697912216186523, + "step": 3275, + "token_acc": 0.2813894409856133 + }, + { + "epoch": 1.9205511580181764, + "grad_norm": 0.3405200705441147, + "learning_rate": 0.00049711127859359, + "loss": 3.2281041145324707, + "step": 3276, + "token_acc": 0.2749717873218061 + }, + { + "epoch": 1.9211374963353856, + "grad_norm": 0.2852595111879431, + "learning_rate": 0.0004971076046363115, + "loss": 3.1619327068328857, + "step": 3277, + "token_acc": 0.28285992794657794 + }, + { + "epoch": 1.9217238346525947, + "grad_norm": 0.34683569218713, + "learning_rate": 0.0004971039283577903, + "loss": 3.205418825149536, + "step": 3278, + "token_acc": 0.27552238655597644 + }, + { + "epoch": 1.9223101729698036, + "grad_norm": 0.31921621619627655, + "learning_rate": 0.0004971002497580606, + "loss": 3.2118849754333496, + "step": 3279, + "token_acc": 0.2751948619452659 + }, + { + "epoch": 1.9228965112870124, + "grad_norm": 0.3367666468311743, + "learning_rate": 0.0004970965688371569, + "loss": 3.211894989013672, + "step": 3280, + "token_acc": 0.2783318812867019 + }, + { + "epoch": 1.9234828496042216, + "grad_norm": 0.3469526790234574, + "learning_rate": 0.000497092885595114, + "loss": 3.200273036956787, + "step": 3281, + "token_acc": 0.2763291883782334 + }, + { + "epoch": 1.9240691879214307, + "grad_norm": 0.4603988924848009, + "learning_rate": 0.0004970892000319664, + "loss": 3.2314515113830566, + "step": 3282, + "token_acc": 0.27475861327021867 + }, + { + "epoch": 1.9246555262386398, + "grad_norm": 0.41801360494864964, + "learning_rate": 0.0004970855121477488, + "loss": 3.1878371238708496, + "step": 3283, + "token_acc": 0.2778261631519413 + }, + { + "epoch": 1.9252418645558487, + "grad_norm": 0.4123696258893851, + "learning_rate": 0.0004970818219424956, + "loss": 3.2267203330993652, + "step": 3284, + "token_acc": 0.27354621105887245 + }, + { + "epoch": 1.9258282028730578, + "grad_norm": 0.36929878589833315, + "learning_rate": 0.0004970781294162418, + "loss": 3.223634719848633, + "step": 3285, + "token_acc": 0.27553608362532367 + }, + { + "epoch": 1.9264145411902667, + "grad_norm": 0.3276860672651484, + "learning_rate": 0.0004970744345690218, + "loss": 3.204263925552368, + "step": 3286, + "token_acc": 0.2779547949146943 + }, + { + "epoch": 1.9270008795074758, + "grad_norm": 0.33064543396397356, + "learning_rate": 0.0004970707374008704, + "loss": 3.1739344596862793, + "step": 3287, + "token_acc": 0.2817995437613699 + }, + { + "epoch": 1.927587217824685, + "grad_norm": 0.3144165707376213, + "learning_rate": 0.0004970670379118224, + "loss": 3.17598295211792, + "step": 3288, + "token_acc": 0.28031299113517066 + }, + { + "epoch": 1.928173556141894, + "grad_norm": 0.36270198804222953, + "learning_rate": 0.0004970633361019125, + "loss": 3.147336959838867, + "step": 3289, + "token_acc": 0.28549360131388524 + }, + { + "epoch": 1.928759894459103, + "grad_norm": 0.3173174332649723, + "learning_rate": 0.0004970596319711756, + "loss": 3.2429709434509277, + "step": 3290, + "token_acc": 0.26938059397884456 + }, + { + "epoch": 1.9293462327763118, + "grad_norm": 0.33399134891494087, + "learning_rate": 0.0004970559255196462, + "loss": 3.226123809814453, + "step": 3291, + "token_acc": 0.27464143085792425 + }, + { + "epoch": 1.929932571093521, + "grad_norm": 0.38467950970576453, + "learning_rate": 0.0004970522167473593, + "loss": 3.222080707550049, + "step": 3292, + "token_acc": 0.27545704595548154 + }, + { + "epoch": 1.93051890941073, + "grad_norm": 0.41005760985873096, + "learning_rate": 0.0004970485056543498, + "loss": 3.2169852256774902, + "step": 3293, + "token_acc": 0.27510152060206133 + }, + { + "epoch": 1.9311052477279391, + "grad_norm": 0.35266939899059435, + "learning_rate": 0.0004970447922406525, + "loss": 3.2392590045928955, + "step": 3294, + "token_acc": 0.2719471094313618 + }, + { + "epoch": 1.931691586045148, + "grad_norm": 0.28299581643499594, + "learning_rate": 0.0004970410765063023, + "loss": 3.214961528778076, + "step": 3295, + "token_acc": 0.27615661402778224 + }, + { + "epoch": 1.9322779243623571, + "grad_norm": 0.29812639815144615, + "learning_rate": 0.000497037358451334, + "loss": 3.214001178741455, + "step": 3296, + "token_acc": 0.27571924847141904 + }, + { + "epoch": 1.932864262679566, + "grad_norm": 0.3382327657027396, + "learning_rate": 0.0004970336380757827, + "loss": 3.2022149562835693, + "step": 3297, + "token_acc": 0.27966875430475246 + }, + { + "epoch": 1.9334506009967751, + "grad_norm": 0.3066979313106334, + "learning_rate": 0.0004970299153796831, + "loss": 3.202234983444214, + "step": 3298, + "token_acc": 0.27683720487080615 + }, + { + "epoch": 1.9340369393139842, + "grad_norm": 0.3166979973689334, + "learning_rate": 0.0004970261903630705, + "loss": 3.183774948120117, + "step": 3299, + "token_acc": 0.2801360276429165 + }, + { + "epoch": 1.9346232776311933, + "grad_norm": 0.3361129313449817, + "learning_rate": 0.0004970224630259796, + "loss": 3.192099094390869, + "step": 3300, + "token_acc": 0.2789783229709293 + }, + { + "epoch": 1.9352096159484022, + "grad_norm": 0.30155742096847576, + "learning_rate": 0.0004970187333684455, + "loss": 3.2279253005981445, + "step": 3301, + "token_acc": 0.27584519160424115 + }, + { + "epoch": 1.9357959542656111, + "grad_norm": 0.28380554924379486, + "learning_rate": 0.0004970150013905033, + "loss": 3.1881918907165527, + "step": 3302, + "token_acc": 0.2798804727728716 + }, + { + "epoch": 1.9363822925828202, + "grad_norm": 0.29065498707486437, + "learning_rate": 0.0004970112670921881, + "loss": 3.2109642028808594, + "step": 3303, + "token_acc": 0.2763192083954205 + }, + { + "epoch": 1.9369686309000294, + "grad_norm": 0.29055430772029495, + "learning_rate": 0.0004970075304735348, + "loss": 3.197960615158081, + "step": 3304, + "token_acc": 0.2802554975915564 + }, + { + "epoch": 1.9375549692172385, + "grad_norm": 0.2897630141462624, + "learning_rate": 0.0004970037915345786, + "loss": 3.2007431983947754, + "step": 3305, + "token_acc": 0.2771131116549829 + }, + { + "epoch": 1.9381413075344474, + "grad_norm": 0.32785179104180306, + "learning_rate": 0.0004970000502753547, + "loss": 3.197744846343994, + "step": 3306, + "token_acc": 0.2782602174340302 + }, + { + "epoch": 1.9387276458516562, + "grad_norm": 0.31800592376082026, + "learning_rate": 0.0004969963066958982, + "loss": 3.198416233062744, + "step": 3307, + "token_acc": 0.27837627386155384 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.28903748322924483, + "learning_rate": 0.0004969925607962441, + "loss": 3.223043918609619, + "step": 3308, + "token_acc": 0.273139188638252 + }, + { + "epoch": 1.9399003224860745, + "grad_norm": 0.3052801998129629, + "learning_rate": 0.0004969888125764277, + "loss": 3.238306999206543, + "step": 3309, + "token_acc": 0.27303367188459016 + }, + { + "epoch": 1.9404866608032836, + "grad_norm": 0.33452790440258146, + "learning_rate": 0.0004969850620364843, + "loss": 3.208237409591675, + "step": 3310, + "token_acc": 0.2768305229020988 + }, + { + "epoch": 1.9410729991204925, + "grad_norm": 0.36812283946652274, + "learning_rate": 0.0004969813091764491, + "loss": 3.1692399978637695, + "step": 3311, + "token_acc": 0.2813889330998992 + }, + { + "epoch": 1.9416593374377016, + "grad_norm": 0.33720104522161176, + "learning_rate": 0.0004969775539963572, + "loss": 3.2040629386901855, + "step": 3312, + "token_acc": 0.2767496828470321 + }, + { + "epoch": 1.9422456757549105, + "grad_norm": 0.31947603938939084, + "learning_rate": 0.0004969737964962441, + "loss": 3.2441258430480957, + "step": 3313, + "token_acc": 0.2727668801923972 + }, + { + "epoch": 1.9428320140721196, + "grad_norm": 0.3509928067453811, + "learning_rate": 0.0004969700366761449, + "loss": 3.2401251792907715, + "step": 3314, + "token_acc": 0.272113694827191 + }, + { + "epoch": 1.9434183523893287, + "grad_norm": 0.34754609259944846, + "learning_rate": 0.000496966274536095, + "loss": 3.2266173362731934, + "step": 3315, + "token_acc": 0.2746948820722852 + }, + { + "epoch": 1.9440046907065378, + "grad_norm": 0.3024852444176208, + "learning_rate": 0.0004969625100761298, + "loss": 3.191174030303955, + "step": 3316, + "token_acc": 0.2775932322374211 + }, + { + "epoch": 1.9445910290237467, + "grad_norm": 0.3244930690870069, + "learning_rate": 0.0004969587432962846, + "loss": 3.216667652130127, + "step": 3317, + "token_acc": 0.2730635711031872 + }, + { + "epoch": 1.9451773673409556, + "grad_norm": 0.30955125621802587, + "learning_rate": 0.0004969549741965948, + "loss": 3.219914197921753, + "step": 3318, + "token_acc": 0.27517256853247457 + }, + { + "epoch": 1.9457637056581647, + "grad_norm": 0.2979258843778903, + "learning_rate": 0.0004969512027770957, + "loss": 3.1957848072052, + "step": 3319, + "token_acc": 0.27913488014859883 + }, + { + "epoch": 1.9463500439753738, + "grad_norm": 0.32516634886121065, + "learning_rate": 0.0004969474290378228, + "loss": 3.207803726196289, + "step": 3320, + "token_acc": 0.27683386875324917 + }, + { + "epoch": 1.946936382292583, + "grad_norm": 0.36500612874311833, + "learning_rate": 0.0004969436529788118, + "loss": 3.145071029663086, + "step": 3321, + "token_acc": 0.28426602376777715 + }, + { + "epoch": 1.9475227206097918, + "grad_norm": 0.32889123234054396, + "learning_rate": 0.0004969398746000977, + "loss": 3.2044808864593506, + "step": 3322, + "token_acc": 0.2760641620620226 + }, + { + "epoch": 1.948109058927001, + "grad_norm": 0.2833699533231078, + "learning_rate": 0.0004969360939017164, + "loss": 3.1625518798828125, + "step": 3323, + "token_acc": 0.2824332458218608 + }, + { + "epoch": 1.9486953972442098, + "grad_norm": 0.34591906653089916, + "learning_rate": 0.0004969323108837031, + "loss": 3.2540786266326904, + "step": 3324, + "token_acc": 0.26991830951791557 + }, + { + "epoch": 1.949281735561419, + "grad_norm": 0.36505975183001654, + "learning_rate": 0.0004969285255460936, + "loss": 3.154824733734131, + "step": 3325, + "token_acc": 0.2851618913228015 + }, + { + "epoch": 1.949868073878628, + "grad_norm": 0.39321431884086505, + "learning_rate": 0.0004969247378889232, + "loss": 3.224813938140869, + "step": 3326, + "token_acc": 0.2738829935206467 + }, + { + "epoch": 1.9504544121958372, + "grad_norm": 0.3846119354312488, + "learning_rate": 0.0004969209479122277, + "loss": 3.1737301349639893, + "step": 3327, + "token_acc": 0.2799449518989482 + }, + { + "epoch": 1.951040750513046, + "grad_norm": 0.3861292516327503, + "learning_rate": 0.0004969171556160428, + "loss": 3.2148985862731934, + "step": 3328, + "token_acc": 0.27699554294975687 + }, + { + "epoch": 1.951627088830255, + "grad_norm": 0.36739579869917127, + "learning_rate": 0.0004969133610004037, + "loss": 3.2122507095336914, + "step": 3329, + "token_acc": 0.27743145424320853 + }, + { + "epoch": 1.952213427147464, + "grad_norm": 0.38452749301211436, + "learning_rate": 0.0004969095640653464, + "loss": 3.222891330718994, + "step": 3330, + "token_acc": 0.27428287128793627 + }, + { + "epoch": 1.9527997654646732, + "grad_norm": 0.37707284398612584, + "learning_rate": 0.0004969057648109064, + "loss": 3.214402675628662, + "step": 3331, + "token_acc": 0.27536929744171235 + }, + { + "epoch": 1.9533861037818823, + "grad_norm": 0.3179865726349459, + "learning_rate": 0.0004969019632371195, + "loss": 3.1555724143981934, + "step": 3332, + "token_acc": 0.28410300461242555 + }, + { + "epoch": 1.9539724420990912, + "grad_norm": 0.38914224092933397, + "learning_rate": 0.0004968981593440213, + "loss": 3.1918838024139404, + "step": 3333, + "token_acc": 0.27796196951466207 + }, + { + "epoch": 1.9545587804163, + "grad_norm": 0.32347663111405445, + "learning_rate": 0.0004968943531316477, + "loss": 3.2015063762664795, + "step": 3334, + "token_acc": 0.27894223645653793 + }, + { + "epoch": 1.9551451187335092, + "grad_norm": 0.34641286638790986, + "learning_rate": 0.0004968905446000344, + "loss": 3.2281012535095215, + "step": 3335, + "token_acc": 0.27331931470612547 + }, + { + "epoch": 1.9557314570507183, + "grad_norm": 0.327647333238988, + "learning_rate": 0.000496886733749217, + "loss": 3.228736639022827, + "step": 3336, + "token_acc": 0.2746170874333022 + }, + { + "epoch": 1.9563177953679274, + "grad_norm": 0.2810965615304208, + "learning_rate": 0.0004968829205792314, + "loss": 3.227494716644287, + "step": 3337, + "token_acc": 0.2731039494735081 + }, + { + "epoch": 1.9569041336851363, + "grad_norm": 0.35703498332410666, + "learning_rate": 0.0004968791050901135, + "loss": 3.213259696960449, + "step": 3338, + "token_acc": 0.276112208738764 + }, + { + "epoch": 1.9574904720023454, + "grad_norm": 0.34176390576775517, + "learning_rate": 0.000496875287281899, + "loss": 3.1860976219177246, + "step": 3339, + "token_acc": 0.27833402976439203 + }, + { + "epoch": 1.9580768103195543, + "grad_norm": 0.29346855712087233, + "learning_rate": 0.000496871467154624, + "loss": 3.2253170013427734, + "step": 3340, + "token_acc": 0.2725922176526416 + }, + { + "epoch": 1.9586631486367634, + "grad_norm": 0.317951599455007, + "learning_rate": 0.0004968676447083242, + "loss": 3.226778507232666, + "step": 3341, + "token_acc": 0.27360041738470536 + }, + { + "epoch": 1.9592494869539725, + "grad_norm": 0.34047863011127416, + "learning_rate": 0.0004968638199430354, + "loss": 3.243438959121704, + "step": 3342, + "token_acc": 0.2726002703920685 + }, + { + "epoch": 1.9598358252711816, + "grad_norm": 0.3634988281518864, + "learning_rate": 0.0004968599928587937, + "loss": 3.217463731765747, + "step": 3343, + "token_acc": 0.2742538514354375 + }, + { + "epoch": 1.9604221635883905, + "grad_norm": 0.4147377498027789, + "learning_rate": 0.000496856163455635, + "loss": 3.2335925102233887, + "step": 3344, + "token_acc": 0.27256705460309516 + }, + { + "epoch": 1.9610085019055994, + "grad_norm": 0.355690117363527, + "learning_rate": 0.0004968523317335954, + "loss": 3.230142831802368, + "step": 3345, + "token_acc": 0.27425719500284107 + }, + { + "epoch": 1.9615948402228085, + "grad_norm": 0.31344275682623896, + "learning_rate": 0.0004968484976927108, + "loss": 3.2032601833343506, + "step": 3346, + "token_acc": 0.2756693037145701 + }, + { + "epoch": 1.9621811785400176, + "grad_norm": 0.32633238479715965, + "learning_rate": 0.0004968446613330171, + "loss": 3.210252285003662, + "step": 3347, + "token_acc": 0.2768882879891206 + }, + { + "epoch": 1.9627675168572267, + "grad_norm": 0.3030159048685932, + "learning_rate": 0.0004968408226545504, + "loss": 3.1647582054138184, + "step": 3348, + "token_acc": 0.2814034808869109 + }, + { + "epoch": 1.9633538551744356, + "grad_norm": 0.3081547936709801, + "learning_rate": 0.0004968369816573468, + "loss": 3.1871068477630615, + "step": 3349, + "token_acc": 0.280452049963417 + }, + { + "epoch": 1.9639401934916447, + "grad_norm": 0.30437978813655314, + "learning_rate": 0.0004968331383414425, + "loss": 3.24863338470459, + "step": 3350, + "token_acc": 0.27110649987256163 + }, + { + "epoch": 1.9645265318088536, + "grad_norm": 0.3169628854740949, + "learning_rate": 0.0004968292927068733, + "loss": 3.1832611560821533, + "step": 3351, + "token_acc": 0.27870598907242156 + }, + { + "epoch": 1.9651128701260627, + "grad_norm": 0.33305862106269346, + "learning_rate": 0.0004968254447536756, + "loss": 3.210705280303955, + "step": 3352, + "token_acc": 0.2771752864161473 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.3200427322994808, + "learning_rate": 0.0004968215944818854, + "loss": 3.206730842590332, + "step": 3353, + "token_acc": 0.277724473872886 + }, + { + "epoch": 1.966285546760481, + "grad_norm": 0.2746980214314008, + "learning_rate": 0.0004968177418915391, + "loss": 3.2191219329833984, + "step": 3354, + "token_acc": 0.27454794939834665 + }, + { + "epoch": 1.9668718850776898, + "grad_norm": 0.2760257099332996, + "learning_rate": 0.0004968138869826725, + "loss": 3.145629405975342, + "step": 3355, + "token_acc": 0.2850058730274016 + }, + { + "epoch": 1.9674582233948987, + "grad_norm": 0.31299435501510564, + "learning_rate": 0.0004968100297553221, + "loss": 3.1890993118286133, + "step": 3356, + "token_acc": 0.2806143716799025 + }, + { + "epoch": 1.9680445617121078, + "grad_norm": 0.3066981450081714, + "learning_rate": 0.000496806170209524, + "loss": 3.2040491104125977, + "step": 3357, + "token_acc": 0.27722351431550246 + }, + { + "epoch": 1.968630900029317, + "grad_norm": 0.32411535703844885, + "learning_rate": 0.0004968023083453146, + "loss": 3.181764602661133, + "step": 3358, + "token_acc": 0.28032071897440414 + }, + { + "epoch": 1.969217238346526, + "grad_norm": 0.29889532064273516, + "learning_rate": 0.00049679844416273, + "loss": 3.1503005027770996, + "step": 3359, + "token_acc": 0.28449555289353445 + }, + { + "epoch": 1.969803576663735, + "grad_norm": 0.28801914020054753, + "learning_rate": 0.0004967945776618066, + "loss": 3.1829357147216797, + "step": 3360, + "token_acc": 0.2795733275155444 + }, + { + "epoch": 1.9703899149809438, + "grad_norm": 0.3222715569418526, + "learning_rate": 0.0004967907088425808, + "loss": 3.2364463806152344, + "step": 3361, + "token_acc": 0.27116048819888705 + }, + { + "epoch": 1.970976253298153, + "grad_norm": 0.29812338114054704, + "learning_rate": 0.0004967868377050887, + "loss": 3.233306884765625, + "step": 3362, + "token_acc": 0.27390893482224854 + }, + { + "epoch": 1.971562591615362, + "grad_norm": 0.2829867325858652, + "learning_rate": 0.0004967829642493669, + "loss": 3.2222745418548584, + "step": 3363, + "token_acc": 0.2749352398182603 + }, + { + "epoch": 1.9721489299325712, + "grad_norm": 0.27640288131647706, + "learning_rate": 0.0004967790884754516, + "loss": 3.2057414054870605, + "step": 3364, + "token_acc": 0.27539596370719144 + }, + { + "epoch": 1.97273526824978, + "grad_norm": 0.28448650152955157, + "learning_rate": 0.0004967752103833793, + "loss": 3.167477607727051, + "step": 3365, + "token_acc": 0.2818621399176955 + }, + { + "epoch": 1.9733216065669892, + "grad_norm": 0.29509026980304615, + "learning_rate": 0.0004967713299731866, + "loss": 3.187300205230713, + "step": 3366, + "token_acc": 0.28179575986565913 + }, + { + "epoch": 1.973907944884198, + "grad_norm": 0.2996013777367461, + "learning_rate": 0.0004967674472449097, + "loss": 3.2192134857177734, + "step": 3367, + "token_acc": 0.27411062034131295 + }, + { + "epoch": 1.9744942832014072, + "grad_norm": 0.3050592507734187, + "learning_rate": 0.0004967635621985851, + "loss": 3.25030255317688, + "step": 3368, + "token_acc": 0.270256284339118 + }, + { + "epoch": 1.9750806215186163, + "grad_norm": 0.3984322109294327, + "learning_rate": 0.0004967596748342493, + "loss": 3.1466012001037598, + "step": 3369, + "token_acc": 0.2875937261713124 + }, + { + "epoch": 1.9756669598358254, + "grad_norm": 0.5117423370039131, + "learning_rate": 0.000496755785151939, + "loss": 3.216395854949951, + "step": 3370, + "token_acc": 0.27599473900971155 + }, + { + "epoch": 1.9762532981530343, + "grad_norm": 0.49348780601224923, + "learning_rate": 0.0004967518931516905, + "loss": 3.2253334522247314, + "step": 3371, + "token_acc": 0.27389060939311655 + }, + { + "epoch": 1.9768396364702432, + "grad_norm": 0.3627416596201303, + "learning_rate": 0.0004967479988335406, + "loss": 3.173816204071045, + "step": 3372, + "token_acc": 0.28169048228306687 + }, + { + "epoch": 1.9774259747874523, + "grad_norm": 0.3239377702970308, + "learning_rate": 0.0004967441021975256, + "loss": 3.2274770736694336, + "step": 3373, + "token_acc": 0.27451598695920715 + }, + { + "epoch": 1.9780123131046614, + "grad_norm": 0.39310271409069536, + "learning_rate": 0.0004967402032436824, + "loss": 3.224817991256714, + "step": 3374, + "token_acc": 0.27354017392773605 + }, + { + "epoch": 1.9785986514218705, + "grad_norm": 0.39903641201492623, + "learning_rate": 0.0004967363019720474, + "loss": 3.2189559936523438, + "step": 3375, + "token_acc": 0.27535255159693334 + }, + { + "epoch": 1.9791849897390794, + "grad_norm": 0.44438675326628685, + "learning_rate": 0.0004967323983826574, + "loss": 3.2311859130859375, + "step": 3376, + "token_acc": 0.2739985560023399 + }, + { + "epoch": 1.9797713280562885, + "grad_norm": 0.3427214435933039, + "learning_rate": 0.0004967284924755488, + "loss": 3.239051580429077, + "step": 3377, + "token_acc": 0.2720193498908619 + }, + { + "epoch": 1.9803576663734974, + "grad_norm": 0.3639660971884832, + "learning_rate": 0.0004967245842507587, + "loss": 3.218960762023926, + "step": 3378, + "token_acc": 0.2745266602602395 + }, + { + "epoch": 1.9809440046907065, + "grad_norm": 0.33176068390089725, + "learning_rate": 0.0004967206737083235, + "loss": 3.216127634048462, + "step": 3379, + "token_acc": 0.2766560103788969 + }, + { + "epoch": 1.9815303430079156, + "grad_norm": 0.2702012507392123, + "learning_rate": 0.00049671676084828, + "loss": 3.1952576637268066, + "step": 3380, + "token_acc": 0.27673677973633537 + }, + { + "epoch": 1.9821166813251248, + "grad_norm": 0.2748472340894447, + "learning_rate": 0.000496712845670665, + "loss": 3.182241678237915, + "step": 3381, + "token_acc": 0.2789931799071969 + }, + { + "epoch": 1.9827030196423336, + "grad_norm": 0.2922055211023222, + "learning_rate": 0.0004967089281755153, + "loss": 3.206305980682373, + "step": 3382, + "token_acc": 0.2754251324390275 + }, + { + "epoch": 1.9832893579595425, + "grad_norm": 0.2968008884116049, + "learning_rate": 0.0004967050083628676, + "loss": 3.1692495346069336, + "step": 3383, + "token_acc": 0.28164042285798774 + }, + { + "epoch": 1.9838756962767516, + "grad_norm": 0.26871514534168844, + "learning_rate": 0.0004967010862327589, + "loss": 3.2181644439697266, + "step": 3384, + "token_acc": 0.2765381032330724 + }, + { + "epoch": 1.9844620345939608, + "grad_norm": 0.3035574130477789, + "learning_rate": 0.0004966971617852259, + "loss": 3.1683802604675293, + "step": 3385, + "token_acc": 0.28147756095565374 + }, + { + "epoch": 1.9850483729111699, + "grad_norm": 0.32604001089114637, + "learning_rate": 0.0004966932350203054, + "loss": 3.2157745361328125, + "step": 3386, + "token_acc": 0.2750828166690298 + }, + { + "epoch": 1.9856347112283788, + "grad_norm": 0.3007621350925398, + "learning_rate": 0.0004966893059380344, + "loss": 3.19637393951416, + "step": 3387, + "token_acc": 0.27860962839474074 + }, + { + "epoch": 1.9862210495455876, + "grad_norm": 0.33411954408794264, + "learning_rate": 0.0004966853745384499, + "loss": 3.179593086242676, + "step": 3388, + "token_acc": 0.2811720551092507 + }, + { + "epoch": 1.9868073878627968, + "grad_norm": 0.3133667040489698, + "learning_rate": 0.0004966814408215887, + "loss": 3.204219341278076, + "step": 3389, + "token_acc": 0.2771576590981772 + }, + { + "epoch": 1.9873937261800059, + "grad_norm": 0.2858855748366626, + "learning_rate": 0.0004966775047874876, + "loss": 3.1873698234558105, + "step": 3390, + "token_acc": 0.2786249184605349 + }, + { + "epoch": 1.987980064497215, + "grad_norm": 0.2798331746046621, + "learning_rate": 0.0004966735664361839, + "loss": 3.19746470451355, + "step": 3391, + "token_acc": 0.27726318128197125 + }, + { + "epoch": 1.9885664028144239, + "grad_norm": 0.3135849821052749, + "learning_rate": 0.0004966696257677144, + "loss": 3.1850180625915527, + "step": 3392, + "token_acc": 0.2780155953036466 + }, + { + "epoch": 1.989152741131633, + "grad_norm": 0.31538329780212543, + "learning_rate": 0.0004966656827821161, + "loss": 3.2509994506835938, + "step": 3393, + "token_acc": 0.2710936985913069 + }, + { + "epoch": 1.9897390794488419, + "grad_norm": 0.2681813474883174, + "learning_rate": 0.0004966617374794262, + "loss": 3.2290990352630615, + "step": 3394, + "token_acc": 0.27330265117587693 + }, + { + "epoch": 1.990325417766051, + "grad_norm": 0.3434879539971122, + "learning_rate": 0.0004966577898596815, + "loss": 3.205097198486328, + "step": 3395, + "token_acc": 0.27613995271383157 + }, + { + "epoch": 1.99091175608326, + "grad_norm": 0.3232015180305107, + "learning_rate": 0.0004966538399229194, + "loss": 3.2253174781799316, + "step": 3396, + "token_acc": 0.2748125183292136 + }, + { + "epoch": 1.9914980944004692, + "grad_norm": 0.28593101575011953, + "learning_rate": 0.0004966498876691768, + "loss": 3.174898386001587, + "step": 3397, + "token_acc": 0.28204193444228026 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.3010890692431842, + "learning_rate": 0.0004966459330984909, + "loss": 3.205470561981201, + "step": 3398, + "token_acc": 0.27923359134990294 + }, + { + "epoch": 1.992670771034887, + "grad_norm": 0.3071122834129306, + "learning_rate": 0.0004966419762108988, + "loss": 3.243103504180908, + "step": 3399, + "token_acc": 0.27229421130921205 + }, + { + "epoch": 1.993257109352096, + "grad_norm": 0.3840391763616149, + "learning_rate": 0.0004966380170064376, + "loss": 3.2434725761413574, + "step": 3400, + "token_acc": 0.2718017321730735 + }, + { + "epoch": 1.9938434476693052, + "grad_norm": 0.3445531193342586, + "learning_rate": 0.0004966340554851447, + "loss": 3.22159481048584, + "step": 3401, + "token_acc": 0.27383123122668707 + }, + { + "epoch": 1.9944297859865143, + "grad_norm": 0.38552247999754213, + "learning_rate": 0.0004966300916470572, + "loss": 3.241112232208252, + "step": 3402, + "token_acc": 0.272339960956894 + }, + { + "epoch": 1.9950161243037232, + "grad_norm": 0.40415804477175776, + "learning_rate": 0.0004966261254922122, + "loss": 3.226780652999878, + "step": 3403, + "token_acc": 0.27282641592135337 + }, + { + "epoch": 1.9956024626209323, + "grad_norm": 0.34206865696249206, + "learning_rate": 0.0004966221570206472, + "loss": 3.2067711353302, + "step": 3404, + "token_acc": 0.27706097328474877 + }, + { + "epoch": 1.9961888009381412, + "grad_norm": 0.32924262041078284, + "learning_rate": 0.0004966181862323993, + "loss": 3.2241430282592773, + "step": 3405, + "token_acc": 0.27307748772517443 + }, + { + "epoch": 1.9967751392553503, + "grad_norm": 0.40896761534670845, + "learning_rate": 0.0004966142131275059, + "loss": 3.2088308334350586, + "step": 3406, + "token_acc": 0.2755536346314505 + }, + { + "epoch": 1.9973614775725594, + "grad_norm": 0.36975675384336365, + "learning_rate": 0.0004966102377060043, + "loss": 3.2081775665283203, + "step": 3407, + "token_acc": 0.2762099053317595 + }, + { + "epoch": 1.9979478158897686, + "grad_norm": 0.30249492311660875, + "learning_rate": 0.0004966062599679318, + "loss": 3.2071213722229004, + "step": 3408, + "token_acc": 0.2771613230798523 + }, + { + "epoch": 1.9985341542069774, + "grad_norm": 0.3196843746497856, + "learning_rate": 0.0004966022799133258, + "loss": 3.1754231452941895, + "step": 3409, + "token_acc": 0.2816014527494837 + }, + { + "epoch": 1.9991204925241863, + "grad_norm": 0.32143911555271226, + "learning_rate": 0.0004965982975422236, + "loss": 3.1902358531951904, + "step": 3410, + "token_acc": 0.27771570086328695 + }, + { + "epoch": 1.9997068308413954, + "grad_norm": 0.3384666116808621, + "learning_rate": 0.0004965943128546627, + "loss": 3.1928365230560303, + "step": 3411, + "token_acc": 0.27727413783455174 + }, + { + "epoch": 2.0, + "grad_norm": 0.3350372638226426, + "learning_rate": 0.0004965903258506806, + "loss": 3.185788154602051, + "step": 3412, + "token_acc": 0.28105435754745106 + }, + { + "epoch": 2.0, + "eval_loss": 3.179800033569336, + "eval_runtime": 6.4459, + "eval_samples_per_second": 39.715, + "eval_steps_per_second": 4.964, + "eval_token_acc": 0.2796646736115392, + "step": 3412 + }, + { + "epoch": 2.000586338317209, + "grad_norm": 0.35875580883392394, + "learning_rate": 0.0004965863365303146, + "loss": 3.1508355140686035, + "step": 3413, + "token_acc": 0.2829138676960799 + }, + { + "epoch": 2.0011726766344182, + "grad_norm": 0.33384211734752284, + "learning_rate": 0.0004965823448936024, + "loss": 3.1868162155151367, + "step": 3414, + "token_acc": 0.2782218622800467 + }, + { + "epoch": 2.001759014951627, + "grad_norm": 0.3597370072349644, + "learning_rate": 0.0004965783509405812, + "loss": 3.2320971488952637, + "step": 3415, + "token_acc": 0.27390920718388684 + }, + { + "epoch": 2.002345353268836, + "grad_norm": 0.40828956745933, + "learning_rate": 0.0004965743546712887, + "loss": 3.1918787956237793, + "step": 3416, + "token_acc": 0.27680324185397465 + }, + { + "epoch": 2.002931691586045, + "grad_norm": 0.3311832707862652, + "learning_rate": 0.0004965703560857624, + "loss": 3.1691036224365234, + "step": 3417, + "token_acc": 0.2807174138858005 + }, + { + "epoch": 2.0035180299032542, + "grad_norm": 0.33338729419960617, + "learning_rate": 0.0004965663551840399, + "loss": 3.133450508117676, + "step": 3418, + "token_acc": 0.28587711708537766 + }, + { + "epoch": 2.0041043682204633, + "grad_norm": 0.30465703089497426, + "learning_rate": 0.0004965623519661587, + "loss": 3.2127926349639893, + "step": 3419, + "token_acc": 0.2745030691448808 + }, + { + "epoch": 2.0046907065376725, + "grad_norm": 0.33295322789137144, + "learning_rate": 0.0004965583464321564, + "loss": 3.1746695041656494, + "step": 3420, + "token_acc": 0.2792896236602932 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.29376194440567804, + "learning_rate": 0.0004965543385820708, + "loss": 3.181506633758545, + "step": 3421, + "token_acc": 0.27898057684743205 + }, + { + "epoch": 2.0058633831720902, + "grad_norm": 0.3072354569450418, + "learning_rate": 0.0004965503284159393, + "loss": 3.1919167041778564, + "step": 3422, + "token_acc": 0.27944150621849007 + }, + { + "epoch": 2.0064497214892993, + "grad_norm": 0.2861043073621896, + "learning_rate": 0.0004965463159337998, + "loss": 3.1077888011932373, + "step": 3423, + "token_acc": 0.28789548329325215 + }, + { + "epoch": 2.0070360598065085, + "grad_norm": 0.31513841047034075, + "learning_rate": 0.0004965423011356898, + "loss": 3.1577324867248535, + "step": 3424, + "token_acc": 0.28225988397077784 + }, + { + "epoch": 2.0076223981237176, + "grad_norm": 0.3103290881226257, + "learning_rate": 0.0004965382840216472, + "loss": 3.149661064147949, + "step": 3425, + "token_acc": 0.28335570009728883 + }, + { + "epoch": 2.0082087364409262, + "grad_norm": 0.3402066694432324, + "learning_rate": 0.0004965342645917096, + "loss": 3.1585540771484375, + "step": 3426, + "token_acc": 0.28156966559777713 + }, + { + "epoch": 2.0087950747581353, + "grad_norm": 0.39288743381694846, + "learning_rate": 0.0004965302428459147, + "loss": 3.1738882064819336, + "step": 3427, + "token_acc": 0.2801654300953563 + }, + { + "epoch": 2.0093814130753445, + "grad_norm": 0.387838202403588, + "learning_rate": 0.0004965262187843005, + "loss": 3.1637301445007324, + "step": 3428, + "token_acc": 0.2810338912178752 + }, + { + "epoch": 2.0099677513925536, + "grad_norm": 0.3240338473238948, + "learning_rate": 0.0004965221924069046, + "loss": 3.1559393405914307, + "step": 3429, + "token_acc": 0.28152189788636883 + }, + { + "epoch": 2.0105540897097627, + "grad_norm": 0.3421680580279285, + "learning_rate": 0.0004965181637137649, + "loss": 3.1266002655029297, + "step": 3430, + "token_acc": 0.2865291897509631 + }, + { + "epoch": 2.0111404280269713, + "grad_norm": 0.3030109667188654, + "learning_rate": 0.0004965141327049193, + "loss": 3.1224703788757324, + "step": 3431, + "token_acc": 0.28618385097047205 + }, + { + "epoch": 2.0117267663441805, + "grad_norm": 0.32424619233641283, + "learning_rate": 0.0004965100993804055, + "loss": 3.1342368125915527, + "step": 3432, + "token_acc": 0.28405660976074126 + }, + { + "epoch": 2.0123131046613896, + "grad_norm": 0.33764924128893886, + "learning_rate": 0.0004965060637402616, + "loss": 3.1585075855255127, + "step": 3433, + "token_acc": 0.2827173788153686 + }, + { + "epoch": 2.0128994429785987, + "grad_norm": 0.274633759943592, + "learning_rate": 0.0004965020257845254, + "loss": 3.138598918914795, + "step": 3434, + "token_acc": 0.2844765804254468 + }, + { + "epoch": 2.013485781295808, + "grad_norm": 0.3110274420573675, + "learning_rate": 0.0004964979855132348, + "loss": 3.1533985137939453, + "step": 3435, + "token_acc": 0.2837424542081928 + }, + { + "epoch": 2.014072119613017, + "grad_norm": 0.3091539342473717, + "learning_rate": 0.0004964939429264277, + "loss": 3.154952049255371, + "step": 3436, + "token_acc": 0.2829913978379935 + }, + { + "epoch": 2.0146584579302256, + "grad_norm": 0.3295448010968533, + "learning_rate": 0.0004964898980241423, + "loss": 3.134345293045044, + "step": 3437, + "token_acc": 0.2838138236916939 + }, + { + "epoch": 2.0152447962474347, + "grad_norm": 0.3442431660662871, + "learning_rate": 0.0004964858508064164, + "loss": 3.143533945083618, + "step": 3438, + "token_acc": 0.28328351404033186 + }, + { + "epoch": 2.015831134564644, + "grad_norm": 0.36977907461327286, + "learning_rate": 0.000496481801273288, + "loss": 3.1626033782958984, + "step": 3439, + "token_acc": 0.2804114847842604 + }, + { + "epoch": 2.016417472881853, + "grad_norm": 0.4070247361963261, + "learning_rate": 0.0004964777494247953, + "loss": 3.172485828399658, + "step": 3440, + "token_acc": 0.28201909553660404 + }, + { + "epoch": 2.017003811199062, + "grad_norm": 0.3562799194813373, + "learning_rate": 0.0004964736952609763, + "loss": 3.16035795211792, + "step": 3441, + "token_acc": 0.28164421269950507 + }, + { + "epoch": 2.0175901495162707, + "grad_norm": 0.3025547194537972, + "learning_rate": 0.000496469638781869, + "loss": 3.180424213409424, + "step": 3442, + "token_acc": 0.28117562716372485 + }, + { + "epoch": 2.01817648783348, + "grad_norm": 0.340688486249103, + "learning_rate": 0.0004964655799875115, + "loss": 3.2261693477630615, + "step": 3443, + "token_acc": 0.271766645765113 + }, + { + "epoch": 2.018762826150689, + "grad_norm": 0.29516423177617485, + "learning_rate": 0.0004964615188779421, + "loss": 3.1849887371063232, + "step": 3444, + "token_acc": 0.2779431541727738 + }, + { + "epoch": 2.019349164467898, + "grad_norm": 0.30390236103735796, + "learning_rate": 0.0004964574554531989, + "loss": 3.1781201362609863, + "step": 3445, + "token_acc": 0.277971381737439 + }, + { + "epoch": 2.019935502785107, + "grad_norm": 0.3420737946630496, + "learning_rate": 0.0004964533897133199, + "loss": 3.1747143268585205, + "step": 3446, + "token_acc": 0.2798935749736723 + }, + { + "epoch": 2.0205218411023163, + "grad_norm": 0.2993293046245871, + "learning_rate": 0.0004964493216583435, + "loss": 3.1464414596557617, + "step": 3447, + "token_acc": 0.2839758334429292 + }, + { + "epoch": 2.021108179419525, + "grad_norm": 0.31849663159498026, + "learning_rate": 0.0004964452512883076, + "loss": 3.1190826892852783, + "step": 3448, + "token_acc": 0.2883136415197747 + }, + { + "epoch": 2.021694517736734, + "grad_norm": 0.28680905052226563, + "learning_rate": 0.0004964411786032509, + "loss": 3.121431827545166, + "step": 3449, + "token_acc": 0.2888548245452585 + }, + { + "epoch": 2.022280856053943, + "grad_norm": 0.27241065565757416, + "learning_rate": 0.0004964371036032113, + "loss": 3.1582865715026855, + "step": 3450, + "token_acc": 0.2805362387205746 + }, + { + "epoch": 2.0228671943711523, + "grad_norm": 0.29258297918491294, + "learning_rate": 0.0004964330262882271, + "loss": 3.1589155197143555, + "step": 3451, + "token_acc": 0.28101455777471807 + }, + { + "epoch": 2.0234535326883614, + "grad_norm": 0.27361369502145405, + "learning_rate": 0.0004964289466583369, + "loss": 3.21328067779541, + "step": 3452, + "token_acc": 0.2759543284246309 + }, + { + "epoch": 2.02403987100557, + "grad_norm": 0.28047057240366896, + "learning_rate": 0.0004964248647135787, + "loss": 3.0913705825805664, + "step": 3453, + "token_acc": 0.29065486800556434 + }, + { + "epoch": 2.024626209322779, + "grad_norm": 0.29635496724541266, + "learning_rate": 0.000496420780453991, + "loss": 3.221021890640259, + "step": 3454, + "token_acc": 0.273634291429907 + }, + { + "epoch": 2.0252125476399883, + "grad_norm": 0.29690948410474016, + "learning_rate": 0.000496416693879612, + "loss": 3.1651062965393066, + "step": 3455, + "token_acc": 0.2801676267580051 + }, + { + "epoch": 2.0257988859571974, + "grad_norm": 0.2679877195880277, + "learning_rate": 0.0004964126049904804, + "loss": 3.1235530376434326, + "step": 3456, + "token_acc": 0.28820668563904345 + }, + { + "epoch": 2.0263852242744065, + "grad_norm": 0.31758540462184087, + "learning_rate": 0.0004964085137866343, + "loss": 3.13019061088562, + "step": 3457, + "token_acc": 0.28610102344007926 + }, + { + "epoch": 2.026971562591615, + "grad_norm": 0.3522954776138269, + "learning_rate": 0.0004964044202681123, + "loss": 3.17744779586792, + "step": 3458, + "token_acc": 0.27839479176759885 + }, + { + "epoch": 2.0275579009088243, + "grad_norm": 0.31616605168380135, + "learning_rate": 0.0004964003244349528, + "loss": 3.0942463874816895, + "step": 3459, + "token_acc": 0.29109055899017733 + }, + { + "epoch": 2.0281442392260334, + "grad_norm": 0.2933340531072689, + "learning_rate": 0.0004963962262871942, + "loss": 3.1837494373321533, + "step": 3460, + "token_acc": 0.27938652828443716 + }, + { + "epoch": 2.0287305775432425, + "grad_norm": 0.3661998329185042, + "learning_rate": 0.0004963921258248752, + "loss": 3.1639838218688965, + "step": 3461, + "token_acc": 0.27997121697890953 + }, + { + "epoch": 2.0293169158604516, + "grad_norm": 0.39801333982721926, + "learning_rate": 0.0004963880230480341, + "loss": 3.151569366455078, + "step": 3462, + "token_acc": 0.28339760417927096 + }, + { + "epoch": 2.0299032541776607, + "grad_norm": 0.2918972213657598, + "learning_rate": 0.0004963839179567095, + "loss": 3.1583714485168457, + "step": 3463, + "token_acc": 0.2836825000264654 + }, + { + "epoch": 2.0304895924948694, + "grad_norm": 0.31358455919066347, + "learning_rate": 0.0004963798105509402, + "loss": 3.1492512226104736, + "step": 3464, + "token_acc": 0.2833523272010275 + }, + { + "epoch": 2.0310759308120785, + "grad_norm": 0.37175887310118133, + "learning_rate": 0.0004963757008307644, + "loss": 3.1634984016418457, + "step": 3465, + "token_acc": 0.28234213781249834 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.2888300833681739, + "learning_rate": 0.0004963715887962209, + "loss": 3.1662116050720215, + "step": 3466, + "token_acc": 0.2824034289370771 + }, + { + "epoch": 2.0322486074464967, + "grad_norm": 0.30137999250448483, + "learning_rate": 0.0004963674744473484, + "loss": 3.1572279930114746, + "step": 3467, + "token_acc": 0.2819186571351892 + }, + { + "epoch": 2.032834945763706, + "grad_norm": 0.3308303974368115, + "learning_rate": 0.0004963633577841854, + "loss": 3.188631296157837, + "step": 3468, + "token_acc": 0.27630911856304774 + }, + { + "epoch": 2.0334212840809145, + "grad_norm": 0.3199950863406161, + "learning_rate": 0.0004963592388067706, + "loss": 3.1774768829345703, + "step": 3469, + "token_acc": 0.2794430258074568 + }, + { + "epoch": 2.0340076223981236, + "grad_norm": 0.2996700842341045, + "learning_rate": 0.0004963551175151429, + "loss": 3.1685638427734375, + "step": 3470, + "token_acc": 0.2798951888028434 + }, + { + "epoch": 2.0345939607153327, + "grad_norm": 0.28280763402728765, + "learning_rate": 0.0004963509939093406, + "loss": 3.1463968753814697, + "step": 3471, + "token_acc": 0.28357994952970866 + }, + { + "epoch": 2.035180299032542, + "grad_norm": 0.29933857702188066, + "learning_rate": 0.0004963468679894027, + "loss": 3.167685031890869, + "step": 3472, + "token_acc": 0.28046617817443326 + }, + { + "epoch": 2.035766637349751, + "grad_norm": 0.3709134440599054, + "learning_rate": 0.0004963427397553682, + "loss": 3.1859445571899414, + "step": 3473, + "token_acc": 0.27930743418190856 + }, + { + "epoch": 2.03635297566696, + "grad_norm": 0.37947904954862194, + "learning_rate": 0.0004963386092072754, + "loss": 3.1485390663146973, + "step": 3474, + "token_acc": 0.28306456408565295 + }, + { + "epoch": 2.0369393139841687, + "grad_norm": 0.32484343676740385, + "learning_rate": 0.0004963344763451633, + "loss": 3.1283059120178223, + "step": 3475, + "token_acc": 0.2866285309465025 + }, + { + "epoch": 2.037525652301378, + "grad_norm": 0.281079589854818, + "learning_rate": 0.0004963303411690708, + "loss": 3.159367084503174, + "step": 3476, + "token_acc": 0.2807503791231257 + }, + { + "epoch": 2.038111990618587, + "grad_norm": 0.3083643422651259, + "learning_rate": 0.0004963262036790366, + "loss": 3.123465061187744, + "step": 3477, + "token_acc": 0.2842507818870573 + }, + { + "epoch": 2.038698328935796, + "grad_norm": 0.3577096881189804, + "learning_rate": 0.0004963220638750998, + "loss": 3.1622657775878906, + "step": 3478, + "token_acc": 0.28208336330658007 + }, + { + "epoch": 2.039284667253005, + "grad_norm": 0.34129021682301103, + "learning_rate": 0.000496317921757299, + "loss": 3.171210289001465, + "step": 3479, + "token_acc": 0.2820871357079776 + }, + { + "epoch": 2.039871005570214, + "grad_norm": 0.30148110609353196, + "learning_rate": 0.0004963137773256732, + "loss": 3.1927154064178467, + "step": 3480, + "token_acc": 0.2766802441334645 + }, + { + "epoch": 2.040457343887423, + "grad_norm": 0.3439958800312842, + "learning_rate": 0.0004963096305802614, + "loss": 3.156517505645752, + "step": 3481, + "token_acc": 0.2843224671124022 + }, + { + "epoch": 2.041043682204632, + "grad_norm": 0.30938180630834594, + "learning_rate": 0.0004963054815211026, + "loss": 3.1485118865966797, + "step": 3482, + "token_acc": 0.28331045500109764 + }, + { + "epoch": 2.041630020521841, + "grad_norm": 0.3275798730834719, + "learning_rate": 0.0004963013301482357, + "loss": 3.1244630813598633, + "step": 3483, + "token_acc": 0.28709181621307733 + }, + { + "epoch": 2.0422163588390503, + "grad_norm": 0.3523921767713871, + "learning_rate": 0.0004962971764616997, + "loss": 3.193903923034668, + "step": 3484, + "token_acc": 0.2772761708416647 + }, + { + "epoch": 2.042802697156259, + "grad_norm": 0.2770416130014744, + "learning_rate": 0.0004962930204615336, + "loss": 3.181400775909424, + "step": 3485, + "token_acc": 0.2797170617877511 + }, + { + "epoch": 2.043389035473468, + "grad_norm": 0.3256875788238273, + "learning_rate": 0.0004962888621477764, + "loss": 3.154294490814209, + "step": 3486, + "token_acc": 0.2802842850562893 + }, + { + "epoch": 2.043975373790677, + "grad_norm": 0.28410716712179396, + "learning_rate": 0.0004962847015204672, + "loss": 3.1605420112609863, + "step": 3487, + "token_acc": 0.28230657215722743 + }, + { + "epoch": 2.0445617121078863, + "grad_norm": 0.31885282666731374, + "learning_rate": 0.0004962805385796453, + "loss": 3.162942409515381, + "step": 3488, + "token_acc": 0.2812634084547331 + }, + { + "epoch": 2.0451480504250954, + "grad_norm": 0.3517513111549293, + "learning_rate": 0.0004962763733253494, + "loss": 3.1280012130737305, + "step": 3489, + "token_acc": 0.2866785631158352 + }, + { + "epoch": 2.0457343887423045, + "grad_norm": 0.3013108459745572, + "learning_rate": 0.0004962722057576189, + "loss": 3.1190743446350098, + "step": 3490, + "token_acc": 0.288037540023171 + }, + { + "epoch": 2.046320727059513, + "grad_norm": 0.34340153046298844, + "learning_rate": 0.0004962680358764929, + "loss": 3.158818006515503, + "step": 3491, + "token_acc": 0.28323040885860307 + }, + { + "epoch": 2.0469070653767223, + "grad_norm": 0.3178031395919165, + "learning_rate": 0.0004962638636820105, + "loss": 3.175481081008911, + "step": 3492, + "token_acc": 0.2801295535569811 + }, + { + "epoch": 2.0474934036939314, + "grad_norm": 0.3494292597401866, + "learning_rate": 0.0004962596891742111, + "loss": 3.1425704956054688, + "step": 3493, + "token_acc": 0.28476312353501554 + }, + { + "epoch": 2.0480797420111405, + "grad_norm": 0.38791443768303097, + "learning_rate": 0.0004962555123531336, + "loss": 3.190159559249878, + "step": 3494, + "token_acc": 0.27901312513794996 + }, + { + "epoch": 2.0486660803283496, + "grad_norm": 0.35494919740835, + "learning_rate": 0.0004962513332188174, + "loss": 3.1541690826416016, + "step": 3495, + "token_acc": 0.2825747021863361 + }, + { + "epoch": 2.0492524186455583, + "grad_norm": 0.3114678285031747, + "learning_rate": 0.0004962471517713018, + "loss": 3.1423988342285156, + "step": 3496, + "token_acc": 0.28280823018807266 + }, + { + "epoch": 2.0498387569627674, + "grad_norm": 0.28853143094853734, + "learning_rate": 0.0004962429680106261, + "loss": 3.181385040283203, + "step": 3497, + "token_acc": 0.2780049947518911 + }, + { + "epoch": 2.0504250952799765, + "grad_norm": 0.3069139379075612, + "learning_rate": 0.0004962387819368294, + "loss": 3.1778111457824707, + "step": 3498, + "token_acc": 0.2801535567416222 + }, + { + "epoch": 2.0510114335971856, + "grad_norm": 0.24579750477275658, + "learning_rate": 0.0004962345935499512, + "loss": 3.1404266357421875, + "step": 3499, + "token_acc": 0.285482450160974 + }, + { + "epoch": 2.0515977719143947, + "grad_norm": 0.29266988651919235, + "learning_rate": 0.0004962304028500309, + "loss": 3.1500110626220703, + "step": 3500, + "token_acc": 0.2822400437965042 + }, + { + "epoch": 2.052184110231604, + "grad_norm": 0.31269373322578076, + "learning_rate": 0.0004962262098371075, + "loss": 3.150394916534424, + "step": 3501, + "token_acc": 0.28254694566209804 + }, + { + "epoch": 2.0527704485488125, + "grad_norm": 0.3038688679674328, + "learning_rate": 0.0004962220145112209, + "loss": 3.1210784912109375, + "step": 3502, + "token_acc": 0.2876300899262403 + }, + { + "epoch": 2.0533567868660216, + "grad_norm": 0.2624599682869427, + "learning_rate": 0.0004962178168724102, + "loss": 3.1782212257385254, + "step": 3503, + "token_acc": 0.27897679958679866 + }, + { + "epoch": 2.0539431251832307, + "grad_norm": 0.3358499889499518, + "learning_rate": 0.0004962136169207148, + "loss": 3.1585726737976074, + "step": 3504, + "token_acc": 0.28222849497541785 + }, + { + "epoch": 2.05452946350044, + "grad_norm": 0.31958405395188955, + "learning_rate": 0.0004962094146561744, + "loss": 3.185286521911621, + "step": 3505, + "token_acc": 0.27590871064338823 + }, + { + "epoch": 2.055115801817649, + "grad_norm": 0.2786987652964806, + "learning_rate": 0.0004962052100788282, + "loss": 3.138186454772949, + "step": 3506, + "token_acc": 0.2839263448034476 + }, + { + "epoch": 2.0557021401348576, + "grad_norm": 0.3549713155772115, + "learning_rate": 0.0004962010031887159, + "loss": 3.1547603607177734, + "step": 3507, + "token_acc": 0.28420984062187865 + }, + { + "epoch": 2.0562884784520667, + "grad_norm": 0.3284620560610857, + "learning_rate": 0.000496196793985877, + "loss": 3.178173542022705, + "step": 3508, + "token_acc": 0.2805207348863294 + }, + { + "epoch": 2.056874816769276, + "grad_norm": 0.3049871511294685, + "learning_rate": 0.0004961925824703508, + "loss": 3.174515724182129, + "step": 3509, + "token_acc": 0.27711550688078096 + }, + { + "epoch": 2.057461155086485, + "grad_norm": 0.3469896875869796, + "learning_rate": 0.0004961883686421772, + "loss": 3.1745896339416504, + "step": 3510, + "token_acc": 0.27968247277122754 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.3471731869961092, + "learning_rate": 0.0004961841525013955, + "loss": 3.133080244064331, + "step": 3511, + "token_acc": 0.28482637167410346 + }, + { + "epoch": 2.0586338317209028, + "grad_norm": 0.37933898487936735, + "learning_rate": 0.0004961799340480454, + "loss": 3.1512584686279297, + "step": 3512, + "token_acc": 0.283558724377107 + }, + { + "epoch": 2.059220170038112, + "grad_norm": 0.3543709774917021, + "learning_rate": 0.0004961757132821667, + "loss": 3.1309523582458496, + "step": 3513, + "token_acc": 0.2842828967691345 + }, + { + "epoch": 2.059806508355321, + "grad_norm": 0.3068409480636232, + "learning_rate": 0.0004961714902037988, + "loss": 3.1454062461853027, + "step": 3514, + "token_acc": 0.2830581239999789 + }, + { + "epoch": 2.06039284667253, + "grad_norm": 0.3046654902798207, + "learning_rate": 0.0004961672648129815, + "loss": 3.1560494899749756, + "step": 3515, + "token_acc": 0.28205461087236444 + }, + { + "epoch": 2.060979184989739, + "grad_norm": 0.3368244147021095, + "learning_rate": 0.0004961630371097544, + "loss": 3.1810343265533447, + "step": 3516, + "token_acc": 0.2774701115973284 + }, + { + "epoch": 2.0615655233069483, + "grad_norm": 0.3701660986100902, + "learning_rate": 0.0004961588070941573, + "loss": 3.149977922439575, + "step": 3517, + "token_acc": 0.28391301432395766 + }, + { + "epoch": 2.062151861624157, + "grad_norm": 0.33764286712926145, + "learning_rate": 0.0004961545747662299, + "loss": 3.1634271144866943, + "step": 3518, + "token_acc": 0.2803246937616614 + }, + { + "epoch": 2.062738199941366, + "grad_norm": 0.29929017768404687, + "learning_rate": 0.000496150340126012, + "loss": 3.1488876342773438, + "step": 3519, + "token_acc": 0.283398571952586 + }, + { + "epoch": 2.063324538258575, + "grad_norm": 0.3860074943447778, + "learning_rate": 0.0004961461031735433, + "loss": 3.162026882171631, + "step": 3520, + "token_acc": 0.2818660044054325 + }, + { + "epoch": 2.0639108765757843, + "grad_norm": 0.40322662307535806, + "learning_rate": 0.0004961418639088637, + "loss": 3.1602020263671875, + "step": 3521, + "token_acc": 0.28260213416670005 + }, + { + "epoch": 2.0644972148929934, + "grad_norm": 0.3063372995425531, + "learning_rate": 0.000496137622332013, + "loss": 3.168941020965576, + "step": 3522, + "token_acc": 0.2815684937792519 + }, + { + "epoch": 2.065083553210202, + "grad_norm": 0.2740548937315513, + "learning_rate": 0.000496133378443031, + "loss": 3.1793575286865234, + "step": 3523, + "token_acc": 0.27917914748038974 + }, + { + "epoch": 2.065669891527411, + "grad_norm": 0.3498037039136486, + "learning_rate": 0.0004961291322419575, + "loss": 3.126283884048462, + "step": 3524, + "token_acc": 0.2856138235885071 + }, + { + "epoch": 2.0662562298446203, + "grad_norm": 0.30430644091772163, + "learning_rate": 0.0004961248837288325, + "loss": 3.2212233543395996, + "step": 3525, + "token_acc": 0.27320839130316177 + }, + { + "epoch": 2.0668425681618294, + "grad_norm": 0.3080976696911605, + "learning_rate": 0.0004961206329036959, + "loss": 3.1706457138061523, + "step": 3526, + "token_acc": 0.2791342081873791 + }, + { + "epoch": 2.0674289064790385, + "grad_norm": 0.31202474498579247, + "learning_rate": 0.0004961163797665876, + "loss": 3.138956069946289, + "step": 3527, + "token_acc": 0.28454223579068366 + }, + { + "epoch": 2.068015244796247, + "grad_norm": 0.33213744345953095, + "learning_rate": 0.0004961121243175476, + "loss": 3.1391143798828125, + "step": 3528, + "token_acc": 0.2841513135470845 + }, + { + "epoch": 2.0686015831134563, + "grad_norm": 0.2985902388928904, + "learning_rate": 0.0004961078665566158, + "loss": 3.17056941986084, + "step": 3529, + "token_acc": 0.28063272948397777 + }, + { + "epoch": 2.0691879214306654, + "grad_norm": 0.2684457845970054, + "learning_rate": 0.0004961036064838321, + "loss": 3.145906448364258, + "step": 3530, + "token_acc": 0.2838580320643757 + }, + { + "epoch": 2.0697742597478745, + "grad_norm": 0.2915488975426558, + "learning_rate": 0.0004960993440992368, + "loss": 3.174548625946045, + "step": 3531, + "token_acc": 0.28032516531357876 + }, + { + "epoch": 2.0703605980650837, + "grad_norm": 0.306815196158011, + "learning_rate": 0.0004960950794028698, + "loss": 3.2025833129882812, + "step": 3532, + "token_acc": 0.27622380369897925 + }, + { + "epoch": 2.0709469363822928, + "grad_norm": 0.3544692014168742, + "learning_rate": 0.0004960908123947711, + "loss": 3.1268396377563477, + "step": 3533, + "token_acc": 0.28515680866278476 + }, + { + "epoch": 2.0715332746995014, + "grad_norm": 0.40542279443083007, + "learning_rate": 0.0004960865430749808, + "loss": 3.176687717437744, + "step": 3534, + "token_acc": 0.28045908739691033 + }, + { + "epoch": 2.0721196130167105, + "grad_norm": 0.3339832266301935, + "learning_rate": 0.000496082271443539, + "loss": 3.155947208404541, + "step": 3535, + "token_acc": 0.2816314567335996 + }, + { + "epoch": 2.0727059513339197, + "grad_norm": 0.2993645985591695, + "learning_rate": 0.000496077997500486, + "loss": 3.1546711921691895, + "step": 3536, + "token_acc": 0.283005465047547 + }, + { + "epoch": 2.0732922896511288, + "grad_norm": 0.3072473793823034, + "learning_rate": 0.0004960737212458617, + "loss": 3.1387758255004883, + "step": 3537, + "token_acc": 0.285692081575668 + }, + { + "epoch": 2.073878627968338, + "grad_norm": 0.2660573505225701, + "learning_rate": 0.0004960694426797064, + "loss": 3.1558258533477783, + "step": 3538, + "token_acc": 0.28303490010776755 + }, + { + "epoch": 2.0744649662855466, + "grad_norm": 0.2920108384327116, + "learning_rate": 0.0004960651618020602, + "loss": 3.1421775817871094, + "step": 3539, + "token_acc": 0.2820939552776611 + }, + { + "epoch": 2.0750513046027557, + "grad_norm": 0.3179261860005554, + "learning_rate": 0.0004960608786129634, + "loss": 3.165583610534668, + "step": 3540, + "token_acc": 0.28070678003619837 + }, + { + "epoch": 2.0756376429199648, + "grad_norm": 0.27350676725042145, + "learning_rate": 0.0004960565931124563, + "loss": 3.153470516204834, + "step": 3541, + "token_acc": 0.2822707543082151 + }, + { + "epoch": 2.076223981237174, + "grad_norm": 0.29532435586417777, + "learning_rate": 0.0004960523053005791, + "loss": 3.154959201812744, + "step": 3542, + "token_acc": 0.2834544182265351 + }, + { + "epoch": 2.076810319554383, + "grad_norm": 0.3031581951789253, + "learning_rate": 0.000496048015177372, + "loss": 3.1637754440307617, + "step": 3543, + "token_acc": 0.2828321482762035 + }, + { + "epoch": 2.077396657871592, + "grad_norm": 0.3064542326024228, + "learning_rate": 0.0004960437227428754, + "loss": 3.1934001445770264, + "step": 3544, + "token_acc": 0.2777947574461752 + }, + { + "epoch": 2.077982996188801, + "grad_norm": 0.35160188302267953, + "learning_rate": 0.0004960394279971295, + "loss": 3.1339306831359863, + "step": 3545, + "token_acc": 0.2846268569497623 + }, + { + "epoch": 2.07856933450601, + "grad_norm": 0.2764336814962135, + "learning_rate": 0.0004960351309401746, + "loss": 3.1865100860595703, + "step": 3546, + "token_acc": 0.2771878366745192 + }, + { + "epoch": 2.079155672823219, + "grad_norm": 0.3510090452699469, + "learning_rate": 0.0004960308315720514, + "loss": 3.1798365116119385, + "step": 3547, + "token_acc": 0.27968411953947736 + }, + { + "epoch": 2.079742011140428, + "grad_norm": 0.3799781455739456, + "learning_rate": 0.0004960265298928, + "loss": 3.182807683944702, + "step": 3548, + "token_acc": 0.27887597351873333 + }, + { + "epoch": 2.0803283494576372, + "grad_norm": 0.36406411439377667, + "learning_rate": 0.0004960222259024608, + "loss": 3.1975419521331787, + "step": 3549, + "token_acc": 0.275740077326023 + }, + { + "epoch": 2.080914687774846, + "grad_norm": 0.3474623785513643, + "learning_rate": 0.0004960179196010743, + "loss": 3.159410238265991, + "step": 3550, + "token_acc": 0.28215285385521244 + }, + { + "epoch": 2.081501026092055, + "grad_norm": 0.32115604879709164, + "learning_rate": 0.0004960136109886811, + "loss": 3.1398887634277344, + "step": 3551, + "token_acc": 0.28241987058372836 + }, + { + "epoch": 2.082087364409264, + "grad_norm": 0.29639531791187373, + "learning_rate": 0.0004960093000653214, + "loss": 3.167410373687744, + "step": 3552, + "token_acc": 0.2817633710178929 + }, + { + "epoch": 2.0826737027264732, + "grad_norm": 0.3083883557202871, + "learning_rate": 0.0004960049868310359, + "loss": 3.1498327255249023, + "step": 3553, + "token_acc": 0.2834377138045366 + }, + { + "epoch": 2.0832600410436823, + "grad_norm": 0.3155841884946807, + "learning_rate": 0.000496000671285865, + "loss": 3.1379475593566895, + "step": 3554, + "token_acc": 0.2839919152669842 + }, + { + "epoch": 2.0838463793608915, + "grad_norm": 0.24933017086674397, + "learning_rate": 0.0004959963534298494, + "loss": 3.1477770805358887, + "step": 3555, + "token_acc": 0.28201478237065425 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.2934278805987186, + "learning_rate": 0.0004959920332630295, + "loss": 3.1574227809906006, + "step": 3556, + "token_acc": 0.28191513598559065 + }, + { + "epoch": 2.0850190559953092, + "grad_norm": 0.29783189289246326, + "learning_rate": 0.0004959877107854458, + "loss": 3.1660821437835693, + "step": 3557, + "token_acc": 0.2796514232842247 + }, + { + "epoch": 2.0856053943125183, + "grad_norm": 0.30523193007804816, + "learning_rate": 0.0004959833859971391, + "loss": 3.1543831825256348, + "step": 3558, + "token_acc": 0.28249522393547716 + }, + { + "epoch": 2.0861917326297275, + "grad_norm": 0.26259297331144965, + "learning_rate": 0.0004959790588981499, + "loss": 3.149477481842041, + "step": 3559, + "token_acc": 0.2828397608475228 + }, + { + "epoch": 2.0867780709469366, + "grad_norm": 0.28941539483323775, + "learning_rate": 0.000495974729488519, + "loss": 3.1689987182617188, + "step": 3560, + "token_acc": 0.2808373798946172 + }, + { + "epoch": 2.0873644092641452, + "grad_norm": 0.2795003281244116, + "learning_rate": 0.000495970397768287, + "loss": 3.1483707427978516, + "step": 3561, + "token_acc": 0.28325412386232596 + }, + { + "epoch": 2.0879507475813543, + "grad_norm": 0.2705318757566965, + "learning_rate": 0.0004959660637374945, + "loss": 3.1755082607269287, + "step": 3562, + "token_acc": 0.27968984141700537 + }, + { + "epoch": 2.0885370858985635, + "grad_norm": 0.25696692362425827, + "learning_rate": 0.0004959617273961822, + "loss": 3.164635181427002, + "step": 3563, + "token_acc": 0.2793328491468137 + }, + { + "epoch": 2.0891234242157726, + "grad_norm": 0.339522279747748, + "learning_rate": 0.0004959573887443911, + "loss": 3.1535964012145996, + "step": 3564, + "token_acc": 0.28317065407670683 + }, + { + "epoch": 2.0897097625329817, + "grad_norm": 0.46136784136540854, + "learning_rate": 0.0004959530477821615, + "loss": 3.2367866039276123, + "step": 3565, + "token_acc": 0.2708910190019234 + }, + { + "epoch": 2.0902961008501904, + "grad_norm": 0.28940370952792466, + "learning_rate": 0.0004959487045095347, + "loss": 3.06662917137146, + "step": 3566, + "token_acc": 0.29549919431384236 + }, + { + "epoch": 2.0908824391673995, + "grad_norm": 0.3596063211600425, + "learning_rate": 0.0004959443589265511, + "loss": 3.178691864013672, + "step": 3567, + "token_acc": 0.28048094429556286 + }, + { + "epoch": 2.0914687774846086, + "grad_norm": 0.3529460929843656, + "learning_rate": 0.0004959400110332517, + "loss": 3.179659128189087, + "step": 3568, + "token_acc": 0.2797587925685803 + }, + { + "epoch": 2.0920551158018177, + "grad_norm": 0.38039943810288573, + "learning_rate": 0.0004959356608296773, + "loss": 3.133793354034424, + "step": 3569, + "token_acc": 0.28552357778366516 + }, + { + "epoch": 2.092641454119027, + "grad_norm": 0.2592183517906237, + "learning_rate": 0.0004959313083158687, + "loss": 3.142136335372925, + "step": 3570, + "token_acc": 0.2843974802087154 + }, + { + "epoch": 2.093227792436236, + "grad_norm": 0.40800673727403725, + "learning_rate": 0.000495926953491867, + "loss": 3.1452178955078125, + "step": 3571, + "token_acc": 0.28454228266677845 + }, + { + "epoch": 2.0938141307534446, + "grad_norm": 0.5210142107830779, + "learning_rate": 0.0004959225963577129, + "loss": 3.1473183631896973, + "step": 3572, + "token_acc": 0.2847555819434699 + }, + { + "epoch": 2.0944004690706537, + "grad_norm": 0.4145348629738687, + "learning_rate": 0.0004959182369134473, + "loss": 3.1342897415161133, + "step": 3573, + "token_acc": 0.2858269808395051 + }, + { + "epoch": 2.094986807387863, + "grad_norm": 0.3583098050781187, + "learning_rate": 0.0004959138751591114, + "loss": 3.1593070030212402, + "step": 3574, + "token_acc": 0.2806715419886219 + }, + { + "epoch": 2.095573145705072, + "grad_norm": 0.4167269287556698, + "learning_rate": 0.000495909511094746, + "loss": 3.132384777069092, + "step": 3575, + "token_acc": 0.2853847525268456 + }, + { + "epoch": 2.096159484022281, + "grad_norm": 0.38581607980149635, + "learning_rate": 0.000495905144720392, + "loss": 3.1706948280334473, + "step": 3576, + "token_acc": 0.2801888092964837 + }, + { + "epoch": 2.0967458223394897, + "grad_norm": 0.3531191538732225, + "learning_rate": 0.0004959007760360905, + "loss": 3.1476900577545166, + "step": 3577, + "token_acc": 0.28522986372554215 + }, + { + "epoch": 2.097332160656699, + "grad_norm": 0.3232527685131376, + "learning_rate": 0.0004958964050418826, + "loss": 3.1262009143829346, + "step": 3578, + "token_acc": 0.2871987782578938 + }, + { + "epoch": 2.097918498973908, + "grad_norm": 0.3258137224259238, + "learning_rate": 0.0004958920317378094, + "loss": 3.2171719074249268, + "step": 3579, + "token_acc": 0.2717514244295644 + }, + { + "epoch": 2.098504837291117, + "grad_norm": 0.3594676970055056, + "learning_rate": 0.0004958876561239118, + "loss": 3.102546453475952, + "step": 3580, + "token_acc": 0.28985525833168746 + }, + { + "epoch": 2.099091175608326, + "grad_norm": 0.3204960573877294, + "learning_rate": 0.0004958832782002312, + "loss": 3.133686065673828, + "step": 3581, + "token_acc": 0.28564496743996237 + }, + { + "epoch": 2.099677513925535, + "grad_norm": 0.34211683648549973, + "learning_rate": 0.0004958788979668084, + "loss": 3.171351909637451, + "step": 3582, + "token_acc": 0.2789041022047329 + }, + { + "epoch": 2.100263852242744, + "grad_norm": 0.3110129179509117, + "learning_rate": 0.0004958745154236846, + "loss": 3.1835148334503174, + "step": 3583, + "token_acc": 0.278656575312802 + }, + { + "epoch": 2.100850190559953, + "grad_norm": 0.31979236952360013, + "learning_rate": 0.0004958701305709011, + "loss": 3.1338422298431396, + "step": 3584, + "token_acc": 0.28550406452699856 + }, + { + "epoch": 2.101436528877162, + "grad_norm": 0.2713446825020389, + "learning_rate": 0.0004958657434084992, + "loss": 3.0960206985473633, + "step": 3585, + "token_acc": 0.289884119710653 + }, + { + "epoch": 2.1020228671943713, + "grad_norm": 0.31386981074298553, + "learning_rate": 0.0004958613539365197, + "loss": 3.1683120727539062, + "step": 3586, + "token_acc": 0.2804749553168407 + }, + { + "epoch": 2.1026092055115804, + "grad_norm": 0.3113756281513712, + "learning_rate": 0.0004958569621550044, + "loss": 3.1868560314178467, + "step": 3587, + "token_acc": 0.27897110503192457 + }, + { + "epoch": 2.103195543828789, + "grad_norm": 0.28771240902428363, + "learning_rate": 0.0004958525680639939, + "loss": 3.1838254928588867, + "step": 3588, + "token_acc": 0.2782339283792303 + }, + { + "epoch": 2.103781882145998, + "grad_norm": 0.33582880463286785, + "learning_rate": 0.00049584817166353, + "loss": 3.1608667373657227, + "step": 3589, + "token_acc": 0.280810848609478 + }, + { + "epoch": 2.1043682204632073, + "grad_norm": 0.3113992649087185, + "learning_rate": 0.0004958437729536537, + "loss": 3.1745858192443848, + "step": 3590, + "token_acc": 0.2797252675357463 + }, + { + "epoch": 2.1049545587804164, + "grad_norm": 0.3189441803697222, + "learning_rate": 0.0004958393719344065, + "loss": 3.194584846496582, + "step": 3591, + "token_acc": 0.2774452516319225 + }, + { + "epoch": 2.1055408970976255, + "grad_norm": 0.3035321778415073, + "learning_rate": 0.0004958349686058297, + "loss": 3.209015130996704, + "step": 3592, + "token_acc": 0.2754370337671483 + }, + { + "epoch": 2.106127235414834, + "grad_norm": 0.31401803937374556, + "learning_rate": 0.0004958305629679646, + "loss": 3.13073992729187, + "step": 3593, + "token_acc": 0.28734924438138326 + }, + { + "epoch": 2.1067135737320433, + "grad_norm": 0.30112332551751264, + "learning_rate": 0.0004958261550208527, + "loss": 3.1646361351013184, + "step": 3594, + "token_acc": 0.2802009533053609 + }, + { + "epoch": 2.1072999120492524, + "grad_norm": 0.30834455995259646, + "learning_rate": 0.0004958217447645352, + "loss": 3.201669216156006, + "step": 3595, + "token_acc": 0.2763970073421518 + }, + { + "epoch": 2.1078862503664615, + "grad_norm": 0.27898882328719726, + "learning_rate": 0.0004958173321990537, + "loss": 3.1747212409973145, + "step": 3596, + "token_acc": 0.2801926070380276 + }, + { + "epoch": 2.1084725886836706, + "grad_norm": 0.2703488550011949, + "learning_rate": 0.0004958129173244497, + "loss": 3.1584572792053223, + "step": 3597, + "token_acc": 0.28158012828973417 + }, + { + "epoch": 2.1090589270008797, + "grad_norm": 0.3623812975691925, + "learning_rate": 0.0004958085001407644, + "loss": 3.1583163738250732, + "step": 3598, + "token_acc": 0.28201280723629985 + }, + { + "epoch": 2.1096452653180884, + "grad_norm": 0.3360966740027526, + "learning_rate": 0.0004958040806480397, + "loss": 3.2056188583374023, + "step": 3599, + "token_acc": 0.27424639920362515 + }, + { + "epoch": 2.1102316036352975, + "grad_norm": 0.30144084590852394, + "learning_rate": 0.0004957996588463167, + "loss": 3.173847198486328, + "step": 3600, + "token_acc": 0.27989988316185643 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.30611084384579584, + "learning_rate": 0.0004957952347356371, + "loss": 3.1444382667541504, + "step": 3601, + "token_acc": 0.28286195722618274 + }, + { + "epoch": 2.1114042802697157, + "grad_norm": 0.31972592019080787, + "learning_rate": 0.0004957908083160426, + "loss": 3.113243818283081, + "step": 3602, + "token_acc": 0.2865525672371638 + }, + { + "epoch": 2.111990618586925, + "grad_norm": 0.3157037691374511, + "learning_rate": 0.0004957863795875747, + "loss": 3.1432249546051025, + "step": 3603, + "token_acc": 0.28469477923094266 + }, + { + "epoch": 2.1125769569041335, + "grad_norm": 0.3171980833511124, + "learning_rate": 0.0004957819485502748, + "loss": 3.095344066619873, + "step": 3604, + "token_acc": 0.291866167322451 + }, + { + "epoch": 2.1131632952213426, + "grad_norm": 0.3839340606490749, + "learning_rate": 0.0004957775152041848, + "loss": 3.1960833072662354, + "step": 3605, + "token_acc": 0.2798559856952234 + }, + { + "epoch": 2.1137496335385517, + "grad_norm": 0.27954622521272043, + "learning_rate": 0.0004957730795493463, + "loss": 3.1282520294189453, + "step": 3606, + "token_acc": 0.2867455249322326 + }, + { + "epoch": 2.114335971855761, + "grad_norm": 0.3793961575328709, + "learning_rate": 0.0004957686415858008, + "loss": 3.144864082336426, + "step": 3607, + "token_acc": 0.2828337634151787 + }, + { + "epoch": 2.11492231017297, + "grad_norm": 0.3101020668560908, + "learning_rate": 0.0004957642013135901, + "loss": 3.1440272331237793, + "step": 3608, + "token_acc": 0.28453413267672983 + }, + { + "epoch": 2.115508648490179, + "grad_norm": 0.27413482513032816, + "learning_rate": 0.0004957597587327559, + "loss": 3.145477771759033, + "step": 3609, + "token_acc": 0.2833999958227161 + }, + { + "epoch": 2.1160949868073877, + "grad_norm": 0.3080450288299567, + "learning_rate": 0.00049575531384334, + "loss": 3.1852211952209473, + "step": 3610, + "token_acc": 0.2797388689876629 + }, + { + "epoch": 2.116681325124597, + "grad_norm": 0.3006427922976153, + "learning_rate": 0.0004957508666453839, + "loss": 3.1854848861694336, + "step": 3611, + "token_acc": 0.27715569236996346 + }, + { + "epoch": 2.117267663441806, + "grad_norm": 0.2826035035745967, + "learning_rate": 0.0004957464171389298, + "loss": 3.1420111656188965, + "step": 3612, + "token_acc": 0.28245700399290496 + }, + { + "epoch": 2.117854001759015, + "grad_norm": 0.2986819921812465, + "learning_rate": 0.0004957419653240191, + "loss": 3.1110424995422363, + "step": 3613, + "token_acc": 0.2881288391005592 + }, + { + "epoch": 2.118440340076224, + "grad_norm": 0.33872078718671045, + "learning_rate": 0.0004957375112006939, + "loss": 3.1691362857818604, + "step": 3614, + "token_acc": 0.2817671102385683 + }, + { + "epoch": 2.119026678393433, + "grad_norm": 0.3128798013303021, + "learning_rate": 0.0004957330547689958, + "loss": 3.1248011589050293, + "step": 3615, + "token_acc": 0.2872706167171687 + }, + { + "epoch": 2.119613016710642, + "grad_norm": 0.2683150275348664, + "learning_rate": 0.0004957285960289668, + "loss": 3.1205973625183105, + "step": 3616, + "token_acc": 0.28713031255301447 + }, + { + "epoch": 2.120199355027851, + "grad_norm": 0.37653159990061064, + "learning_rate": 0.0004957241349806487, + "loss": 3.190948247909546, + "step": 3617, + "token_acc": 0.27640197118083715 + }, + { + "epoch": 2.12078569334506, + "grad_norm": 0.37129927110371946, + "learning_rate": 0.0004957196716240836, + "loss": 3.159011125564575, + "step": 3618, + "token_acc": 0.284125786163522 + }, + { + "epoch": 2.1213720316622693, + "grad_norm": 0.28785420549461566, + "learning_rate": 0.0004957152059593133, + "loss": 3.1618053913116455, + "step": 3619, + "token_acc": 0.280343362025396 + }, + { + "epoch": 2.121958369979478, + "grad_norm": 0.3273046544280014, + "learning_rate": 0.0004957107379863797, + "loss": 3.1540026664733887, + "step": 3620, + "token_acc": 0.28354251697083765 + }, + { + "epoch": 2.122544708296687, + "grad_norm": 0.3766075653031307, + "learning_rate": 0.0004957062677053248, + "loss": 3.1884827613830566, + "step": 3621, + "token_acc": 0.2775796556921183 + }, + { + "epoch": 2.123131046613896, + "grad_norm": 0.3484366317028475, + "learning_rate": 0.0004957017951161906, + "loss": 3.1318821907043457, + "step": 3622, + "token_acc": 0.28481793923603815 + }, + { + "epoch": 2.1237173849311053, + "grad_norm": 0.31774014289134156, + "learning_rate": 0.0004956973202190192, + "loss": 3.158658742904663, + "step": 3623, + "token_acc": 0.28263547583279985 + }, + { + "epoch": 2.1243037232483144, + "grad_norm": 0.3191124932942135, + "learning_rate": 0.0004956928430138525, + "loss": 3.150526523590088, + "step": 3624, + "token_acc": 0.2831809827093194 + }, + { + "epoch": 2.1248900615655235, + "grad_norm": 0.29894313388811217, + "learning_rate": 0.0004956883635007325, + "loss": 3.1347312927246094, + "step": 3625, + "token_acc": 0.2857051150026347 + }, + { + "epoch": 2.125476399882732, + "grad_norm": 0.31653078698668086, + "learning_rate": 0.0004956838816797016, + "loss": 3.1349759101867676, + "step": 3626, + "token_acc": 0.28595034970409655 + }, + { + "epoch": 2.1260627381999413, + "grad_norm": 0.34848202015218516, + "learning_rate": 0.0004956793975508016, + "loss": 3.0926027297973633, + "step": 3627, + "token_acc": 0.2918586707674923 + }, + { + "epoch": 2.1266490765171504, + "grad_norm": 0.31897901469524426, + "learning_rate": 0.0004956749111140747, + "loss": 3.144193649291992, + "step": 3628, + "token_acc": 0.2845022118656995 + }, + { + "epoch": 2.1272354148343595, + "grad_norm": 0.29412849638164157, + "learning_rate": 0.0004956704223695631, + "loss": 3.140152931213379, + "step": 3629, + "token_acc": 0.2841451329932569 + }, + { + "epoch": 2.1278217531515686, + "grad_norm": 0.3090549375247492, + "learning_rate": 0.0004956659313173089, + "loss": 3.1677536964416504, + "step": 3630, + "token_acc": 0.2803928138238052 + }, + { + "epoch": 2.1284080914687773, + "grad_norm": 0.32390791911403166, + "learning_rate": 0.0004956614379573543, + "loss": 3.1762261390686035, + "step": 3631, + "token_acc": 0.27862625313283207 + }, + { + "epoch": 2.1289944297859864, + "grad_norm": 0.29128490360247594, + "learning_rate": 0.0004956569422897416, + "loss": 3.148764133453369, + "step": 3632, + "token_acc": 0.28232832844152705 + }, + { + "epoch": 2.1295807681031955, + "grad_norm": 0.2579340724253643, + "learning_rate": 0.0004956524443145129, + "loss": 3.136120557785034, + "step": 3633, + "token_acc": 0.28262366462457594 + }, + { + "epoch": 2.1301671064204046, + "grad_norm": 0.2541766674452288, + "learning_rate": 0.0004956479440317104, + "loss": 3.13716459274292, + "step": 3634, + "token_acc": 0.285727084914411 + }, + { + "epoch": 2.1307534447376137, + "grad_norm": 0.29750740544822263, + "learning_rate": 0.0004956434414413767, + "loss": 3.155475378036499, + "step": 3635, + "token_acc": 0.28257027226481146 + }, + { + "epoch": 2.1313397830548224, + "grad_norm": 0.406233511747262, + "learning_rate": 0.0004956389365435537, + "loss": 3.163100004196167, + "step": 3636, + "token_acc": 0.2826364904797526 + }, + { + "epoch": 2.1319261213720315, + "grad_norm": 0.35022637861431155, + "learning_rate": 0.000495634429338284, + "loss": 3.1594338417053223, + "step": 3637, + "token_acc": 0.28154511651490505 + }, + { + "epoch": 2.1325124596892406, + "grad_norm": 0.28957473393967376, + "learning_rate": 0.0004956299198256098, + "loss": 3.17781925201416, + "step": 3638, + "token_acc": 0.27970177906930876 + }, + { + "epoch": 2.1330987980064497, + "grad_norm": 0.3284659685381087, + "learning_rate": 0.0004956254080055735, + "loss": 3.1698901653289795, + "step": 3639, + "token_acc": 0.2795845601145296 + }, + { + "epoch": 2.133685136323659, + "grad_norm": 0.3265616404479127, + "learning_rate": 0.0004956208938782174, + "loss": 3.2044849395751953, + "step": 3640, + "token_acc": 0.2752738268444388 + }, + { + "epoch": 2.134271474640868, + "grad_norm": 0.33145488900557457, + "learning_rate": 0.0004956163774435841, + "loss": 3.166576862335205, + "step": 3641, + "token_acc": 0.28192547915008265 + }, + { + "epoch": 2.1348578129580766, + "grad_norm": 0.2888325118426882, + "learning_rate": 0.0004956118587017159, + "loss": 3.167180299758911, + "step": 3642, + "token_acc": 0.28059656610472616 + }, + { + "epoch": 2.1354441512752858, + "grad_norm": 0.3969248172650549, + "learning_rate": 0.0004956073376526551, + "loss": 3.18566632270813, + "step": 3643, + "token_acc": 0.2798255897352189 + }, + { + "epoch": 2.136030489592495, + "grad_norm": 0.3873923896612051, + "learning_rate": 0.0004956028142964444, + "loss": 3.1877474784851074, + "step": 3644, + "token_acc": 0.2789083533482386 + }, + { + "epoch": 2.136616827909704, + "grad_norm": 0.35174316657122257, + "learning_rate": 0.0004955982886331263, + "loss": 3.1825504302978516, + "step": 3645, + "token_acc": 0.27903904027174514 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.30333174330031526, + "learning_rate": 0.0004955937606627432, + "loss": 3.098595142364502, + "step": 3646, + "token_acc": 0.2880032909181763 + }, + { + "epoch": 2.1377895045441218, + "grad_norm": 0.3190452105466988, + "learning_rate": 0.0004955892303853376, + "loss": 3.191960573196411, + "step": 3647, + "token_acc": 0.27879788037741715 + }, + { + "epoch": 2.138375842861331, + "grad_norm": 0.3139275879740007, + "learning_rate": 0.0004955846978009522, + "loss": 3.1175947189331055, + "step": 3648, + "token_acc": 0.2869186872544598 + }, + { + "epoch": 2.13896218117854, + "grad_norm": 0.310624996255693, + "learning_rate": 0.0004955801629096294, + "loss": 3.195763111114502, + "step": 3649, + "token_acc": 0.2779277853298821 + }, + { + "epoch": 2.139548519495749, + "grad_norm": 0.33129846586246103, + "learning_rate": 0.0004955756257114119, + "loss": 3.132993698120117, + "step": 3650, + "token_acc": 0.2857432741357964 + }, + { + "epoch": 2.140134857812958, + "grad_norm": 0.3052242331281519, + "learning_rate": 0.0004955710862063423, + "loss": 3.153242349624634, + "step": 3651, + "token_acc": 0.2842799604593446 + }, + { + "epoch": 2.1407211961301673, + "grad_norm": 0.28661084091543776, + "learning_rate": 0.0004955665443944633, + "loss": 3.159463405609131, + "step": 3652, + "token_acc": 0.28187495725101674 + }, + { + "epoch": 2.141307534447376, + "grad_norm": 0.3010641563739908, + "learning_rate": 0.0004955620002758175, + "loss": 3.2164103984832764, + "step": 3653, + "token_acc": 0.27332953355513434 + }, + { + "epoch": 2.141893872764585, + "grad_norm": 0.30302134501089634, + "learning_rate": 0.0004955574538504477, + "loss": 3.1384472846984863, + "step": 3654, + "token_acc": 0.28503380145668183 + }, + { + "epoch": 2.142480211081794, + "grad_norm": 0.33286901032015953, + "learning_rate": 0.0004955529051183965, + "loss": 3.1714746952056885, + "step": 3655, + "token_acc": 0.2805121617078079 + }, + { + "epoch": 2.1430665493990033, + "grad_norm": 0.32146206065890054, + "learning_rate": 0.0004955483540797065, + "loss": 3.108811378479004, + "step": 3656, + "token_acc": 0.2895592898943996 + }, + { + "epoch": 2.1436528877162124, + "grad_norm": 0.2963711262028485, + "learning_rate": 0.0004955438007344207, + "loss": 3.1212687492370605, + "step": 3657, + "token_acc": 0.28726837675744255 + }, + { + "epoch": 2.144239226033421, + "grad_norm": 0.2882092903446034, + "learning_rate": 0.0004955392450825818, + "loss": 3.17252254486084, + "step": 3658, + "token_acc": 0.2790808404927026 + }, + { + "epoch": 2.14482556435063, + "grad_norm": 0.2964099986458252, + "learning_rate": 0.0004955346871242325, + "loss": 3.1505658626556396, + "step": 3659, + "token_acc": 0.28257402079198274 + }, + { + "epoch": 2.1454119026678393, + "grad_norm": 0.36338370262844644, + "learning_rate": 0.0004955301268594157, + "loss": 3.1737778186798096, + "step": 3660, + "token_acc": 0.2797064514842004 + }, + { + "epoch": 2.1459982409850484, + "grad_norm": 0.38216449775785266, + "learning_rate": 0.000495525564288174, + "loss": 3.1325647830963135, + "step": 3661, + "token_acc": 0.2842759416738011 + }, + { + "epoch": 2.1465845793022575, + "grad_norm": 0.3561807879860291, + "learning_rate": 0.0004955209994105507, + "loss": 3.1667537689208984, + "step": 3662, + "token_acc": 0.2819394716311032 + }, + { + "epoch": 2.1471709176194667, + "grad_norm": 0.2892664423731742, + "learning_rate": 0.0004955164322265885, + "loss": 3.1846001148223877, + "step": 3663, + "token_acc": 0.27947591359941065 + }, + { + "epoch": 2.1477572559366753, + "grad_norm": 0.3561720125443651, + "learning_rate": 0.0004955118627363302, + "loss": 3.1264169216156006, + "step": 3664, + "token_acc": 0.2866172572183701 + }, + { + "epoch": 2.1483435942538844, + "grad_norm": 0.29798665874086766, + "learning_rate": 0.0004955072909398187, + "loss": 3.1765551567077637, + "step": 3665, + "token_acc": 0.28098311347485944 + }, + { + "epoch": 2.1489299325710935, + "grad_norm": 0.2903833020835021, + "learning_rate": 0.0004955027168370972, + "loss": 3.115173101425171, + "step": 3666, + "token_acc": 0.2882780306584336 + }, + { + "epoch": 2.1495162708883027, + "grad_norm": 0.33147753924870615, + "learning_rate": 0.0004954981404282083, + "loss": 3.1446080207824707, + "step": 3667, + "token_acc": 0.2827679966311149 + }, + { + "epoch": 2.1501026092055118, + "grad_norm": 0.2694529552442061, + "learning_rate": 0.0004954935617131952, + "loss": 3.1314547061920166, + "step": 3668, + "token_acc": 0.2857307487182786 + }, + { + "epoch": 2.1506889475227204, + "grad_norm": 0.29964610844254486, + "learning_rate": 0.000495488980692101, + "loss": 3.129884719848633, + "step": 3669, + "token_acc": 0.28650484552706157 + }, + { + "epoch": 2.1512752858399296, + "grad_norm": 0.29408832695989184, + "learning_rate": 0.0004954843973649686, + "loss": 3.211188793182373, + "step": 3670, + "token_acc": 0.27340598610644834 + }, + { + "epoch": 2.1518616241571387, + "grad_norm": 0.31215013273479175, + "learning_rate": 0.0004954798117318411, + "loss": 3.1252026557922363, + "step": 3671, + "token_acc": 0.28516013734245477 + }, + { + "epoch": 2.1524479624743478, + "grad_norm": 0.26842706701612173, + "learning_rate": 0.0004954752237927614, + "loss": 3.158066511154175, + "step": 3672, + "token_acc": 0.2811730129230237 + }, + { + "epoch": 2.153034300791557, + "grad_norm": 0.26417922558178397, + "learning_rate": 0.000495470633547773, + "loss": 3.1407082080841064, + "step": 3673, + "token_acc": 0.2846216008935603 + }, + { + "epoch": 2.1536206391087656, + "grad_norm": 0.3318790351535135, + "learning_rate": 0.0004954660409969186, + "loss": 3.209529399871826, + "step": 3674, + "token_acc": 0.275680908348375 + }, + { + "epoch": 2.1542069774259747, + "grad_norm": 0.29411145006909367, + "learning_rate": 0.0004954614461402416, + "loss": 3.1204981803894043, + "step": 3675, + "token_acc": 0.2886117310614607 + }, + { + "epoch": 2.154793315743184, + "grad_norm": 0.24748749091675695, + "learning_rate": 0.000495456848977785, + "loss": 3.143026351928711, + "step": 3676, + "token_acc": 0.28393547427891763 + }, + { + "epoch": 2.155379654060393, + "grad_norm": 0.33180542942558133, + "learning_rate": 0.0004954522495095921, + "loss": 3.1502833366394043, + "step": 3677, + "token_acc": 0.2821055520661244 + }, + { + "epoch": 2.155965992377602, + "grad_norm": 0.34582967522351293, + "learning_rate": 0.0004954476477357061, + "loss": 3.104684829711914, + "step": 3678, + "token_acc": 0.288200235874456 + }, + { + "epoch": 2.1565523306948107, + "grad_norm": 0.27681622091184327, + "learning_rate": 0.0004954430436561702, + "loss": 3.1860389709472656, + "step": 3679, + "token_acc": 0.2790292928704258 + }, + { + "epoch": 2.15713866901202, + "grad_norm": 0.31460931294414396, + "learning_rate": 0.0004954384372710275, + "loss": 3.165003776550293, + "step": 3680, + "token_acc": 0.2825240522162637 + }, + { + "epoch": 2.157725007329229, + "grad_norm": 0.3229117914365331, + "learning_rate": 0.0004954338285803216, + "loss": 3.139177083969116, + "step": 3681, + "token_acc": 0.2853806433401064 + }, + { + "epoch": 2.158311345646438, + "grad_norm": 0.30192005178031267, + "learning_rate": 0.0004954292175840955, + "loss": 3.162809371948242, + "step": 3682, + "token_acc": 0.28244804723621514 + }, + { + "epoch": 2.158897683963647, + "grad_norm": 0.33006931262480044, + "learning_rate": 0.0004954246042823926, + "loss": 3.2366414070129395, + "step": 3683, + "token_acc": 0.27187196118334656 + }, + { + "epoch": 2.1594840222808562, + "grad_norm": 0.2538175543380687, + "learning_rate": 0.0004954199886752564, + "loss": 3.160055637359619, + "step": 3684, + "token_acc": 0.283541296555204 + }, + { + "epoch": 2.160070360598065, + "grad_norm": 0.3458549398184862, + "learning_rate": 0.00049541537076273, + "loss": 3.1605958938598633, + "step": 3685, + "token_acc": 0.28016138909903093 + }, + { + "epoch": 2.160656698915274, + "grad_norm": 0.2440113929363253, + "learning_rate": 0.000495410750544857, + "loss": 3.1378602981567383, + "step": 3686, + "token_acc": 0.2850267901146717 + }, + { + "epoch": 2.161243037232483, + "grad_norm": 0.2855754573489357, + "learning_rate": 0.0004954061280216805, + "loss": 3.1416077613830566, + "step": 3687, + "token_acc": 0.28433821948104704 + }, + { + "epoch": 2.1618293755496922, + "grad_norm": 0.2979997155246482, + "learning_rate": 0.0004954015031932443, + "loss": 3.1882548332214355, + "step": 3688, + "token_acc": 0.27819883630109626 + }, + { + "epoch": 2.1624157138669013, + "grad_norm": 0.3266638132185164, + "learning_rate": 0.0004953968760595916, + "loss": 3.1561222076416016, + "step": 3689, + "token_acc": 0.2811039802561945 + }, + { + "epoch": 2.16300205218411, + "grad_norm": 0.3545013440893388, + "learning_rate": 0.0004953922466207659, + "loss": 3.111231803894043, + "step": 3690, + "token_acc": 0.2881792887347191 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.3475740902123315, + "learning_rate": 0.0004953876148768106, + "loss": 3.153034210205078, + "step": 3691, + "token_acc": 0.2848911299924376 + }, + { + "epoch": 2.1641747288185282, + "grad_norm": 0.31686603016173087, + "learning_rate": 0.0004953829808277695, + "loss": 3.1761345863342285, + "step": 3692, + "token_acc": 0.27841218343454177 + }, + { + "epoch": 2.1647610671357373, + "grad_norm": 0.2851270577375636, + "learning_rate": 0.0004953783444736859, + "loss": 3.155456304550171, + "step": 3693, + "token_acc": 0.282326879972736 + }, + { + "epoch": 2.1653474054529465, + "grad_norm": 0.2974155595548568, + "learning_rate": 0.0004953737058146034, + "loss": 3.1702611446380615, + "step": 3694, + "token_acc": 0.27938745683355537 + }, + { + "epoch": 2.1659337437701556, + "grad_norm": 0.2904824480784424, + "learning_rate": 0.0004953690648505656, + "loss": 3.174433708190918, + "step": 3695, + "token_acc": 0.2808895453128434 + }, + { + "epoch": 2.1665200820873642, + "grad_norm": 0.313715527111614, + "learning_rate": 0.0004953644215816161, + "loss": 3.150599479675293, + "step": 3696, + "token_acc": 0.28154788191512164 + }, + { + "epoch": 2.1671064204045734, + "grad_norm": 0.2592681018806169, + "learning_rate": 0.0004953597760077984, + "loss": 3.153822422027588, + "step": 3697, + "token_acc": 0.28162820899634006 + }, + { + "epoch": 2.1676927587217825, + "grad_norm": 0.26690029731819465, + "learning_rate": 0.0004953551281291563, + "loss": 3.121373176574707, + "step": 3698, + "token_acc": 0.286613313932339 + }, + { + "epoch": 2.1682790970389916, + "grad_norm": 0.30545037568889716, + "learning_rate": 0.0004953504779457334, + "loss": 3.119475841522217, + "step": 3699, + "token_acc": 0.28839900026732207 + }, + { + "epoch": 2.1688654353562007, + "grad_norm": 0.33656475838299044, + "learning_rate": 0.0004953458254575733, + "loss": 3.186060667037964, + "step": 3700, + "token_acc": 0.27927676229886267 + }, + { + "epoch": 2.1694517736734094, + "grad_norm": 0.2808184222321881, + "learning_rate": 0.0004953411706647198, + "loss": 3.1159610748291016, + "step": 3701, + "token_acc": 0.28839881277006335 + }, + { + "epoch": 2.1700381119906185, + "grad_norm": 0.3174823490833614, + "learning_rate": 0.0004953365135672166, + "loss": 3.106145143508911, + "step": 3702, + "token_acc": 0.2878886060325542 + }, + { + "epoch": 2.1706244503078276, + "grad_norm": 0.3148665330157047, + "learning_rate": 0.0004953318541651075, + "loss": 3.1284549236297607, + "step": 3703, + "token_acc": 0.2859016616618713 + }, + { + "epoch": 2.1712107886250367, + "grad_norm": 0.29801451371004684, + "learning_rate": 0.0004953271924584363, + "loss": 3.1831448078155518, + "step": 3704, + "token_acc": 0.2776782035985811 + }, + { + "epoch": 2.171797126942246, + "grad_norm": 0.3153530175579118, + "learning_rate": 0.0004953225284472465, + "loss": 3.1711905002593994, + "step": 3705, + "token_acc": 0.2803914894066175 + }, + { + "epoch": 2.172383465259455, + "grad_norm": 0.312192547400296, + "learning_rate": 0.0004953178621315823, + "loss": 3.186277151107788, + "step": 3706, + "token_acc": 0.2789642636530263 + }, + { + "epoch": 2.1729698035766636, + "grad_norm": 0.3501295826059464, + "learning_rate": 0.0004953131935114873, + "loss": 3.1341817378997803, + "step": 3707, + "token_acc": 0.28559234684017265 + }, + { + "epoch": 2.1735561418938727, + "grad_norm": 0.3506519200532044, + "learning_rate": 0.0004953085225870053, + "loss": 3.159444808959961, + "step": 3708, + "token_acc": 0.281260631719922 + }, + { + "epoch": 2.174142480211082, + "grad_norm": 0.29817563054121676, + "learning_rate": 0.0004953038493581804, + "loss": 3.1596174240112305, + "step": 3709, + "token_acc": 0.28222528650150547 + }, + { + "epoch": 2.174728818528291, + "grad_norm": 0.3066593369214165, + "learning_rate": 0.0004952991738250564, + "loss": 3.136115789413452, + "step": 3710, + "token_acc": 0.28355320085619296 + }, + { + "epoch": 2.1753151568455, + "grad_norm": 0.3370444403059494, + "learning_rate": 0.0004952944959876772, + "loss": 3.1707992553710938, + "step": 3711, + "token_acc": 0.2805408395175644 + }, + { + "epoch": 2.1759014951627087, + "grad_norm": 0.37391010393470003, + "learning_rate": 0.0004952898158460866, + "loss": 3.204577922821045, + "step": 3712, + "token_acc": 0.2759864613249153 + }, + { + "epoch": 2.176487833479918, + "grad_norm": 0.2657830348902625, + "learning_rate": 0.0004952851334003289, + "loss": 3.1445658206939697, + "step": 3713, + "token_acc": 0.2854460935599124 + }, + { + "epoch": 2.177074171797127, + "grad_norm": 0.25971227931948165, + "learning_rate": 0.0004952804486504478, + "loss": 3.146808624267578, + "step": 3714, + "token_acc": 0.28507834025578815 + }, + { + "epoch": 2.177660510114336, + "grad_norm": 0.2786292892731512, + "learning_rate": 0.0004952757615964875, + "loss": 3.1489758491516113, + "step": 3715, + "token_acc": 0.281100466316506 + }, + { + "epoch": 2.178246848431545, + "grad_norm": 0.2362994299496863, + "learning_rate": 0.0004952710722384918, + "loss": 3.1710357666015625, + "step": 3716, + "token_acc": 0.281224426067106 + }, + { + "epoch": 2.1788331867487543, + "grad_norm": 0.2532047515639838, + "learning_rate": 0.000495266380576505, + "loss": 3.1554884910583496, + "step": 3717, + "token_acc": 0.28152197425099007 + }, + { + "epoch": 2.179419525065963, + "grad_norm": 0.22986122855703853, + "learning_rate": 0.000495261686610571, + "loss": 3.1227917671203613, + "step": 3718, + "token_acc": 0.28741014559915506 + }, + { + "epoch": 2.180005863383172, + "grad_norm": 0.2634098691600834, + "learning_rate": 0.0004952569903407339, + "loss": 3.139397382736206, + "step": 3719, + "token_acc": 0.28651888962409805 + }, + { + "epoch": 2.180592201700381, + "grad_norm": 0.2837643155890122, + "learning_rate": 0.000495252291767038, + "loss": 3.1543872356414795, + "step": 3720, + "token_acc": 0.2821493210897039 + }, + { + "epoch": 2.1811785400175903, + "grad_norm": 0.2546826819111413, + "learning_rate": 0.0004952475908895272, + "loss": 3.147578239440918, + "step": 3721, + "token_acc": 0.2822189312258654 + }, + { + "epoch": 2.1817648783347994, + "grad_norm": 0.2810274487636963, + "learning_rate": 0.0004952428877082458, + "loss": 3.1817545890808105, + "step": 3722, + "token_acc": 0.27998364720156144 + }, + { + "epoch": 2.182351216652008, + "grad_norm": 0.31562468947121036, + "learning_rate": 0.0004952381822232379, + "loss": 3.2187581062316895, + "step": 3723, + "token_acc": 0.2741215223948102 + }, + { + "epoch": 2.182937554969217, + "grad_norm": 0.3426951638733073, + "learning_rate": 0.0004952334744345477, + "loss": 3.170959234237671, + "step": 3724, + "token_acc": 0.28076423425669556 + }, + { + "epoch": 2.1835238932864263, + "grad_norm": 0.3015856908724199, + "learning_rate": 0.0004952287643422195, + "loss": 3.151789426803589, + "step": 3725, + "token_acc": 0.2846478270290839 + }, + { + "epoch": 2.1841102316036354, + "grad_norm": 0.3328134386967711, + "learning_rate": 0.0004952240519462976, + "loss": 3.1855974197387695, + "step": 3726, + "token_acc": 0.27759027219897964 + }, + { + "epoch": 2.1846965699208445, + "grad_norm": 0.33849486866404144, + "learning_rate": 0.0004952193372468261, + "loss": 3.182040214538574, + "step": 3727, + "token_acc": 0.2775441923005208 + }, + { + "epoch": 2.185282908238053, + "grad_norm": 0.326255786965002, + "learning_rate": 0.0004952146202438493, + "loss": 3.1655569076538086, + "step": 3728, + "token_acc": 0.2818119105391145 + }, + { + "epoch": 2.1858692465552623, + "grad_norm": 0.3073557443291841, + "learning_rate": 0.0004952099009374117, + "loss": 3.19307804107666, + "step": 3729, + "token_acc": 0.27523539643349954 + }, + { + "epoch": 2.1864555848724714, + "grad_norm": 0.2845277701116671, + "learning_rate": 0.0004952051793275574, + "loss": 3.1658334732055664, + "step": 3730, + "token_acc": 0.2810134601776468 + }, + { + "epoch": 2.1870419231896805, + "grad_norm": 0.33074208237224484, + "learning_rate": 0.0004952004554143309, + "loss": 3.1735787391662598, + "step": 3731, + "token_acc": 0.28233142719688603 + }, + { + "epoch": 2.1876282615068896, + "grad_norm": 0.3300127189580756, + "learning_rate": 0.0004951957291977766, + "loss": 3.169307231903076, + "step": 3732, + "token_acc": 0.2797914604166187 + }, + { + "epoch": 2.1882145998240983, + "grad_norm": 0.3160419873532794, + "learning_rate": 0.0004951910006779388, + "loss": 3.1577887535095215, + "step": 3733, + "token_acc": 0.2821172662362498 + }, + { + "epoch": 2.1888009381413074, + "grad_norm": 0.3296306369355759, + "learning_rate": 0.000495186269854862, + "loss": 3.1503286361694336, + "step": 3734, + "token_acc": 0.2838889644500445 + }, + { + "epoch": 2.1893872764585165, + "grad_norm": 0.38021542726359653, + "learning_rate": 0.0004951815367285904, + "loss": 3.2131733894348145, + "step": 3735, + "token_acc": 0.2740344610375639 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.3160132516976753, + "learning_rate": 0.0004951768012991688, + "loss": 3.1848201751708984, + "step": 3736, + "token_acc": 0.2801549263486749 + }, + { + "epoch": 2.1905599530929347, + "grad_norm": 0.2912601084496391, + "learning_rate": 0.0004951720635666414, + "loss": 3.167044162750244, + "step": 3737, + "token_acc": 0.27999295882205233 + }, + { + "epoch": 2.191146291410144, + "grad_norm": 0.34282978128086616, + "learning_rate": 0.000495167323531053, + "loss": 3.144331216812134, + "step": 3738, + "token_acc": 0.2844745248278131 + }, + { + "epoch": 2.1917326297273525, + "grad_norm": 0.2827534041211752, + "learning_rate": 0.0004951625811924478, + "loss": 3.2110142707824707, + "step": 3739, + "token_acc": 0.2739920698971894 + }, + { + "epoch": 2.1923189680445616, + "grad_norm": 0.2704330668198203, + "learning_rate": 0.0004951578365508707, + "loss": 3.185276508331299, + "step": 3740, + "token_acc": 0.27909842149537334 + }, + { + "epoch": 2.1929053063617707, + "grad_norm": 0.2670792704998593, + "learning_rate": 0.0004951530896063659, + "loss": 3.176053524017334, + "step": 3741, + "token_acc": 0.28112527781640895 + }, + { + "epoch": 2.19349164467898, + "grad_norm": 0.2547878074988725, + "learning_rate": 0.0004951483403589782, + "loss": 3.1665143966674805, + "step": 3742, + "token_acc": 0.2799359641651683 + }, + { + "epoch": 2.194077982996189, + "grad_norm": 0.3274318425481537, + "learning_rate": 0.0004951435888087523, + "loss": 3.1828558444976807, + "step": 3743, + "token_acc": 0.279443785150405 + }, + { + "epoch": 2.1946643213133976, + "grad_norm": 0.32608385307928234, + "learning_rate": 0.0004951388349557326, + "loss": 3.162421226501465, + "step": 3744, + "token_acc": 0.28068263418161815 + }, + { + "epoch": 2.1952506596306067, + "grad_norm": 0.3512511174436906, + "learning_rate": 0.000495134078799964, + "loss": 3.1768875122070312, + "step": 3745, + "token_acc": 0.2787925761447903 + }, + { + "epoch": 2.195836997947816, + "grad_norm": 0.34682021229657245, + "learning_rate": 0.0004951293203414909, + "loss": 3.2027292251586914, + "step": 3746, + "token_acc": 0.2761715952112964 + }, + { + "epoch": 2.196423336265025, + "grad_norm": 0.3078646210905105, + "learning_rate": 0.0004951245595803583, + "loss": 3.190337657928467, + "step": 3747, + "token_acc": 0.2784136327301795 + }, + { + "epoch": 2.197009674582234, + "grad_norm": 0.3004387191477506, + "learning_rate": 0.0004951197965166106, + "loss": 3.1988492012023926, + "step": 3748, + "token_acc": 0.2755731482599497 + }, + { + "epoch": 2.197596012899443, + "grad_norm": 0.31047297484930964, + "learning_rate": 0.0004951150311502929, + "loss": 3.1783127784729004, + "step": 3749, + "token_acc": 0.2801631234122209 + }, + { + "epoch": 2.198182351216652, + "grad_norm": 0.3562918497362142, + "learning_rate": 0.0004951102634814497, + "loss": 3.1921141147613525, + "step": 3750, + "token_acc": 0.27772101306001346 + }, + { + "epoch": 2.198768689533861, + "grad_norm": 0.30863450537647635, + "learning_rate": 0.0004951054935101258, + "loss": 3.127758502960205, + "step": 3751, + "token_acc": 0.2870434960201367 + }, + { + "epoch": 2.19935502785107, + "grad_norm": 0.29814326466489566, + "learning_rate": 0.0004951007212363661, + "loss": 3.0999693870544434, + "step": 3752, + "token_acc": 0.28932952007669 + }, + { + "epoch": 2.199941366168279, + "grad_norm": 0.3106553178204928, + "learning_rate": 0.0004950959466602155, + "loss": 3.191160202026367, + "step": 3753, + "token_acc": 0.2771539044536077 + }, + { + "epoch": 2.2005277044854883, + "grad_norm": 0.2886387501880631, + "learning_rate": 0.0004950911697817187, + "loss": 3.187046527862549, + "step": 3754, + "token_acc": 0.2784452819411792 + }, + { + "epoch": 2.201114042802697, + "grad_norm": 0.3161131109884263, + "learning_rate": 0.0004950863906009206, + "loss": 3.113492488861084, + "step": 3755, + "token_acc": 0.28867689190514056 + }, + { + "epoch": 2.201700381119906, + "grad_norm": 0.22579771789283223, + "learning_rate": 0.000495081609117866, + "loss": 3.1345467567443848, + "step": 3756, + "token_acc": 0.28268931562906285 + }, + { + "epoch": 2.202286719437115, + "grad_norm": 0.2987120592109529, + "learning_rate": 0.0004950768253326001, + "loss": 3.1591715812683105, + "step": 3757, + "token_acc": 0.2819137971839158 + }, + { + "epoch": 2.2028730577543243, + "grad_norm": 0.2931091589073615, + "learning_rate": 0.0004950720392451676, + "loss": 3.1425862312316895, + "step": 3758, + "token_acc": 0.28395236386970296 + }, + { + "epoch": 2.2034593960715334, + "grad_norm": 0.2873903446101065, + "learning_rate": 0.0004950672508556136, + "loss": 3.1266496181488037, + "step": 3759, + "token_acc": 0.2871510073289814 + }, + { + "epoch": 2.2040457343887425, + "grad_norm": 0.2556194886323477, + "learning_rate": 0.000495062460163983, + "loss": 3.150421619415283, + "step": 3760, + "token_acc": 0.2848755601144523 + }, + { + "epoch": 2.204632072705951, + "grad_norm": 0.2573105231027643, + "learning_rate": 0.0004950576671703208, + "loss": 3.1235156059265137, + "step": 3761, + "token_acc": 0.28695186306287 + }, + { + "epoch": 2.2052184110231603, + "grad_norm": 0.3325154613377441, + "learning_rate": 0.0004950528718746719, + "loss": 3.1386680603027344, + "step": 3762, + "token_acc": 0.28505533811140993 + }, + { + "epoch": 2.2058047493403694, + "grad_norm": 0.35777715201107346, + "learning_rate": 0.0004950480742770817, + "loss": 3.1578664779663086, + "step": 3763, + "token_acc": 0.28491880923691637 + }, + { + "epoch": 2.2063910876575785, + "grad_norm": 0.2747822349005734, + "learning_rate": 0.000495043274377595, + "loss": 3.1813435554504395, + "step": 3764, + "token_acc": 0.2799403441643737 + }, + { + "epoch": 2.2069774259747876, + "grad_norm": 0.3127730937258205, + "learning_rate": 0.0004950384721762568, + "loss": 3.154592990875244, + "step": 3765, + "token_acc": 0.2825758537678767 + }, + { + "epoch": 2.2075637642919963, + "grad_norm": 0.3319689130252632, + "learning_rate": 0.0004950336676731124, + "loss": 3.1862101554870605, + "step": 3766, + "token_acc": 0.27914731515246904 + }, + { + "epoch": 2.2081501026092054, + "grad_norm": 0.29719171155977253, + "learning_rate": 0.000495028860868207, + "loss": 3.1682379245758057, + "step": 3767, + "token_acc": 0.2820816101017578 + }, + { + "epoch": 2.2087364409264145, + "grad_norm": 0.2965916989692894, + "learning_rate": 0.0004950240517615855, + "loss": 3.130030870437622, + "step": 3768, + "token_acc": 0.28576881885756095 + }, + { + "epoch": 2.2093227792436236, + "grad_norm": 0.32011024686620565, + "learning_rate": 0.0004950192403532932, + "loss": 3.1410634517669678, + "step": 3769, + "token_acc": 0.2871836168846393 + }, + { + "epoch": 2.2099091175608327, + "grad_norm": 0.238341190935579, + "learning_rate": 0.0004950144266433755, + "loss": 3.1870779991149902, + "step": 3770, + "token_acc": 0.2777431731458751 + }, + { + "epoch": 2.210495455878042, + "grad_norm": 0.30514750750831315, + "learning_rate": 0.0004950096106318772, + "loss": 3.135143280029297, + "step": 3771, + "token_acc": 0.2846700965743775 + }, + { + "epoch": 2.2110817941952505, + "grad_norm": 0.29170896907474464, + "learning_rate": 0.0004950047923188438, + "loss": 3.1798901557922363, + "step": 3772, + "token_acc": 0.2794307856689527 + }, + { + "epoch": 2.2116681325124596, + "grad_norm": 0.3894135930131722, + "learning_rate": 0.0004949999717043206, + "loss": 3.166572093963623, + "step": 3773, + "token_acc": 0.28148247559951894 + }, + { + "epoch": 2.2122544708296688, + "grad_norm": 0.2508310465549407, + "learning_rate": 0.0004949951487883528, + "loss": 3.1638011932373047, + "step": 3774, + "token_acc": 0.28385914108473587 + }, + { + "epoch": 2.212840809146878, + "grad_norm": 0.32891670263257017, + "learning_rate": 0.0004949903235709857, + "loss": 3.165375232696533, + "step": 3775, + "token_acc": 0.2803039543653214 + }, + { + "epoch": 2.213427147464087, + "grad_norm": 0.3196513226655616, + "learning_rate": 0.0004949854960522646, + "loss": 3.1705684661865234, + "step": 3776, + "token_acc": 0.28022365231667556 + }, + { + "epoch": 2.2140134857812956, + "grad_norm": 0.3013933225382682, + "learning_rate": 0.000494980666232235, + "loss": 3.181222438812256, + "step": 3777, + "token_acc": 0.2772660406845861 + }, + { + "epoch": 2.2145998240985048, + "grad_norm": 0.2786811800951262, + "learning_rate": 0.0004949758341109419, + "loss": 3.144491195678711, + "step": 3778, + "token_acc": 0.28403495924040434 + }, + { + "epoch": 2.215186162415714, + "grad_norm": 0.3080345412474166, + "learning_rate": 0.0004949709996884312, + "loss": 3.1757869720458984, + "step": 3779, + "token_acc": 0.2803350834257927 + }, + { + "epoch": 2.215772500732923, + "grad_norm": 0.3219952986332383, + "learning_rate": 0.0004949661629647479, + "loss": 3.1939444541931152, + "step": 3780, + "token_acc": 0.27802640067998646 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.2575174889105713, + "learning_rate": 0.0004949613239399377, + "loss": 3.1882576942443848, + "step": 3781, + "token_acc": 0.2771468319210826 + }, + { + "epoch": 2.2169451773673408, + "grad_norm": 0.24402519247689416, + "learning_rate": 0.000494956482614046, + "loss": 3.1231582164764404, + "step": 3782, + "token_acc": 0.2846397557753236 + }, + { + "epoch": 2.21753151568455, + "grad_norm": 0.2711246530587684, + "learning_rate": 0.0004949516389871182, + "loss": 3.1705546379089355, + "step": 3783, + "token_acc": 0.2810583030764805 + }, + { + "epoch": 2.218117854001759, + "grad_norm": 0.30673550535045896, + "learning_rate": 0.0004949467930591997, + "loss": 3.1144230365753174, + "step": 3784, + "token_acc": 0.2877619636141344 + }, + { + "epoch": 2.218704192318968, + "grad_norm": 0.3297915595325778, + "learning_rate": 0.0004949419448303362, + "loss": 3.1422719955444336, + "step": 3785, + "token_acc": 0.284815349437298 + }, + { + "epoch": 2.219290530636177, + "grad_norm": 0.28322083433960177, + "learning_rate": 0.0004949370943005732, + "loss": 3.190800666809082, + "step": 3786, + "token_acc": 0.2772277763644335 + }, + { + "epoch": 2.219876868953386, + "grad_norm": 0.2957989959323077, + "learning_rate": 0.0004949322414699563, + "loss": 3.190504550933838, + "step": 3787, + "token_acc": 0.27512100025146274 + }, + { + "epoch": 2.220463207270595, + "grad_norm": 0.3307470194735579, + "learning_rate": 0.0004949273863385311, + "loss": 3.148383140563965, + "step": 3788, + "token_acc": 0.2822902657721752 + }, + { + "epoch": 2.221049545587804, + "grad_norm": 0.3294250518813636, + "learning_rate": 0.0004949225289063431, + "loss": 3.104329824447632, + "step": 3789, + "token_acc": 0.2886879604307604 + }, + { + "epoch": 2.221635883905013, + "grad_norm": 0.32018364012831674, + "learning_rate": 0.0004949176691734378, + "loss": 3.158704996109009, + "step": 3790, + "token_acc": 0.2821065649128231 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.30308458534249355, + "learning_rate": 0.0004949128071398612, + "loss": 3.1528539657592773, + "step": 3791, + "token_acc": 0.2832621111066736 + }, + { + "epoch": 2.2228085605394314, + "grad_norm": 0.30787586574331965, + "learning_rate": 0.0004949079428056588, + "loss": 3.15920090675354, + "step": 3792, + "token_acc": 0.28139433598055713 + }, + { + "epoch": 2.22339489885664, + "grad_norm": 0.31897223916472, + "learning_rate": 0.0004949030761708762, + "loss": 3.1382431983947754, + "step": 3793, + "token_acc": 0.28397484638622306 + }, + { + "epoch": 2.223981237173849, + "grad_norm": 0.33273101685313655, + "learning_rate": 0.0004948982072355594, + "loss": 3.130331039428711, + "step": 3794, + "token_acc": 0.2857558801264168 + }, + { + "epoch": 2.2245675754910583, + "grad_norm": 0.295520714532924, + "learning_rate": 0.0004948933359997538, + "loss": 3.170107126235962, + "step": 3795, + "token_acc": 0.28111115423928373 + }, + { + "epoch": 2.2251539138082674, + "grad_norm": 0.2651887698003276, + "learning_rate": 0.0004948884624635053, + "loss": 3.1857874393463135, + "step": 3796, + "token_acc": 0.27829263553338945 + }, + { + "epoch": 2.2257402521254765, + "grad_norm": 0.30419949767929005, + "learning_rate": 0.0004948835866268597, + "loss": 3.1911942958831787, + "step": 3797, + "token_acc": 0.2767834910358566 + }, + { + "epoch": 2.226326590442685, + "grad_norm": 0.31511868217198036, + "learning_rate": 0.0004948787084898628, + "loss": 3.2010602951049805, + "step": 3798, + "token_acc": 0.27473809215241835 + }, + { + "epoch": 2.2269129287598943, + "grad_norm": 0.3516995385246629, + "learning_rate": 0.0004948738280525604, + "loss": 3.1852400302886963, + "step": 3799, + "token_acc": 0.27958682662467 + }, + { + "epoch": 2.2274992670771034, + "grad_norm": 0.3029946543777094, + "learning_rate": 0.0004948689453149985, + "loss": 3.150453567504883, + "step": 3800, + "token_acc": 0.2820547065622191 + }, + { + "epoch": 2.2280856053943126, + "grad_norm": 0.2661075877364138, + "learning_rate": 0.0004948640602772227, + "loss": 3.1197733879089355, + "step": 3801, + "token_acc": 0.2877233869167804 + }, + { + "epoch": 2.2286719437115217, + "grad_norm": 0.2912803128699928, + "learning_rate": 0.0004948591729392789, + "loss": 3.180408239364624, + "step": 3802, + "token_acc": 0.2805510714276519 + }, + { + "epoch": 2.2292582820287308, + "grad_norm": 0.2763089953111624, + "learning_rate": 0.0004948542833012133, + "loss": 3.1445374488830566, + "step": 3803, + "token_acc": 0.28318792121623737 + }, + { + "epoch": 2.2298446203459394, + "grad_norm": 0.2769176304409203, + "learning_rate": 0.0004948493913630716, + "loss": 3.1564347743988037, + "step": 3804, + "token_acc": 0.2825479696685881 + }, + { + "epoch": 2.2304309586631486, + "grad_norm": 0.29077217594520294, + "learning_rate": 0.0004948444971248998, + "loss": 3.1660101413726807, + "step": 3805, + "token_acc": 0.2820621479427341 + }, + { + "epoch": 2.2310172969803577, + "grad_norm": 0.28650401284829974, + "learning_rate": 0.0004948396005867438, + "loss": 3.139133930206299, + "step": 3806, + "token_acc": 0.2851104624465277 + }, + { + "epoch": 2.231603635297567, + "grad_norm": 0.287620510266524, + "learning_rate": 0.0004948347017486498, + "loss": 3.2195794582366943, + "step": 3807, + "token_acc": 0.27286939207727834 + }, + { + "epoch": 2.232189973614776, + "grad_norm": 0.3120364238806162, + "learning_rate": 0.0004948298006106636, + "loss": 3.152463436126709, + "step": 3808, + "token_acc": 0.2814797593792683 + }, + { + "epoch": 2.2327763119319846, + "grad_norm": 0.2869231333652891, + "learning_rate": 0.0004948248971728314, + "loss": 3.156865119934082, + "step": 3809, + "token_acc": 0.2811637444835455 + }, + { + "epoch": 2.2333626502491937, + "grad_norm": 0.32741760050133945, + "learning_rate": 0.0004948199914351992, + "loss": 3.1683101654052734, + "step": 3810, + "token_acc": 0.2795143543562488 + }, + { + "epoch": 2.233948988566403, + "grad_norm": 0.30557749016069713, + "learning_rate": 0.0004948150833978131, + "loss": 3.139017105102539, + "step": 3811, + "token_acc": 0.28411266508745175 + }, + { + "epoch": 2.234535326883612, + "grad_norm": 0.3635901217009716, + "learning_rate": 0.0004948101730607192, + "loss": 3.132762908935547, + "step": 3812, + "token_acc": 0.28495207236398 + }, + { + "epoch": 2.235121665200821, + "grad_norm": 0.32801564599133337, + "learning_rate": 0.0004948052604239635, + "loss": 3.200976848602295, + "step": 3813, + "token_acc": 0.2769495767163072 + }, + { + "epoch": 2.23570800351803, + "grad_norm": 0.27834001136129105, + "learning_rate": 0.0004948003454875923, + "loss": 3.146116256713867, + "step": 3814, + "token_acc": 0.28222398484731925 + }, + { + "epoch": 2.236294341835239, + "grad_norm": 0.27723891837271386, + "learning_rate": 0.0004947954282516518, + "loss": 3.1998884677886963, + "step": 3815, + "token_acc": 0.2753354153846946 + }, + { + "epoch": 2.236880680152448, + "grad_norm": 0.27049286040890863, + "learning_rate": 0.000494790508716188, + "loss": 3.1218719482421875, + "step": 3816, + "token_acc": 0.2863773455879726 + }, + { + "epoch": 2.237467018469657, + "grad_norm": 0.32656509507300274, + "learning_rate": 0.0004947855868812473, + "loss": 3.1241960525512695, + "step": 3817, + "token_acc": 0.2882601668067499 + }, + { + "epoch": 2.238053356786866, + "grad_norm": 0.2577958142509023, + "learning_rate": 0.0004947806627468758, + "loss": 3.1178030967712402, + "step": 3818, + "token_acc": 0.2889268548727396 + }, + { + "epoch": 2.2386396951040752, + "grad_norm": 0.2609082362846041, + "learning_rate": 0.00049477573631312, + "loss": 3.134734869003296, + "step": 3819, + "token_acc": 0.28481944117107916 + }, + { + "epoch": 2.239226033421284, + "grad_norm": 0.26476610154970054, + "learning_rate": 0.0004947708075800258, + "loss": 3.130898952484131, + "step": 3820, + "token_acc": 0.2836286983606026 + }, + { + "epoch": 2.239812371738493, + "grad_norm": 0.2662261603829151, + "learning_rate": 0.0004947658765476398, + "loss": 3.1527795791625977, + "step": 3821, + "token_acc": 0.28350422784107837 + }, + { + "epoch": 2.240398710055702, + "grad_norm": 0.32881885122498833, + "learning_rate": 0.0004947609432160081, + "loss": 3.168898105621338, + "step": 3822, + "token_acc": 0.2811476494087145 + }, + { + "epoch": 2.2409850483729112, + "grad_norm": 0.30984157312918303, + "learning_rate": 0.0004947560075851773, + "loss": 3.17402720451355, + "step": 3823, + "token_acc": 0.279500130510714 + }, + { + "epoch": 2.2415713866901203, + "grad_norm": 0.3271752951652869, + "learning_rate": 0.0004947510696551936, + "loss": 3.1329240798950195, + "step": 3824, + "token_acc": 0.2869338824514151 + }, + { + "epoch": 2.2421577250073295, + "grad_norm": 0.301190421867787, + "learning_rate": 0.0004947461294261033, + "loss": 3.1421704292297363, + "step": 3825, + "token_acc": 0.2827037430246484 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.30324831224476184, + "learning_rate": 0.000494741186897953, + "loss": 3.184340000152588, + "step": 3826, + "token_acc": 0.2779550392982059 + }, + { + "epoch": 2.2433304016417472, + "grad_norm": 0.3393659459661467, + "learning_rate": 0.000494736242070789, + "loss": 3.1682004928588867, + "step": 3827, + "token_acc": 0.27882895386133727 + }, + { + "epoch": 2.2439167399589564, + "grad_norm": 0.2959570734691476, + "learning_rate": 0.0004947312949446579, + "loss": 3.180204153060913, + "step": 3828, + "token_acc": 0.2791224933826688 + }, + { + "epoch": 2.2445030782761655, + "grad_norm": 0.25907055474663665, + "learning_rate": 0.0004947263455196059, + "loss": 3.1089446544647217, + "step": 3829, + "token_acc": 0.28720609282355347 + }, + { + "epoch": 2.2450894165933746, + "grad_norm": 0.3035475988054403, + "learning_rate": 0.0004947213937956798, + "loss": 3.1567494869232178, + "step": 3830, + "token_acc": 0.2818376734139914 + }, + { + "epoch": 2.2456757549105832, + "grad_norm": 0.32620891546640984, + "learning_rate": 0.0004947164397729259, + "loss": 3.1643857955932617, + "step": 3831, + "token_acc": 0.2801917537693327 + }, + { + "epoch": 2.2462620932277924, + "grad_norm": 0.27437277648272124, + "learning_rate": 0.0004947114834513908, + "loss": 3.1790771484375, + "step": 3832, + "token_acc": 0.2784870937813759 + }, + { + "epoch": 2.2468484315450015, + "grad_norm": 0.26321972510939834, + "learning_rate": 0.0004947065248311211, + "loss": 3.1478734016418457, + "step": 3833, + "token_acc": 0.2808062311270709 + }, + { + "epoch": 2.2474347698622106, + "grad_norm": 0.3321874451269814, + "learning_rate": 0.0004947015639121632, + "loss": 3.148843765258789, + "step": 3834, + "token_acc": 0.28269119657097186 + }, + { + "epoch": 2.2480211081794197, + "grad_norm": 0.3700346831447425, + "learning_rate": 0.0004946966006945641, + "loss": 3.1746716499328613, + "step": 3835, + "token_acc": 0.2799283009747845 + }, + { + "epoch": 2.2486074464966284, + "grad_norm": 0.30603689291046354, + "learning_rate": 0.00049469163517837, + "loss": 3.1255345344543457, + "step": 3836, + "token_acc": 0.2858172304944818 + }, + { + "epoch": 2.2491937848138375, + "grad_norm": 0.2685102091455314, + "learning_rate": 0.0004946866673636277, + "loss": 3.1777853965759277, + "step": 3837, + "token_acc": 0.2796378571059565 + }, + { + "epoch": 2.2497801231310466, + "grad_norm": 0.2869237613209994, + "learning_rate": 0.000494681697250384, + "loss": 3.1614084243774414, + "step": 3838, + "token_acc": 0.28344745452096826 + }, + { + "epoch": 2.2503664614482557, + "grad_norm": 0.32234525350389726, + "learning_rate": 0.0004946767248386854, + "loss": 3.138523817062378, + "step": 3839, + "token_acc": 0.2848541857695429 + }, + { + "epoch": 2.250952799765465, + "grad_norm": 0.32817198927358937, + "learning_rate": 0.0004946717501285786, + "loss": 3.152355432510376, + "step": 3840, + "token_acc": 0.2811997897553041 + }, + { + "epoch": 2.2515391380826735, + "grad_norm": 0.3259947974176206, + "learning_rate": 0.0004946667731201105, + "loss": 3.1380293369293213, + "step": 3841, + "token_acc": 0.2835224906021321 + }, + { + "epoch": 2.2521254763998826, + "grad_norm": 0.3135241874373357, + "learning_rate": 0.0004946617938133278, + "loss": 3.1739583015441895, + "step": 3842, + "token_acc": 0.28144295639137257 + }, + { + "epoch": 2.2527118147170917, + "grad_norm": 0.24817831952163097, + "learning_rate": 0.0004946568122082772, + "loss": 3.167630434036255, + "step": 3843, + "token_acc": 0.2812488916137597 + }, + { + "epoch": 2.253298153034301, + "grad_norm": 0.2731649608743652, + "learning_rate": 0.0004946518283050055, + "loss": 3.135435104370117, + "step": 3844, + "token_acc": 0.2855770869154391 + }, + { + "epoch": 2.25388449135151, + "grad_norm": 0.27231846322880665, + "learning_rate": 0.0004946468421035596, + "loss": 3.149603843688965, + "step": 3845, + "token_acc": 0.2828323462390931 + }, + { + "epoch": 2.254470829668719, + "grad_norm": 0.24587328343055956, + "learning_rate": 0.0004946418536039862, + "loss": 3.1687400341033936, + "step": 3846, + "token_acc": 0.2800664128357154 + }, + { + "epoch": 2.2550571679859277, + "grad_norm": 0.31737504074933487, + "learning_rate": 0.0004946368628063323, + "loss": 3.164569139480591, + "step": 3847, + "token_acc": 0.282083941642086 + }, + { + "epoch": 2.255643506303137, + "grad_norm": 0.3214138382296742, + "learning_rate": 0.0004946318697106447, + "loss": 3.145965576171875, + "step": 3848, + "token_acc": 0.2832863696200631 + }, + { + "epoch": 2.256229844620346, + "grad_norm": 0.2359192855100126, + "learning_rate": 0.0004946268743169702, + "loss": 3.1189632415771484, + "step": 3849, + "token_acc": 0.287117309876641 + }, + { + "epoch": 2.256816182937555, + "grad_norm": 0.2720548980320122, + "learning_rate": 0.0004946218766253561, + "loss": 3.1839418411254883, + "step": 3850, + "token_acc": 0.27657987172446663 + }, + { + "epoch": 2.257402521254764, + "grad_norm": 0.30545373074796633, + "learning_rate": 0.000494616876635849, + "loss": 3.1674296855926514, + "step": 3851, + "token_acc": 0.28053701199932396 + }, + { + "epoch": 2.257988859571973, + "grad_norm": 0.24998909046002024, + "learning_rate": 0.000494611874348496, + "loss": 3.124718189239502, + "step": 3852, + "token_acc": 0.28661570616194193 + }, + { + "epoch": 2.258575197889182, + "grad_norm": 0.26905560843251397, + "learning_rate": 0.000494606869763344, + "loss": 3.172092914581299, + "step": 3853, + "token_acc": 0.2808318259400904 + }, + { + "epoch": 2.259161536206391, + "grad_norm": 0.2560863257823114, + "learning_rate": 0.0004946018628804401, + "loss": 3.1784820556640625, + "step": 3854, + "token_acc": 0.28063623170465113 + }, + { + "epoch": 2.2597478745236, + "grad_norm": 0.35376361229022935, + "learning_rate": 0.0004945968536998312, + "loss": 3.154937267303467, + "step": 3855, + "token_acc": 0.2819474617227426 + }, + { + "epoch": 2.2603342128408093, + "grad_norm": 0.3224194780636899, + "learning_rate": 0.0004945918422215646, + "loss": 3.131925582885742, + "step": 3856, + "token_acc": 0.28493756685426175 + }, + { + "epoch": 2.2609205511580184, + "grad_norm": 0.23735956048188633, + "learning_rate": 0.0004945868284456873, + "loss": 3.1655850410461426, + "step": 3857, + "token_acc": 0.2809684801292273 + }, + { + "epoch": 2.261506889475227, + "grad_norm": 0.2724830779948872, + "learning_rate": 0.0004945818123722461, + "loss": 3.173205852508545, + "step": 3858, + "token_acc": 0.2785379039998791 + }, + { + "epoch": 2.262093227792436, + "grad_norm": 0.2893665611426745, + "learning_rate": 0.0004945767940012885, + "loss": 3.159245491027832, + "step": 3859, + "token_acc": 0.28136490136133696 + }, + { + "epoch": 2.2626795661096453, + "grad_norm": 0.3294490501456755, + "learning_rate": 0.0004945717733328614, + "loss": 3.147303581237793, + "step": 3860, + "token_acc": 0.2836458483646393 + }, + { + "epoch": 2.2632659044268544, + "grad_norm": 0.39727270008228005, + "learning_rate": 0.0004945667503670121, + "loss": 3.1702921390533447, + "step": 3861, + "token_acc": 0.2812443292287751 + }, + { + "epoch": 2.2638522427440635, + "grad_norm": 0.38444282689039877, + "learning_rate": 0.0004945617251037878, + "loss": 3.135335922241211, + "step": 3862, + "token_acc": 0.2856051619631102 + }, + { + "epoch": 2.264438581061272, + "grad_norm": 0.34753773880833516, + "learning_rate": 0.0004945566975432356, + "loss": 3.0984320640563965, + "step": 3863, + "token_acc": 0.2902048117001032 + }, + { + "epoch": 2.2650249193784813, + "grad_norm": 0.37794340169581914, + "learning_rate": 0.0004945516676854028, + "loss": 3.177751064300537, + "step": 3864, + "token_acc": 0.2815546906203256 + }, + { + "epoch": 2.2656112576956904, + "grad_norm": 0.28168794103502537, + "learning_rate": 0.0004945466355303365, + "loss": 3.200343132019043, + "step": 3865, + "token_acc": 0.2759573944753981 + }, + { + "epoch": 2.2661975960128995, + "grad_norm": 0.3234174212103772, + "learning_rate": 0.0004945416010780843, + "loss": 3.1428143978118896, + "step": 3866, + "token_acc": 0.28260335362532524 + }, + { + "epoch": 2.2667839343301086, + "grad_norm": 0.3220769757146058, + "learning_rate": 0.0004945365643286931, + "loss": 3.149648666381836, + "step": 3867, + "token_acc": 0.2816632178437802 + }, + { + "epoch": 2.2673702726473177, + "grad_norm": 0.3302134003482676, + "learning_rate": 0.0004945315252822105, + "loss": 3.1192336082458496, + "step": 3868, + "token_acc": 0.2872133706252684 + }, + { + "epoch": 2.2679566109645264, + "grad_norm": 0.2943345383392368, + "learning_rate": 0.0004945264839386836, + "loss": 3.1715235710144043, + "step": 3869, + "token_acc": 0.2803244281783676 + }, + { + "epoch": 2.2685429492817355, + "grad_norm": 0.2508000130858046, + "learning_rate": 0.00049452144029816, + "loss": 3.143495559692383, + "step": 3870, + "token_acc": 0.2849174902779344 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.26109236874790925, + "learning_rate": 0.000494516394360687, + "loss": 3.190122127532959, + "step": 3871, + "token_acc": 0.27819389596464156 + }, + { + "epoch": 2.2697156259161537, + "grad_norm": 0.33903049639162236, + "learning_rate": 0.0004945113461263119, + "loss": 3.11405086517334, + "step": 3872, + "token_acc": 0.28989891438427484 + }, + { + "epoch": 2.270301964233363, + "grad_norm": 0.3211160059253596, + "learning_rate": 0.0004945062955950821, + "loss": 3.1626482009887695, + "step": 3873, + "token_acc": 0.282796800652315 + }, + { + "epoch": 2.2708883025505715, + "grad_norm": 0.25234664979847354, + "learning_rate": 0.0004945012427670452, + "loss": 3.194135904312134, + "step": 3874, + "token_acc": 0.27816759517645046 + }, + { + "epoch": 2.2714746408677806, + "grad_norm": 0.287554603096279, + "learning_rate": 0.0004944961876422487, + "loss": 3.1574296951293945, + "step": 3875, + "token_acc": 0.2823687780015619 + }, + { + "epoch": 2.2720609791849897, + "grad_norm": 0.29182285292723903, + "learning_rate": 0.0004944911302207398, + "loss": 3.1222705841064453, + "step": 3876, + "token_acc": 0.2878861022958396 + }, + { + "epoch": 2.272647317502199, + "grad_norm": 0.2527807847386134, + "learning_rate": 0.0004944860705025664, + "loss": 3.172006607055664, + "step": 3877, + "token_acc": 0.2799828502384077 + }, + { + "epoch": 2.273233655819408, + "grad_norm": 0.31413541585309224, + "learning_rate": 0.0004944810084877757, + "loss": 3.193477153778076, + "step": 3878, + "token_acc": 0.2772202146510566 + }, + { + "epoch": 2.273819994136617, + "grad_norm": 0.2704083612356819, + "learning_rate": 0.0004944759441764153, + "loss": 3.1231212615966797, + "step": 3879, + "token_acc": 0.2874028166923348 + }, + { + "epoch": 2.2744063324538257, + "grad_norm": 0.31178899912445196, + "learning_rate": 0.0004944708775685329, + "loss": 3.1877129077911377, + "step": 3880, + "token_acc": 0.27901131884424835 + }, + { + "epoch": 2.274992670771035, + "grad_norm": 0.3596351741078318, + "learning_rate": 0.000494465808664176, + "loss": 3.1519899368286133, + "step": 3881, + "token_acc": 0.28307210828965446 + }, + { + "epoch": 2.275579009088244, + "grad_norm": 0.3020333907146802, + "learning_rate": 0.0004944607374633923, + "loss": 3.1304898262023926, + "step": 3882, + "token_acc": 0.284429292808041 + }, + { + "epoch": 2.276165347405453, + "grad_norm": 0.2726401919605312, + "learning_rate": 0.0004944556639662294, + "loss": 3.165764808654785, + "step": 3883, + "token_acc": 0.2802883975409057 + }, + { + "epoch": 2.2767516857226617, + "grad_norm": 0.32285574943583706, + "learning_rate": 0.0004944505881727349, + "loss": 3.1476478576660156, + "step": 3884, + "token_acc": 0.2816021967212252 + }, + { + "epoch": 2.277338024039871, + "grad_norm": 0.2823843503222295, + "learning_rate": 0.0004944455100829565, + "loss": 3.1601884365081787, + "step": 3885, + "token_acc": 0.28280859065821695 + }, + { + "epoch": 2.27792436235708, + "grad_norm": 0.27956131460385075, + "learning_rate": 0.000494440429696942, + "loss": 3.1895575523376465, + "step": 3886, + "token_acc": 0.27752539634186724 + }, + { + "epoch": 2.278510700674289, + "grad_norm": 0.26286987520841, + "learning_rate": 0.000494435347014739, + "loss": 3.1537184715270996, + "step": 3887, + "token_acc": 0.2815025933343604 + }, + { + "epoch": 2.279097038991498, + "grad_norm": 0.29758746109419504, + "learning_rate": 0.0004944302620363953, + "loss": 3.195587158203125, + "step": 3888, + "token_acc": 0.27922072657599106 + }, + { + "epoch": 2.2796833773087073, + "grad_norm": 0.30684174486894483, + "learning_rate": 0.0004944251747619586, + "loss": 3.1939287185668945, + "step": 3889, + "token_acc": 0.27768207532745526 + }, + { + "epoch": 2.280269715625916, + "grad_norm": 0.27791092948529533, + "learning_rate": 0.000494420085191477, + "loss": 3.153632164001465, + "step": 3890, + "token_acc": 0.28336240556026593 + }, + { + "epoch": 2.280856053943125, + "grad_norm": 0.2649049416220319, + "learning_rate": 0.0004944149933249979, + "loss": 3.0951740741729736, + "step": 3891, + "token_acc": 0.2902582883967504 + }, + { + "epoch": 2.281442392260334, + "grad_norm": 0.30366376849256915, + "learning_rate": 0.0004944098991625692, + "loss": 3.2140722274780273, + "step": 3892, + "token_acc": 0.2759592201140853 + }, + { + "epoch": 2.2820287305775433, + "grad_norm": 0.3373327820170129, + "learning_rate": 0.000494404802704239, + "loss": 3.178417682647705, + "step": 3893, + "token_acc": 0.278223548179036 + }, + { + "epoch": 2.2826150688947524, + "grad_norm": 0.3202448688341465, + "learning_rate": 0.0004943997039500549, + "loss": 3.155106782913208, + "step": 3894, + "token_acc": 0.2827261964451188 + }, + { + "epoch": 2.283201407211961, + "grad_norm": 0.2980731572464724, + "learning_rate": 0.0004943946029000648, + "loss": 3.144254446029663, + "step": 3895, + "token_acc": 0.2838209328226442 + }, + { + "epoch": 2.28378774552917, + "grad_norm": 0.2841218228206637, + "learning_rate": 0.000494389499554317, + "loss": 3.1533799171447754, + "step": 3896, + "token_acc": 0.28254233148811464 + }, + { + "epoch": 2.2843740838463793, + "grad_norm": 0.2808923935230558, + "learning_rate": 0.0004943843939128591, + "loss": 3.175994634628296, + "step": 3897, + "token_acc": 0.2805128261394308 + }, + { + "epoch": 2.2849604221635884, + "grad_norm": 0.30017912466260666, + "learning_rate": 0.000494379285975739, + "loss": 3.1532750129699707, + "step": 3898, + "token_acc": 0.28463766165536125 + }, + { + "epoch": 2.2855467604807975, + "grad_norm": 0.3369751939969032, + "learning_rate": 0.0004943741757430049, + "loss": 3.189305305480957, + "step": 3899, + "token_acc": 0.27964515473691337 + }, + { + "epoch": 2.2861330987980066, + "grad_norm": 0.2993892380303333, + "learning_rate": 0.0004943690632147048, + "loss": 3.165214776992798, + "step": 3900, + "token_acc": 0.2805131286300056 + }, + { + "epoch": 2.2867194371152153, + "grad_norm": 0.29396733129894437, + "learning_rate": 0.0004943639483908865, + "loss": 3.17478609085083, + "step": 3901, + "token_acc": 0.2781765473567104 + }, + { + "epoch": 2.2873057754324244, + "grad_norm": 0.2785000078066771, + "learning_rate": 0.0004943588312715983, + "loss": 3.166280746459961, + "step": 3902, + "token_acc": 0.28039920444023164 + }, + { + "epoch": 2.2878921137496335, + "grad_norm": 0.31709409298963454, + "learning_rate": 0.0004943537118568881, + "loss": 3.1413936614990234, + "step": 3903, + "token_acc": 0.2841890440386681 + }, + { + "epoch": 2.2884784520668426, + "grad_norm": 0.27014675030248797, + "learning_rate": 0.0004943485901468041, + "loss": 3.114736557006836, + "step": 3904, + "token_acc": 0.28979685707341424 + }, + { + "epoch": 2.2890647903840518, + "grad_norm": 0.28364520835474033, + "learning_rate": 0.0004943434661413942, + "loss": 3.1973624229431152, + "step": 3905, + "token_acc": 0.2774534822181816 + }, + { + "epoch": 2.2896511287012604, + "grad_norm": 0.2808540960665262, + "learning_rate": 0.0004943383398407069, + "loss": 3.1188249588012695, + "step": 3906, + "token_acc": 0.28633595768237174 + }, + { + "epoch": 2.2902374670184695, + "grad_norm": 0.26979949470625997, + "learning_rate": 0.00049433321124479, + "loss": 3.144449472427368, + "step": 3907, + "token_acc": 0.2825334465642254 + }, + { + "epoch": 2.2908238053356786, + "grad_norm": 0.3317182504829572, + "learning_rate": 0.000494328080353692, + "loss": 3.1412973403930664, + "step": 3908, + "token_acc": 0.2829566111809845 + }, + { + "epoch": 2.2914101436528878, + "grad_norm": 0.3462209536885495, + "learning_rate": 0.0004943229471674607, + "loss": 3.1163129806518555, + "step": 3909, + "token_acc": 0.2878311784654515 + }, + { + "epoch": 2.291996481970097, + "grad_norm": 0.25013331055383425, + "learning_rate": 0.0004943178116861446, + "loss": 3.1724843978881836, + "step": 3910, + "token_acc": 0.27883928084070003 + }, + { + "epoch": 2.292582820287306, + "grad_norm": 0.29640242868774136, + "learning_rate": 0.0004943126739097919, + "loss": 3.156826972961426, + "step": 3911, + "token_acc": 0.2807963805492542 + }, + { + "epoch": 2.2931691586045146, + "grad_norm": 0.32890770083561716, + "learning_rate": 0.0004943075338384509, + "loss": 3.1356372833251953, + "step": 3912, + "token_acc": 0.2840835215965049 + }, + { + "epoch": 2.2937554969217238, + "grad_norm": 0.2751362602346232, + "learning_rate": 0.0004943023914721699, + "loss": 3.143984317779541, + "step": 3913, + "token_acc": 0.2828619868208649 + }, + { + "epoch": 2.294341835238933, + "grad_norm": 0.266215350163477, + "learning_rate": 0.000494297246810997, + "loss": 3.1524040699005127, + "step": 3914, + "token_acc": 0.28338190755240433 + }, + { + "epoch": 2.294928173556142, + "grad_norm": 0.32392491080374525, + "learning_rate": 0.0004942920998549807, + "loss": 3.1548280715942383, + "step": 3915, + "token_acc": 0.28271467636630077 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.3186682447488782, + "learning_rate": 0.0004942869506041693, + "loss": 3.1278862953186035, + "step": 3916, + "token_acc": 0.2848551425288171 + }, + { + "epoch": 2.2961008501905598, + "grad_norm": 0.31465866970918616, + "learning_rate": 0.0004942817990586111, + "loss": 3.176079750061035, + "step": 3917, + "token_acc": 0.2792354460726708 + }, + { + "epoch": 2.296687188507769, + "grad_norm": 0.32176942882249593, + "learning_rate": 0.0004942766452183547, + "loss": 3.1912670135498047, + "step": 3918, + "token_acc": 0.2777089680498699 + }, + { + "epoch": 2.297273526824978, + "grad_norm": 0.3656040823068835, + "learning_rate": 0.0004942714890834483, + "loss": 3.1739344596862793, + "step": 3919, + "token_acc": 0.2780263498361221 + }, + { + "epoch": 2.297859865142187, + "grad_norm": 0.32566068012364663, + "learning_rate": 0.0004942663306539405, + "loss": 3.1790390014648438, + "step": 3920, + "token_acc": 0.279256636987316 + }, + { + "epoch": 2.298446203459396, + "grad_norm": 0.2769985156626455, + "learning_rate": 0.0004942611699298796, + "loss": 3.1757187843322754, + "step": 3921, + "token_acc": 0.2820224541596015 + }, + { + "epoch": 2.2990325417766053, + "grad_norm": 0.3052832209277952, + "learning_rate": 0.0004942560069113142, + "loss": 3.1661975383758545, + "step": 3922, + "token_acc": 0.28091679869824004 + }, + { + "epoch": 2.299618880093814, + "grad_norm": 0.3112915127147975, + "learning_rate": 0.0004942508415982928, + "loss": 3.156914710998535, + "step": 3923, + "token_acc": 0.2826732231805037 + }, + { + "epoch": 2.300205218411023, + "grad_norm": 0.37493133747537133, + "learning_rate": 0.0004942456739908637, + "loss": 3.17193603515625, + "step": 3924, + "token_acc": 0.27917988430677365 + }, + { + "epoch": 2.300791556728232, + "grad_norm": 0.30009062242596574, + "learning_rate": 0.0004942405040890758, + "loss": 3.1218883991241455, + "step": 3925, + "token_acc": 0.2854247557841292 + }, + { + "epoch": 2.3013778950454413, + "grad_norm": 0.2692483083230852, + "learning_rate": 0.0004942353318929774, + "loss": 3.162898540496826, + "step": 3926, + "token_acc": 0.2808329299278385 + }, + { + "epoch": 2.3019642333626504, + "grad_norm": 0.3802494874312786, + "learning_rate": 0.0004942301574026172, + "loss": 3.1569154262542725, + "step": 3927, + "token_acc": 0.27951131054017925 + }, + { + "epoch": 2.302550571679859, + "grad_norm": 0.34674268222322163, + "learning_rate": 0.0004942249806180437, + "loss": 3.115814447402954, + "step": 3928, + "token_acc": 0.28652745881509606 + }, + { + "epoch": 2.303136909997068, + "grad_norm": 0.31747691770564734, + "learning_rate": 0.0004942198015393057, + "loss": 3.1416079998016357, + "step": 3929, + "token_acc": 0.2838086905199505 + }, + { + "epoch": 2.3037232483142773, + "grad_norm": 0.36753539195994017, + "learning_rate": 0.0004942146201664517, + "loss": 3.1530566215515137, + "step": 3930, + "token_acc": 0.2834558539038923 + }, + { + "epoch": 2.3043095866314864, + "grad_norm": 0.26942361972965423, + "learning_rate": 0.0004942094364995304, + "loss": 3.1065523624420166, + "step": 3931, + "token_acc": 0.2883228325539199 + }, + { + "epoch": 2.3048959249486956, + "grad_norm": 0.30884105939737105, + "learning_rate": 0.0004942042505385907, + "loss": 3.1122541427612305, + "step": 3932, + "token_acc": 0.2872049766944912 + }, + { + "epoch": 2.3054822632659047, + "grad_norm": 0.403694243383865, + "learning_rate": 0.000494199062283681, + "loss": 3.159322500228882, + "step": 3933, + "token_acc": 0.2818386229711568 + }, + { + "epoch": 2.3060686015831133, + "grad_norm": 0.3661677765343658, + "learning_rate": 0.0004941938717348502, + "loss": 3.125225782394409, + "step": 3934, + "token_acc": 0.28405573369213244 + }, + { + "epoch": 2.3066549399003224, + "grad_norm": 0.30627501637965, + "learning_rate": 0.0004941886788921469, + "loss": 3.168328046798706, + "step": 3935, + "token_acc": 0.2801591748640684 + }, + { + "epoch": 2.3072412782175316, + "grad_norm": 0.31617263767736264, + "learning_rate": 0.0004941834837556201, + "loss": 3.125821113586426, + "step": 3936, + "token_acc": 0.2859601204590249 + }, + { + "epoch": 2.3078276165347407, + "grad_norm": 0.3706222755913848, + "learning_rate": 0.0004941782863253186, + "loss": 3.193310260772705, + "step": 3937, + "token_acc": 0.27671050192573066 + }, + { + "epoch": 2.3084139548519493, + "grad_norm": 0.38931355205361134, + "learning_rate": 0.000494173086601291, + "loss": 3.165721893310547, + "step": 3938, + "token_acc": 0.27979172780309985 + }, + { + "epoch": 2.3090002931691584, + "grad_norm": 0.2907466859373121, + "learning_rate": 0.0004941678845835864, + "loss": 3.153498649597168, + "step": 3939, + "token_acc": 0.2827911884975597 + }, + { + "epoch": 2.3095866314863676, + "grad_norm": 0.30848660609153267, + "learning_rate": 0.0004941626802722535, + "loss": 3.177926540374756, + "step": 3940, + "token_acc": 0.2789154851344691 + }, + { + "epoch": 2.3101729698035767, + "grad_norm": 0.36349449846897497, + "learning_rate": 0.0004941574736673412, + "loss": 3.1330854892730713, + "step": 3941, + "token_acc": 0.2844436847123806 + }, + { + "epoch": 2.310759308120786, + "grad_norm": 0.26739818078873123, + "learning_rate": 0.0004941522647688984, + "loss": 3.116626739501953, + "step": 3942, + "token_acc": 0.28746450388732864 + }, + { + "epoch": 2.311345646437995, + "grad_norm": 0.2890124532439609, + "learning_rate": 0.0004941470535769742, + "loss": 3.156836986541748, + "step": 3943, + "token_acc": 0.28451289855033635 + }, + { + "epoch": 2.3119319847552036, + "grad_norm": 0.25041120977772835, + "learning_rate": 0.0004941418400916173, + "loss": 3.117131233215332, + "step": 3944, + "token_acc": 0.2859157132059736 + }, + { + "epoch": 2.3125183230724127, + "grad_norm": 0.2965372704381502, + "learning_rate": 0.0004941366243128768, + "loss": 3.1520376205444336, + "step": 3945, + "token_acc": 0.2818441440373137 + }, + { + "epoch": 2.313104661389622, + "grad_norm": 0.24446183801353258, + "learning_rate": 0.0004941314062408018, + "loss": 3.140288829803467, + "step": 3946, + "token_acc": 0.2838446551500195 + }, + { + "epoch": 2.313690999706831, + "grad_norm": 0.2858152316901607, + "learning_rate": 0.0004941261858754411, + "loss": 3.2104921340942383, + "step": 3947, + "token_acc": 0.27508570369684054 + }, + { + "epoch": 2.31427733802404, + "grad_norm": 0.2807372806965721, + "learning_rate": 0.000494120963216844, + "loss": 3.167325973510742, + "step": 3948, + "token_acc": 0.2820974097783593 + }, + { + "epoch": 2.3148636763412487, + "grad_norm": 0.298617957931145, + "learning_rate": 0.0004941157382650593, + "loss": 3.1299381256103516, + "step": 3949, + "token_acc": 0.28588010342879944 + }, + { + "epoch": 2.315450014658458, + "grad_norm": 0.3328875925979855, + "learning_rate": 0.0004941105110201361, + "loss": 3.1952295303344727, + "step": 3950, + "token_acc": 0.27685804860866503 + }, + { + "epoch": 2.316036352975667, + "grad_norm": 0.27993508311367893, + "learning_rate": 0.0004941052814821237, + "loss": 3.179368019104004, + "step": 3951, + "token_acc": 0.27853155271993946 + }, + { + "epoch": 2.316622691292876, + "grad_norm": 0.27208189822011764, + "learning_rate": 0.0004941000496510712, + "loss": 3.1553988456726074, + "step": 3952, + "token_acc": 0.28255558270217446 + }, + { + "epoch": 2.317209029610085, + "grad_norm": 0.2906117613745728, + "learning_rate": 0.0004940948155270276, + "loss": 3.13283109664917, + "step": 3953, + "token_acc": 0.2849087229647118 + }, + { + "epoch": 2.3177953679272942, + "grad_norm": 0.23200456280574802, + "learning_rate": 0.000494089579110042, + "loss": 3.1319456100463867, + "step": 3954, + "token_acc": 0.2853875079407365 + }, + { + "epoch": 2.318381706244503, + "grad_norm": 0.27156625862554395, + "learning_rate": 0.0004940843404001639, + "loss": 3.1421096324920654, + "step": 3955, + "token_acc": 0.2836336567585605 + }, + { + "epoch": 2.318968044561712, + "grad_norm": 0.25999007877423913, + "learning_rate": 0.0004940790993974422, + "loss": 3.1816375255584717, + "step": 3956, + "token_acc": 0.2798555396659961 + }, + { + "epoch": 2.319554382878921, + "grad_norm": 0.2535092620806209, + "learning_rate": 0.0004940738561019264, + "loss": 3.1191036701202393, + "step": 3957, + "token_acc": 0.28780223462517146 + }, + { + "epoch": 2.3201407211961302, + "grad_norm": 0.27356935213757333, + "learning_rate": 0.0004940686105136655, + "loss": 3.1959195137023926, + "step": 3958, + "token_acc": 0.27543486453877924 + }, + { + "epoch": 2.3207270595133394, + "grad_norm": 0.2532781014714082, + "learning_rate": 0.0004940633626327089, + "loss": 3.170225143432617, + "step": 3959, + "token_acc": 0.27967962519776635 + }, + { + "epoch": 2.321313397830548, + "grad_norm": 0.31218292184798657, + "learning_rate": 0.0004940581124591058, + "loss": 3.165313720703125, + "step": 3960, + "token_acc": 0.2818811536889771 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.36911755450408884, + "learning_rate": 0.0004940528599929058, + "loss": 3.171064853668213, + "step": 3961, + "token_acc": 0.2791504559906292 + }, + { + "epoch": 2.3224860744649662, + "grad_norm": 0.29033998962323826, + "learning_rate": 0.0004940476052341579, + "loss": 3.1544559001922607, + "step": 3962, + "token_acc": 0.28208769373426945 + }, + { + "epoch": 2.3230724127821754, + "grad_norm": 0.25904069964386445, + "learning_rate": 0.0004940423481829117, + "loss": 3.1153059005737305, + "step": 3963, + "token_acc": 0.28822642818081173 + }, + { + "epoch": 2.3236587510993845, + "grad_norm": 0.2837859334986874, + "learning_rate": 0.0004940370888392165, + "loss": 3.173543691635132, + "step": 3964, + "token_acc": 0.2793589706769293 + }, + { + "epoch": 2.3242450894165936, + "grad_norm": 0.2984049085507968, + "learning_rate": 0.0004940318272031216, + "loss": 3.1182613372802734, + "step": 3965, + "token_acc": 0.2868576841108217 + }, + { + "epoch": 2.3248314277338022, + "grad_norm": 0.3004806646151251, + "learning_rate": 0.0004940265632746765, + "loss": 3.157388687133789, + "step": 3966, + "token_acc": 0.2816664987191794 + }, + { + "epoch": 2.3254177660510114, + "grad_norm": 0.25186433358815535, + "learning_rate": 0.0004940212970539308, + "loss": 3.1801986694335938, + "step": 3967, + "token_acc": 0.2793455176484238 + }, + { + "epoch": 2.3260041043682205, + "grad_norm": 0.27804333835954365, + "learning_rate": 0.0004940160285409337, + "loss": 3.1617660522460938, + "step": 3968, + "token_acc": 0.28207074173051616 + }, + { + "epoch": 2.3265904426854296, + "grad_norm": 0.27507179174472846, + "learning_rate": 0.000494010757735735, + "loss": 3.1777541637420654, + "step": 3969, + "token_acc": 0.2794753195753165 + }, + { + "epoch": 2.3271767810026387, + "grad_norm": 0.2728657950931429, + "learning_rate": 0.000494005484638384, + "loss": 3.169367790222168, + "step": 3970, + "token_acc": 0.2797783361265116 + }, + { + "epoch": 2.3277631193198474, + "grad_norm": 0.2699673390677939, + "learning_rate": 0.0004940002092489301, + "loss": 3.169825315475464, + "step": 3971, + "token_acc": 0.2795208638904832 + }, + { + "epoch": 2.3283494576370565, + "grad_norm": 0.2562718595891836, + "learning_rate": 0.0004939949315674231, + "loss": 3.1752443313598633, + "step": 3972, + "token_acc": 0.2784165856818918 + }, + { + "epoch": 2.3289357959542656, + "grad_norm": 0.27027362421607853, + "learning_rate": 0.0004939896515939126, + "loss": 3.187270164489746, + "step": 3973, + "token_acc": 0.2773837418771187 + }, + { + "epoch": 2.3295221342714747, + "grad_norm": 0.322310044671525, + "learning_rate": 0.0004939843693284481, + "loss": 3.1919784545898438, + "step": 3974, + "token_acc": 0.2752439112899045 + }, + { + "epoch": 2.330108472588684, + "grad_norm": 0.25917071776406353, + "learning_rate": 0.0004939790847710791, + "loss": 3.1480650901794434, + "step": 3975, + "token_acc": 0.283121095307723 + }, + { + "epoch": 2.330694810905893, + "grad_norm": 0.29256948812926303, + "learning_rate": 0.0004939737979218555, + "loss": 3.13071870803833, + "step": 3976, + "token_acc": 0.2850702219369269 + }, + { + "epoch": 2.3312811492231016, + "grad_norm": 0.355983327424739, + "learning_rate": 0.0004939685087808267, + "loss": 3.1213719844818115, + "step": 3977, + "token_acc": 0.28660329886845715 + }, + { + "epoch": 2.3318674875403107, + "grad_norm": 0.4462612165228996, + "learning_rate": 0.0004939632173480426, + "loss": 3.137389659881592, + "step": 3978, + "token_acc": 0.28391803827962464 + }, + { + "epoch": 2.33245382585752, + "grad_norm": 0.42123050673687495, + "learning_rate": 0.0004939579236235528, + "loss": 3.1404547691345215, + "step": 3979, + "token_acc": 0.2839996874149486 + }, + { + "epoch": 2.333040164174729, + "grad_norm": 0.3020603665689284, + "learning_rate": 0.0004939526276074071, + "loss": 3.1084227561950684, + "step": 3980, + "token_acc": 0.2901078772776163 + }, + { + "epoch": 2.333626502491938, + "grad_norm": 0.36511023866417164, + "learning_rate": 0.000493947329299655, + "loss": 3.192930221557617, + "step": 3981, + "token_acc": 0.276633631688403 + }, + { + "epoch": 2.3342128408091467, + "grad_norm": 0.3110911895364269, + "learning_rate": 0.0004939420287003466, + "loss": 3.169325828552246, + "step": 3982, + "token_acc": 0.2808059304314769 + }, + { + "epoch": 2.334799179126356, + "grad_norm": 0.33003785448185197, + "learning_rate": 0.0004939367258095317, + "loss": 3.116131067276001, + "step": 3983, + "token_acc": 0.28655666558342857 + }, + { + "epoch": 2.335385517443565, + "grad_norm": 0.35248184867215654, + "learning_rate": 0.0004939314206272598, + "loss": 3.1521859169006348, + "step": 3984, + "token_acc": 0.2816936570652686 + }, + { + "epoch": 2.335971855760774, + "grad_norm": 0.30803708914492633, + "learning_rate": 0.0004939261131535809, + "loss": 3.128833770751953, + "step": 3985, + "token_acc": 0.2867014266286512 + }, + { + "epoch": 2.336558194077983, + "grad_norm": 0.3361488742673685, + "learning_rate": 0.0004939208033885449, + "loss": 3.1462182998657227, + "step": 3986, + "token_acc": 0.2848019960640245 + }, + { + "epoch": 2.3371445323951923, + "grad_norm": 0.34731938369147075, + "learning_rate": 0.0004939154913322016, + "loss": 3.1032023429870605, + "step": 3987, + "token_acc": 0.2891486684839589 + }, + { + "epoch": 2.337730870712401, + "grad_norm": 0.3334990850308182, + "learning_rate": 0.000493910176984601, + "loss": 3.1642775535583496, + "step": 3988, + "token_acc": 0.27947172882779986 + }, + { + "epoch": 2.33831720902961, + "grad_norm": 0.27416137275532815, + "learning_rate": 0.000493904860345793, + "loss": 3.148524284362793, + "step": 3989, + "token_acc": 0.2839143250147377 + }, + { + "epoch": 2.338903547346819, + "grad_norm": 0.320259817653581, + "learning_rate": 0.0004938995414158273, + "loss": 3.1541898250579834, + "step": 3990, + "token_acc": 0.28154540126158367 + }, + { + "epoch": 2.3394898856640283, + "grad_norm": 0.2768282240883893, + "learning_rate": 0.0004938942201947543, + "loss": 3.131925106048584, + "step": 3991, + "token_acc": 0.2853591918347077 + }, + { + "epoch": 2.340076223981237, + "grad_norm": 0.273659546836402, + "learning_rate": 0.0004938888966826236, + "loss": 3.1534693241119385, + "step": 3992, + "token_acc": 0.28042292687189346 + }, + { + "epoch": 2.340662562298446, + "grad_norm": 0.3151100472043264, + "learning_rate": 0.0004938835708794855, + "loss": 3.123828887939453, + "step": 3993, + "token_acc": 0.28522968693630496 + }, + { + "epoch": 2.341248900615655, + "grad_norm": 0.33725714209704866, + "learning_rate": 0.0004938782427853898, + "loss": 3.1644792556762695, + "step": 3994, + "token_acc": 0.28067677645940103 + }, + { + "epoch": 2.3418352389328643, + "grad_norm": 0.4047552218966542, + "learning_rate": 0.0004938729124003866, + "loss": 3.1369543075561523, + "step": 3995, + "token_acc": 0.2841544067679257 + }, + { + "epoch": 2.3424215772500734, + "grad_norm": 0.348581109982346, + "learning_rate": 0.000493867579724526, + "loss": 3.163398265838623, + "step": 3996, + "token_acc": 0.2817965044955098 + }, + { + "epoch": 2.3430079155672825, + "grad_norm": 0.2667126137814938, + "learning_rate": 0.0004938622447578582, + "loss": 3.135937213897705, + "step": 3997, + "token_acc": 0.28565611647624534 + }, + { + "epoch": 2.343594253884491, + "grad_norm": 0.267303131580925, + "learning_rate": 0.0004938569075004331, + "loss": 3.1574630737304688, + "step": 3998, + "token_acc": 0.2810470581602161 + }, + { + "epoch": 2.3441805922017003, + "grad_norm": 0.26745200425516374, + "learning_rate": 0.0004938515679523011, + "loss": 3.1264781951904297, + "step": 3999, + "token_acc": 0.28595221915763946 + }, + { + "epoch": 2.3447669305189094, + "grad_norm": 0.2688300192193967, + "learning_rate": 0.0004938462261135121, + "loss": 3.096856117248535, + "step": 4000, + "token_acc": 0.2897349739830139 + }, + { + "epoch": 2.3453532688361185, + "grad_norm": 0.25405227045064427, + "learning_rate": 0.0004938408819841164, + "loss": 3.1228528022766113, + "step": 4001, + "token_acc": 0.2873666981867471 + }, + { + "epoch": 2.3459396071533276, + "grad_norm": 0.2884296113783983, + "learning_rate": 0.0004938355355641643, + "loss": 3.186028003692627, + "step": 4002, + "token_acc": 0.2769003078069607 + }, + { + "epoch": 2.3465259454705363, + "grad_norm": 0.26051591206357727, + "learning_rate": 0.0004938301868537058, + "loss": 3.15671706199646, + "step": 4003, + "token_acc": 0.28157566490966884 + }, + { + "epoch": 2.3471122837877454, + "grad_norm": 0.27665756626266563, + "learning_rate": 0.0004938248358527913, + "loss": 3.1955313682556152, + "step": 4004, + "token_acc": 0.27567125371905565 + }, + { + "epoch": 2.3476986221049545, + "grad_norm": 0.2190041817743469, + "learning_rate": 0.000493819482561471, + "loss": 3.172473430633545, + "step": 4005, + "token_acc": 0.2798969188915761 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.25579136033083066, + "learning_rate": 0.0004938141269797954, + "loss": 3.193089008331299, + "step": 4006, + "token_acc": 0.27685645650708024 + }, + { + "epoch": 2.3488712987393727, + "grad_norm": 0.2988296065470271, + "learning_rate": 0.0004938087691078143, + "loss": 3.1299846172332764, + "step": 4007, + "token_acc": 0.2847666385980102 + }, + { + "epoch": 2.349457637056582, + "grad_norm": 0.34174377554982466, + "learning_rate": 0.0004938034089455786, + "loss": 3.1282472610473633, + "step": 4008, + "token_acc": 0.2869298564187029 + }, + { + "epoch": 2.3500439753737905, + "grad_norm": 0.2954870089577022, + "learning_rate": 0.0004937980464931383, + "loss": 3.1838347911834717, + "step": 4009, + "token_acc": 0.2792873635881474 + }, + { + "epoch": 2.3506303136909996, + "grad_norm": 0.24362040653844436, + "learning_rate": 0.0004937926817505439, + "loss": 3.1690683364868164, + "step": 4010, + "token_acc": 0.2798725596529284 + }, + { + "epoch": 2.3512166520082087, + "grad_norm": 0.3234849344313527, + "learning_rate": 0.0004937873147178457, + "loss": 3.134331464767456, + "step": 4011, + "token_acc": 0.284568084668663 + }, + { + "epoch": 2.351802990325418, + "grad_norm": 0.2837742241760359, + "learning_rate": 0.0004937819453950942, + "loss": 3.1493167877197266, + "step": 4012, + "token_acc": 0.2833623038007524 + }, + { + "epoch": 2.352389328642627, + "grad_norm": 0.3249334552706245, + "learning_rate": 0.00049377657378234, + "loss": 3.1755361557006836, + "step": 4013, + "token_acc": 0.27741696757812 + }, + { + "epoch": 2.3529756669598356, + "grad_norm": 0.35545724682376334, + "learning_rate": 0.0004937711998796332, + "loss": 3.161740303039551, + "step": 4014, + "token_acc": 0.2807676580432353 + }, + { + "epoch": 2.3535620052770447, + "grad_norm": 0.26454969303670217, + "learning_rate": 0.0004937658236870245, + "loss": 3.1504836082458496, + "step": 4015, + "token_acc": 0.2816376466701425 + }, + { + "epoch": 2.354148343594254, + "grad_norm": 0.3928685282247113, + "learning_rate": 0.0004937604452045644, + "loss": 3.123591423034668, + "step": 4016, + "token_acc": 0.28682305957855075 + }, + { + "epoch": 2.354734681911463, + "grad_norm": 0.3254622356512941, + "learning_rate": 0.0004937550644323034, + "loss": 3.158384323120117, + "step": 4017, + "token_acc": 0.2802929075604877 + }, + { + "epoch": 2.355321020228672, + "grad_norm": 0.3174958619786969, + "learning_rate": 0.0004937496813702919, + "loss": 3.1805267333984375, + "step": 4018, + "token_acc": 0.2802285732544795 + }, + { + "epoch": 2.355907358545881, + "grad_norm": 0.2589008997969103, + "learning_rate": 0.0004937442960185807, + "loss": 3.1231000423431396, + "step": 4019, + "token_acc": 0.2867034919166649 + }, + { + "epoch": 2.35649369686309, + "grad_norm": 0.34367314426343626, + "learning_rate": 0.0004937389083772203, + "loss": 3.1560707092285156, + "step": 4020, + "token_acc": 0.2827240324925272 + }, + { + "epoch": 2.357080035180299, + "grad_norm": 0.28506128200065484, + "learning_rate": 0.0004937335184462614, + "loss": 3.177469253540039, + "step": 4021, + "token_acc": 0.27729345614030587 + }, + { + "epoch": 2.357666373497508, + "grad_norm": 0.30081359916660666, + "learning_rate": 0.0004937281262257544, + "loss": 3.165536880493164, + "step": 4022, + "token_acc": 0.27862168087444217 + }, + { + "epoch": 2.358252711814717, + "grad_norm": 0.2868817523103614, + "learning_rate": 0.0004937227317157501, + "loss": 3.130434036254883, + "step": 4023, + "token_acc": 0.2868024833735954 + }, + { + "epoch": 2.3588390501319263, + "grad_norm": 0.2658631021021933, + "learning_rate": 0.0004937173349162991, + "loss": 3.1382298469543457, + "step": 4024, + "token_acc": 0.2841774159441742 + }, + { + "epoch": 2.359425388449135, + "grad_norm": 0.27630209701266484, + "learning_rate": 0.0004937119358274522, + "loss": 3.1533422470092773, + "step": 4025, + "token_acc": 0.284287871376619 + }, + { + "epoch": 2.360011726766344, + "grad_norm": 0.26806193253509747, + "learning_rate": 0.0004937065344492601, + "loss": 3.1603002548217773, + "step": 4026, + "token_acc": 0.2827379050297182 + }, + { + "epoch": 2.360598065083553, + "grad_norm": 0.2878893156666818, + "learning_rate": 0.0004937011307817735, + "loss": 3.1473021507263184, + "step": 4027, + "token_acc": 0.283210009103429 + }, + { + "epoch": 2.3611844034007623, + "grad_norm": 0.27372966378220154, + "learning_rate": 0.0004936957248250431, + "loss": 3.161133289337158, + "step": 4028, + "token_acc": 0.28148498915782993 + }, + { + "epoch": 2.3617707417179714, + "grad_norm": 0.32987362880637805, + "learning_rate": 0.0004936903165791199, + "loss": 3.1719350814819336, + "step": 4029, + "token_acc": 0.278568392072714 + }, + { + "epoch": 2.3623570800351805, + "grad_norm": 0.2805507637830382, + "learning_rate": 0.0004936849060440543, + "loss": 3.1277289390563965, + "step": 4030, + "token_acc": 0.28452087188978 + }, + { + "epoch": 2.362943418352389, + "grad_norm": 0.33163267071368285, + "learning_rate": 0.0004936794932198977, + "loss": 3.1783652305603027, + "step": 4031, + "token_acc": 0.2802775675260815 + }, + { + "epoch": 2.3635297566695983, + "grad_norm": 0.2393012788430242, + "learning_rate": 0.0004936740781067004, + "loss": 3.1389975547790527, + "step": 4032, + "token_acc": 0.2846301481272356 + }, + { + "epoch": 2.3641160949868074, + "grad_norm": 0.2909760459874164, + "learning_rate": 0.0004936686607045135, + "loss": 3.0992393493652344, + "step": 4033, + "token_acc": 0.28946476570402624 + }, + { + "epoch": 2.3647024333040165, + "grad_norm": 0.24001186493798704, + "learning_rate": 0.000493663241013388, + "loss": 3.1664459705352783, + "step": 4034, + "token_acc": 0.2798219274654691 + }, + { + "epoch": 2.3652887716212256, + "grad_norm": 0.31053089378158255, + "learning_rate": 0.0004936578190333745, + "loss": 3.0978147983551025, + "step": 4035, + "token_acc": 0.2919864696382259 + }, + { + "epoch": 2.3658751099384343, + "grad_norm": 0.29250564650884, + "learning_rate": 0.0004936523947645243, + "loss": 3.170487880706787, + "step": 4036, + "token_acc": 0.2793936654184947 + }, + { + "epoch": 2.3664614482556434, + "grad_norm": 0.28161171625902087, + "learning_rate": 0.0004936469682068882, + "loss": 3.1361031532287598, + "step": 4037, + "token_acc": 0.2839699973211894 + }, + { + "epoch": 2.3670477865728525, + "grad_norm": 0.35072093783031416, + "learning_rate": 0.000493641539360517, + "loss": 3.168086051940918, + "step": 4038, + "token_acc": 0.2815140399861308 + }, + { + "epoch": 2.3676341248900616, + "grad_norm": 0.32737886561048474, + "learning_rate": 0.0004936361082254619, + "loss": 3.1163172721862793, + "step": 4039, + "token_acc": 0.28807761313044056 + }, + { + "epoch": 2.3682204632072708, + "grad_norm": 0.3348837610010347, + "learning_rate": 0.0004936306748017739, + "loss": 3.138913154602051, + "step": 4040, + "token_acc": 0.28368995940737324 + }, + { + "epoch": 2.36880680152448, + "grad_norm": 0.2526109604313326, + "learning_rate": 0.000493625239089504, + "loss": 3.139894962310791, + "step": 4041, + "token_acc": 0.28477981700465516 + }, + { + "epoch": 2.3693931398416885, + "grad_norm": 0.3320634406713377, + "learning_rate": 0.0004936198010887032, + "loss": 3.151625633239746, + "step": 4042, + "token_acc": 0.28076933033056356 + }, + { + "epoch": 2.3699794781588976, + "grad_norm": 0.3323202087643768, + "learning_rate": 0.0004936143607994227, + "loss": 3.0928902626037598, + "step": 4043, + "token_acc": 0.29072783656563844 + }, + { + "epoch": 2.3705658164761068, + "grad_norm": 0.26271573821993627, + "learning_rate": 0.0004936089182217136, + "loss": 3.0799219608306885, + "step": 4044, + "token_acc": 0.29257940033821195 + }, + { + "epoch": 2.371152154793316, + "grad_norm": 0.30796021540792395, + "learning_rate": 0.000493603473355627, + "loss": 3.1372227668762207, + "step": 4045, + "token_acc": 0.28536175710594314 + }, + { + "epoch": 2.3717384931105245, + "grad_norm": 0.31112680816289107, + "learning_rate": 0.000493598026201214, + "loss": 3.145521879196167, + "step": 4046, + "token_acc": 0.28273259174593873 + }, + { + "epoch": 2.3723248314277336, + "grad_norm": 0.2547932141582526, + "learning_rate": 0.0004935925767585258, + "loss": 3.1690471172332764, + "step": 4047, + "token_acc": 0.28082155132657455 + }, + { + "epoch": 2.3729111697449428, + "grad_norm": 0.3067688908458565, + "learning_rate": 0.0004935871250276135, + "loss": 3.1275105476379395, + "step": 4048, + "token_acc": 0.28545256486432957 + }, + { + "epoch": 2.373497508062152, + "grad_norm": 0.2805179341133092, + "learning_rate": 0.0004935816710085285, + "loss": 3.1554250717163086, + "step": 4049, + "token_acc": 0.28226631896760124 + }, + { + "epoch": 2.374083846379361, + "grad_norm": 0.28285718271004157, + "learning_rate": 0.0004935762147013218, + "loss": 3.1893157958984375, + "step": 4050, + "token_acc": 0.27810859911182884 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.31795040351553366, + "learning_rate": 0.0004935707561060449, + "loss": 3.1490261554718018, + "step": 4051, + "token_acc": 0.28443693711896506 + }, + { + "epoch": 2.3752565230137788, + "grad_norm": 0.3021004988282593, + "learning_rate": 0.000493565295222749, + "loss": 3.1375069618225098, + "step": 4052, + "token_acc": 0.28617642438555524 + }, + { + "epoch": 2.375842861330988, + "grad_norm": 0.2743867322823992, + "learning_rate": 0.0004935598320514853, + "loss": 3.142432689666748, + "step": 4053, + "token_acc": 0.28379500913219274 + }, + { + "epoch": 2.376429199648197, + "grad_norm": 0.2729379695650785, + "learning_rate": 0.0004935543665923051, + "loss": 3.1676464080810547, + "step": 4054, + "token_acc": 0.28077004507319614 + }, + { + "epoch": 2.377015537965406, + "grad_norm": 0.2964195777850554, + "learning_rate": 0.0004935488988452598, + "loss": 3.1309988498687744, + "step": 4055, + "token_acc": 0.2866052434028869 + }, + { + "epoch": 2.377601876282615, + "grad_norm": 0.28457075331299414, + "learning_rate": 0.000493543428810401, + "loss": 3.1408252716064453, + "step": 4056, + "token_acc": 0.2843132597530786 + }, + { + "epoch": 2.378188214599824, + "grad_norm": 0.34623553441194527, + "learning_rate": 0.0004935379564877797, + "loss": 3.1029162406921387, + "step": 4057, + "token_acc": 0.2884390315265891 + }, + { + "epoch": 2.378774552917033, + "grad_norm": 0.3695914380533764, + "learning_rate": 0.0004935324818774475, + "loss": 3.162315845489502, + "step": 4058, + "token_acc": 0.2815020402944843 + }, + { + "epoch": 2.379360891234242, + "grad_norm": 0.30028005565483523, + "learning_rate": 0.0004935270049794558, + "loss": 3.164185047149658, + "step": 4059, + "token_acc": 0.28234970273698073 + }, + { + "epoch": 2.379947229551451, + "grad_norm": 0.26438452217338815, + "learning_rate": 0.000493521525793856, + "loss": 3.1331026554107666, + "step": 4060, + "token_acc": 0.285255655504796 + }, + { + "epoch": 2.3805335678686603, + "grad_norm": 0.24336475276660716, + "learning_rate": 0.0004935160443206997, + "loss": 3.168433427810669, + "step": 4061, + "token_acc": 0.28047342197006303 + }, + { + "epoch": 2.3811199061858694, + "grad_norm": 0.2589052213552716, + "learning_rate": 0.0004935105605600383, + "loss": 3.140836000442505, + "step": 4062, + "token_acc": 0.2842867829233226 + }, + { + "epoch": 2.381706244503078, + "grad_norm": 0.30157662895422804, + "learning_rate": 0.0004935050745119233, + "loss": 3.113926410675049, + "step": 4063, + "token_acc": 0.2870446716138725 + }, + { + "epoch": 2.382292582820287, + "grad_norm": 0.41415725712738666, + "learning_rate": 0.0004934995861764062, + "loss": 3.096813440322876, + "step": 4064, + "token_acc": 0.2890652004405109 + }, + { + "epoch": 2.3828789211374963, + "grad_norm": 0.3377295879502104, + "learning_rate": 0.0004934940955535386, + "loss": 3.148503065109253, + "step": 4065, + "token_acc": 0.28233725238215446 + }, + { + "epoch": 2.3834652594547054, + "grad_norm": 0.23028799966024388, + "learning_rate": 0.0004934886026433722, + "loss": 3.144071578979492, + "step": 4066, + "token_acc": 0.2828380423814329 + }, + { + "epoch": 2.3840515977719146, + "grad_norm": 0.30702097604114503, + "learning_rate": 0.0004934831074459585, + "loss": 3.1880664825439453, + "step": 4067, + "token_acc": 0.27869606819128034 + }, + { + "epoch": 2.3846379360891232, + "grad_norm": 0.2581701224243676, + "learning_rate": 0.000493477609961349, + "loss": 3.1330957412719727, + "step": 4068, + "token_acc": 0.28431855479866364 + }, + { + "epoch": 2.3852242744063323, + "grad_norm": 0.3121199480347057, + "learning_rate": 0.0004934721101895954, + "loss": 3.127673387527466, + "step": 4069, + "token_acc": 0.28644807537775535 + }, + { + "epoch": 2.3858106127235414, + "grad_norm": 0.27740788582797143, + "learning_rate": 0.0004934666081307496, + "loss": 3.110970973968506, + "step": 4070, + "token_acc": 0.2883976668427774 + }, + { + "epoch": 2.3863969510407506, + "grad_norm": 0.26695029307583873, + "learning_rate": 0.0004934611037848629, + "loss": 3.145266532897949, + "step": 4071, + "token_acc": 0.2827191365813495 + }, + { + "epoch": 2.3869832893579597, + "grad_norm": 0.3355222803854897, + "learning_rate": 0.0004934555971519872, + "loss": 3.1385531425476074, + "step": 4072, + "token_acc": 0.28259410667814183 + }, + { + "epoch": 2.387569627675169, + "grad_norm": 0.32344054787752197, + "learning_rate": 0.0004934500882321743, + "loss": 3.108651876449585, + "step": 4073, + "token_acc": 0.28665177504222006 + }, + { + "epoch": 2.3881559659923774, + "grad_norm": 0.26285109154248626, + "learning_rate": 0.0004934445770254758, + "loss": 3.1573333740234375, + "step": 4074, + "token_acc": 0.2833565374172521 + }, + { + "epoch": 2.3887423043095866, + "grad_norm": 0.27335983051668883, + "learning_rate": 0.0004934390635319436, + "loss": 3.167174816131592, + "step": 4075, + "token_acc": 0.2813889184302882 + }, + { + "epoch": 2.3893286426267957, + "grad_norm": 0.27002179432602424, + "learning_rate": 0.0004934335477516295, + "loss": 3.1499409675598145, + "step": 4076, + "token_acc": 0.2825924980862465 + }, + { + "epoch": 2.389914980944005, + "grad_norm": 0.27710952457774657, + "learning_rate": 0.0004934280296845852, + "loss": 3.16397762298584, + "step": 4077, + "token_acc": 0.28078295437897755 + }, + { + "epoch": 2.390501319261214, + "grad_norm": 0.27246124377460496, + "learning_rate": 0.0004934225093308625, + "loss": 3.145174503326416, + "step": 4078, + "token_acc": 0.28499365706787755 + }, + { + "epoch": 2.3910876575784226, + "grad_norm": 0.26057186861358933, + "learning_rate": 0.0004934169866905135, + "loss": 3.134951591491699, + "step": 4079, + "token_acc": 0.2852747110948988 + }, + { + "epoch": 2.3916739958956317, + "grad_norm": 0.2798257223702878, + "learning_rate": 0.0004934114617635898, + "loss": 3.1059961318969727, + "step": 4080, + "token_acc": 0.28707484761446783 + }, + { + "epoch": 2.392260334212841, + "grad_norm": 0.28362520849016537, + "learning_rate": 0.0004934059345501435, + "loss": 3.142159938812256, + "step": 4081, + "token_acc": 0.28452248550904785 + }, + { + "epoch": 2.39284667253005, + "grad_norm": 0.31383810330243866, + "learning_rate": 0.0004934004050502263, + "loss": 3.138308048248291, + "step": 4082, + "token_acc": 0.2843043759658693 + }, + { + "epoch": 2.393433010847259, + "grad_norm": 0.3155225989837768, + "learning_rate": 0.0004933948732638904, + "loss": 3.1475093364715576, + "step": 4083, + "token_acc": 0.28331787060024777 + }, + { + "epoch": 2.394019349164468, + "grad_norm": 0.26086610124537146, + "learning_rate": 0.0004933893391911876, + "loss": 3.1382765769958496, + "step": 4084, + "token_acc": 0.28297447708774087 + }, + { + "epoch": 2.394605687481677, + "grad_norm": 0.339735441378694, + "learning_rate": 0.00049338380283217, + "loss": 3.1605401039123535, + "step": 4085, + "token_acc": 0.2805394061501943 + }, + { + "epoch": 2.395192025798886, + "grad_norm": 0.37786700657508204, + "learning_rate": 0.0004933782641868894, + "loss": 3.1292455196380615, + "step": 4086, + "token_acc": 0.285063339204911 + }, + { + "epoch": 2.395778364116095, + "grad_norm": 0.28721391005681457, + "learning_rate": 0.0004933727232553981, + "loss": 3.1868762969970703, + "step": 4087, + "token_acc": 0.2777047283437648 + }, + { + "epoch": 2.396364702433304, + "grad_norm": 0.2579902605141577, + "learning_rate": 0.0004933671800377479, + "loss": 3.104323387145996, + "step": 4088, + "token_acc": 0.2893102211349132 + }, + { + "epoch": 2.3969510407505132, + "grad_norm": 0.3299529748874557, + "learning_rate": 0.0004933616345339911, + "loss": 3.1254196166992188, + "step": 4089, + "token_acc": 0.2863896377852757 + }, + { + "epoch": 2.397537379067722, + "grad_norm": 0.24838459133821034, + "learning_rate": 0.0004933560867441797, + "loss": 3.1728739738464355, + "step": 4090, + "token_acc": 0.2791913904038877 + }, + { + "epoch": 2.398123717384931, + "grad_norm": 0.33561991548192066, + "learning_rate": 0.0004933505366683657, + "loss": 3.1828389167785645, + "step": 4091, + "token_acc": 0.2781146397364652 + }, + { + "epoch": 2.39871005570214, + "grad_norm": 0.2598931955585801, + "learning_rate": 0.0004933449843066013, + "loss": 3.151083469390869, + "step": 4092, + "token_acc": 0.28187937067522834 + }, + { + "epoch": 2.3992963940193492, + "grad_norm": 0.3425687238866714, + "learning_rate": 0.0004933394296589387, + "loss": 3.089930772781372, + "step": 4093, + "token_acc": 0.2900241613112128 + }, + { + "epoch": 2.3998827323365584, + "grad_norm": 0.30447962691173036, + "learning_rate": 0.0004933338727254301, + "loss": 3.1673331260681152, + "step": 4094, + "token_acc": 0.27943464938315177 + }, + { + "epoch": 2.4004690706537675, + "grad_norm": 0.32602099609588964, + "learning_rate": 0.0004933283135061277, + "loss": 3.165236473083496, + "step": 4095, + "token_acc": 0.28015353549866967 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.2977355361720388, + "learning_rate": 0.0004933227520010836, + "loss": 3.2125751972198486, + "step": 4096, + "token_acc": 0.274417364379618 + }, + { + "epoch": 2.4016417472881852, + "grad_norm": 0.32008640793279763, + "learning_rate": 0.0004933171882103501, + "loss": 3.150895833969116, + "step": 4097, + "token_acc": 0.2827747372254651 + }, + { + "epoch": 2.4022280856053944, + "grad_norm": 0.3122187543629649, + "learning_rate": 0.0004933116221339796, + "loss": 3.1602330207824707, + "step": 4098, + "token_acc": 0.2812706561353062 + }, + { + "epoch": 2.4028144239226035, + "grad_norm": 0.307871790964921, + "learning_rate": 0.0004933060537720242, + "loss": 3.1681432723999023, + "step": 4099, + "token_acc": 0.28057955507084964 + }, + { + "epoch": 2.403400762239812, + "grad_norm": 0.3600218656442076, + "learning_rate": 0.0004933004831245364, + "loss": 3.167104959487915, + "step": 4100, + "token_acc": 0.281351371931873 + }, + { + "epoch": 2.4039871005570213, + "grad_norm": 0.2857925533151999, + "learning_rate": 0.0004932949101915683, + "loss": 3.1797680854797363, + "step": 4101, + "token_acc": 0.2792059015416545 + }, + { + "epoch": 2.4045734388742304, + "grad_norm": 0.28310141269076017, + "learning_rate": 0.0004932893349731723, + "loss": 3.1275579929351807, + "step": 4102, + "token_acc": 0.28668382569500467 + }, + { + "epoch": 2.4051597771914395, + "grad_norm": 0.3953732165627844, + "learning_rate": 0.000493283757469401, + "loss": 3.146716833114624, + "step": 4103, + "token_acc": 0.2819510362080858 + }, + { + "epoch": 2.4057461155086486, + "grad_norm": 0.34086716300798964, + "learning_rate": 0.0004932781776803065, + "loss": 3.202929973602295, + "step": 4104, + "token_acc": 0.27649836255310195 + }, + { + "epoch": 2.4063324538258577, + "grad_norm": 0.32721596986891277, + "learning_rate": 0.0004932725956059414, + "loss": 3.181713342666626, + "step": 4105, + "token_acc": 0.2793457070439017 + }, + { + "epoch": 2.4069187921430664, + "grad_norm": 0.31054398517466103, + "learning_rate": 0.000493267011246358, + "loss": 3.1325745582580566, + "step": 4106, + "token_acc": 0.286374695863747 + }, + { + "epoch": 2.4075051304602755, + "grad_norm": 0.3606109884193681, + "learning_rate": 0.000493261424601609, + "loss": 3.123775005340576, + "step": 4107, + "token_acc": 0.28622277287498493 + }, + { + "epoch": 2.4080914687774846, + "grad_norm": 0.3179994979862523, + "learning_rate": 0.0004932558356717466, + "loss": 3.1152191162109375, + "step": 4108, + "token_acc": 0.2865733325896355 + }, + { + "epoch": 2.4086778070946937, + "grad_norm": 0.275258802316626, + "learning_rate": 0.0004932502444568235, + "loss": 3.155104160308838, + "step": 4109, + "token_acc": 0.28337545542705356 + }, + { + "epoch": 2.409264145411903, + "grad_norm": 0.3364813915158878, + "learning_rate": 0.000493244650956892, + "loss": 3.1361351013183594, + "step": 4110, + "token_acc": 0.2835962558935723 + }, + { + "epoch": 2.4098504837291115, + "grad_norm": 0.26292514259308347, + "learning_rate": 0.0004932390551720048, + "loss": 3.130554676055908, + "step": 4111, + "token_acc": 0.2851221476510067 + }, + { + "epoch": 2.4104368220463206, + "grad_norm": 0.3066565292712497, + "learning_rate": 0.0004932334571022145, + "loss": 3.152090549468994, + "step": 4112, + "token_acc": 0.2812216580752684 + }, + { + "epoch": 2.4110231603635297, + "grad_norm": 0.25256612744351087, + "learning_rate": 0.0004932278567475737, + "loss": 3.098417043685913, + "step": 4113, + "token_acc": 0.29005599791116893 + }, + { + "epoch": 2.411609498680739, + "grad_norm": 0.2721411239759966, + "learning_rate": 0.0004932222541081348, + "loss": 3.145481586456299, + "step": 4114, + "token_acc": 0.2838695292929558 + }, + { + "epoch": 2.412195836997948, + "grad_norm": 0.22599539963413645, + "learning_rate": 0.0004932166491839507, + "loss": 3.1612601280212402, + "step": 4115, + "token_acc": 0.2824076501242516 + }, + { + "epoch": 2.412782175315157, + "grad_norm": 0.3022587962613591, + "learning_rate": 0.0004932110419750738, + "loss": 3.177727222442627, + "step": 4116, + "token_acc": 0.27852007569971754 + }, + { + "epoch": 2.4133685136323657, + "grad_norm": 0.30763859944274985, + "learning_rate": 0.000493205432481557, + "loss": 3.138605833053589, + "step": 4117, + "token_acc": 0.2863578340757533 + }, + { + "epoch": 2.413954851949575, + "grad_norm": 0.2352536566945773, + "learning_rate": 0.0004931998207034528, + "loss": 3.1431665420532227, + "step": 4118, + "token_acc": 0.2826046662193807 + }, + { + "epoch": 2.414541190266784, + "grad_norm": 0.31237118490587434, + "learning_rate": 0.000493194206640814, + "loss": 3.173316240310669, + "step": 4119, + "token_acc": 0.28012032970978834 + }, + { + "epoch": 2.415127528583993, + "grad_norm": 0.3134306802686362, + "learning_rate": 0.0004931885902936935, + "loss": 3.1762430667877197, + "step": 4120, + "token_acc": 0.27753673434010745 + }, + { + "epoch": 2.415713866901202, + "grad_norm": 0.40413384841349204, + "learning_rate": 0.0004931829716621438, + "loss": 3.1484591960906982, + "step": 4121, + "token_acc": 0.28246398138665446 + }, + { + "epoch": 2.416300205218411, + "grad_norm": 0.3652326662573222, + "learning_rate": 0.0004931773507462176, + "loss": 3.1374175548553467, + "step": 4122, + "token_acc": 0.28323764849576066 + }, + { + "epoch": 2.41688654353562, + "grad_norm": 0.2787192995217331, + "learning_rate": 0.0004931717275459681, + "loss": 3.112290143966675, + "step": 4123, + "token_acc": 0.28707988407282276 + }, + { + "epoch": 2.417472881852829, + "grad_norm": 0.3096256363565667, + "learning_rate": 0.0004931661020614478, + "loss": 3.1806652545928955, + "step": 4124, + "token_acc": 0.27818994062191155 + }, + { + "epoch": 2.418059220170038, + "grad_norm": 0.3263757030294466, + "learning_rate": 0.0004931604742927096, + "loss": 3.175119400024414, + "step": 4125, + "token_acc": 0.2779952528302825 + }, + { + "epoch": 2.4186455584872473, + "grad_norm": 0.32171825048421016, + "learning_rate": 0.0004931548442398065, + "loss": 3.155787706375122, + "step": 4126, + "token_acc": 0.2828748655146307 + }, + { + "epoch": 2.4192318968044564, + "grad_norm": 0.3002516736234532, + "learning_rate": 0.0004931492119027912, + "loss": 3.147311210632324, + "step": 4127, + "token_acc": 0.281658062556939 + }, + { + "epoch": 2.419818235121665, + "grad_norm": 0.2616922706603251, + "learning_rate": 0.0004931435772817168, + "loss": 3.1236133575439453, + "step": 4128, + "token_acc": 0.2862291507347622 + }, + { + "epoch": 2.420404573438874, + "grad_norm": 0.24445385964476138, + "learning_rate": 0.0004931379403766361, + "loss": 3.142514228820801, + "step": 4129, + "token_acc": 0.2830488528844689 + }, + { + "epoch": 2.4209909117560833, + "grad_norm": 0.265518764054282, + "learning_rate": 0.0004931323011876021, + "loss": 3.135782480239868, + "step": 4130, + "token_acc": 0.28349745898105017 + }, + { + "epoch": 2.4215772500732924, + "grad_norm": 0.24828711718407814, + "learning_rate": 0.0004931266597146676, + "loss": 3.1276228427886963, + "step": 4131, + "token_acc": 0.2852538544179976 + }, + { + "epoch": 2.4221635883905015, + "grad_norm": 0.3161760645363754, + "learning_rate": 0.000493121015957886, + "loss": 3.1190271377563477, + "step": 4132, + "token_acc": 0.28724781744087824 + }, + { + "epoch": 2.42274992670771, + "grad_norm": 0.3448451434715971, + "learning_rate": 0.0004931153699173099, + "loss": 3.1582865715026855, + "step": 4133, + "token_acc": 0.2822012403224307 + }, + { + "epoch": 2.4233362650249193, + "grad_norm": 0.2655848173631843, + "learning_rate": 0.0004931097215929925, + "loss": 3.100529193878174, + "step": 4134, + "token_acc": 0.28825113553630316 + }, + { + "epoch": 2.4239226033421284, + "grad_norm": 0.28147055840140633, + "learning_rate": 0.0004931040709849869, + "loss": 3.1324245929718018, + "step": 4135, + "token_acc": 0.2849888342433748 + }, + { + "epoch": 2.4245089416593375, + "grad_norm": 0.3863513852128823, + "learning_rate": 0.0004930984180933462, + "loss": 3.087904453277588, + "step": 4136, + "token_acc": 0.29149913475572503 + }, + { + "epoch": 2.4250952799765466, + "grad_norm": 0.2630892275375379, + "learning_rate": 0.0004930927629181234, + "loss": 3.1913859844207764, + "step": 4137, + "token_acc": 0.2769523282204503 + }, + { + "epoch": 2.4256816182937557, + "grad_norm": 0.3311874495556526, + "learning_rate": 0.0004930871054593716, + "loss": 3.158717155456543, + "step": 4138, + "token_acc": 0.28216183938715783 + }, + { + "epoch": 2.4262679566109644, + "grad_norm": 0.26871327413622054, + "learning_rate": 0.0004930814457171441, + "loss": 3.137910842895508, + "step": 4139, + "token_acc": 0.285189778824897 + }, + { + "epoch": 2.4268542949281735, + "grad_norm": 0.33739768520535274, + "learning_rate": 0.0004930757836914939, + "loss": 3.1281514167785645, + "step": 4140, + "token_acc": 0.28614501940356274 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.26375577887458684, + "learning_rate": 0.0004930701193824744, + "loss": 3.1643097400665283, + "step": 4141, + "token_acc": 0.27949293508582995 + }, + { + "epoch": 2.4280269715625917, + "grad_norm": 0.2865114725356215, + "learning_rate": 0.0004930644527901385, + "loss": 3.124934196472168, + "step": 4142, + "token_acc": 0.2844657683056152 + }, + { + "epoch": 2.4286133098798004, + "grad_norm": 0.2787478182409089, + "learning_rate": 0.0004930587839145396, + "loss": 3.1285805702209473, + "step": 4143, + "token_acc": 0.28579032965622025 + }, + { + "epoch": 2.4291996481970095, + "grad_norm": 0.30236186931630615, + "learning_rate": 0.0004930531127557311, + "loss": 3.1423957347869873, + "step": 4144, + "token_acc": 0.28300767877913263 + }, + { + "epoch": 2.4297859865142186, + "grad_norm": 0.24351281275965278, + "learning_rate": 0.000493047439313766, + "loss": 3.160454034805298, + "step": 4145, + "token_acc": 0.2823586333037814 + }, + { + "epoch": 2.4303723248314277, + "grad_norm": 0.3137301832262234, + "learning_rate": 0.0004930417635886976, + "loss": 3.1892623901367188, + "step": 4146, + "token_acc": 0.2764546567817926 + }, + { + "epoch": 2.430958663148637, + "grad_norm": 0.33155461279884, + "learning_rate": 0.0004930360855805796, + "loss": 3.151230812072754, + "step": 4147, + "token_acc": 0.28088501902305746 + }, + { + "epoch": 2.431545001465846, + "grad_norm": 0.3460235188602479, + "learning_rate": 0.000493030405289465, + "loss": 3.132094383239746, + "step": 4148, + "token_acc": 0.28577278878724677 + }, + { + "epoch": 2.432131339783055, + "grad_norm": 0.28853688060948096, + "learning_rate": 0.0004930247227154072, + "loss": 3.203066349029541, + "step": 4149, + "token_acc": 0.27543740967374924 + }, + { + "epoch": 2.4327176781002637, + "grad_norm": 0.3128670072739596, + "learning_rate": 0.0004930190378584596, + "loss": 3.158848285675049, + "step": 4150, + "token_acc": 0.28278090583011606 + }, + { + "epoch": 2.433304016417473, + "grad_norm": 0.2659541699608326, + "learning_rate": 0.0004930133507186756, + "loss": 3.1860828399658203, + "step": 4151, + "token_acc": 0.2771219844398126 + }, + { + "epoch": 2.433890354734682, + "grad_norm": 0.3610622440174549, + "learning_rate": 0.0004930076612961086, + "loss": 3.1482510566711426, + "step": 4152, + "token_acc": 0.28393887748893004 + }, + { + "epoch": 2.434476693051891, + "grad_norm": 0.25400727208734536, + "learning_rate": 0.0004930019695908121, + "loss": 3.0948405265808105, + "step": 4153, + "token_acc": 0.290667462565655 + }, + { + "epoch": 2.4350630313690997, + "grad_norm": 0.3106252673093002, + "learning_rate": 0.0004929962756028396, + "loss": 3.121525287628174, + "step": 4154, + "token_acc": 0.2853909780095146 + }, + { + "epoch": 2.435649369686309, + "grad_norm": 0.26493612703858604, + "learning_rate": 0.0004929905793322445, + "loss": 3.1614787578582764, + "step": 4155, + "token_acc": 0.28172556775887414 + }, + { + "epoch": 2.436235708003518, + "grad_norm": 0.29910276962565174, + "learning_rate": 0.0004929848807790803, + "loss": 3.1297683715820312, + "step": 4156, + "token_acc": 0.2847041300420812 + }, + { + "epoch": 2.436822046320727, + "grad_norm": 0.2485899773683061, + "learning_rate": 0.0004929791799434006, + "loss": 3.1873116493225098, + "step": 4157, + "token_acc": 0.2757823804534222 + }, + { + "epoch": 2.437408384637936, + "grad_norm": 0.34689503204076666, + "learning_rate": 0.0004929734768252589, + "loss": 3.1448440551757812, + "step": 4158, + "token_acc": 0.284070946303469 + }, + { + "epoch": 2.4379947229551453, + "grad_norm": 0.2647971573709019, + "learning_rate": 0.0004929677714247089, + "loss": 3.1847267150878906, + "step": 4159, + "token_acc": 0.2783791875453027 + }, + { + "epoch": 2.438581061272354, + "grad_norm": 0.275487446603698, + "learning_rate": 0.000492962063741804, + "loss": 3.149015426635742, + "step": 4160, + "token_acc": 0.2815154490511722 + }, + { + "epoch": 2.439167399589563, + "grad_norm": 0.25270398957826845, + "learning_rate": 0.000492956353776598, + "loss": 3.1334729194641113, + "step": 4161, + "token_acc": 0.2856022204558189 + }, + { + "epoch": 2.439753737906772, + "grad_norm": 0.2986367662498128, + "learning_rate": 0.0004929506415291444, + "loss": 3.150852680206299, + "step": 4162, + "token_acc": 0.2820326165792056 + }, + { + "epoch": 2.4403400762239813, + "grad_norm": 0.28561022837885724, + "learning_rate": 0.0004929449269994967, + "loss": 3.1485679149627686, + "step": 4163, + "token_acc": 0.28192553661289177 + }, + { + "epoch": 2.4409264145411904, + "grad_norm": 0.2579067773451289, + "learning_rate": 0.000492939210187709, + "loss": 3.1455910205841064, + "step": 4164, + "token_acc": 0.2817013712544439 + }, + { + "epoch": 2.441512752858399, + "grad_norm": 0.2649746034316926, + "learning_rate": 0.0004929334910938347, + "loss": 3.1592254638671875, + "step": 4165, + "token_acc": 0.28298098277237543 + }, + { + "epoch": 2.442099091175608, + "grad_norm": 0.2969163676399799, + "learning_rate": 0.0004929277697179277, + "loss": 3.135164976119995, + "step": 4166, + "token_acc": 0.2843562947070902 + }, + { + "epoch": 2.4426854294928173, + "grad_norm": 0.3207941388496579, + "learning_rate": 0.0004929220460600417, + "loss": 3.202554225921631, + "step": 4167, + "token_acc": 0.27601026665541656 + }, + { + "epoch": 2.4432717678100264, + "grad_norm": 0.2677869631433697, + "learning_rate": 0.0004929163201202303, + "loss": 3.1436314582824707, + "step": 4168, + "token_acc": 0.2838059332344015 + }, + { + "epoch": 2.4438581061272355, + "grad_norm": 0.242170689723494, + "learning_rate": 0.0004929105918985474, + "loss": 3.1898326873779297, + "step": 4169, + "token_acc": 0.2769615970048327 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.2917618502161134, + "learning_rate": 0.0004929048613950468, + "loss": 3.1665778160095215, + "step": 4170, + "token_acc": 0.2817072114968401 + }, + { + "epoch": 2.4450307827616533, + "grad_norm": 0.36972532152077836, + "learning_rate": 0.0004928991286097825, + "loss": 3.1536307334899902, + "step": 4171, + "token_acc": 0.28326246857639426 + }, + { + "epoch": 2.4456171210788624, + "grad_norm": 0.32769026456770717, + "learning_rate": 0.000492893393542808, + "loss": 3.1668663024902344, + "step": 4172, + "token_acc": 0.2794675738344151 + }, + { + "epoch": 2.4462034593960715, + "grad_norm": 0.21826115470366664, + "learning_rate": 0.0004928876561941776, + "loss": 3.165198564529419, + "step": 4173, + "token_acc": 0.2805083001336672 + }, + { + "epoch": 2.4467897977132806, + "grad_norm": 0.2525647905124037, + "learning_rate": 0.0004928819165639448, + "loss": 3.1652674674987793, + "step": 4174, + "token_acc": 0.2804764754638174 + }, + { + "epoch": 2.4473761360304898, + "grad_norm": 0.24973832293172119, + "learning_rate": 0.0004928761746521637, + "loss": 3.1039247512817383, + "step": 4175, + "token_acc": 0.29084482354150576 + }, + { + "epoch": 2.4479624743476984, + "grad_norm": 0.27811566943399496, + "learning_rate": 0.0004928704304588881, + "loss": 3.1877975463867188, + "step": 4176, + "token_acc": 0.2753458884952452 + }, + { + "epoch": 2.4485488126649075, + "grad_norm": 0.3572517789804198, + "learning_rate": 0.0004928646839841722, + "loss": 3.143075466156006, + "step": 4177, + "token_acc": 0.28347821056893896 + }, + { + "epoch": 2.4491351509821166, + "grad_norm": 0.34927544312848724, + "learning_rate": 0.0004928589352280699, + "loss": 3.150834798812866, + "step": 4178, + "token_acc": 0.2818185731890414 + }, + { + "epoch": 2.4497214892993258, + "grad_norm": 0.26749069012328264, + "learning_rate": 0.0004928531841906352, + "loss": 3.1356890201568604, + "step": 4179, + "token_acc": 0.28432401086011017 + }, + { + "epoch": 2.450307827616535, + "grad_norm": 0.3163747417639619, + "learning_rate": 0.0004928474308719219, + "loss": 3.1242334842681885, + "step": 4180, + "token_acc": 0.2864676402465141 + }, + { + "epoch": 2.450894165933744, + "grad_norm": 0.2295863988942232, + "learning_rate": 0.0004928416752719843, + "loss": 3.1421024799346924, + "step": 4181, + "token_acc": 0.28301806010633074 + }, + { + "epoch": 2.4514805042509527, + "grad_norm": 0.31023578703748433, + "learning_rate": 0.0004928359173908765, + "loss": 3.1689114570617676, + "step": 4182, + "token_acc": 0.2804534725362624 + }, + { + "epoch": 2.4520668425681618, + "grad_norm": 0.32132829357731, + "learning_rate": 0.0004928301572286524, + "loss": 3.1850528717041016, + "step": 4183, + "token_acc": 0.2764554305004034 + }, + { + "epoch": 2.452653180885371, + "grad_norm": 0.24807097322166693, + "learning_rate": 0.0004928243947853662, + "loss": 3.1628308296203613, + "step": 4184, + "token_acc": 0.2826047806815952 + }, + { + "epoch": 2.45323951920258, + "grad_norm": 0.29537704560204525, + "learning_rate": 0.000492818630061072, + "loss": 3.127662181854248, + "step": 4185, + "token_acc": 0.28291572093763284 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.29488447445114074, + "learning_rate": 0.000492812863055824, + "loss": 3.083648681640625, + "step": 4186, + "token_acc": 0.2926736682272555 + }, + { + "epoch": 2.4544121958369978, + "grad_norm": 0.34296162287324217, + "learning_rate": 0.0004928070937696763, + "loss": 3.131272315979004, + "step": 4187, + "token_acc": 0.2848218012067127 + }, + { + "epoch": 2.454998534154207, + "grad_norm": 0.3405302602748392, + "learning_rate": 0.0004928013222026832, + "loss": 3.198709011077881, + "step": 4188, + "token_acc": 0.27616023002025747 + }, + { + "epoch": 2.455584872471416, + "grad_norm": 0.2578411532327479, + "learning_rate": 0.0004927955483548989, + "loss": 3.1064698696136475, + "step": 4189, + "token_acc": 0.2882348467564073 + }, + { + "epoch": 2.456171210788625, + "grad_norm": 0.32971478825560063, + "learning_rate": 0.0004927897722263774, + "loss": 3.1714906692504883, + "step": 4190, + "token_acc": 0.2794472749013635 + }, + { + "epoch": 2.456757549105834, + "grad_norm": 0.26703301123693507, + "learning_rate": 0.0004927839938171734, + "loss": 3.072561740875244, + "step": 4191, + "token_acc": 0.2914858358163838 + }, + { + "epoch": 2.4573438874230433, + "grad_norm": 0.3187975031701169, + "learning_rate": 0.0004927782131273408, + "loss": 3.099703550338745, + "step": 4192, + "token_acc": 0.29018246373310086 + }, + { + "epoch": 2.457930225740252, + "grad_norm": 0.27841817030302446, + "learning_rate": 0.0004927724301569341, + "loss": 3.1477863788604736, + "step": 4193, + "token_acc": 0.28245284791723274 + }, + { + "epoch": 2.458516564057461, + "grad_norm": 0.3107033461527404, + "learning_rate": 0.0004927666449060075, + "loss": 3.137528419494629, + "step": 4194, + "token_acc": 0.2839246382982208 + }, + { + "epoch": 2.45910290237467, + "grad_norm": 0.2937806579561194, + "learning_rate": 0.0004927608573746154, + "loss": 3.1533889770507812, + "step": 4195, + "token_acc": 0.2844599211563732 + }, + { + "epoch": 2.4596892406918793, + "grad_norm": 0.31881748413630634, + "learning_rate": 0.0004927550675628122, + "loss": 3.1699368953704834, + "step": 4196, + "token_acc": 0.279223266715391 + }, + { + "epoch": 2.460275579009088, + "grad_norm": 0.2977123082936681, + "learning_rate": 0.0004927492754706522, + "loss": 3.148170232772827, + "step": 4197, + "token_acc": 0.2826829798599349 + }, + { + "epoch": 2.460861917326297, + "grad_norm": 0.3130183165660392, + "learning_rate": 0.0004927434810981898, + "loss": 3.1294474601745605, + "step": 4198, + "token_acc": 0.28655273925681807 + }, + { + "epoch": 2.4614482556435062, + "grad_norm": 0.3155202396776548, + "learning_rate": 0.0004927376844454797, + "loss": 3.1539344787597656, + "step": 4199, + "token_acc": 0.28121282792763014 + }, + { + "epoch": 2.4620345939607153, + "grad_norm": 0.2801842965345091, + "learning_rate": 0.0004927318855125761, + "loss": 3.0997154712677, + "step": 4200, + "token_acc": 0.28903814432176034 + }, + { + "epoch": 2.4626209322779244, + "grad_norm": 0.30852818632655105, + "learning_rate": 0.0004927260842995335, + "loss": 3.1385796070098877, + "step": 4201, + "token_acc": 0.2850856884382837 + }, + { + "epoch": 2.4632072705951336, + "grad_norm": 0.3174216567748999, + "learning_rate": 0.0004927202808064063, + "loss": 3.1967341899871826, + "step": 4202, + "token_acc": 0.2761190912556066 + }, + { + "epoch": 2.4637936089123427, + "grad_norm": 0.3127201734274958, + "learning_rate": 0.0004927144750332493, + "loss": 3.210827112197876, + "step": 4203, + "token_acc": 0.27585854750651356 + }, + { + "epoch": 2.4643799472295513, + "grad_norm": 0.3100123033233525, + "learning_rate": 0.0004927086669801168, + "loss": 3.135878086090088, + "step": 4204, + "token_acc": 0.28531658371703084 + }, + { + "epoch": 2.4649662855467604, + "grad_norm": 0.29123300944516933, + "learning_rate": 0.0004927028566470633, + "loss": 3.159703254699707, + "step": 4205, + "token_acc": 0.2810755242958033 + }, + { + "epoch": 2.4655526238639696, + "grad_norm": 0.3169398102046388, + "learning_rate": 0.0004926970440341436, + "loss": 3.1234395503997803, + "step": 4206, + "token_acc": 0.2856850238539722 + }, + { + "epoch": 2.4661389621811787, + "grad_norm": 0.32786407189713923, + "learning_rate": 0.0004926912291414123, + "loss": 3.140085220336914, + "step": 4207, + "token_acc": 0.28308418852000505 + }, + { + "epoch": 2.4667253004983873, + "grad_norm": 0.31909777654113397, + "learning_rate": 0.0004926854119689239, + "loss": 3.1785402297973633, + "step": 4208, + "token_acc": 0.2798355511605634 + }, + { + "epoch": 2.4673116388155965, + "grad_norm": 0.2603921936728504, + "learning_rate": 0.000492679592516733, + "loss": 3.1161351203918457, + "step": 4209, + "token_acc": 0.2866432697570301 + }, + { + "epoch": 2.4678979771328056, + "grad_norm": 0.28566648902168706, + "learning_rate": 0.0004926737707848944, + "loss": 3.146257162094116, + "step": 4210, + "token_acc": 0.2831503019631466 + }, + { + "epoch": 2.4684843154500147, + "grad_norm": 0.3311792571287571, + "learning_rate": 0.0004926679467734627, + "loss": 3.1663293838500977, + "step": 4211, + "token_acc": 0.27972298737071877 + }, + { + "epoch": 2.469070653767224, + "grad_norm": 0.2920541350305395, + "learning_rate": 0.0004926621204824926, + "loss": 3.2038497924804688, + "step": 4212, + "token_acc": 0.2743212331914827 + }, + { + "epoch": 2.469656992084433, + "grad_norm": 0.3169651690225659, + "learning_rate": 0.000492656291912039, + "loss": 3.1694750785827637, + "step": 4213, + "token_acc": 0.278409294103823 + }, + { + "epoch": 2.4702433304016416, + "grad_norm": 0.367698108143396, + "learning_rate": 0.0004926504610621564, + "loss": 3.1810524463653564, + "step": 4214, + "token_acc": 0.2770920333765093 + }, + { + "epoch": 2.4708296687188507, + "grad_norm": 0.3222032054448611, + "learning_rate": 0.0004926446279328998, + "loss": 3.1648154258728027, + "step": 4215, + "token_acc": 0.28176942680004563 + }, + { + "epoch": 2.47141600703606, + "grad_norm": 0.2557004144868029, + "learning_rate": 0.0004926387925243237, + "loss": 3.114588499069214, + "step": 4216, + "token_acc": 0.28587690098557533 + }, + { + "epoch": 2.472002345353269, + "grad_norm": 0.2637862781168629, + "learning_rate": 0.0004926329548364833, + "loss": 3.1347289085388184, + "step": 4217, + "token_acc": 0.2854036415979182 + }, + { + "epoch": 2.472588683670478, + "grad_norm": 0.24874164791712872, + "learning_rate": 0.0004926271148694332, + "loss": 3.107349395751953, + "step": 4218, + "token_acc": 0.2886656426120885 + }, + { + "epoch": 2.4731750219876867, + "grad_norm": 0.2732698878771074, + "learning_rate": 0.0004926212726232283, + "loss": 3.163975715637207, + "step": 4219, + "token_acc": 0.27964605978619855 + }, + { + "epoch": 2.473761360304896, + "grad_norm": 0.42054354776419856, + "learning_rate": 0.0004926154280979234, + "loss": 3.1351051330566406, + "step": 4220, + "token_acc": 0.2862769507866716 + }, + { + "epoch": 2.474347698622105, + "grad_norm": 0.45850723675087535, + "learning_rate": 0.0004926095812935736, + "loss": 3.1475491523742676, + "step": 4221, + "token_acc": 0.2842240729890349 + }, + { + "epoch": 2.474934036939314, + "grad_norm": 0.264343481547017, + "learning_rate": 0.0004926037322102336, + "loss": 3.129584312438965, + "step": 4222, + "token_acc": 0.28305626802609596 + }, + { + "epoch": 2.475520375256523, + "grad_norm": 0.3217916991597029, + "learning_rate": 0.0004925978808479585, + "loss": 3.157909631729126, + "step": 4223, + "token_acc": 0.28130903851287087 + }, + { + "epoch": 2.4761067135737322, + "grad_norm": 0.2852363283332628, + "learning_rate": 0.0004925920272068031, + "loss": 3.1636481285095215, + "step": 4224, + "token_acc": 0.28172602343851005 + }, + { + "epoch": 2.476693051890941, + "grad_norm": 0.30036414136795775, + "learning_rate": 0.0004925861712868227, + "loss": 3.1593754291534424, + "step": 4225, + "token_acc": 0.28274572690955946 + }, + { + "epoch": 2.47727939020815, + "grad_norm": 0.31368061905501227, + "learning_rate": 0.0004925803130880719, + "loss": 3.1193454265594482, + "step": 4226, + "token_acc": 0.2875602100261618 + }, + { + "epoch": 2.477865728525359, + "grad_norm": 0.2538877932025326, + "learning_rate": 0.0004925744526106061, + "loss": 3.14658784866333, + "step": 4227, + "token_acc": 0.2823971455541269 + }, + { + "epoch": 2.4784520668425682, + "grad_norm": 0.2971608514508549, + "learning_rate": 0.0004925685898544801, + "loss": 3.143190383911133, + "step": 4228, + "token_acc": 0.28430127837903396 + }, + { + "epoch": 2.4790384051597774, + "grad_norm": 0.3101850656008902, + "learning_rate": 0.0004925627248197491, + "loss": 3.1665735244750977, + "step": 4229, + "token_acc": 0.2808483898786362 + }, + { + "epoch": 2.479624743476986, + "grad_norm": 0.2853338373635848, + "learning_rate": 0.0004925568575064683, + "loss": 3.0559873580932617, + "step": 4230, + "token_acc": 0.29455190216415117 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.3079789439127816, + "learning_rate": 0.0004925509879146925, + "loss": 3.1213254928588867, + "step": 4231, + "token_acc": 0.2853008330442616 + }, + { + "epoch": 2.4807974201114043, + "grad_norm": 0.23365005078398637, + "learning_rate": 0.000492545116044477, + "loss": 3.1593503952026367, + "step": 4232, + "token_acc": 0.2822676743336211 + }, + { + "epoch": 2.4813837584286134, + "grad_norm": 0.3401747512749149, + "learning_rate": 0.0004925392418958771, + "loss": 3.1303277015686035, + "step": 4233, + "token_acc": 0.2849772310789873 + }, + { + "epoch": 2.4819700967458225, + "grad_norm": 0.2976141430883711, + "learning_rate": 0.0004925333654689477, + "loss": 3.1732425689697266, + "step": 4234, + "token_acc": 0.27814858144897914 + }, + { + "epoch": 2.4825564350630316, + "grad_norm": 0.31204318663521496, + "learning_rate": 0.0004925274867637442, + "loss": 3.134174346923828, + "step": 4235, + "token_acc": 0.28627241257580627 + }, + { + "epoch": 2.4831427733802403, + "grad_norm": 0.27507313707966113, + "learning_rate": 0.0004925216057803218, + "loss": 3.1691324710845947, + "step": 4236, + "token_acc": 0.2797478129866184 + }, + { + "epoch": 2.4837291116974494, + "grad_norm": 0.2798317718680493, + "learning_rate": 0.0004925157225187357, + "loss": 3.1303861141204834, + "step": 4237, + "token_acc": 0.2858177146272178 + }, + { + "epoch": 2.4843154500146585, + "grad_norm": 0.32877360599087635, + "learning_rate": 0.0004925098369790412, + "loss": 3.1254799365997314, + "step": 4238, + "token_acc": 0.28797540547293404 + }, + { + "epoch": 2.4849017883318676, + "grad_norm": 0.2515607816995969, + "learning_rate": 0.0004925039491612935, + "loss": 3.148189067840576, + "step": 4239, + "token_acc": 0.28161157755862326 + }, + { + "epoch": 2.4854881266490767, + "grad_norm": 0.30336740299206366, + "learning_rate": 0.0004924980590655481, + "loss": 3.1954843997955322, + "step": 4240, + "token_acc": 0.27653655229382107 + }, + { + "epoch": 2.4860744649662854, + "grad_norm": 0.2463990404723353, + "learning_rate": 0.00049249216669186, + "loss": 3.1458182334899902, + "step": 4241, + "token_acc": 0.2816627113502711 + }, + { + "epoch": 2.4866608032834945, + "grad_norm": 0.29452863088120024, + "learning_rate": 0.0004924862720402849, + "loss": 3.1190481185913086, + "step": 4242, + "token_acc": 0.28853939121323696 + }, + { + "epoch": 2.4872471416007036, + "grad_norm": 0.25048427701672565, + "learning_rate": 0.000492480375110878, + "loss": 3.1333179473876953, + "step": 4243, + "token_acc": 0.28418590313163 + }, + { + "epoch": 2.4878334799179127, + "grad_norm": 0.273340430114594, + "learning_rate": 0.0004924744759036948, + "loss": 3.1438660621643066, + "step": 4244, + "token_acc": 0.2840406654443052 + }, + { + "epoch": 2.488419818235122, + "grad_norm": 0.3049434276679553, + "learning_rate": 0.0004924685744187906, + "loss": 3.152846097946167, + "step": 4245, + "token_acc": 0.2819012336412522 + }, + { + "epoch": 2.489006156552331, + "grad_norm": 0.27403403011737354, + "learning_rate": 0.0004924626706562208, + "loss": 3.125669240951538, + "step": 4246, + "token_acc": 0.2855597251072077 + }, + { + "epoch": 2.4895924948695396, + "grad_norm": 0.28463700493746086, + "learning_rate": 0.000492456764616041, + "loss": 3.159043312072754, + "step": 4247, + "token_acc": 0.28121481304267554 + }, + { + "epoch": 2.4901788331867487, + "grad_norm": 0.2521780456704807, + "learning_rate": 0.0004924508562983066, + "loss": 3.139467239379883, + "step": 4248, + "token_acc": 0.28392874822392344 + }, + { + "epoch": 2.490765171503958, + "grad_norm": 0.3112607097123613, + "learning_rate": 0.0004924449457030731, + "loss": 3.1357169151306152, + "step": 4249, + "token_acc": 0.282311327050909 + }, + { + "epoch": 2.491351509821167, + "grad_norm": 0.2564671993879261, + "learning_rate": 0.0004924390328303961, + "loss": 3.1473641395568848, + "step": 4250, + "token_acc": 0.2823216827780433 + }, + { + "epoch": 2.4919378481383756, + "grad_norm": 0.3009530481884104, + "learning_rate": 0.000492433117680331, + "loss": 3.137119770050049, + "step": 4251, + "token_acc": 0.2849263957323777 + }, + { + "epoch": 2.4925241864555847, + "grad_norm": 0.28251510034457167, + "learning_rate": 0.0004924272002529334, + "loss": 3.1396188735961914, + "step": 4252, + "token_acc": 0.2830706051058537 + }, + { + "epoch": 2.493110524772794, + "grad_norm": 0.3285647140206444, + "learning_rate": 0.0004924212805482589, + "loss": 3.1661481857299805, + "step": 4253, + "token_acc": 0.2789961650760448 + }, + { + "epoch": 2.493696863090003, + "grad_norm": 0.25808545466235516, + "learning_rate": 0.0004924153585663633, + "loss": 3.133260726928711, + "step": 4254, + "token_acc": 0.2866272430450191 + }, + { + "epoch": 2.494283201407212, + "grad_norm": 0.3086101211645659, + "learning_rate": 0.000492409434307302, + "loss": 3.152337074279785, + "step": 4255, + "token_acc": 0.2832935928213243 + }, + { + "epoch": 2.494869539724421, + "grad_norm": 0.26678250462601166, + "learning_rate": 0.0004924035077711308, + "loss": 3.131521701812744, + "step": 4256, + "token_acc": 0.2838843577046315 + }, + { + "epoch": 2.49545587804163, + "grad_norm": 0.30354118955796067, + "learning_rate": 0.000492397578957905, + "loss": 3.139087677001953, + "step": 4257, + "token_acc": 0.2840618438633327 + }, + { + "epoch": 2.496042216358839, + "grad_norm": 0.2420116542544201, + "learning_rate": 0.0004923916478676808, + "loss": 3.1340725421905518, + "step": 4258, + "token_acc": 0.28425254221180346 + }, + { + "epoch": 2.496628554676048, + "grad_norm": 0.2584900077782168, + "learning_rate": 0.0004923857145005137, + "loss": 3.087541341781616, + "step": 4259, + "token_acc": 0.28960529272511587 + }, + { + "epoch": 2.497214892993257, + "grad_norm": 0.2436011326871065, + "learning_rate": 0.0004923797788564595, + "loss": 3.101266384124756, + "step": 4260, + "token_acc": 0.2907542134317991 + }, + { + "epoch": 2.4978012313104663, + "grad_norm": 0.26503644145185634, + "learning_rate": 0.0004923738409355737, + "loss": 3.1852493286132812, + "step": 4261, + "token_acc": 0.27827252134337566 + }, + { + "epoch": 2.498387569627675, + "grad_norm": 0.25518759515027695, + "learning_rate": 0.0004923679007379124, + "loss": 3.112393617630005, + "step": 4262, + "token_acc": 0.28702717523120563 + }, + { + "epoch": 2.498973907944884, + "grad_norm": 0.24577736625728883, + "learning_rate": 0.0004923619582635311, + "loss": 3.143197536468506, + "step": 4263, + "token_acc": 0.28351113967982156 + }, + { + "epoch": 2.499560246262093, + "grad_norm": 0.23761577363047687, + "learning_rate": 0.0004923560135124859, + "loss": 3.1861720085144043, + "step": 4264, + "token_acc": 0.27916490043663805 + }, + { + "epoch": 2.5001465845793023, + "grad_norm": 0.30169261774841344, + "learning_rate": 0.0004923500664848326, + "loss": 3.134916067123413, + "step": 4265, + "token_acc": 0.2849277961528534 + }, + { + "epoch": 2.5007329228965114, + "grad_norm": 0.261200152779632, + "learning_rate": 0.000492344117180627, + "loss": 3.130162239074707, + "step": 4266, + "token_acc": 0.284659194135722 + }, + { + "epoch": 2.5013192612137205, + "grad_norm": 0.2868925459183805, + "learning_rate": 0.0004923381655999249, + "loss": 3.1220574378967285, + "step": 4267, + "token_acc": 0.2874931987264784 + }, + { + "epoch": 2.5019055995309296, + "grad_norm": 0.2297280022925021, + "learning_rate": 0.0004923322117427823, + "loss": 3.1427063941955566, + "step": 4268, + "token_acc": 0.2828639955920246 + }, + { + "epoch": 2.5024919378481383, + "grad_norm": 0.269069665462166, + "learning_rate": 0.0004923262556092551, + "loss": 3.1416192054748535, + "step": 4269, + "token_acc": 0.28258880946287346 + }, + { + "epoch": 2.5030782761653474, + "grad_norm": 0.26701035680832236, + "learning_rate": 0.0004923202971993993, + "loss": 3.179413318634033, + "step": 4270, + "token_acc": 0.2786554023368152 + }, + { + "epoch": 2.5036646144825565, + "grad_norm": 0.2803470675948156, + "learning_rate": 0.0004923143365132708, + "loss": 3.161494731903076, + "step": 4271, + "token_acc": 0.2817550447032257 + }, + { + "epoch": 2.5042509527997656, + "grad_norm": 0.3277185157340391, + "learning_rate": 0.0004923083735509257, + "loss": 3.1134235858917236, + "step": 4272, + "token_acc": 0.2863776215155609 + }, + { + "epoch": 2.5048372911169743, + "grad_norm": 0.375287007803871, + "learning_rate": 0.0004923024083124199, + "loss": 3.1536192893981934, + "step": 4273, + "token_acc": 0.27926387922533624 + }, + { + "epoch": 2.5054236294341834, + "grad_norm": 0.2811102389120282, + "learning_rate": 0.0004922964407978094, + "loss": 3.1751699447631836, + "step": 4274, + "token_acc": 0.28195738660539527 + }, + { + "epoch": 2.5060099677513925, + "grad_norm": 0.30788889426938676, + "learning_rate": 0.0004922904710071505, + "loss": 3.1252379417419434, + "step": 4275, + "token_acc": 0.2863395134054725 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.29832754044621906, + "learning_rate": 0.000492284498940499, + "loss": 3.1817221641540527, + "step": 4276, + "token_acc": 0.2792992159825311 + }, + { + "epoch": 2.5071826443858107, + "grad_norm": 0.29164510630326, + "learning_rate": 0.0004922785245979112, + "loss": 3.1445181369781494, + "step": 4277, + "token_acc": 0.2827581154517364 + }, + { + "epoch": 2.50776898270302, + "grad_norm": 0.31886656251645734, + "learning_rate": 0.000492272547979443, + "loss": 3.1670188903808594, + "step": 4278, + "token_acc": 0.2802357396225322 + }, + { + "epoch": 2.5083553210202285, + "grad_norm": 0.2598135189672792, + "learning_rate": 0.0004922665690851508, + "loss": 3.1491293907165527, + "step": 4279, + "token_acc": 0.2836048409853921 + }, + { + "epoch": 2.5089416593374376, + "grad_norm": 0.2674432332351087, + "learning_rate": 0.0004922605879150906, + "loss": 3.148040771484375, + "step": 4280, + "token_acc": 0.28228450886609996 + }, + { + "epoch": 2.5095279976546467, + "grad_norm": 0.26942678389811003, + "learning_rate": 0.0004922546044693187, + "loss": 3.153386354446411, + "step": 4281, + "token_acc": 0.28247137257813865 + }, + { + "epoch": 2.510114335971856, + "grad_norm": 0.261081268127297, + "learning_rate": 0.0004922486187478912, + "loss": 3.1450023651123047, + "step": 4282, + "token_acc": 0.28330050904551274 + }, + { + "epoch": 2.5107006742890645, + "grad_norm": 0.21410881575339583, + "learning_rate": 0.0004922426307508642, + "loss": 3.139354944229126, + "step": 4283, + "token_acc": 0.2834795960802295 + }, + { + "epoch": 2.5112870126062736, + "grad_norm": 0.29260667410661473, + "learning_rate": 0.0004922366404782943, + "loss": 3.158104658126831, + "step": 4284, + "token_acc": 0.28026898881279244 + }, + { + "epoch": 2.5118733509234827, + "grad_norm": 0.29965306649839285, + "learning_rate": 0.0004922306479302375, + "loss": 3.1704249382019043, + "step": 4285, + "token_acc": 0.28118961272732085 + }, + { + "epoch": 2.512459689240692, + "grad_norm": 0.2567423758958786, + "learning_rate": 0.0004922246531067502, + "loss": 3.154134511947632, + "step": 4286, + "token_acc": 0.2834724052922717 + }, + { + "epoch": 2.513046027557901, + "grad_norm": 0.26475541496539967, + "learning_rate": 0.0004922186560078887, + "loss": 3.1556499004364014, + "step": 4287, + "token_acc": 0.2832360749509433 + }, + { + "epoch": 2.51363236587511, + "grad_norm": 0.27687315686815755, + "learning_rate": 0.0004922126566337093, + "loss": 3.1589155197143555, + "step": 4288, + "token_acc": 0.2811336162465361 + }, + { + "epoch": 2.514218704192319, + "grad_norm": 0.27852264206280625, + "learning_rate": 0.0004922066549842683, + "loss": 3.1473474502563477, + "step": 4289, + "token_acc": 0.2828106137775904 + }, + { + "epoch": 2.514805042509528, + "grad_norm": 0.2671077484038438, + "learning_rate": 0.0004922006510596223, + "loss": 3.1173739433288574, + "step": 4290, + "token_acc": 0.28637902734620985 + }, + { + "epoch": 2.515391380826737, + "grad_norm": 0.2326095306515351, + "learning_rate": 0.0004921946448598275, + "loss": 3.132035255432129, + "step": 4291, + "token_acc": 0.28393412248548117 + }, + { + "epoch": 2.515977719143946, + "grad_norm": 0.22992825667665331, + "learning_rate": 0.0004921886363849404, + "loss": 3.1505091190338135, + "step": 4292, + "token_acc": 0.2833363469635523 + }, + { + "epoch": 2.516564057461155, + "grad_norm": 0.3042372486612273, + "learning_rate": 0.0004921826256350173, + "loss": 3.1512975692749023, + "step": 4293, + "token_acc": 0.2823877258299201 + }, + { + "epoch": 2.517150395778364, + "grad_norm": 0.35882623625851784, + "learning_rate": 0.0004921766126101149, + "loss": 3.1563854217529297, + "step": 4294, + "token_acc": 0.28189638993980964 + }, + { + "epoch": 2.517736734095573, + "grad_norm": 0.2780683516642048, + "learning_rate": 0.0004921705973102894, + "loss": 3.1225974559783936, + "step": 4295, + "token_acc": 0.28589753889826197 + }, + { + "epoch": 2.518323072412782, + "grad_norm": 0.23816444056334407, + "learning_rate": 0.0004921645797355976, + "loss": 3.1350255012512207, + "step": 4296, + "token_acc": 0.2840119983011255 + }, + { + "epoch": 2.518909410729991, + "grad_norm": 0.27219288665958735, + "learning_rate": 0.000492158559886096, + "loss": 3.1376705169677734, + "step": 4297, + "token_acc": 0.28444319661042905 + }, + { + "epoch": 2.5194957490472003, + "grad_norm": 0.23858972281133586, + "learning_rate": 0.0004921525377618408, + "loss": 3.179051637649536, + "step": 4298, + "token_acc": 0.27872398362079154 + }, + { + "epoch": 2.5200820873644094, + "grad_norm": 0.22971999576466592, + "learning_rate": 0.0004921465133628889, + "loss": 3.157120704650879, + "step": 4299, + "token_acc": 0.28111675153510096 + }, + { + "epoch": 2.5206684256816185, + "grad_norm": 0.2586473207886974, + "learning_rate": 0.0004921404866892969, + "loss": 3.1460561752319336, + "step": 4300, + "token_acc": 0.2825990525027059 + }, + { + "epoch": 2.521254763998827, + "grad_norm": 0.2858821864097338, + "learning_rate": 0.0004921344577411212, + "loss": 3.1245875358581543, + "step": 4301, + "token_acc": 0.28530609828937953 + }, + { + "epoch": 2.5218411023160363, + "grad_norm": 0.30615352040388594, + "learning_rate": 0.0004921284265184186, + "loss": 3.133049964904785, + "step": 4302, + "token_acc": 0.2838908506250773 + }, + { + "epoch": 2.5224274406332454, + "grad_norm": 0.3007266941146918, + "learning_rate": 0.0004921223930212458, + "loss": 3.1369903087615967, + "step": 4303, + "token_acc": 0.2836628372962864 + }, + { + "epoch": 2.5230137789504545, + "grad_norm": 0.29510494899516504, + "learning_rate": 0.0004921163572496592, + "loss": 3.1203293800354004, + "step": 4304, + "token_acc": 0.28575063682852553 + }, + { + "epoch": 2.523600117267663, + "grad_norm": 0.307605869683673, + "learning_rate": 0.0004921103192037158, + "loss": 3.133833885192871, + "step": 4305, + "token_acc": 0.2847668324495579 + }, + { + "epoch": 2.5241864555848723, + "grad_norm": 0.33189622168709265, + "learning_rate": 0.0004921042788834721, + "loss": 3.1483256816864014, + "step": 4306, + "token_acc": 0.28336904862938134 + }, + { + "epoch": 2.5247727939020814, + "grad_norm": 0.3084929990974992, + "learning_rate": 0.000492098236288985, + "loss": 3.1353297233581543, + "step": 4307, + "token_acc": 0.2829878897732945 + }, + { + "epoch": 2.5253591322192905, + "grad_norm": 0.257706446408307, + "learning_rate": 0.0004920921914203112, + "loss": 3.1635162830352783, + "step": 4308, + "token_acc": 0.2801346407588288 + }, + { + "epoch": 2.5259454705364996, + "grad_norm": 0.3116828843149226, + "learning_rate": 0.0004920861442775076, + "loss": 3.160191535949707, + "step": 4309, + "token_acc": 0.28130631334437467 + }, + { + "epoch": 2.5265318088537088, + "grad_norm": 0.27230023589998287, + "learning_rate": 0.0004920800948606306, + "loss": 3.151123523712158, + "step": 4310, + "token_acc": 0.28097180614868167 + }, + { + "epoch": 2.527118147170918, + "grad_norm": 0.3355134609466817, + "learning_rate": 0.0004920740431697375, + "loss": 3.1164069175720215, + "step": 4311, + "token_acc": 0.28757651989384275 + }, + { + "epoch": 2.5277044854881265, + "grad_norm": 0.3041376186267956, + "learning_rate": 0.000492067989204885, + "loss": 3.1578617095947266, + "step": 4312, + "token_acc": 0.28154340251315 + }, + { + "epoch": 2.5282908238053357, + "grad_norm": 0.24019866704515144, + "learning_rate": 0.0004920619329661299, + "loss": 3.129883289337158, + "step": 4313, + "token_acc": 0.28478083310645247 + }, + { + "epoch": 2.5288771621225448, + "grad_norm": 0.3131737085272038, + "learning_rate": 0.0004920558744535291, + "loss": 3.2153759002685547, + "step": 4314, + "token_acc": 0.27452384022823034 + }, + { + "epoch": 2.529463500439754, + "grad_norm": 0.26758665022490724, + "learning_rate": 0.0004920498136671396, + "loss": 3.164632797241211, + "step": 4315, + "token_acc": 0.2815751667521806 + }, + { + "epoch": 2.5300498387569625, + "grad_norm": 0.23056215055359155, + "learning_rate": 0.0004920437506070182, + "loss": 3.144793748855591, + "step": 4316, + "token_acc": 0.2825833181332569 + }, + { + "epoch": 2.5306361770741717, + "grad_norm": 0.30941235034930364, + "learning_rate": 0.0004920376852732219, + "loss": 3.199553966522217, + "step": 4317, + "token_acc": 0.27772342451636467 + }, + { + "epoch": 2.5312225153913808, + "grad_norm": 0.2523939704555795, + "learning_rate": 0.0004920316176658077, + "loss": 3.1641182899475098, + "step": 4318, + "token_acc": 0.28056233904922195 + }, + { + "epoch": 2.53180885370859, + "grad_norm": 0.2728309696824672, + "learning_rate": 0.0004920255477848327, + "loss": 3.166947364807129, + "step": 4319, + "token_acc": 0.28012700098007187 + }, + { + "epoch": 2.532395192025799, + "grad_norm": 0.3066075386046518, + "learning_rate": 0.0004920194756303537, + "loss": 3.1164848804473877, + "step": 4320, + "token_acc": 0.28752298541093324 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.24796660321348632, + "learning_rate": 0.0004920134012024279, + "loss": 3.1212098598480225, + "step": 4321, + "token_acc": 0.2860366749728452 + }, + { + "epoch": 2.533567868660217, + "grad_norm": 0.251387110834701, + "learning_rate": 0.0004920073245011123, + "loss": 3.1223530769348145, + "step": 4322, + "token_acc": 0.28597904590265355 + }, + { + "epoch": 2.534154206977426, + "grad_norm": 0.28108959707435394, + "learning_rate": 0.000492001245526464, + "loss": 3.1054224967956543, + "step": 4323, + "token_acc": 0.28950577004567685 + }, + { + "epoch": 2.534740545294635, + "grad_norm": 0.2644107329424148, + "learning_rate": 0.00049199516427854, + "loss": 3.155257225036621, + "step": 4324, + "token_acc": 0.2809893368336828 + }, + { + "epoch": 2.535326883611844, + "grad_norm": 0.30989145078981273, + "learning_rate": 0.0004919890807573977, + "loss": 3.115139961242676, + "step": 4325, + "token_acc": 0.28814040756762394 + }, + { + "epoch": 2.535913221929053, + "grad_norm": 0.2977263044324904, + "learning_rate": 0.000491982994963094, + "loss": 3.1034939289093018, + "step": 4326, + "token_acc": 0.2877057090613572 + }, + { + "epoch": 2.536499560246262, + "grad_norm": 0.25112086160966424, + "learning_rate": 0.0004919769068956861, + "loss": 3.1442034244537354, + "step": 4327, + "token_acc": 0.2845234549998058 + }, + { + "epoch": 2.537085898563471, + "grad_norm": 0.31922932359624206, + "learning_rate": 0.0004919708165552312, + "loss": 3.122307777404785, + "step": 4328, + "token_acc": 0.28650051247453845 + }, + { + "epoch": 2.53767223688068, + "grad_norm": 0.26706980693055804, + "learning_rate": 0.0004919647239417866, + "loss": 3.1440606117248535, + "step": 4329, + "token_acc": 0.28275191048588166 + }, + { + "epoch": 2.538258575197889, + "grad_norm": 0.2865069420660975, + "learning_rate": 0.0004919586290554095, + "loss": 3.1797022819519043, + "step": 4330, + "token_acc": 0.2780855468029681 + }, + { + "epoch": 2.5388449135150983, + "grad_norm": 0.3025230886770115, + "learning_rate": 0.000491952531896157, + "loss": 3.170433521270752, + "step": 4331, + "token_acc": 0.2793197264837018 + }, + { + "epoch": 2.5394312518323074, + "grad_norm": 0.3101695780173901, + "learning_rate": 0.0004919464324640866, + "loss": 3.1348390579223633, + "step": 4332, + "token_acc": 0.28437338149559765 + }, + { + "epoch": 2.540017590149516, + "grad_norm": 0.24404758732667856, + "learning_rate": 0.0004919403307592554, + "loss": 3.09989857673645, + "step": 4333, + "token_acc": 0.28917855599894676 + }, + { + "epoch": 2.5406039284667252, + "grad_norm": 0.28855930517661665, + "learning_rate": 0.000491934226781721, + "loss": 3.152952194213867, + "step": 4334, + "token_acc": 0.28245550232173255 + }, + { + "epoch": 2.5411902667839343, + "grad_norm": 0.29319788305875377, + "learning_rate": 0.0004919281205315405, + "loss": 3.1324281692504883, + "step": 4335, + "token_acc": 0.2852023473576193 + }, + { + "epoch": 2.5417766051011434, + "grad_norm": 0.2703004688269168, + "learning_rate": 0.0004919220120087711, + "loss": 3.1648545265197754, + "step": 4336, + "token_acc": 0.2789870135906686 + }, + { + "epoch": 2.542362943418352, + "grad_norm": 0.2600531071819018, + "learning_rate": 0.0004919159012134706, + "loss": 3.1146669387817383, + "step": 4337, + "token_acc": 0.2864504712170046 + }, + { + "epoch": 2.5429492817355612, + "grad_norm": 0.23898266094751827, + "learning_rate": 0.0004919097881456962, + "loss": 3.1948297023773193, + "step": 4338, + "token_acc": 0.2776180339327699 + }, + { + "epoch": 2.5435356200527703, + "grad_norm": 0.2822356106364129, + "learning_rate": 0.0004919036728055052, + "loss": 3.1372992992401123, + "step": 4339, + "token_acc": 0.28257266814603343 + }, + { + "epoch": 2.5441219583699795, + "grad_norm": 0.3289886647941623, + "learning_rate": 0.0004918975551929552, + "loss": 3.126556396484375, + "step": 4340, + "token_acc": 0.2858095921187254 + }, + { + "epoch": 2.5447082966871886, + "grad_norm": 0.3096503368096305, + "learning_rate": 0.0004918914353081036, + "loss": 3.1287105083465576, + "step": 4341, + "token_acc": 0.2855772104509776 + }, + { + "epoch": 2.5452946350043977, + "grad_norm": 0.2427602573518604, + "learning_rate": 0.000491885313151008, + "loss": 3.1497249603271484, + "step": 4342, + "token_acc": 0.28301823863388553 + }, + { + "epoch": 2.545880973321607, + "grad_norm": 0.2649246447511227, + "learning_rate": 0.0004918791887217258, + "loss": 3.15787410736084, + "step": 4343, + "token_acc": 0.2806290879921017 + }, + { + "epoch": 2.5464673116388155, + "grad_norm": 0.3688002878427219, + "learning_rate": 0.0004918730620203145, + "loss": 3.137702465057373, + "step": 4344, + "token_acc": 0.2836198924895081 + }, + { + "epoch": 2.5470536499560246, + "grad_norm": 0.392547880010125, + "learning_rate": 0.0004918669330468318, + "loss": 3.130627393722534, + "step": 4345, + "token_acc": 0.2848764703899693 + }, + { + "epoch": 2.5476399882732337, + "grad_norm": 0.29446639611721936, + "learning_rate": 0.0004918608018013352, + "loss": 3.1768155097961426, + "step": 4346, + "token_acc": 0.2789353937940636 + }, + { + "epoch": 2.548226326590443, + "grad_norm": 0.291551473048645, + "learning_rate": 0.0004918546682838822, + "loss": 3.210007667541504, + "step": 4347, + "token_acc": 0.27479441616307004 + }, + { + "epoch": 2.5488126649076515, + "grad_norm": 0.27398888875997945, + "learning_rate": 0.0004918485324945305, + "loss": 3.166940689086914, + "step": 4348, + "token_acc": 0.2788000719268848 + }, + { + "epoch": 2.5493990032248606, + "grad_norm": 0.23234282120035715, + "learning_rate": 0.0004918423944333378, + "loss": 3.1179120540618896, + "step": 4349, + "token_acc": 0.2852429490609624 + }, + { + "epoch": 2.5499853415420697, + "grad_norm": 0.32550691217598576, + "learning_rate": 0.0004918362541003616, + "loss": 3.1616291999816895, + "step": 4350, + "token_acc": 0.2825448280125688 + }, + { + "epoch": 2.550571679859279, + "grad_norm": 0.29218977882083025, + "learning_rate": 0.0004918301114956597, + "loss": 3.1603875160217285, + "step": 4351, + "token_acc": 0.27943635187068316 + }, + { + "epoch": 2.551158018176488, + "grad_norm": 0.27048560077512346, + "learning_rate": 0.0004918239666192898, + "loss": 3.149303913116455, + "step": 4352, + "token_acc": 0.2831882286648519 + }, + { + "epoch": 2.551744356493697, + "grad_norm": 0.28268715558857466, + "learning_rate": 0.0004918178194713096, + "loss": 3.125880479812622, + "step": 4353, + "token_acc": 0.28484007427782687 + }, + { + "epoch": 2.552330694810906, + "grad_norm": 0.2531251929638439, + "learning_rate": 0.0004918116700517767, + "loss": 3.127319097518921, + "step": 4354, + "token_acc": 0.2846300730131456 + }, + { + "epoch": 2.552917033128115, + "grad_norm": 0.29445173254114065, + "learning_rate": 0.0004918055183607492, + "loss": 3.1871931552886963, + "step": 4355, + "token_acc": 0.2765448368475908 + }, + { + "epoch": 2.553503371445324, + "grad_norm": 0.295090610181979, + "learning_rate": 0.0004917993643982846, + "loss": 3.1349101066589355, + "step": 4356, + "token_acc": 0.28367573378176664 + }, + { + "epoch": 2.554089709762533, + "grad_norm": 0.3102836198682593, + "learning_rate": 0.0004917932081644408, + "loss": 3.1398916244506836, + "step": 4357, + "token_acc": 0.2838782306862423 + }, + { + "epoch": 2.554676048079742, + "grad_norm": 0.3197076737471242, + "learning_rate": 0.0004917870496592756, + "loss": 3.1072349548339844, + "step": 4358, + "token_acc": 0.28849589371917683 + }, + { + "epoch": 2.555262386396951, + "grad_norm": 0.28351448872883833, + "learning_rate": 0.000491780888882847, + "loss": 3.167807102203369, + "step": 4359, + "token_acc": 0.2801222651390479 + }, + { + "epoch": 2.55584872471416, + "grad_norm": 0.32487168369859104, + "learning_rate": 0.0004917747258352126, + "loss": 3.1269595623016357, + "step": 4360, + "token_acc": 0.2838945916392414 + }, + { + "epoch": 2.556435063031369, + "grad_norm": 0.3970360988267621, + "learning_rate": 0.0004917685605164306, + "loss": 3.1138930320739746, + "step": 4361, + "token_acc": 0.28848098371484465 + }, + { + "epoch": 2.557021401348578, + "grad_norm": 0.3180796567733429, + "learning_rate": 0.0004917623929265587, + "loss": 3.1905202865600586, + "step": 4362, + "token_acc": 0.2761942215929946 + }, + { + "epoch": 2.5576077396657872, + "grad_norm": 0.30116545051003024, + "learning_rate": 0.0004917562230656548, + "loss": 3.1863551139831543, + "step": 4363, + "token_acc": 0.2761850431829338 + }, + { + "epoch": 2.5581940779829964, + "grad_norm": 0.3121216005670908, + "learning_rate": 0.0004917500509337772, + "loss": 3.1712117195129395, + "step": 4364, + "token_acc": 0.27866050093872674 + }, + { + "epoch": 2.5587804163002055, + "grad_norm": 0.26634649353292333, + "learning_rate": 0.0004917438765309834, + "loss": 3.137951612472534, + "step": 4365, + "token_acc": 0.283734290453863 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.24120987487077813, + "learning_rate": 0.0004917376998573316, + "loss": 3.17258882522583, + "step": 4366, + "token_acc": 0.27733272450379975 + }, + { + "epoch": 2.5599530929346233, + "grad_norm": 0.2567590142408795, + "learning_rate": 0.00049173152091288, + "loss": 3.128509998321533, + "step": 4367, + "token_acc": 0.2842827044627466 + }, + { + "epoch": 2.5605394312518324, + "grad_norm": 0.24021257412820615, + "learning_rate": 0.0004917253396976865, + "loss": 3.163233757019043, + "step": 4368, + "token_acc": 0.2794036898748078 + }, + { + "epoch": 2.5611257695690415, + "grad_norm": 0.2520996973701851, + "learning_rate": 0.0004917191562118091, + "loss": 3.1782047748565674, + "step": 4369, + "token_acc": 0.28018436873747493 + }, + { + "epoch": 2.56171210788625, + "grad_norm": 0.24363076301500283, + "learning_rate": 0.0004917129704553059, + "loss": 3.178955554962158, + "step": 4370, + "token_acc": 0.27945157395933606 + }, + { + "epoch": 2.5622984462034593, + "grad_norm": 0.22057607909545476, + "learning_rate": 0.0004917067824282352, + "loss": 3.144303798675537, + "step": 4371, + "token_acc": 0.28172306033924305 + }, + { + "epoch": 2.5628847845206684, + "grad_norm": 0.25323519492898083, + "learning_rate": 0.0004917005921306549, + "loss": 3.144503355026245, + "step": 4372, + "token_acc": 0.2823282095781166 + }, + { + "epoch": 2.5634711228378775, + "grad_norm": 0.2689643226080944, + "learning_rate": 0.0004916943995626232, + "loss": 3.168996810913086, + "step": 4373, + "token_acc": 0.2808877528979768 + }, + { + "epoch": 2.5640574611550866, + "grad_norm": 0.26419153182321536, + "learning_rate": 0.0004916882047241984, + "loss": 3.151325225830078, + "step": 4374, + "token_acc": 0.2825032859206434 + }, + { + "epoch": 2.5646437994722957, + "grad_norm": 0.25188691483540815, + "learning_rate": 0.0004916820076154386, + "loss": 3.123016834259033, + "step": 4375, + "token_acc": 0.285173876694759 + }, + { + "epoch": 2.565230137789505, + "grad_norm": 0.2808546357122756, + "learning_rate": 0.0004916758082364019, + "loss": 3.184295177459717, + "step": 4376, + "token_acc": 0.27758669504944267 + }, + { + "epoch": 2.5658164761067135, + "grad_norm": 0.32797327680642174, + "learning_rate": 0.0004916696065871466, + "loss": 3.1421308517456055, + "step": 4377, + "token_acc": 0.283045190226603 + }, + { + "epoch": 2.5664028144239226, + "grad_norm": 0.2875666664334995, + "learning_rate": 0.0004916634026677311, + "loss": 3.1357574462890625, + "step": 4378, + "token_acc": 0.28307711084779724 + }, + { + "epoch": 2.5669891527411317, + "grad_norm": 0.24565212969135664, + "learning_rate": 0.0004916571964782136, + "loss": 3.1166491508483887, + "step": 4379, + "token_acc": 0.2866507009660146 + }, + { + "epoch": 2.567575491058341, + "grad_norm": 0.28666651684881717, + "learning_rate": 0.0004916509880186524, + "loss": 3.1321258544921875, + "step": 4380, + "token_acc": 0.28443416449217446 + }, + { + "epoch": 2.5681618293755495, + "grad_norm": 0.25411362451029323, + "learning_rate": 0.0004916447772891058, + "loss": 3.180919647216797, + "step": 4381, + "token_acc": 0.27737039412150877 + }, + { + "epoch": 2.5687481676927586, + "grad_norm": 0.2654262243211257, + "learning_rate": 0.000491638564289632, + "loss": 3.1119225025177, + "step": 4382, + "token_acc": 0.28647425706249235 + }, + { + "epoch": 2.5693345060099677, + "grad_norm": 0.2956362442028171, + "learning_rate": 0.0004916323490202895, + "loss": 3.140455722808838, + "step": 4383, + "token_acc": 0.2842919337950291 + }, + { + "epoch": 2.569920844327177, + "grad_norm": 0.2787255526271086, + "learning_rate": 0.0004916261314811368, + "loss": 3.159057378768921, + "step": 4384, + "token_acc": 0.28144780727569363 + }, + { + "epoch": 2.570507182644386, + "grad_norm": 0.29825703158711053, + "learning_rate": 0.0004916199116722322, + "loss": 3.140352249145508, + "step": 4385, + "token_acc": 0.28457807049122164 + }, + { + "epoch": 2.571093520961595, + "grad_norm": 0.3370955933004504, + "learning_rate": 0.000491613689593634, + "loss": 3.1583242416381836, + "step": 4386, + "token_acc": 0.28399039757628664 + }, + { + "epoch": 2.5716798592788037, + "grad_norm": 0.3459928664116189, + "learning_rate": 0.0004916074652454009, + "loss": 3.162193536758423, + "step": 4387, + "token_acc": 0.27937162908571056 + }, + { + "epoch": 2.572266197596013, + "grad_norm": 0.2754947627012599, + "learning_rate": 0.0004916012386275913, + "loss": 3.155233383178711, + "step": 4388, + "token_acc": 0.282756577819121 + }, + { + "epoch": 2.572852535913222, + "grad_norm": 0.30894625579211454, + "learning_rate": 0.0004915950097402633, + "loss": 3.135875701904297, + "step": 4389, + "token_acc": 0.28430345653149786 + }, + { + "epoch": 2.573438874230431, + "grad_norm": 0.2916061934671588, + "learning_rate": 0.000491588778583476, + "loss": 3.167969226837158, + "step": 4390, + "token_acc": 0.2800733775453164 + }, + { + "epoch": 2.5740252125476397, + "grad_norm": 0.2746631274025691, + "learning_rate": 0.0004915825451572877, + "loss": 3.141489267349243, + "step": 4391, + "token_acc": 0.28375773432080054 + }, + { + "epoch": 2.574611550864849, + "grad_norm": 0.35073951974094186, + "learning_rate": 0.0004915763094617566, + "loss": 3.14787220954895, + "step": 4392, + "token_acc": 0.2813030989294992 + }, + { + "epoch": 2.575197889182058, + "grad_norm": 0.23869780654082637, + "learning_rate": 0.0004915700714969419, + "loss": 3.1046342849731445, + "step": 4393, + "token_acc": 0.2888057037309147 + }, + { + "epoch": 2.575784227499267, + "grad_norm": 0.3491423395944009, + "learning_rate": 0.0004915638312629019, + "loss": 3.1581149101257324, + "step": 4394, + "token_acc": 0.2826405829199384 + }, + { + "epoch": 2.576370565816476, + "grad_norm": 0.2584341283336134, + "learning_rate": 0.0004915575887596952, + "loss": 3.1420466899871826, + "step": 4395, + "token_acc": 0.283997259309204 + }, + { + "epoch": 2.5769569041336853, + "grad_norm": 0.3204523717019629, + "learning_rate": 0.0004915513439873804, + "loss": 3.17549467086792, + "step": 4396, + "token_acc": 0.27775441331842565 + }, + { + "epoch": 2.5775432424508944, + "grad_norm": 0.2666313238784185, + "learning_rate": 0.0004915450969460161, + "loss": 3.2244763374328613, + "step": 4397, + "token_acc": 0.27085624133479946 + }, + { + "epoch": 2.578129580768103, + "grad_norm": 0.33805956710405566, + "learning_rate": 0.0004915388476356612, + "loss": 3.1516635417938232, + "step": 4398, + "token_acc": 0.2814144346770011 + }, + { + "epoch": 2.578715919085312, + "grad_norm": 0.284274750803578, + "learning_rate": 0.0004915325960563743, + "loss": 3.1569743156433105, + "step": 4399, + "token_acc": 0.2820393220338983 + }, + { + "epoch": 2.5793022574025213, + "grad_norm": 0.28096357487119394, + "learning_rate": 0.0004915263422082143, + "loss": 3.160341739654541, + "step": 4400, + "token_acc": 0.281220405978507 + }, + { + "epoch": 2.5798885957197304, + "grad_norm": 0.2524062721918786, + "learning_rate": 0.0004915200860912396, + "loss": 3.1140899658203125, + "step": 4401, + "token_acc": 0.2871440185104121 + }, + { + "epoch": 2.580474934036939, + "grad_norm": 0.26410048382613177, + "learning_rate": 0.0004915138277055091, + "loss": 3.1460094451904297, + "step": 4402, + "token_acc": 0.2843677027658139 + }, + { + "epoch": 2.581061272354148, + "grad_norm": 0.26842801518707626, + "learning_rate": 0.0004915075670510817, + "loss": 3.15571665763855, + "step": 4403, + "token_acc": 0.2813728956751519 + }, + { + "epoch": 2.5816476106713573, + "grad_norm": 0.22599120906159506, + "learning_rate": 0.0004915013041280162, + "loss": 3.145167827606201, + "step": 4404, + "token_acc": 0.28293628526245984 + }, + { + "epoch": 2.5822339489885664, + "grad_norm": 0.2680456036466516, + "learning_rate": 0.0004914950389363713, + "loss": 3.0857837200164795, + "step": 4405, + "token_acc": 0.2903688240815301 + }, + { + "epoch": 2.5828202873057755, + "grad_norm": 0.25367921515288877, + "learning_rate": 0.0004914887714762059, + "loss": 3.1026771068573, + "step": 4406, + "token_acc": 0.28792235801581595 + }, + { + "epoch": 2.5834066256229846, + "grad_norm": 0.224020983410606, + "learning_rate": 0.0004914825017475789, + "loss": 3.1019716262817383, + "step": 4407, + "token_acc": 0.2892546379031572 + }, + { + "epoch": 2.5839929639401937, + "grad_norm": 0.35348210079093173, + "learning_rate": 0.0004914762297505493, + "loss": 3.1458561420440674, + "step": 4408, + "token_acc": 0.2848810867578234 + }, + { + "epoch": 2.5845793022574024, + "grad_norm": 0.35172599239333713, + "learning_rate": 0.0004914699554851759, + "loss": 3.116210460662842, + "step": 4409, + "token_acc": 0.28672572216816844 + }, + { + "epoch": 2.5851656405746115, + "grad_norm": 0.27237675230111746, + "learning_rate": 0.0004914636789515177, + "loss": 3.184718608856201, + "step": 4410, + "token_acc": 0.27708282555082936 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.35901935134222934, + "learning_rate": 0.0004914574001496334, + "loss": 3.087034225463867, + "step": 4411, + "token_acc": 0.28986774452243463 + }, + { + "epoch": 2.5863383172090297, + "grad_norm": 0.27319016904507337, + "learning_rate": 0.0004914511190795824, + "loss": 3.1788759231567383, + "step": 4412, + "token_acc": 0.2789790825864739 + }, + { + "epoch": 2.5869246555262384, + "grad_norm": 0.3054525436587608, + "learning_rate": 0.0004914448357414234, + "loss": 3.113358736038208, + "step": 4413, + "token_acc": 0.287545482465501 + }, + { + "epoch": 2.5875109938434475, + "grad_norm": 0.2946710647765237, + "learning_rate": 0.0004914385501352156, + "loss": 3.1616268157958984, + "step": 4414, + "token_acc": 0.28040992323810926 + }, + { + "epoch": 2.5880973321606566, + "grad_norm": 0.24425756015108044, + "learning_rate": 0.0004914322622610178, + "loss": 3.122652053833008, + "step": 4415, + "token_acc": 0.28602584490385 + }, + { + "epoch": 2.5886836704778657, + "grad_norm": 0.27362701657532407, + "learning_rate": 0.0004914259721188894, + "loss": 3.119075298309326, + "step": 4416, + "token_acc": 0.28688217128352167 + }, + { + "epoch": 2.589270008795075, + "grad_norm": 0.3248267226259104, + "learning_rate": 0.0004914196797088892, + "loss": 3.1667394638061523, + "step": 4417, + "token_acc": 0.28041669281455917 + }, + { + "epoch": 2.589856347112284, + "grad_norm": 0.2881265303388993, + "learning_rate": 0.0004914133850310765, + "loss": 3.193389892578125, + "step": 4418, + "token_acc": 0.27687412797141703 + }, + { + "epoch": 2.590442685429493, + "grad_norm": 0.46019423683328586, + "learning_rate": 0.0004914070880855103, + "loss": 3.122049331665039, + "step": 4419, + "token_acc": 0.2854857207950065 + }, + { + "epoch": 2.5910290237467017, + "grad_norm": 0.3019953627677614, + "learning_rate": 0.0004914007888722498, + "loss": 3.1416196823120117, + "step": 4420, + "token_acc": 0.28437696644197974 + }, + { + "epoch": 2.591615362063911, + "grad_norm": 0.30044737737454075, + "learning_rate": 0.0004913944873913543, + "loss": 3.133695602416992, + "step": 4421, + "token_acc": 0.28447698418879974 + }, + { + "epoch": 2.59220170038112, + "grad_norm": 0.29566621773957935, + "learning_rate": 0.0004913881836428827, + "loss": 3.142026901245117, + "step": 4422, + "token_acc": 0.28393396389565123 + }, + { + "epoch": 2.592788038698329, + "grad_norm": 0.26370836990969015, + "learning_rate": 0.0004913818776268946, + "loss": 3.0946197509765625, + "step": 4423, + "token_acc": 0.2903073821192994 + }, + { + "epoch": 2.5933743770155377, + "grad_norm": 0.23893545713360817, + "learning_rate": 0.0004913755693434489, + "loss": 3.1701855659484863, + "step": 4424, + "token_acc": 0.2790072950790258 + }, + { + "epoch": 2.593960715332747, + "grad_norm": 0.2880617578520256, + "learning_rate": 0.0004913692587926049, + "loss": 3.151846408843994, + "step": 4425, + "token_acc": 0.28204308826480184 + }, + { + "epoch": 2.594547053649956, + "grad_norm": 0.3080166873136557, + "learning_rate": 0.0004913629459744221, + "loss": 3.1573777198791504, + "step": 4426, + "token_acc": 0.282134699066657 + }, + { + "epoch": 2.595133391967165, + "grad_norm": 0.4263050394717753, + "learning_rate": 0.0004913566308889596, + "loss": 3.1153945922851562, + "step": 4427, + "token_acc": 0.2855707562885224 + }, + { + "epoch": 2.595719730284374, + "grad_norm": 0.4198320197813092, + "learning_rate": 0.0004913503135362768, + "loss": 3.1211605072021484, + "step": 4428, + "token_acc": 0.2853663867923315 + }, + { + "epoch": 2.5963060686015833, + "grad_norm": 0.3121289616724346, + "learning_rate": 0.000491343993916433, + "loss": 3.1287055015563965, + "step": 4429, + "token_acc": 0.2866549661817965 + }, + { + "epoch": 2.5968924069187924, + "grad_norm": 0.37980093114027763, + "learning_rate": 0.0004913376720294876, + "loss": 3.154170513153076, + "step": 4430, + "token_acc": 0.282718697612616 + }, + { + "epoch": 2.597478745236001, + "grad_norm": 0.32986146997532834, + "learning_rate": 0.0004913313478755, + "loss": 3.160292148590088, + "step": 4431, + "token_acc": 0.2803236878846377 + }, + { + "epoch": 2.59806508355321, + "grad_norm": 0.30078869465495656, + "learning_rate": 0.0004913250214545296, + "loss": 3.174564838409424, + "step": 4432, + "token_acc": 0.27713408403630896 + }, + { + "epoch": 2.5986514218704193, + "grad_norm": 0.2927607369519846, + "learning_rate": 0.0004913186927666359, + "loss": 3.1422476768493652, + "step": 4433, + "token_acc": 0.2825182240600222 + }, + { + "epoch": 2.5992377601876284, + "grad_norm": 0.28119368962595204, + "learning_rate": 0.0004913123618118781, + "loss": 3.1367149353027344, + "step": 4434, + "token_acc": 0.28194494974036977 + }, + { + "epoch": 2.599824098504837, + "grad_norm": 0.34073429017926676, + "learning_rate": 0.0004913060285903159, + "loss": 3.13165020942688, + "step": 4435, + "token_acc": 0.28559732568845736 + }, + { + "epoch": 2.600410436822046, + "grad_norm": 0.28835942904131184, + "learning_rate": 0.0004912996931020087, + "loss": 3.1303820610046387, + "step": 4436, + "token_acc": 0.2839435110373109 + }, + { + "epoch": 2.6009967751392553, + "grad_norm": 0.33422063794693274, + "learning_rate": 0.0004912933553470161, + "loss": 3.1056551933288574, + "step": 4437, + "token_acc": 0.2876120082230906 + }, + { + "epoch": 2.6015831134564644, + "grad_norm": 0.26221060762296367, + "learning_rate": 0.0004912870153253975, + "loss": 3.14431095123291, + "step": 4438, + "token_acc": 0.2829661102996884 + }, + { + "epoch": 2.6021694517736735, + "grad_norm": 0.26298528222746703, + "learning_rate": 0.0004912806730372126, + "loss": 3.122894763946533, + "step": 4439, + "token_acc": 0.2842636195811974 + }, + { + "epoch": 2.6027557900908826, + "grad_norm": 0.2723007755709516, + "learning_rate": 0.0004912743284825209, + "loss": 3.1369009017944336, + "step": 4440, + "token_acc": 0.2831264802393087 + }, + { + "epoch": 2.6033421284080913, + "grad_norm": 0.28391584247640217, + "learning_rate": 0.0004912679816613819, + "loss": 3.1534619331359863, + "step": 4441, + "token_acc": 0.2831015915435194 + }, + { + "epoch": 2.6039284667253004, + "grad_norm": 0.3486956423617713, + "learning_rate": 0.0004912616325738554, + "loss": 3.181530714035034, + "step": 4442, + "token_acc": 0.27839960801727176 + }, + { + "epoch": 2.6045148050425095, + "grad_norm": 0.24043476879509992, + "learning_rate": 0.000491255281220001, + "loss": 3.1551170349121094, + "step": 4443, + "token_acc": 0.28177229649577573 + }, + { + "epoch": 2.6051011433597187, + "grad_norm": 0.2691725427432372, + "learning_rate": 0.0004912489275998783, + "loss": 3.121718168258667, + "step": 4444, + "token_acc": 0.28634862070444894 + }, + { + "epoch": 2.6056874816769273, + "grad_norm": 0.24070049599901105, + "learning_rate": 0.000491242571713547, + "loss": 3.171643018722534, + "step": 4445, + "token_acc": 0.27780545060488676 + }, + { + "epoch": 2.6062738199941364, + "grad_norm": 0.25732795886890575, + "learning_rate": 0.0004912362135610668, + "loss": 3.1481659412384033, + "step": 4446, + "token_acc": 0.28013366836774345 + }, + { + "epoch": 2.6068601583113455, + "grad_norm": 0.2791351599449322, + "learning_rate": 0.0004912298531424974, + "loss": 3.1447200775146484, + "step": 4447, + "token_acc": 0.2829630944951182 + }, + { + "epoch": 2.6074464966285547, + "grad_norm": 0.28419388709359794, + "learning_rate": 0.0004912234904578987, + "loss": 3.1735267639160156, + "step": 4448, + "token_acc": 0.2799769963241822 + }, + { + "epoch": 2.6080328349457638, + "grad_norm": 0.25093391769960843, + "learning_rate": 0.0004912171255073303, + "loss": 3.111140251159668, + "step": 4449, + "token_acc": 0.28830905426765696 + }, + { + "epoch": 2.608619173262973, + "grad_norm": 0.30633934772606325, + "learning_rate": 0.000491210758290852, + "loss": 3.118685007095337, + "step": 4450, + "token_acc": 0.28502878829232786 + }, + { + "epoch": 2.609205511580182, + "grad_norm": 0.29473364943918695, + "learning_rate": 0.0004912043888085238, + "loss": 3.1386444568634033, + "step": 4451, + "token_acc": 0.2846476812888272 + }, + { + "epoch": 2.6097918498973907, + "grad_norm": 0.22865924863579715, + "learning_rate": 0.0004911980170604054, + "loss": 3.1420087814331055, + "step": 4452, + "token_acc": 0.28462879213700426 + }, + { + "epoch": 2.6103781882145998, + "grad_norm": 0.3062297392443326, + "learning_rate": 0.0004911916430465565, + "loss": 3.132032871246338, + "step": 4453, + "token_acc": 0.28367317195966985 + }, + { + "epoch": 2.610964526531809, + "grad_norm": 0.27935605673554037, + "learning_rate": 0.0004911852667670373, + "loss": 3.1432809829711914, + "step": 4454, + "token_acc": 0.28329692368298537 + }, + { + "epoch": 2.611550864849018, + "grad_norm": 0.2606252275923648, + "learning_rate": 0.0004911788882219074, + "loss": 3.1206002235412598, + "step": 4455, + "token_acc": 0.2868433651725518 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.3117186211315846, + "learning_rate": 0.0004911725074112268, + "loss": 3.11965274810791, + "step": 4456, + "token_acc": 0.28574816892548605 + }, + { + "epoch": 2.6127235414834358, + "grad_norm": 0.22470593157282528, + "learning_rate": 0.0004911661243350555, + "loss": 3.1618447303771973, + "step": 4457, + "token_acc": 0.2801963846649009 + }, + { + "epoch": 2.613309879800645, + "grad_norm": 0.2632122378022257, + "learning_rate": 0.0004911597389934535, + "loss": 3.1404547691345215, + "step": 4458, + "token_acc": 0.28278369041660173 + }, + { + "epoch": 2.613896218117854, + "grad_norm": 0.35880362859860043, + "learning_rate": 0.0004911533513864806, + "loss": 3.1570546627044678, + "step": 4459, + "token_acc": 0.2813147297254671 + }, + { + "epoch": 2.614482556435063, + "grad_norm": 0.2977747547382781, + "learning_rate": 0.0004911469615141971, + "loss": 3.1947293281555176, + "step": 4460, + "token_acc": 0.2768346140230316 + }, + { + "epoch": 2.615068894752272, + "grad_norm": 0.24002059815786428, + "learning_rate": 0.0004911405693766627, + "loss": 3.151607036590576, + "step": 4461, + "token_acc": 0.2811408059973317 + }, + { + "epoch": 2.6156552330694813, + "grad_norm": 0.304247785226504, + "learning_rate": 0.0004911341749739376, + "loss": 3.1220548152923584, + "step": 4462, + "token_acc": 0.28436413204036465 + }, + { + "epoch": 2.61624157138669, + "grad_norm": 0.25596117557087283, + "learning_rate": 0.000491127778306082, + "loss": 3.101673126220703, + "step": 4463, + "token_acc": 0.2883029202095894 + }, + { + "epoch": 2.616827909703899, + "grad_norm": 0.3132204177191324, + "learning_rate": 0.0004911213793731557, + "loss": 3.0839123725891113, + "step": 4464, + "token_acc": 0.29173899469247583 + }, + { + "epoch": 2.6174142480211082, + "grad_norm": 0.3007098071653364, + "learning_rate": 0.000491114978175219, + "loss": 3.160425901412964, + "step": 4465, + "token_acc": 0.2801504728925216 + }, + { + "epoch": 2.6180005863383173, + "grad_norm": 0.22513742705747958, + "learning_rate": 0.000491108574712332, + "loss": 3.1495208740234375, + "step": 4466, + "token_acc": 0.28447036498729233 + }, + { + "epoch": 2.618586924655526, + "grad_norm": 0.22017635742880062, + "learning_rate": 0.0004911021689845549, + "loss": 3.159933090209961, + "step": 4467, + "token_acc": 0.2820490794032586 + }, + { + "epoch": 2.619173262972735, + "grad_norm": 0.2315654701100227, + "learning_rate": 0.0004910957609919476, + "loss": 3.1120617389678955, + "step": 4468, + "token_acc": 0.2877209897039034 + }, + { + "epoch": 2.6197596012899442, + "grad_norm": 0.23733525453108914, + "learning_rate": 0.0004910893507345707, + "loss": 3.1168088912963867, + "step": 4469, + "token_acc": 0.28717847815976816 + }, + { + "epoch": 2.6203459396071533, + "grad_norm": 0.2677714802000277, + "learning_rate": 0.0004910829382124842, + "loss": 3.130603551864624, + "step": 4470, + "token_acc": 0.28436955014392873 + }, + { + "epoch": 2.6209322779243625, + "grad_norm": 0.25160877364280415, + "learning_rate": 0.0004910765234257483, + "loss": 3.113950252532959, + "step": 4471, + "token_acc": 0.28772502755439444 + }, + { + "epoch": 2.6215186162415716, + "grad_norm": 0.29844949998234555, + "learning_rate": 0.0004910701063744233, + "loss": 3.1047306060791016, + "step": 4472, + "token_acc": 0.2892114949000129 + }, + { + "epoch": 2.6221049545587807, + "grad_norm": 0.2557769128204677, + "learning_rate": 0.0004910636870585697, + "loss": 3.136532783508301, + "step": 4473, + "token_acc": 0.2848639523389279 + }, + { + "epoch": 2.6226912928759893, + "grad_norm": 0.28125834517679876, + "learning_rate": 0.0004910572654782474, + "loss": 3.191502571105957, + "step": 4474, + "token_acc": 0.2778229019406246 + }, + { + "epoch": 2.6232776311931985, + "grad_norm": 0.3083182493921153, + "learning_rate": 0.0004910508416335168, + "loss": 3.1188063621520996, + "step": 4475, + "token_acc": 0.286374023106807 + }, + { + "epoch": 2.6238639695104076, + "grad_norm": 0.35976571815258557, + "learning_rate": 0.0004910444155244386, + "loss": 3.1428544521331787, + "step": 4476, + "token_acc": 0.2816737451429604 + }, + { + "epoch": 2.6244503078276167, + "grad_norm": 0.29486408886399273, + "learning_rate": 0.0004910379871510728, + "loss": 3.1631898880004883, + "step": 4477, + "token_acc": 0.2803576952874016 + }, + { + "epoch": 2.6250366461448253, + "grad_norm": 0.30970291992441107, + "learning_rate": 0.00049103155651348, + "loss": 3.103642702102661, + "step": 4478, + "token_acc": 0.28936055198861754 + }, + { + "epoch": 2.6256229844620345, + "grad_norm": 0.3844130125718855, + "learning_rate": 0.0004910251236117205, + "loss": 3.139253616333008, + "step": 4479, + "token_acc": 0.2838493683858395 + }, + { + "epoch": 2.6262093227792436, + "grad_norm": 0.2741551505095675, + "learning_rate": 0.0004910186884458548, + "loss": 3.124657154083252, + "step": 4480, + "token_acc": 0.2861188779651844 + }, + { + "epoch": 2.6267956610964527, + "grad_norm": 0.3487253124713831, + "learning_rate": 0.0004910122510159431, + "loss": 3.1688432693481445, + "step": 4481, + "token_acc": 0.28037247745813654 + }, + { + "epoch": 2.627381999413662, + "grad_norm": 0.27541561221725075, + "learning_rate": 0.0004910058113220462, + "loss": 3.126811981201172, + "step": 4482, + "token_acc": 0.2853793730600316 + }, + { + "epoch": 2.627968337730871, + "grad_norm": 0.28073892105092135, + "learning_rate": 0.0004909993693642245, + "loss": 3.119013786315918, + "step": 4483, + "token_acc": 0.28459656530107524 + }, + { + "epoch": 2.62855467604808, + "grad_norm": 0.30762876799147587, + "learning_rate": 0.0004909929251425384, + "loss": 3.1422858238220215, + "step": 4484, + "token_acc": 0.2828757375550569 + }, + { + "epoch": 2.6291410143652887, + "grad_norm": 0.2680298117940531, + "learning_rate": 0.0004909864786570486, + "loss": 3.1751420497894287, + "step": 4485, + "token_acc": 0.27877686780578304 + }, + { + "epoch": 2.629727352682498, + "grad_norm": 0.32606785842895286, + "learning_rate": 0.0004909800299078155, + "loss": 3.134639263153076, + "step": 4486, + "token_acc": 0.2844809220200886 + }, + { + "epoch": 2.630313690999707, + "grad_norm": 0.2682071001707932, + "learning_rate": 0.0004909735788948998, + "loss": 3.130958080291748, + "step": 4487, + "token_acc": 0.28611310926122524 + }, + { + "epoch": 2.630900029316916, + "grad_norm": 0.2816885342904494, + "learning_rate": 0.000490967125618362, + "loss": 3.1387391090393066, + "step": 4488, + "token_acc": 0.28391692417943903 + }, + { + "epoch": 2.6314863676341247, + "grad_norm": 0.2583700103019815, + "learning_rate": 0.0004909606700782628, + "loss": 3.1681928634643555, + "step": 4489, + "token_acc": 0.2795932397745884 + }, + { + "epoch": 2.632072705951334, + "grad_norm": 0.2766910374733494, + "learning_rate": 0.0004909542122746627, + "loss": 3.098881959915161, + "step": 4490, + "token_acc": 0.2877975518015923 + }, + { + "epoch": 2.632659044268543, + "grad_norm": 0.2534875242029691, + "learning_rate": 0.0004909477522076225, + "loss": 3.1765336990356445, + "step": 4491, + "token_acc": 0.2798009304338418 + }, + { + "epoch": 2.633245382585752, + "grad_norm": 0.2725296466325662, + "learning_rate": 0.000490941289877203, + "loss": 3.2209014892578125, + "step": 4492, + "token_acc": 0.2736889231313836 + }, + { + "epoch": 2.633831720902961, + "grad_norm": 0.26538549354139856, + "learning_rate": 0.0004909348252834646, + "loss": 3.117403507232666, + "step": 4493, + "token_acc": 0.2869374032710085 + }, + { + "epoch": 2.6344180592201702, + "grad_norm": 0.2516765555451947, + "learning_rate": 0.0004909283584264683, + "loss": 3.129274606704712, + "step": 4494, + "token_acc": 0.2839846720067092 + }, + { + "epoch": 2.635004397537379, + "grad_norm": 0.24847335417218613, + "learning_rate": 0.0004909218893062745, + "loss": 3.1543307304382324, + "step": 4495, + "token_acc": 0.2817410233234806 + }, + { + "epoch": 2.635590735854588, + "grad_norm": 0.24730646124994307, + "learning_rate": 0.0004909154179229444, + "loss": 3.1373813152313232, + "step": 4496, + "token_acc": 0.28473536689454254 + }, + { + "epoch": 2.636177074171797, + "grad_norm": 0.23717947376181753, + "learning_rate": 0.0004909089442765385, + "loss": 3.1270108222961426, + "step": 4497, + "token_acc": 0.28454830174415463 + }, + { + "epoch": 2.6367634124890063, + "grad_norm": 0.25913042795801317, + "learning_rate": 0.0004909024683671178, + "loss": 3.1273627281188965, + "step": 4498, + "token_acc": 0.28646070500193854 + }, + { + "epoch": 2.637349750806215, + "grad_norm": 0.280644386101006, + "learning_rate": 0.0004908959901947428, + "loss": 3.142911195755005, + "step": 4499, + "token_acc": 0.2829405019747739 + }, + { + "epoch": 2.637936089123424, + "grad_norm": 0.2214587242617182, + "learning_rate": 0.0004908895097594749, + "loss": 3.1260838508605957, + "step": 4500, + "token_acc": 0.28624982668728105 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.25632378980001186, + "learning_rate": 0.0004908830270613744, + "loss": 3.1828646659851074, + "step": 4501, + "token_acc": 0.2787432326878013 + }, + { + "epoch": 2.6391087657578423, + "grad_norm": 0.2555062690669222, + "learning_rate": 0.0004908765421005026, + "loss": 3.1138601303100586, + "step": 4502, + "token_acc": 0.287056427597072 + }, + { + "epoch": 2.6396951040750514, + "grad_norm": 0.2222247197526058, + "learning_rate": 0.0004908700548769202, + "loss": 3.1409080028533936, + "step": 4503, + "token_acc": 0.28470590667896956 + }, + { + "epoch": 2.6402814423922605, + "grad_norm": 0.25779015462960087, + "learning_rate": 0.0004908635653906882, + "loss": 3.103044033050537, + "step": 4504, + "token_acc": 0.2884303317734016 + }, + { + "epoch": 2.6408677807094696, + "grad_norm": 0.30082323503877456, + "learning_rate": 0.0004908570736418676, + "loss": 3.1434521675109863, + "step": 4505, + "token_acc": 0.2826840757915941 + }, + { + "epoch": 2.6414541190266783, + "grad_norm": 0.3376745401628351, + "learning_rate": 0.0004908505796305194, + "loss": 3.1410765647888184, + "step": 4506, + "token_acc": 0.28173394586661005 + }, + { + "epoch": 2.6420404573438874, + "grad_norm": 0.28979422282689443, + "learning_rate": 0.0004908440833567045, + "loss": 3.1467819213867188, + "step": 4507, + "token_acc": 0.28107786514332966 + }, + { + "epoch": 2.6426267956610965, + "grad_norm": 0.24807991038313268, + "learning_rate": 0.000490837584820484, + "loss": 3.1270833015441895, + "step": 4508, + "token_acc": 0.28527554218575274 + }, + { + "epoch": 2.6432131339783056, + "grad_norm": 0.27778995883019647, + "learning_rate": 0.0004908310840219189, + "loss": 3.161132335662842, + "step": 4509, + "token_acc": 0.2799487815613621 + }, + { + "epoch": 2.6437994722955143, + "grad_norm": 0.268342976070708, + "learning_rate": 0.0004908245809610703, + "loss": 3.1399142742156982, + "step": 4510, + "token_acc": 0.28241428560238335 + }, + { + "epoch": 2.6443858106127234, + "grad_norm": 0.25109664777641616, + "learning_rate": 0.0004908180756379993, + "loss": 3.1628198623657227, + "step": 4511, + "token_acc": 0.28025493874007873 + }, + { + "epoch": 2.6449721489299325, + "grad_norm": 0.23209720426909075, + "learning_rate": 0.0004908115680527669, + "loss": 3.1590206623077393, + "step": 4512, + "token_acc": 0.28098680759043393 + }, + { + "epoch": 2.6455584872471416, + "grad_norm": 0.26099288299564416, + "learning_rate": 0.0004908050582054344, + "loss": 3.161895751953125, + "step": 4513, + "token_acc": 0.28008871725026874 + }, + { + "epoch": 2.6461448255643507, + "grad_norm": 0.3045617781248633, + "learning_rate": 0.0004907985460960629, + "loss": 3.1411805152893066, + "step": 4514, + "token_acc": 0.2816012266033438 + }, + { + "epoch": 2.64673116388156, + "grad_norm": 0.29232976335955796, + "learning_rate": 0.0004907920317247134, + "loss": 3.1482903957366943, + "step": 4515, + "token_acc": 0.2811057809490903 + }, + { + "epoch": 2.647317502198769, + "grad_norm": 0.26193951830338924, + "learning_rate": 0.0004907855150914473, + "loss": 3.163036584854126, + "step": 4516, + "token_acc": 0.2807180900592201 + }, + { + "epoch": 2.6479038405159776, + "grad_norm": 0.3172406383327414, + "learning_rate": 0.0004907789961963258, + "loss": 3.1367228031158447, + "step": 4517, + "token_acc": 0.2838660108156319 + }, + { + "epoch": 2.6484901788331867, + "grad_norm": 0.3356648337440435, + "learning_rate": 0.00049077247503941, + "loss": 3.155352830886841, + "step": 4518, + "token_acc": 0.28238370303673127 + }, + { + "epoch": 2.649076517150396, + "grad_norm": 0.28835283834228015, + "learning_rate": 0.0004907659516207614, + "loss": 3.153813600540161, + "step": 4519, + "token_acc": 0.2808702361234362 + }, + { + "epoch": 2.649662855467605, + "grad_norm": 0.249357663473599, + "learning_rate": 0.000490759425940441, + "loss": 3.129187822341919, + "step": 4520, + "token_acc": 0.2851964559688678 + }, + { + "epoch": 2.6502491937848136, + "grad_norm": 0.2513732061273853, + "learning_rate": 0.0004907528979985103, + "loss": 3.1271328926086426, + "step": 4521, + "token_acc": 0.285695754931134 + }, + { + "epoch": 2.6508355321020227, + "grad_norm": 0.26490295100569117, + "learning_rate": 0.0004907463677950305, + "loss": 3.1428842544555664, + "step": 4522, + "token_acc": 0.28249287509623083 + }, + { + "epoch": 2.651421870419232, + "grad_norm": 0.32061621389090744, + "learning_rate": 0.0004907398353300628, + "loss": 3.1661429405212402, + "step": 4523, + "token_acc": 0.2799314402326207 + }, + { + "epoch": 2.652008208736441, + "grad_norm": 0.3562661299822593, + "learning_rate": 0.000490733300603669, + "loss": 3.120821952819824, + "step": 4524, + "token_acc": 0.28763738171948217 + }, + { + "epoch": 2.65259454705365, + "grad_norm": 0.30949062056630505, + "learning_rate": 0.0004907267636159102, + "loss": 3.143404722213745, + "step": 4525, + "token_acc": 0.28132522064723187 + }, + { + "epoch": 2.653180885370859, + "grad_norm": 0.29257220407784473, + "learning_rate": 0.0004907202243668477, + "loss": 3.1127982139587402, + "step": 4526, + "token_acc": 0.28731092167670813 + }, + { + "epoch": 2.6537672236880683, + "grad_norm": 0.4171464378019305, + "learning_rate": 0.0004907136828565432, + "loss": 3.166630268096924, + "step": 4527, + "token_acc": 0.28050423890506293 + }, + { + "epoch": 2.654353562005277, + "grad_norm": 0.3093010026319293, + "learning_rate": 0.000490707139085058, + "loss": 3.1947901248931885, + "step": 4528, + "token_acc": 0.2760164947143863 + }, + { + "epoch": 2.654939900322486, + "grad_norm": 0.30458489838463393, + "learning_rate": 0.0004907005930524536, + "loss": 3.1313133239746094, + "step": 4529, + "token_acc": 0.2837124737230479 + }, + { + "epoch": 2.655526238639695, + "grad_norm": 0.26862086438482335, + "learning_rate": 0.0004906940447587914, + "loss": 3.140035629272461, + "step": 4530, + "token_acc": 0.2858398388381949 + }, + { + "epoch": 2.6561125769569043, + "grad_norm": 0.26053079928759193, + "learning_rate": 0.000490687494204133, + "loss": 3.1293764114379883, + "step": 4531, + "token_acc": 0.2839868643028419 + }, + { + "epoch": 2.656698915274113, + "grad_norm": 0.29246436654778357, + "learning_rate": 0.0004906809413885399, + "loss": 3.1862375736236572, + "step": 4532, + "token_acc": 0.27907232329297355 + }, + { + "epoch": 2.657285253591322, + "grad_norm": 0.22689675109451865, + "learning_rate": 0.0004906743863120737, + "loss": 3.149813175201416, + "step": 4533, + "token_acc": 0.28147393897219286 + }, + { + "epoch": 2.657871591908531, + "grad_norm": 0.25834291866456627, + "learning_rate": 0.0004906678289747959, + "loss": 3.1429402828216553, + "step": 4534, + "token_acc": 0.28302363293191907 + }, + { + "epoch": 2.6584579302257403, + "grad_norm": 0.21242558766448594, + "learning_rate": 0.0004906612693767683, + "loss": 3.176623582839966, + "step": 4535, + "token_acc": 0.2799581496390007 + }, + { + "epoch": 2.6590442685429494, + "grad_norm": 0.23368878507318006, + "learning_rate": 0.0004906547075180523, + "loss": 3.102163314819336, + "step": 4536, + "token_acc": 0.2891551334547957 + }, + { + "epoch": 2.6596306068601585, + "grad_norm": 0.21627783741282997, + "learning_rate": 0.0004906481433987096, + "loss": 3.152431011199951, + "step": 4537, + "token_acc": 0.28096652832866253 + }, + { + "epoch": 2.660216945177367, + "grad_norm": 0.23332895128180198, + "learning_rate": 0.0004906415770188019, + "loss": 3.1362998485565186, + "step": 4538, + "token_acc": 0.28523859138425156 + }, + { + "epoch": 2.6608032834945763, + "grad_norm": 0.27227170624125596, + "learning_rate": 0.0004906350083783907, + "loss": 3.1955220699310303, + "step": 4539, + "token_acc": 0.27691811160892404 + }, + { + "epoch": 2.6613896218117854, + "grad_norm": 0.28780642673917695, + "learning_rate": 0.000490628437477538, + "loss": 3.1884398460388184, + "step": 4540, + "token_acc": 0.2774921809433601 + }, + { + "epoch": 2.6619759601289945, + "grad_norm": 0.2975992585428022, + "learning_rate": 0.0004906218643163054, + "loss": 3.144155263900757, + "step": 4541, + "token_acc": 0.2835971296249173 + }, + { + "epoch": 2.6625622984462036, + "grad_norm": 0.24173241915494628, + "learning_rate": 0.0004906152888947545, + "loss": 3.1317138671875, + "step": 4542, + "token_acc": 0.2846501195131679 + }, + { + "epoch": 2.6631486367634123, + "grad_norm": 0.23947610065546424, + "learning_rate": 0.0004906087112129474, + "loss": 3.094043731689453, + "step": 4543, + "token_acc": 0.28916481600630956 + }, + { + "epoch": 2.6637349750806214, + "grad_norm": 0.24447339786300398, + "learning_rate": 0.0004906021312709455, + "loss": 3.1463050842285156, + "step": 4544, + "token_acc": 0.2831048531906726 + }, + { + "epoch": 2.6643213133978305, + "grad_norm": 0.2389278386984395, + "learning_rate": 0.0004905955490688108, + "loss": 3.164994955062866, + "step": 4545, + "token_acc": 0.2803373551674138 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.25485866213899294, + "learning_rate": 0.0004905889646066052, + "loss": 3.1323490142822266, + "step": 4546, + "token_acc": 0.2866416998714396 + }, + { + "epoch": 2.6654939900322487, + "grad_norm": 0.2814253221596995, + "learning_rate": 0.0004905823778843905, + "loss": 3.167964220046997, + "step": 4547, + "token_acc": 0.27959950331690847 + }, + { + "epoch": 2.666080328349458, + "grad_norm": 0.28802634302210167, + "learning_rate": 0.0004905757889022284, + "loss": 3.1675868034362793, + "step": 4548, + "token_acc": 0.2797365161464191 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2531289993130524, + "learning_rate": 0.0004905691976601811, + "loss": 3.1019842624664307, + "step": 4549, + "token_acc": 0.2889767596246556 + }, + { + "epoch": 2.6672530049838756, + "grad_norm": 0.23377109164904225, + "learning_rate": 0.0004905626041583103, + "loss": 3.1520907878875732, + "step": 4550, + "token_acc": 0.281039814706986 + }, + { + "epoch": 2.6678393433010847, + "grad_norm": 0.3563028375675292, + "learning_rate": 0.0004905560083966781, + "loss": 3.1506714820861816, + "step": 4551, + "token_acc": 0.2815287429598057 + }, + { + "epoch": 2.668425681618294, + "grad_norm": 0.36833642449254844, + "learning_rate": 0.0004905494103753462, + "loss": 3.1473548412323, + "step": 4552, + "token_acc": 0.2816553328705106 + }, + { + "epoch": 2.6690120199355025, + "grad_norm": 0.350467542647802, + "learning_rate": 0.0004905428100943767, + "loss": 3.1355807781219482, + "step": 4553, + "token_acc": 0.2826411200863908 + }, + { + "epoch": 2.6695983582527116, + "grad_norm": 0.26992650613140834, + "learning_rate": 0.0004905362075538317, + "loss": 3.1598753929138184, + "step": 4554, + "token_acc": 0.28076013013901213 + }, + { + "epoch": 2.6701846965699207, + "grad_norm": 0.22727169119500837, + "learning_rate": 0.0004905296027537732, + "loss": 3.1340034008026123, + "step": 4555, + "token_acc": 0.28252492599798623 + }, + { + "epoch": 2.67077103488713, + "grad_norm": 0.29201391013962075, + "learning_rate": 0.0004905229956942632, + "loss": 3.1311614513397217, + "step": 4556, + "token_acc": 0.28352003012250715 + }, + { + "epoch": 2.671357373204339, + "grad_norm": 0.29073266274637083, + "learning_rate": 0.0004905163863753638, + "loss": 3.180220365524292, + "step": 4557, + "token_acc": 0.27701957290765666 + }, + { + "epoch": 2.671943711521548, + "grad_norm": 0.28905171135397717, + "learning_rate": 0.000490509774797137, + "loss": 3.122464179992676, + "step": 4558, + "token_acc": 0.2874867031894316 + }, + { + "epoch": 2.672530049838757, + "grad_norm": 0.26325935928183564, + "learning_rate": 0.0004905031609596449, + "loss": 3.1714792251586914, + "step": 4559, + "token_acc": 0.27866096271025603 + }, + { + "epoch": 2.673116388155966, + "grad_norm": 0.2375904715772175, + "learning_rate": 0.0004904965448629497, + "loss": 3.125209093093872, + "step": 4560, + "token_acc": 0.2841763703917682 + }, + { + "epoch": 2.673702726473175, + "grad_norm": 0.23241632054675374, + "learning_rate": 0.0004904899265071136, + "loss": 3.161773681640625, + "step": 4561, + "token_acc": 0.2812694291293833 + }, + { + "epoch": 2.674289064790384, + "grad_norm": 0.3073570840791119, + "learning_rate": 0.0004904833058921987, + "loss": 3.1765480041503906, + "step": 4562, + "token_acc": 0.2782499826482507 + }, + { + "epoch": 2.674875403107593, + "grad_norm": 0.38672690543600824, + "learning_rate": 0.0004904766830182672, + "loss": 3.187138080596924, + "step": 4563, + "token_acc": 0.2763471387647178 + }, + { + "epoch": 2.675461741424802, + "grad_norm": 0.2825192954051753, + "learning_rate": 0.0004904700578853813, + "loss": 3.1743669509887695, + "step": 4564, + "token_acc": 0.28068595043425376 + }, + { + "epoch": 2.676048079742011, + "grad_norm": 0.2615401041372074, + "learning_rate": 0.0004904634304936031, + "loss": 3.1787073612213135, + "step": 4565, + "token_acc": 0.2784521762974994 + }, + { + "epoch": 2.67663441805922, + "grad_norm": 0.33596458728121636, + "learning_rate": 0.0004904568008429951, + "loss": 3.1512062549591064, + "step": 4566, + "token_acc": 0.28083209509658247 + }, + { + "epoch": 2.677220756376429, + "grad_norm": 0.24547796900119753, + "learning_rate": 0.0004904501689336195, + "loss": 3.1426455974578857, + "step": 4567, + "token_acc": 0.283611214953271 + }, + { + "epoch": 2.6778070946936383, + "grad_norm": 0.2746946277535087, + "learning_rate": 0.0004904435347655386, + "loss": 3.2009925842285156, + "step": 4568, + "token_acc": 0.27325966447467914 + }, + { + "epoch": 2.6783934330108474, + "grad_norm": 0.2846392281299993, + "learning_rate": 0.0004904368983388147, + "loss": 3.1447372436523438, + "step": 4569, + "token_acc": 0.28445944856281213 + }, + { + "epoch": 2.6789797713280565, + "grad_norm": 0.25004107825723876, + "learning_rate": 0.0004904302596535101, + "loss": 3.1360507011413574, + "step": 4570, + "token_acc": 0.2825544050703015 + }, + { + "epoch": 2.679566109645265, + "grad_norm": 0.3093188081340262, + "learning_rate": 0.0004904236187096871, + "loss": 3.1342620849609375, + "step": 4571, + "token_acc": 0.2833793690681398 + }, + { + "epoch": 2.6801524479624743, + "grad_norm": 0.2983771198475082, + "learning_rate": 0.0004904169755074083, + "loss": 3.1560590267181396, + "step": 4572, + "token_acc": 0.2817252867186055 + }, + { + "epoch": 2.6807387862796834, + "grad_norm": 0.2611308246275719, + "learning_rate": 0.000490410330046736, + "loss": 3.109617233276367, + "step": 4573, + "token_acc": 0.2881494254743429 + }, + { + "epoch": 2.6813251245968925, + "grad_norm": 0.27114589782545484, + "learning_rate": 0.0004904036823277326, + "loss": 3.0936973094940186, + "step": 4574, + "token_acc": 0.2890335292028972 + }, + { + "epoch": 2.681911462914101, + "grad_norm": 0.302814370706426, + "learning_rate": 0.0004903970323504604, + "loss": 3.0958058834075928, + "step": 4575, + "token_acc": 0.28960546201494286 + }, + { + "epoch": 2.6824978012313103, + "grad_norm": 0.26994595165391616, + "learning_rate": 0.0004903903801149822, + "loss": 3.1496384143829346, + "step": 4576, + "token_acc": 0.28364542112351604 + }, + { + "epoch": 2.6830841395485194, + "grad_norm": 0.2281576336401177, + "learning_rate": 0.0004903837256213603, + "loss": 3.1455273628234863, + "step": 4577, + "token_acc": 0.2829372112360179 + }, + { + "epoch": 2.6836704778657285, + "grad_norm": 0.2955263179322018, + "learning_rate": 0.000490377068869657, + "loss": 3.111093044281006, + "step": 4578, + "token_acc": 0.2876562908245966 + }, + { + "epoch": 2.6842568161829377, + "grad_norm": 0.2849666885021753, + "learning_rate": 0.0004903704098599352, + "loss": 3.1576688289642334, + "step": 4579, + "token_acc": 0.2804136094211035 + }, + { + "epoch": 2.6848431545001468, + "grad_norm": 0.26541886628902084, + "learning_rate": 0.0004903637485922574, + "loss": 3.169520139694214, + "step": 4580, + "token_acc": 0.27816290705741376 + }, + { + "epoch": 2.685429492817356, + "grad_norm": 0.33017831212644083, + "learning_rate": 0.000490357085066686, + "loss": 3.171154499053955, + "step": 4581, + "token_acc": 0.27867526398631576 + }, + { + "epoch": 2.6860158311345645, + "grad_norm": 0.2925579502614907, + "learning_rate": 0.0004903504192832836, + "loss": 3.2160425186157227, + "step": 4582, + "token_acc": 0.2737966590411443 + }, + { + "epoch": 2.6866021694517737, + "grad_norm": 0.258739454691168, + "learning_rate": 0.000490343751242113, + "loss": 3.1443850994110107, + "step": 4583, + "token_acc": 0.28243598265438546 + }, + { + "epoch": 2.6871885077689828, + "grad_norm": 0.2750951372524947, + "learning_rate": 0.0004903370809432366, + "loss": 3.1276283264160156, + "step": 4584, + "token_acc": 0.2859750997297645 + }, + { + "epoch": 2.687774846086192, + "grad_norm": 0.28674035075631255, + "learning_rate": 0.0004903304083867173, + "loss": 3.131162643432617, + "step": 4585, + "token_acc": 0.2853075519688919 + }, + { + "epoch": 2.6883611844034006, + "grad_norm": 0.324133447793151, + "learning_rate": 0.0004903237335726177, + "loss": 3.173985481262207, + "step": 4586, + "token_acc": 0.2799420759839349 + }, + { + "epoch": 2.6889475227206097, + "grad_norm": 0.3442888658921696, + "learning_rate": 0.0004903170565010003, + "loss": 3.1555871963500977, + "step": 4587, + "token_acc": 0.2800369874630892 + }, + { + "epoch": 2.6895338610378188, + "grad_norm": 0.2159301624043389, + "learning_rate": 0.000490310377171928, + "loss": 3.1447908878326416, + "step": 4588, + "token_acc": 0.2826173820989704 + }, + { + "epoch": 2.690120199355028, + "grad_norm": 0.28127233528871115, + "learning_rate": 0.0004903036955854637, + "loss": 3.107553005218506, + "step": 4589, + "token_acc": 0.28865429811139903 + }, + { + "epoch": 2.690706537672237, + "grad_norm": 0.21470824304125144, + "learning_rate": 0.0004902970117416697, + "loss": 3.15939998626709, + "step": 4590, + "token_acc": 0.2798994241469104 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.29766054153090554, + "learning_rate": 0.0004902903256406093, + "loss": 3.1213431358337402, + "step": 4591, + "token_acc": 0.2871884084984325 + }, + { + "epoch": 2.6918792143066548, + "grad_norm": 0.2256292701692895, + "learning_rate": 0.000490283637282345, + "loss": 3.1266729831695557, + "step": 4592, + "token_acc": 0.28570079093968126 + }, + { + "epoch": 2.692465552623864, + "grad_norm": 0.28789147900489287, + "learning_rate": 0.0004902769466669398, + "loss": 3.156118392944336, + "step": 4593, + "token_acc": 0.28119190152687157 + }, + { + "epoch": 2.693051890941073, + "grad_norm": 0.23820974197935682, + "learning_rate": 0.0004902702537944565, + "loss": 3.078068494796753, + "step": 4594, + "token_acc": 0.29140178943094863 + }, + { + "epoch": 2.693638229258282, + "grad_norm": 0.27718561567821953, + "learning_rate": 0.0004902635586649578, + "loss": 3.116590738296509, + "step": 4595, + "token_acc": 0.2861796729055213 + }, + { + "epoch": 2.6942245675754912, + "grad_norm": 0.2979731067236039, + "learning_rate": 0.0004902568612785067, + "loss": 3.1874887943267822, + "step": 4596, + "token_acc": 0.2770002344522482 + }, + { + "epoch": 2.6948109058927, + "grad_norm": 0.2677133286999762, + "learning_rate": 0.0004902501616351663, + "loss": 3.160142660140991, + "step": 4597, + "token_acc": 0.2797148151731426 + }, + { + "epoch": 2.695397244209909, + "grad_norm": 0.3073354786713019, + "learning_rate": 0.0004902434597349993, + "loss": 3.149329900741577, + "step": 4598, + "token_acc": 0.2825338782533878 + }, + { + "epoch": 2.695983582527118, + "grad_norm": 0.2414383481175744, + "learning_rate": 0.0004902367555780688, + "loss": 3.155968427658081, + "step": 4599, + "token_acc": 0.28039113456688974 + }, + { + "epoch": 2.6965699208443272, + "grad_norm": 0.3153028703425432, + "learning_rate": 0.0004902300491644376, + "loss": 3.101053476333618, + "step": 4600, + "token_acc": 0.2883945882265929 + }, + { + "epoch": 2.6971562591615363, + "grad_norm": 0.23440077949971044, + "learning_rate": 0.0004902233404941688, + "loss": 3.142268657684326, + "step": 4601, + "token_acc": 0.28147943276677057 + }, + { + "epoch": 2.6977425974787455, + "grad_norm": 0.29717901517589745, + "learning_rate": 0.0004902166295673255, + "loss": 3.1640563011169434, + "step": 4602, + "token_acc": 0.2795592460842005 + }, + { + "epoch": 2.698328935795954, + "grad_norm": 0.28572570348458276, + "learning_rate": 0.0004902099163839706, + "loss": 3.1878528594970703, + "step": 4603, + "token_acc": 0.27611366033003676 + }, + { + "epoch": 2.6989152741131632, + "grad_norm": 0.2597955684897106, + "learning_rate": 0.0004902032009441672, + "loss": 3.1218440532684326, + "step": 4604, + "token_acc": 0.2876744446279847 + }, + { + "epoch": 2.6995016124303723, + "grad_norm": 0.2844928918649005, + "learning_rate": 0.0004901964832479785, + "loss": 3.07356595993042, + "step": 4605, + "token_acc": 0.2941308197062806 + }, + { + "epoch": 2.7000879507475815, + "grad_norm": 0.24699663085655166, + "learning_rate": 0.0004901897632954673, + "loss": 3.1109657287597656, + "step": 4606, + "token_acc": 0.2858095687831323 + }, + { + "epoch": 2.70067428906479, + "grad_norm": 0.26351736270608545, + "learning_rate": 0.0004901830410866972, + "loss": 3.118032455444336, + "step": 4607, + "token_acc": 0.2863909863923088 + }, + { + "epoch": 2.7012606273819992, + "grad_norm": 0.23599287588106688, + "learning_rate": 0.0004901763166217309, + "loss": 3.1285157203674316, + "step": 4608, + "token_acc": 0.28603870819373955 + }, + { + "epoch": 2.7018469656992083, + "grad_norm": 0.22522743040455692, + "learning_rate": 0.0004901695899006319, + "loss": 3.149538040161133, + "step": 4609, + "token_acc": 0.28281165967516403 + }, + { + "epoch": 2.7024333040164175, + "grad_norm": 0.22455951050007034, + "learning_rate": 0.0004901628609234631, + "loss": 3.0960569381713867, + "step": 4610, + "token_acc": 0.28850787022165114 + }, + { + "epoch": 2.7030196423336266, + "grad_norm": 0.23532155706212635, + "learning_rate": 0.0004901561296902879, + "loss": 3.1284103393554688, + "step": 4611, + "token_acc": 0.284315905425679 + }, + { + "epoch": 2.7036059806508357, + "grad_norm": 0.25515369741528093, + "learning_rate": 0.0004901493962011694, + "loss": 3.1549153327941895, + "step": 4612, + "token_acc": 0.2805373151299485 + }, + { + "epoch": 2.704192318968045, + "grad_norm": 0.24353652049388982, + "learning_rate": 0.000490142660456171, + "loss": 3.084451675415039, + "step": 4613, + "token_acc": 0.2913050298023633 + }, + { + "epoch": 2.7047786572852535, + "grad_norm": 0.26363219320596737, + "learning_rate": 0.0004901359224553559, + "loss": 3.133878707885742, + "step": 4614, + "token_acc": 0.2825645355233198 + }, + { + "epoch": 2.7053649956024626, + "grad_norm": 0.25453575566896036, + "learning_rate": 0.0004901291821987875, + "loss": 3.1414904594421387, + "step": 4615, + "token_acc": 0.28200538795362584 + }, + { + "epoch": 2.7059513339196717, + "grad_norm": 0.24546837521932802, + "learning_rate": 0.0004901224396865288, + "loss": 3.144693374633789, + "step": 4616, + "token_acc": 0.2831186992548684 + }, + { + "epoch": 2.706537672236881, + "grad_norm": 0.26714083224545776, + "learning_rate": 0.0004901156949186434, + "loss": 3.184706211090088, + "step": 4617, + "token_acc": 0.27719609766703857 + }, + { + "epoch": 2.7071240105540895, + "grad_norm": 0.2884246925258864, + "learning_rate": 0.0004901089478951947, + "loss": 3.179894208908081, + "step": 4618, + "token_acc": 0.2783122233422448 + }, + { + "epoch": 2.7077103488712986, + "grad_norm": 0.3870330747852714, + "learning_rate": 0.0004901021986162459, + "loss": 3.176281452178955, + "step": 4619, + "token_acc": 0.2770651549532268 + }, + { + "epoch": 2.7082966871885077, + "grad_norm": 0.4220802230232499, + "learning_rate": 0.0004900954470818606, + "loss": 3.1220810413360596, + "step": 4620, + "token_acc": 0.28444305160894445 + }, + { + "epoch": 2.708883025505717, + "grad_norm": 0.28847719772337294, + "learning_rate": 0.0004900886932921021, + "loss": 3.123826503753662, + "step": 4621, + "token_acc": 0.2850696876358798 + }, + { + "epoch": 2.709469363822926, + "grad_norm": 0.26718665787498486, + "learning_rate": 0.0004900819372470336, + "loss": 3.14339280128479, + "step": 4622, + "token_acc": 0.28174138540725313 + }, + { + "epoch": 2.710055702140135, + "grad_norm": 0.2683354956939335, + "learning_rate": 0.0004900751789467191, + "loss": 3.125194549560547, + "step": 4623, + "token_acc": 0.2853502367418167 + }, + { + "epoch": 2.710642040457344, + "grad_norm": 0.24074950324163233, + "learning_rate": 0.0004900684183912217, + "loss": 3.138235330581665, + "step": 4624, + "token_acc": 0.28499834360299475 + }, + { + "epoch": 2.711228378774553, + "grad_norm": 0.27774818337033985, + "learning_rate": 0.000490061655580605, + "loss": 3.133533477783203, + "step": 4625, + "token_acc": 0.28299663211485376 + }, + { + "epoch": 2.711814717091762, + "grad_norm": 0.2904127894817428, + "learning_rate": 0.0004900548905149325, + "loss": 3.175708293914795, + "step": 4626, + "token_acc": 0.27870902986177276 + }, + { + "epoch": 2.712401055408971, + "grad_norm": 0.2859313859396692, + "learning_rate": 0.0004900481231942678, + "loss": 3.1801204681396484, + "step": 4627, + "token_acc": 0.2763924753963368 + }, + { + "epoch": 2.71298739372618, + "grad_norm": 0.32398416472042446, + "learning_rate": 0.0004900413536186746, + "loss": 3.144838809967041, + "step": 4628, + "token_acc": 0.2836153848099409 + }, + { + "epoch": 2.713573732043389, + "grad_norm": 0.2796564939939551, + "learning_rate": 0.0004900345817882161, + "loss": 3.1292924880981445, + "step": 4629, + "token_acc": 0.28484225219556814 + }, + { + "epoch": 2.714160070360598, + "grad_norm": 0.30077931824830456, + "learning_rate": 0.0004900278077029563, + "loss": 3.1165008544921875, + "step": 4630, + "token_acc": 0.28470951509606585 + }, + { + "epoch": 2.714746408677807, + "grad_norm": 0.2949413818316535, + "learning_rate": 0.0004900210313629587, + "loss": 3.131425619125366, + "step": 4631, + "token_acc": 0.2845090283657069 + }, + { + "epoch": 2.715332746995016, + "grad_norm": 0.26149296501499775, + "learning_rate": 0.0004900142527682869, + "loss": 3.1609621047973633, + "step": 4632, + "token_acc": 0.28026562142212524 + }, + { + "epoch": 2.7159190853122253, + "grad_norm": 0.2542989178347821, + "learning_rate": 0.0004900074719190045, + "loss": 3.1593732833862305, + "step": 4633, + "token_acc": 0.28187610153353726 + }, + { + "epoch": 2.7165054236294344, + "grad_norm": 0.2808530973569041, + "learning_rate": 0.0004900006888151755, + "loss": 3.152329206466675, + "step": 4634, + "token_acc": 0.28108684375838994 + }, + { + "epoch": 2.7170917619466435, + "grad_norm": 0.2890188446726456, + "learning_rate": 0.0004899939034568633, + "loss": 3.1970624923706055, + "step": 4635, + "token_acc": 0.27567696310138873 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.29683707155963884, + "learning_rate": 0.0004899871158441319, + "loss": 3.1192421913146973, + "step": 4636, + "token_acc": 0.2858852418251896 + }, + { + "epoch": 2.7182644385810613, + "grad_norm": 0.29741058546887494, + "learning_rate": 0.0004899803259770448, + "loss": 3.1668338775634766, + "step": 4637, + "token_acc": 0.27941674165527625 + }, + { + "epoch": 2.7188507768982704, + "grad_norm": 0.29610847521435885, + "learning_rate": 0.000489973533855666, + "loss": 3.113586187362671, + "step": 4638, + "token_acc": 0.28603497930119165 + }, + { + "epoch": 2.7194371152154795, + "grad_norm": 0.3366273867720583, + "learning_rate": 0.0004899667394800592, + "loss": 3.1647963523864746, + "step": 4639, + "token_acc": 0.28175258860240826 + }, + { + "epoch": 2.720023453532688, + "grad_norm": 0.34250426163279474, + "learning_rate": 0.0004899599428502883, + "loss": 3.168461322784424, + "step": 4640, + "token_acc": 0.27693266395642757 + }, + { + "epoch": 2.7206097918498973, + "grad_norm": 0.2750664984680904, + "learning_rate": 0.000489953143966417, + "loss": 3.1305861473083496, + "step": 4641, + "token_acc": 0.2839380693281164 + }, + { + "epoch": 2.7211961301671064, + "grad_norm": 0.2470801226380168, + "learning_rate": 0.0004899463428285093, + "loss": 3.1503121852874756, + "step": 4642, + "token_acc": 0.28222943422051605 + }, + { + "epoch": 2.7217824684843155, + "grad_norm": 0.34691840004262736, + "learning_rate": 0.000489939539436629, + "loss": 3.1506032943725586, + "step": 4643, + "token_acc": 0.28079444004972315 + }, + { + "epoch": 2.7223688068015246, + "grad_norm": 0.3516749888249259, + "learning_rate": 0.0004899327337908402, + "loss": 3.141904354095459, + "step": 4644, + "token_acc": 0.28399655038547617 + }, + { + "epoch": 2.7229551451187337, + "grad_norm": 0.252177892636462, + "learning_rate": 0.0004899259258912065, + "loss": 3.148179531097412, + "step": 4645, + "token_acc": 0.28268363014295006 + }, + { + "epoch": 2.7235414834359424, + "grad_norm": 0.29009563463127364, + "learning_rate": 0.000489919115737792, + "loss": 3.148155689239502, + "step": 4646, + "token_acc": 0.2819458073294867 + }, + { + "epoch": 2.7241278217531515, + "grad_norm": 0.3206674979347932, + "learning_rate": 0.0004899123033306608, + "loss": 3.1484107971191406, + "step": 4647, + "token_acc": 0.28285912452153233 + }, + { + "epoch": 2.7247141600703606, + "grad_norm": 0.23996238680679377, + "learning_rate": 0.0004899054886698768, + "loss": 3.128343105316162, + "step": 4648, + "token_acc": 0.28616290379322473 + }, + { + "epoch": 2.7253004983875697, + "grad_norm": 0.27142357675306694, + "learning_rate": 0.000489898671755504, + "loss": 3.0969607830047607, + "step": 4649, + "token_acc": 0.2903362726950791 + }, + { + "epoch": 2.7258868367047784, + "grad_norm": 0.2900547923061283, + "learning_rate": 0.0004898918525876064, + "loss": 3.091984748840332, + "step": 4650, + "token_acc": 0.29108085914646786 + }, + { + "epoch": 2.7264731750219875, + "grad_norm": 0.22992646377485954, + "learning_rate": 0.0004898850311662482, + "loss": 3.0915329456329346, + "step": 4651, + "token_acc": 0.2891348308320256 + }, + { + "epoch": 2.7270595133391966, + "grad_norm": 0.23601135815877283, + "learning_rate": 0.0004898782074914933, + "loss": 3.1307356357574463, + "step": 4652, + "token_acc": 0.28399524961789835 + }, + { + "epoch": 2.7276458516564057, + "grad_norm": 0.23617799372227247, + "learning_rate": 0.0004898713815634059, + "loss": 3.1133649349212646, + "step": 4653, + "token_acc": 0.2859813932865183 + }, + { + "epoch": 2.728232189973615, + "grad_norm": 0.23585246452634517, + "learning_rate": 0.0004898645533820502, + "loss": 3.166503429412842, + "step": 4654, + "token_acc": 0.28026843984071426 + }, + { + "epoch": 2.728818528290824, + "grad_norm": 0.21685252388318935, + "learning_rate": 0.0004898577229474901, + "loss": 3.170147657394409, + "step": 4655, + "token_acc": 0.27952401306750524 + }, + { + "epoch": 2.729404866608033, + "grad_norm": 0.31649556891255637, + "learning_rate": 0.0004898508902597899, + "loss": 3.083954334259033, + "step": 4656, + "token_acc": 0.2900733946426995 + }, + { + "epoch": 2.7299912049252417, + "grad_norm": 0.3637913677699276, + "learning_rate": 0.0004898440553190139, + "loss": 3.147120952606201, + "step": 4657, + "token_acc": 0.28143762682789064 + }, + { + "epoch": 2.730577543242451, + "grad_norm": 0.36166290170017024, + "learning_rate": 0.0004898372181252261, + "loss": 3.122004747390747, + "step": 4658, + "token_acc": 0.28577327505036637 + }, + { + "epoch": 2.73116388155966, + "grad_norm": 0.23316732792858197, + "learning_rate": 0.0004898303786784909, + "loss": 3.108412504196167, + "step": 4659, + "token_acc": 0.2881640377889195 + }, + { + "epoch": 2.731750219876869, + "grad_norm": 0.29814462268874253, + "learning_rate": 0.0004898235369788724, + "loss": 3.1333141326904297, + "step": 4660, + "token_acc": 0.28487127040673144 + }, + { + "epoch": 2.7323365581940777, + "grad_norm": 0.2509210591832417, + "learning_rate": 0.000489816693026435, + "loss": 3.128150463104248, + "step": 4661, + "token_acc": 0.2850359496323651 + }, + { + "epoch": 2.732922896511287, + "grad_norm": 0.23572388201185432, + "learning_rate": 0.0004898098468212428, + "loss": 3.081077814102173, + "step": 4662, + "token_acc": 0.29224525070649554 + }, + { + "epoch": 2.733509234828496, + "grad_norm": 0.29483905246628833, + "learning_rate": 0.0004898029983633604, + "loss": 3.1461424827575684, + "step": 4663, + "token_acc": 0.2812102110250684 + }, + { + "epoch": 2.734095573145705, + "grad_norm": 0.20071425180809044, + "learning_rate": 0.0004897961476528519, + "loss": 3.1102137565612793, + "step": 4664, + "token_acc": 0.28715215199926686 + }, + { + "epoch": 2.734681911462914, + "grad_norm": 0.29503383800872274, + "learning_rate": 0.0004897892946897817, + "loss": 3.1386547088623047, + "step": 4665, + "token_acc": 0.2839117938936217 + }, + { + "epoch": 2.7352682497801233, + "grad_norm": 0.3034404017055964, + "learning_rate": 0.0004897824394742142, + "loss": 3.144566297531128, + "step": 4666, + "token_acc": 0.2834482413410647 + }, + { + "epoch": 2.7358545880973324, + "grad_norm": 0.21482594888669135, + "learning_rate": 0.0004897755820062139, + "loss": 3.1447761058807373, + "step": 4667, + "token_acc": 0.2832826142228443 + }, + { + "epoch": 2.736440926414541, + "grad_norm": 0.2793304598202907, + "learning_rate": 0.0004897687222858449, + "loss": 3.0738322734832764, + "step": 4668, + "token_acc": 0.29108797202432213 + }, + { + "epoch": 2.73702726473175, + "grad_norm": 0.22536273070786247, + "learning_rate": 0.000489761860313172, + "loss": 3.1731812953948975, + "step": 4669, + "token_acc": 0.27899361272854617 + }, + { + "epoch": 2.7376136030489593, + "grad_norm": 0.2599278124761073, + "learning_rate": 0.0004897549960882594, + "loss": 3.139139175415039, + "step": 4670, + "token_acc": 0.28286789153244957 + }, + { + "epoch": 2.7381999413661684, + "grad_norm": 0.22082128100025292, + "learning_rate": 0.0004897481296111718, + "loss": 3.098876476287842, + "step": 4671, + "token_acc": 0.28846851062845036 + }, + { + "epoch": 2.738786279683377, + "grad_norm": 0.26358662258456256, + "learning_rate": 0.0004897412608819736, + "loss": 3.1468658447265625, + "step": 4672, + "token_acc": 0.2808954277120144 + }, + { + "epoch": 2.739372618000586, + "grad_norm": 0.29519735422107757, + "learning_rate": 0.0004897343899007291, + "loss": 3.1617846488952637, + "step": 4673, + "token_acc": 0.2805192346128357 + }, + { + "epoch": 2.7399589563177953, + "grad_norm": 0.24847913735160262, + "learning_rate": 0.0004897275166675033, + "loss": 3.1215832233428955, + "step": 4674, + "token_acc": 0.28753905136767166 + }, + { + "epoch": 2.7405452946350044, + "grad_norm": 0.3282242401468146, + "learning_rate": 0.0004897206411823604, + "loss": 3.1645286083221436, + "step": 4675, + "token_acc": 0.2789327597523269 + }, + { + "epoch": 2.7411316329522135, + "grad_norm": 0.28710659048661724, + "learning_rate": 0.0004897137634453651, + "loss": 3.145056962966919, + "step": 4676, + "token_acc": 0.2833787614776619 + }, + { + "epoch": 2.7417179712694226, + "grad_norm": 0.2508308232827155, + "learning_rate": 0.0004897068834565821, + "loss": 3.127073287963867, + "step": 4677, + "token_acc": 0.2846760196702343 + }, + { + "epoch": 2.7423043095866317, + "grad_norm": 0.30398951874752256, + "learning_rate": 0.0004897000012160759, + "loss": 3.1692733764648438, + "step": 4678, + "token_acc": 0.2789690947614525 + }, + { + "epoch": 2.7428906479038404, + "grad_norm": 0.2953313476242557, + "learning_rate": 0.0004896931167239111, + "loss": 3.1600899696350098, + "step": 4679, + "token_acc": 0.28037793612554546 + }, + { + "epoch": 2.7434769862210495, + "grad_norm": 0.2468342939084421, + "learning_rate": 0.0004896862299801526, + "loss": 3.1086108684539795, + "step": 4680, + "token_acc": 0.2871841909532683 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.26562090552312306, + "learning_rate": 0.0004896793409848648, + "loss": 3.18031907081604, + "step": 4681, + "token_acc": 0.27814727879001344 + }, + { + "epoch": 2.7446496628554677, + "grad_norm": 0.23660266954358344, + "learning_rate": 0.0004896724497381127, + "loss": 3.1466221809387207, + "step": 4682, + "token_acc": 0.2816064040061401 + }, + { + "epoch": 2.7452360011726764, + "grad_norm": 0.23469188574664585, + "learning_rate": 0.0004896655562399608, + "loss": 3.0906708240509033, + "step": 4683, + "token_acc": 0.2874018080965005 + }, + { + "epoch": 2.7458223394898855, + "grad_norm": 0.23105903296739963, + "learning_rate": 0.000489658660490474, + "loss": 3.108341693878174, + "step": 4684, + "token_acc": 0.28833244089461924 + }, + { + "epoch": 2.7464086778070946, + "grad_norm": 0.23723851449580355, + "learning_rate": 0.000489651762489717, + "loss": 3.1508374214172363, + "step": 4685, + "token_acc": 0.2804935379411232 + }, + { + "epoch": 2.7469950161243037, + "grad_norm": 0.25478314914994793, + "learning_rate": 0.0004896448622377546, + "loss": 3.132269859313965, + "step": 4686, + "token_acc": 0.28439411159820316 + }, + { + "epoch": 2.747581354441513, + "grad_norm": 0.2585053572222541, + "learning_rate": 0.0004896379597346517, + "loss": 3.0948827266693115, + "step": 4687, + "token_acc": 0.2917751699849616 + }, + { + "epoch": 2.748167692758722, + "grad_norm": 0.24682417857298933, + "learning_rate": 0.000489631054980473, + "loss": 3.1234304904937744, + "step": 4688, + "token_acc": 0.28512647756060777 + }, + { + "epoch": 2.748754031075931, + "grad_norm": 0.24527136084390572, + "learning_rate": 0.0004896241479752835, + "loss": 3.154219627380371, + "step": 4689, + "token_acc": 0.28161860729631816 + }, + { + "epoch": 2.7493403693931397, + "grad_norm": 0.23419750139671655, + "learning_rate": 0.0004896172387191481, + "loss": 3.1275951862335205, + "step": 4690, + "token_acc": 0.2865577030638245 + }, + { + "epoch": 2.749926707710349, + "grad_norm": 0.23597819088616398, + "learning_rate": 0.0004896103272121315, + "loss": 3.116631031036377, + "step": 4691, + "token_acc": 0.28638339149166675 + }, + { + "epoch": 2.750513046027558, + "grad_norm": 0.25151531147525114, + "learning_rate": 0.0004896034134542989, + "loss": 3.1337738037109375, + "step": 4692, + "token_acc": 0.2840357459510196 + }, + { + "epoch": 2.751099384344767, + "grad_norm": 0.2735195578784183, + "learning_rate": 0.000489596497445715, + "loss": 3.167222499847412, + "step": 4693, + "token_acc": 0.27862573515284444 + }, + { + "epoch": 2.7516857226619758, + "grad_norm": 0.3585359974120776, + "learning_rate": 0.0004895895791864449, + "loss": 3.1827821731567383, + "step": 4694, + "token_acc": 0.27783848165505814 + }, + { + "epoch": 2.752272060979185, + "grad_norm": 0.5008751756653854, + "learning_rate": 0.0004895826586765535, + "loss": 3.138991594314575, + "step": 4695, + "token_acc": 0.28352407574814187 + }, + { + "epoch": 2.752858399296394, + "grad_norm": 0.3484440637997365, + "learning_rate": 0.000489575735916106, + "loss": 3.103484630584717, + "step": 4696, + "token_acc": 0.28816345524024367 + }, + { + "epoch": 2.753444737613603, + "grad_norm": 0.27816780930070484, + "learning_rate": 0.0004895688109051671, + "loss": 3.1396255493164062, + "step": 4697, + "token_acc": 0.28212498196628527 + }, + { + "epoch": 2.754031075930812, + "grad_norm": 0.3227452600003064, + "learning_rate": 0.0004895618836438022, + "loss": 3.185189962387085, + "step": 4698, + "token_acc": 0.2767657440876856 + }, + { + "epoch": 2.7546174142480213, + "grad_norm": 0.31382178378118714, + "learning_rate": 0.0004895549541320762, + "loss": 3.1338088512420654, + "step": 4699, + "token_acc": 0.2838295169941415 + }, + { + "epoch": 2.75520375256523, + "grad_norm": 0.23049711230321435, + "learning_rate": 0.0004895480223700542, + "loss": 3.142578601837158, + "step": 4700, + "token_acc": 0.2820820480558487 + }, + { + "epoch": 2.755790090882439, + "grad_norm": 0.31489964839209, + "learning_rate": 0.0004895410883578012, + "loss": 3.1371493339538574, + "step": 4701, + "token_acc": 0.2836753013593229 + }, + { + "epoch": 2.756376429199648, + "grad_norm": 0.2686036582480531, + "learning_rate": 0.0004895341520953826, + "loss": 3.1444923877716064, + "step": 4702, + "token_acc": 0.2824188603446453 + }, + { + "epoch": 2.7569627675168573, + "grad_norm": 0.2981379247343524, + "learning_rate": 0.0004895272135828634, + "loss": 3.1441211700439453, + "step": 4703, + "token_acc": 0.2825166085996841 + }, + { + "epoch": 2.757549105834066, + "grad_norm": 0.2601346819378336, + "learning_rate": 0.0004895202728203088, + "loss": 3.164188861846924, + "step": 4704, + "token_acc": 0.27944430027902695 + }, + { + "epoch": 2.758135444151275, + "grad_norm": 0.29451487480838134, + "learning_rate": 0.000489513329807784, + "loss": 3.1282501220703125, + "step": 4705, + "token_acc": 0.2865070585756426 + }, + { + "epoch": 2.758721782468484, + "grad_norm": 0.21241281390617445, + "learning_rate": 0.0004895063845453541, + "loss": 3.1644952297210693, + "step": 4706, + "token_acc": 0.2815866784284742 + }, + { + "epoch": 2.7593081207856933, + "grad_norm": 0.31226325523463877, + "learning_rate": 0.0004894994370330845, + "loss": 3.1245813369750977, + "step": 4707, + "token_acc": 0.28736516274270707 + }, + { + "epoch": 2.7598944591029024, + "grad_norm": 0.23877780682356678, + "learning_rate": 0.0004894924872710406, + "loss": 3.114713191986084, + "step": 4708, + "token_acc": 0.2862852482568282 + }, + { + "epoch": 2.7604807974201115, + "grad_norm": 0.2979531407373359, + "learning_rate": 0.0004894855352592873, + "loss": 3.1398301124572754, + "step": 4709, + "token_acc": 0.28429895651808773 + }, + { + "epoch": 2.7610671357373207, + "grad_norm": 0.27890177073715405, + "learning_rate": 0.0004894785809978902, + "loss": 3.1423983573913574, + "step": 4710, + "token_acc": 0.28212466796917873 + }, + { + "epoch": 2.7616534740545293, + "grad_norm": 0.28840477872329, + "learning_rate": 0.0004894716244869144, + "loss": 3.1969165802001953, + "step": 4711, + "token_acc": 0.2763451022750287 + }, + { + "epoch": 2.7622398123717384, + "grad_norm": 0.30006077725123764, + "learning_rate": 0.0004894646657264256, + "loss": 3.1501541137695312, + "step": 4712, + "token_acc": 0.28159570310037446 + }, + { + "epoch": 2.7628261506889475, + "grad_norm": 0.2574056239584595, + "learning_rate": 0.0004894577047164887, + "loss": 3.1437087059020996, + "step": 4713, + "token_acc": 0.2819671620966257 + }, + { + "epoch": 2.7634124890061567, + "grad_norm": 0.32790668941460493, + "learning_rate": 0.0004894507414571695, + "loss": 3.1586780548095703, + "step": 4714, + "token_acc": 0.2810597499414368 + }, + { + "epoch": 2.7639988273233653, + "grad_norm": 0.3137578036505134, + "learning_rate": 0.0004894437759485332, + "loss": 3.1431007385253906, + "step": 4715, + "token_acc": 0.281324909056098 + }, + { + "epoch": 2.7645851656405744, + "grad_norm": 0.273846201435247, + "learning_rate": 0.0004894368081906454, + "loss": 3.1213245391845703, + "step": 4716, + "token_acc": 0.285061088095464 + }, + { + "epoch": 2.7651715039577835, + "grad_norm": 0.2501106641797382, + "learning_rate": 0.0004894298381835713, + "loss": 3.1490840911865234, + "step": 4717, + "token_acc": 0.2831556850515342 + }, + { + "epoch": 2.7657578422749927, + "grad_norm": 0.22216573064279863, + "learning_rate": 0.0004894228659273765, + "loss": 3.126720905303955, + "step": 4718, + "token_acc": 0.2867807137919293 + }, + { + "epoch": 2.7663441805922018, + "grad_norm": 0.28318810862495697, + "learning_rate": 0.0004894158914221265, + "loss": 3.1314051151275635, + "step": 4719, + "token_acc": 0.28491857545991867 + }, + { + "epoch": 2.766930518909411, + "grad_norm": 0.2560482405355049, + "learning_rate": 0.0004894089146678869, + "loss": 3.1204075813293457, + "step": 4720, + "token_acc": 0.2870677545652889 + }, + { + "epoch": 2.76751685722662, + "grad_norm": 0.22524016270833025, + "learning_rate": 0.0004894019356647231, + "loss": 3.071620225906372, + "step": 4721, + "token_acc": 0.29332311230356456 + }, + { + "epoch": 2.7681031955438287, + "grad_norm": 0.272433271011628, + "learning_rate": 0.0004893949544127008, + "loss": 3.117757797241211, + "step": 4722, + "token_acc": 0.28637140867855754 + }, + { + "epoch": 2.7686895338610378, + "grad_norm": 0.3305912445740114, + "learning_rate": 0.0004893879709118853, + "loss": 3.1081907749176025, + "step": 4723, + "token_acc": 0.28894129328534196 + }, + { + "epoch": 2.769275872178247, + "grad_norm": 0.266289648154714, + "learning_rate": 0.0004893809851623425, + "loss": 3.1260695457458496, + "step": 4724, + "token_acc": 0.2857022663932271 + }, + { + "epoch": 2.769862210495456, + "grad_norm": 0.25903067837290356, + "learning_rate": 0.0004893739971641379, + "loss": 3.134542465209961, + "step": 4725, + "token_acc": 0.2837237881546508 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.31142384520845673, + "learning_rate": 0.0004893670069173371, + "loss": 3.160855531692505, + "step": 4726, + "token_acc": 0.2806040532494046 + }, + { + "epoch": 2.771034887129874, + "grad_norm": 0.24921372353066618, + "learning_rate": 0.0004893600144220059, + "loss": 3.1608972549438477, + "step": 4727, + "token_acc": 0.28055226393912935 + }, + { + "epoch": 2.771621225447083, + "grad_norm": 0.3406261263035801, + "learning_rate": 0.00048935301967821, + "loss": 3.1286606788635254, + "step": 4728, + "token_acc": 0.28479483816761414 + }, + { + "epoch": 2.772207563764292, + "grad_norm": 0.243917088945951, + "learning_rate": 0.0004893460226860149, + "loss": 3.1958253383636475, + "step": 4729, + "token_acc": 0.276421516586247 + }, + { + "epoch": 2.772793902081501, + "grad_norm": 0.33389709848379723, + "learning_rate": 0.0004893390234454864, + "loss": 3.1602582931518555, + "step": 4730, + "token_acc": 0.2792872350414012 + }, + { + "epoch": 2.7733802403987102, + "grad_norm": 0.31752815670651213, + "learning_rate": 0.0004893320219566904, + "loss": 3.095024347305298, + "step": 4731, + "token_acc": 0.2899858897742364 + }, + { + "epoch": 2.7739665787159193, + "grad_norm": 0.2852714205903597, + "learning_rate": 0.0004893250182196924, + "loss": 3.1193761825561523, + "step": 4732, + "token_acc": 0.2867515307852735 + }, + { + "epoch": 2.774552917033128, + "grad_norm": 0.29605496102111417, + "learning_rate": 0.0004893180122345585, + "loss": 3.1618452072143555, + "step": 4733, + "token_acc": 0.27959368970604925 + }, + { + "epoch": 2.775139255350337, + "grad_norm": 0.22579600572916014, + "learning_rate": 0.0004893110040013543, + "loss": 3.1116764545440674, + "step": 4734, + "token_acc": 0.28650340451482514 + }, + { + "epoch": 2.7757255936675462, + "grad_norm": 0.2535650210233001, + "learning_rate": 0.0004893039935201458, + "loss": 3.1361196041107178, + "step": 4735, + "token_acc": 0.2833049839917056 + }, + { + "epoch": 2.7763119319847553, + "grad_norm": 0.2198855230716398, + "learning_rate": 0.0004892969807909986, + "loss": 3.13472318649292, + "step": 4736, + "token_acc": 0.2840570958143328 + }, + { + "epoch": 2.776898270301964, + "grad_norm": 0.3185649597506807, + "learning_rate": 0.0004892899658139788, + "loss": 3.137021064758301, + "step": 4737, + "token_acc": 0.28396280218020264 + }, + { + "epoch": 2.777484608619173, + "grad_norm": 0.2423267319933476, + "learning_rate": 0.0004892829485891522, + "loss": 3.1698858737945557, + "step": 4738, + "token_acc": 0.27865405681930794 + }, + { + "epoch": 2.7780709469363822, + "grad_norm": 0.2486845724927719, + "learning_rate": 0.0004892759291165847, + "loss": 3.0912961959838867, + "step": 4739, + "token_acc": 0.2905722047824101 + }, + { + "epoch": 2.7786572852535913, + "grad_norm": 0.2258235165814888, + "learning_rate": 0.0004892689073963423, + "loss": 3.1047024726867676, + "step": 4740, + "token_acc": 0.2876001199057563 + }, + { + "epoch": 2.7792436235708005, + "grad_norm": 0.21219839238692192, + "learning_rate": 0.0004892618834284909, + "loss": 3.172163486480713, + "step": 4741, + "token_acc": 0.27929837076824626 + }, + { + "epoch": 2.7798299618880096, + "grad_norm": 0.24686201524315263, + "learning_rate": 0.0004892548572130966, + "loss": 3.1514899730682373, + "step": 4742, + "token_acc": 0.2816319848661234 + }, + { + "epoch": 2.7804163002052187, + "grad_norm": 0.22372404989621758, + "learning_rate": 0.0004892478287502252, + "loss": 3.1281960010528564, + "step": 4743, + "token_acc": 0.28440663419879847 + }, + { + "epoch": 2.7810026385224274, + "grad_norm": 0.2470546516877714, + "learning_rate": 0.0004892407980399429, + "loss": 3.128087043762207, + "step": 4744, + "token_acc": 0.283678534526177 + }, + { + "epoch": 2.7815889768396365, + "grad_norm": 0.23988344132263434, + "learning_rate": 0.0004892337650823157, + "loss": 3.113142490386963, + "step": 4745, + "token_acc": 0.2880073158217938 + }, + { + "epoch": 2.7821753151568456, + "grad_norm": 0.2126166486788284, + "learning_rate": 0.0004892267298774096, + "loss": 3.162776470184326, + "step": 4746, + "token_acc": 0.2809240175222968 + }, + { + "epoch": 2.7827616534740547, + "grad_norm": 0.24419246115066431, + "learning_rate": 0.0004892196924252908, + "loss": 3.1414027214050293, + "step": 4747, + "token_acc": 0.28242271641925826 + }, + { + "epoch": 2.7833479917912634, + "grad_norm": 0.29457160394824694, + "learning_rate": 0.0004892126527260253, + "loss": 3.1456260681152344, + "step": 4748, + "token_acc": 0.28155473506156736 + }, + { + "epoch": 2.7839343301084725, + "grad_norm": 0.3212524312582994, + "learning_rate": 0.0004892056107796793, + "loss": 3.147282600402832, + "step": 4749, + "token_acc": 0.280877109987605 + }, + { + "epoch": 2.7845206684256816, + "grad_norm": 0.3658091207040734, + "learning_rate": 0.0004891985665863189, + "loss": 3.112868547439575, + "step": 4750, + "token_acc": 0.28801362425244065 + }, + { + "epoch": 2.7851070067428907, + "grad_norm": 0.23358527190445688, + "learning_rate": 0.0004891915201460103, + "loss": 3.1768479347229004, + "step": 4751, + "token_acc": 0.2800974847646615 + }, + { + "epoch": 2.7856933450601, + "grad_norm": 0.40776014724302473, + "learning_rate": 0.0004891844714588196, + "loss": 3.1268935203552246, + "step": 4752, + "token_acc": 0.285677277041065 + }, + { + "epoch": 2.786279683377309, + "grad_norm": 0.33237398735911294, + "learning_rate": 0.0004891774205248133, + "loss": 3.120698928833008, + "step": 4753, + "token_acc": 0.2839557856669096 + }, + { + "epoch": 2.7868660216945176, + "grad_norm": 0.3027464081842109, + "learning_rate": 0.0004891703673440572, + "loss": 3.119347095489502, + "step": 4754, + "token_acc": 0.28429170467699527 + }, + { + "epoch": 2.7874523600117267, + "grad_norm": 0.3535387774499992, + "learning_rate": 0.0004891633119166179, + "loss": 3.1623423099517822, + "step": 4755, + "token_acc": 0.27947276863374815 + }, + { + "epoch": 2.788038698328936, + "grad_norm": 0.2852322447383937, + "learning_rate": 0.0004891562542425615, + "loss": 3.197279691696167, + "step": 4756, + "token_acc": 0.2746762601012241 + }, + { + "epoch": 2.788625036646145, + "grad_norm": 0.3025865257725178, + "learning_rate": 0.0004891491943219545, + "loss": 3.1487555503845215, + "step": 4757, + "token_acc": 0.2813431230165134 + }, + { + "epoch": 2.7892113749633536, + "grad_norm": 0.26241815870430996, + "learning_rate": 0.0004891421321548629, + "loss": 3.1276917457580566, + "step": 4758, + "token_acc": 0.28311717937883496 + }, + { + "epoch": 2.7897977132805627, + "grad_norm": 0.2570204720041931, + "learning_rate": 0.0004891350677413534, + "loss": 3.1447300910949707, + "step": 4759, + "token_acc": 0.2803467297911538 + }, + { + "epoch": 2.790384051597772, + "grad_norm": 0.2715480239119731, + "learning_rate": 0.000489128001081492, + "loss": 3.118472099304199, + "step": 4760, + "token_acc": 0.286356869080427 + }, + { + "epoch": 2.790970389914981, + "grad_norm": 0.26125443731071396, + "learning_rate": 0.0004891209321753454, + "loss": 3.0976710319519043, + "step": 4761, + "token_acc": 0.28823498585053975 + }, + { + "epoch": 2.79155672823219, + "grad_norm": 0.2540348084330082, + "learning_rate": 0.0004891138610229797, + "loss": 3.1242332458496094, + "step": 4762, + "token_acc": 0.2863199968650809 + }, + { + "epoch": 2.792143066549399, + "grad_norm": 0.2966605158790345, + "learning_rate": 0.0004891067876244616, + "loss": 3.1267249584198, + "step": 4763, + "token_acc": 0.285043304990485 + }, + { + "epoch": 2.7927294048666083, + "grad_norm": 0.26760063200786866, + "learning_rate": 0.0004890997119798574, + "loss": 3.11531138420105, + "step": 4764, + "token_acc": 0.2867565740968297 + }, + { + "epoch": 2.793315743183817, + "grad_norm": 0.2755861163539987, + "learning_rate": 0.0004890926340892337, + "loss": 3.1764094829559326, + "step": 4765, + "token_acc": 0.27950152121059135 + }, + { + "epoch": 2.793902081501026, + "grad_norm": 0.2501730141152054, + "learning_rate": 0.0004890855539526567, + "loss": 3.1275932788848877, + "step": 4766, + "token_acc": 0.2851521061326907 + }, + { + "epoch": 2.794488419818235, + "grad_norm": 0.21243290555023056, + "learning_rate": 0.0004890784715701933, + "loss": 3.1366515159606934, + "step": 4767, + "token_acc": 0.28415683890261617 + }, + { + "epoch": 2.7950747581354443, + "grad_norm": 0.24661673490560215, + "learning_rate": 0.0004890713869419097, + "loss": 3.15203857421875, + "step": 4768, + "token_acc": 0.2808588189566189 + }, + { + "epoch": 2.795661096452653, + "grad_norm": 0.2272899537043174, + "learning_rate": 0.0004890643000678725, + "loss": 3.1364526748657227, + "step": 4769, + "token_acc": 0.28260287774314413 + }, + { + "epoch": 2.796247434769862, + "grad_norm": 0.2599063054148905, + "learning_rate": 0.0004890572109481485, + "loss": 3.1675479412078857, + "step": 4770, + "token_acc": 0.2796374963048929 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.2830031105603673, + "learning_rate": 0.0004890501195828039, + "loss": 3.178873062133789, + "step": 4771, + "token_acc": 0.27720935625054344 + }, + { + "epoch": 2.7974201114042803, + "grad_norm": 0.2432977124445569, + "learning_rate": 0.0004890430259719058, + "loss": 3.15281343460083, + "step": 4772, + "token_acc": 0.2817670969898106 + }, + { + "epoch": 2.7980064497214894, + "grad_norm": 0.2681937681555498, + "learning_rate": 0.0004890359301155205, + "loss": 3.1359658241271973, + "step": 4773, + "token_acc": 0.2841669557664145 + }, + { + "epoch": 2.7985927880386985, + "grad_norm": 0.2744038421293905, + "learning_rate": 0.0004890288320137147, + "loss": 3.155168056488037, + "step": 4774, + "token_acc": 0.2806380692966581 + }, + { + "epoch": 2.7991791263559076, + "grad_norm": 0.2725403939948629, + "learning_rate": 0.0004890217316665552, + "loss": 3.1517701148986816, + "step": 4775, + "token_acc": 0.28120835642390424 + }, + { + "epoch": 2.7997654646731163, + "grad_norm": 0.21723623310345797, + "learning_rate": 0.0004890146290741086, + "loss": 3.130082368850708, + "step": 4776, + "token_acc": 0.2843410521252288 + }, + { + "epoch": 2.8003518029903254, + "grad_norm": 0.20437789964757797, + "learning_rate": 0.0004890075242364415, + "loss": 3.1507952213287354, + "step": 4777, + "token_acc": 0.2810131615357543 + }, + { + "epoch": 2.8009381413075345, + "grad_norm": 0.26381051439657055, + "learning_rate": 0.000489000417153621, + "loss": 3.163231134414673, + "step": 4778, + "token_acc": 0.2783015543832803 + }, + { + "epoch": 2.8015244796247436, + "grad_norm": 0.2637321701196472, + "learning_rate": 0.0004889933078257134, + "loss": 3.1180357933044434, + "step": 4779, + "token_acc": 0.28675394850656216 + }, + { + "epoch": 2.8021108179419523, + "grad_norm": 0.22620770781136632, + "learning_rate": 0.000488986196252786, + "loss": 3.1661171913146973, + "step": 4780, + "token_acc": 0.2811723376387216 + }, + { + "epoch": 2.8026971562591614, + "grad_norm": 0.24031152012616444, + "learning_rate": 0.0004889790824349051, + "loss": 3.1385445594787598, + "step": 4781, + "token_acc": 0.2838351694462471 + }, + { + "epoch": 2.8032834945763705, + "grad_norm": 0.32767669290900836, + "learning_rate": 0.0004889719663721378, + "loss": 3.116870880126953, + "step": 4782, + "token_acc": 0.28447842034541976 + }, + { + "epoch": 2.8038698328935796, + "grad_norm": 0.4466538165430876, + "learning_rate": 0.000488964848064551, + "loss": 3.1482481956481934, + "step": 4783, + "token_acc": 0.28074214732178177 + }, + { + "epoch": 2.8044561712107887, + "grad_norm": 0.3965562260027191, + "learning_rate": 0.0004889577275122113, + "loss": 3.1394331455230713, + "step": 4784, + "token_acc": 0.2827249396814971 + }, + { + "epoch": 2.805042509527998, + "grad_norm": 0.26538141496841916, + "learning_rate": 0.0004889506047151858, + "loss": 3.1387925148010254, + "step": 4785, + "token_acc": 0.28311314216942346 + }, + { + "epoch": 2.805628847845207, + "grad_norm": 0.3218794602907884, + "learning_rate": 0.0004889434796735415, + "loss": 3.11043381690979, + "step": 4786, + "token_acc": 0.2868546126660438 + }, + { + "epoch": 2.8062151861624156, + "grad_norm": 0.2788341858801879, + "learning_rate": 0.000488936352387345, + "loss": 3.107247829437256, + "step": 4787, + "token_acc": 0.28618070798266015 + }, + { + "epoch": 2.8068015244796247, + "grad_norm": 0.2812183411656589, + "learning_rate": 0.0004889292228566635, + "loss": 3.1420722007751465, + "step": 4788, + "token_acc": 0.28230795038740186 + }, + { + "epoch": 2.807387862796834, + "grad_norm": 0.2996716694326207, + "learning_rate": 0.000488922091081564, + "loss": 3.142725944519043, + "step": 4789, + "token_acc": 0.2827711701391617 + }, + { + "epoch": 2.807974201114043, + "grad_norm": 0.2803489956355042, + "learning_rate": 0.0004889149570621133, + "loss": 3.1395883560180664, + "step": 4790, + "token_acc": 0.28265799916539014 + }, + { + "epoch": 2.8085605394312516, + "grad_norm": 0.29353423634237885, + "learning_rate": 0.0004889078207983785, + "loss": 3.131152629852295, + "step": 4791, + "token_acc": 0.28405726715198737 + }, + { + "epoch": 2.8091468777484607, + "grad_norm": 0.24898360870191433, + "learning_rate": 0.0004889006822904268, + "loss": 3.1402578353881836, + "step": 4792, + "token_acc": 0.2809907985973793 + }, + { + "epoch": 2.80973321606567, + "grad_norm": 0.2201221802705435, + "learning_rate": 0.000488893541538325, + "loss": 3.1404025554656982, + "step": 4793, + "token_acc": 0.28434244397640435 + }, + { + "epoch": 2.810319554382879, + "grad_norm": 0.26767496824734044, + "learning_rate": 0.0004888863985421403, + "loss": 3.138634204864502, + "step": 4794, + "token_acc": 0.282695086246518 + }, + { + "epoch": 2.810905892700088, + "grad_norm": 0.21128748436489886, + "learning_rate": 0.0004888792533019398, + "loss": 3.1629817485809326, + "step": 4795, + "token_acc": 0.2790732630688607 + }, + { + "epoch": 2.811492231017297, + "grad_norm": 0.2497509363202678, + "learning_rate": 0.0004888721058177905, + "loss": 3.114626884460449, + "step": 4796, + "token_acc": 0.2859989712918042 + }, + { + "epoch": 2.8120785693345063, + "grad_norm": 0.3002243938230459, + "learning_rate": 0.0004888649560897599, + "loss": 3.0750510692596436, + "step": 4797, + "token_acc": 0.2924205562805785 + }, + { + "epoch": 2.812664907651715, + "grad_norm": 0.26098159775598884, + "learning_rate": 0.0004888578041179147, + "loss": 3.166616916656494, + "step": 4798, + "token_acc": 0.2789171986134225 + }, + { + "epoch": 2.813251245968924, + "grad_norm": 0.221920530249086, + "learning_rate": 0.0004888506499023224, + "loss": 3.1472902297973633, + "step": 4799, + "token_acc": 0.28272709234078996 + }, + { + "epoch": 2.813837584286133, + "grad_norm": 0.2930919568299991, + "learning_rate": 0.00048884349344305, + "loss": 3.136810302734375, + "step": 4800, + "token_acc": 0.28461438503021247 + }, + { + "epoch": 2.8144239226033423, + "grad_norm": 0.29130884409643953, + "learning_rate": 0.0004888363347401649, + "loss": 3.1434431076049805, + "step": 4801, + "token_acc": 0.28241469949846437 + }, + { + "epoch": 2.815010260920551, + "grad_norm": 0.2553519462451598, + "learning_rate": 0.0004888291737937343, + "loss": 3.117910861968994, + "step": 4802, + "token_acc": 0.28457531540716197 + }, + { + "epoch": 2.81559659923776, + "grad_norm": 0.27663672539583145, + "learning_rate": 0.0004888220106038254, + "loss": 3.13838529586792, + "step": 4803, + "token_acc": 0.2851304366039356 + }, + { + "epoch": 2.816182937554969, + "grad_norm": 0.2830561135618201, + "learning_rate": 0.0004888148451705055, + "loss": 3.092414617538452, + "step": 4804, + "token_acc": 0.28795570512387064 + }, + { + "epoch": 2.8167692758721783, + "grad_norm": 0.259522735853877, + "learning_rate": 0.000488807677493842, + "loss": 3.1096653938293457, + "step": 4805, + "token_acc": 0.28804866907520454 + }, + { + "epoch": 2.8173556141893874, + "grad_norm": 0.2950226067418885, + "learning_rate": 0.0004888005075739021, + "loss": 3.1235463619232178, + "step": 4806, + "token_acc": 0.2838370191649467 + }, + { + "epoch": 2.8179419525065965, + "grad_norm": 0.2906369915166881, + "learning_rate": 0.0004887933354107532, + "loss": 3.1253035068511963, + "step": 4807, + "token_acc": 0.2860261006594012 + }, + { + "epoch": 2.818528290823805, + "grad_norm": 0.2727124794917412, + "learning_rate": 0.0004887861610044628, + "loss": 3.177701234817505, + "step": 4808, + "token_acc": 0.27767760451236523 + }, + { + "epoch": 2.8191146291410143, + "grad_norm": 0.2405233197068983, + "learning_rate": 0.0004887789843550981, + "loss": 3.103774070739746, + "step": 4809, + "token_acc": 0.2872932277019881 + }, + { + "epoch": 2.8197009674582234, + "grad_norm": 0.2369185188193357, + "learning_rate": 0.0004887718054627266, + "loss": 3.108470916748047, + "step": 4810, + "token_acc": 0.2888774175966711 + }, + { + "epoch": 2.8202873057754325, + "grad_norm": 0.21363715161438507, + "learning_rate": 0.0004887646243274158, + "loss": 3.1599059104919434, + "step": 4811, + "token_acc": 0.2792772883835767 + }, + { + "epoch": 2.820873644092641, + "grad_norm": 0.21772661749902666, + "learning_rate": 0.000488757440949233, + "loss": 3.1554532051086426, + "step": 4812, + "token_acc": 0.28124307146552363 + }, + { + "epoch": 2.8214599824098503, + "grad_norm": 0.3022079624029741, + "learning_rate": 0.0004887502553282459, + "loss": 3.1413803100585938, + "step": 4813, + "token_acc": 0.28335416300856253 + }, + { + "epoch": 2.8220463207270594, + "grad_norm": 0.36893414015354886, + "learning_rate": 0.0004887430674645218, + "loss": 3.139862537384033, + "step": 4814, + "token_acc": 0.28398390434700305 + }, + { + "epoch": 2.8226326590442685, + "grad_norm": 0.2524035940052976, + "learning_rate": 0.0004887358773581283, + "loss": 3.1565520763397217, + "step": 4815, + "token_acc": 0.28279112691514574 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.2266042514209356, + "learning_rate": 0.0004887286850091329, + "loss": 3.140373945236206, + "step": 4816, + "token_acc": 0.28330737540192924 + }, + { + "epoch": 2.8238053356786867, + "grad_norm": 0.2962740093235684, + "learning_rate": 0.0004887214904176032, + "loss": 3.134408950805664, + "step": 4817, + "token_acc": 0.28433784337843376 + }, + { + "epoch": 2.824391673995896, + "grad_norm": 0.22089683755830478, + "learning_rate": 0.0004887142935836069, + "loss": 3.138504981994629, + "step": 4818, + "token_acc": 0.2825005670160189 + }, + { + "epoch": 2.8249780123131045, + "grad_norm": 0.2637003735275535, + "learning_rate": 0.0004887070945072113, + "loss": 3.1459946632385254, + "step": 4819, + "token_acc": 0.28209072359127296 + }, + { + "epoch": 2.8255643506303136, + "grad_norm": 0.24770239416044743, + "learning_rate": 0.0004886998931884843, + "loss": 3.13864803314209, + "step": 4820, + "token_acc": 0.2836980760255147 + }, + { + "epoch": 2.8261506889475227, + "grad_norm": 0.23454315326146682, + "learning_rate": 0.0004886926896274934, + "loss": 3.113605499267578, + "step": 4821, + "token_acc": 0.28490206600482965 + }, + { + "epoch": 2.826737027264732, + "grad_norm": 0.2651431563404158, + "learning_rate": 0.0004886854838243065, + "loss": 3.0919008255004883, + "step": 4822, + "token_acc": 0.2896715411902811 + }, + { + "epoch": 2.8273233655819405, + "grad_norm": 0.24038376264708838, + "learning_rate": 0.0004886782757789909, + "loss": 3.102668285369873, + "step": 4823, + "token_acc": 0.28718365066017865 + }, + { + "epoch": 2.8279097038991496, + "grad_norm": 0.24144183990414794, + "learning_rate": 0.0004886710654916146, + "loss": 3.1367955207824707, + "step": 4824, + "token_acc": 0.2845948606362035 + }, + { + "epoch": 2.8284960422163588, + "grad_norm": 0.2755659530292548, + "learning_rate": 0.0004886638529622453, + "loss": 3.163353204727173, + "step": 4825, + "token_acc": 0.27903680057562147 + }, + { + "epoch": 2.829082380533568, + "grad_norm": 0.25532211776843117, + "learning_rate": 0.0004886566381909507, + "loss": 3.1461715698242188, + "step": 4826, + "token_acc": 0.28233120901450076 + }, + { + "epoch": 2.829668718850777, + "grad_norm": 0.23827733094897113, + "learning_rate": 0.0004886494211777984, + "loss": 3.1009912490844727, + "step": 4827, + "token_acc": 0.28933638183225835 + }, + { + "epoch": 2.830255057167986, + "grad_norm": 0.2877732760477564, + "learning_rate": 0.0004886422019228565, + "loss": 3.1174845695495605, + "step": 4828, + "token_acc": 0.28412332439678284 + }, + { + "epoch": 2.830841395485195, + "grad_norm": 0.31576323982471777, + "learning_rate": 0.0004886349804261928, + "loss": 3.1727046966552734, + "step": 4829, + "token_acc": 0.27816600798719143 + }, + { + "epoch": 2.831427733802404, + "grad_norm": 0.2745040491637979, + "learning_rate": 0.000488627756687875, + "loss": 3.0889248847961426, + "step": 4830, + "token_acc": 0.29038323758386403 + }, + { + "epoch": 2.832014072119613, + "grad_norm": 0.2527200821652121, + "learning_rate": 0.0004886205307079708, + "loss": 3.1660141944885254, + "step": 4831, + "token_acc": 0.2809941638699069 + }, + { + "epoch": 2.832600410436822, + "grad_norm": 0.24994001644372724, + "learning_rate": 0.0004886133024865483, + "loss": 3.097322702407837, + "step": 4832, + "token_acc": 0.2899228869719839 + }, + { + "epoch": 2.833186748754031, + "grad_norm": 0.2772194207028575, + "learning_rate": 0.0004886060720236755, + "loss": 3.113114595413208, + "step": 4833, + "token_acc": 0.287156201779759 + }, + { + "epoch": 2.83377308707124, + "grad_norm": 0.3463963017644551, + "learning_rate": 0.0004885988393194201, + "loss": 3.1332573890686035, + "step": 4834, + "token_acc": 0.28390824478147597 + }, + { + "epoch": 2.834359425388449, + "grad_norm": 0.3308777573161761, + "learning_rate": 0.00048859160437385, + "loss": 3.1541991233825684, + "step": 4835, + "token_acc": 0.2819822501912031 + }, + { + "epoch": 2.834945763705658, + "grad_norm": 0.27038624227132807, + "learning_rate": 0.0004885843671870335, + "loss": 3.08933687210083, + "step": 4836, + "token_acc": 0.2914752679145297 + }, + { + "epoch": 2.835532102022867, + "grad_norm": 0.28604992831551707, + "learning_rate": 0.0004885771277590383, + "loss": 3.156747817993164, + "step": 4837, + "token_acc": 0.28124876517029457 + }, + { + "epoch": 2.8361184403400763, + "grad_norm": 0.30495858998986786, + "learning_rate": 0.0004885698860899324, + "loss": 3.1565260887145996, + "step": 4838, + "token_acc": 0.2806578504697083 + }, + { + "epoch": 2.8367047786572854, + "grad_norm": 0.2666780861802044, + "learning_rate": 0.000488562642179784, + "loss": 3.1229500770568848, + "step": 4839, + "token_acc": 0.2853679921605034 + }, + { + "epoch": 2.8372911169744945, + "grad_norm": 0.27107225996601786, + "learning_rate": 0.0004885553960286609, + "loss": 3.1419596672058105, + "step": 4840, + "token_acc": 0.2810470033896171 + }, + { + "epoch": 2.837877455291703, + "grad_norm": 0.34543296921476974, + "learning_rate": 0.0004885481476366314, + "loss": 3.133160352706909, + "step": 4841, + "token_acc": 0.28292354944876696 + }, + { + "epoch": 2.8384637936089123, + "grad_norm": 0.25788836693040357, + "learning_rate": 0.0004885408970037636, + "loss": 3.1340599060058594, + "step": 4842, + "token_acc": 0.28384641269323324 + }, + { + "epoch": 2.8390501319261214, + "grad_norm": 0.3177809911161217, + "learning_rate": 0.0004885336441301253, + "loss": 3.1265523433685303, + "step": 4843, + "token_acc": 0.28330160714697467 + }, + { + "epoch": 2.8396364702433305, + "grad_norm": 0.25404191643852275, + "learning_rate": 0.000488526389015785, + "loss": 3.0653738975524902, + "step": 4844, + "token_acc": 0.2939909337612625 + }, + { + "epoch": 2.840222808560539, + "grad_norm": 0.3173571984748071, + "learning_rate": 0.0004885191316608106, + "loss": 3.137275457382202, + "step": 4845, + "token_acc": 0.2822794376726634 + }, + { + "epoch": 2.8408091468777483, + "grad_norm": 0.30475774595429855, + "learning_rate": 0.0004885118720652704, + "loss": 3.1285552978515625, + "step": 4846, + "token_acc": 0.28453220075093605 + }, + { + "epoch": 2.8413954851949574, + "grad_norm": 0.2824073090441585, + "learning_rate": 0.0004885046102292327, + "loss": 3.094118595123291, + "step": 4847, + "token_acc": 0.29059716111378864 + }, + { + "epoch": 2.8419818235121665, + "grad_norm": 0.2827726405057913, + "learning_rate": 0.0004884973461527654, + "loss": 3.171477794647217, + "step": 4848, + "token_acc": 0.278738587223939 + }, + { + "epoch": 2.8425681618293757, + "grad_norm": 0.27057062522085346, + "learning_rate": 0.000488490079835937, + "loss": 3.169370412826538, + "step": 4849, + "token_acc": 0.2782568996184032 + }, + { + "epoch": 2.8431545001465848, + "grad_norm": 0.29739606626571946, + "learning_rate": 0.0004884828112788155, + "loss": 3.1766932010650635, + "step": 4850, + "token_acc": 0.27792062306625415 + }, + { + "epoch": 2.843740838463794, + "grad_norm": 0.27094101805440496, + "learning_rate": 0.0004884755404814695, + "loss": 3.1282691955566406, + "step": 4851, + "token_acc": 0.28499901107862874 + }, + { + "epoch": 2.8443271767810026, + "grad_norm": 0.2721023355385571, + "learning_rate": 0.0004884682674439672, + "loss": 3.154409885406494, + "step": 4852, + "token_acc": 0.28127260874567 + }, + { + "epoch": 2.8449135150982117, + "grad_norm": 0.27513560094676087, + "learning_rate": 0.0004884609921663767, + "loss": 3.104313850402832, + "step": 4853, + "token_acc": 0.28854601863651275 + }, + { + "epoch": 2.8454998534154208, + "grad_norm": 0.23135231606183784, + "learning_rate": 0.0004884537146487666, + "loss": 3.135647773742676, + "step": 4854, + "token_acc": 0.2824167587687539 + }, + { + "epoch": 2.84608619173263, + "grad_norm": 0.28390059504146725, + "learning_rate": 0.0004884464348912052, + "loss": 3.1599316596984863, + "step": 4855, + "token_acc": 0.28203571409797457 + }, + { + "epoch": 2.8466725300498386, + "grad_norm": 0.2602915882825771, + "learning_rate": 0.0004884391528937608, + "loss": 3.12893009185791, + "step": 4856, + "token_acc": 0.28457589959697005 + }, + { + "epoch": 2.8472588683670477, + "grad_norm": 0.29280848821246713, + "learning_rate": 0.0004884318686565019, + "loss": 3.1245017051696777, + "step": 4857, + "token_acc": 0.2845064718536167 + }, + { + "epoch": 2.847845206684257, + "grad_norm": 0.20905816049308512, + "learning_rate": 0.0004884245821794969, + "loss": 3.1666455268859863, + "step": 4858, + "token_acc": 0.2779463493169673 + }, + { + "epoch": 2.848431545001466, + "grad_norm": 0.3349393279738938, + "learning_rate": 0.0004884172934628142, + "loss": 3.1396267414093018, + "step": 4859, + "token_acc": 0.2836357776515317 + }, + { + "epoch": 2.849017883318675, + "grad_norm": 0.25368987618105154, + "learning_rate": 0.0004884100025065223, + "loss": 3.1463000774383545, + "step": 4860, + "token_acc": 0.28435287640641116 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.2991508483751611, + "learning_rate": 0.0004884027093106896, + "loss": 3.1236701011657715, + "step": 4861, + "token_acc": 0.2827968164178123 + }, + { + "epoch": 2.850190559953093, + "grad_norm": 0.29225931230625884, + "learning_rate": 0.0004883954138753849, + "loss": 3.1588516235351562, + "step": 4862, + "token_acc": 0.2795737076556511 + }, + { + "epoch": 2.850776898270302, + "grad_norm": 0.2603450766875072, + "learning_rate": 0.0004883881162006763, + "loss": 3.139554500579834, + "step": 4863, + "token_acc": 0.284038429764378 + }, + { + "epoch": 2.851363236587511, + "grad_norm": 0.27184290452018395, + "learning_rate": 0.0004883808162866328, + "loss": 3.087451934814453, + "step": 4864, + "token_acc": 0.2896082729163154 + }, + { + "epoch": 2.85194957490472, + "grad_norm": 0.2381407118896836, + "learning_rate": 0.0004883735141333227, + "loss": 3.1246142387390137, + "step": 4865, + "token_acc": 0.28369183378519514 + }, + { + "epoch": 2.852535913221929, + "grad_norm": 0.2615941572052395, + "learning_rate": 0.0004883662097408145, + "loss": 3.1363883018493652, + "step": 4866, + "token_acc": 0.28562646170910744 + }, + { + "epoch": 2.853122251539138, + "grad_norm": 0.22284654722782624, + "learning_rate": 0.0004883589031091771, + "loss": 3.078174114227295, + "step": 4867, + "token_acc": 0.29076180115588834 + }, + { + "epoch": 2.853708589856347, + "grad_norm": 0.27795248716023313, + "learning_rate": 0.0004883515942384789, + "loss": 3.164367198944092, + "step": 4868, + "token_acc": 0.2793844975098966 + }, + { + "epoch": 2.854294928173556, + "grad_norm": 0.225008912028625, + "learning_rate": 0.0004883442831287888, + "loss": 3.1500251293182373, + "step": 4869, + "token_acc": 0.28182562685965584 + }, + { + "epoch": 2.8548812664907652, + "grad_norm": 0.23970016918237333, + "learning_rate": 0.0004883369697801751, + "loss": 3.2006447315216064, + "step": 4870, + "token_acc": 0.2736985012226259 + }, + { + "epoch": 2.8554676048079743, + "grad_norm": 0.2327864579120954, + "learning_rate": 0.000488329654192707, + "loss": 3.172611713409424, + "step": 4871, + "token_acc": 0.2788710984199706 + }, + { + "epoch": 2.8560539431251835, + "grad_norm": 0.2664719904586091, + "learning_rate": 0.0004883223363664527, + "loss": 3.1084370613098145, + "step": 4872, + "token_acc": 0.2863302503753478 + }, + { + "epoch": 2.856640281442392, + "grad_norm": 0.24426386549479084, + "learning_rate": 0.0004883150163014814, + "loss": 3.13490629196167, + "step": 4873, + "token_acc": 0.28431476326068317 + }, + { + "epoch": 2.8572266197596012, + "grad_norm": 0.2399171842575517, + "learning_rate": 0.0004883076939978616, + "loss": 3.113438606262207, + "step": 4874, + "token_acc": 0.28580018419243286 + }, + { + "epoch": 2.8578129580768104, + "grad_norm": 0.30837687459020247, + "learning_rate": 0.000488300369455662, + "loss": 3.1295883655548096, + "step": 4875, + "token_acc": 0.2841399843246172 + }, + { + "epoch": 2.8583992963940195, + "grad_norm": 0.24295209598079248, + "learning_rate": 0.0004882930426749517, + "loss": 3.1296138763427734, + "step": 4876, + "token_acc": 0.28432564099956265 + }, + { + "epoch": 2.858985634711228, + "grad_norm": 0.2721329931576096, + "learning_rate": 0.0004882857136557994, + "loss": 3.153696298599243, + "step": 4877, + "token_acc": 0.28128367678382604 + }, + { + "epoch": 2.8595719730284372, + "grad_norm": 0.29408264410763685, + "learning_rate": 0.00048827838239827383, + "loss": 3.162203788757324, + "step": 4878, + "token_acc": 0.2799060719642149 + }, + { + "epoch": 2.8601583113456464, + "grad_norm": 0.24398074797271935, + "learning_rate": 0.0004882710489024439, + "loss": 3.126344919204712, + "step": 4879, + "token_acc": 0.28400916516104757 + }, + { + "epoch": 2.8607446496628555, + "grad_norm": 0.30150785669185504, + "learning_rate": 0.0004882637131683786, + "loss": 3.1272635459899902, + "step": 4880, + "token_acc": 0.28365464140311936 + }, + { + "epoch": 2.8613309879800646, + "grad_norm": 0.28953445745000006, + "learning_rate": 0.00048825637519614673, + "loss": 3.1855874061584473, + "step": 4881, + "token_acc": 0.2761914046033218 + }, + { + "epoch": 2.8619173262972737, + "grad_norm": 0.30295388905733067, + "learning_rate": 0.00048824903498581736, + "loss": 3.138740062713623, + "step": 4882, + "token_acc": 0.28357544858637357 + }, + { + "epoch": 2.862503664614483, + "grad_norm": 0.28710693378579333, + "learning_rate": 0.00048824169253745933, + "loss": 3.1260299682617188, + "step": 4883, + "token_acc": 0.2864039045718571 + }, + { + "epoch": 2.8630900029316915, + "grad_norm": 0.23524056348061323, + "learning_rate": 0.0004882343478511415, + "loss": 3.1516289710998535, + "step": 4884, + "token_acc": 0.2826598219049642 + }, + { + "epoch": 2.8636763412489006, + "grad_norm": 0.2561964033380634, + "learning_rate": 0.00048822700092693316, + "loss": 3.118824005126953, + "step": 4885, + "token_acc": 0.2851107170184072 + }, + { + "epoch": 2.8642626795661097, + "grad_norm": 0.3338957719049952, + "learning_rate": 0.00048821965176490314, + "loss": 3.142913818359375, + "step": 4886, + "token_acc": 0.28275287789019454 + }, + { + "epoch": 2.864849017883319, + "grad_norm": 0.31284440494613225, + "learning_rate": 0.00048821230036512044, + "loss": 3.1092324256896973, + "step": 4887, + "token_acc": 0.2874816034904722 + }, + { + "epoch": 2.8654353562005275, + "grad_norm": 0.21716681335491536, + "learning_rate": 0.0004882049467276541, + "loss": 3.117408275604248, + "step": 4888, + "token_acc": 0.28670610642350214 + }, + { + "epoch": 2.8660216945177366, + "grad_norm": 0.2608137208737408, + "learning_rate": 0.0004881975908525734, + "loss": 3.1200599670410156, + "step": 4889, + "token_acc": 0.2869746508350062 + }, + { + "epoch": 2.8666080328349457, + "grad_norm": 0.2750466535009853, + "learning_rate": 0.00048819023273994727, + "loss": 3.1784353256225586, + "step": 4890, + "token_acc": 0.2776120829267802 + }, + { + "epoch": 2.867194371152155, + "grad_norm": 0.24547870942754935, + "learning_rate": 0.00048818287238984486, + "loss": 3.1103293895721436, + "step": 4891, + "token_acc": 0.287287465707776 + }, + { + "epoch": 2.867780709469364, + "grad_norm": 0.313609054127639, + "learning_rate": 0.00048817550980233536, + "loss": 3.0979578495025635, + "step": 4892, + "token_acc": 0.28923173720982054 + }, + { + "epoch": 2.868367047786573, + "grad_norm": 0.2509927512284993, + "learning_rate": 0.00048816814497748784, + "loss": 3.160936117172241, + "step": 4893, + "token_acc": 0.28152552641602524 + }, + { + "epoch": 2.868953386103782, + "grad_norm": 0.29532063041266643, + "learning_rate": 0.00048816077791537157, + "loss": 3.127889633178711, + "step": 4894, + "token_acc": 0.283574443046707 + }, + { + "epoch": 2.869539724420991, + "grad_norm": 0.34693096371133986, + "learning_rate": 0.0004881534086160557, + "loss": 3.173649787902832, + "step": 4895, + "token_acc": 0.27964687219808265 + }, + { + "epoch": 2.8701260627382, + "grad_norm": 0.2428215841349258, + "learning_rate": 0.00048814603707960947, + "loss": 3.0773868560791016, + "step": 4896, + "token_acc": 0.29024613872987265 + }, + { + "epoch": 2.870712401055409, + "grad_norm": 0.3115948784738602, + "learning_rate": 0.00048813866330610215, + "loss": 3.1171956062316895, + "step": 4897, + "token_acc": 0.286990333713322 + }, + { + "epoch": 2.871298739372618, + "grad_norm": 0.2715649954485524, + "learning_rate": 0.000488131287295603, + "loss": 3.122842788696289, + "step": 4898, + "token_acc": 0.28530024170006174 + }, + { + "epoch": 2.871885077689827, + "grad_norm": 0.28934515336328087, + "learning_rate": 0.00048812390904818116, + "loss": 3.139252185821533, + "step": 4899, + "token_acc": 0.28268319432237543 + }, + { + "epoch": 2.872471416007036, + "grad_norm": 0.2863952705746134, + "learning_rate": 0.0004881165285639062, + "loss": 3.126121997833252, + "step": 4900, + "token_acc": 0.28395641900761354 + }, + { + "epoch": 2.873057754324245, + "grad_norm": 0.27340161269909075, + "learning_rate": 0.00048810914584284726, + "loss": 3.115375518798828, + "step": 4901, + "token_acc": 0.2864196073275249 + }, + { + "epoch": 2.873644092641454, + "grad_norm": 0.268884494905015, + "learning_rate": 0.0004881017608850738, + "loss": 3.163297414779663, + "step": 4902, + "token_acc": 0.27963908352231703 + }, + { + "epoch": 2.8742304309586633, + "grad_norm": 0.22514243882692994, + "learning_rate": 0.00048809437369065514, + "loss": 3.101938247680664, + "step": 4903, + "token_acc": 0.28692334850149065 + }, + { + "epoch": 2.8748167692758724, + "grad_norm": 0.2569857420088353, + "learning_rate": 0.00048808698425966063, + "loss": 3.15704083442688, + "step": 4904, + "token_acc": 0.27955448601238064 + }, + { + "epoch": 2.875403107593081, + "grad_norm": 0.21552991499180688, + "learning_rate": 0.00048807959259215974, + "loss": 3.1478776931762695, + "step": 4905, + "token_acc": 0.28106563152227376 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.2544848470115608, + "learning_rate": 0.00048807219868822195, + "loss": 3.157792568206787, + "step": 4906, + "token_acc": 0.2804688787197517 + }, + { + "epoch": 2.8765757842274993, + "grad_norm": 0.22163938403224986, + "learning_rate": 0.0004880648025479166, + "loss": 3.1324574947357178, + "step": 4907, + "token_acc": 0.2848131722353002 + }, + { + "epoch": 2.8771621225447084, + "grad_norm": 0.2822877488634309, + "learning_rate": 0.00048805740417131325, + "loss": 3.1124801635742188, + "step": 4908, + "token_acc": 0.28740142325192275 + }, + { + "epoch": 2.8777484608619175, + "grad_norm": 0.24468603546085294, + "learning_rate": 0.00048805000355848133, + "loss": 3.1252307891845703, + "step": 4909, + "token_acc": 0.2847210107907394 + }, + { + "epoch": 2.878334799179126, + "grad_norm": 0.28729574012784703, + "learning_rate": 0.00048804260070949045, + "loss": 3.103818893432617, + "step": 4910, + "token_acc": 0.2882280472222081 + }, + { + "epoch": 2.8789211374963353, + "grad_norm": 0.26472334428852384, + "learning_rate": 0.0004880351956244101, + "loss": 3.154768466949463, + "step": 4911, + "token_acc": 0.2803519462818833 + }, + { + "epoch": 2.8795074758135444, + "grad_norm": 0.27072707398898543, + "learning_rate": 0.00048802778830330987, + "loss": 3.1232645511627197, + "step": 4912, + "token_acc": 0.28609724705612744 + }, + { + "epoch": 2.8800938141307535, + "grad_norm": 0.34702788946768703, + "learning_rate": 0.00048802037874625927, + "loss": 3.1395516395568848, + "step": 4913, + "token_acc": 0.2826941719337702 + }, + { + "epoch": 2.8806801524479626, + "grad_norm": 0.24268875024728914, + "learning_rate": 0.00048801296695332797, + "loss": 3.122776985168457, + "step": 4914, + "token_acc": 0.2837166115492241 + }, + { + "epoch": 2.8812664907651717, + "grad_norm": 0.29519130281033223, + "learning_rate": 0.0004880055529245855, + "loss": 3.0927324295043945, + "step": 4915, + "token_acc": 0.2906697390378863 + }, + { + "epoch": 2.8818528290823804, + "grad_norm": 0.25951170370563165, + "learning_rate": 0.00048799813666010165, + "loss": 3.134528875350952, + "step": 4916, + "token_acc": 0.2842230838055473 + }, + { + "epoch": 2.8824391673995895, + "grad_norm": 0.2433983553476355, + "learning_rate": 0.000487990718159946, + "loss": 3.153985023498535, + "step": 4917, + "token_acc": 0.28151600036144425 + }, + { + "epoch": 2.8830255057167986, + "grad_norm": 0.22552680240463832, + "learning_rate": 0.00048798329742418824, + "loss": 3.1615052223205566, + "step": 4918, + "token_acc": 0.28131498835719804 + }, + { + "epoch": 2.8836118440340077, + "grad_norm": 0.23457797177383471, + "learning_rate": 0.00048797587445289814, + "loss": 3.1286332607269287, + "step": 4919, + "token_acc": 0.2851543245482481 + }, + { + "epoch": 2.8841981823512164, + "grad_norm": 0.24432487782522214, + "learning_rate": 0.0004879684492461453, + "loss": 3.1091256141662598, + "step": 4920, + "token_acc": 0.2865839175675928 + }, + { + "epoch": 2.8847845206684255, + "grad_norm": 0.2377113188607445, + "learning_rate": 0.0004879610218039996, + "loss": 3.1517481803894043, + "step": 4921, + "token_acc": 0.2838878407190956 + }, + { + "epoch": 2.8853708589856346, + "grad_norm": 0.24372504589064947, + "learning_rate": 0.00048795359212653076, + "loss": 3.0923757553100586, + "step": 4922, + "token_acc": 0.29036334598717367 + }, + { + "epoch": 2.8859571973028437, + "grad_norm": 0.23310574223944144, + "learning_rate": 0.00048794616021380854, + "loss": 3.1291494369506836, + "step": 4923, + "token_acc": 0.2847905045015342 + }, + { + "epoch": 2.886543535620053, + "grad_norm": 0.2249346197215438, + "learning_rate": 0.0004879387260659027, + "loss": 3.116410255432129, + "step": 4924, + "token_acc": 0.28687137453996797 + }, + { + "epoch": 2.887129873937262, + "grad_norm": 0.2991269378866682, + "learning_rate": 0.0004879312896828833, + "loss": 3.1162405014038086, + "step": 4925, + "token_acc": 0.2849651999377705 + }, + { + "epoch": 2.887716212254471, + "grad_norm": 0.2552031973321904, + "learning_rate": 0.00048792385106481993, + "loss": 3.137662172317505, + "step": 4926, + "token_acc": 0.283719781882669 + }, + { + "epoch": 2.8883025505716797, + "grad_norm": 0.22674399464041056, + "learning_rate": 0.0004879164102117827, + "loss": 3.1396563053131104, + "step": 4927, + "token_acc": 0.28465568139403163 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.2886787981477102, + "learning_rate": 0.00048790896712384136, + "loss": 3.117295265197754, + "step": 4928, + "token_acc": 0.28683486749842524 + }, + { + "epoch": 2.889475227206098, + "grad_norm": 0.27655948163577637, + "learning_rate": 0.00048790152180106585, + "loss": 3.1172399520874023, + "step": 4929, + "token_acc": 0.28440871350726593 + }, + { + "epoch": 2.890061565523307, + "grad_norm": 0.2521089063548159, + "learning_rate": 0.00048789407424352604, + "loss": 3.1061902046203613, + "step": 4930, + "token_acc": 0.2872061168794341 + }, + { + "epoch": 2.8906479038405157, + "grad_norm": 0.3298256128425193, + "learning_rate": 0.00048788662445129204, + "loss": 3.1459975242614746, + "step": 4931, + "token_acc": 0.28299710276505696 + }, + { + "epoch": 2.891234242157725, + "grad_norm": 0.30398850844897296, + "learning_rate": 0.0004878791724244338, + "loss": 3.144563674926758, + "step": 4932, + "token_acc": 0.28111557467382414 + }, + { + "epoch": 2.891820580474934, + "grad_norm": 0.32186584283597364, + "learning_rate": 0.0004878717181630212, + "loss": 3.109895706176758, + "step": 4933, + "token_acc": 0.28757185227958737 + }, + { + "epoch": 2.892406918792143, + "grad_norm": 0.3443025871349975, + "learning_rate": 0.0004878642616671244, + "loss": 3.1630849838256836, + "step": 4934, + "token_acc": 0.2814193635299116 + }, + { + "epoch": 2.892993257109352, + "grad_norm": 0.2301029925206164, + "learning_rate": 0.0004878568029368134, + "loss": 3.132241725921631, + "step": 4935, + "token_acc": 0.2832746393940856 + }, + { + "epoch": 2.8935795954265613, + "grad_norm": 0.2620890795734399, + "learning_rate": 0.00048784934197215827, + "loss": 3.125356435775757, + "step": 4936, + "token_acc": 0.2857983164331912 + }, + { + "epoch": 2.8941659337437704, + "grad_norm": 0.22218595847011413, + "learning_rate": 0.0004878418787732289, + "loss": 3.1670401096343994, + "step": 4937, + "token_acc": 0.2795852374045365 + }, + { + "epoch": 2.894752272060979, + "grad_norm": 0.2584335746268995, + "learning_rate": 0.0004878344133400958, + "loss": 3.1224045753479004, + "step": 4938, + "token_acc": 0.2843456352812928 + }, + { + "epoch": 2.895338610378188, + "grad_norm": 0.251835323765681, + "learning_rate": 0.00048782694567282874, + "loss": 3.1495532989501953, + "step": 4939, + "token_acc": 0.28241576672438634 + }, + { + "epoch": 2.8959249486953973, + "grad_norm": 0.23854720296066906, + "learning_rate": 0.00048781947577149806, + "loss": 3.0989975929260254, + "step": 4940, + "token_acc": 0.2916883357619884 + }, + { + "epoch": 2.8965112870126064, + "grad_norm": 0.2500097892330433, + "learning_rate": 0.00048781200363617384, + "loss": 3.1113104820251465, + "step": 4941, + "token_acc": 0.28744871583930875 + }, + { + "epoch": 2.897097625329815, + "grad_norm": 0.24953324638278887, + "learning_rate": 0.0004878045292669263, + "loss": 3.1418986320495605, + "step": 4942, + "token_acc": 0.28371842669769576 + }, + { + "epoch": 2.897683963647024, + "grad_norm": 0.24820076041688197, + "learning_rate": 0.00048779705266382566, + "loss": 3.081324338912964, + "step": 4943, + "token_acc": 0.29091045830457124 + }, + { + "epoch": 2.8982703019642333, + "grad_norm": 0.2243659998028146, + "learning_rate": 0.00048778957382694215, + "loss": 3.1396548748016357, + "step": 4944, + "token_acc": 0.2848575198792626 + }, + { + "epoch": 2.8988566402814424, + "grad_norm": 0.2207393730603077, + "learning_rate": 0.00048778209275634603, + "loss": 3.143639087677002, + "step": 4945, + "token_acc": 0.2830619183965641 + }, + { + "epoch": 2.8994429785986515, + "grad_norm": 0.30214378413488546, + "learning_rate": 0.00048777460945210755, + "loss": 3.134450912475586, + "step": 4946, + "token_acc": 0.284913070746327 + }, + { + "epoch": 2.9000293169158606, + "grad_norm": 0.3012189256467185, + "learning_rate": 0.000487767123914297, + "loss": 3.1279685497283936, + "step": 4947, + "token_acc": 0.2827419230799829 + }, + { + "epoch": 2.9006156552330697, + "grad_norm": 0.21262570409533066, + "learning_rate": 0.0004877596361429848, + "loss": 3.10392427444458, + "step": 4948, + "token_acc": 0.28851040552972473 + }, + { + "epoch": 2.9012019935502784, + "grad_norm": 0.31257265836380166, + "learning_rate": 0.00048775214613824114, + "loss": 3.184384346008301, + "step": 4949, + "token_acc": 0.27766400211402525 + }, + { + "epoch": 2.9017883318674875, + "grad_norm": 0.34238082151502763, + "learning_rate": 0.00048774465390013643, + "loss": 3.1837613582611084, + "step": 4950, + "token_acc": 0.2784403871052689 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.242457552158686, + "learning_rate": 0.00048773715942874107, + "loss": 3.1589243412017822, + "step": 4951, + "token_acc": 0.2813106662579677 + }, + { + "epoch": 2.9029610085019057, + "grad_norm": 0.3005332064815135, + "learning_rate": 0.0004877296627241254, + "loss": 3.12115216255188, + "step": 4952, + "token_acc": 0.2860999072423892 + }, + { + "epoch": 2.9035473468191144, + "grad_norm": 0.21735373005870992, + "learning_rate": 0.00048772216378636, + "loss": 3.1634509563446045, + "step": 4953, + "token_acc": 0.2804573349470225 + }, + { + "epoch": 2.9041336851363235, + "grad_norm": 0.264352902620328, + "learning_rate": 0.0004877146626155152, + "loss": 3.0979161262512207, + "step": 4954, + "token_acc": 0.2897223608342499 + }, + { + "epoch": 2.9047200234535326, + "grad_norm": 0.29044381708577865, + "learning_rate": 0.0004877071592116614, + "loss": 3.152224063873291, + "step": 4955, + "token_acc": 0.279707410375042 + }, + { + "epoch": 2.9053063617707418, + "grad_norm": 0.332172293547952, + "learning_rate": 0.00048769965357486916, + "loss": 3.1629724502563477, + "step": 4956, + "token_acc": 0.2812857354005088 + }, + { + "epoch": 2.905892700087951, + "grad_norm": 0.28792439526554087, + "learning_rate": 0.00048769214570520904, + "loss": 3.1459832191467285, + "step": 4957, + "token_acc": 0.2817521322522902 + }, + { + "epoch": 2.90647903840516, + "grad_norm": 0.243877731574557, + "learning_rate": 0.0004876846356027514, + "loss": 3.1482694149017334, + "step": 4958, + "token_acc": 0.28087909356422314 + }, + { + "epoch": 2.9070653767223686, + "grad_norm": 0.24062871732834393, + "learning_rate": 0.00048767712326756694, + "loss": 3.1307859420776367, + "step": 4959, + "token_acc": 0.2844771601322176 + }, + { + "epoch": 2.9076517150395778, + "grad_norm": 0.24555808404074167, + "learning_rate": 0.00048766960869972624, + "loss": 3.1613354682922363, + "step": 4960, + "token_acc": 0.2794603496326324 + }, + { + "epoch": 2.908238053356787, + "grad_norm": 0.24587791851756013, + "learning_rate": 0.0004876620918992998, + "loss": 3.1440720558166504, + "step": 4961, + "token_acc": 0.28235912693962884 + }, + { + "epoch": 2.908824391673996, + "grad_norm": 0.23536638324006415, + "learning_rate": 0.00048765457286635826, + "loss": 3.1673424243927, + "step": 4962, + "token_acc": 0.2788081499660811 + }, + { + "epoch": 2.909410729991205, + "grad_norm": 0.23296239241269304, + "learning_rate": 0.0004876470516009722, + "loss": 3.1183199882507324, + "step": 4963, + "token_acc": 0.2860383372388748 + }, + { + "epoch": 2.9099970683084138, + "grad_norm": 0.2325040259701827, + "learning_rate": 0.0004876395281032124, + "loss": 3.125436305999756, + "step": 4964, + "token_acc": 0.2854846778307751 + }, + { + "epoch": 2.910583406625623, + "grad_norm": 0.24309006947276676, + "learning_rate": 0.0004876320023731494, + "loss": 3.1273741722106934, + "step": 4965, + "token_acc": 0.2858567641450445 + }, + { + "epoch": 2.911169744942832, + "grad_norm": 0.21467208252067768, + "learning_rate": 0.000487624474410854, + "loss": 3.1292076110839844, + "step": 4966, + "token_acc": 0.28431792871595657 + }, + { + "epoch": 2.911756083260041, + "grad_norm": 0.22260149417323666, + "learning_rate": 0.0004876169442163968, + "loss": 3.100684642791748, + "step": 4967, + "token_acc": 0.28478081246825704 + }, + { + "epoch": 2.91234242157725, + "grad_norm": 0.25475399156640255, + "learning_rate": 0.00048760941178984865, + "loss": 3.1385393142700195, + "step": 4968, + "token_acc": 0.2827265086395817 + }, + { + "epoch": 2.9129287598944593, + "grad_norm": 0.2689112418370151, + "learning_rate": 0.00048760187713128026, + "loss": 3.155050277709961, + "step": 4969, + "token_acc": 0.2811823812435206 + }, + { + "epoch": 2.913515098211668, + "grad_norm": 0.2706116503433038, + "learning_rate": 0.0004875943402407624, + "loss": 3.0980143547058105, + "step": 4970, + "token_acc": 0.28876156505457606 + }, + { + "epoch": 2.914101436528877, + "grad_norm": 0.2307981305182796, + "learning_rate": 0.00048758680111836585, + "loss": 3.1046605110168457, + "step": 4971, + "token_acc": 0.2884801955015759 + }, + { + "epoch": 2.914687774846086, + "grad_norm": 0.2213007048394922, + "learning_rate": 0.0004875792597641615, + "loss": 3.0680899620056152, + "step": 4972, + "token_acc": 0.29343777062571896 + }, + { + "epoch": 2.9152741131632953, + "grad_norm": 0.2867207568041624, + "learning_rate": 0.0004875717161782201, + "loss": 3.1130900382995605, + "step": 4973, + "token_acc": 0.28708213635390595 + }, + { + "epoch": 2.915860451480504, + "grad_norm": 0.42013674859298566, + "learning_rate": 0.0004875641703606126, + "loss": 3.160256862640381, + "step": 4974, + "token_acc": 0.28140747383206494 + }, + { + "epoch": 2.916446789797713, + "grad_norm": 0.5600867800506169, + "learning_rate": 0.00048755662231140986, + "loss": 3.153989791870117, + "step": 4975, + "token_acc": 0.28072518836482474 + }, + { + "epoch": 2.917033128114922, + "grad_norm": 0.27464065531475684, + "learning_rate": 0.0004875490720306827, + "loss": 3.1440978050231934, + "step": 4976, + "token_acc": 0.2809371053296287 + }, + { + "epoch": 2.9176194664321313, + "grad_norm": 0.42182624221510395, + "learning_rate": 0.00048754151951850214, + "loss": 3.1297121047973633, + "step": 4977, + "token_acc": 0.2849239023612845 + }, + { + "epoch": 2.9182058047493404, + "grad_norm": 0.2842807842374587, + "learning_rate": 0.00048753396477493904, + "loss": 3.0647027492523193, + "step": 4978, + "token_acc": 0.29411764705882354 + }, + { + "epoch": 2.9187921430665495, + "grad_norm": 0.3331885838986031, + "learning_rate": 0.0004875264078000645, + "loss": 3.111910820007324, + "step": 4979, + "token_acc": 0.2858324721152765 + }, + { + "epoch": 2.9193784813837587, + "grad_norm": 0.27789912681534007, + "learning_rate": 0.0004875188485939494, + "loss": 3.104243040084839, + "step": 4980, + "token_acc": 0.2886180704564458 + }, + { + "epoch": 2.9199648197009673, + "grad_norm": 0.3143294974976295, + "learning_rate": 0.0004875112871566648, + "loss": 3.13938045501709, + "step": 4981, + "token_acc": 0.2845464135021097 + }, + { + "epoch": 2.9205511580181764, + "grad_norm": 0.21218678057505008, + "learning_rate": 0.0004875037234882817, + "loss": 3.116074800491333, + "step": 4982, + "token_acc": 0.2850433834843563 + }, + { + "epoch": 2.9211374963353856, + "grad_norm": 0.3236348785750339, + "learning_rate": 0.0004874961575888711, + "loss": 3.094538450241089, + "step": 4983, + "token_acc": 0.289173755851339 + }, + { + "epoch": 2.9217238346525947, + "grad_norm": 0.2620582726122614, + "learning_rate": 0.0004874885894585042, + "loss": 3.1313014030456543, + "step": 4984, + "token_acc": 0.284410233575072 + }, + { + "epoch": 2.9223101729698033, + "grad_norm": 0.28180325216626967, + "learning_rate": 0.000487481019097252, + "loss": 3.134223699569702, + "step": 4985, + "token_acc": 0.28451412923392694 + }, + { + "epoch": 2.9228965112870124, + "grad_norm": 0.24338775858186906, + "learning_rate": 0.0004874734465051857, + "loss": 3.1487369537353516, + "step": 4986, + "token_acc": 0.2829513441434034 + }, + { + "epoch": 2.9234828496042216, + "grad_norm": 0.28237787731808073, + "learning_rate": 0.0004874658716823762, + "loss": 3.165557861328125, + "step": 4987, + "token_acc": 0.2794731053374284 + }, + { + "epoch": 2.9240691879214307, + "grad_norm": 0.27270117052472936, + "learning_rate": 0.00048745829462889503, + "loss": 3.1274406909942627, + "step": 4988, + "token_acc": 0.2839410664696169 + }, + { + "epoch": 2.92465552623864, + "grad_norm": 0.22488968672447976, + "learning_rate": 0.000487450715344813, + "loss": 3.147987127304077, + "step": 4989, + "token_acc": 0.28190344362983744 + }, + { + "epoch": 2.925241864555849, + "grad_norm": 0.2649247462912229, + "learning_rate": 0.00048744313383020153, + "loss": 3.1359360218048096, + "step": 4990, + "token_acc": 0.2811820282336059 + }, + { + "epoch": 2.925828202873058, + "grad_norm": 0.22684200560699008, + "learning_rate": 0.0004874355500851318, + "loss": 3.1208736896514893, + "step": 4991, + "token_acc": 0.2868541313481241 + }, + { + "epoch": 2.9264145411902667, + "grad_norm": 0.24243535116984258, + "learning_rate": 0.000487427964109675, + "loss": 3.1571602821350098, + "step": 4992, + "token_acc": 0.2810554551253304 + }, + { + "epoch": 2.927000879507476, + "grad_norm": 0.23746714915998451, + "learning_rate": 0.0004874203759039024, + "loss": 3.0808215141296387, + "step": 4993, + "token_acc": 0.2909341063008429 + }, + { + "epoch": 2.927587217824685, + "grad_norm": 0.2573632656009003, + "learning_rate": 0.0004874127854678853, + "loss": 3.1138830184936523, + "step": 4994, + "token_acc": 0.285584410765308 + }, + { + "epoch": 2.928173556141894, + "grad_norm": 0.2776031992059007, + "learning_rate": 0.000487405192801695, + "loss": 3.134657382965088, + "step": 4995, + "token_acc": 0.2829649039104763 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.2897421656459499, + "learning_rate": 0.00048739759790540285, + "loss": 3.126394033432007, + "step": 4996, + "token_acc": 0.28627142103371545 + }, + { + "epoch": 2.929346232776312, + "grad_norm": 0.23327237980974946, + "learning_rate": 0.0004873900007790801, + "loss": 3.066833972930908, + "step": 4997, + "token_acc": 0.2927761752947725 + }, + { + "epoch": 2.929932571093521, + "grad_norm": 0.30007429625096604, + "learning_rate": 0.0004873824014227983, + "loss": 3.117905378341675, + "step": 4998, + "token_acc": 0.286602941443063 + }, + { + "epoch": 2.93051890941073, + "grad_norm": 0.28489495058823794, + "learning_rate": 0.00048737479983662857, + "loss": 3.134899377822876, + "step": 4999, + "token_acc": 0.2834375651180679 + }, + { + "epoch": 2.931105247727939, + "grad_norm": 0.21440386072585876, + "learning_rate": 0.0004873671960206426, + "loss": 3.109004497528076, + "step": 5000, + "token_acc": 0.2855063759009425 + }, + { + "epoch": 2.9316915860451482, + "grad_norm": 0.25605963330166026, + "learning_rate": 0.00048735958997491157, + "loss": 3.1286637783050537, + "step": 5001, + "token_acc": 0.28562270418237784 + }, + { + "epoch": 2.9322779243623573, + "grad_norm": 0.25141988071908633, + "learning_rate": 0.00048735198169950713, + "loss": 3.1293187141418457, + "step": 5002, + "token_acc": 0.28528434815723774 + }, + { + "epoch": 2.932864262679566, + "grad_norm": 0.2141529271474328, + "learning_rate": 0.0004873443711945006, + "loss": 3.1483154296875, + "step": 5003, + "token_acc": 0.2803410322841446 + }, + { + "epoch": 2.933450600996775, + "grad_norm": 0.3130246265328783, + "learning_rate": 0.0004873367584599635, + "loss": 3.1273369789123535, + "step": 5004, + "token_acc": 0.283746491804647 + }, + { + "epoch": 2.9340369393139842, + "grad_norm": 0.32513088324049844, + "learning_rate": 0.0004873291434959674, + "loss": 3.1280877590179443, + "step": 5005, + "token_acc": 0.284706452452695 + }, + { + "epoch": 2.9346232776311933, + "grad_norm": 0.20923199331385156, + "learning_rate": 0.00048732152630258385, + "loss": 3.11245059967041, + "step": 5006, + "token_acc": 0.28776597039469237 + }, + { + "epoch": 2.935209615948402, + "grad_norm": 0.24136986709090455, + "learning_rate": 0.0004873139068798843, + "loss": 3.174717903137207, + "step": 5007, + "token_acc": 0.27896171054132496 + }, + { + "epoch": 2.935795954265611, + "grad_norm": 0.2392244794975838, + "learning_rate": 0.0004873062852279404, + "loss": 3.1441166400909424, + "step": 5008, + "token_acc": 0.2831403496836519 + }, + { + "epoch": 2.9363822925828202, + "grad_norm": 0.2682497200847533, + "learning_rate": 0.0004872986613468237, + "loss": 3.1679811477661133, + "step": 5009, + "token_acc": 0.27847445358803624 + }, + { + "epoch": 2.9369686309000294, + "grad_norm": 0.26964364317596096, + "learning_rate": 0.0004872910352366059, + "loss": 3.1294591426849365, + "step": 5010, + "token_acc": 0.28302027034201555 + }, + { + "epoch": 2.9375549692172385, + "grad_norm": 0.22211537283776384, + "learning_rate": 0.0004872834068973585, + "loss": 3.1413230895996094, + "step": 5011, + "token_acc": 0.2842098806914244 + }, + { + "epoch": 2.9381413075344476, + "grad_norm": 0.24044243426936524, + "learning_rate": 0.00048727577632915326, + "loss": 3.126039981842041, + "step": 5012, + "token_acc": 0.28366730916099936 + }, + { + "epoch": 2.9387276458516562, + "grad_norm": 0.21969192877333163, + "learning_rate": 0.00048726814353206184, + "loss": 3.134089469909668, + "step": 5013, + "token_acc": 0.2849448362799129 + }, + { + "epoch": 2.9393139841688654, + "grad_norm": 0.25165462024635504, + "learning_rate": 0.000487260508506156, + "loss": 3.1366353034973145, + "step": 5014, + "token_acc": 0.28379830379998827 + }, + { + "epoch": 2.9399003224860745, + "grad_norm": 0.24104550424361132, + "learning_rate": 0.0004872528712515073, + "loss": 3.1400697231292725, + "step": 5015, + "token_acc": 0.28286487556934453 + }, + { + "epoch": 2.9404866608032836, + "grad_norm": 0.3091771984394633, + "learning_rate": 0.00048724523176818757, + "loss": 3.1702637672424316, + "step": 5016, + "token_acc": 0.27720797285316867 + }, + { + "epoch": 2.9410729991204922, + "grad_norm": 0.4061705666955857, + "learning_rate": 0.00048723759005626867, + "loss": 3.092770576477051, + "step": 5017, + "token_acc": 0.2907736293477542 + }, + { + "epoch": 2.9416593374377014, + "grad_norm": 0.31868675984414274, + "learning_rate": 0.00048722994611582224, + "loss": 3.1166653633117676, + "step": 5018, + "token_acc": 0.2852645800374422 + }, + { + "epoch": 2.9422456757549105, + "grad_norm": 0.26438728876931605, + "learning_rate": 0.00048722229994692016, + "loss": 3.119567394256592, + "step": 5019, + "token_acc": 0.2850664970829762 + }, + { + "epoch": 2.9428320140721196, + "grad_norm": 0.30612925404687125, + "learning_rate": 0.0004872146515496342, + "loss": 3.145846128463745, + "step": 5020, + "token_acc": 0.2825974932231139 + }, + { + "epoch": 2.9434183523893287, + "grad_norm": 0.21888526017309753, + "learning_rate": 0.00048720700092403626, + "loss": 3.135315418243408, + "step": 5021, + "token_acc": 0.2849608141737548 + }, + { + "epoch": 2.944004690706538, + "grad_norm": 0.26941200130587956, + "learning_rate": 0.00048719934807019816, + "loss": 3.1143932342529297, + "step": 5022, + "token_acc": 0.2864189088904438 + }, + { + "epoch": 2.944591029023747, + "grad_norm": 0.25301562560019936, + "learning_rate": 0.00048719169298819183, + "loss": 3.152822732925415, + "step": 5023, + "token_acc": 0.28264420939441876 + }, + { + "epoch": 2.9451773673409556, + "grad_norm": 0.31488721997814373, + "learning_rate": 0.0004871840356780892, + "loss": 3.1335601806640625, + "step": 5024, + "token_acc": 0.28600399733510995 + }, + { + "epoch": 2.9457637056581647, + "grad_norm": 0.24409918810160133, + "learning_rate": 0.00048717637613996214, + "loss": 3.12785005569458, + "step": 5025, + "token_acc": 0.28383287572751675 + }, + { + "epoch": 2.946350043975374, + "grad_norm": 0.2561151222652107, + "learning_rate": 0.0004871687143738826, + "loss": 3.1521596908569336, + "step": 5026, + "token_acc": 0.2818687684081323 + }, + { + "epoch": 2.946936382292583, + "grad_norm": 0.29187288183032073, + "learning_rate": 0.00048716105037992257, + "loss": 3.2151827812194824, + "step": 5027, + "token_acc": 0.2721538987546364 + }, + { + "epoch": 2.9475227206097916, + "grad_norm": 0.287352231972974, + "learning_rate": 0.0004871533841581541, + "loss": 3.1494736671447754, + "step": 5028, + "token_acc": 0.2819452216586166 + }, + { + "epoch": 2.9481090589270007, + "grad_norm": 0.21268769227228548, + "learning_rate": 0.0004871457157086491, + "loss": 3.103982925415039, + "step": 5029, + "token_acc": 0.28768441225360913 + }, + { + "epoch": 2.94869539724421, + "grad_norm": 0.2593412367636553, + "learning_rate": 0.00048713804503147976, + "loss": 3.124460220336914, + "step": 5030, + "token_acc": 0.28478371259005364 + }, + { + "epoch": 2.949281735561419, + "grad_norm": 0.2612123127343416, + "learning_rate": 0.00048713037212671796, + "loss": 3.107083559036255, + "step": 5031, + "token_acc": 0.2866644191221091 + }, + { + "epoch": 2.949868073878628, + "grad_norm": 0.24123033295897464, + "learning_rate": 0.0004871226969944358, + "loss": 3.1284422874450684, + "step": 5032, + "token_acc": 0.2848839372564862 + }, + { + "epoch": 2.950454412195837, + "grad_norm": 0.27460143525491787, + "learning_rate": 0.0004871150196347055, + "loss": 3.1908700466156006, + "step": 5033, + "token_acc": 0.27775067089534033 + }, + { + "epoch": 2.9510407505130463, + "grad_norm": 0.26320050572031906, + "learning_rate": 0.0004871073400475991, + "loss": 3.103938341140747, + "step": 5034, + "token_acc": 0.2871200750966551 + }, + { + "epoch": 2.951627088830255, + "grad_norm": 0.2645706212320269, + "learning_rate": 0.0004870996582331888, + "loss": 3.1424694061279297, + "step": 5035, + "token_acc": 0.2831940824158473 + }, + { + "epoch": 2.952213427147464, + "grad_norm": 0.21439366042139882, + "learning_rate": 0.0004870919741915466, + "loss": 3.127854347229004, + "step": 5036, + "token_acc": 0.2837863641759002 + }, + { + "epoch": 2.952799765464673, + "grad_norm": 0.2401823601680044, + "learning_rate": 0.0004870842879227448, + "loss": 3.144318103790283, + "step": 5037, + "token_acc": 0.2806005078526113 + }, + { + "epoch": 2.9533861037818823, + "grad_norm": 0.22641307004500938, + "learning_rate": 0.00048707659942685567, + "loss": 3.0996389389038086, + "step": 5038, + "token_acc": 0.2884437843696062 + }, + { + "epoch": 2.953972442099091, + "grad_norm": 0.24555454429266688, + "learning_rate": 0.0004870689087039513, + "loss": 3.1291019916534424, + "step": 5039, + "token_acc": 0.2853980622152914 + }, + { + "epoch": 2.9545587804163, + "grad_norm": 0.26631543914857037, + "learning_rate": 0.000487061215754104, + "loss": 3.141916513442993, + "step": 5040, + "token_acc": 0.2839145118568745 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.25769035552611275, + "learning_rate": 0.0004870535205773859, + "loss": 3.1855549812316895, + "step": 5041, + "token_acc": 0.277336928133887 + }, + { + "epoch": 2.9557314570507183, + "grad_norm": 0.27460155688643045, + "learning_rate": 0.0004870458231738696, + "loss": 3.154581069946289, + "step": 5042, + "token_acc": 0.28180919789939646 + }, + { + "epoch": 2.9563177953679274, + "grad_norm": 0.4073859059057163, + "learning_rate": 0.0004870381235436271, + "loss": 3.130425453186035, + "step": 5043, + "token_acc": 0.2838434707229985 + }, + { + "epoch": 2.9569041336851365, + "grad_norm": 0.43431855198354563, + "learning_rate": 0.00048703042168673095, + "loss": 3.1138930320739746, + "step": 5044, + "token_acc": 0.28732473607851616 + }, + { + "epoch": 2.9574904720023456, + "grad_norm": 0.21475586317070977, + "learning_rate": 0.0004870227176032533, + "loss": 3.1383705139160156, + "step": 5045, + "token_acc": 0.2824630620134512 + }, + { + "epoch": 2.9580768103195543, + "grad_norm": 0.3315855311179034, + "learning_rate": 0.00048701501129326665, + "loss": 3.1405115127563477, + "step": 5046, + "token_acc": 0.28419331433558714 + }, + { + "epoch": 2.9586631486367634, + "grad_norm": 0.2330663298856108, + "learning_rate": 0.00048700730275684327, + "loss": 3.1151113510131836, + "step": 5047, + "token_acc": 0.2858880426283629 + }, + { + "epoch": 2.9592494869539725, + "grad_norm": 0.3248144425581349, + "learning_rate": 0.0004869995919940557, + "loss": 3.1694445610046387, + "step": 5048, + "token_acc": 0.2802275153005391 + }, + { + "epoch": 2.9598358252711816, + "grad_norm": 0.22135343760054257, + "learning_rate": 0.0004869918790049764, + "loss": 3.154057025909424, + "step": 5049, + "token_acc": 0.2813104458312992 + }, + { + "epoch": 2.9604221635883903, + "grad_norm": 0.26102946612720423, + "learning_rate": 0.0004869841637896777, + "loss": 3.1102654933929443, + "step": 5050, + "token_acc": 0.2883207592439551 + }, + { + "epoch": 2.9610085019055994, + "grad_norm": 0.2144279742893639, + "learning_rate": 0.00048697644634823205, + "loss": 3.1315462589263916, + "step": 5051, + "token_acc": 0.2844339686174153 + }, + { + "epoch": 2.9615948402228085, + "grad_norm": 0.2702593642371519, + "learning_rate": 0.00048696872668071214, + "loss": 3.147261619567871, + "step": 5052, + "token_acc": 0.28296109264525837 + }, + { + "epoch": 2.9621811785400176, + "grad_norm": 0.2654448283294186, + "learning_rate": 0.00048696100478719023, + "loss": 3.079688549041748, + "step": 5053, + "token_acc": 0.29133618954434815 + }, + { + "epoch": 2.9627675168572267, + "grad_norm": 0.25409080232221837, + "learning_rate": 0.0004869532806677391, + "loss": 3.1200003623962402, + "step": 5054, + "token_acc": 0.2858537435271992 + }, + { + "epoch": 2.963353855174436, + "grad_norm": 0.2931293350787964, + "learning_rate": 0.00048694555432243113, + "loss": 3.0992982387542725, + "step": 5055, + "token_acc": 0.2891093047617785 + }, + { + "epoch": 2.963940193491645, + "grad_norm": 0.2713098398333767, + "learning_rate": 0.00048693782575133895, + "loss": 3.13455867767334, + "step": 5056, + "token_acc": 0.28383693265473514 + }, + { + "epoch": 2.9645265318088536, + "grad_norm": 0.30791694391189034, + "learning_rate": 0.00048693009495453523, + "loss": 3.1085782051086426, + "step": 5057, + "token_acc": 0.2879622447353919 + }, + { + "epoch": 2.9651128701260627, + "grad_norm": 0.2132019125100214, + "learning_rate": 0.0004869223619320925, + "loss": 3.0838470458984375, + "step": 5058, + "token_acc": 0.2916803367638968 + }, + { + "epoch": 2.965699208443272, + "grad_norm": 0.2620606354028236, + "learning_rate": 0.0004869146266840835, + "loss": 3.095184326171875, + "step": 5059, + "token_acc": 0.28787502146274707 + }, + { + "epoch": 2.966285546760481, + "grad_norm": 0.24769001751269004, + "learning_rate": 0.00048690688921058077, + "loss": 3.188934326171875, + "step": 5060, + "token_acc": 0.27673149580610196 + }, + { + "epoch": 2.9668718850776896, + "grad_norm": 0.28063787414840935, + "learning_rate": 0.000486899149511657, + "loss": 3.13330340385437, + "step": 5061, + "token_acc": 0.2837298022763059 + }, + { + "epoch": 2.9674582233948987, + "grad_norm": 0.2200590009047734, + "learning_rate": 0.00048689140758738505, + "loss": 3.1424121856689453, + "step": 5062, + "token_acc": 0.2822681385626847 + }, + { + "epoch": 2.968044561712108, + "grad_norm": 0.26087387528970885, + "learning_rate": 0.0004868836634378375, + "loss": 3.0964503288269043, + "step": 5063, + "token_acc": 0.28938027813741113 + }, + { + "epoch": 2.968630900029317, + "grad_norm": 0.2550698872130819, + "learning_rate": 0.00048687591706308715, + "loss": 3.0813496112823486, + "step": 5064, + "token_acc": 0.29097788352771387 + }, + { + "epoch": 2.969217238346526, + "grad_norm": 0.2762376574762002, + "learning_rate": 0.0004868681684632067, + "loss": 3.1141421794891357, + "step": 5065, + "token_acc": 0.2867829564258136 + }, + { + "epoch": 2.969803576663735, + "grad_norm": 0.2240529240358315, + "learning_rate": 0.00048686041763826906, + "loss": 3.127117395401001, + "step": 5066, + "token_acc": 0.285890549380418 + }, + { + "epoch": 2.970389914980944, + "grad_norm": 0.2396271754345822, + "learning_rate": 0.00048685266458834694, + "loss": 3.1150436401367188, + "step": 5067, + "token_acc": 0.28616807705013847 + }, + { + "epoch": 2.970976253298153, + "grad_norm": 0.2606228808606463, + "learning_rate": 0.0004868449093135132, + "loss": 3.068798303604126, + "step": 5068, + "token_acc": 0.2923513169469049 + }, + { + "epoch": 2.971562591615362, + "grad_norm": 0.26481347756583673, + "learning_rate": 0.0004868371518138407, + "loss": 3.1042301654815674, + "step": 5069, + "token_acc": 0.28946109416886945 + }, + { + "epoch": 2.972148929932571, + "grad_norm": 0.21640718191958788, + "learning_rate": 0.00048682939208940227, + "loss": 3.159252166748047, + "step": 5070, + "token_acc": 0.27981310111914537 + }, + { + "epoch": 2.97273526824978, + "grad_norm": 0.3364816324936961, + "learning_rate": 0.0004868216301402709, + "loss": 3.1341726779937744, + "step": 5071, + "token_acc": 0.2827651031945321 + }, + { + "epoch": 2.973321606566989, + "grad_norm": 0.24019311813317218, + "learning_rate": 0.0004868138659665193, + "loss": 3.1295175552368164, + "step": 5072, + "token_acc": 0.28491618674119024 + }, + { + "epoch": 2.973907944884198, + "grad_norm": 0.32060329280615685, + "learning_rate": 0.00048680609956822064, + "loss": 3.14194917678833, + "step": 5073, + "token_acc": 0.2822914625383761 + }, + { + "epoch": 2.974494283201407, + "grad_norm": 0.26863143263727157, + "learning_rate": 0.0004867983309454478, + "loss": 3.1023459434509277, + "step": 5074, + "token_acc": 0.2881482200426529 + }, + { + "epoch": 2.9750806215186163, + "grad_norm": 0.3159415399831908, + "learning_rate": 0.0004867905600982737, + "loss": 3.1521291732788086, + "step": 5075, + "token_acc": 0.2812723697310284 + }, + { + "epoch": 2.9756669598358254, + "grad_norm": 0.23098873248335225, + "learning_rate": 0.0004867827870267714, + "loss": 3.1205129623413086, + "step": 5076, + "token_acc": 0.28746366585222005 + }, + { + "epoch": 2.9762532981530345, + "grad_norm": 0.2916610383535501, + "learning_rate": 0.0004867750117310138, + "loss": 3.1205546855926514, + "step": 5077, + "token_acc": 0.28396449184376826 + }, + { + "epoch": 2.976839636470243, + "grad_norm": 0.2681245324707136, + "learning_rate": 0.0004867672342110741, + "loss": 3.0903921127319336, + "step": 5078, + "token_acc": 0.29124601679709766 + }, + { + "epoch": 2.9774259747874523, + "grad_norm": 0.2775514769308955, + "learning_rate": 0.0004867594544670252, + "loss": 3.0816352367401123, + "step": 5079, + "token_acc": 0.2923162904927083 + }, + { + "epoch": 2.9780123131046614, + "grad_norm": 0.26866945673466197, + "learning_rate": 0.0004867516724989404, + "loss": 3.1228179931640625, + "step": 5080, + "token_acc": 0.28582954226048574 + }, + { + "epoch": 2.9785986514218705, + "grad_norm": 0.2696039043914441, + "learning_rate": 0.00048674388830689255, + "loss": 3.1396074295043945, + "step": 5081, + "token_acc": 0.28074636601314124 + }, + { + "epoch": 2.979184989739079, + "grad_norm": 0.2853419119533892, + "learning_rate": 0.00048673610189095486, + "loss": 3.131565570831299, + "step": 5082, + "token_acc": 0.2836169449220403 + }, + { + "epoch": 2.9797713280562883, + "grad_norm": 0.26128091138816056, + "learning_rate": 0.0004867283132512006, + "loss": 3.1794867515563965, + "step": 5083, + "token_acc": 0.27843310933016513 + }, + { + "epoch": 2.9803576663734974, + "grad_norm": 0.2644633157548551, + "learning_rate": 0.00048672052238770276, + "loss": 3.1267125606536865, + "step": 5084, + "token_acc": 0.2858645002649133 + }, + { + "epoch": 2.9809440046907065, + "grad_norm": 0.23538525143763747, + "learning_rate": 0.0004867127293005346, + "loss": 3.1287636756896973, + "step": 5085, + "token_acc": 0.28599776560089374 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.21746074606868301, + "learning_rate": 0.00048670493398976934, + "loss": 3.1294968128204346, + "step": 5086, + "token_acc": 0.2837701127276231 + }, + { + "epoch": 2.9821166813251248, + "grad_norm": 0.25141010510472817, + "learning_rate": 0.0004866971364554802, + "loss": 3.16300892829895, + "step": 5087, + "token_acc": 0.27777038889184447 + }, + { + "epoch": 2.982703019642334, + "grad_norm": 0.1987521988846626, + "learning_rate": 0.0004866893366977404, + "loss": 3.1690592765808105, + "step": 5088, + "token_acc": 0.2784115580468164 + }, + { + "epoch": 2.9832893579595425, + "grad_norm": 0.25688432513692644, + "learning_rate": 0.00048668153471662323, + "loss": 3.1373634338378906, + "step": 5089, + "token_acc": 0.2845778214138074 + }, + { + "epoch": 2.9838756962767516, + "grad_norm": 0.23918403315538103, + "learning_rate": 0.00048667373051220197, + "loss": 3.136134147644043, + "step": 5090, + "token_acc": 0.28470975779715774 + }, + { + "epoch": 2.9844620345939608, + "grad_norm": 0.25951237046769476, + "learning_rate": 0.00048666592408455004, + "loss": 3.0959150791168213, + "step": 5091, + "token_acc": 0.28886679818342476 + }, + { + "epoch": 2.98504837291117, + "grad_norm": 0.28256128964825933, + "learning_rate": 0.0004866581154337405, + "loss": 3.1380186080932617, + "step": 5092, + "token_acc": 0.28295188992633113 + }, + { + "epoch": 2.9856347112283785, + "grad_norm": 0.2862454050947879, + "learning_rate": 0.00048665030455984694, + "loss": 3.1073827743530273, + "step": 5093, + "token_acc": 0.28751925291929553 + }, + { + "epoch": 2.9862210495455876, + "grad_norm": 0.32948477080210786, + "learning_rate": 0.00048664249146294263, + "loss": 3.1256392002105713, + "step": 5094, + "token_acc": 0.28464736377472566 + }, + { + "epoch": 2.9868073878627968, + "grad_norm": 0.27460470425521083, + "learning_rate": 0.00048663467614310104, + "loss": 3.0692873001098633, + "step": 5095, + "token_acc": 0.2929485682692282 + }, + { + "epoch": 2.987393726180006, + "grad_norm": 0.28533287860485534, + "learning_rate": 0.00048662685860039547, + "loss": 3.103707790374756, + "step": 5096, + "token_acc": 0.28593239680478494 + }, + { + "epoch": 2.987980064497215, + "grad_norm": 0.3126168187488595, + "learning_rate": 0.00048661903883489947, + "loss": 3.157106399536133, + "step": 5097, + "token_acc": 0.28054050151858134 + }, + { + "epoch": 2.988566402814424, + "grad_norm": 0.23471742300715495, + "learning_rate": 0.00048661121684668646, + "loss": 3.1111836433410645, + "step": 5098, + "token_acc": 0.2865949010261743 + }, + { + "epoch": 2.989152741131633, + "grad_norm": 0.29379702455819884, + "learning_rate": 0.0004866033926358299, + "loss": 3.1283020973205566, + "step": 5099, + "token_acc": 0.2854787687865065 + }, + { + "epoch": 2.989739079448842, + "grad_norm": 0.2651475341495928, + "learning_rate": 0.0004865955662024033, + "loss": 3.0943541526794434, + "step": 5100, + "token_acc": 0.2889979394277619 + }, + { + "epoch": 2.990325417766051, + "grad_norm": 0.2862928524118598, + "learning_rate": 0.00048658773754648013, + "loss": 3.130763530731201, + "step": 5101, + "token_acc": 0.2849230705104492 + }, + { + "epoch": 2.99091175608326, + "grad_norm": 0.24627434560696157, + "learning_rate": 0.000486579906668134, + "loss": 3.1406311988830566, + "step": 5102, + "token_acc": 0.2826849241225138 + }, + { + "epoch": 2.991498094400469, + "grad_norm": 0.255040317350064, + "learning_rate": 0.00048657207356743844, + "loss": 3.1247973442077637, + "step": 5103, + "token_acc": 0.2870278398414728 + }, + { + "epoch": 2.992084432717678, + "grad_norm": 0.24753943025976863, + "learning_rate": 0.00048656423824446705, + "loss": 3.1451849937438965, + "step": 5104, + "token_acc": 0.28084009792552506 + }, + { + "epoch": 2.992670771034887, + "grad_norm": 0.28821650128826953, + "learning_rate": 0.0004865564006992934, + "loss": 3.151597499847412, + "step": 5105, + "token_acc": 0.2823802758932009 + }, + { + "epoch": 2.993257109352096, + "grad_norm": 0.2614114724031269, + "learning_rate": 0.0004865485609319911, + "loss": 3.161609172821045, + "step": 5106, + "token_acc": 0.27931655436091335 + }, + { + "epoch": 2.993843447669305, + "grad_norm": 0.23257223724618717, + "learning_rate": 0.0004865407189426339, + "loss": 3.119357109069824, + "step": 5107, + "token_acc": 0.28619423598977567 + }, + { + "epoch": 2.9944297859865143, + "grad_norm": 0.2860632164956374, + "learning_rate": 0.0004865328747312953, + "loss": 3.130225658416748, + "step": 5108, + "token_acc": 0.2853936817178839 + }, + { + "epoch": 2.9950161243037234, + "grad_norm": 0.22942938460325435, + "learning_rate": 0.0004865250282980491, + "loss": 3.128994941711426, + "step": 5109, + "token_acc": 0.28547814341579253 + }, + { + "epoch": 2.9956024626209325, + "grad_norm": 0.252124695179662, + "learning_rate": 0.0004865171796429689, + "loss": 3.1368064880371094, + "step": 5110, + "token_acc": 0.28463896745324835 + }, + { + "epoch": 2.996188800938141, + "grad_norm": 0.2823795971154206, + "learning_rate": 0.0004865093287661286, + "loss": 3.153078079223633, + "step": 5111, + "token_acc": 0.280427370705167 + }, + { + "epoch": 2.9967751392553503, + "grad_norm": 0.2480082593861109, + "learning_rate": 0.00048650147566760196, + "loss": 3.1281285285949707, + "step": 5112, + "token_acc": 0.28476552122888443 + }, + { + "epoch": 2.9973614775725594, + "grad_norm": 0.3097303307069442, + "learning_rate": 0.0004864936203474625, + "loss": 3.1273326873779297, + "step": 5113, + "token_acc": 0.28605087000306967 + }, + { + "epoch": 2.9979478158897686, + "grad_norm": 0.2413854207360571, + "learning_rate": 0.0004864857628057842, + "loss": 3.089822769165039, + "step": 5114, + "token_acc": 0.2903577966901602 + }, + { + "epoch": 2.998534154206977, + "grad_norm": 0.24080071615322693, + "learning_rate": 0.00048647790304264085, + "loss": 3.1740894317626953, + "step": 5115, + "token_acc": 0.27804753793004633 + }, + { + "epoch": 2.9991204925241863, + "grad_norm": 0.30992952844119925, + "learning_rate": 0.0004864700410581062, + "loss": 3.0656604766845703, + "step": 5116, + "token_acc": 0.2934852579756528 + }, + { + "epoch": 2.9997068308413954, + "grad_norm": 0.277401516394314, + "learning_rate": 0.0004864621768522542, + "loss": 3.1326990127563477, + "step": 5117, + "token_acc": 0.28198034437297753 + }, + { + "epoch": 3.0, + "grad_norm": 0.3087410535085222, + "learning_rate": 0.00048645431042515866, + "loss": 3.124927520751953, + "step": 5118, + "token_acc": 0.2873822204014748 + }, + { + "epoch": 3.0, + "eval_loss": 3.1179864406585693, + "eval_runtime": 6.6091, + "eval_samples_per_second": 38.734, + "eval_steps_per_second": 4.842, + "eval_token_acc": 0.2856722886310509, + "step": 5118 + }, + { + "epoch": 3.000586338317209, + "grad_norm": 0.2882160028749108, + "learning_rate": 0.0004864464417768936, + "loss": 3.117948055267334, + "step": 5119, + "token_acc": 0.28441056772106704 + }, + { + "epoch": 3.0011726766344182, + "grad_norm": 0.31834905557074633, + "learning_rate": 0.0004864385709075327, + "loss": 3.046848773956299, + "step": 5120, + "token_acc": 0.29483086298091427 + }, + { + "epoch": 3.001759014951627, + "grad_norm": 0.2644584368104796, + "learning_rate": 0.0004864306978171501, + "loss": 3.049337387084961, + "step": 5121, + "token_acc": 0.2947937634425712 + }, + { + "epoch": 3.002345353268836, + "grad_norm": 0.26222329268513556, + "learning_rate": 0.00048642282250581966, + "loss": 3.0931153297424316, + "step": 5122, + "token_acc": 0.28815215060453075 + }, + { + "epoch": 3.002931691586045, + "grad_norm": 0.3319241627417302, + "learning_rate": 0.00048641494497361537, + "loss": 3.059563398361206, + "step": 5123, + "token_acc": 0.2925396804762957 + }, + { + "epoch": 3.0035180299032542, + "grad_norm": 0.2714821117033217, + "learning_rate": 0.0004864070652206113, + "loss": 3.0870728492736816, + "step": 5124, + "token_acc": 0.28991021234646847 + }, + { + "epoch": 3.0041043682204633, + "grad_norm": 0.3172501950692396, + "learning_rate": 0.00048639918324688136, + "loss": 3.075063705444336, + "step": 5125, + "token_acc": 0.2919139446548876 + }, + { + "epoch": 3.0046907065376725, + "grad_norm": 0.27970886764511665, + "learning_rate": 0.0004863912990524997, + "loss": 3.0813117027282715, + "step": 5126, + "token_acc": 0.2898245537764277 + }, + { + "epoch": 3.005277044854881, + "grad_norm": 0.29214820361802823, + "learning_rate": 0.0004863834126375403, + "loss": 3.08788800239563, + "step": 5127, + "token_acc": 0.2879080821162637 + }, + { + "epoch": 3.0058633831720902, + "grad_norm": 0.31198781015029076, + "learning_rate": 0.0004863755240020773, + "loss": 3.0655016899108887, + "step": 5128, + "token_acc": 0.29140818165814025 + }, + { + "epoch": 3.0064497214892993, + "grad_norm": 0.27062728481234727, + "learning_rate": 0.0004863676331461847, + "loss": 3.065094232559204, + "step": 5129, + "token_acc": 0.2924741821462717 + }, + { + "epoch": 3.0070360598065085, + "grad_norm": 0.305095067684652, + "learning_rate": 0.00048635974006993677, + "loss": 3.0489299297332764, + "step": 5130, + "token_acc": 0.29337991908789995 + }, + { + "epoch": 3.0076223981237176, + "grad_norm": 0.23105103192595688, + "learning_rate": 0.0004863518447734075, + "loss": 3.062300682067871, + "step": 5131, + "token_acc": 0.293084426522514 + }, + { + "epoch": 3.0082087364409262, + "grad_norm": 0.29156645183418645, + "learning_rate": 0.0004863439472566712, + "loss": 3.086378335952759, + "step": 5132, + "token_acc": 0.28889833116796454 + }, + { + "epoch": 3.0087950747581353, + "grad_norm": 0.2643919033989638, + "learning_rate": 0.000486336047519802, + "loss": 3.059706687927246, + "step": 5133, + "token_acc": 0.2935018408235208 + }, + { + "epoch": 3.0093814130753445, + "grad_norm": 0.21485544492412725, + "learning_rate": 0.0004863281455628741, + "loss": 3.010554313659668, + "step": 5134, + "token_acc": 0.3008202619846144 + }, + { + "epoch": 3.0099677513925536, + "grad_norm": 0.272067651041854, + "learning_rate": 0.0004863202413859617, + "loss": 3.094364643096924, + "step": 5135, + "token_acc": 0.28657185385446843 + }, + { + "epoch": 3.0105540897097627, + "grad_norm": 0.2200032287618293, + "learning_rate": 0.00048631233498913905, + "loss": 3.029228925704956, + "step": 5136, + "token_acc": 0.29739917264702626 + }, + { + "epoch": 3.0111404280269713, + "grad_norm": 0.24699938399702395, + "learning_rate": 0.0004863044263724805, + "loss": 3.0791375637054443, + "step": 5137, + "token_acc": 0.29090153917426137 + }, + { + "epoch": 3.0117267663441805, + "grad_norm": 0.22653257312763628, + "learning_rate": 0.0004862965155360603, + "loss": 3.0522570610046387, + "step": 5138, + "token_acc": 0.29530881296809275 + }, + { + "epoch": 3.0123131046613896, + "grad_norm": 0.26947683095703245, + "learning_rate": 0.00048628860247995273, + "loss": 3.0387346744537354, + "step": 5139, + "token_acc": 0.2943674080794122 + }, + { + "epoch": 3.0128994429785987, + "grad_norm": 0.27191928943213844, + "learning_rate": 0.0004862806872042321, + "loss": 3.0484466552734375, + "step": 5140, + "token_acc": 0.29416462090514967 + }, + { + "epoch": 3.013485781295808, + "grad_norm": 0.24223586743640826, + "learning_rate": 0.0004862727697089728, + "loss": 3.0574183464050293, + "step": 5141, + "token_acc": 0.29256974052230755 + }, + { + "epoch": 3.014072119613017, + "grad_norm": 0.28356402252226, + "learning_rate": 0.0004862648499942493, + "loss": 3.065389633178711, + "step": 5142, + "token_acc": 0.29116122692033247 + }, + { + "epoch": 3.0146584579302256, + "grad_norm": 0.22051485975272037, + "learning_rate": 0.00048625692806013586, + "loss": 3.061725616455078, + "step": 5143, + "token_acc": 0.29221882005439553 + }, + { + "epoch": 3.0152447962474347, + "grad_norm": 0.2836496213321387, + "learning_rate": 0.00048624900390670695, + "loss": 3.062709331512451, + "step": 5144, + "token_acc": 0.2916811046359007 + }, + { + "epoch": 3.015831134564644, + "grad_norm": 0.2313336321943706, + "learning_rate": 0.000486241077534037, + "loss": 3.11256742477417, + "step": 5145, + "token_acc": 0.2832080326958979 + }, + { + "epoch": 3.016417472881853, + "grad_norm": 0.2623036074637004, + "learning_rate": 0.00048623314894220046, + "loss": 3.0813961029052734, + "step": 5146, + "token_acc": 0.2885755088436222 + }, + { + "epoch": 3.017003811199062, + "grad_norm": 0.2366062036052034, + "learning_rate": 0.00048622521813127174, + "loss": 3.0432567596435547, + "step": 5147, + "token_acc": 0.29490412547833694 + }, + { + "epoch": 3.0175901495162707, + "grad_norm": 0.21381505772151607, + "learning_rate": 0.0004862172851013255, + "loss": 3.033277750015259, + "step": 5148, + "token_acc": 0.297480382540461 + }, + { + "epoch": 3.01817648783348, + "grad_norm": 0.24359287788688994, + "learning_rate": 0.00048620934985243617, + "loss": 3.110724687576294, + "step": 5149, + "token_acc": 0.28736121002823195 + }, + { + "epoch": 3.018762826150689, + "grad_norm": 0.20885551861776722, + "learning_rate": 0.0004862014123846783, + "loss": 3.1032533645629883, + "step": 5150, + "token_acc": 0.2878907569642181 + }, + { + "epoch": 3.019349164467898, + "grad_norm": 0.27182600149614233, + "learning_rate": 0.0004861934726981264, + "loss": 3.0839195251464844, + "step": 5151, + "token_acc": 0.289646867736425 + }, + { + "epoch": 3.019935502785107, + "grad_norm": 0.23746196887024953, + "learning_rate": 0.0004861855307928551, + "loss": 3.073615550994873, + "step": 5152, + "token_acc": 0.2894042917843784 + }, + { + "epoch": 3.0205218411023163, + "grad_norm": 0.21783195713987594, + "learning_rate": 0.00048617758666893903, + "loss": 3.025961399078369, + "step": 5153, + "token_acc": 0.29741202210489553 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.22052411329225424, + "learning_rate": 0.0004861696403264528, + "loss": 3.10245418548584, + "step": 5154, + "token_acc": 0.2879073228788583 + }, + { + "epoch": 3.021694517736734, + "grad_norm": 0.260452859086525, + "learning_rate": 0.000486161691765471, + "loss": 3.053095817565918, + "step": 5155, + "token_acc": 0.29357544968100985 + }, + { + "epoch": 3.022280856053943, + "grad_norm": 0.27201297577491657, + "learning_rate": 0.00048615374098606837, + "loss": 3.0812697410583496, + "step": 5156, + "token_acc": 0.29122008573306485 + }, + { + "epoch": 3.0228671943711523, + "grad_norm": 0.23566806425581, + "learning_rate": 0.00048614578798831956, + "loss": 3.092101573944092, + "step": 5157, + "token_acc": 0.2885811918039659 + }, + { + "epoch": 3.0234535326883614, + "grad_norm": 0.20298902788148207, + "learning_rate": 0.0004861378327722993, + "loss": 3.0941519737243652, + "step": 5158, + "token_acc": 0.2876100148695897 + }, + { + "epoch": 3.02403987100557, + "grad_norm": 0.226501295528511, + "learning_rate": 0.0004861298753380822, + "loss": 3.0533154010772705, + "step": 5159, + "token_acc": 0.2940406834489757 + }, + { + "epoch": 3.024626209322779, + "grad_norm": 0.2826830258379027, + "learning_rate": 0.0004861219156857432, + "loss": 3.0858664512634277, + "step": 5160, + "token_acc": 0.28827576512483305 + }, + { + "epoch": 3.0252125476399883, + "grad_norm": 0.2657235648218342, + "learning_rate": 0.0004861139538153569, + "loss": 3.0944931507110596, + "step": 5161, + "token_acc": 0.2868873820242518 + }, + { + "epoch": 3.0257988859571974, + "grad_norm": 0.21443986378105956, + "learning_rate": 0.0004861059897269983, + "loss": 3.0560765266418457, + "step": 5162, + "token_acc": 0.29272467675845526 + }, + { + "epoch": 3.0263852242744065, + "grad_norm": 0.22073308017767562, + "learning_rate": 0.00048609802342074204, + "loss": 3.0360772609710693, + "step": 5163, + "token_acc": 0.2964008951087199 + }, + { + "epoch": 3.026971562591615, + "grad_norm": 0.2530275578061761, + "learning_rate": 0.00048609005489666296, + "loss": 3.103590965270996, + "step": 5164, + "token_acc": 0.28694595907790316 + }, + { + "epoch": 3.0275579009088243, + "grad_norm": 0.22060595795542087, + "learning_rate": 0.000486082084154836, + "loss": 3.0465476512908936, + "step": 5165, + "token_acc": 0.2947382096920878 + }, + { + "epoch": 3.0281442392260334, + "grad_norm": 0.21208614628522823, + "learning_rate": 0.00048607411119533595, + "loss": 3.059967279434204, + "step": 5166, + "token_acc": 0.29185977514461897 + }, + { + "epoch": 3.0287305775432425, + "grad_norm": 0.23787190969563315, + "learning_rate": 0.0004860661360182377, + "loss": 3.055058479309082, + "step": 5167, + "token_acc": 0.29266713335666783 + }, + { + "epoch": 3.0293169158604516, + "grad_norm": 0.3774276247385556, + "learning_rate": 0.00048605815862361624, + "loss": 3.095343589782715, + "step": 5168, + "token_acc": 0.2881098570737051 + }, + { + "epoch": 3.0299032541776607, + "grad_norm": 0.5157414160119536, + "learning_rate": 0.00048605017901154644, + "loss": 3.095182418823242, + "step": 5169, + "token_acc": 0.2881034609823101 + }, + { + "epoch": 3.0304895924948694, + "grad_norm": 0.24417588156403958, + "learning_rate": 0.0004860421971821034, + "loss": 3.039158344268799, + "step": 5170, + "token_acc": 0.2954391283149259 + }, + { + "epoch": 3.0310759308120785, + "grad_norm": 0.33770151795217096, + "learning_rate": 0.0004860342131353619, + "loss": 3.0692014694213867, + "step": 5171, + "token_acc": 0.2903390493763492 + }, + { + "epoch": 3.0316622691292876, + "grad_norm": 0.21440975365850581, + "learning_rate": 0.0004860262268713971, + "loss": 3.048516273498535, + "step": 5172, + "token_acc": 0.29338846250379785 + }, + { + "epoch": 3.0322486074464967, + "grad_norm": 0.27213364220646075, + "learning_rate": 0.0004860182383902838, + "loss": 3.0958354473114014, + "step": 5173, + "token_acc": 0.28788101899572965 + }, + { + "epoch": 3.032834945763706, + "grad_norm": 0.23370546792198157, + "learning_rate": 0.00048601024769209735, + "loss": 3.089506149291992, + "step": 5174, + "token_acc": 0.28906534029537756 + }, + { + "epoch": 3.0334212840809145, + "grad_norm": 0.2523841166717806, + "learning_rate": 0.0004860022547769125, + "loss": 3.0423972606658936, + "step": 5175, + "token_acc": 0.2937526016020793 + }, + { + "epoch": 3.0340076223981236, + "grad_norm": 0.26197981283164257, + "learning_rate": 0.0004859942596448046, + "loss": 3.0522491931915283, + "step": 5176, + "token_acc": 0.2950880639775114 + }, + { + "epoch": 3.0345939607153327, + "grad_norm": 0.21591051042242448, + "learning_rate": 0.00048598626229584866, + "loss": 3.079040050506592, + "step": 5177, + "token_acc": 0.2906592160716008 + }, + { + "epoch": 3.035180299032542, + "grad_norm": 0.2861028648431269, + "learning_rate": 0.0004859782627301197, + "loss": 3.1199498176574707, + "step": 5178, + "token_acc": 0.2834390323635175 + }, + { + "epoch": 3.035766637349751, + "grad_norm": 0.22137764469169652, + "learning_rate": 0.00048597026094769294, + "loss": 3.053640842437744, + "step": 5179, + "token_acc": 0.29326542515518894 + }, + { + "epoch": 3.03635297566696, + "grad_norm": 0.29187842657416035, + "learning_rate": 0.0004859622569486436, + "loss": 3.0686140060424805, + "step": 5180, + "token_acc": 0.29211527891025324 + }, + { + "epoch": 3.0369393139841687, + "grad_norm": 0.22940034649296087, + "learning_rate": 0.00048595425073304677, + "loss": 3.1228342056274414, + "step": 5181, + "token_acc": 0.28545669508133 + }, + { + "epoch": 3.037525652301378, + "grad_norm": 0.2958295550146608, + "learning_rate": 0.00048594624230097774, + "loss": 3.081361770629883, + "step": 5182, + "token_acc": 0.28991781951610335 + }, + { + "epoch": 3.038111990618587, + "grad_norm": 0.2812588592185084, + "learning_rate": 0.00048593823165251173, + "loss": 3.116501808166504, + "step": 5183, + "token_acc": 0.2847373854452421 + }, + { + "epoch": 3.038698328935796, + "grad_norm": 0.3165643224125176, + "learning_rate": 0.0004859302187877239, + "loss": 3.0625205039978027, + "step": 5184, + "token_acc": 0.2930380914722552 + }, + { + "epoch": 3.039284667253005, + "grad_norm": 0.24699632189106552, + "learning_rate": 0.0004859222037066896, + "loss": 3.0394372940063477, + "step": 5185, + "token_acc": 0.2956459074073093 + }, + { + "epoch": 3.039871005570214, + "grad_norm": 0.21267587083791145, + "learning_rate": 0.00048591418640948415, + "loss": 3.062208652496338, + "step": 5186, + "token_acc": 0.2922067610218949 + }, + { + "epoch": 3.040457343887423, + "grad_norm": 0.23055548758078567, + "learning_rate": 0.00048590616689618283, + "loss": 3.111326217651367, + "step": 5187, + "token_acc": 0.28780197086665466 + }, + { + "epoch": 3.041043682204632, + "grad_norm": 0.23274274612381035, + "learning_rate": 0.0004858981451668609, + "loss": 3.1234617233276367, + "step": 5188, + "token_acc": 0.2853598141969318 + }, + { + "epoch": 3.041630020521841, + "grad_norm": 0.23070190152065695, + "learning_rate": 0.0004858901212215938, + "loss": 3.0723695755004883, + "step": 5189, + "token_acc": 0.2929783340957292 + }, + { + "epoch": 3.0422163588390503, + "grad_norm": 0.23384417080529668, + "learning_rate": 0.0004858820950604569, + "loss": 3.0617289543151855, + "step": 5190, + "token_acc": 0.2930191550328103 + }, + { + "epoch": 3.042802697156259, + "grad_norm": 0.23651873002484405, + "learning_rate": 0.0004858740666835255, + "loss": 3.078335762023926, + "step": 5191, + "token_acc": 0.2887696685680794 + }, + { + "epoch": 3.043389035473468, + "grad_norm": 0.27029208020735157, + "learning_rate": 0.00048586603609087513, + "loss": 3.0879404544830322, + "step": 5192, + "token_acc": 0.2889154342547749 + }, + { + "epoch": 3.043975373790677, + "grad_norm": 0.20096834971336394, + "learning_rate": 0.0004858580032825812, + "loss": 3.052816390991211, + "step": 5193, + "token_acc": 0.2940693770980977 + }, + { + "epoch": 3.0445617121078863, + "grad_norm": 0.2984639972852076, + "learning_rate": 0.00048584996825871914, + "loss": 3.0756049156188965, + "step": 5194, + "token_acc": 0.2898000604234288 + }, + { + "epoch": 3.0451480504250954, + "grad_norm": 0.21713853605936775, + "learning_rate": 0.00048584193101936445, + "loss": 3.0888099670410156, + "step": 5195, + "token_acc": 0.2885610343490418 + }, + { + "epoch": 3.0457343887423045, + "grad_norm": 0.27219213045597174, + "learning_rate": 0.0004858338915645926, + "loss": 3.0222482681274414, + "step": 5196, + "token_acc": 0.2998593935156505 + }, + { + "epoch": 3.046320727059513, + "grad_norm": 0.32521878313062497, + "learning_rate": 0.00048582584989447907, + "loss": 3.0886552333831787, + "step": 5197, + "token_acc": 0.28827402683309855 + }, + { + "epoch": 3.0469070653767223, + "grad_norm": 0.24705784865921382, + "learning_rate": 0.00048581780600909957, + "loss": 3.0708184242248535, + "step": 5198, + "token_acc": 0.2902632296518273 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.36636533294246215, + "learning_rate": 0.0004858097599085295, + "loss": 3.062175989151001, + "step": 5199, + "token_acc": 0.29109758104327793 + }, + { + "epoch": 3.0480797420111405, + "grad_norm": 0.2326016431056734, + "learning_rate": 0.0004858017115928445, + "loss": 3.0578935146331787, + "step": 5200, + "token_acc": 0.2926591789013685 + }, + { + "epoch": 3.0486660803283496, + "grad_norm": 0.30422837007606635, + "learning_rate": 0.0004857936610621202, + "loss": 3.0532772541046143, + "step": 5201, + "token_acc": 0.2936129039117907 + }, + { + "epoch": 3.0492524186455583, + "grad_norm": 0.24142606601121344, + "learning_rate": 0.00048578560831643214, + "loss": 3.0546822547912598, + "step": 5202, + "token_acc": 0.29267986495437937 + }, + { + "epoch": 3.0498387569627674, + "grad_norm": 0.29826665916570516, + "learning_rate": 0.00048577755335585604, + "loss": 3.0806407928466797, + "step": 5203, + "token_acc": 0.2905272463627494 + }, + { + "epoch": 3.0504250952799765, + "grad_norm": 0.26946616616855495, + "learning_rate": 0.0004857694961804675, + "loss": 3.0651307106018066, + "step": 5204, + "token_acc": 0.29360836930101253 + }, + { + "epoch": 3.0510114335971856, + "grad_norm": 0.2611873108703457, + "learning_rate": 0.0004857614367903423, + "loss": 3.080674171447754, + "step": 5205, + "token_acc": 0.2897982445542838 + }, + { + "epoch": 3.0515977719143947, + "grad_norm": 0.2839189679525068, + "learning_rate": 0.0004857533751855561, + "loss": 3.095982551574707, + "step": 5206, + "token_acc": 0.2876162698635665 + }, + { + "epoch": 3.052184110231604, + "grad_norm": 0.21530893964637418, + "learning_rate": 0.0004857453113661846, + "loss": 3.101893424987793, + "step": 5207, + "token_acc": 0.286405318897111 + }, + { + "epoch": 3.0527704485488125, + "grad_norm": 0.2689238599398576, + "learning_rate": 0.00048573724533230355, + "loss": 3.0780866146087646, + "step": 5208, + "token_acc": 0.2902573864644148 + }, + { + "epoch": 3.0533567868660216, + "grad_norm": 0.21577878617149543, + "learning_rate": 0.0004857291770839887, + "loss": 3.124995470046997, + "step": 5209, + "token_acc": 0.2857989763025148 + }, + { + "epoch": 3.0539431251832307, + "grad_norm": 0.26754186806587926, + "learning_rate": 0.000485721106621316, + "loss": 3.073413133621216, + "step": 5210, + "token_acc": 0.29073124499398634 + }, + { + "epoch": 3.05452946350044, + "grad_norm": 0.2979932904404805, + "learning_rate": 0.0004857130339443611, + "loss": 3.106566905975342, + "step": 5211, + "token_acc": 0.2863657422371625 + }, + { + "epoch": 3.055115801817649, + "grad_norm": 0.2614817130852734, + "learning_rate": 0.00048570495905319975, + "loss": 3.042097568511963, + "step": 5212, + "token_acc": 0.2936354651457069 + }, + { + "epoch": 3.0557021401348576, + "grad_norm": 0.24640113427084576, + "learning_rate": 0.0004856968819479081, + "loss": 3.061169385910034, + "step": 5213, + "token_acc": 0.29207875945859535 + }, + { + "epoch": 3.0562884784520667, + "grad_norm": 0.23367505889205795, + "learning_rate": 0.0004856888026285617, + "loss": 3.0874786376953125, + "step": 5214, + "token_acc": 0.2894607013043119 + }, + { + "epoch": 3.056874816769276, + "grad_norm": 0.2098683440937314, + "learning_rate": 0.00048568072109523674, + "loss": 3.053040027618408, + "step": 5215, + "token_acc": 0.29335893410827574 + }, + { + "epoch": 3.057461155086485, + "grad_norm": 0.23098426715655707, + "learning_rate": 0.00048567263734800893, + "loss": 3.0737555027008057, + "step": 5216, + "token_acc": 0.290288427011385 + }, + { + "epoch": 3.058047493403694, + "grad_norm": 0.21248676131412744, + "learning_rate": 0.0004856645513869542, + "loss": 3.0830187797546387, + "step": 5217, + "token_acc": 0.2882682661470595 + }, + { + "epoch": 3.0586338317209028, + "grad_norm": 0.22576576993133143, + "learning_rate": 0.00048565646321214865, + "loss": 3.065706491470337, + "step": 5218, + "token_acc": 0.29212273369767666 + }, + { + "epoch": 3.059220170038112, + "grad_norm": 0.22977289001017456, + "learning_rate": 0.00048564837282366813, + "loss": 3.0613574981689453, + "step": 5219, + "token_acc": 0.29184455539063553 + }, + { + "epoch": 3.059806508355321, + "grad_norm": 0.26160483651925204, + "learning_rate": 0.00048564028022158874, + "loss": 3.0220696926116943, + "step": 5220, + "token_acc": 0.29893590429320555 + }, + { + "epoch": 3.06039284667253, + "grad_norm": 0.2170517412378351, + "learning_rate": 0.0004856321854059864, + "loss": 3.104980945587158, + "step": 5221, + "token_acc": 0.2884311842555493 + }, + { + "epoch": 3.060979184989739, + "grad_norm": 0.24205719991771996, + "learning_rate": 0.0004856240883769372, + "loss": 3.063933849334717, + "step": 5222, + "token_acc": 0.29482963707738924 + }, + { + "epoch": 3.0615655233069483, + "grad_norm": 0.2745472350216802, + "learning_rate": 0.0004856159891345172, + "loss": 3.073984146118164, + "step": 5223, + "token_acc": 0.2903301874294556 + }, + { + "epoch": 3.062151861624157, + "grad_norm": 0.2151189529096028, + "learning_rate": 0.0004856078876788025, + "loss": 3.098543643951416, + "step": 5224, + "token_acc": 0.2876095963264318 + }, + { + "epoch": 3.062738199941366, + "grad_norm": 0.25987197538088136, + "learning_rate": 0.0004855997840098692, + "loss": 3.043489933013916, + "step": 5225, + "token_acc": 0.2942860968135432 + }, + { + "epoch": 3.063324538258575, + "grad_norm": 0.20600653983720674, + "learning_rate": 0.00048559167812779335, + "loss": 3.07480525970459, + "step": 5226, + "token_acc": 0.29156108713038187 + }, + { + "epoch": 3.0639108765757843, + "grad_norm": 0.22839056602449656, + "learning_rate": 0.00048558357003265117, + "loss": 3.076939582824707, + "step": 5227, + "token_acc": 0.29157132111326656 + }, + { + "epoch": 3.0644972148929934, + "grad_norm": 0.2736531116092107, + "learning_rate": 0.00048557545972451884, + "loss": 3.0803170204162598, + "step": 5228, + "token_acc": 0.290480162349389 + }, + { + "epoch": 3.065083553210202, + "grad_norm": 0.3629298048101826, + "learning_rate": 0.0004855673472034725, + "loss": 3.0543994903564453, + "step": 5229, + "token_acc": 0.29361668405689906 + }, + { + "epoch": 3.065669891527411, + "grad_norm": 0.31196110360187296, + "learning_rate": 0.00048555923246958833, + "loss": 3.1105122566223145, + "step": 5230, + "token_acc": 0.28595971331966946 + }, + { + "epoch": 3.0662562298446203, + "grad_norm": 0.26968010099753403, + "learning_rate": 0.0004855511155229426, + "loss": 3.1003952026367188, + "step": 5231, + "token_acc": 0.2861464330798362 + }, + { + "epoch": 3.0668425681618294, + "grad_norm": 0.31679044683858004, + "learning_rate": 0.00048554299636361156, + "loss": 3.0746235847473145, + "step": 5232, + "token_acc": 0.29170201179218114 + }, + { + "epoch": 3.0674289064790385, + "grad_norm": 0.2380455912842498, + "learning_rate": 0.00048553487499167143, + "loss": 3.03865385055542, + "step": 5233, + "token_acc": 0.29671215711911963 + }, + { + "epoch": 3.068015244796247, + "grad_norm": 0.29084205010485037, + "learning_rate": 0.0004855267514071985, + "loss": 3.0861971378326416, + "step": 5234, + "token_acc": 0.290347847097026 + }, + { + "epoch": 3.0686015831134563, + "grad_norm": 0.270147042940698, + "learning_rate": 0.0004855186256102692, + "loss": 3.050185441970825, + "step": 5235, + "token_acc": 0.29399004286081515 + }, + { + "epoch": 3.0691879214306654, + "grad_norm": 0.21989156926929357, + "learning_rate": 0.00048551049760095976, + "loss": 3.0831711292266846, + "step": 5236, + "token_acc": 0.2899712454418403 + }, + { + "epoch": 3.0697742597478745, + "grad_norm": 0.24042272942448464, + "learning_rate": 0.0004855023673793466, + "loss": 3.074821710586548, + "step": 5237, + "token_acc": 0.29094616508517573 + }, + { + "epoch": 3.0703605980650837, + "grad_norm": 0.22862377657428393, + "learning_rate": 0.0004854942349455059, + "loss": 3.1099209785461426, + "step": 5238, + "token_acc": 0.28494912587008003 + }, + { + "epoch": 3.0709469363822928, + "grad_norm": 0.23513111823054786, + "learning_rate": 0.00048548610029951433, + "loss": 3.0856919288635254, + "step": 5239, + "token_acc": 0.28967279911775895 + }, + { + "epoch": 3.0715332746995014, + "grad_norm": 0.2491982801687533, + "learning_rate": 0.00048547796344144815, + "loss": 3.0947065353393555, + "step": 5240, + "token_acc": 0.2885461567867547 + }, + { + "epoch": 3.0721196130167105, + "grad_norm": 0.2475177679273553, + "learning_rate": 0.00048546982437138375, + "loss": 3.092134714126587, + "step": 5241, + "token_acc": 0.2902554831685151 + }, + { + "epoch": 3.0727059513339197, + "grad_norm": 0.22291407647683034, + "learning_rate": 0.0004854616830893977, + "loss": 3.0829644203186035, + "step": 5242, + "token_acc": 0.28947767816949416 + }, + { + "epoch": 3.0732922896511288, + "grad_norm": 0.23249588807185387, + "learning_rate": 0.00048545353959556636, + "loss": 3.072159767150879, + "step": 5243, + "token_acc": 0.2922139708857852 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.23048170405392537, + "learning_rate": 0.0004854453938899664, + "loss": 3.1043477058410645, + "step": 5244, + "token_acc": 0.28690024980618484 + }, + { + "epoch": 3.0744649662855466, + "grad_norm": 0.2105993301897914, + "learning_rate": 0.00048543724597267416, + "loss": 3.052957057952881, + "step": 5245, + "token_acc": 0.29426410991141005 + }, + { + "epoch": 3.0750513046027557, + "grad_norm": 0.2305878945979558, + "learning_rate": 0.00048542909584376625, + "loss": 3.0958142280578613, + "step": 5246, + "token_acc": 0.28880178318731603 + }, + { + "epoch": 3.0756376429199648, + "grad_norm": 0.2459117518914884, + "learning_rate": 0.0004854209435033193, + "loss": 3.0506558418273926, + "step": 5247, + "token_acc": 0.29318781053495746 + }, + { + "epoch": 3.076223981237174, + "grad_norm": 0.23206553952241118, + "learning_rate": 0.00048541278895140974, + "loss": 3.075748920440674, + "step": 5248, + "token_acc": 0.289481464575996 + }, + { + "epoch": 3.076810319554383, + "grad_norm": 0.19216039227917162, + "learning_rate": 0.00048540463218811424, + "loss": 3.029987335205078, + "step": 5249, + "token_acc": 0.2964475063732489 + }, + { + "epoch": 3.077396657871592, + "grad_norm": 0.22159583984247275, + "learning_rate": 0.0004853964732135095, + "loss": 3.1173555850982666, + "step": 5250, + "token_acc": 0.28369109198044434 + }, + { + "epoch": 3.077982996188801, + "grad_norm": 0.2565723282741172, + "learning_rate": 0.0004853883120276721, + "loss": 3.091160774230957, + "step": 5251, + "token_acc": 0.2897236199694454 + }, + { + "epoch": 3.07856933450601, + "grad_norm": 0.24045751366869014, + "learning_rate": 0.0004853801486306786, + "loss": 3.0558314323425293, + "step": 5252, + "token_acc": 0.2932082863536012 + }, + { + "epoch": 3.079155672823219, + "grad_norm": 0.2666021705203025, + "learning_rate": 0.0004853719830226059, + "loss": 3.0953288078308105, + "step": 5253, + "token_acc": 0.28738323697364476 + }, + { + "epoch": 3.079742011140428, + "grad_norm": 0.2685631373940192, + "learning_rate": 0.00048536381520353043, + "loss": 3.0666189193725586, + "step": 5254, + "token_acc": 0.29048242505986516 + }, + { + "epoch": 3.0803283494576372, + "grad_norm": 0.24492814729613077, + "learning_rate": 0.00048535564517352927, + "loss": 3.120121955871582, + "step": 5255, + "token_acc": 0.283545353471065 + }, + { + "epoch": 3.080914687774846, + "grad_norm": 0.23527681622175808, + "learning_rate": 0.0004853474729326788, + "loss": 3.072406053543091, + "step": 5256, + "token_acc": 0.28882249034150775 + }, + { + "epoch": 3.081501026092055, + "grad_norm": 0.2908396629971575, + "learning_rate": 0.00048533929848105606, + "loss": 3.065654754638672, + "step": 5257, + "token_acc": 0.29263950577613773 + }, + { + "epoch": 3.082087364409264, + "grad_norm": 0.3105124866618194, + "learning_rate": 0.00048533112181873775, + "loss": 3.0710458755493164, + "step": 5258, + "token_acc": 0.2911501322540553 + }, + { + "epoch": 3.0826737027264732, + "grad_norm": 0.2927566062707977, + "learning_rate": 0.0004853229429458006, + "loss": 3.0859200954437256, + "step": 5259, + "token_acc": 0.29062255632542183 + }, + { + "epoch": 3.0832600410436823, + "grad_norm": 0.26433959481425817, + "learning_rate": 0.0004853147618623215, + "loss": 3.082026720046997, + "step": 5260, + "token_acc": 0.28819525688822334 + }, + { + "epoch": 3.0838463793608915, + "grad_norm": 0.2348281596231774, + "learning_rate": 0.00048530657856837736, + "loss": 3.1209797859191895, + "step": 5261, + "token_acc": 0.28373229531429056 + }, + { + "epoch": 3.0844327176781, + "grad_norm": 0.28384059459679134, + "learning_rate": 0.0004852983930640449, + "loss": 3.087246894836426, + "step": 5262, + "token_acc": 0.28995229115789256 + }, + { + "epoch": 3.0850190559953092, + "grad_norm": 0.2880180579183701, + "learning_rate": 0.00048529020534940115, + "loss": 3.147364616394043, + "step": 5263, + "token_acc": 0.2826135982210288 + }, + { + "epoch": 3.0856053943125183, + "grad_norm": 0.2690478163397538, + "learning_rate": 0.000485282015424523, + "loss": 3.0888078212738037, + "step": 5264, + "token_acc": 0.2916706397060886 + }, + { + "epoch": 3.0861917326297275, + "grad_norm": 0.2961167020785109, + "learning_rate": 0.00048527382328948735, + "loss": 3.0707712173461914, + "step": 5265, + "token_acc": 0.2924201375217404 + }, + { + "epoch": 3.0867780709469366, + "grad_norm": 0.24505964873609412, + "learning_rate": 0.00048526562894437116, + "loss": 3.0708301067352295, + "step": 5266, + "token_acc": 0.2925084559235798 + }, + { + "epoch": 3.0873644092641452, + "grad_norm": 0.2523145534302392, + "learning_rate": 0.0004852574323892514, + "loss": 3.1526927947998047, + "step": 5267, + "token_acc": 0.28015322961489453 + }, + { + "epoch": 3.0879507475813543, + "grad_norm": 0.26446104497246503, + "learning_rate": 0.00048524923362420515, + "loss": 3.105459213256836, + "step": 5268, + "token_acc": 0.2872541031311838 + }, + { + "epoch": 3.0885370858985635, + "grad_norm": 0.2644804666396965, + "learning_rate": 0.0004852410326493093, + "loss": 3.165639877319336, + "step": 5269, + "token_acc": 0.2787009935060166 + }, + { + "epoch": 3.0891234242157726, + "grad_norm": 0.3533793292216388, + "learning_rate": 0.00048523282946464084, + "loss": 3.098417282104492, + "step": 5270, + "token_acc": 0.2868668026199999 + }, + { + "epoch": 3.0897097625329817, + "grad_norm": 0.24459757383992764, + "learning_rate": 0.0004852246240702771, + "loss": 3.0434343814849854, + "step": 5271, + "token_acc": 0.29385356383568 + }, + { + "epoch": 3.0902961008501904, + "grad_norm": 0.25331396911565857, + "learning_rate": 0.0004852164164662949, + "loss": 3.05297589302063, + "step": 5272, + "token_acc": 0.2935096913819826 + }, + { + "epoch": 3.0908824391673995, + "grad_norm": 0.22470611493413228, + "learning_rate": 0.00048520820665277144, + "loss": 3.0502848625183105, + "step": 5273, + "token_acc": 0.29399595946553037 + }, + { + "epoch": 3.0914687774846086, + "grad_norm": 0.27873143917945054, + "learning_rate": 0.0004851999946297838, + "loss": 3.0773239135742188, + "step": 5274, + "token_acc": 0.2898796087283672 + }, + { + "epoch": 3.0920551158018177, + "grad_norm": 0.2553183012027645, + "learning_rate": 0.0004851917803974092, + "loss": 3.0916311740875244, + "step": 5275, + "token_acc": 0.28800445959785836 + }, + { + "epoch": 3.092641454119027, + "grad_norm": 0.22817695916192723, + "learning_rate": 0.0004851835639557247, + "loss": 3.068256139755249, + "step": 5276, + "token_acc": 0.2929437654580307 + }, + { + "epoch": 3.093227792436236, + "grad_norm": 0.2324256666105188, + "learning_rate": 0.00048517534530480755, + "loss": 3.1146092414855957, + "step": 5277, + "token_acc": 0.28521486136267915 + }, + { + "epoch": 3.0938141307534446, + "grad_norm": 0.23712258852232615, + "learning_rate": 0.0004851671244447349, + "loss": 3.0510077476501465, + "step": 5278, + "token_acc": 0.29443173538897355 + }, + { + "epoch": 3.0944004690706537, + "grad_norm": 0.23027849843841336, + "learning_rate": 0.00048515890137558406, + "loss": 3.0696325302124023, + "step": 5279, + "token_acc": 0.2922022506453511 + }, + { + "epoch": 3.094986807387863, + "grad_norm": 0.20010596700351374, + "learning_rate": 0.0004851506760974321, + "loss": 3.1069397926330566, + "step": 5280, + "token_acc": 0.2861826599767459 + }, + { + "epoch": 3.095573145705072, + "grad_norm": 0.22656025040393174, + "learning_rate": 0.00048514244861035664, + "loss": 3.0675530433654785, + "step": 5281, + "token_acc": 0.29302827990994157 + }, + { + "epoch": 3.096159484022281, + "grad_norm": 0.2220925769437115, + "learning_rate": 0.00048513421891443456, + "loss": 3.091341495513916, + "step": 5282, + "token_acc": 0.2889386990471916 + }, + { + "epoch": 3.0967458223394897, + "grad_norm": 0.23174505951330446, + "learning_rate": 0.00048512598700974335, + "loss": 3.0739829540252686, + "step": 5283, + "token_acc": 0.290051042320096 + }, + { + "epoch": 3.097332160656699, + "grad_norm": 0.21713762533140324, + "learning_rate": 0.0004851177528963604, + "loss": 3.07684063911438, + "step": 5284, + "token_acc": 0.29102620811677543 + }, + { + "epoch": 3.097918498973908, + "grad_norm": 0.19880068309360657, + "learning_rate": 0.0004851095165743629, + "loss": 3.110692024230957, + "step": 5285, + "token_acc": 0.2865157537617961 + }, + { + "epoch": 3.098504837291117, + "grad_norm": 0.2632894283567366, + "learning_rate": 0.00048510127804382835, + "loss": 3.094048500061035, + "step": 5286, + "token_acc": 0.2889997847214477 + }, + { + "epoch": 3.099091175608326, + "grad_norm": 0.26756388082820515, + "learning_rate": 0.0004850930373048341, + "loss": 3.0018835067749023, + "step": 5287, + "token_acc": 0.3015662201778397 + }, + { + "epoch": 3.099677513925535, + "grad_norm": 0.22573785091953907, + "learning_rate": 0.00048508479435745757, + "loss": 3.0963966846466064, + "step": 5288, + "token_acc": 0.28909743977530517 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.23687499423800748, + "learning_rate": 0.00048507654920177615, + "loss": 3.099174976348877, + "step": 5289, + "token_acc": 0.2869317106152806 + }, + { + "epoch": 3.100850190559953, + "grad_norm": 0.27328660563388435, + "learning_rate": 0.0004850683018378673, + "loss": 3.0650577545166016, + "step": 5290, + "token_acc": 0.29184046633073063 + }, + { + "epoch": 3.101436528877162, + "grad_norm": 0.26886454781954544, + "learning_rate": 0.0004850600522658086, + "loss": 3.075317621231079, + "step": 5291, + "token_acc": 0.29188111663512406 + }, + { + "epoch": 3.1020228671943713, + "grad_norm": 0.21866429340224428, + "learning_rate": 0.0004850518004856773, + "loss": 3.1019554138183594, + "step": 5292, + "token_acc": 0.2887729384789927 + }, + { + "epoch": 3.1026092055115804, + "grad_norm": 0.2502165669135519, + "learning_rate": 0.0004850435464975512, + "loss": 3.0623326301574707, + "step": 5293, + "token_acc": 0.29168286349571626 + }, + { + "epoch": 3.103195543828789, + "grad_norm": 0.23376412934713223, + "learning_rate": 0.00048503529030150775, + "loss": 3.111783504486084, + "step": 5294, + "token_acc": 0.28539289854188277 + }, + { + "epoch": 3.103781882145998, + "grad_norm": 0.21710513803628576, + "learning_rate": 0.0004850270318976243, + "loss": 3.0831615924835205, + "step": 5295, + "token_acc": 0.2898277330189297 + }, + { + "epoch": 3.1043682204632073, + "grad_norm": 0.20717101484088, + "learning_rate": 0.0004850187712859787, + "loss": 3.055173873901367, + "step": 5296, + "token_acc": 0.29347306781397464 + }, + { + "epoch": 3.1049545587804164, + "grad_norm": 0.2029586861226379, + "learning_rate": 0.0004850105084666484, + "loss": 3.0654706954956055, + "step": 5297, + "token_acc": 0.2934862383007271 + }, + { + "epoch": 3.1055408970976255, + "grad_norm": 0.2004458865589157, + "learning_rate": 0.0004850022434397111, + "loss": 3.065861701965332, + "step": 5298, + "token_acc": 0.29267319606984166 + }, + { + "epoch": 3.106127235414834, + "grad_norm": 0.22544908963570545, + "learning_rate": 0.0004849939762052443, + "loss": 3.1037955284118652, + "step": 5299, + "token_acc": 0.2848294344537394 + }, + { + "epoch": 3.1067135737320433, + "grad_norm": 0.23831678521993327, + "learning_rate": 0.0004849857067633259, + "loss": 3.0608458518981934, + "step": 5300, + "token_acc": 0.2922557701254564 + }, + { + "epoch": 3.1072999120492524, + "grad_norm": 0.23374211858360971, + "learning_rate": 0.0004849774351140333, + "loss": 3.141669511795044, + "step": 5301, + "token_acc": 0.28283381499432375 + }, + { + "epoch": 3.1078862503664615, + "grad_norm": 0.22371598794648104, + "learning_rate": 0.0004849691612574444, + "loss": 3.094804525375366, + "step": 5302, + "token_acc": 0.28672789584716457 + }, + { + "epoch": 3.1084725886836706, + "grad_norm": 0.2404846105501374, + "learning_rate": 0.0004849608851936368, + "loss": 3.0776517391204834, + "step": 5303, + "token_acc": 0.289813220974745 + }, + { + "epoch": 3.1090589270008797, + "grad_norm": 0.3060840663069173, + "learning_rate": 0.00048495260692268835, + "loss": 3.079890251159668, + "step": 5304, + "token_acc": 0.28916527712757284 + }, + { + "epoch": 3.1096452653180884, + "grad_norm": 0.2999643536338987, + "learning_rate": 0.0004849443264446767, + "loss": 3.0992326736450195, + "step": 5305, + "token_acc": 0.2880369744818466 + }, + { + "epoch": 3.1102316036352975, + "grad_norm": 0.22509195588169298, + "learning_rate": 0.0004849360437596797, + "loss": 3.0654430389404297, + "step": 5306, + "token_acc": 0.2930437166641743 + }, + { + "epoch": 3.1108179419525066, + "grad_norm": 0.3221086788026361, + "learning_rate": 0.00048492775886777517, + "loss": 3.084301710128784, + "step": 5307, + "token_acc": 0.2898172591961048 + }, + { + "epoch": 3.1114042802697157, + "grad_norm": 0.32032370696140006, + "learning_rate": 0.00048491947176904093, + "loss": 3.06071400642395, + "step": 5308, + "token_acc": 0.29273710592270147 + }, + { + "epoch": 3.111990618586925, + "grad_norm": 0.2331465000433474, + "learning_rate": 0.0004849111824635548, + "loss": 3.114689350128174, + "step": 5309, + "token_acc": 0.2861944432739522 + }, + { + "epoch": 3.1125769569041335, + "grad_norm": 0.2726687377570615, + "learning_rate": 0.00048490289095139475, + "loss": 3.0953264236450195, + "step": 5310, + "token_acc": 0.2878053843658774 + }, + { + "epoch": 3.1131632952213426, + "grad_norm": 0.21608582124177647, + "learning_rate": 0.00048489459723263844, + "loss": 3.0749454498291016, + "step": 5311, + "token_acc": 0.2920474555204092 + }, + { + "epoch": 3.1137496335385517, + "grad_norm": 0.2582227387398244, + "learning_rate": 0.000484886301307364, + "loss": 3.0242645740509033, + "step": 5312, + "token_acc": 0.29808135913068184 + }, + { + "epoch": 3.114335971855761, + "grad_norm": 0.21601353435900633, + "learning_rate": 0.00048487800317564925, + "loss": 3.077521800994873, + "step": 5313, + "token_acc": 0.28984336674776645 + }, + { + "epoch": 3.11492231017297, + "grad_norm": 0.2593558941101283, + "learning_rate": 0.00048486970283757213, + "loss": 3.058673620223999, + "step": 5314, + "token_acc": 0.29319421716656013 + }, + { + "epoch": 3.115508648490179, + "grad_norm": 0.2017696676825862, + "learning_rate": 0.00048486140029321064, + "loss": 3.076042890548706, + "step": 5315, + "token_acc": 0.2910030581995697 + }, + { + "epoch": 3.1160949868073877, + "grad_norm": 0.257433688741311, + "learning_rate": 0.0004848530955426428, + "loss": 3.0691733360290527, + "step": 5316, + "token_acc": 0.2915739652422131 + }, + { + "epoch": 3.116681325124597, + "grad_norm": 0.2546950586009729, + "learning_rate": 0.0004848447885859466, + "loss": 3.045759439468384, + "step": 5317, + "token_acc": 0.29363566341766745 + }, + { + "epoch": 3.117267663441806, + "grad_norm": 0.20880778957489565, + "learning_rate": 0.0004848364794232, + "loss": 3.087078809738159, + "step": 5318, + "token_acc": 0.2907360652553894 + }, + { + "epoch": 3.117854001759015, + "grad_norm": 0.2547893683665738, + "learning_rate": 0.0004848281680544812, + "loss": 3.058403491973877, + "step": 5319, + "token_acc": 0.29223249764450454 + }, + { + "epoch": 3.118440340076224, + "grad_norm": 0.19858354383454282, + "learning_rate": 0.0004848198544798682, + "loss": 3.01145339012146, + "step": 5320, + "token_acc": 0.3007392970830897 + }, + { + "epoch": 3.119026678393433, + "grad_norm": 0.2794088818526359, + "learning_rate": 0.00048481153869943904, + "loss": 3.074542999267578, + "step": 5321, + "token_acc": 0.2902742136228415 + }, + { + "epoch": 3.119613016710642, + "grad_norm": 0.29449820160115914, + "learning_rate": 0.00048480322071327195, + "loss": 3.047043800354004, + "step": 5322, + "token_acc": 0.2938045572600843 + }, + { + "epoch": 3.120199355027851, + "grad_norm": 0.24142235610320656, + "learning_rate": 0.00048479490052144494, + "loss": 3.064574718475342, + "step": 5323, + "token_acc": 0.29273269628861937 + }, + { + "epoch": 3.12078569334506, + "grad_norm": 0.24364992340981872, + "learning_rate": 0.00048478657812403624, + "loss": 3.032588005065918, + "step": 5324, + "token_acc": 0.29596231878783785 + }, + { + "epoch": 3.1213720316622693, + "grad_norm": 0.20400466666161168, + "learning_rate": 0.000484778253521124, + "loss": 3.031632900238037, + "step": 5325, + "token_acc": 0.2961214068206314 + }, + { + "epoch": 3.121958369979478, + "grad_norm": 0.27296219356350443, + "learning_rate": 0.0004847699267127865, + "loss": 3.0798611640930176, + "step": 5326, + "token_acc": 0.28916984671523466 + }, + { + "epoch": 3.122544708296687, + "grad_norm": 0.27509862934428064, + "learning_rate": 0.0004847615976991019, + "loss": 3.0601420402526855, + "step": 5327, + "token_acc": 0.29229890897282124 + }, + { + "epoch": 3.123131046613896, + "grad_norm": 0.2376644248015055, + "learning_rate": 0.00048475326648014837, + "loss": 3.0424389839172363, + "step": 5328, + "token_acc": 0.2949330659781925 + }, + { + "epoch": 3.1237173849311053, + "grad_norm": 0.20726554994579008, + "learning_rate": 0.0004847449330560043, + "loss": 3.0863256454467773, + "step": 5329, + "token_acc": 0.289624333362048 + }, + { + "epoch": 3.1243037232483144, + "grad_norm": 0.25046257941014555, + "learning_rate": 0.0004847365974267478, + "loss": 3.115145683288574, + "step": 5330, + "token_acc": 0.2848473445288629 + }, + { + "epoch": 3.1248900615655235, + "grad_norm": 0.24475307927658718, + "learning_rate": 0.00048472825959245736, + "loss": 3.0597195625305176, + "step": 5331, + "token_acc": 0.29335720981415064 + }, + { + "epoch": 3.125476399882732, + "grad_norm": 0.22238941553654298, + "learning_rate": 0.00048471991955321124, + "loss": 3.0325560569763184, + "step": 5332, + "token_acc": 0.296774110697316 + }, + { + "epoch": 3.1260627381999413, + "grad_norm": 0.23982642420961964, + "learning_rate": 0.00048471157730908777, + "loss": 3.0690784454345703, + "step": 5333, + "token_acc": 0.29068035526842767 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.2767881152244385, + "learning_rate": 0.00048470323286016524, + "loss": 3.070901870727539, + "step": 5334, + "token_acc": 0.2906237532964784 + }, + { + "epoch": 3.1272354148343595, + "grad_norm": 0.2805077892705656, + "learning_rate": 0.00048469488620652215, + "loss": 3.114424228668213, + "step": 5335, + "token_acc": 0.28730042086979757 + }, + { + "epoch": 3.1278217531515686, + "grad_norm": 0.24718539348355087, + "learning_rate": 0.0004846865373482369, + "loss": 3.070014715194702, + "step": 5336, + "token_acc": 0.2904370926773189 + }, + { + "epoch": 3.1284080914687773, + "grad_norm": 0.2365259547729129, + "learning_rate": 0.0004846781862853877, + "loss": 3.0651416778564453, + "step": 5337, + "token_acc": 0.29182760132939517 + }, + { + "epoch": 3.1289944297859864, + "grad_norm": 0.28748719063069794, + "learning_rate": 0.0004846698330180533, + "loss": 3.0983052253723145, + "step": 5338, + "token_acc": 0.288061741462763 + }, + { + "epoch": 3.1295807681031955, + "grad_norm": 0.28886662796127593, + "learning_rate": 0.00048466147754631206, + "loss": 3.0901296138763428, + "step": 5339, + "token_acc": 0.2880048198918238 + }, + { + "epoch": 3.1301671064204046, + "grad_norm": 0.2811265587975524, + "learning_rate": 0.00048465311987024246, + "loss": 3.0517702102661133, + "step": 5340, + "token_acc": 0.2936519311208782 + }, + { + "epoch": 3.1307534447376137, + "grad_norm": 0.2950524170626383, + "learning_rate": 0.000484644759989923, + "loss": 3.064511299133301, + "step": 5341, + "token_acc": 0.2917712758467698 + }, + { + "epoch": 3.1313397830548224, + "grad_norm": 0.2914655794971766, + "learning_rate": 0.0004846363979054321, + "loss": 3.1119766235351562, + "step": 5342, + "token_acc": 0.2865607367133622 + }, + { + "epoch": 3.1319261213720315, + "grad_norm": 0.2286959528273482, + "learning_rate": 0.0004846280336168485, + "loss": 3.049220561981201, + "step": 5343, + "token_acc": 0.2931493248088425 + }, + { + "epoch": 3.1325124596892406, + "grad_norm": 0.23448041818124438, + "learning_rate": 0.0004846196671242507, + "loss": 3.0791497230529785, + "step": 5344, + "token_acc": 0.2911683270798292 + }, + { + "epoch": 3.1330987980064497, + "grad_norm": 0.2542394106302144, + "learning_rate": 0.00048461129842771724, + "loss": 3.0905866622924805, + "step": 5345, + "token_acc": 0.2885547874702556 + }, + { + "epoch": 3.133685136323659, + "grad_norm": 0.2638202204267411, + "learning_rate": 0.0004846029275273268, + "loss": 3.102177143096924, + "step": 5346, + "token_acc": 0.28609541785163173 + }, + { + "epoch": 3.134271474640868, + "grad_norm": 0.3488620140066714, + "learning_rate": 0.000484594554423158, + "loss": 3.099637031555176, + "step": 5347, + "token_acc": 0.28690434627282546 + }, + { + "epoch": 3.1348578129580766, + "grad_norm": 0.2794835559970808, + "learning_rate": 0.00048458617911528945, + "loss": 3.1367764472961426, + "step": 5348, + "token_acc": 0.28238201920128075 + }, + { + "epoch": 3.1354441512752858, + "grad_norm": 0.2540916046055134, + "learning_rate": 0.00048457780160379986, + "loss": 3.0785274505615234, + "step": 5349, + "token_acc": 0.29003170879802376 + }, + { + "epoch": 3.136030489592495, + "grad_norm": 0.32584807891255, + "learning_rate": 0.00048456942188876797, + "loss": 3.0652501583099365, + "step": 5350, + "token_acc": 0.2923613947788629 + }, + { + "epoch": 3.136616827909704, + "grad_norm": 0.27031670283157927, + "learning_rate": 0.00048456103997027237, + "loss": 3.0286927223205566, + "step": 5351, + "token_acc": 0.2966758739430557 + }, + { + "epoch": 3.137203166226913, + "grad_norm": 0.3007439063100857, + "learning_rate": 0.00048455265584839194, + "loss": 3.056358575820923, + "step": 5352, + "token_acc": 0.2926556047443394 + }, + { + "epoch": 3.1377895045441218, + "grad_norm": 0.2327031701974762, + "learning_rate": 0.0004845442695232053, + "loss": 3.098090410232544, + "step": 5353, + "token_acc": 0.28827324965490675 + }, + { + "epoch": 3.138375842861331, + "grad_norm": 0.3038421216598257, + "learning_rate": 0.0004845358809947914, + "loss": 3.095351457595825, + "step": 5354, + "token_acc": 0.2870464634354313 + }, + { + "epoch": 3.13896218117854, + "grad_norm": 0.22523652142551745, + "learning_rate": 0.00048452749026322884, + "loss": 3.0919747352600098, + "step": 5355, + "token_acc": 0.2876081881151083 + }, + { + "epoch": 3.139548519495749, + "grad_norm": 0.28415555062441206, + "learning_rate": 0.00048451909732859656, + "loss": 3.109529972076416, + "step": 5356, + "token_acc": 0.28773102529960054 + }, + { + "epoch": 3.140134857812958, + "grad_norm": 0.2080174683453784, + "learning_rate": 0.00048451070219097345, + "loss": 3.1018543243408203, + "step": 5357, + "token_acc": 0.28869290294750466 + }, + { + "epoch": 3.1407211961301673, + "grad_norm": 0.29649041923947733, + "learning_rate": 0.00048450230485043823, + "loss": 3.0854685306549072, + "step": 5358, + "token_acc": 0.2884209081132205 + }, + { + "epoch": 3.141307534447376, + "grad_norm": 0.2353750921014541, + "learning_rate": 0.0004844939053070699, + "loss": 3.074301242828369, + "step": 5359, + "token_acc": 0.28948242254335116 + }, + { + "epoch": 3.141893872764585, + "grad_norm": 0.2774620941204048, + "learning_rate": 0.0004844855035609472, + "loss": 3.0552220344543457, + "step": 5360, + "token_acc": 0.2931546678465331 + }, + { + "epoch": 3.142480211081794, + "grad_norm": 0.25784343387254877, + "learning_rate": 0.0004844770996121493, + "loss": 3.0955235958099365, + "step": 5361, + "token_acc": 0.2881070251666591 + }, + { + "epoch": 3.1430665493990033, + "grad_norm": 0.26692897311862945, + "learning_rate": 0.00048446869346075496, + "loss": 3.0753328800201416, + "step": 5362, + "token_acc": 0.2915813659865615 + }, + { + "epoch": 3.1436528877162124, + "grad_norm": 0.2692112962259802, + "learning_rate": 0.0004844602851068433, + "loss": 3.096564531326294, + "step": 5363, + "token_acc": 0.29042160320370647 + }, + { + "epoch": 3.144239226033421, + "grad_norm": 0.24231124846070337, + "learning_rate": 0.000484451874550493, + "loss": 3.0609450340270996, + "step": 5364, + "token_acc": 0.2953631037473378 + }, + { + "epoch": 3.14482556435063, + "grad_norm": 0.2889304038224138, + "learning_rate": 0.0004844434617917834, + "loss": 3.111720085144043, + "step": 5365, + "token_acc": 0.2852307248760017 + }, + { + "epoch": 3.1454119026678393, + "grad_norm": 0.21652773527573413, + "learning_rate": 0.00048443504683079333, + "loss": 3.0766842365264893, + "step": 5366, + "token_acc": 0.29031821743113884 + }, + { + "epoch": 3.1459982409850484, + "grad_norm": 0.30683036986750867, + "learning_rate": 0.000484426629667602, + "loss": 3.088304042816162, + "step": 5367, + "token_acc": 0.28739718772204437 + }, + { + "epoch": 3.1465845793022575, + "grad_norm": 0.20812457454841551, + "learning_rate": 0.0004844182103022883, + "loss": 3.091416597366333, + "step": 5368, + "token_acc": 0.28777015539175593 + }, + { + "epoch": 3.1471709176194667, + "grad_norm": 0.262296668432293, + "learning_rate": 0.00048440978873493136, + "loss": 3.0604071617126465, + "step": 5369, + "token_acc": 0.2923104206756539 + }, + { + "epoch": 3.1477572559366753, + "grad_norm": 0.2379811405894767, + "learning_rate": 0.0004844013649656104, + "loss": 3.0911827087402344, + "step": 5370, + "token_acc": 0.2876997154738455 + }, + { + "epoch": 3.1483435942538844, + "grad_norm": 0.262893051684386, + "learning_rate": 0.0004843929389944044, + "loss": 3.022144079208374, + "step": 5371, + "token_acc": 0.2977118186724231 + }, + { + "epoch": 3.1489299325710935, + "grad_norm": 0.2850739504790262, + "learning_rate": 0.0004843845108213927, + "loss": 3.1021642684936523, + "step": 5372, + "token_acc": 0.2871994023836978 + }, + { + "epoch": 3.1495162708883027, + "grad_norm": 0.23454127255866442, + "learning_rate": 0.0004843760804466543, + "loss": 3.072443723678589, + "step": 5373, + "token_acc": 0.29314002260589145 + }, + { + "epoch": 3.1501026092055118, + "grad_norm": 0.2457032619019582, + "learning_rate": 0.00048436764787026837, + "loss": 3.06713604927063, + "step": 5374, + "token_acc": 0.2919216305541637 + }, + { + "epoch": 3.1506889475227204, + "grad_norm": 0.22819694606477195, + "learning_rate": 0.00048435921309231426, + "loss": 3.124382495880127, + "step": 5375, + "token_acc": 0.28441843189570304 + }, + { + "epoch": 3.1512752858399296, + "grad_norm": 0.24297134445500634, + "learning_rate": 0.0004843507761128712, + "loss": 3.067120313644409, + "step": 5376, + "token_acc": 0.290918088001724 + }, + { + "epoch": 3.1518616241571387, + "grad_norm": 0.25449735185912636, + "learning_rate": 0.00048434233693201833, + "loss": 3.109018087387085, + "step": 5377, + "token_acc": 0.2864034596766928 + }, + { + "epoch": 3.1524479624743478, + "grad_norm": 0.2550315975994766, + "learning_rate": 0.000484333895549835, + "loss": 3.070399761199951, + "step": 5378, + "token_acc": 0.29107993320000825 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.19232181794031733, + "learning_rate": 0.0004843254519664005, + "loss": 3.1145553588867188, + "step": 5379, + "token_acc": 0.28402401190506216 + }, + { + "epoch": 3.1536206391087656, + "grad_norm": 0.24998199395252224, + "learning_rate": 0.0004843170061817941, + "loss": 3.1127700805664062, + "step": 5380, + "token_acc": 0.2851928244869932 + }, + { + "epoch": 3.1542069774259747, + "grad_norm": 0.21220249260568033, + "learning_rate": 0.0004843085581960953, + "loss": 3.0855536460876465, + "step": 5381, + "token_acc": 0.2870381086654602 + }, + { + "epoch": 3.154793315743184, + "grad_norm": 0.2156426542553022, + "learning_rate": 0.0004843001080093832, + "loss": 3.073103427886963, + "step": 5382, + "token_acc": 0.29155722806748074 + }, + { + "epoch": 3.155379654060393, + "grad_norm": 0.2025900706281843, + "learning_rate": 0.0004842916556217373, + "loss": 3.064711332321167, + "step": 5383, + "token_acc": 0.2913845153986457 + }, + { + "epoch": 3.155965992377602, + "grad_norm": 0.2762018758559658, + "learning_rate": 0.0004842832010332371, + "loss": 3.1205625534057617, + "step": 5384, + "token_acc": 0.2847077975274972 + }, + { + "epoch": 3.1565523306948107, + "grad_norm": 0.25236008576031277, + "learning_rate": 0.0004842747442439619, + "loss": 3.085803508758545, + "step": 5385, + "token_acc": 0.2875594523952352 + }, + { + "epoch": 3.15713866901202, + "grad_norm": 0.1925612316388654, + "learning_rate": 0.00048426628525399107, + "loss": 3.127965211868286, + "step": 5386, + "token_acc": 0.2835134336731781 + }, + { + "epoch": 3.157725007329229, + "grad_norm": 0.2678794724195621, + "learning_rate": 0.00048425782406340425, + "loss": 3.0816855430603027, + "step": 5387, + "token_acc": 0.28798827682406675 + }, + { + "epoch": 3.158311345646438, + "grad_norm": 0.23982066458650814, + "learning_rate": 0.00048424936067228085, + "loss": 3.0889694690704346, + "step": 5388, + "token_acc": 0.29158860619246185 + }, + { + "epoch": 3.158897683963647, + "grad_norm": 0.22162238777793375, + "learning_rate": 0.00048424089508070035, + "loss": 3.1221237182617188, + "step": 5389, + "token_acc": 0.2845974605760358 + }, + { + "epoch": 3.1594840222808562, + "grad_norm": 0.2987548212310297, + "learning_rate": 0.0004842324272887423, + "loss": 3.088703155517578, + "step": 5390, + "token_acc": 0.289970086498133 + }, + { + "epoch": 3.160070360598065, + "grad_norm": 0.231410939295816, + "learning_rate": 0.00048422395729648616, + "loss": 3.1149497032165527, + "step": 5391, + "token_acc": 0.2861084443712342 + }, + { + "epoch": 3.160656698915274, + "grad_norm": 0.26257703828829787, + "learning_rate": 0.0004842154851040116, + "loss": 3.1110424995422363, + "step": 5392, + "token_acc": 0.2877152423054293 + }, + { + "epoch": 3.161243037232483, + "grad_norm": 0.3049586605964518, + "learning_rate": 0.00048420701071139825, + "loss": 3.0953636169433594, + "step": 5393, + "token_acc": 0.287868920438385 + }, + { + "epoch": 3.1618293755496922, + "grad_norm": 0.2174744563175756, + "learning_rate": 0.0004841985341187255, + "loss": 3.0917856693267822, + "step": 5394, + "token_acc": 0.2874669114120304 + }, + { + "epoch": 3.1624157138669013, + "grad_norm": 0.23318761866910048, + "learning_rate": 0.00048419005532607316, + "loss": 3.0464816093444824, + "step": 5395, + "token_acc": 0.2949274554702223 + }, + { + "epoch": 3.16300205218411, + "grad_norm": 0.24117122129786508, + "learning_rate": 0.0004841815743335208, + "loss": 3.084419012069702, + "step": 5396, + "token_acc": 0.28896534593289047 + }, + { + "epoch": 3.163588390501319, + "grad_norm": 0.21914907510306686, + "learning_rate": 0.00048417309114114814, + "loss": 3.102555751800537, + "step": 5397, + "token_acc": 0.28638021296770955 + }, + { + "epoch": 3.1641747288185282, + "grad_norm": 0.2352310785952553, + "learning_rate": 0.00048416460574903484, + "loss": 3.090198278427124, + "step": 5398, + "token_acc": 0.2871389858334842 + }, + { + "epoch": 3.1647610671357373, + "grad_norm": 0.20086174451892444, + "learning_rate": 0.0004841561181572607, + "loss": 3.0813889503479004, + "step": 5399, + "token_acc": 0.290448620900146 + }, + { + "epoch": 3.1653474054529465, + "grad_norm": 0.19788378806047002, + "learning_rate": 0.00048414762836590525, + "loss": 3.088487386703491, + "step": 5400, + "token_acc": 0.2894971250573388 + }, + { + "epoch": 3.1659337437701556, + "grad_norm": 0.21623090661415612, + "learning_rate": 0.0004841391363750484, + "loss": 3.1092023849487305, + "step": 5401, + "token_acc": 0.2847902758374134 + }, + { + "epoch": 3.1665200820873642, + "grad_norm": 0.23181314100769357, + "learning_rate": 0.0004841306421847698, + "loss": 3.1072206497192383, + "step": 5402, + "token_acc": 0.28503549829082303 + }, + { + "epoch": 3.1671064204045734, + "grad_norm": 0.2574239391829674, + "learning_rate": 0.00048412214579514936, + "loss": 3.077296257019043, + "step": 5403, + "token_acc": 0.29125323378196943 + }, + { + "epoch": 3.1676927587217825, + "grad_norm": 0.22701215493923937, + "learning_rate": 0.0004841136472062668, + "loss": 3.088351249694824, + "step": 5404, + "token_acc": 0.28836280811757653 + }, + { + "epoch": 3.1682790970389916, + "grad_norm": 0.20295419192559722, + "learning_rate": 0.0004841051464182021, + "loss": 3.0589370727539062, + "step": 5405, + "token_acc": 0.29253242170960814 + }, + { + "epoch": 3.1688654353562007, + "grad_norm": 0.2607549727110417, + "learning_rate": 0.00048409664343103496, + "loss": 3.079810619354248, + "step": 5406, + "token_acc": 0.2921655108914506 + }, + { + "epoch": 3.1694517736734094, + "grad_norm": 0.2556801178922923, + "learning_rate": 0.0004840881382448453, + "loss": 3.0851964950561523, + "step": 5407, + "token_acc": 0.28816055732383383 + }, + { + "epoch": 3.1700381119906185, + "grad_norm": 0.23139362571177743, + "learning_rate": 0.00048407963085971294, + "loss": 3.097228527069092, + "step": 5408, + "token_acc": 0.28813098979552243 + }, + { + "epoch": 3.1706244503078276, + "grad_norm": 0.27629954609462326, + "learning_rate": 0.00048407112127571796, + "loss": 3.0813326835632324, + "step": 5409, + "token_acc": 0.28972388242999836 + }, + { + "epoch": 3.1712107886250367, + "grad_norm": 0.3506953779670314, + "learning_rate": 0.0004840626094929402, + "loss": 3.105868339538574, + "step": 5410, + "token_acc": 0.28555266917454264 + }, + { + "epoch": 3.171797126942246, + "grad_norm": 0.25849498782941216, + "learning_rate": 0.0004840540955114596, + "loss": 3.1221442222595215, + "step": 5411, + "token_acc": 0.28317703586662857 + }, + { + "epoch": 3.172383465259455, + "grad_norm": 0.23552736836129423, + "learning_rate": 0.00048404557933135617, + "loss": 3.0864925384521484, + "step": 5412, + "token_acc": 0.28969401578891896 + }, + { + "epoch": 3.1729698035766636, + "grad_norm": 0.286349093098653, + "learning_rate": 0.00048403706095270993, + "loss": 3.0716917514801025, + "step": 5413, + "token_acc": 0.29013902048638435 + }, + { + "epoch": 3.1735561418938727, + "grad_norm": 0.2913096333447423, + "learning_rate": 0.00048402854037560083, + "loss": 3.0626206398010254, + "step": 5414, + "token_acc": 0.2925302135531545 + }, + { + "epoch": 3.174142480211082, + "grad_norm": 0.2868411487503322, + "learning_rate": 0.0004840200176001091, + "loss": 3.104795455932617, + "step": 5415, + "token_acc": 0.28755177509130025 + }, + { + "epoch": 3.174728818528291, + "grad_norm": 0.22780602563737737, + "learning_rate": 0.00048401149262631443, + "loss": 3.041975975036621, + "step": 5416, + "token_acc": 0.2953545206954757 + }, + { + "epoch": 3.1753151568455, + "grad_norm": 0.30273646416810757, + "learning_rate": 0.0004840029654542972, + "loss": 3.057687997817993, + "step": 5417, + "token_acc": 0.293371367332787 + }, + { + "epoch": 3.1759014951627087, + "grad_norm": 0.2503287543371628, + "learning_rate": 0.0004839944360841375, + "loss": 3.1214709281921387, + "step": 5418, + "token_acc": 0.28591176161997695 + }, + { + "epoch": 3.176487833479918, + "grad_norm": 0.2723316840276955, + "learning_rate": 0.0004839859045159153, + "loss": 3.032430648803711, + "step": 5419, + "token_acc": 0.2966773144182298 + }, + { + "epoch": 3.177074171797127, + "grad_norm": 0.2591239684175086, + "learning_rate": 0.0004839773707497109, + "loss": 3.0948195457458496, + "step": 5420, + "token_acc": 0.2878129631371433 + }, + { + "epoch": 3.177660510114336, + "grad_norm": 0.242404775241652, + "learning_rate": 0.0004839688347856044, + "loss": 3.1006669998168945, + "step": 5421, + "token_acc": 0.2850978411489698 + }, + { + "epoch": 3.178246848431545, + "grad_norm": 0.28534832713108255, + "learning_rate": 0.0004839602966236759, + "loss": 3.067765951156616, + "step": 5422, + "token_acc": 0.2915206294920092 + }, + { + "epoch": 3.1788331867487543, + "grad_norm": 0.2461833356481782, + "learning_rate": 0.00048395175626400567, + "loss": 3.0894522666931152, + "step": 5423, + "token_acc": 0.28878316414429217 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.22623942131667513, + "learning_rate": 0.00048394321370667396, + "loss": 3.080869674682617, + "step": 5424, + "token_acc": 0.28945344188012423 + }, + { + "epoch": 3.180005863383172, + "grad_norm": 0.20643540575504152, + "learning_rate": 0.00048393466895176106, + "loss": 3.1093862056732178, + "step": 5425, + "token_acc": 0.2849288688954065 + }, + { + "epoch": 3.180592201700381, + "grad_norm": 0.25200526921651256, + "learning_rate": 0.0004839261219993472, + "loss": 3.062257766723633, + "step": 5426, + "token_acc": 0.2926565931606196 + }, + { + "epoch": 3.1811785400175903, + "grad_norm": 0.22787933108896155, + "learning_rate": 0.00048391757284951256, + "loss": 3.0733144283294678, + "step": 5427, + "token_acc": 0.2902532743874726 + }, + { + "epoch": 3.1817648783347994, + "grad_norm": 0.20010755065674193, + "learning_rate": 0.0004839090215023375, + "loss": 3.106715202331543, + "step": 5428, + "token_acc": 0.2864649795318354 + }, + { + "epoch": 3.182351216652008, + "grad_norm": 0.2665846649274655, + "learning_rate": 0.00048390046795790246, + "loss": 3.067208766937256, + "step": 5429, + "token_acc": 0.2916715598876872 + }, + { + "epoch": 3.182937554969217, + "grad_norm": 0.22573517009977176, + "learning_rate": 0.00048389191221628766, + "loss": 3.110297203063965, + "step": 5430, + "token_acc": 0.28531988496218286 + }, + { + "epoch": 3.1835238932864263, + "grad_norm": 0.23795589348656535, + "learning_rate": 0.00048388335427757353, + "loss": 3.0575461387634277, + "step": 5431, + "token_acc": 0.29496478422333205 + }, + { + "epoch": 3.1841102316036354, + "grad_norm": 0.23059708429853826, + "learning_rate": 0.0004838747941418404, + "loss": 3.097414970397949, + "step": 5432, + "token_acc": 0.2870912766241318 + }, + { + "epoch": 3.1846965699208445, + "grad_norm": 0.25725779749496724, + "learning_rate": 0.0004838662318091688, + "loss": 3.0693695545196533, + "step": 5433, + "token_acc": 0.2908157320303579 + }, + { + "epoch": 3.185282908238053, + "grad_norm": 0.2273306144777121, + "learning_rate": 0.00048385766727963907, + "loss": 3.095987558364868, + "step": 5434, + "token_acc": 0.2883468723678248 + }, + { + "epoch": 3.1858692465552623, + "grad_norm": 0.21600413921559986, + "learning_rate": 0.00048384910055333173, + "loss": 3.0629045963287354, + "step": 5435, + "token_acc": 0.2915187156387894 + }, + { + "epoch": 3.1864555848724714, + "grad_norm": 0.23535460203106645, + "learning_rate": 0.00048384053163032714, + "loss": 3.0876994132995605, + "step": 5436, + "token_acc": 0.28865827709559916 + }, + { + "epoch": 3.1870419231896805, + "grad_norm": 0.20340423157478893, + "learning_rate": 0.0004838319605107059, + "loss": 3.0654821395874023, + "step": 5437, + "token_acc": 0.2919456945371547 + }, + { + "epoch": 3.1876282615068896, + "grad_norm": 0.26068732820882895, + "learning_rate": 0.0004838233871945485, + "loss": 3.0552265644073486, + "step": 5438, + "token_acc": 0.2934917603348049 + }, + { + "epoch": 3.1882145998240983, + "grad_norm": 0.24787226519879124, + "learning_rate": 0.0004838148116819354, + "loss": 3.0732932090759277, + "step": 5439, + "token_acc": 0.29001191844379176 + }, + { + "epoch": 3.1888009381413074, + "grad_norm": 0.21675574358974603, + "learning_rate": 0.00048380623397294723, + "loss": 3.1133170127868652, + "step": 5440, + "token_acc": 0.2867183601914908 + }, + { + "epoch": 3.1893872764585165, + "grad_norm": 0.1986820327999222, + "learning_rate": 0.00048379765406766456, + "loss": 3.098450183868408, + "step": 5441, + "token_acc": 0.2868654858549923 + }, + { + "epoch": 3.1899736147757256, + "grad_norm": 0.26916121527649106, + "learning_rate": 0.00048378907196616793, + "loss": 3.086653470993042, + "step": 5442, + "token_acc": 0.28983141557398984 + }, + { + "epoch": 3.1905599530929347, + "grad_norm": 0.3488644789597987, + "learning_rate": 0.0004837804876685381, + "loss": 3.110848903656006, + "step": 5443, + "token_acc": 0.28541291643670985 + }, + { + "epoch": 3.191146291410144, + "grad_norm": 0.25415888971842154, + "learning_rate": 0.0004837719011748556, + "loss": 3.1153311729431152, + "step": 5444, + "token_acc": 0.2872074209616215 + }, + { + "epoch": 3.1917326297273525, + "grad_norm": 0.24029906246948998, + "learning_rate": 0.00048376331248520103, + "loss": 3.07814884185791, + "step": 5445, + "token_acc": 0.28974220556869845 + }, + { + "epoch": 3.1923189680445616, + "grad_norm": 0.3444781676707779, + "learning_rate": 0.00048375472159965517, + "loss": 3.107276439666748, + "step": 5446, + "token_acc": 0.2859525055364401 + }, + { + "epoch": 3.1929053063617707, + "grad_norm": 0.2574197223305299, + "learning_rate": 0.00048374612851829866, + "loss": 3.1433699131011963, + "step": 5447, + "token_acc": 0.2814090757087578 + }, + { + "epoch": 3.19349164467898, + "grad_norm": 0.2349115132285172, + "learning_rate": 0.0004837375332412123, + "loss": 3.1310391426086426, + "step": 5448, + "token_acc": 0.2841380946731522 + }, + { + "epoch": 3.194077982996189, + "grad_norm": 0.22250479862476977, + "learning_rate": 0.00048372893576847676, + "loss": 3.065086841583252, + "step": 5449, + "token_acc": 0.29228825241231426 + }, + { + "epoch": 3.1946643213133976, + "grad_norm": 0.21986526737823042, + "learning_rate": 0.00048372033610017285, + "loss": 3.068359851837158, + "step": 5450, + "token_acc": 0.29152397150457776 + }, + { + "epoch": 3.1952506596306067, + "grad_norm": 0.2010440904458156, + "learning_rate": 0.0004837117342363813, + "loss": 3.138734817504883, + "step": 5451, + "token_acc": 0.28323608799374983 + }, + { + "epoch": 3.195836997947816, + "grad_norm": 0.1967668288092555, + "learning_rate": 0.00048370313017718293, + "loss": 3.0715856552124023, + "step": 5452, + "token_acc": 0.2914232569706924 + }, + { + "epoch": 3.196423336265025, + "grad_norm": 0.2220683877189509, + "learning_rate": 0.0004836945239226586, + "loss": 3.029512405395508, + "step": 5453, + "token_acc": 0.2957632385515545 + }, + { + "epoch": 3.197009674582234, + "grad_norm": 0.22230900602186066, + "learning_rate": 0.00048368591547288904, + "loss": 3.04811692237854, + "step": 5454, + "token_acc": 0.2942054333819323 + }, + { + "epoch": 3.197596012899443, + "grad_norm": 0.23886735602667203, + "learning_rate": 0.00048367730482795526, + "loss": 3.0767033100128174, + "step": 5455, + "token_acc": 0.2910709311117891 + }, + { + "epoch": 3.198182351216652, + "grad_norm": 0.22109707624850036, + "learning_rate": 0.00048366869198793807, + "loss": 3.068223476409912, + "step": 5456, + "token_acc": 0.2934374149517449 + }, + { + "epoch": 3.198768689533861, + "grad_norm": 0.2263894580239652, + "learning_rate": 0.0004836600769529184, + "loss": 3.06014347076416, + "step": 5457, + "token_acc": 0.29183797192226335 + }, + { + "epoch": 3.19935502785107, + "grad_norm": 0.2243096443099745, + "learning_rate": 0.00048365145972297717, + "loss": 3.0624053478240967, + "step": 5458, + "token_acc": 0.2908863618682784 + }, + { + "epoch": 3.199941366168279, + "grad_norm": 0.18750881468522201, + "learning_rate": 0.0004836428402981954, + "loss": 3.012996196746826, + "step": 5459, + "token_acc": 0.3013225334659937 + }, + { + "epoch": 3.2005277044854883, + "grad_norm": 0.273552323644599, + "learning_rate": 0.0004836342186786539, + "loss": 3.115236759185791, + "step": 5460, + "token_acc": 0.2854183246141603 + }, + { + "epoch": 3.201114042802697, + "grad_norm": 0.38368304604424686, + "learning_rate": 0.0004836255948644337, + "loss": 3.0715832710266113, + "step": 5461, + "token_acc": 0.29018726202388534 + }, + { + "epoch": 3.201700381119906, + "grad_norm": 0.3036228903578986, + "learning_rate": 0.0004836169688556159, + "loss": 3.0731489658355713, + "step": 5462, + "token_acc": 0.2897559117801857 + }, + { + "epoch": 3.202286719437115, + "grad_norm": 0.23620765641881641, + "learning_rate": 0.0004836083406522815, + "loss": 3.1141715049743652, + "step": 5463, + "token_acc": 0.28582330207025575 + }, + { + "epoch": 3.2028730577543243, + "grad_norm": 0.3048368845601399, + "learning_rate": 0.0004835997102545115, + "loss": 3.043590545654297, + "step": 5464, + "token_acc": 0.29727681965559855 + }, + { + "epoch": 3.2034593960715334, + "grad_norm": 0.22769527062763317, + "learning_rate": 0.000483591077662387, + "loss": 3.0908002853393555, + "step": 5465, + "token_acc": 0.2890580712300256 + }, + { + "epoch": 3.2040457343887425, + "grad_norm": 0.2632695950206581, + "learning_rate": 0.0004835824428759891, + "loss": 3.1086792945861816, + "step": 5466, + "token_acc": 0.2859434135539465 + }, + { + "epoch": 3.204632072705951, + "grad_norm": 0.2310896011015095, + "learning_rate": 0.00048357380589539897, + "loss": 3.1089518070220947, + "step": 5467, + "token_acc": 0.2862588332387683 + }, + { + "epoch": 3.2052184110231603, + "grad_norm": 0.2630610456656749, + "learning_rate": 0.0004835651667206976, + "loss": 3.103029251098633, + "step": 5468, + "token_acc": 0.2864836428323499 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 0.23435979078593147, + "learning_rate": 0.00048355652535196625, + "loss": 3.0748791694641113, + "step": 5469, + "token_acc": 0.28868769591663035 + }, + { + "epoch": 3.2063910876575785, + "grad_norm": 0.3784516082466502, + "learning_rate": 0.00048354788178928604, + "loss": 3.1102170944213867, + "step": 5470, + "token_acc": 0.2860081144290229 + }, + { + "epoch": 3.2069774259747876, + "grad_norm": 0.22306993861077215, + "learning_rate": 0.0004835392360327382, + "loss": 3.1052050590515137, + "step": 5471, + "token_acc": 0.2875476759072006 + }, + { + "epoch": 3.2075637642919963, + "grad_norm": 0.2693641644113196, + "learning_rate": 0.000483530588082404, + "loss": 3.078205108642578, + "step": 5472, + "token_acc": 0.2908847396617234 + }, + { + "epoch": 3.2081501026092054, + "grad_norm": 0.21399341728918342, + "learning_rate": 0.0004835219379383645, + "loss": 3.083387851715088, + "step": 5473, + "token_acc": 0.2884688494367745 + }, + { + "epoch": 3.2087364409264145, + "grad_norm": 0.2729535591672113, + "learning_rate": 0.00048351328560070116, + "loss": 3.073638439178467, + "step": 5474, + "token_acc": 0.28968790797737787 + }, + { + "epoch": 3.2093227792436236, + "grad_norm": 0.22863452677591722, + "learning_rate": 0.00048350463106949516, + "loss": 3.1276912689208984, + "step": 5475, + "token_acc": 0.28609252756045517 + }, + { + "epoch": 3.2099091175608327, + "grad_norm": 0.22223867307764247, + "learning_rate": 0.0004834959743448277, + "loss": 3.113156318664551, + "step": 5476, + "token_acc": 0.2850723363998699 + }, + { + "epoch": 3.210495455878042, + "grad_norm": 0.2583761047262608, + "learning_rate": 0.0004834873154267803, + "loss": 3.0841078758239746, + "step": 5477, + "token_acc": 0.28956811228604595 + }, + { + "epoch": 3.2110817941952505, + "grad_norm": 0.2312522671351118, + "learning_rate": 0.00048347865431543416, + "loss": 3.1102442741394043, + "step": 5478, + "token_acc": 0.2879688941473254 + }, + { + "epoch": 3.2116681325124596, + "grad_norm": 0.24471345013846232, + "learning_rate": 0.00048346999101087074, + "loss": 3.107431411743164, + "step": 5479, + "token_acc": 0.2860838447646474 + }, + { + "epoch": 3.2122544708296688, + "grad_norm": 0.22558153654015145, + "learning_rate": 0.0004834613255131713, + "loss": 3.136211633682251, + "step": 5480, + "token_acc": 0.2802895361141678 + }, + { + "epoch": 3.212840809146878, + "grad_norm": 0.23939409213433005, + "learning_rate": 0.0004834526578224173, + "loss": 3.027445077896118, + "step": 5481, + "token_acc": 0.2982794953330769 + }, + { + "epoch": 3.213427147464087, + "grad_norm": 0.21857384711031744, + "learning_rate": 0.0004834439879386902, + "loss": 3.0736289024353027, + "step": 5482, + "token_acc": 0.29021089077746304 + }, + { + "epoch": 3.2140134857812956, + "grad_norm": 0.25467179521175926, + "learning_rate": 0.00048343531586207136, + "loss": 3.046515941619873, + "step": 5483, + "token_acc": 0.29532265211657865 + }, + { + "epoch": 3.2145998240985048, + "grad_norm": 0.26024118648733696, + "learning_rate": 0.00048342664159264226, + "loss": 3.06687068939209, + "step": 5484, + "token_acc": 0.2907771385959305 + }, + { + "epoch": 3.215186162415714, + "grad_norm": 0.20227729939136405, + "learning_rate": 0.00048341796513048447, + "loss": 3.0750865936279297, + "step": 5485, + "token_acc": 0.29024268010778786 + }, + { + "epoch": 3.215772500732923, + "grad_norm": 0.2412945672599017, + "learning_rate": 0.0004834092864756794, + "loss": 3.087524890899658, + "step": 5486, + "token_acc": 0.2875397199306342 + }, + { + "epoch": 3.216358839050132, + "grad_norm": 0.19130565217316675, + "learning_rate": 0.00048340060562830867, + "loss": 3.06927752494812, + "step": 5487, + "token_acc": 0.2926132787916929 + }, + { + "epoch": 3.2169451773673408, + "grad_norm": 0.2064681364454697, + "learning_rate": 0.00048339192258845367, + "loss": 3.065518856048584, + "step": 5488, + "token_acc": 0.2935095434347155 + }, + { + "epoch": 3.21753151568455, + "grad_norm": 0.1883689287245335, + "learning_rate": 0.0004833832373561961, + "loss": 3.098879098892212, + "step": 5489, + "token_acc": 0.2875645031204241 + }, + { + "epoch": 3.218117854001759, + "grad_norm": 0.23256275387425984, + "learning_rate": 0.0004833745499316175, + "loss": 3.0732219219207764, + "step": 5490, + "token_acc": 0.2896942603975413 + }, + { + "epoch": 3.218704192318968, + "grad_norm": 0.21086061395240516, + "learning_rate": 0.00048336586031479947, + "loss": 3.0566999912261963, + "step": 5491, + "token_acc": 0.29370183221863316 + }, + { + "epoch": 3.219290530636177, + "grad_norm": 0.21528969885137095, + "learning_rate": 0.0004833571685058237, + "loss": 3.073592185974121, + "step": 5492, + "token_acc": 0.2895099269299108 + }, + { + "epoch": 3.219876868953386, + "grad_norm": 0.27959681847456236, + "learning_rate": 0.0004833484745047717, + "loss": 3.0748610496520996, + "step": 5493, + "token_acc": 0.2908596434459699 + }, + { + "epoch": 3.220463207270595, + "grad_norm": 0.2115260374862627, + "learning_rate": 0.00048333977831172524, + "loss": 3.111239433288574, + "step": 5494, + "token_acc": 0.2857252376571604 + }, + { + "epoch": 3.221049545587804, + "grad_norm": 0.27346730532481467, + "learning_rate": 0.00048333107992676604, + "loss": 3.136847972869873, + "step": 5495, + "token_acc": 0.2816148693939693 + }, + { + "epoch": 3.221635883905013, + "grad_norm": 0.26843294646153165, + "learning_rate": 0.00048332237934997575, + "loss": 3.0761208534240723, + "step": 5496, + "token_acc": 0.29268709298601475 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.21955877344501762, + "learning_rate": 0.0004833136765814361, + "loss": 3.082918643951416, + "step": 5497, + "token_acc": 0.28897021439134485 + }, + { + "epoch": 3.2228085605394314, + "grad_norm": 0.24079336407276455, + "learning_rate": 0.0004833049716212289, + "loss": 3.0783653259277344, + "step": 5498, + "token_acc": 0.2902049102328237 + }, + { + "epoch": 3.22339489885664, + "grad_norm": 0.24800655161202645, + "learning_rate": 0.00048329626446943575, + "loss": 3.0728607177734375, + "step": 5499, + "token_acc": 0.2918452761782896 + }, + { + "epoch": 3.223981237173849, + "grad_norm": 0.2836032110449169, + "learning_rate": 0.00048328755512613863, + "loss": 3.0943117141723633, + "step": 5500, + "token_acc": 0.28712881515265776 + }, + { + "epoch": 3.2245675754910583, + "grad_norm": 0.24180727749636519, + "learning_rate": 0.00048327884359141934, + "loss": 3.086979866027832, + "step": 5501, + "token_acc": 0.29173869946044056 + }, + { + "epoch": 3.2251539138082674, + "grad_norm": 0.24400606363084748, + "learning_rate": 0.0004832701298653596, + "loss": 3.037991523742676, + "step": 5502, + "token_acc": 0.29601076383049135 + }, + { + "epoch": 3.2257402521254765, + "grad_norm": 0.280852611947615, + "learning_rate": 0.00048326141394804134, + "loss": 3.0824034214019775, + "step": 5503, + "token_acc": 0.28942529377002113 + }, + { + "epoch": 3.226326590442685, + "grad_norm": 0.23238860234146932, + "learning_rate": 0.00048325269583954645, + "loss": 3.0493392944335938, + "step": 5504, + "token_acc": 0.294387283037776 + }, + { + "epoch": 3.2269129287598943, + "grad_norm": 0.29869128465079686, + "learning_rate": 0.0004832439755399568, + "loss": 3.10185170173645, + "step": 5505, + "token_acc": 0.28625831479744046 + }, + { + "epoch": 3.2274992670771034, + "grad_norm": 0.24330989069721473, + "learning_rate": 0.00048323525304935425, + "loss": 3.099276304244995, + "step": 5506, + "token_acc": 0.28714688690197354 + }, + { + "epoch": 3.2280856053943126, + "grad_norm": 0.23793232832588967, + "learning_rate": 0.00048322652836782075, + "loss": 3.041694402694702, + "step": 5507, + "token_acc": 0.29627967119710064 + }, + { + "epoch": 3.2286719437115217, + "grad_norm": 0.22253746961225015, + "learning_rate": 0.00048321780149543836, + "loss": 3.112489700317383, + "step": 5508, + "token_acc": 0.28457195154109854 + }, + { + "epoch": 3.2292582820287308, + "grad_norm": 0.21079173998597106, + "learning_rate": 0.000483209072432289, + "loss": 3.066570520401001, + "step": 5509, + "token_acc": 0.2917556487120673 + }, + { + "epoch": 3.2298446203459394, + "grad_norm": 0.20509739825352644, + "learning_rate": 0.00048320034117845466, + "loss": 3.0440754890441895, + "step": 5510, + "token_acc": 0.2956563280713616 + }, + { + "epoch": 3.2304309586631486, + "grad_norm": 0.2397538912892448, + "learning_rate": 0.0004831916077340173, + "loss": 3.081899642944336, + "step": 5511, + "token_acc": 0.2880393230500597 + }, + { + "epoch": 3.2310172969803577, + "grad_norm": 0.2847713314617964, + "learning_rate": 0.0004831828720990591, + "loss": 3.136575222015381, + "step": 5512, + "token_acc": 0.28119859587240753 + }, + { + "epoch": 3.231603635297567, + "grad_norm": 0.20153762497357788, + "learning_rate": 0.00048317413427366196, + "loss": 3.121065616607666, + "step": 5513, + "token_acc": 0.2853670651179091 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.26840765200651917, + "learning_rate": 0.0004831653942579081, + "loss": 3.084972381591797, + "step": 5514, + "token_acc": 0.2891232847414038 + }, + { + "epoch": 3.2327763119319846, + "grad_norm": 0.2897725008131054, + "learning_rate": 0.0004831566520518795, + "loss": 3.0773301124572754, + "step": 5515, + "token_acc": 0.28984804613526116 + }, + { + "epoch": 3.2333626502491937, + "grad_norm": 0.2360949334406414, + "learning_rate": 0.00048314790765565833, + "loss": 3.0869359970092773, + "step": 5516, + "token_acc": 0.28876775362988016 + }, + { + "epoch": 3.233948988566403, + "grad_norm": 0.2618863808779009, + "learning_rate": 0.00048313916106932676, + "loss": 3.0679574012756348, + "step": 5517, + "token_acc": 0.29228825540175557 + }, + { + "epoch": 3.234535326883612, + "grad_norm": 0.26822296565669146, + "learning_rate": 0.00048313041229296693, + "loss": 3.0789883136749268, + "step": 5518, + "token_acc": 0.2907333108254883 + }, + { + "epoch": 3.235121665200821, + "grad_norm": 0.22257082193300218, + "learning_rate": 0.000483121661326661, + "loss": 3.1184747219085693, + "step": 5519, + "token_acc": 0.2865320785661291 + }, + { + "epoch": 3.23570800351803, + "grad_norm": 0.22930275527166308, + "learning_rate": 0.00048311290817049123, + "loss": 3.0690481662750244, + "step": 5520, + "token_acc": 0.290849865082546 + }, + { + "epoch": 3.236294341835239, + "grad_norm": 0.26003098408709757, + "learning_rate": 0.0004831041528245398, + "loss": 3.083949565887451, + "step": 5521, + "token_acc": 0.28990047176010236 + }, + { + "epoch": 3.236880680152448, + "grad_norm": 0.24657393657409338, + "learning_rate": 0.000483095395288889, + "loss": 3.0486793518066406, + "step": 5522, + "token_acc": 0.29499796207310797 + }, + { + "epoch": 3.237467018469657, + "grad_norm": 0.2117358387276992, + "learning_rate": 0.00048308663556362097, + "loss": 3.094142198562622, + "step": 5523, + "token_acc": 0.2884252975053044 + }, + { + "epoch": 3.238053356786866, + "grad_norm": 0.2231204335676452, + "learning_rate": 0.00048307787364881816, + "loss": 3.1309266090393066, + "step": 5524, + "token_acc": 0.2814122065528463 + }, + { + "epoch": 3.2386396951040752, + "grad_norm": 0.2530448252927042, + "learning_rate": 0.0004830691095445628, + "loss": 3.1188931465148926, + "step": 5525, + "token_acc": 0.2842186854337186 + }, + { + "epoch": 3.239226033421284, + "grad_norm": 0.2305938803963083, + "learning_rate": 0.00048306034325093717, + "loss": 3.074246406555176, + "step": 5526, + "token_acc": 0.2922767810114516 + }, + { + "epoch": 3.239812371738493, + "grad_norm": 0.2281008706649152, + "learning_rate": 0.0004830515747680237, + "loss": 3.1134886741638184, + "step": 5527, + "token_acc": 0.2856305657497532 + }, + { + "epoch": 3.240398710055702, + "grad_norm": 0.2092998691822021, + "learning_rate": 0.0004830428040959048, + "loss": 3.090656042098999, + "step": 5528, + "token_acc": 0.28787272468789327 + }, + { + "epoch": 3.2409850483729112, + "grad_norm": 0.2027139671287903, + "learning_rate": 0.0004830340312346627, + "loss": 3.074843168258667, + "step": 5529, + "token_acc": 0.2911344813426269 + }, + { + "epoch": 3.2415713866901203, + "grad_norm": 0.21946936744481052, + "learning_rate": 0.00048302525618437985, + "loss": 3.0870721340179443, + "step": 5530, + "token_acc": 0.2876531907033521 + }, + { + "epoch": 3.2421577250073295, + "grad_norm": 0.20031433622959838, + "learning_rate": 0.0004830164789451388, + "loss": 3.02978515625, + "step": 5531, + "token_acc": 0.29657000954281254 + }, + { + "epoch": 3.242744063324538, + "grad_norm": 0.19882359922338685, + "learning_rate": 0.0004830076995170219, + "loss": 3.0596399307250977, + "step": 5532, + "token_acc": 0.29195369774919616 + }, + { + "epoch": 3.2433304016417472, + "grad_norm": 0.18831758626723094, + "learning_rate": 0.00048299891790011177, + "loss": 3.0716328620910645, + "step": 5533, + "token_acc": 0.29120888718100996 + }, + { + "epoch": 3.2439167399589564, + "grad_norm": 0.21150153206431097, + "learning_rate": 0.0004829901340944906, + "loss": 3.065882682800293, + "step": 5534, + "token_acc": 0.2932831929889967 + }, + { + "epoch": 3.2445030782761655, + "grad_norm": 0.23250115931994414, + "learning_rate": 0.0004829813481002411, + "loss": 3.0413084030151367, + "step": 5535, + "token_acc": 0.2960883786488202 + }, + { + "epoch": 3.2450894165933746, + "grad_norm": 0.2833858117252575, + "learning_rate": 0.0004829725599174458, + "loss": 3.1076743602752686, + "step": 5536, + "token_acc": 0.28634254610246823 + }, + { + "epoch": 3.2456757549105832, + "grad_norm": 0.310441014199654, + "learning_rate": 0.0004829637695461873, + "loss": 3.0486221313476562, + "step": 5537, + "token_acc": 0.2931980620106007 + }, + { + "epoch": 3.2462620932277924, + "grad_norm": 0.2765746236856749, + "learning_rate": 0.00048295497698654804, + "loss": 3.062201976776123, + "step": 5538, + "token_acc": 0.2915463123435394 + }, + { + "epoch": 3.2468484315450015, + "grad_norm": 0.19542594289498158, + "learning_rate": 0.00048294618223861075, + "loss": 3.082529067993164, + "step": 5539, + "token_acc": 0.2894271930420602 + }, + { + "epoch": 3.2474347698622106, + "grad_norm": 0.27849102169961654, + "learning_rate": 0.0004829373853024579, + "loss": 3.0803189277648926, + "step": 5540, + "token_acc": 0.29021554546331974 + }, + { + "epoch": 3.2480211081794197, + "grad_norm": 0.2227586326521921, + "learning_rate": 0.0004829285861781723, + "loss": 3.0661215782165527, + "step": 5541, + "token_acc": 0.29176579622500837 + }, + { + "epoch": 3.2486074464966284, + "grad_norm": 0.20364548104494964, + "learning_rate": 0.0004829197848658364, + "loss": 3.0735926628112793, + "step": 5542, + "token_acc": 0.2895822745143477 + }, + { + "epoch": 3.2491937848138375, + "grad_norm": 0.2621733612398241, + "learning_rate": 0.000482910981365533, + "loss": 3.0894033908843994, + "step": 5543, + "token_acc": 0.2894053730904972 + }, + { + "epoch": 3.2497801231310466, + "grad_norm": 0.19978639876452248, + "learning_rate": 0.00048290217567734486, + "loss": 3.0747861862182617, + "step": 5544, + "token_acc": 0.2884870831510017 + }, + { + "epoch": 3.2503664614482557, + "grad_norm": 0.22115138332441409, + "learning_rate": 0.0004828933678013545, + "loss": 3.066645860671997, + "step": 5545, + "token_acc": 0.2908671172069693 + }, + { + "epoch": 3.250952799765465, + "grad_norm": 0.21760238275526372, + "learning_rate": 0.00048288455773764485, + "loss": 3.0593342781066895, + "step": 5546, + "token_acc": 0.2922667418780518 + }, + { + "epoch": 3.2515391380826735, + "grad_norm": 0.1937639325976976, + "learning_rate": 0.0004828757454862986, + "loss": 3.125030040740967, + "step": 5547, + "token_acc": 0.28257318983790203 + }, + { + "epoch": 3.2521254763998826, + "grad_norm": 0.2355524685825062, + "learning_rate": 0.00048286693104739856, + "loss": 3.1203601360321045, + "step": 5548, + "token_acc": 0.28508052200831363 + }, + { + "epoch": 3.2527118147170917, + "grad_norm": 0.2242657730880174, + "learning_rate": 0.0004828581144210274, + "loss": 3.0945048332214355, + "step": 5549, + "token_acc": 0.2886382170470616 + }, + { + "epoch": 3.253298153034301, + "grad_norm": 0.24069046921192702, + "learning_rate": 0.0004828492956072681, + "loss": 3.099910259246826, + "step": 5550, + "token_acc": 0.2874247720165042 + }, + { + "epoch": 3.25388449135151, + "grad_norm": 0.23100186480487003, + "learning_rate": 0.0004828404746062034, + "loss": 3.0883474349975586, + "step": 5551, + "token_acc": 0.28865603399322487 + }, + { + "epoch": 3.254470829668719, + "grad_norm": 0.2168281781062133, + "learning_rate": 0.00048283165141791616, + "loss": 3.1352527141571045, + "step": 5552, + "token_acc": 0.2830030026912626 + }, + { + "epoch": 3.2550571679859277, + "grad_norm": 0.24755457077599033, + "learning_rate": 0.0004828228260424894, + "loss": 3.0980682373046875, + "step": 5553, + "token_acc": 0.28801312217880737 + }, + { + "epoch": 3.255643506303137, + "grad_norm": 0.2691490735128906, + "learning_rate": 0.0004828139984800059, + "loss": 3.074782371520996, + "step": 5554, + "token_acc": 0.291357906900317 + }, + { + "epoch": 3.256229844620346, + "grad_norm": 0.24624511990986464, + "learning_rate": 0.00048280516873054857, + "loss": 3.067749500274658, + "step": 5555, + "token_acc": 0.29118769008711787 + }, + { + "epoch": 3.256816182937555, + "grad_norm": 0.2320774780419121, + "learning_rate": 0.00048279633679420046, + "loss": 3.0465569496154785, + "step": 5556, + "token_acc": 0.2949327301182319 + }, + { + "epoch": 3.257402521254764, + "grad_norm": 0.27411897719442974, + "learning_rate": 0.0004827875026710443, + "loss": 3.104708671569824, + "step": 5557, + "token_acc": 0.2856473071956362 + }, + { + "epoch": 3.257988859571973, + "grad_norm": 0.22278925125316826, + "learning_rate": 0.0004827786663611634, + "loss": 3.073840379714966, + "step": 5558, + "token_acc": 0.2905030038518652 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.23041377743166, + "learning_rate": 0.0004827698278646405, + "loss": 3.129525661468506, + "step": 5559, + "token_acc": 0.2827432203866181 + }, + { + "epoch": 3.259161536206391, + "grad_norm": 0.26890400252201524, + "learning_rate": 0.0004827609871815588, + "loss": 3.090733289718628, + "step": 5560, + "token_acc": 0.2878155352583507 + }, + { + "epoch": 3.2597478745236, + "grad_norm": 0.22484175163073575, + "learning_rate": 0.0004827521443120013, + "loss": 3.1161885261535645, + "step": 5561, + "token_acc": 0.28375744478451487 + }, + { + "epoch": 3.2603342128408093, + "grad_norm": 0.2550605487313567, + "learning_rate": 0.000482743299256051, + "loss": 3.0764551162719727, + "step": 5562, + "token_acc": 0.29176013782550086 + }, + { + "epoch": 3.2609205511580184, + "grad_norm": 0.2206675182139155, + "learning_rate": 0.00048273445201379094, + "loss": 3.0539767742156982, + "step": 5563, + "token_acc": 0.29212919629135947 + }, + { + "epoch": 3.261506889475227, + "grad_norm": 0.22240673153250273, + "learning_rate": 0.0004827256025853044, + "loss": 3.0845656394958496, + "step": 5564, + "token_acc": 0.28825744085474436 + }, + { + "epoch": 3.262093227792436, + "grad_norm": 0.2848616921402516, + "learning_rate": 0.0004827167509706745, + "loss": 3.0736348628997803, + "step": 5565, + "token_acc": 0.29237470192700166 + }, + { + "epoch": 3.2626795661096453, + "grad_norm": 0.23879443714578585, + "learning_rate": 0.0004827078971699842, + "loss": 3.0961108207702637, + "step": 5566, + "token_acc": 0.2863585590251891 + }, + { + "epoch": 3.2632659044268544, + "grad_norm": 0.2098864645776518, + "learning_rate": 0.0004826990411833168, + "loss": 3.0913915634155273, + "step": 5567, + "token_acc": 0.28922415066370555 + }, + { + "epoch": 3.2638522427440635, + "grad_norm": 0.2155371152336698, + "learning_rate": 0.0004826901830107555, + "loss": 3.0926570892333984, + "step": 5568, + "token_acc": 0.2879919394624394 + }, + { + "epoch": 3.264438581061272, + "grad_norm": 0.19687301641213242, + "learning_rate": 0.00048268132265238354, + "loss": 3.02547550201416, + "step": 5569, + "token_acc": 0.2988211242514649 + }, + { + "epoch": 3.2650249193784813, + "grad_norm": 0.2191573767298865, + "learning_rate": 0.00048267246010828395, + "loss": 3.086819648742676, + "step": 5570, + "token_acc": 0.2878316719462931 + }, + { + "epoch": 3.2656112576956904, + "grad_norm": 0.2544937132395049, + "learning_rate": 0.00048266359537854023, + "loss": 3.0988962650299072, + "step": 5571, + "token_acc": 0.2880391627597376 + }, + { + "epoch": 3.2661975960128995, + "grad_norm": 0.2076748724334846, + "learning_rate": 0.00048265472846323554, + "loss": 3.0667712688446045, + "step": 5572, + "token_acc": 0.2920748502143851 + }, + { + "epoch": 3.2667839343301086, + "grad_norm": 0.21024077915996622, + "learning_rate": 0.0004826458593624532, + "loss": 3.066645383834839, + "step": 5573, + "token_acc": 0.29202946574225125 + }, + { + "epoch": 3.2673702726473177, + "grad_norm": 0.20863706108736713, + "learning_rate": 0.00048263698807627644, + "loss": 3.085844039916992, + "step": 5574, + "token_acc": 0.2908935856959932 + }, + { + "epoch": 3.2679566109645264, + "grad_norm": 0.22750015638112903, + "learning_rate": 0.00048262811460478874, + "loss": 3.07515811920166, + "step": 5575, + "token_acc": 0.2902277157144097 + }, + { + "epoch": 3.2685429492817355, + "grad_norm": 0.21866372778163526, + "learning_rate": 0.0004826192389480733, + "loss": 3.0436739921569824, + "step": 5576, + "token_acc": 0.29545466005758575 + }, + { + "epoch": 3.2691292875989446, + "grad_norm": 0.2085153771693781, + "learning_rate": 0.0004826103611062136, + "loss": 3.081249237060547, + "step": 5577, + "token_acc": 0.29052503525118195 + }, + { + "epoch": 3.2697156259161537, + "grad_norm": 0.24075439402996177, + "learning_rate": 0.00048260148107929303, + "loss": 3.1441917419433594, + "step": 5578, + "token_acc": 0.2804097412368662 + }, + { + "epoch": 3.270301964233363, + "grad_norm": 0.23658815838790412, + "learning_rate": 0.0004825925988673949, + "loss": 3.0593252182006836, + "step": 5579, + "token_acc": 0.2931509483981973 + }, + { + "epoch": 3.2708883025505715, + "grad_norm": 0.24704226927261264, + "learning_rate": 0.00048258371447060277, + "loss": 3.0668158531188965, + "step": 5580, + "token_acc": 0.2924334429540275 + }, + { + "epoch": 3.2714746408677806, + "grad_norm": 0.2584884201416459, + "learning_rate": 0.0004825748278890001, + "loss": 3.089484214782715, + "step": 5581, + "token_acc": 0.28829440173719767 + }, + { + "epoch": 3.2720609791849897, + "grad_norm": 0.21770561941906955, + "learning_rate": 0.0004825659391226703, + "loss": 3.0831127166748047, + "step": 5582, + "token_acc": 0.2893206768130568 + }, + { + "epoch": 3.272647317502199, + "grad_norm": 0.24612857126906626, + "learning_rate": 0.0004825570481716969, + "loss": 3.0791373252868652, + "step": 5583, + "token_acc": 0.288189868679462 + }, + { + "epoch": 3.273233655819408, + "grad_norm": 0.20020996065935764, + "learning_rate": 0.00048254815503616334, + "loss": 3.0971226692199707, + "step": 5584, + "token_acc": 0.2882106166363097 + }, + { + "epoch": 3.273819994136617, + "grad_norm": 0.24961252140913656, + "learning_rate": 0.00048253925971615324, + "loss": 3.104736328125, + "step": 5585, + "token_acc": 0.28686475134706535 + }, + { + "epoch": 3.2744063324538257, + "grad_norm": 0.2985282583479439, + "learning_rate": 0.0004825303622117502, + "loss": 3.1484391689300537, + "step": 5586, + "token_acc": 0.2801261498028909 + }, + { + "epoch": 3.274992670771035, + "grad_norm": 0.3642982579503769, + "learning_rate": 0.00048252146252303774, + "loss": 3.1142873764038086, + "step": 5587, + "token_acc": 0.28479375690912384 + }, + { + "epoch": 3.275579009088244, + "grad_norm": 0.2755944521451712, + "learning_rate": 0.0004825125606500994, + "loss": 3.0979433059692383, + "step": 5588, + "token_acc": 0.2867411862175474 + }, + { + "epoch": 3.276165347405453, + "grad_norm": 0.2338510714318697, + "learning_rate": 0.0004825036565930189, + "loss": 3.057699203491211, + "step": 5589, + "token_acc": 0.2940097157221724 + }, + { + "epoch": 3.2767516857226617, + "grad_norm": 0.23144373217834288, + "learning_rate": 0.00048249475035187984, + "loss": 3.042672634124756, + "step": 5590, + "token_acc": 0.2935944510299185 + }, + { + "epoch": 3.277338024039871, + "grad_norm": 0.23461611146451863, + "learning_rate": 0.00048248584192676593, + "loss": 3.072084426879883, + "step": 5591, + "token_acc": 0.2910954812295389 + }, + { + "epoch": 3.27792436235708, + "grad_norm": 0.2404245539068119, + "learning_rate": 0.00048247693131776083, + "loss": 3.1187362670898438, + "step": 5592, + "token_acc": 0.2848057470353643 + }, + { + "epoch": 3.278510700674289, + "grad_norm": 0.19420473264809676, + "learning_rate": 0.0004824680185249481, + "loss": 3.06527042388916, + "step": 5593, + "token_acc": 0.29126760998517054 + }, + { + "epoch": 3.279097038991498, + "grad_norm": 0.20838846372068304, + "learning_rate": 0.00048245910354841173, + "loss": 3.0861334800720215, + "step": 5594, + "token_acc": 0.28757289909366196 + }, + { + "epoch": 3.2796833773087073, + "grad_norm": 0.2782528592092092, + "learning_rate": 0.0004824501863882353, + "loss": 3.114996910095215, + "step": 5595, + "token_acc": 0.28384358912389845 + }, + { + "epoch": 3.280269715625916, + "grad_norm": 0.2500523487965146, + "learning_rate": 0.0004824412670445025, + "loss": 3.111743688583374, + "step": 5596, + "token_acc": 0.2858417996903885 + }, + { + "epoch": 3.280856053943125, + "grad_norm": 0.2154042382800553, + "learning_rate": 0.00048243234551729737, + "loss": 3.105464458465576, + "step": 5597, + "token_acc": 0.2866963277283427 + }, + { + "epoch": 3.281442392260334, + "grad_norm": 0.2974301311415171, + "learning_rate": 0.0004824234218067035, + "loss": 3.1336405277252197, + "step": 5598, + "token_acc": 0.28211901570653375 + }, + { + "epoch": 3.2820287305775433, + "grad_norm": 0.24572189656894172, + "learning_rate": 0.0004824144959128047, + "loss": 3.0633602142333984, + "step": 5599, + "token_acc": 0.29346181518081976 + }, + { + "epoch": 3.2826150688947524, + "grad_norm": 0.2070264741476238, + "learning_rate": 0.00048240556783568503, + "loss": 3.0652341842651367, + "step": 5600, + "token_acc": 0.29193398462927295 + }, + { + "epoch": 3.283201407211961, + "grad_norm": 0.23818577454244913, + "learning_rate": 0.00048239663757542806, + "loss": 3.109423875808716, + "step": 5601, + "token_acc": 0.2848501034247317 + }, + { + "epoch": 3.28378774552917, + "grad_norm": 0.19073247199752943, + "learning_rate": 0.0004823877051321179, + "loss": 3.0666239261627197, + "step": 5602, + "token_acc": 0.292464746175054 + }, + { + "epoch": 3.2843740838463793, + "grad_norm": 0.29587031607351383, + "learning_rate": 0.00048237877050583844, + "loss": 3.0637729167938232, + "step": 5603, + "token_acc": 0.2924888502121179 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.21882348129684032, + "learning_rate": 0.0004823698336966735, + "loss": 3.089359760284424, + "step": 5604, + "token_acc": 0.28817105634654944 + }, + { + "epoch": 3.2855467604807975, + "grad_norm": 0.22778499390588072, + "learning_rate": 0.0004823608947047072, + "loss": 3.1079695224761963, + "step": 5605, + "token_acc": 0.28588037926970405 + }, + { + "epoch": 3.2861330987980066, + "grad_norm": 0.24175090605085456, + "learning_rate": 0.0004823519535300234, + "loss": 3.0653162002563477, + "step": 5606, + "token_acc": 0.29173449038607074 + }, + { + "epoch": 3.2867194371152153, + "grad_norm": 0.20790429547459105, + "learning_rate": 0.000482343010172706, + "loss": 3.0496041774749756, + "step": 5607, + "token_acc": 0.29385477753181355 + }, + { + "epoch": 3.2873057754324244, + "grad_norm": 0.2483078665022293, + "learning_rate": 0.0004823340646328391, + "loss": 3.102588415145874, + "step": 5608, + "token_acc": 0.2881679389312977 + }, + { + "epoch": 3.2878921137496335, + "grad_norm": 0.23515268495632705, + "learning_rate": 0.0004823251169105068, + "loss": 3.1007556915283203, + "step": 5609, + "token_acc": 0.28820658425648343 + }, + { + "epoch": 3.2884784520668426, + "grad_norm": 0.21233421145808948, + "learning_rate": 0.0004823161670057931, + "loss": 3.0945563316345215, + "step": 5610, + "token_acc": 0.28868392142161686 + }, + { + "epoch": 3.2890647903840518, + "grad_norm": 0.2589703532550193, + "learning_rate": 0.000482307214918782, + "loss": 3.1085004806518555, + "step": 5611, + "token_acc": 0.2854568938655302 + }, + { + "epoch": 3.2896511287012604, + "grad_norm": 0.22556743987744785, + "learning_rate": 0.00048229826064955764, + "loss": 3.1141834259033203, + "step": 5612, + "token_acc": 0.2856895444602239 + }, + { + "epoch": 3.2902374670184695, + "grad_norm": 0.21502366766656042, + "learning_rate": 0.00048228930419820423, + "loss": 3.1022229194641113, + "step": 5613, + "token_acc": 0.28490966550032026 + }, + { + "epoch": 3.2908238053356786, + "grad_norm": 0.2796042443947748, + "learning_rate": 0.00048228034556480574, + "loss": 3.0935940742492676, + "step": 5614, + "token_acc": 0.28808274433277703 + }, + { + "epoch": 3.2914101436528878, + "grad_norm": 0.2645599672131271, + "learning_rate": 0.00048227138474944643, + "loss": 3.081292152404785, + "step": 5615, + "token_acc": 0.29022148040790485 + }, + { + "epoch": 3.291996481970097, + "grad_norm": 0.22451169555887265, + "learning_rate": 0.0004822624217522105, + "loss": 3.0939862728118896, + "step": 5616, + "token_acc": 0.2878765789501088 + }, + { + "epoch": 3.292582820287306, + "grad_norm": 0.2750437428934105, + "learning_rate": 0.000482253456573182, + "loss": 3.0854268074035645, + "step": 5617, + "token_acc": 0.2911442896465007 + }, + { + "epoch": 3.2931691586045146, + "grad_norm": 0.24178880153119978, + "learning_rate": 0.00048224448921244535, + "loss": 3.1188011169433594, + "step": 5618, + "token_acc": 0.28323370338741816 + }, + { + "epoch": 3.2937554969217238, + "grad_norm": 0.3037231556032077, + "learning_rate": 0.0004822355196700846, + "loss": 3.0870141983032227, + "step": 5619, + "token_acc": 0.2881109310649209 + }, + { + "epoch": 3.294341835238933, + "grad_norm": 0.3075103611537917, + "learning_rate": 0.00048222654794618413, + "loss": 3.0998740196228027, + "step": 5620, + "token_acc": 0.2870975249802961 + }, + { + "epoch": 3.294928173556142, + "grad_norm": 0.23649248914714227, + "learning_rate": 0.00048221757404082817, + "loss": 3.0571320056915283, + "step": 5621, + "token_acc": 0.2924980581575387 + }, + { + "epoch": 3.295514511873351, + "grad_norm": 0.25067769450792743, + "learning_rate": 0.000482208597954101, + "loss": 3.081094264984131, + "step": 5622, + "token_acc": 0.2892858526756306 + }, + { + "epoch": 3.2961008501905598, + "grad_norm": 0.2744705993734885, + "learning_rate": 0.00048219961968608695, + "loss": 3.1241700649261475, + "step": 5623, + "token_acc": 0.2846619810464552 + }, + { + "epoch": 3.296687188507769, + "grad_norm": 0.2329372228460261, + "learning_rate": 0.0004821906392368703, + "loss": 3.0474495887756348, + "step": 5624, + "token_acc": 0.29451462323543565 + }, + { + "epoch": 3.297273526824978, + "grad_norm": 0.2272202919647519, + "learning_rate": 0.0004821816566065356, + "loss": 3.070211172103882, + "step": 5625, + "token_acc": 0.2924878278078837 + }, + { + "epoch": 3.297859865142187, + "grad_norm": 0.2316317267267144, + "learning_rate": 0.0004821726717951671, + "loss": 3.0903289318084717, + "step": 5626, + "token_acc": 0.28900211917227625 + }, + { + "epoch": 3.298446203459396, + "grad_norm": 0.224151862099049, + "learning_rate": 0.0004821636848028491, + "loss": 3.0758917331695557, + "step": 5627, + "token_acc": 0.29086551110830466 + }, + { + "epoch": 3.2990325417766053, + "grad_norm": 0.23649733478971557, + "learning_rate": 0.00048215469562966617, + "loss": 3.0742828845977783, + "step": 5628, + "token_acc": 0.2920394705879277 + }, + { + "epoch": 3.299618880093814, + "grad_norm": 0.22470763233267527, + "learning_rate": 0.00048214570427570276, + "loss": 3.0722882747650146, + "step": 5629, + "token_acc": 0.2915392483901897 + }, + { + "epoch": 3.300205218411023, + "grad_norm": 0.23471556058404805, + "learning_rate": 0.00048213671074104326, + "loss": 3.054325580596924, + "step": 5630, + "token_acc": 0.29333741566521965 + }, + { + "epoch": 3.300791556728232, + "grad_norm": 0.2068036009826202, + "learning_rate": 0.00048212771502577215, + "loss": 3.116121292114258, + "step": 5631, + "token_acc": 0.28464698423825086 + }, + { + "epoch": 3.3013778950454413, + "grad_norm": 0.2342874029765438, + "learning_rate": 0.00048211871712997397, + "loss": 3.0906126499176025, + "step": 5632, + "token_acc": 0.2870752644616511 + }, + { + "epoch": 3.3019642333626504, + "grad_norm": 0.2285501870819182, + "learning_rate": 0.00048210971705373316, + "loss": 3.099940776824951, + "step": 5633, + "token_acc": 0.2885108744594184 + }, + { + "epoch": 3.302550571679859, + "grad_norm": 0.2183531498227338, + "learning_rate": 0.0004821007147971344, + "loss": 3.045015573501587, + "step": 5634, + "token_acc": 0.2939536059463559 + }, + { + "epoch": 3.303136909997068, + "grad_norm": 0.21284600974492784, + "learning_rate": 0.0004820917103602622, + "loss": 3.0755672454833984, + "step": 5635, + "token_acc": 0.2911873138788831 + }, + { + "epoch": 3.3037232483142773, + "grad_norm": 0.1976349060974995, + "learning_rate": 0.0004820827037432011, + "loss": 3.1121678352355957, + "step": 5636, + "token_acc": 0.28570651272697123 + }, + { + "epoch": 3.3043095866314864, + "grad_norm": 0.19477786409651648, + "learning_rate": 0.0004820736949460357, + "loss": 3.115225315093994, + "step": 5637, + "token_acc": 0.28513020901628594 + }, + { + "epoch": 3.3048959249486956, + "grad_norm": 0.19311121610095494, + "learning_rate": 0.0004820646839688507, + "loss": 3.1131432056427, + "step": 5638, + "token_acc": 0.2856725433511065 + }, + { + "epoch": 3.3054822632659047, + "grad_norm": 0.2091723176479704, + "learning_rate": 0.00048205567081173066, + "loss": 3.1030778884887695, + "step": 5639, + "token_acc": 0.28610033641561117 + }, + { + "epoch": 3.3060686015831133, + "grad_norm": 0.19866899599185506, + "learning_rate": 0.0004820466554747603, + "loss": 3.104271173477173, + "step": 5640, + "token_acc": 0.28609438635397993 + }, + { + "epoch": 3.3066549399003224, + "grad_norm": 0.22217659679854995, + "learning_rate": 0.00048203763795802435, + "loss": 3.0823962688446045, + "step": 5641, + "token_acc": 0.28848467982364595 + }, + { + "epoch": 3.3072412782175316, + "grad_norm": 0.28319946674486257, + "learning_rate": 0.0004820286182616075, + "loss": 3.0545108318328857, + "step": 5642, + "token_acc": 0.29472654400858395 + }, + { + "epoch": 3.3078276165347407, + "grad_norm": 0.3758620258504028, + "learning_rate": 0.0004820195963855943, + "loss": 3.1115856170654297, + "step": 5643, + "token_acc": 0.285112367003635 + }, + { + "epoch": 3.3084139548519493, + "grad_norm": 0.32268690756611645, + "learning_rate": 0.00048201057233006973, + "loss": 3.1096878051757812, + "step": 5644, + "token_acc": 0.2852580979839109 + }, + { + "epoch": 3.3090002931691584, + "grad_norm": 0.2090951402460857, + "learning_rate": 0.0004820015460951185, + "loss": 3.1138486862182617, + "step": 5645, + "token_acc": 0.2858559794189894 + }, + { + "epoch": 3.3095866314863676, + "grad_norm": 0.2347910485965516, + "learning_rate": 0.0004819925176808253, + "loss": 3.075697898864746, + "step": 5646, + "token_acc": 0.29090870559189175 + }, + { + "epoch": 3.3101729698035767, + "grad_norm": 0.2106849560929445, + "learning_rate": 0.0004819834870872751, + "loss": 3.067077159881592, + "step": 5647, + "token_acc": 0.29152926355460745 + }, + { + "epoch": 3.310759308120786, + "grad_norm": 0.2870569893745065, + "learning_rate": 0.00048197445431455253, + "loss": 3.0558276176452637, + "step": 5648, + "token_acc": 0.29482275285918513 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.23609561201216378, + "learning_rate": 0.0004819654193627426, + "loss": 3.1206564903259277, + "step": 5649, + "token_acc": 0.2844688913461218 + }, + { + "epoch": 3.3119319847552036, + "grad_norm": 0.24739831880535437, + "learning_rate": 0.00048195638223193015, + "loss": 3.099743604660034, + "step": 5650, + "token_acc": 0.28642793403624933 + }, + { + "epoch": 3.3125183230724127, + "grad_norm": 0.1952489481118913, + "learning_rate": 0.0004819473429222001, + "loss": 3.1334993839263916, + "step": 5651, + "token_acc": 0.28193109792177523 + }, + { + "epoch": 3.313104661389622, + "grad_norm": 0.2225534494965365, + "learning_rate": 0.0004819383014336373, + "loss": 3.081575870513916, + "step": 5652, + "token_acc": 0.29049981898332516 + }, + { + "epoch": 3.313690999706831, + "grad_norm": 0.20554899874877397, + "learning_rate": 0.0004819292577663266, + "loss": 3.081087350845337, + "step": 5653, + "token_acc": 0.29000757102819874 + }, + { + "epoch": 3.31427733802404, + "grad_norm": 0.1945376043378655, + "learning_rate": 0.00048192021192035306, + "loss": 3.077475070953369, + "step": 5654, + "token_acc": 0.2890689721917543 + }, + { + "epoch": 3.3148636763412487, + "grad_norm": 0.19961744607953738, + "learning_rate": 0.0004819111638958017, + "loss": 3.1203060150146484, + "step": 5655, + "token_acc": 0.28431469897064177 + }, + { + "epoch": 3.315450014658458, + "grad_norm": 0.24641374969103133, + "learning_rate": 0.0004819021136927575, + "loss": 3.051339864730835, + "step": 5656, + "token_acc": 0.2939230961018872 + }, + { + "epoch": 3.316036352975667, + "grad_norm": 0.23000644068891238, + "learning_rate": 0.0004818930613113054, + "loss": 3.132145881652832, + "step": 5657, + "token_acc": 0.282413129003687 + }, + { + "epoch": 3.316622691292876, + "grad_norm": 0.21131128608647196, + "learning_rate": 0.00048188400675153046, + "loss": 3.1004552841186523, + "step": 5658, + "token_acc": 0.286698705588549 + }, + { + "epoch": 3.317209029610085, + "grad_norm": 0.23313579881447813, + "learning_rate": 0.0004818749500135177, + "loss": 3.089571475982666, + "step": 5659, + "token_acc": 0.2892515582882949 + }, + { + "epoch": 3.3177953679272942, + "grad_norm": 0.22549084311286544, + "learning_rate": 0.00048186589109735237, + "loss": 3.10986328125, + "step": 5660, + "token_acc": 0.28766621835614653 + }, + { + "epoch": 3.318381706244503, + "grad_norm": 0.2272767737107594, + "learning_rate": 0.0004818568300031193, + "loss": 3.0698978900909424, + "step": 5661, + "token_acc": 0.2915396424449622 + }, + { + "epoch": 3.318968044561712, + "grad_norm": 0.2507221312722534, + "learning_rate": 0.0004818477667309038, + "loss": 3.1122870445251465, + "step": 5662, + "token_acc": 0.28360814690864944 + }, + { + "epoch": 3.319554382878921, + "grad_norm": 0.21717127683381995, + "learning_rate": 0.00048183870128079093, + "loss": 3.0622549057006836, + "step": 5663, + "token_acc": 0.2921744118092682 + }, + { + "epoch": 3.3201407211961302, + "grad_norm": 0.25397811370792395, + "learning_rate": 0.00048182963365286593, + "loss": 3.0772716999053955, + "step": 5664, + "token_acc": 0.29129152742284253 + }, + { + "epoch": 3.3207270595133394, + "grad_norm": 0.2547583787020968, + "learning_rate": 0.00048182056384721386, + "loss": 3.091721773147583, + "step": 5665, + "token_acc": 0.2868746749869995 + }, + { + "epoch": 3.321313397830548, + "grad_norm": 0.21469580942889255, + "learning_rate": 0.00048181149186391994, + "loss": 3.082245349884033, + "step": 5666, + "token_acc": 0.29057398603706264 + }, + { + "epoch": 3.321899736147757, + "grad_norm": 0.27147762287583027, + "learning_rate": 0.00048180241770306943, + "loss": 3.1196463108062744, + "step": 5667, + "token_acc": 0.2842632771895183 + }, + { + "epoch": 3.3224860744649662, + "grad_norm": 0.2872118185862629, + "learning_rate": 0.0004817933413647476, + "loss": 3.0859527587890625, + "step": 5668, + "token_acc": 0.289538641500456 + }, + { + "epoch": 3.3230724127821754, + "grad_norm": 0.2327667862099696, + "learning_rate": 0.0004817842628490397, + "loss": 3.0475997924804688, + "step": 5669, + "token_acc": 0.2940962746673668 + }, + { + "epoch": 3.3236587510993845, + "grad_norm": 0.2194225984805931, + "learning_rate": 0.000481775182156031, + "loss": 3.0961875915527344, + "step": 5670, + "token_acc": 0.28780341577749124 + }, + { + "epoch": 3.3242450894165936, + "grad_norm": 0.2508191311648795, + "learning_rate": 0.00048176609928580674, + "loss": 3.0760416984558105, + "step": 5671, + "token_acc": 0.29160896475708675 + }, + { + "epoch": 3.3248314277338022, + "grad_norm": 0.2907824952063711, + "learning_rate": 0.00048175701423845224, + "loss": 3.118382453918457, + "step": 5672, + "token_acc": 0.2853155221449338 + }, + { + "epoch": 3.3254177660510114, + "grad_norm": 0.24762266025330715, + "learning_rate": 0.0004817479270140529, + "loss": 3.093312978744507, + "step": 5673, + "token_acc": 0.2885064192195246 + }, + { + "epoch": 3.3260041043682205, + "grad_norm": 0.2688422834901757, + "learning_rate": 0.0004817388376126941, + "loss": 3.1101064682006836, + "step": 5674, + "token_acc": 0.2861246791072938 + }, + { + "epoch": 3.3265904426854296, + "grad_norm": 0.26183862098939253, + "learning_rate": 0.0004817297460344612, + "loss": 3.1113715171813965, + "step": 5675, + "token_acc": 0.28483649938813127 + }, + { + "epoch": 3.3271767810026387, + "grad_norm": 0.21981799167965343, + "learning_rate": 0.0004817206522794396, + "loss": 3.0412604808807373, + "step": 5676, + "token_acc": 0.29373301063168333 + }, + { + "epoch": 3.3277631193198474, + "grad_norm": 0.21396434011977744, + "learning_rate": 0.00048171155634771476, + "loss": 3.053217649459839, + "step": 5677, + "token_acc": 0.2933580241174207 + }, + { + "epoch": 3.3283494576370565, + "grad_norm": 0.20084703208755356, + "learning_rate": 0.000481702458239372, + "loss": 3.081444263458252, + "step": 5678, + "token_acc": 0.2896609590915132 + }, + { + "epoch": 3.3289357959542656, + "grad_norm": 0.21160760827504185, + "learning_rate": 0.00048169335795449693, + "loss": 3.0933430194854736, + "step": 5679, + "token_acc": 0.2876954413597378 + }, + { + "epoch": 3.3295221342714747, + "grad_norm": 0.21125912927362353, + "learning_rate": 0.000481684255493175, + "loss": 3.0747501850128174, + "step": 5680, + "token_acc": 0.2896137975139706 + }, + { + "epoch": 3.330108472588684, + "grad_norm": 0.20380786808522253, + "learning_rate": 0.00048167515085549155, + "loss": 3.0939369201660156, + "step": 5681, + "token_acc": 0.28700149784684514 + }, + { + "epoch": 3.330694810905893, + "grad_norm": 0.1992534224994094, + "learning_rate": 0.00048166604404153236, + "loss": 3.1031932830810547, + "step": 5682, + "token_acc": 0.28615151791887905 + }, + { + "epoch": 3.3312811492231016, + "grad_norm": 0.23637323677063057, + "learning_rate": 0.0004816569350513828, + "loss": 3.1141958236694336, + "step": 5683, + "token_acc": 0.2862725131427074 + }, + { + "epoch": 3.3318674875403107, + "grad_norm": 0.2296107817641607, + "learning_rate": 0.0004816478238851285, + "loss": 3.076993465423584, + "step": 5684, + "token_acc": 0.2899644372892592 + }, + { + "epoch": 3.33245382585752, + "grad_norm": 0.19651672648090426, + "learning_rate": 0.00048163871054285513, + "loss": 3.11259126663208, + "step": 5685, + "token_acc": 0.28543921040357906 + }, + { + "epoch": 3.333040164174729, + "grad_norm": 0.21849696750702524, + "learning_rate": 0.0004816295950246481, + "loss": 3.0663373470306396, + "step": 5686, + "token_acc": 0.293057210578648 + }, + { + "epoch": 3.333626502491938, + "grad_norm": 0.23396732387562622, + "learning_rate": 0.0004816204773305932, + "loss": 3.0868396759033203, + "step": 5687, + "token_acc": 0.28901468828836474 + }, + { + "epoch": 3.3342128408091467, + "grad_norm": 0.28670765774363827, + "learning_rate": 0.00048161135746077605, + "loss": 3.0919270515441895, + "step": 5688, + "token_acc": 0.288924335230772 + }, + { + "epoch": 3.334799179126356, + "grad_norm": 0.3717367531623226, + "learning_rate": 0.00048160223541528224, + "loss": 3.0542221069335938, + "step": 5689, + "token_acc": 0.2934421626175189 + }, + { + "epoch": 3.335385517443565, + "grad_norm": 0.3182113824960578, + "learning_rate": 0.00048159311119419756, + "loss": 3.0562243461608887, + "step": 5690, + "token_acc": 0.29468335818638747 + }, + { + "epoch": 3.335971855760774, + "grad_norm": 0.2748012763173354, + "learning_rate": 0.00048158398479760767, + "loss": 3.0409555435180664, + "step": 5691, + "token_acc": 0.2961833158727224 + }, + { + "epoch": 3.336558194077983, + "grad_norm": 0.3758953589577359, + "learning_rate": 0.0004815748562255983, + "loss": 3.049755096435547, + "step": 5692, + "token_acc": 0.2937759717773353 + }, + { + "epoch": 3.3371445323951923, + "grad_norm": 0.2330906597893559, + "learning_rate": 0.00048156572547825526, + "loss": 3.0975193977355957, + "step": 5693, + "token_acc": 0.2876893853375474 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.29105387668326754, + "learning_rate": 0.0004815565925556642, + "loss": 3.1019654273986816, + "step": 5694, + "token_acc": 0.28834418554592905 + }, + { + "epoch": 3.33831720902961, + "grad_norm": 0.2142572911854091, + "learning_rate": 0.00048154745745791094, + "loss": 3.1076979637145996, + "step": 5695, + "token_acc": 0.28560871878054467 + }, + { + "epoch": 3.338903547346819, + "grad_norm": 0.3179448090273838, + "learning_rate": 0.00048153832018508146, + "loss": 3.1429243087768555, + "step": 5696, + "token_acc": 0.2810085105390181 + }, + { + "epoch": 3.3394898856640283, + "grad_norm": 0.2229124169650394, + "learning_rate": 0.0004815291807372614, + "loss": 3.088953971862793, + "step": 5697, + "token_acc": 0.2862387295189512 + }, + { + "epoch": 3.340076223981237, + "grad_norm": 0.24755959911567715, + "learning_rate": 0.0004815200391145367, + "loss": 3.0774893760681152, + "step": 5698, + "token_acc": 0.28960004099515746 + }, + { + "epoch": 3.340662562298446, + "grad_norm": 0.2498493871197153, + "learning_rate": 0.0004815108953169931, + "loss": 3.0650951862335205, + "step": 5699, + "token_acc": 0.2922341936797923 + }, + { + "epoch": 3.341248900615655, + "grad_norm": 0.2324897019439465, + "learning_rate": 0.0004815017493447167, + "loss": 3.0454745292663574, + "step": 5700, + "token_acc": 0.29462783484650573 + }, + { + "epoch": 3.3418352389328643, + "grad_norm": 0.2523601971066471, + "learning_rate": 0.0004814926011977933, + "loss": 3.1145246028900146, + "step": 5701, + "token_acc": 0.284871020820647 + }, + { + "epoch": 3.3424215772500734, + "grad_norm": 0.22452375315257608, + "learning_rate": 0.00048148345087630883, + "loss": 3.1056442260742188, + "step": 5702, + "token_acc": 0.28765953549513246 + }, + { + "epoch": 3.3430079155672825, + "grad_norm": 0.23563186322006818, + "learning_rate": 0.0004814742983803493, + "loss": 3.090827226638794, + "step": 5703, + "token_acc": 0.28820892694460953 + }, + { + "epoch": 3.343594253884491, + "grad_norm": 0.23346052564565173, + "learning_rate": 0.0004814651437100006, + "loss": 3.1089439392089844, + "step": 5704, + "token_acc": 0.2856773647854894 + }, + { + "epoch": 3.3441805922017003, + "grad_norm": 0.23736577474290993, + "learning_rate": 0.00048145598686534887, + "loss": 3.0792243480682373, + "step": 5705, + "token_acc": 0.2886090980250161 + }, + { + "epoch": 3.3447669305189094, + "grad_norm": 0.26103306842365875, + "learning_rate": 0.00048144682784647996, + "loss": 3.1224348545074463, + "step": 5706, + "token_acc": 0.28405102794325243 + }, + { + "epoch": 3.3453532688361185, + "grad_norm": 0.22484679590688617, + "learning_rate": 0.00048143766665348, + "loss": 3.061706304550171, + "step": 5707, + "token_acc": 0.2923737208697957 + }, + { + "epoch": 3.3459396071533276, + "grad_norm": 0.29717426051649715, + "learning_rate": 0.00048142850328643504, + "loss": 3.1045303344726562, + "step": 5708, + "token_acc": 0.28454244235340626 + }, + { + "epoch": 3.3465259454705363, + "grad_norm": 0.2889302846735189, + "learning_rate": 0.00048141933774543114, + "loss": 3.0770037174224854, + "step": 5709, + "token_acc": 0.291457906177255 + }, + { + "epoch": 3.3471122837877454, + "grad_norm": 0.2336356880595551, + "learning_rate": 0.0004814101700305544, + "loss": 3.126430034637451, + "step": 5710, + "token_acc": 0.2836267268353945 + }, + { + "epoch": 3.3476986221049545, + "grad_norm": 0.3138092962762753, + "learning_rate": 0.0004814010001418909, + "loss": 3.0566539764404297, + "step": 5711, + "token_acc": 0.29299494723510927 + }, + { + "epoch": 3.3482849604221636, + "grad_norm": 0.19939034817968354, + "learning_rate": 0.0004813918280795269, + "loss": 3.050727367401123, + "step": 5712, + "token_acc": 0.2942774056870666 + }, + { + "epoch": 3.3488712987393727, + "grad_norm": 0.3343314240904337, + "learning_rate": 0.00048138265384354846, + "loss": 3.15769100189209, + "step": 5713, + "token_acc": 0.28218551395188646 + }, + { + "epoch": 3.349457637056582, + "grad_norm": 0.22637676983025098, + "learning_rate": 0.00048137347743404174, + "loss": 3.060959577560425, + "step": 5714, + "token_acc": 0.29280808334997305 + }, + { + "epoch": 3.3500439753737905, + "grad_norm": 0.2738101562229442, + "learning_rate": 0.000481364298851093, + "loss": 3.127572536468506, + "step": 5715, + "token_acc": 0.28259453670642326 + }, + { + "epoch": 3.3506303136909996, + "grad_norm": 0.22417010382553762, + "learning_rate": 0.0004813551180947885, + "loss": 3.0949349403381348, + "step": 5716, + "token_acc": 0.287831161802384 + }, + { + "epoch": 3.3512166520082087, + "grad_norm": 0.25937109542935305, + "learning_rate": 0.0004813459351652143, + "loss": 3.102130889892578, + "step": 5717, + "token_acc": 0.28714053680182994 + }, + { + "epoch": 3.351802990325418, + "grad_norm": 0.22838379922335145, + "learning_rate": 0.0004813367500624569, + "loss": 3.09628963470459, + "step": 5718, + "token_acc": 0.28596513557952813 + }, + { + "epoch": 3.352389328642627, + "grad_norm": 0.23758687896684152, + "learning_rate": 0.0004813275627866024, + "loss": 3.0538878440856934, + "step": 5719, + "token_acc": 0.2957549964144296 + }, + { + "epoch": 3.3529756669598356, + "grad_norm": 0.21841973015618057, + "learning_rate": 0.0004813183733377371, + "loss": 3.0779478549957275, + "step": 5720, + "token_acc": 0.2904533920746633 + }, + { + "epoch": 3.3535620052770447, + "grad_norm": 0.2381607354302755, + "learning_rate": 0.0004813091817159475, + "loss": 3.0999414920806885, + "step": 5721, + "token_acc": 0.2870356019869655 + }, + { + "epoch": 3.354148343594254, + "grad_norm": 0.2610910647140743, + "learning_rate": 0.0004812999879213198, + "loss": 3.128830909729004, + "step": 5722, + "token_acc": 0.28514769798216194 + }, + { + "epoch": 3.354734681911463, + "grad_norm": 0.208631119058508, + "learning_rate": 0.0004812907919539403, + "loss": 3.042705535888672, + "step": 5723, + "token_acc": 0.2960087986170692 + }, + { + "epoch": 3.355321020228672, + "grad_norm": 0.24570551292326842, + "learning_rate": 0.0004812815938138956, + "loss": 3.1195530891418457, + "step": 5724, + "token_acc": 0.28511801435344714 + }, + { + "epoch": 3.355907358545881, + "grad_norm": 0.19530431013684368, + "learning_rate": 0.00048127239350127197, + "loss": 3.0791687965393066, + "step": 5725, + "token_acc": 0.28819945126032853 + }, + { + "epoch": 3.35649369686309, + "grad_norm": 0.20354008214004723, + "learning_rate": 0.0004812631910161558, + "loss": 3.0918564796447754, + "step": 5726, + "token_acc": 0.2875131477108099 + }, + { + "epoch": 3.357080035180299, + "grad_norm": 0.19945359696300652, + "learning_rate": 0.0004812539863586336, + "loss": 3.0526177883148193, + "step": 5727, + "token_acc": 0.2940605957854215 + }, + { + "epoch": 3.357666373497508, + "grad_norm": 0.21035645124820615, + "learning_rate": 0.00048124477952879186, + "loss": 3.1275181770324707, + "step": 5728, + "token_acc": 0.2841350258140575 + }, + { + "epoch": 3.358252711814717, + "grad_norm": 0.21589699669616275, + "learning_rate": 0.00048123557052671696, + "loss": 3.0966527462005615, + "step": 5729, + "token_acc": 0.28661376757328066 + }, + { + "epoch": 3.3588390501319263, + "grad_norm": 0.2321156662126395, + "learning_rate": 0.0004812263593524955, + "loss": 3.084888219833374, + "step": 5730, + "token_acc": 0.28849721234112385 + }, + { + "epoch": 3.359425388449135, + "grad_norm": 0.20656533903242963, + "learning_rate": 0.00048121714600621394, + "loss": 3.085391044616699, + "step": 5731, + "token_acc": 0.28785268736387626 + }, + { + "epoch": 3.360011726766344, + "grad_norm": 0.22435083180988985, + "learning_rate": 0.00048120793048795886, + "loss": 3.0957674980163574, + "step": 5732, + "token_acc": 0.2882599312031251 + }, + { + "epoch": 3.360598065083553, + "grad_norm": 0.25329957182049606, + "learning_rate": 0.00048119871279781693, + "loss": 3.115109443664551, + "step": 5733, + "token_acc": 0.28687792207792207 + }, + { + "epoch": 3.3611844034007623, + "grad_norm": 0.2027388215615873, + "learning_rate": 0.00048118949293587455, + "loss": 3.0481271743774414, + "step": 5734, + "token_acc": 0.294430839396021 + }, + { + "epoch": 3.3617707417179714, + "grad_norm": 0.239959939584249, + "learning_rate": 0.0004811802709022184, + "loss": 3.0477395057678223, + "step": 5735, + "token_acc": 0.29501432133363553 + }, + { + "epoch": 3.3623570800351805, + "grad_norm": 0.20143083654415844, + "learning_rate": 0.00048117104669693513, + "loss": 3.094008445739746, + "step": 5736, + "token_acc": 0.2874228565920654 + }, + { + "epoch": 3.362943418352389, + "grad_norm": 0.22177533077110737, + "learning_rate": 0.00048116182032011145, + "loss": 3.0817437171936035, + "step": 5737, + "token_acc": 0.28947974337774335 + }, + { + "epoch": 3.3635297566695983, + "grad_norm": 0.24851500372107896, + "learning_rate": 0.0004811525917718339, + "loss": 3.084895610809326, + "step": 5738, + "token_acc": 0.2884598733768704 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.21698641374797847, + "learning_rate": 0.00048114336105218924, + "loss": 3.0854220390319824, + "step": 5739, + "token_acc": 0.2892021030006743 + }, + { + "epoch": 3.3647024333040165, + "grad_norm": 0.23752104052936376, + "learning_rate": 0.00048113412816126424, + "loss": 3.0922746658325195, + "step": 5740, + "token_acc": 0.29005128152240894 + }, + { + "epoch": 3.3652887716212256, + "grad_norm": 0.2607651742569317, + "learning_rate": 0.0004811248930991454, + "loss": 3.064988374710083, + "step": 5741, + "token_acc": 0.29194212615132503 + }, + { + "epoch": 3.3658751099384343, + "grad_norm": 0.2972758987967038, + "learning_rate": 0.0004811156558659198, + "loss": 3.08284330368042, + "step": 5742, + "token_acc": 0.2888356701662883 + }, + { + "epoch": 3.3664614482556434, + "grad_norm": 0.26154677273649124, + "learning_rate": 0.000481106416461674, + "loss": 3.073464870452881, + "step": 5743, + "token_acc": 0.29190949422297857 + }, + { + "epoch": 3.3670477865728525, + "grad_norm": 0.21310976982862373, + "learning_rate": 0.00048109717488649487, + "loss": 3.041125774383545, + "step": 5744, + "token_acc": 0.2967950930670986 + }, + { + "epoch": 3.3676341248900616, + "grad_norm": 0.21450687766374527, + "learning_rate": 0.0004810879311404691, + "loss": 3.067251205444336, + "step": 5745, + "token_acc": 0.28984774261882695 + }, + { + "epoch": 3.3682204632072708, + "grad_norm": 0.23038102415734193, + "learning_rate": 0.00048107868522368364, + "loss": 3.0841753482818604, + "step": 5746, + "token_acc": 0.2891329646046627 + }, + { + "epoch": 3.36880680152448, + "grad_norm": 0.2143517825919709, + "learning_rate": 0.0004810694371362253, + "loss": 3.0788352489471436, + "step": 5747, + "token_acc": 0.2905380898025368 + }, + { + "epoch": 3.3693931398416885, + "grad_norm": 0.26354217614374253, + "learning_rate": 0.00048106018687818096, + "loss": 3.0930116176605225, + "step": 5748, + "token_acc": 0.28829707765883067 + }, + { + "epoch": 3.3699794781588976, + "grad_norm": 0.24387825181323383, + "learning_rate": 0.00048105093444963763, + "loss": 3.063042163848877, + "step": 5749, + "token_acc": 0.2924492609407716 + }, + { + "epoch": 3.3705658164761068, + "grad_norm": 0.19043904609869006, + "learning_rate": 0.000481041679850682, + "loss": 3.050421953201294, + "step": 5750, + "token_acc": 0.29360950874459285 + }, + { + "epoch": 3.371152154793316, + "grad_norm": 0.25782541601503517, + "learning_rate": 0.00048103242308140124, + "loss": 3.057704210281372, + "step": 5751, + "token_acc": 0.2941018667063311 + }, + { + "epoch": 3.3717384931105245, + "grad_norm": 0.2594283998728703, + "learning_rate": 0.00048102316414188207, + "loss": 3.0824294090270996, + "step": 5752, + "token_acc": 0.28943257100958886 + }, + { + "epoch": 3.3723248314277336, + "grad_norm": 0.22699682094921436, + "learning_rate": 0.0004810139030322116, + "loss": 3.118851900100708, + "step": 5753, + "token_acc": 0.2850955349705949 + }, + { + "epoch": 3.3729111697449428, + "grad_norm": 0.22707254573136912, + "learning_rate": 0.0004810046397524769, + "loss": 3.061995267868042, + "step": 5754, + "token_acc": 0.29152775586529034 + }, + { + "epoch": 3.373497508062152, + "grad_norm": 0.22714920060732402, + "learning_rate": 0.00048099537430276474, + "loss": 3.0638372898101807, + "step": 5755, + "token_acc": 0.2920150311773046 + }, + { + "epoch": 3.374083846379361, + "grad_norm": 0.18280795259911853, + "learning_rate": 0.00048098610668316245, + "loss": 3.0893235206604004, + "step": 5756, + "token_acc": 0.2859946966087299 + }, + { + "epoch": 3.37467018469657, + "grad_norm": 0.23415046979884144, + "learning_rate": 0.0004809768368937568, + "loss": 3.07240891456604, + "step": 5757, + "token_acc": 0.29082751996764733 + }, + { + "epoch": 3.3752565230137788, + "grad_norm": 0.2828043184444646, + "learning_rate": 0.0004809675649346351, + "loss": 3.1092591285705566, + "step": 5758, + "token_acc": 0.28759957694451826 + }, + { + "epoch": 3.375842861330988, + "grad_norm": 0.2552394450832197, + "learning_rate": 0.0004809582908058844, + "loss": 3.111790657043457, + "step": 5759, + "token_acc": 0.28387232600412843 + }, + { + "epoch": 3.376429199648197, + "grad_norm": 0.20007828106619335, + "learning_rate": 0.0004809490145075918, + "loss": 3.0867719650268555, + "step": 5760, + "token_acc": 0.28935293367888854 + }, + { + "epoch": 3.377015537965406, + "grad_norm": 0.34366265486089664, + "learning_rate": 0.0004809397360398443, + "loss": 3.0969371795654297, + "step": 5761, + "token_acc": 0.2878714859437751 + }, + { + "epoch": 3.377601876282615, + "grad_norm": 0.3099476734542247, + "learning_rate": 0.0004809304554027292, + "loss": 3.082019329071045, + "step": 5762, + "token_acc": 0.28911747183563496 + }, + { + "epoch": 3.378188214599824, + "grad_norm": 0.20427307692509192, + "learning_rate": 0.00048092117259633375, + "loss": 3.1058435440063477, + "step": 5763, + "token_acc": 0.2867441331563275 + }, + { + "epoch": 3.378774552917033, + "grad_norm": 0.2734124014165882, + "learning_rate": 0.0004809118876207449, + "loss": 3.0917820930480957, + "step": 5764, + "token_acc": 0.28890269370060007 + }, + { + "epoch": 3.379360891234242, + "grad_norm": 0.20355862732373922, + "learning_rate": 0.0004809026004760502, + "loss": 3.0744423866271973, + "step": 5765, + "token_acc": 0.2906720181558376 + }, + { + "epoch": 3.379947229551451, + "grad_norm": 0.25056662029779336, + "learning_rate": 0.0004808933111623366, + "loss": 3.055604934692383, + "step": 5766, + "token_acc": 0.29425527394456336 + }, + { + "epoch": 3.3805335678686603, + "grad_norm": 0.1990618498957767, + "learning_rate": 0.0004808840196796914, + "loss": 3.0865888595581055, + "step": 5767, + "token_acc": 0.2899646771125804 + }, + { + "epoch": 3.3811199061858694, + "grad_norm": 0.26088177363157555, + "learning_rate": 0.0004808747260282021, + "loss": 3.0731751918792725, + "step": 5768, + "token_acc": 0.2912242744063325 + }, + { + "epoch": 3.381706244503078, + "grad_norm": 0.21322159532527055, + "learning_rate": 0.0004808654302079558, + "loss": 3.0727858543395996, + "step": 5769, + "token_acc": 0.2907417068315308 + }, + { + "epoch": 3.382292582820287, + "grad_norm": 0.24936095740595454, + "learning_rate": 0.0004808561322190399, + "loss": 3.077361583709717, + "step": 5770, + "token_acc": 0.29025648326338027 + }, + { + "epoch": 3.3828789211374963, + "grad_norm": 0.1961713953261956, + "learning_rate": 0.0004808468320615417, + "loss": 3.0827064514160156, + "step": 5771, + "token_acc": 0.28903025702637625 + }, + { + "epoch": 3.3834652594547054, + "grad_norm": 0.2066706019402469, + "learning_rate": 0.00048083752973554863, + "loss": 3.1105713844299316, + "step": 5772, + "token_acc": 0.2863337526028179 + }, + { + "epoch": 3.3840515977719146, + "grad_norm": 0.191324531502551, + "learning_rate": 0.00048082822524114793, + "loss": 3.0602855682373047, + "step": 5773, + "token_acc": 0.29250441817117256 + }, + { + "epoch": 3.3846379360891232, + "grad_norm": 0.2172599810346052, + "learning_rate": 0.0004808189185784272, + "loss": 3.058100700378418, + "step": 5774, + "token_acc": 0.294175929000195 + }, + { + "epoch": 3.3852242744063323, + "grad_norm": 0.24849369799187593, + "learning_rate": 0.00048080960974747366, + "loss": 3.073580741882324, + "step": 5775, + "token_acc": 0.2911344833885743 + }, + { + "epoch": 3.3858106127235414, + "grad_norm": 0.1901356654576357, + "learning_rate": 0.0004808002987483749, + "loss": 3.0522637367248535, + "step": 5776, + "token_acc": 0.29239809202229367 + }, + { + "epoch": 3.3863969510407506, + "grad_norm": 0.22262370863072392, + "learning_rate": 0.00048079098558121835, + "loss": 3.1037778854370117, + "step": 5777, + "token_acc": 0.28617139893685983 + }, + { + "epoch": 3.3869832893579597, + "grad_norm": 0.25226341283453557, + "learning_rate": 0.00048078167024609154, + "loss": 3.0653905868530273, + "step": 5778, + "token_acc": 0.29262124815606466 + }, + { + "epoch": 3.387569627675169, + "grad_norm": 0.25095395010106475, + "learning_rate": 0.00048077235274308184, + "loss": 3.0395517349243164, + "step": 5779, + "token_acc": 0.29709670041042485 + }, + { + "epoch": 3.3881559659923774, + "grad_norm": 0.2835144388086564, + "learning_rate": 0.00048076303307227684, + "loss": 3.0758960247039795, + "step": 5780, + "token_acc": 0.2903199888032429 + }, + { + "epoch": 3.3887423043095866, + "grad_norm": 0.2853362401896701, + "learning_rate": 0.0004807537112337642, + "loss": 3.1059298515319824, + "step": 5781, + "token_acc": 0.28603461221877896 + }, + { + "epoch": 3.3893286426267957, + "grad_norm": 0.23077786522564972, + "learning_rate": 0.0004807443872276314, + "loss": 3.0568389892578125, + "step": 5782, + "token_acc": 0.29252857349812816 + }, + { + "epoch": 3.389914980944005, + "grad_norm": 0.244304402265046, + "learning_rate": 0.00048073506105396585, + "loss": 3.0718703269958496, + "step": 5783, + "token_acc": 0.29081794930433436 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.24247940725513317, + "learning_rate": 0.0004807257327128555, + "loss": 3.099249839782715, + "step": 5784, + "token_acc": 0.28725392294849517 + }, + { + "epoch": 3.3910876575784226, + "grad_norm": 0.19237477165199504, + "learning_rate": 0.0004807164022043876, + "loss": 3.105773448944092, + "step": 5785, + "token_acc": 0.2860451983048137 + }, + { + "epoch": 3.3916739958956317, + "grad_norm": 0.22914280124163292, + "learning_rate": 0.0004807070695286502, + "loss": 3.126349449157715, + "step": 5786, + "token_acc": 0.2840254124569026 + }, + { + "epoch": 3.392260334212841, + "grad_norm": 0.2751607940150409, + "learning_rate": 0.00048069773468573064, + "loss": 3.0968384742736816, + "step": 5787, + "token_acc": 0.28644330870677376 + }, + { + "epoch": 3.39284667253005, + "grad_norm": 0.27182449339447723, + "learning_rate": 0.00048068839767571674, + "loss": 3.0790700912475586, + "step": 5788, + "token_acc": 0.2897938386768049 + }, + { + "epoch": 3.393433010847259, + "grad_norm": 0.2632142137006546, + "learning_rate": 0.00048067905849869625, + "loss": 3.096647024154663, + "step": 5789, + "token_acc": 0.2887520152769734 + }, + { + "epoch": 3.394019349164468, + "grad_norm": 0.24519727283546697, + "learning_rate": 0.00048066971715475683, + "loss": 3.1253411769866943, + "step": 5790, + "token_acc": 0.28409934388392544 + }, + { + "epoch": 3.394605687481677, + "grad_norm": 0.22431401527896172, + "learning_rate": 0.00048066037364398624, + "loss": 3.0510153770446777, + "step": 5791, + "token_acc": 0.29287805427316743 + }, + { + "epoch": 3.395192025798886, + "grad_norm": 0.2927509222649674, + "learning_rate": 0.00048065102796647225, + "loss": 3.089550495147705, + "step": 5792, + "token_acc": 0.28901016133934115 + }, + { + "epoch": 3.395778364116095, + "grad_norm": 0.33694070162379813, + "learning_rate": 0.0004806416801223027, + "loss": 3.053389549255371, + "step": 5793, + "token_acc": 0.29556451395878297 + }, + { + "epoch": 3.396364702433304, + "grad_norm": 0.2532747358177874, + "learning_rate": 0.0004806323301115653, + "loss": 3.078293800354004, + "step": 5794, + "token_acc": 0.2901449589208232 + }, + { + "epoch": 3.3969510407505132, + "grad_norm": 0.23156978629444264, + "learning_rate": 0.00048062297793434797, + "loss": 3.0768070220947266, + "step": 5795, + "token_acc": 0.29080607316842205 + }, + { + "epoch": 3.397537379067722, + "grad_norm": 0.2708681163987018, + "learning_rate": 0.0004806136235907386, + "loss": 3.093792200088501, + "step": 5796, + "token_acc": 0.2877976366879146 + }, + { + "epoch": 3.398123717384931, + "grad_norm": 0.1918179856309644, + "learning_rate": 0.00048060426708082483, + "loss": 3.081390142440796, + "step": 5797, + "token_acc": 0.28891674834964287 + }, + { + "epoch": 3.39871005570214, + "grad_norm": 0.2803560236727329, + "learning_rate": 0.0004805949084046948, + "loss": 3.118910789489746, + "step": 5798, + "token_acc": 0.28462239923999766 + }, + { + "epoch": 3.3992963940193492, + "grad_norm": 0.20387651993892913, + "learning_rate": 0.0004805855475624363, + "loss": 3.122476577758789, + "step": 5799, + "token_acc": 0.2821834258506441 + }, + { + "epoch": 3.3998827323365584, + "grad_norm": 0.22997903574199927, + "learning_rate": 0.0004805761845541374, + "loss": 3.0593228340148926, + "step": 5800, + "token_acc": 0.2926455275313539 + }, + { + "epoch": 3.4004690706537675, + "grad_norm": 0.19302973158168396, + "learning_rate": 0.0004805668193798859, + "loss": 3.0827794075012207, + "step": 5801, + "token_acc": 0.2894010425730725 + }, + { + "epoch": 3.401055408970976, + "grad_norm": 0.26162173266112315, + "learning_rate": 0.0004805574520397699, + "loss": 3.1175765991210938, + "step": 5802, + "token_acc": 0.28682978986761054 + }, + { + "epoch": 3.4016417472881852, + "grad_norm": 0.20087411893382062, + "learning_rate": 0.00048054808253387716, + "loss": 3.044686794281006, + "step": 5803, + "token_acc": 0.2961704056197598 + }, + { + "epoch": 3.4022280856053944, + "grad_norm": 0.24200512103068264, + "learning_rate": 0.0004805387108622959, + "loss": 3.093392848968506, + "step": 5804, + "token_acc": 0.288144082108278 + }, + { + "epoch": 3.4028144239226035, + "grad_norm": 0.21056269740455547, + "learning_rate": 0.00048052933702511414, + "loss": 3.105135202407837, + "step": 5805, + "token_acc": 0.2862509194073763 + }, + { + "epoch": 3.403400762239812, + "grad_norm": 0.28302352515804924, + "learning_rate": 0.0004805199610224199, + "loss": 3.0494604110717773, + "step": 5806, + "token_acc": 0.29358068995967235 + }, + { + "epoch": 3.4039871005570213, + "grad_norm": 0.19848003622649396, + "learning_rate": 0.00048051058285430125, + "loss": 3.1077818870544434, + "step": 5807, + "token_acc": 0.2857316285677386 + }, + { + "epoch": 3.4045734388742304, + "grad_norm": 0.26028314600754743, + "learning_rate": 0.00048050120252084627, + "loss": 3.1068668365478516, + "step": 5808, + "token_acc": 0.2860129383545594 + }, + { + "epoch": 3.4051597771914395, + "grad_norm": 0.22115195264037904, + "learning_rate": 0.00048049182002214317, + "loss": 3.070546865463257, + "step": 5809, + "token_acc": 0.29210780117250573 + }, + { + "epoch": 3.4057461155086486, + "grad_norm": 0.21981647549314387, + "learning_rate": 0.00048048243535828, + "loss": 3.055387496948242, + "step": 5810, + "token_acc": 0.2928867900499788 + }, + { + "epoch": 3.4063324538258577, + "grad_norm": 0.1991489256734722, + "learning_rate": 0.0004804730485293448, + "loss": 3.0660040378570557, + "step": 5811, + "token_acc": 0.2907783908249108 + }, + { + "epoch": 3.4069187921430664, + "grad_norm": 0.23595584227530747, + "learning_rate": 0.000480463659535426, + "loss": 3.0615086555480957, + "step": 5812, + "token_acc": 0.29362805617989385 + }, + { + "epoch": 3.4075051304602755, + "grad_norm": 0.22364849082900676, + "learning_rate": 0.00048045426837661163, + "loss": 3.0842325687408447, + "step": 5813, + "token_acc": 0.2893531722972433 + }, + { + "epoch": 3.4080914687774846, + "grad_norm": 0.20912965557822802, + "learning_rate": 0.00048044487505298993, + "loss": 3.0990984439849854, + "step": 5814, + "token_acc": 0.28828482797067007 + }, + { + "epoch": 3.4086778070946937, + "grad_norm": 0.20673397990875436, + "learning_rate": 0.00048043547956464914, + "loss": 3.098456859588623, + "step": 5815, + "token_acc": 0.28671040983179713 + }, + { + "epoch": 3.409264145411903, + "grad_norm": 0.21435965519557224, + "learning_rate": 0.00048042608191167763, + "loss": 3.0953593254089355, + "step": 5816, + "token_acc": 0.28847356834643445 + }, + { + "epoch": 3.4098504837291115, + "grad_norm": 0.250425038624519, + "learning_rate": 0.00048041668209416354, + "loss": 3.07012939453125, + "step": 5817, + "token_acc": 0.2915172343757516 + }, + { + "epoch": 3.4104368220463206, + "grad_norm": 0.22281897593464214, + "learning_rate": 0.0004804072801121952, + "loss": 3.0896058082580566, + "step": 5818, + "token_acc": 0.2876357525023135 + }, + { + "epoch": 3.4110231603635297, + "grad_norm": 0.26051131666350635, + "learning_rate": 0.0004803978759658609, + "loss": 3.105618476867676, + "step": 5819, + "token_acc": 0.28777552010836815 + }, + { + "epoch": 3.411609498680739, + "grad_norm": 0.21875868497938686, + "learning_rate": 0.0004803884696552491, + "loss": 3.106571674346924, + "step": 5820, + "token_acc": 0.2865750895806433 + }, + { + "epoch": 3.412195836997948, + "grad_norm": 0.32509710592213953, + "learning_rate": 0.00048037906118044804, + "loss": 3.0984458923339844, + "step": 5821, + "token_acc": 0.28611493958139683 + }, + { + "epoch": 3.412782175315157, + "grad_norm": 0.21447833212759176, + "learning_rate": 0.0004803696505415461, + "loss": 3.0879740715026855, + "step": 5822, + "token_acc": 0.28737390562744025 + }, + { + "epoch": 3.4133685136323657, + "grad_norm": 0.3447987789992168, + "learning_rate": 0.0004803602377386318, + "loss": 3.1139421463012695, + "step": 5823, + "token_acc": 0.28499731180068777 + }, + { + "epoch": 3.413954851949575, + "grad_norm": 0.3025454785828331, + "learning_rate": 0.00048035082277179345, + "loss": 3.091762065887451, + "step": 5824, + "token_acc": 0.28864349148354257 + }, + { + "epoch": 3.414541190266784, + "grad_norm": 0.28424914658042627, + "learning_rate": 0.0004803414056411195, + "loss": 3.0364151000976562, + "step": 5825, + "token_acc": 0.29480832057370404 + }, + { + "epoch": 3.415127528583993, + "grad_norm": 0.248807730418761, + "learning_rate": 0.0004803319863466985, + "loss": 3.1043953895568848, + "step": 5826, + "token_acc": 0.28467934095145386 + }, + { + "epoch": 3.415713866901202, + "grad_norm": 0.25396216771209795, + "learning_rate": 0.00048032256488861883, + "loss": 3.1151437759399414, + "step": 5827, + "token_acc": 0.2841369692976071 + }, + { + "epoch": 3.416300205218411, + "grad_norm": 0.2014072291587069, + "learning_rate": 0.000480313141266969, + "loss": 3.1546125411987305, + "step": 5828, + "token_acc": 0.28141557032558434 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.290712634234121, + "learning_rate": 0.0004803037154818375, + "loss": 3.0473451614379883, + "step": 5829, + "token_acc": 0.2953191193461423 + }, + { + "epoch": 3.417472881852829, + "grad_norm": 0.21283381410406232, + "learning_rate": 0.00048029428753331306, + "loss": 3.096853256225586, + "step": 5830, + "token_acc": 0.287203227158536 + }, + { + "epoch": 3.418059220170038, + "grad_norm": 0.25027082994433786, + "learning_rate": 0.00048028485742148406, + "loss": 3.1329455375671387, + "step": 5831, + "token_acc": 0.2839542999540677 + }, + { + "epoch": 3.4186455584872473, + "grad_norm": 0.2108447522487557, + "learning_rate": 0.0004802754251464391, + "loss": 3.097334384918213, + "step": 5832, + "token_acc": 0.28691633817400797 + }, + { + "epoch": 3.4192318968044564, + "grad_norm": 0.24674563899870713, + "learning_rate": 0.00048026599070826684, + "loss": 3.1175389289855957, + "step": 5833, + "token_acc": 0.28472183486052444 + }, + { + "epoch": 3.419818235121665, + "grad_norm": 0.2178154824649168, + "learning_rate": 0.00048025655410705595, + "loss": 3.1463961601257324, + "step": 5834, + "token_acc": 0.28312026026350007 + }, + { + "epoch": 3.420404573438874, + "grad_norm": 0.23479124801882065, + "learning_rate": 0.0004802471153428949, + "loss": 3.10978364944458, + "step": 5835, + "token_acc": 0.2860416248693283 + }, + { + "epoch": 3.4209909117560833, + "grad_norm": 0.26709198247001814, + "learning_rate": 0.0004802376744158725, + "loss": 3.082141399383545, + "step": 5836, + "token_acc": 0.29098095097955695 + }, + { + "epoch": 3.4215772500732924, + "grad_norm": 0.21398849242316462, + "learning_rate": 0.00048022823132607746, + "loss": 3.1020755767822266, + "step": 5837, + "token_acc": 0.28861034366748567 + }, + { + "epoch": 3.4221635883905015, + "grad_norm": 0.2862274201728564, + "learning_rate": 0.0004802187860735984, + "loss": 3.086907386779785, + "step": 5838, + "token_acc": 0.29095168696033724 + }, + { + "epoch": 3.42274992670771, + "grad_norm": 0.21912026197580337, + "learning_rate": 0.000480209338658524, + "loss": 3.0943949222564697, + "step": 5839, + "token_acc": 0.287692192188305 + }, + { + "epoch": 3.4233362650249193, + "grad_norm": 0.28565839762132395, + "learning_rate": 0.00048019988908094315, + "loss": 3.0787644386291504, + "step": 5840, + "token_acc": 0.291068636061781 + }, + { + "epoch": 3.4239226033421284, + "grad_norm": 0.21368706092303408, + "learning_rate": 0.0004801904373409445, + "loss": 3.0760412216186523, + "step": 5841, + "token_acc": 0.2904921410686506 + }, + { + "epoch": 3.4245089416593375, + "grad_norm": 0.26817031917053785, + "learning_rate": 0.0004801809834386169, + "loss": 3.0939764976501465, + "step": 5842, + "token_acc": 0.2879793124398506 + }, + { + "epoch": 3.4250952799765466, + "grad_norm": 0.22522201372110295, + "learning_rate": 0.0004801715273740491, + "loss": 3.1108858585357666, + "step": 5843, + "token_acc": 0.2847255604452223 + }, + { + "epoch": 3.4256816182937557, + "grad_norm": 0.29889483013612345, + "learning_rate": 0.00048016206914733, + "loss": 3.03969144821167, + "step": 5844, + "token_acc": 0.29548378133618736 + }, + { + "epoch": 3.4262679566109644, + "grad_norm": 0.2070655861874664, + "learning_rate": 0.00048015260875854837, + "loss": 3.0833580493927, + "step": 5845, + "token_acc": 0.28904515361986005 + }, + { + "epoch": 3.4268542949281735, + "grad_norm": 0.238304551453722, + "learning_rate": 0.0004801431462077932, + "loss": 3.07564115524292, + "step": 5846, + "token_acc": 0.29077921020206027 + }, + { + "epoch": 3.4274406332453826, + "grad_norm": 0.22460132837599173, + "learning_rate": 0.0004801336814951532, + "loss": 3.113635301589966, + "step": 5847, + "token_acc": 0.28577541689009106 + }, + { + "epoch": 3.4280269715625917, + "grad_norm": 0.24815266817082246, + "learning_rate": 0.0004801242146207174, + "loss": 3.064605712890625, + "step": 5848, + "token_acc": 0.2912406158544344 + }, + { + "epoch": 3.4286133098798004, + "grad_norm": 0.23052326688352742, + "learning_rate": 0.0004801147455845747, + "loss": 3.0937862396240234, + "step": 5849, + "token_acc": 0.2874501521131393 + }, + { + "epoch": 3.4291996481970095, + "grad_norm": 0.23154088181448326, + "learning_rate": 0.00048010527438681404, + "loss": 3.057910442352295, + "step": 5850, + "token_acc": 0.2935089420822568 + }, + { + "epoch": 3.4297859865142186, + "grad_norm": 0.21041571332952558, + "learning_rate": 0.0004800958010275244, + "loss": 3.0691113471984863, + "step": 5851, + "token_acc": 0.29287693443870044 + }, + { + "epoch": 3.4303723248314277, + "grad_norm": 0.2429813309734059, + "learning_rate": 0.00048008632550679476, + "loss": 3.0724568367004395, + "step": 5852, + "token_acc": 0.29017508820983956 + }, + { + "epoch": 3.430958663148637, + "grad_norm": 0.20579764764510902, + "learning_rate": 0.00048007684782471415, + "loss": 3.0944318771362305, + "step": 5853, + "token_acc": 0.28809689057955273 + }, + { + "epoch": 3.431545001465846, + "grad_norm": 0.24348619575380165, + "learning_rate": 0.00048006736798137165, + "loss": 3.0369873046875, + "step": 5854, + "token_acc": 0.2957550881849183 + }, + { + "epoch": 3.432131339783055, + "grad_norm": 0.22050794185578645, + "learning_rate": 0.00048005788597685616, + "loss": 3.0986013412475586, + "step": 5855, + "token_acc": 0.2857560042293943 + }, + { + "epoch": 3.4327176781002637, + "grad_norm": 0.21902994089472705, + "learning_rate": 0.00048004840181125686, + "loss": 3.0648157596588135, + "step": 5856, + "token_acc": 0.2913929369393404 + }, + { + "epoch": 3.433304016417473, + "grad_norm": 0.22313334039128813, + "learning_rate": 0.0004800389154846628, + "loss": 3.0666134357452393, + "step": 5857, + "token_acc": 0.29199047084319396 + }, + { + "epoch": 3.433890354734682, + "grad_norm": 0.22607785023685548, + "learning_rate": 0.0004800294269971632, + "loss": 3.0783205032348633, + "step": 5858, + "token_acc": 0.2910110786124388 + }, + { + "epoch": 3.434476693051891, + "grad_norm": 0.20020795341939357, + "learning_rate": 0.000480019936348847, + "loss": 3.1080262660980225, + "step": 5859, + "token_acc": 0.2862674951019127 + }, + { + "epoch": 3.4350630313690997, + "grad_norm": 0.2468317868597328, + "learning_rate": 0.0004800104435398035, + "loss": 3.0623440742492676, + "step": 5860, + "token_acc": 0.2929462659618325 + }, + { + "epoch": 3.435649369686309, + "grad_norm": 0.2721877824932883, + "learning_rate": 0.0004800009485701218, + "loss": 3.0744192600250244, + "step": 5861, + "token_acc": 0.29018273582069226 + }, + { + "epoch": 3.436235708003518, + "grad_norm": 0.2124572696852251, + "learning_rate": 0.00047999145143989114, + "loss": 3.107813835144043, + "step": 5862, + "token_acc": 0.2856151952603471 + }, + { + "epoch": 3.436822046320727, + "grad_norm": 0.21054549913549925, + "learning_rate": 0.0004799819521492007, + "loss": 3.056771755218506, + "step": 5863, + "token_acc": 0.29344841012999 + }, + { + "epoch": 3.437408384637936, + "grad_norm": 0.22786342945410307, + "learning_rate": 0.0004799724506981398, + "loss": 3.0475807189941406, + "step": 5864, + "token_acc": 0.2929505097671713 + }, + { + "epoch": 3.4379947229551453, + "grad_norm": 0.22615496284597908, + "learning_rate": 0.0004799629470867975, + "loss": 3.0709400177001953, + "step": 5865, + "token_acc": 0.29007411259914184 + }, + { + "epoch": 3.438581061272354, + "grad_norm": 0.23612413106676294, + "learning_rate": 0.00047995344131526323, + "loss": 3.0852227210998535, + "step": 5866, + "token_acc": 0.29076783249025534 + }, + { + "epoch": 3.439167399589563, + "grad_norm": 0.24820027035710515, + "learning_rate": 0.00047994393338362623, + "loss": 3.0904593467712402, + "step": 5867, + "token_acc": 0.28847243354238566 + }, + { + "epoch": 3.439753737906772, + "grad_norm": 0.21806981933183187, + "learning_rate": 0.0004799344232919759, + "loss": 3.083679676055908, + "step": 5868, + "token_acc": 0.2912995211761706 + }, + { + "epoch": 3.4403400762239813, + "grad_norm": 0.2421915623171751, + "learning_rate": 0.00047992491104040144, + "loss": 3.073514461517334, + "step": 5869, + "token_acc": 0.29150059678588175 + }, + { + "epoch": 3.4409264145411904, + "grad_norm": 0.3239849721689016, + "learning_rate": 0.0004799153966289923, + "loss": 3.1023945808410645, + "step": 5870, + "token_acc": 0.28667478029886073 + }, + { + "epoch": 3.441512752858399, + "grad_norm": 0.35198821143481107, + "learning_rate": 0.00047990588005783773, + "loss": 3.1131088733673096, + "step": 5871, + "token_acc": 0.28420729102634895 + }, + { + "epoch": 3.442099091175608, + "grad_norm": 0.19056926777993347, + "learning_rate": 0.00047989636132702733, + "loss": 3.073345184326172, + "step": 5872, + "token_acc": 0.2910340140994849 + }, + { + "epoch": 3.4426854294928173, + "grad_norm": 0.290610025011877, + "learning_rate": 0.00047988684043665046, + "loss": 3.104518175125122, + "step": 5873, + "token_acc": 0.2852770205574421 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.2275268100529865, + "learning_rate": 0.00047987731738679634, + "loss": 3.047569751739502, + "step": 5874, + "token_acc": 0.293376609384259 + }, + { + "epoch": 3.4438581061272355, + "grad_norm": 0.25889692542266823, + "learning_rate": 0.00047986779217755465, + "loss": 3.0869345664978027, + "step": 5875, + "token_acc": 0.28864412810435836 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.25180984086815744, + "learning_rate": 0.00047985826480901483, + "loss": 3.047670364379883, + "step": 5876, + "token_acc": 0.2938706499454163 + }, + { + "epoch": 3.4450307827616533, + "grad_norm": 0.2280147041124647, + "learning_rate": 0.0004798487352812663, + "loss": 3.070922374725342, + "step": 5877, + "token_acc": 0.2922873640197658 + }, + { + "epoch": 3.4456171210788624, + "grad_norm": 0.24507837215999356, + "learning_rate": 0.0004798392035943987, + "loss": 3.0666935443878174, + "step": 5878, + "token_acc": 0.29259729491630826 + }, + { + "epoch": 3.4462034593960715, + "grad_norm": 0.23709361327681885, + "learning_rate": 0.0004798296697485014, + "loss": 3.091721296310425, + "step": 5879, + "token_acc": 0.288146028851545 + }, + { + "epoch": 3.4467897977132806, + "grad_norm": 0.2164678722251038, + "learning_rate": 0.0004798201337436642, + "loss": 3.080085277557373, + "step": 5880, + "token_acc": 0.2888464425979173 + }, + { + "epoch": 3.4473761360304898, + "grad_norm": 0.229220830230217, + "learning_rate": 0.0004798105955799764, + "loss": 3.0873990058898926, + "step": 5881, + "token_acc": 0.28886171431513635 + }, + { + "epoch": 3.4479624743476984, + "grad_norm": 0.23964510498453231, + "learning_rate": 0.0004798010552575277, + "loss": 3.0319862365722656, + "step": 5882, + "token_acc": 0.2955203538457582 + }, + { + "epoch": 3.4485488126649075, + "grad_norm": 0.21136483197286854, + "learning_rate": 0.00047979151277640784, + "loss": 3.069993019104004, + "step": 5883, + "token_acc": 0.29241147501320297 + }, + { + "epoch": 3.4491351509821166, + "grad_norm": 0.24858832947091292, + "learning_rate": 0.00047978196813670636, + "loss": 3.1154470443725586, + "step": 5884, + "token_acc": 0.28566514085320616 + }, + { + "epoch": 3.4497214892993258, + "grad_norm": 0.18313302123098088, + "learning_rate": 0.0004797724213385129, + "loss": 3.1227142810821533, + "step": 5885, + "token_acc": 0.28354372595960564 + }, + { + "epoch": 3.450307827616535, + "grad_norm": 0.22642898635030714, + "learning_rate": 0.0004797628723819172, + "loss": 3.0627830028533936, + "step": 5886, + "token_acc": 0.29105929197464475 + }, + { + "epoch": 3.450894165933744, + "grad_norm": 0.2353946425463065, + "learning_rate": 0.0004797533212670089, + "loss": 3.0720362663269043, + "step": 5887, + "token_acc": 0.28972563079534547 + }, + { + "epoch": 3.4514805042509527, + "grad_norm": 0.19565370516202266, + "learning_rate": 0.00047974376799387767, + "loss": 3.0723142623901367, + "step": 5888, + "token_acc": 0.2922520159968531 + }, + { + "epoch": 3.4520668425681618, + "grad_norm": 0.23873996806366388, + "learning_rate": 0.0004797342125626134, + "loss": 3.1233506202697754, + "step": 5889, + "token_acc": 0.28536165956638826 + }, + { + "epoch": 3.452653180885371, + "grad_norm": 0.20767168854301352, + "learning_rate": 0.00047972465497330574, + "loss": 3.09751296043396, + "step": 5890, + "token_acc": 0.288708119321991 + }, + { + "epoch": 3.45323951920258, + "grad_norm": 0.21617893846397562, + "learning_rate": 0.0004797150952260445, + "loss": 3.0834944248199463, + "step": 5891, + "token_acc": 0.2883928399019059 + }, + { + "epoch": 3.453825857519789, + "grad_norm": 0.24169749189591422, + "learning_rate": 0.0004797055333209195, + "loss": 3.1185481548309326, + "step": 5892, + "token_acc": 0.2850378968151446 + }, + { + "epoch": 3.4544121958369978, + "grad_norm": 0.19785715165223416, + "learning_rate": 0.0004796959692580206, + "loss": 3.1080713272094727, + "step": 5893, + "token_acc": 0.2861913682353985 + }, + { + "epoch": 3.454998534154207, + "grad_norm": 0.1978872677431063, + "learning_rate": 0.00047968640303743746, + "loss": 3.060851573944092, + "step": 5894, + "token_acc": 0.2914436737719735 + }, + { + "epoch": 3.455584872471416, + "grad_norm": 0.1997775882601183, + "learning_rate": 0.0004796768346592603, + "loss": 3.086789608001709, + "step": 5895, + "token_acc": 0.2900453394507326 + }, + { + "epoch": 3.456171210788625, + "grad_norm": 0.21493413444631596, + "learning_rate": 0.0004796672641235785, + "loss": 3.110297679901123, + "step": 5896, + "token_acc": 0.2871619939898398 + }, + { + "epoch": 3.456757549105834, + "grad_norm": 0.20435494200115842, + "learning_rate": 0.00047965769143048245, + "loss": 3.0590176582336426, + "step": 5897, + "token_acc": 0.2918432450657468 + }, + { + "epoch": 3.4573438874230433, + "grad_norm": 0.20248806829715904, + "learning_rate": 0.0004796481165800617, + "loss": 3.0713210105895996, + "step": 5898, + "token_acc": 0.2916391208731904 + }, + { + "epoch": 3.457930225740252, + "grad_norm": 0.2129810548603642, + "learning_rate": 0.00047963853957240645, + "loss": 3.107067584991455, + "step": 5899, + "token_acc": 0.28544293136734084 + }, + { + "epoch": 3.458516564057461, + "grad_norm": 0.19620967484533816, + "learning_rate": 0.00047962896040760653, + "loss": 3.0816421508789062, + "step": 5900, + "token_acc": 0.2881339540330232 + }, + { + "epoch": 3.45910290237467, + "grad_norm": 0.23558472529347352, + "learning_rate": 0.0004796193790857519, + "loss": 3.1406548023223877, + "step": 5901, + "token_acc": 0.28139418576319875 + }, + { + "epoch": 3.4596892406918793, + "grad_norm": 0.213363007482945, + "learning_rate": 0.0004796097956069327, + "loss": 3.0917458534240723, + "step": 5902, + "token_acc": 0.2888978469896272 + }, + { + "epoch": 3.460275579009088, + "grad_norm": 0.23480757352748172, + "learning_rate": 0.00047960020997123886, + "loss": 3.094179630279541, + "step": 5903, + "token_acc": 0.2880401999775076 + }, + { + "epoch": 3.460861917326297, + "grad_norm": 0.2999588220960693, + "learning_rate": 0.0004795906221787604, + "loss": 3.1163275241851807, + "step": 5904, + "token_acc": 0.28506473431408774 + }, + { + "epoch": 3.4614482556435062, + "grad_norm": 0.38834600505826655, + "learning_rate": 0.00047958103222958746, + "loss": 3.0935964584350586, + "step": 5905, + "token_acc": 0.28795436857198076 + }, + { + "epoch": 3.4620345939607153, + "grad_norm": 0.2394799030029201, + "learning_rate": 0.00047957144012381004, + "loss": 3.1024112701416016, + "step": 5906, + "token_acc": 0.2874212530637876 + }, + { + "epoch": 3.4626209322779244, + "grad_norm": 0.2260078154436305, + "learning_rate": 0.00047956184586151835, + "loss": 3.15973162651062, + "step": 5907, + "token_acc": 0.27877108843092924 + }, + { + "epoch": 3.4632072705951336, + "grad_norm": 0.23691234719081866, + "learning_rate": 0.0004795522494428024, + "loss": 3.129293918609619, + "step": 5908, + "token_acc": 0.28255072952064586 + }, + { + "epoch": 3.4637936089123427, + "grad_norm": 0.23005547958956088, + "learning_rate": 0.00047954265086775245, + "loss": 3.0672013759613037, + "step": 5909, + "token_acc": 0.2889954391299263 + }, + { + "epoch": 3.4643799472295513, + "grad_norm": 0.2697468946380799, + "learning_rate": 0.00047953305013645855, + "loss": 3.0594286918640137, + "step": 5910, + "token_acc": 0.29314551301245917 + }, + { + "epoch": 3.4649662855467604, + "grad_norm": 0.2032225219656816, + "learning_rate": 0.000479523447249011, + "loss": 3.091585159301758, + "step": 5911, + "token_acc": 0.2881227152937176 + }, + { + "epoch": 3.4655526238639696, + "grad_norm": 0.3188226230622295, + "learning_rate": 0.00047951384220549994, + "loss": 3.078885078430176, + "step": 5912, + "token_acc": 0.29143174843935127 + }, + { + "epoch": 3.4661389621811787, + "grad_norm": 0.19280524796679138, + "learning_rate": 0.0004795042350060156, + "loss": 3.1068716049194336, + "step": 5913, + "token_acc": 0.2850664022933845 + }, + { + "epoch": 3.4667253004983873, + "grad_norm": 0.25444860610954023, + "learning_rate": 0.00047949462565064817, + "loss": 3.1014790534973145, + "step": 5914, + "token_acc": 0.28729332428601745 + }, + { + "epoch": 3.4673116388155965, + "grad_norm": 0.22175254681788797, + "learning_rate": 0.00047948501413948806, + "loss": 3.067160129547119, + "step": 5915, + "token_acc": 0.2907565379032874 + }, + { + "epoch": 3.4678979771328056, + "grad_norm": 0.19633822875936438, + "learning_rate": 0.0004794754004726254, + "loss": 3.1334965229034424, + "step": 5916, + "token_acc": 0.282949222263062 + }, + { + "epoch": 3.4684843154500147, + "grad_norm": 0.2528880158320566, + "learning_rate": 0.00047946578465015067, + "loss": 3.0841922760009766, + "step": 5917, + "token_acc": 0.2898892110167737 + }, + { + "epoch": 3.469070653767224, + "grad_norm": 0.2162911476626947, + "learning_rate": 0.000479456166672154, + "loss": 3.123838186264038, + "step": 5918, + "token_acc": 0.2823161294663885 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 0.27880606639401173, + "learning_rate": 0.0004794465465387259, + "loss": 3.05690860748291, + "step": 5919, + "token_acc": 0.2931890834785443 + }, + { + "epoch": 3.4702433304016416, + "grad_norm": 0.20137499945192558, + "learning_rate": 0.0004794369242499567, + "loss": 3.1275272369384766, + "step": 5920, + "token_acc": 0.2835038809658099 + }, + { + "epoch": 3.4708296687188507, + "grad_norm": 0.27705383580452014, + "learning_rate": 0.00047942729980593674, + "loss": 3.1256914138793945, + "step": 5921, + "token_acc": 0.2835555745239308 + }, + { + "epoch": 3.47141600703606, + "grad_norm": 0.23354785475975945, + "learning_rate": 0.00047941767320675645, + "loss": 3.0796656608581543, + "step": 5922, + "token_acc": 0.29050151710199923 + }, + { + "epoch": 3.472002345353269, + "grad_norm": 0.2695762584939313, + "learning_rate": 0.0004794080444525063, + "loss": 3.0675456523895264, + "step": 5923, + "token_acc": 0.29118756311712646 + }, + { + "epoch": 3.472588683670478, + "grad_norm": 0.28046400587515247, + "learning_rate": 0.00047939841354327663, + "loss": 3.1155524253845215, + "step": 5924, + "token_acc": 0.2844096885743833 + }, + { + "epoch": 3.4731750219876867, + "grad_norm": 0.25623743361026474, + "learning_rate": 0.00047938878047915805, + "loss": 3.1065926551818848, + "step": 5925, + "token_acc": 0.2857889126364972 + }, + { + "epoch": 3.473761360304896, + "grad_norm": 0.2607301049000717, + "learning_rate": 0.00047937914526024095, + "loss": 3.1076412200927734, + "step": 5926, + "token_acc": 0.28519253780050036 + }, + { + "epoch": 3.474347698622105, + "grad_norm": 0.24389432803925373, + "learning_rate": 0.00047936950788661595, + "loss": 3.10764479637146, + "step": 5927, + "token_acc": 0.286375068430114 + }, + { + "epoch": 3.474934036939314, + "grad_norm": 0.23698449996802792, + "learning_rate": 0.0004793598683583734, + "loss": 3.0681302547454834, + "step": 5928, + "token_acc": 0.29139032552479466 + }, + { + "epoch": 3.475520375256523, + "grad_norm": 0.23919141595620294, + "learning_rate": 0.000479350226675604, + "loss": 3.091010093688965, + "step": 5929, + "token_acc": 0.2887083888833232 + }, + { + "epoch": 3.4761067135737322, + "grad_norm": 0.2554510519315503, + "learning_rate": 0.00047934058283839823, + "loss": 3.067103862762451, + "step": 5930, + "token_acc": 0.2926132195930343 + }, + { + "epoch": 3.476693051890941, + "grad_norm": 0.22115131572193175, + "learning_rate": 0.00047933093684684677, + "loss": 3.0883805751800537, + "step": 5931, + "token_acc": 0.288605709159645 + }, + { + "epoch": 3.47727939020815, + "grad_norm": 0.24639565109498957, + "learning_rate": 0.0004793212887010402, + "loss": 3.09421443939209, + "step": 5932, + "token_acc": 0.28765966732564213 + }, + { + "epoch": 3.477865728525359, + "grad_norm": 0.2506120930951755, + "learning_rate": 0.00047931163840106905, + "loss": 3.107215404510498, + "step": 5933, + "token_acc": 0.285880593745144 + }, + { + "epoch": 3.4784520668425682, + "grad_norm": 0.2414804712930038, + "learning_rate": 0.0004793019859470241, + "loss": 3.0895485877990723, + "step": 5934, + "token_acc": 0.28935288251483504 + }, + { + "epoch": 3.4790384051597774, + "grad_norm": 0.2425742527155166, + "learning_rate": 0.00047929233133899604, + "loss": 3.1272435188293457, + "step": 5935, + "token_acc": 0.2846425013410989 + }, + { + "epoch": 3.479624743476986, + "grad_norm": 0.21342719786388123, + "learning_rate": 0.00047928267457707544, + "loss": 3.039905071258545, + "step": 5936, + "token_acc": 0.2953030750958212 + }, + { + "epoch": 3.480211081794195, + "grad_norm": 0.24285279288429706, + "learning_rate": 0.00047927301566135313, + "loss": 3.0955562591552734, + "step": 5937, + "token_acc": 0.28780713191304047 + }, + { + "epoch": 3.4807974201114043, + "grad_norm": 0.21792209894276007, + "learning_rate": 0.00047926335459191975, + "loss": 3.06919002532959, + "step": 5938, + "token_acc": 0.2916688702495161 + }, + { + "epoch": 3.4813837584286134, + "grad_norm": 0.24508291700661203, + "learning_rate": 0.0004792536913688661, + "loss": 3.0584664344787598, + "step": 5939, + "token_acc": 0.29375971821153174 + }, + { + "epoch": 3.4819700967458225, + "grad_norm": 0.24703601539517928, + "learning_rate": 0.0004792440259922829, + "loss": 3.082655429840088, + "step": 5940, + "token_acc": 0.2905120314603374 + }, + { + "epoch": 3.4825564350630316, + "grad_norm": 0.23334061865041292, + "learning_rate": 0.00047923435846226105, + "loss": 3.1030750274658203, + "step": 5941, + "token_acc": 0.2872049939482543 + }, + { + "epoch": 3.4831427733802403, + "grad_norm": 0.2181423836865363, + "learning_rate": 0.0004792246887788912, + "loss": 3.1213622093200684, + "step": 5942, + "token_acc": 0.28480210734277245 + }, + { + "epoch": 3.4837291116974494, + "grad_norm": 0.22873209863714986, + "learning_rate": 0.0004792150169422644, + "loss": 3.086028575897217, + "step": 5943, + "token_acc": 0.2891458099005558 + }, + { + "epoch": 3.4843154500146585, + "grad_norm": 0.25394056758167133, + "learning_rate": 0.0004792053429524713, + "loss": 3.0655152797698975, + "step": 5944, + "token_acc": 0.29217168982575875 + }, + { + "epoch": 3.4849017883318676, + "grad_norm": 0.20447412407690269, + "learning_rate": 0.0004791956668096029, + "loss": 3.0902397632598877, + "step": 5945, + "token_acc": 0.2883598023832747 + }, + { + "epoch": 3.4854881266490767, + "grad_norm": 0.2794192915241018, + "learning_rate": 0.00047918598851375005, + "loss": 3.129974603652954, + "step": 5946, + "token_acc": 0.2836223649190993 + }, + { + "epoch": 3.4860744649662854, + "grad_norm": 0.23200072967168187, + "learning_rate": 0.0004791763080650037, + "loss": 3.0960357189178467, + "step": 5947, + "token_acc": 0.2884312407581664 + }, + { + "epoch": 3.4866608032834945, + "grad_norm": 0.2200670654492003, + "learning_rate": 0.00047916662546345474, + "loss": 3.0676565170288086, + "step": 5948, + "token_acc": 0.29082657888041374 + }, + { + "epoch": 3.4872471416007036, + "grad_norm": 0.20374680263050843, + "learning_rate": 0.00047915694070919414, + "loss": 3.086987018585205, + "step": 5949, + "token_acc": 0.2887206490426005 + }, + { + "epoch": 3.4878334799179127, + "grad_norm": 0.2097389169319842, + "learning_rate": 0.00047914725380231285, + "loss": 3.0346245765686035, + "step": 5950, + "token_acc": 0.29652972034846986 + }, + { + "epoch": 3.488419818235122, + "grad_norm": 0.19290277756024904, + "learning_rate": 0.00047913756474290194, + "loss": 3.109086036682129, + "step": 5951, + "token_acc": 0.285141251138058 + }, + { + "epoch": 3.489006156552331, + "grad_norm": 0.21263018477641957, + "learning_rate": 0.0004791278735310523, + "loss": 3.112579822540283, + "step": 5952, + "token_acc": 0.28584466699805244 + }, + { + "epoch": 3.4895924948695396, + "grad_norm": 0.22980915798373552, + "learning_rate": 0.0004791181801668551, + "loss": 3.0642826557159424, + "step": 5953, + "token_acc": 0.2917265175333742 + }, + { + "epoch": 3.4901788331867487, + "grad_norm": 0.20310021724413638, + "learning_rate": 0.00047910848465040136, + "loss": 3.0787506103515625, + "step": 5954, + "token_acc": 0.2926423470473978 + }, + { + "epoch": 3.490765171503958, + "grad_norm": 0.23360404622020986, + "learning_rate": 0.00047909878698178205, + "loss": 3.0644192695617676, + "step": 5955, + "token_acc": 0.2917618410585524 + }, + { + "epoch": 3.491351509821167, + "grad_norm": 0.2350227070102769, + "learning_rate": 0.0004790890871610884, + "loss": 3.0965120792388916, + "step": 5956, + "token_acc": 0.28681651925938173 + }, + { + "epoch": 3.4919378481383756, + "grad_norm": 0.20336331340612135, + "learning_rate": 0.0004790793851884114, + "loss": 3.0605881214141846, + "step": 5957, + "token_acc": 0.29251919861785175 + }, + { + "epoch": 3.4925241864555847, + "grad_norm": 0.24261990551067122, + "learning_rate": 0.0004790696810638424, + "loss": 3.108607053756714, + "step": 5958, + "token_acc": 0.284672331139934 + }, + { + "epoch": 3.493110524772794, + "grad_norm": 0.2657625273659287, + "learning_rate": 0.00047905997478747236, + "loss": 3.089564800262451, + "step": 5959, + "token_acc": 0.2888890578239308 + }, + { + "epoch": 3.493696863090003, + "grad_norm": 0.247642327934848, + "learning_rate": 0.0004790502663593925, + "loss": 3.080648899078369, + "step": 5960, + "token_acc": 0.2886338742743178 + }, + { + "epoch": 3.494283201407212, + "grad_norm": 0.21627449388476294, + "learning_rate": 0.0004790405557796941, + "loss": 3.0909807682037354, + "step": 5961, + "token_acc": 0.2886746346036609 + }, + { + "epoch": 3.494869539724421, + "grad_norm": 0.20143870693636776, + "learning_rate": 0.00047903084304846825, + "loss": 3.0977392196655273, + "step": 5962, + "token_acc": 0.28764756469392544 + }, + { + "epoch": 3.49545587804163, + "grad_norm": 0.20517791781100495, + "learning_rate": 0.00047902112816580625, + "loss": 3.0804243087768555, + "step": 5963, + "token_acc": 0.2889963724304716 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.19519751032683294, + "learning_rate": 0.0004790114111317994, + "loss": 3.066699504852295, + "step": 5964, + "token_acc": 0.2899672883726957 + }, + { + "epoch": 3.496628554676048, + "grad_norm": 0.2297208380412637, + "learning_rate": 0.0004790016919465389, + "loss": 3.045351266860962, + "step": 5965, + "token_acc": 0.29422974560866116 + }, + { + "epoch": 3.497214892993257, + "grad_norm": 0.20817944942754768, + "learning_rate": 0.0004789919706101161, + "loss": 3.0995283126831055, + "step": 5966, + "token_acc": 0.28771666054879796 + }, + { + "epoch": 3.4978012313104663, + "grad_norm": 0.17984041818737403, + "learning_rate": 0.00047898224712262236, + "loss": 3.063133716583252, + "step": 5967, + "token_acc": 0.2925494090377183 + }, + { + "epoch": 3.498387569627675, + "grad_norm": 0.292116076963666, + "learning_rate": 0.0004789725214841489, + "loss": 3.0872721672058105, + "step": 5968, + "token_acc": 0.28905245215774383 + }, + { + "epoch": 3.498973907944884, + "grad_norm": 0.313909267677068, + "learning_rate": 0.0004789627936947872, + "loss": 3.07171368598938, + "step": 5969, + "token_acc": 0.2908167612879861 + }, + { + "epoch": 3.499560246262093, + "grad_norm": 0.22306740094189756, + "learning_rate": 0.00047895306375462854, + "loss": 3.0661704540252686, + "step": 5970, + "token_acc": 0.2908186503042368 + }, + { + "epoch": 3.5001465845793023, + "grad_norm": 0.2788788165810863, + "learning_rate": 0.00047894333166376434, + "loss": 3.081724166870117, + "step": 5971, + "token_acc": 0.28948743356454376 + }, + { + "epoch": 3.5007329228965114, + "grad_norm": 0.2684937393772208, + "learning_rate": 0.00047893359742228614, + "loss": 3.091601848602295, + "step": 5972, + "token_acc": 0.2884710213144106 + }, + { + "epoch": 3.5013192612137205, + "grad_norm": 0.21539425873877566, + "learning_rate": 0.0004789238610302852, + "loss": 3.0789403915405273, + "step": 5973, + "token_acc": 0.2906933514282223 + }, + { + "epoch": 3.5019055995309296, + "grad_norm": 0.26179643399527397, + "learning_rate": 0.0004789141224878531, + "loss": 3.0750370025634766, + "step": 5974, + "token_acc": 0.2908853410740203 + }, + { + "epoch": 3.5024919378481383, + "grad_norm": 0.20841635969450642, + "learning_rate": 0.0004789043817950812, + "loss": 3.0844740867614746, + "step": 5975, + "token_acc": 0.28860233826653414 + }, + { + "epoch": 3.5030782761653474, + "grad_norm": 0.2320183461037999, + "learning_rate": 0.0004788946389520612, + "loss": 3.09194016456604, + "step": 5976, + "token_acc": 0.28934187594568817 + }, + { + "epoch": 3.5036646144825565, + "grad_norm": 0.21793289811200264, + "learning_rate": 0.00047888489395888446, + "loss": 3.0815606117248535, + "step": 5977, + "token_acc": 0.2894452440597437 + }, + { + "epoch": 3.5042509527997656, + "grad_norm": 0.3114209039212514, + "learning_rate": 0.00047887514681564257, + "loss": 3.128950834274292, + "step": 5978, + "token_acc": 0.2835602099693795 + }, + { + "epoch": 3.5048372911169743, + "grad_norm": 0.2201139633815306, + "learning_rate": 0.00047886539752242706, + "loss": 3.1177797317504883, + "step": 5979, + "token_acc": 0.2845294746551507 + }, + { + "epoch": 3.5054236294341834, + "grad_norm": 0.2670174859290515, + "learning_rate": 0.0004788556460793296, + "loss": 3.093773603439331, + "step": 5980, + "token_acc": 0.2898392374909286 + }, + { + "epoch": 3.5060099677513925, + "grad_norm": 0.21361580347317052, + "learning_rate": 0.0004788458924864417, + "loss": 3.09428071975708, + "step": 5981, + "token_acc": 0.287651671593041 + }, + { + "epoch": 3.5065963060686016, + "grad_norm": 0.21655124083199812, + "learning_rate": 0.00047883613674385507, + "loss": 3.1337759494781494, + "step": 5982, + "token_acc": 0.28267563942583335 + }, + { + "epoch": 3.5071826443858107, + "grad_norm": 0.21960754628886808, + "learning_rate": 0.00047882637885166126, + "loss": 3.0978775024414062, + "step": 5983, + "token_acc": 0.28644988474344746 + }, + { + "epoch": 3.50776898270302, + "grad_norm": 0.2254535058164971, + "learning_rate": 0.0004788166188099519, + "loss": 3.059940814971924, + "step": 5984, + "token_acc": 0.29226715492402533 + }, + { + "epoch": 3.5083553210202285, + "grad_norm": 0.18543314014344411, + "learning_rate": 0.0004788068566188188, + "loss": 3.06207537651062, + "step": 5985, + "token_acc": 0.2913921821158164 + }, + { + "epoch": 3.5089416593374376, + "grad_norm": 0.24836450388150022, + "learning_rate": 0.0004787970922783536, + "loss": 3.102248191833496, + "step": 5986, + "token_acc": 0.28772797336773503 + }, + { + "epoch": 3.5095279976546467, + "grad_norm": 0.22735159409742453, + "learning_rate": 0.000478787325788648, + "loss": 3.103120803833008, + "step": 5987, + "token_acc": 0.28605212867608026 + }, + { + "epoch": 3.510114335971856, + "grad_norm": 0.25862930411752544, + "learning_rate": 0.0004787775571497938, + "loss": 3.122980833053589, + "step": 5988, + "token_acc": 0.2819880622634885 + }, + { + "epoch": 3.5107006742890645, + "grad_norm": 0.20212926151408436, + "learning_rate": 0.00047876778636188273, + "loss": 3.1015028953552246, + "step": 5989, + "token_acc": 0.2868072535177652 + }, + { + "epoch": 3.5112870126062736, + "grad_norm": 0.2606634945491059, + "learning_rate": 0.0004787580134250066, + "loss": 3.0488338470458984, + "step": 5990, + "token_acc": 0.2939887537986113 + }, + { + "epoch": 3.5118733509234827, + "grad_norm": 0.2727368677444671, + "learning_rate": 0.0004787482383392571, + "loss": 3.0851917266845703, + "step": 5991, + "token_acc": 0.2894705036785221 + }, + { + "epoch": 3.512459689240692, + "grad_norm": 0.21441790378685785, + "learning_rate": 0.0004787384611047262, + "loss": 3.100041389465332, + "step": 5992, + "token_acc": 0.28854019312380835 + }, + { + "epoch": 3.513046027557901, + "grad_norm": 0.22969573302622853, + "learning_rate": 0.00047872868172150573, + "loss": 3.073490619659424, + "step": 5993, + "token_acc": 0.29006336121088006 + }, + { + "epoch": 3.51363236587511, + "grad_norm": 0.26748707152484175, + "learning_rate": 0.00047871890018968743, + "loss": 3.0738022327423096, + "step": 5994, + "token_acc": 0.29170121801578386 + }, + { + "epoch": 3.514218704192319, + "grad_norm": 0.24742014474517413, + "learning_rate": 0.0004787091165093633, + "loss": 3.08567214012146, + "step": 5995, + "token_acc": 0.2889967884997706 + }, + { + "epoch": 3.514805042509528, + "grad_norm": 0.20967098302264475, + "learning_rate": 0.0004786993306806252, + "loss": 3.0812015533447266, + "step": 5996, + "token_acc": 0.2888916464907023 + }, + { + "epoch": 3.515391380826737, + "grad_norm": 0.21407514659564247, + "learning_rate": 0.0004786895427035651, + "loss": 3.1094777584075928, + "step": 5997, + "token_acc": 0.2846600322979803 + }, + { + "epoch": 3.515977719143946, + "grad_norm": 0.20893276152304585, + "learning_rate": 0.0004786797525782749, + "loss": 3.116135358810425, + "step": 5998, + "token_acc": 0.2856432731796439 + }, + { + "epoch": 3.516564057461155, + "grad_norm": 0.2086591872128816, + "learning_rate": 0.00047866996030484653, + "loss": 3.0725951194763184, + "step": 5999, + "token_acc": 0.29233333760755 + }, + { + "epoch": 3.517150395778364, + "grad_norm": 0.26255282881530384, + "learning_rate": 0.0004786601658833721, + "loss": 3.0469374656677246, + "step": 6000, + "token_acc": 0.2942312352528538 + }, + { + "epoch": 3.517736734095573, + "grad_norm": 0.32893867897322204, + "learning_rate": 0.0004786503693139435, + "loss": 3.1021957397460938, + "step": 6001, + "token_acc": 0.28532917835040644 + }, + { + "epoch": 3.518323072412782, + "grad_norm": 0.2815719010791083, + "learning_rate": 0.0004786405705966528, + "loss": 3.132190227508545, + "step": 6002, + "token_acc": 0.28244159880415 + }, + { + "epoch": 3.518909410729991, + "grad_norm": 0.1950200050129264, + "learning_rate": 0.00047863076973159196, + "loss": 3.0931668281555176, + "step": 6003, + "token_acc": 0.28613224424398415 + }, + { + "epoch": 3.5194957490472003, + "grad_norm": 0.23810333904481973, + "learning_rate": 0.0004786209667188532, + "loss": 3.0766186714172363, + "step": 6004, + "token_acc": 0.2904505035160872 + }, + { + "epoch": 3.5200820873644094, + "grad_norm": 0.20382897109857082, + "learning_rate": 0.0004786111615585285, + "loss": 3.1412878036499023, + "step": 6005, + "token_acc": 0.2815263292050653 + }, + { + "epoch": 3.5206684256816185, + "grad_norm": 0.25180309420851293, + "learning_rate": 0.0004786013542507099, + "loss": 3.072093963623047, + "step": 6006, + "token_acc": 0.2912007941698831 + }, + { + "epoch": 3.521254763998827, + "grad_norm": 0.2226163934661916, + "learning_rate": 0.0004785915447954898, + "loss": 3.035400390625, + "step": 6007, + "token_acc": 0.2953153027232278 + }, + { + "epoch": 3.5218411023160363, + "grad_norm": 0.23056535899622616, + "learning_rate": 0.00047858173319296007, + "loss": 3.1257331371307373, + "step": 6008, + "token_acc": 0.281706169090828 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.24604349688522686, + "learning_rate": 0.000478571919443213, + "loss": 3.1137661933898926, + "step": 6009, + "token_acc": 0.28473933551350206 + }, + { + "epoch": 3.5230137789504545, + "grad_norm": 0.22146145992379532, + "learning_rate": 0.0004785621035463408, + "loss": 3.026855707168579, + "step": 6010, + "token_acc": 0.29788814741374575 + }, + { + "epoch": 3.523600117267663, + "grad_norm": 0.2981028208479258, + "learning_rate": 0.00047855228550243553, + "loss": 3.076784133911133, + "step": 6011, + "token_acc": 0.29015349550639474 + }, + { + "epoch": 3.5241864555848723, + "grad_norm": 0.20549515572512128, + "learning_rate": 0.00047854246531158954, + "loss": 3.0817158222198486, + "step": 6012, + "token_acc": 0.29020446543098205 + }, + { + "epoch": 3.5247727939020814, + "grad_norm": 0.2448656895681165, + "learning_rate": 0.0004785326429738951, + "loss": 3.0994873046875, + "step": 6013, + "token_acc": 0.2883166425714678 + }, + { + "epoch": 3.5253591322192905, + "grad_norm": 0.24045836148315997, + "learning_rate": 0.00047852281848944435, + "loss": 3.0627567768096924, + "step": 6014, + "token_acc": 0.2926420598069058 + }, + { + "epoch": 3.5259454705364996, + "grad_norm": 0.19414816276974475, + "learning_rate": 0.00047851299185832974, + "loss": 3.1357312202453613, + "step": 6015, + "token_acc": 0.28466136645693446 + }, + { + "epoch": 3.5265318088537088, + "grad_norm": 0.2100676942092523, + "learning_rate": 0.00047850316308064347, + "loss": 3.075984477996826, + "step": 6016, + "token_acc": 0.29138177311445107 + }, + { + "epoch": 3.527118147170918, + "grad_norm": 0.1767316226674439, + "learning_rate": 0.0004784933321564779, + "loss": 3.0914430618286133, + "step": 6017, + "token_acc": 0.28809369820401304 + }, + { + "epoch": 3.5277044854881265, + "grad_norm": 0.23834412801881802, + "learning_rate": 0.00047848349908592534, + "loss": 3.0830278396606445, + "step": 6018, + "token_acc": 0.2905995567120669 + }, + { + "epoch": 3.5282908238053357, + "grad_norm": 0.20136414791524018, + "learning_rate": 0.0004784736638690782, + "loss": 3.1203746795654297, + "step": 6019, + "token_acc": 0.28580708376661834 + }, + { + "epoch": 3.5288771621225448, + "grad_norm": 0.21203260072393462, + "learning_rate": 0.0004784638265060289, + "loss": 3.0485548973083496, + "step": 6020, + "token_acc": 0.2948570097914195 + }, + { + "epoch": 3.529463500439754, + "grad_norm": 0.2539327149376188, + "learning_rate": 0.00047845398699686975, + "loss": 3.0871853828430176, + "step": 6021, + "token_acc": 0.2900814998550398 + }, + { + "epoch": 3.5300498387569625, + "grad_norm": 0.19771395043935258, + "learning_rate": 0.00047844414534169334, + "loss": 3.064157247543335, + "step": 6022, + "token_acc": 0.29237666573054316 + }, + { + "epoch": 3.5306361770741717, + "grad_norm": 0.2035848446376272, + "learning_rate": 0.0004784343015405919, + "loss": 3.1128532886505127, + "step": 6023, + "token_acc": 0.2862062398417995 + }, + { + "epoch": 3.5312225153913808, + "grad_norm": 0.20468539844415362, + "learning_rate": 0.0004784244555936581, + "loss": 3.040302276611328, + "step": 6024, + "token_acc": 0.2966674867950604 + }, + { + "epoch": 3.53180885370859, + "grad_norm": 0.21116021043571656, + "learning_rate": 0.00047841460750098434, + "loss": 3.08364200592041, + "step": 6025, + "token_acc": 0.28990447542148173 + }, + { + "epoch": 3.532395192025799, + "grad_norm": 0.25132135319706655, + "learning_rate": 0.0004784047572626631, + "loss": 3.066924810409546, + "step": 6026, + "token_acc": 0.2926643445066123 + }, + { + "epoch": 3.532981530343008, + "grad_norm": 0.22122588785613362, + "learning_rate": 0.0004783949048787869, + "loss": 3.09605073928833, + "step": 6027, + "token_acc": 0.2876989230476744 + }, + { + "epoch": 3.533567868660217, + "grad_norm": 0.27596119066460945, + "learning_rate": 0.00047838505034944836, + "loss": 3.097844123840332, + "step": 6028, + "token_acc": 0.2858009234917776 + }, + { + "epoch": 3.534154206977426, + "grad_norm": 0.26715735946591523, + "learning_rate": 0.0004783751936747401, + "loss": 3.052086114883423, + "step": 6029, + "token_acc": 0.29449976380663634 + }, + { + "epoch": 3.534740545294635, + "grad_norm": 0.18687202174574064, + "learning_rate": 0.0004783653348547545, + "loss": 3.0766286849975586, + "step": 6030, + "token_acc": 0.28957626247043095 + }, + { + "epoch": 3.535326883611844, + "grad_norm": 0.24079070917471235, + "learning_rate": 0.00047835547388958444, + "loss": 3.1003494262695312, + "step": 6031, + "token_acc": 0.28722589195121445 + }, + { + "epoch": 3.535913221929053, + "grad_norm": 0.2170699969772782, + "learning_rate": 0.0004783456107793224, + "loss": 3.121161937713623, + "step": 6032, + "token_acc": 0.2838605049842325 + }, + { + "epoch": 3.536499560246262, + "grad_norm": 0.19770945047579877, + "learning_rate": 0.00047833574552406103, + "loss": 3.099660873413086, + "step": 6033, + "token_acc": 0.2890059989130889 + }, + { + "epoch": 3.537085898563471, + "grad_norm": 0.2213738190721832, + "learning_rate": 0.0004783258781238929, + "loss": 3.057271957397461, + "step": 6034, + "token_acc": 0.2938741895001046 + }, + { + "epoch": 3.53767223688068, + "grad_norm": 0.21195044027559198, + "learning_rate": 0.000478316008578911, + "loss": 3.1295642852783203, + "step": 6035, + "token_acc": 0.2822314864740462 + }, + { + "epoch": 3.538258575197889, + "grad_norm": 0.21743989145001846, + "learning_rate": 0.00047830613688920777, + "loss": 3.0409865379333496, + "step": 6036, + "token_acc": 0.29490492245448413 + }, + { + "epoch": 3.5388449135150983, + "grad_norm": 0.19379257134930958, + "learning_rate": 0.000478296263054876, + "loss": 3.068934917449951, + "step": 6037, + "token_acc": 0.2911316006445042 + }, + { + "epoch": 3.5394312518323074, + "grad_norm": 0.22145731308281166, + "learning_rate": 0.0004782863870760085, + "loss": 3.071803569793701, + "step": 6038, + "token_acc": 0.2905575511958491 + }, + { + "epoch": 3.540017590149516, + "grad_norm": 0.20561135145440868, + "learning_rate": 0.00047827650895269805, + "loss": 3.1095962524414062, + "step": 6039, + "token_acc": 0.2848458498230763 + }, + { + "epoch": 3.5406039284667252, + "grad_norm": 0.2558954360046193, + "learning_rate": 0.00047826662868503733, + "loss": 3.1218907833099365, + "step": 6040, + "token_acc": 0.2832678649367139 + }, + { + "epoch": 3.5411902667839343, + "grad_norm": 0.3125864658981447, + "learning_rate": 0.00047825674627311935, + "loss": 3.077052354812622, + "step": 6041, + "token_acc": 0.2916955134820218 + }, + { + "epoch": 3.5417766051011434, + "grad_norm": 0.28375200554268565, + "learning_rate": 0.0004782468617170367, + "loss": 3.0992369651794434, + "step": 6042, + "token_acc": 0.284857586299709 + }, + { + "epoch": 3.542362943418352, + "grad_norm": 0.2416543450716066, + "learning_rate": 0.00047823697501688233, + "loss": 3.082282304763794, + "step": 6043, + "token_acc": 0.2887448087063374 + }, + { + "epoch": 3.5429492817355612, + "grad_norm": 0.3171979429807752, + "learning_rate": 0.00047822708617274923, + "loss": 3.097660541534424, + "step": 6044, + "token_acc": 0.28746801575588987 + }, + { + "epoch": 3.5435356200527703, + "grad_norm": 0.21590103521526163, + "learning_rate": 0.00047821719518473016, + "loss": 3.10398006439209, + "step": 6045, + "token_acc": 0.2876173542921027 + }, + { + "epoch": 3.5441219583699795, + "grad_norm": 0.3520410154364761, + "learning_rate": 0.0004782073020529181, + "loss": 3.1175670623779297, + "step": 6046, + "token_acc": 0.28513065393288234 + }, + { + "epoch": 3.5447082966871886, + "grad_norm": 0.24730097886728836, + "learning_rate": 0.00047819740677740584, + "loss": 3.088317632675171, + "step": 6047, + "token_acc": 0.28929394987952556 + }, + { + "epoch": 3.5452946350043977, + "grad_norm": 0.2790158099033525, + "learning_rate": 0.0004781875093582866, + "loss": 3.119417190551758, + "step": 6048, + "token_acc": 0.28379942952772896 + }, + { + "epoch": 3.545880973321607, + "grad_norm": 0.2170574192007608, + "learning_rate": 0.00047817760979565303, + "loss": 3.08149790763855, + "step": 6049, + "token_acc": 0.28855751135879226 + }, + { + "epoch": 3.5464673116388155, + "grad_norm": 0.2741517409016626, + "learning_rate": 0.0004781677080895984, + "loss": 3.031248092651367, + "step": 6050, + "token_acc": 0.29570952785215177 + }, + { + "epoch": 3.5470536499560246, + "grad_norm": 0.21710240619242815, + "learning_rate": 0.0004781578042402156, + "loss": 3.1196911334991455, + "step": 6051, + "token_acc": 0.2842616265556659 + }, + { + "epoch": 3.5476399882732337, + "grad_norm": 0.24697629356132034, + "learning_rate": 0.00047814789824759767, + "loss": 3.0917844772338867, + "step": 6052, + "token_acc": 0.28773133881673224 + }, + { + "epoch": 3.548226326590443, + "grad_norm": 0.22666633676878634, + "learning_rate": 0.0004781379901118377, + "loss": 3.0785088539123535, + "step": 6053, + "token_acc": 0.2908692464446289 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.23028019361754382, + "learning_rate": 0.00047812807983302867, + "loss": 3.1226372718811035, + "step": 6054, + "token_acc": 0.28509020430242604 + }, + { + "epoch": 3.5493990032248606, + "grad_norm": 0.2090816704759526, + "learning_rate": 0.0004781181674112638, + "loss": 3.1141867637634277, + "step": 6055, + "token_acc": 0.2849584501883246 + }, + { + "epoch": 3.5499853415420697, + "grad_norm": 0.2202981861422166, + "learning_rate": 0.0004781082528466361, + "loss": 3.0736582279205322, + "step": 6056, + "token_acc": 0.289328404028792 + }, + { + "epoch": 3.550571679859279, + "grad_norm": 0.22477475895791238, + "learning_rate": 0.0004780983361392387, + "loss": 3.076233386993408, + "step": 6057, + "token_acc": 0.2910754538370214 + }, + { + "epoch": 3.551158018176488, + "grad_norm": 0.19289651386377385, + "learning_rate": 0.00047808841728916486, + "loss": 3.066540241241455, + "step": 6058, + "token_acc": 0.29031376523589436 + }, + { + "epoch": 3.551744356493697, + "grad_norm": 0.19333325795202816, + "learning_rate": 0.0004780784962965077, + "loss": 3.0855627059936523, + "step": 6059, + "token_acc": 0.2879536457055051 + }, + { + "epoch": 3.552330694810906, + "grad_norm": 0.21945646208421482, + "learning_rate": 0.0004780685731613604, + "loss": 3.0759763717651367, + "step": 6060, + "token_acc": 0.2916137629985624 + }, + { + "epoch": 3.552917033128115, + "grad_norm": 0.2262993380224385, + "learning_rate": 0.00047805864788381616, + "loss": 3.090543270111084, + "step": 6061, + "token_acc": 0.289961875089124 + }, + { + "epoch": 3.553503371445324, + "grad_norm": 0.21081935189939519, + "learning_rate": 0.0004780487204639682, + "loss": 3.0963568687438965, + "step": 6062, + "token_acc": 0.28842421423764747 + }, + { + "epoch": 3.554089709762533, + "grad_norm": 0.22710755985761227, + "learning_rate": 0.0004780387909019099, + "loss": 3.1080727577209473, + "step": 6063, + "token_acc": 0.286900846560574 + }, + { + "epoch": 3.554676048079742, + "grad_norm": 0.23025140142205822, + "learning_rate": 0.00047802885919773436, + "loss": 3.065800666809082, + "step": 6064, + "token_acc": 0.29320443026093485 + }, + { + "epoch": 3.555262386396951, + "grad_norm": 0.20999208801339014, + "learning_rate": 0.000478018925351535, + "loss": 3.0565340518951416, + "step": 6065, + "token_acc": 0.2922210614824063 + }, + { + "epoch": 3.55584872471416, + "grad_norm": 0.2106451756440314, + "learning_rate": 0.0004780089893634051, + "loss": 3.0704643726348877, + "step": 6066, + "token_acc": 0.28946269996139035 + }, + { + "epoch": 3.556435063031369, + "grad_norm": 0.26392526063665517, + "learning_rate": 0.000477999051233438, + "loss": 3.1443710327148438, + "step": 6067, + "token_acc": 0.28012039151801915 + }, + { + "epoch": 3.557021401348578, + "grad_norm": 0.23660124070894392, + "learning_rate": 0.000477989110961727, + "loss": 3.116360664367676, + "step": 6068, + "token_acc": 0.28421414822940166 + }, + { + "epoch": 3.5576077396657872, + "grad_norm": 0.19641525986087416, + "learning_rate": 0.00047797916854836554, + "loss": 3.115983009338379, + "step": 6069, + "token_acc": 0.28584151680229325 + }, + { + "epoch": 3.5581940779829964, + "grad_norm": 0.2016102454667875, + "learning_rate": 0.000477969223993447, + "loss": 3.071686029434204, + "step": 6070, + "token_acc": 0.2920063423513642 + }, + { + "epoch": 3.5587804163002055, + "grad_norm": 0.21777419084493524, + "learning_rate": 0.00047795927729706484, + "loss": 3.059999465942383, + "step": 6071, + "token_acc": 0.2915543114105494 + }, + { + "epoch": 3.559366754617414, + "grad_norm": 0.20466529287014912, + "learning_rate": 0.0004779493284593124, + "loss": 3.0969653129577637, + "step": 6072, + "token_acc": 0.2859841130210356 + }, + { + "epoch": 3.5599530929346233, + "grad_norm": 0.17594473084978354, + "learning_rate": 0.00047793937748028323, + "loss": 3.0789592266082764, + "step": 6073, + "token_acc": 0.2904278691295059 + }, + { + "epoch": 3.5605394312518324, + "grad_norm": 0.17291870875002943, + "learning_rate": 0.0004779294243600707, + "loss": 3.085874557495117, + "step": 6074, + "token_acc": 0.2911383545196599 + }, + { + "epoch": 3.5611257695690415, + "grad_norm": 0.21431402420415846, + "learning_rate": 0.0004779194690987684, + "loss": 3.1280360221862793, + "step": 6075, + "token_acc": 0.28440405057248197 + }, + { + "epoch": 3.56171210788625, + "grad_norm": 0.31055969854273807, + "learning_rate": 0.0004779095116964698, + "loss": 3.108879566192627, + "step": 6076, + "token_acc": 0.2872302628141491 + }, + { + "epoch": 3.5622984462034593, + "grad_norm": 0.3371314806592172, + "learning_rate": 0.0004778995521532685, + "loss": 3.0597105026245117, + "step": 6077, + "token_acc": 0.29391647501432533 + }, + { + "epoch": 3.5628847845206684, + "grad_norm": 0.2138190712878982, + "learning_rate": 0.00047788959046925797, + "loss": 3.072995662689209, + "step": 6078, + "token_acc": 0.2924031555587679 + }, + { + "epoch": 3.5634711228378775, + "grad_norm": 0.27984404009076747, + "learning_rate": 0.0004778796266445318, + "loss": 3.067516565322876, + "step": 6079, + "token_acc": 0.29301465598262694 + }, + { + "epoch": 3.5640574611550866, + "grad_norm": 0.21684213419178763, + "learning_rate": 0.0004778696606791836, + "loss": 3.0667333602905273, + "step": 6080, + "token_acc": 0.2919727116208624 + }, + { + "epoch": 3.5646437994722957, + "grad_norm": 0.25388039885578323, + "learning_rate": 0.00047785969257330705, + "loss": 3.0914976596832275, + "step": 6081, + "token_acc": 0.28904869098977615 + }, + { + "epoch": 3.565230137789505, + "grad_norm": 0.21544744718239908, + "learning_rate": 0.00047784972232699566, + "loss": 3.0481085777282715, + "step": 6082, + "token_acc": 0.29330518189754 + }, + { + "epoch": 3.5658164761067135, + "grad_norm": 0.2299557294622156, + "learning_rate": 0.00047783974994034325, + "loss": 3.0937414169311523, + "step": 6083, + "token_acc": 0.2875131191868431 + }, + { + "epoch": 3.5664028144239226, + "grad_norm": 0.27472314604687254, + "learning_rate": 0.0004778297754134433, + "loss": 3.084862232208252, + "step": 6084, + "token_acc": 0.2889697467946522 + }, + { + "epoch": 3.5669891527411317, + "grad_norm": 0.19171067473319317, + "learning_rate": 0.0004778197987463897, + "loss": 3.110020637512207, + "step": 6085, + "token_acc": 0.2859867323418398 + }, + { + "epoch": 3.567575491058341, + "grad_norm": 0.2535443857362273, + "learning_rate": 0.0004778098199392761, + "loss": 3.110361099243164, + "step": 6086, + "token_acc": 0.28642661224137184 + }, + { + "epoch": 3.5681618293755495, + "grad_norm": 0.22376798027829048, + "learning_rate": 0.00047779983899219614, + "loss": 3.1167047023773193, + "step": 6087, + "token_acc": 0.28659363805442645 + }, + { + "epoch": 3.5687481676927586, + "grad_norm": 0.2529065217246973, + "learning_rate": 0.0004777898559052437, + "loss": 3.0566165447235107, + "step": 6088, + "token_acc": 0.29346751375287944 + }, + { + "epoch": 3.5693345060099677, + "grad_norm": 0.21075350874348966, + "learning_rate": 0.0004777798706785125, + "loss": 3.100754737854004, + "step": 6089, + "token_acc": 0.2861667542236739 + }, + { + "epoch": 3.569920844327177, + "grad_norm": 0.2709613704428958, + "learning_rate": 0.0004777698833120964, + "loss": 3.0707015991210938, + "step": 6090, + "token_acc": 0.29262913612122887 + }, + { + "epoch": 3.570507182644386, + "grad_norm": 0.2343618137578625, + "learning_rate": 0.0004777598938060891, + "loss": 3.0947189331054688, + "step": 6091, + "token_acc": 0.28833164856320836 + }, + { + "epoch": 3.571093520961595, + "grad_norm": 0.2241432259566997, + "learning_rate": 0.00047774990216058454, + "loss": 3.0784573554992676, + "step": 6092, + "token_acc": 0.2894586818333891 + }, + { + "epoch": 3.5716798592788037, + "grad_norm": 0.19551955466333742, + "learning_rate": 0.00047773990837567657, + "loss": 3.0498709678649902, + "step": 6093, + "token_acc": 0.2927040473990049 + }, + { + "epoch": 3.572266197596013, + "grad_norm": 0.19587532929007806, + "learning_rate": 0.00047772991245145904, + "loss": 3.069504737854004, + "step": 6094, + "token_acc": 0.2911250003270274 + }, + { + "epoch": 3.572852535913222, + "grad_norm": 0.21101221017589122, + "learning_rate": 0.00047771991438802573, + "loss": 3.0905089378356934, + "step": 6095, + "token_acc": 0.287147384482961 + }, + { + "epoch": 3.573438874230431, + "grad_norm": 0.20236658811978933, + "learning_rate": 0.0004777099141854708, + "loss": 3.0461857318878174, + "step": 6096, + "token_acc": 0.29494617560779374 + }, + { + "epoch": 3.5740252125476397, + "grad_norm": 0.24114969280795062, + "learning_rate": 0.00047769991184388804, + "loss": 3.0389723777770996, + "step": 6097, + "token_acc": 0.29420878881310536 + }, + { + "epoch": 3.574611550864849, + "grad_norm": 0.22186159189140797, + "learning_rate": 0.0004776899073633715, + "loss": 3.0373387336730957, + "step": 6098, + "token_acc": 0.29637481950097094 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 0.19430216809701573, + "learning_rate": 0.000477679900744015, + "loss": 3.0670900344848633, + "step": 6099, + "token_acc": 0.29157500013002347 + }, + { + "epoch": 3.575784227499267, + "grad_norm": 0.2610316997840719, + "learning_rate": 0.0004776698919859127, + "loss": 3.100808620452881, + "step": 6100, + "token_acc": 0.28657782032805584 + }, + { + "epoch": 3.576370565816476, + "grad_norm": 0.2535076239290258, + "learning_rate": 0.00047765988108915857, + "loss": 3.0593724250793457, + "step": 6101, + "token_acc": 0.2918306525021828 + }, + { + "epoch": 3.5769569041336853, + "grad_norm": 0.2291889075268578, + "learning_rate": 0.0004776498680538466, + "loss": 3.079282283782959, + "step": 6102, + "token_acc": 0.2887910681525556 + }, + { + "epoch": 3.5775432424508944, + "grad_norm": 0.2098851794151694, + "learning_rate": 0.00047763985288007085, + "loss": 3.0537474155426025, + "step": 6103, + "token_acc": 0.29461693496951297 + }, + { + "epoch": 3.578129580768103, + "grad_norm": 0.2514736907085398, + "learning_rate": 0.0004776298355679255, + "loss": 3.081895351409912, + "step": 6104, + "token_acc": 0.28882546429296113 + }, + { + "epoch": 3.578715919085312, + "grad_norm": 0.25639747013447084, + "learning_rate": 0.0004776198161175045, + "loss": 3.072284460067749, + "step": 6105, + "token_acc": 0.2907978283118113 + }, + { + "epoch": 3.5793022574025213, + "grad_norm": 0.20468551853002448, + "learning_rate": 0.0004776097945289021, + "loss": 3.0495762825012207, + "step": 6106, + "token_acc": 0.29452453689986 + }, + { + "epoch": 3.5798885957197304, + "grad_norm": 0.2622304404380167, + "learning_rate": 0.0004775997708022124, + "loss": 3.127688407897949, + "step": 6107, + "token_acc": 0.28256165053640897 + }, + { + "epoch": 3.580474934036939, + "grad_norm": 0.22849655173306108, + "learning_rate": 0.00047758974493752947, + "loss": 3.138761520385742, + "step": 6108, + "token_acc": 0.282469409637541 + }, + { + "epoch": 3.581061272354148, + "grad_norm": 0.21289464254768897, + "learning_rate": 0.0004775797169349476, + "loss": 3.0690762996673584, + "step": 6109, + "token_acc": 0.29035518793976217 + }, + { + "epoch": 3.5816476106713573, + "grad_norm": 0.2503781919365284, + "learning_rate": 0.000477569686794561, + "loss": 3.0683746337890625, + "step": 6110, + "token_acc": 0.2913076559254766 + }, + { + "epoch": 3.5822339489885664, + "grad_norm": 0.2061015191146862, + "learning_rate": 0.0004775596545164639, + "loss": 3.0513505935668945, + "step": 6111, + "token_acc": 0.29413052605549816 + }, + { + "epoch": 3.5828202873057755, + "grad_norm": 0.22973891102742997, + "learning_rate": 0.0004775496201007504, + "loss": 3.1333398818969727, + "step": 6112, + "token_acc": 0.2838682111164955 + }, + { + "epoch": 3.5834066256229846, + "grad_norm": 0.201200575524959, + "learning_rate": 0.0004775395835475148, + "loss": 3.1161303520202637, + "step": 6113, + "token_acc": 0.28424492476523894 + }, + { + "epoch": 3.5839929639401937, + "grad_norm": 0.19444768381835464, + "learning_rate": 0.0004775295448568515, + "loss": 3.083249807357788, + "step": 6114, + "token_acc": 0.290126555049654 + }, + { + "epoch": 3.5845793022574024, + "grad_norm": 0.17201432056093507, + "learning_rate": 0.0004775195040288547, + "loss": 3.0807690620422363, + "step": 6115, + "token_acc": 0.29083650073565476 + }, + { + "epoch": 3.5851656405746115, + "grad_norm": 0.22920058568961252, + "learning_rate": 0.0004775094610636188, + "loss": 3.0926599502563477, + "step": 6116, + "token_acc": 0.28719250904270033 + }, + { + "epoch": 3.5857519788918206, + "grad_norm": 0.24767227692114227, + "learning_rate": 0.00047749941596123805, + "loss": 3.095942735671997, + "step": 6117, + "token_acc": 0.2886343798767424 + }, + { + "epoch": 3.5863383172090297, + "grad_norm": 0.20748647290726602, + "learning_rate": 0.0004774893687218068, + "loss": 3.0598459243774414, + "step": 6118, + "token_acc": 0.29318196818691833 + }, + { + "epoch": 3.5869246555262384, + "grad_norm": 0.21878671420519874, + "learning_rate": 0.0004774793193454196, + "loss": 3.071408271789551, + "step": 6119, + "token_acc": 0.29111665422500593 + }, + { + "epoch": 3.5875109938434475, + "grad_norm": 0.289051300361958, + "learning_rate": 0.0004774692678321706, + "loss": 3.114727258682251, + "step": 6120, + "token_acc": 0.28526619612914667 + }, + { + "epoch": 3.5880973321606566, + "grad_norm": 0.2238484436034807, + "learning_rate": 0.00047745921418215434, + "loss": 3.053147792816162, + "step": 6121, + "token_acc": 0.2937151580410811 + }, + { + "epoch": 3.5886836704778657, + "grad_norm": 0.18646507233837395, + "learning_rate": 0.0004774491583954653, + "loss": 3.032620906829834, + "step": 6122, + "token_acc": 0.29550179257692605 + }, + { + "epoch": 3.589270008795075, + "grad_norm": 0.23580325117038076, + "learning_rate": 0.00047743910047219797, + "loss": 3.0558462142944336, + "step": 6123, + "token_acc": 0.2919754589535647 + }, + { + "epoch": 3.589856347112284, + "grad_norm": 0.19364107394716706, + "learning_rate": 0.00047742904041244674, + "loss": 3.0633974075317383, + "step": 6124, + "token_acc": 0.29236689097835195 + }, + { + "epoch": 3.590442685429493, + "grad_norm": 0.23520470132089974, + "learning_rate": 0.0004774189782163061, + "loss": 3.0803332328796387, + "step": 6125, + "token_acc": 0.28942831764452365 + }, + { + "epoch": 3.5910290237467017, + "grad_norm": 0.26290736887822436, + "learning_rate": 0.0004774089138838706, + "loss": 3.0983376502990723, + "step": 6126, + "token_acc": 0.28632994659701205 + }, + { + "epoch": 3.591615362063911, + "grad_norm": 0.2237605436060088, + "learning_rate": 0.00047739884741523485, + "loss": 3.0694193840026855, + "step": 6127, + "token_acc": 0.291640866873065 + }, + { + "epoch": 3.59220170038112, + "grad_norm": 0.20886323923382272, + "learning_rate": 0.00047738877881049334, + "loss": 3.101034164428711, + "step": 6128, + "token_acc": 0.2860221885025124 + }, + { + "epoch": 3.592788038698329, + "grad_norm": 0.217531252974018, + "learning_rate": 0.0004773787080697406, + "loss": 3.056234836578369, + "step": 6129, + "token_acc": 0.2934061575236296 + }, + { + "epoch": 3.5933743770155377, + "grad_norm": 0.20772401907988092, + "learning_rate": 0.0004773686351930714, + "loss": 3.085545063018799, + "step": 6130, + "token_acc": 0.2884021027909629 + }, + { + "epoch": 3.593960715332747, + "grad_norm": 0.21938291358499384, + "learning_rate": 0.0004773585601805802, + "loss": 3.072904109954834, + "step": 6131, + "token_acc": 0.28970801948450087 + }, + { + "epoch": 3.594547053649956, + "grad_norm": 0.23890405225059724, + "learning_rate": 0.00047734848303236167, + "loss": 3.0692057609558105, + "step": 6132, + "token_acc": 0.2916914125148148 + }, + { + "epoch": 3.595133391967165, + "grad_norm": 0.2395632705844236, + "learning_rate": 0.00047733840374851054, + "loss": 3.0858116149902344, + "step": 6133, + "token_acc": 0.2886483827327512 + }, + { + "epoch": 3.595719730284374, + "grad_norm": 0.19938532101372497, + "learning_rate": 0.0004773283223291214, + "loss": 3.102911949157715, + "step": 6134, + "token_acc": 0.2872610776176701 + }, + { + "epoch": 3.5963060686015833, + "grad_norm": 0.1860892143999292, + "learning_rate": 0.0004773182387742891, + "loss": 3.078582763671875, + "step": 6135, + "token_acc": 0.2893563354163819 + }, + { + "epoch": 3.5968924069187924, + "grad_norm": 0.19354341727959734, + "learning_rate": 0.0004773081530841082, + "loss": 3.1193454265594482, + "step": 6136, + "token_acc": 0.2844470666962947 + }, + { + "epoch": 3.597478745236001, + "grad_norm": 0.19924702050923326, + "learning_rate": 0.0004772980652586735, + "loss": 3.091604709625244, + "step": 6137, + "token_acc": 0.2889168533336067 + }, + { + "epoch": 3.59806508355321, + "grad_norm": 0.22927768235522245, + "learning_rate": 0.0004772879752980798, + "loss": 3.0477333068847656, + "step": 6138, + "token_acc": 0.2941999606271981 + }, + { + "epoch": 3.5986514218704193, + "grad_norm": 0.2979808594007924, + "learning_rate": 0.0004772778832024217, + "loss": 3.0915913581848145, + "step": 6139, + "token_acc": 0.2887546965116186 + }, + { + "epoch": 3.5992377601876284, + "grad_norm": 0.28665369194031165, + "learning_rate": 0.00047726778897179433, + "loss": 3.1050682067871094, + "step": 6140, + "token_acc": 0.28550109482014047 + }, + { + "epoch": 3.599824098504837, + "grad_norm": 0.2478521673189605, + "learning_rate": 0.0004772576926062923, + "loss": 3.1017208099365234, + "step": 6141, + "token_acc": 0.28742015546848426 + }, + { + "epoch": 3.600410436822046, + "grad_norm": 0.2847732604249733, + "learning_rate": 0.00047724759410601035, + "loss": 3.0851950645446777, + "step": 6142, + "token_acc": 0.28896857474799775 + }, + { + "epoch": 3.6009967751392553, + "grad_norm": 0.22798102669016376, + "learning_rate": 0.00047723749347104363, + "loss": 3.096829414367676, + "step": 6143, + "token_acc": 0.28860728040750405 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.21713068733581306, + "learning_rate": 0.0004772273907014868, + "loss": 3.0462703704833984, + "step": 6144, + "token_acc": 0.29586516340446306 + }, + { + "epoch": 3.6021694517736735, + "grad_norm": 0.2268065175670834, + "learning_rate": 0.00047721728579743487, + "loss": 3.0793564319610596, + "step": 6145, + "token_acc": 0.28781232814947444 + }, + { + "epoch": 3.6027557900908826, + "grad_norm": 0.23139116845101704, + "learning_rate": 0.0004772071787589826, + "loss": 3.0989623069763184, + "step": 6146, + "token_acc": 0.28692971158724584 + }, + { + "epoch": 3.6033421284080913, + "grad_norm": 0.21611950213225842, + "learning_rate": 0.00047719706958622513, + "loss": 3.048210382461548, + "step": 6147, + "token_acc": 0.2940842841126426 + }, + { + "epoch": 3.6039284667253004, + "grad_norm": 0.21748068594075223, + "learning_rate": 0.0004771869582792573, + "loss": 3.084216594696045, + "step": 6148, + "token_acc": 0.289161380243295 + }, + { + "epoch": 3.6045148050425095, + "grad_norm": 0.19099557933798814, + "learning_rate": 0.00047717684483817425, + "loss": 3.1019816398620605, + "step": 6149, + "token_acc": 0.2862859384076139 + }, + { + "epoch": 3.6051011433597187, + "grad_norm": 0.21046883733817803, + "learning_rate": 0.0004771667292630708, + "loss": 3.1363589763641357, + "step": 6150, + "token_acc": 0.2820389541841518 + }, + { + "epoch": 3.6056874816769273, + "grad_norm": 0.24918852904935335, + "learning_rate": 0.000477156611554042, + "loss": 3.1044440269470215, + "step": 6151, + "token_acc": 0.2866876974942667 + }, + { + "epoch": 3.6062738199941364, + "grad_norm": 0.2871318398068339, + "learning_rate": 0.00047714649171118296, + "loss": 3.1065168380737305, + "step": 6152, + "token_acc": 0.2857030679471374 + }, + { + "epoch": 3.6068601583113455, + "grad_norm": 0.20726122903419147, + "learning_rate": 0.0004771363697345887, + "loss": 3.0898056030273438, + "step": 6153, + "token_acc": 0.2879864226578202 + }, + { + "epoch": 3.6074464966285547, + "grad_norm": 0.22849655198521618, + "learning_rate": 0.0004771262456243544, + "loss": 3.1361660957336426, + "step": 6154, + "token_acc": 0.2797342594149294 + }, + { + "epoch": 3.6080328349457638, + "grad_norm": 0.21458153035986993, + "learning_rate": 0.00047711611938057496, + "loss": 3.073190212249756, + "step": 6155, + "token_acc": 0.28994228802540767 + }, + { + "epoch": 3.608619173262973, + "grad_norm": 0.25545589844118616, + "learning_rate": 0.00047710599100334565, + "loss": 3.1131415367126465, + "step": 6156, + "token_acc": 0.2839673387252815 + }, + { + "epoch": 3.609205511580182, + "grad_norm": 0.24726736855435918, + "learning_rate": 0.00047709586049276165, + "loss": 3.084441900253296, + "step": 6157, + "token_acc": 0.2895341914828877 + }, + { + "epoch": 3.6097918498973907, + "grad_norm": 0.1984414527088492, + "learning_rate": 0.000477085727848918, + "loss": 3.0631606578826904, + "step": 6158, + "token_acc": 0.29389436714845335 + }, + { + "epoch": 3.6103781882145998, + "grad_norm": 0.1898311563909263, + "learning_rate": 0.00047707559307191, + "loss": 3.0513486862182617, + "step": 6159, + "token_acc": 0.2939433535160204 + }, + { + "epoch": 3.610964526531809, + "grad_norm": 0.2344889160096546, + "learning_rate": 0.00047706545616183274, + "loss": 3.0630717277526855, + "step": 6160, + "token_acc": 0.2917871916892165 + }, + { + "epoch": 3.611550864849018, + "grad_norm": 0.2014382254291346, + "learning_rate": 0.0004770553171187815, + "loss": 3.034069299697876, + "step": 6161, + "token_acc": 0.29681578856899565 + }, + { + "epoch": 3.6121372031662267, + "grad_norm": 0.19323502557645017, + "learning_rate": 0.0004770451759428515, + "loss": 3.1025657653808594, + "step": 6162, + "token_acc": 0.28746280373633487 + }, + { + "epoch": 3.6127235414834358, + "grad_norm": 0.24961574108924697, + "learning_rate": 0.00047703503263413807, + "loss": 3.084105968475342, + "step": 6163, + "token_acc": 0.28898550572529097 + }, + { + "epoch": 3.613309879800645, + "grad_norm": 0.2861568362886386, + "learning_rate": 0.0004770248871927364, + "loss": 3.052492380142212, + "step": 6164, + "token_acc": 0.29388593242303646 + }, + { + "epoch": 3.613896218117854, + "grad_norm": 0.19815537713231182, + "learning_rate": 0.0004770147396187418, + "loss": 3.1211977005004883, + "step": 6165, + "token_acc": 0.2837363404456261 + }, + { + "epoch": 3.614482556435063, + "grad_norm": 0.23209739560927015, + "learning_rate": 0.00047700458991224964, + "loss": 3.161158561706543, + "step": 6166, + "token_acc": 0.27751919614454357 + }, + { + "epoch": 3.615068894752272, + "grad_norm": 0.2911482814418456, + "learning_rate": 0.0004769944380733553, + "loss": 3.0570719242095947, + "step": 6167, + "token_acc": 0.2914293673800319 + }, + { + "epoch": 3.6156552330694813, + "grad_norm": 0.22721726031400136, + "learning_rate": 0.0004769842841021541, + "loss": 3.066655158996582, + "step": 6168, + "token_acc": 0.2915528568117965 + }, + { + "epoch": 3.61624157138669, + "grad_norm": 0.19723944276794295, + "learning_rate": 0.0004769741279987414, + "loss": 3.106085777282715, + "step": 6169, + "token_acc": 0.28563930594589565 + }, + { + "epoch": 3.616827909703899, + "grad_norm": 0.2043437274005556, + "learning_rate": 0.00047696396976321256, + "loss": 3.059593677520752, + "step": 6170, + "token_acc": 0.2935196984750961 + }, + { + "epoch": 3.6174142480211082, + "grad_norm": 0.18683702186265808, + "learning_rate": 0.00047695380939566314, + "loss": 3.0783352851867676, + "step": 6171, + "token_acc": 0.29276142879092976 + }, + { + "epoch": 3.6180005863383173, + "grad_norm": 0.24316790097054164, + "learning_rate": 0.00047694364689618844, + "loss": 3.096801280975342, + "step": 6172, + "token_acc": 0.2881470695905161 + }, + { + "epoch": 3.618586924655526, + "grad_norm": 0.21890597200081613, + "learning_rate": 0.00047693348226488403, + "loss": 3.060990333557129, + "step": 6173, + "token_acc": 0.2916454143993911 + }, + { + "epoch": 3.619173262972735, + "grad_norm": 0.21315895460354048, + "learning_rate": 0.0004769233155018454, + "loss": 3.0711026191711426, + "step": 6174, + "token_acc": 0.29160285246324935 + }, + { + "epoch": 3.6197596012899442, + "grad_norm": 0.2509137142499466, + "learning_rate": 0.0004769131466071679, + "loss": 3.1608777046203613, + "step": 6175, + "token_acc": 0.2791856200116948 + }, + { + "epoch": 3.6203459396071533, + "grad_norm": 0.21963180563037735, + "learning_rate": 0.0004769029755809472, + "loss": 3.0996413230895996, + "step": 6176, + "token_acc": 0.28584191115185864 + }, + { + "epoch": 3.6209322779243625, + "grad_norm": 0.22359001954251972, + "learning_rate": 0.00047689280242327884, + "loss": 3.0976853370666504, + "step": 6177, + "token_acc": 0.2881094049655651 + }, + { + "epoch": 3.6215186162415716, + "grad_norm": 0.22446769676673142, + "learning_rate": 0.0004768826271342583, + "loss": 3.077608346939087, + "step": 6178, + "token_acc": 0.29003724633300376 + }, + { + "epoch": 3.6221049545587807, + "grad_norm": 0.23685189025065137, + "learning_rate": 0.0004768724497139812, + "loss": 3.0701286792755127, + "step": 6179, + "token_acc": 0.2918935206885717 + }, + { + "epoch": 3.6226912928759893, + "grad_norm": 0.23679083876027873, + "learning_rate": 0.00047686227016254315, + "loss": 3.0648012161254883, + "step": 6180, + "token_acc": 0.29228603260883945 + }, + { + "epoch": 3.6232776311931985, + "grad_norm": 0.24234513773312819, + "learning_rate": 0.0004768520884800398, + "loss": 3.1013264656066895, + "step": 6181, + "token_acc": 0.2870063169672341 + }, + { + "epoch": 3.6238639695104076, + "grad_norm": 0.25733685944573786, + "learning_rate": 0.00047684190466656674, + "loss": 3.118103504180908, + "step": 6182, + "token_acc": 0.283923168668202 + }, + { + "epoch": 3.6244503078276167, + "grad_norm": 0.2248902928697395, + "learning_rate": 0.00047683171872221964, + "loss": 3.0534701347351074, + "step": 6183, + "token_acc": 0.29327650883413986 + }, + { + "epoch": 3.6250366461448253, + "grad_norm": 0.27249207301391154, + "learning_rate": 0.0004768215306470943, + "loss": 3.0688135623931885, + "step": 6184, + "token_acc": 0.2924603344081659 + }, + { + "epoch": 3.6256229844620345, + "grad_norm": 0.304496002357318, + "learning_rate": 0.0004768113404412862, + "loss": 3.0284557342529297, + "step": 6185, + "token_acc": 0.2968628611594096 + }, + { + "epoch": 3.6262093227792436, + "grad_norm": 0.18877088115358923, + "learning_rate": 0.00047680114810489126, + "loss": 3.0931167602539062, + "step": 6186, + "token_acc": 0.28689559548820254 + }, + { + "epoch": 3.6267956610964527, + "grad_norm": 0.22025622833470732, + "learning_rate": 0.00047679095363800515, + "loss": 3.062897205352783, + "step": 6187, + "token_acc": 0.2929161372299873 + }, + { + "epoch": 3.627381999413662, + "grad_norm": 0.20737391241847097, + "learning_rate": 0.0004767807570407236, + "loss": 3.0727691650390625, + "step": 6188, + "token_acc": 0.28956141456141454 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.2209541145496801, + "learning_rate": 0.0004767705583131424, + "loss": 3.083425521850586, + "step": 6189, + "token_acc": 0.28936211580317456 + }, + { + "epoch": 3.62855467604808, + "grad_norm": 0.24295002611734384, + "learning_rate": 0.00047676035745535753, + "loss": 3.095484733581543, + "step": 6190, + "token_acc": 0.2877602432931928 + }, + { + "epoch": 3.6291410143652887, + "grad_norm": 0.18887464587121233, + "learning_rate": 0.0004767501544674645, + "loss": 3.090249538421631, + "step": 6191, + "token_acc": 0.2878626069541656 + }, + { + "epoch": 3.629727352682498, + "grad_norm": 0.2739051507571642, + "learning_rate": 0.0004767399493495594, + "loss": 3.0976619720458984, + "step": 6192, + "token_acc": 0.2878212243919472 + }, + { + "epoch": 3.630313690999707, + "grad_norm": 0.19223149731864148, + "learning_rate": 0.00047672974210173803, + "loss": 3.0885186195373535, + "step": 6193, + "token_acc": 0.2885616825972057 + }, + { + "epoch": 3.630900029316916, + "grad_norm": 0.2316482804956627, + "learning_rate": 0.0004767195327240962, + "loss": 3.097896099090576, + "step": 6194, + "token_acc": 0.28610354923211256 + }, + { + "epoch": 3.6314863676341247, + "grad_norm": 0.2178982831809425, + "learning_rate": 0.00047670932121672994, + "loss": 3.0861058235168457, + "step": 6195, + "token_acc": 0.2888716146243579 + }, + { + "epoch": 3.632072705951334, + "grad_norm": 0.21499441368527678, + "learning_rate": 0.000476699107579735, + "loss": 3.0841989517211914, + "step": 6196, + "token_acc": 0.28859822135683333 + }, + { + "epoch": 3.632659044268543, + "grad_norm": 0.20166165837469957, + "learning_rate": 0.0004766888918132075, + "loss": 3.1198253631591797, + "step": 6197, + "token_acc": 0.28530043467832994 + }, + { + "epoch": 3.633245382585752, + "grad_norm": 0.25243003728941044, + "learning_rate": 0.00047667867391724326, + "loss": 3.0948214530944824, + "step": 6198, + "token_acc": 0.2888231001990224 + }, + { + "epoch": 3.633831720902961, + "grad_norm": 0.2597949203094148, + "learning_rate": 0.00047666845389193844, + "loss": 3.131648302078247, + "step": 6199, + "token_acc": 0.2822694007305449 + }, + { + "epoch": 3.6344180592201702, + "grad_norm": 0.21529718017280955, + "learning_rate": 0.00047665823173738886, + "loss": 3.0767598152160645, + "step": 6200, + "token_acc": 0.29077363418817675 + }, + { + "epoch": 3.635004397537379, + "grad_norm": 0.24940255725691335, + "learning_rate": 0.00047664800745369064, + "loss": 3.0652365684509277, + "step": 6201, + "token_acc": 0.29327032760704985 + }, + { + "epoch": 3.635590735854588, + "grad_norm": 0.20055836852612532, + "learning_rate": 0.0004766377810409398, + "loss": 3.1221132278442383, + "step": 6202, + "token_acc": 0.28106990745784055 + }, + { + "epoch": 3.636177074171797, + "grad_norm": 0.22678682101731504, + "learning_rate": 0.0004766275524992324, + "loss": 3.0877137184143066, + "step": 6203, + "token_acc": 0.28937667472457174 + }, + { + "epoch": 3.6367634124890063, + "grad_norm": 0.2754439934437512, + "learning_rate": 0.0004766173218286645, + "loss": 3.1052157878875732, + "step": 6204, + "token_acc": 0.28661094723041625 + }, + { + "epoch": 3.637349750806215, + "grad_norm": 0.21171118960532365, + "learning_rate": 0.00047660708902933223, + "loss": 3.093491315841675, + "step": 6205, + "token_acc": 0.28907416658429796 + }, + { + "epoch": 3.637936089123424, + "grad_norm": 0.25315282061976235, + "learning_rate": 0.0004765968541013318, + "loss": 3.090362548828125, + "step": 6206, + "token_acc": 0.28790232201490334 + }, + { + "epoch": 3.638522427440633, + "grad_norm": 0.24392693684960898, + "learning_rate": 0.0004765866170447592, + "loss": 3.1106064319610596, + "step": 6207, + "token_acc": 0.28598399687955406 + }, + { + "epoch": 3.6391087657578423, + "grad_norm": 0.18222006037600721, + "learning_rate": 0.0004765763778597107, + "loss": 3.0544707775115967, + "step": 6208, + "token_acc": 0.2932226615208466 + }, + { + "epoch": 3.6396951040750514, + "grad_norm": 0.22079661216213745, + "learning_rate": 0.0004765661365462824, + "loss": 3.1136183738708496, + "step": 6209, + "token_acc": 0.28499264938736313 + }, + { + "epoch": 3.6402814423922605, + "grad_norm": 0.17803388429075456, + "learning_rate": 0.0004765558931045706, + "loss": 3.0879428386688232, + "step": 6210, + "token_acc": 0.288806224267627 + }, + { + "epoch": 3.6408677807094696, + "grad_norm": 0.22646868712896623, + "learning_rate": 0.00047654564753467143, + "loss": 3.090172290802002, + "step": 6211, + "token_acc": 0.2889591964846202 + }, + { + "epoch": 3.6414541190266783, + "grad_norm": 0.19908643396019987, + "learning_rate": 0.00047653539983668117, + "loss": 3.084719181060791, + "step": 6212, + "token_acc": 0.2874309579038247 + }, + { + "epoch": 3.6420404573438874, + "grad_norm": 0.19252639591798976, + "learning_rate": 0.00047652515001069613, + "loss": 3.044753313064575, + "step": 6213, + "token_acc": 0.2956261458450256 + }, + { + "epoch": 3.6426267956610965, + "grad_norm": 0.22536987238600592, + "learning_rate": 0.00047651489805681255, + "loss": 3.090442180633545, + "step": 6214, + "token_acc": 0.28893456069757123 + }, + { + "epoch": 3.6432131339783056, + "grad_norm": 0.21112075705873906, + "learning_rate": 0.00047650464397512674, + "loss": 3.1197757720947266, + "step": 6215, + "token_acc": 0.28439293245410063 + }, + { + "epoch": 3.6437994722955143, + "grad_norm": 0.21088413497887928, + "learning_rate": 0.00047649438776573494, + "loss": 3.111454963684082, + "step": 6216, + "token_acc": 0.28508773094956635 + }, + { + "epoch": 3.6443858106127234, + "grad_norm": 0.2338630637272971, + "learning_rate": 0.00047648412942873363, + "loss": 3.0553455352783203, + "step": 6217, + "token_acc": 0.2925149152852453 + }, + { + "epoch": 3.6449721489299325, + "grad_norm": 0.2527369762607655, + "learning_rate": 0.00047647386896421915, + "loss": 3.0808944702148438, + "step": 6218, + "token_acc": 0.29092652573045624 + }, + { + "epoch": 3.6455584872471416, + "grad_norm": 0.24080196818436336, + "learning_rate": 0.0004764636063722878, + "loss": 3.115847110748291, + "step": 6219, + "token_acc": 0.28540999199038847 + }, + { + "epoch": 3.6461448255643507, + "grad_norm": 0.2047574716932012, + "learning_rate": 0.000476453341653036, + "loss": 3.0454177856445312, + "step": 6220, + "token_acc": 0.29516576871448796 + }, + { + "epoch": 3.64673116388156, + "grad_norm": 0.21725096741971558, + "learning_rate": 0.00047644307480656026, + "loss": 3.1163296699523926, + "step": 6221, + "token_acc": 0.28523514324579713 + }, + { + "epoch": 3.647317502198769, + "grad_norm": 0.26251639460341036, + "learning_rate": 0.00047643280583295694, + "loss": 3.0693156719207764, + "step": 6222, + "token_acc": 0.29153100527749404 + }, + { + "epoch": 3.6479038405159776, + "grad_norm": 0.3069609613672709, + "learning_rate": 0.00047642253473232255, + "loss": 3.0870721340179443, + "step": 6223, + "token_acc": 0.28760876442583444 + }, + { + "epoch": 3.6484901788331867, + "grad_norm": 0.294005411916087, + "learning_rate": 0.00047641226150475357, + "loss": 3.087651252746582, + "step": 6224, + "token_acc": 0.28842587111795115 + }, + { + "epoch": 3.649076517150396, + "grad_norm": 0.2063518943121151, + "learning_rate": 0.0004764019861503465, + "loss": 3.0559561252593994, + "step": 6225, + "token_acc": 0.2925019755871138 + }, + { + "epoch": 3.649662855467605, + "grad_norm": 0.28362668986195483, + "learning_rate": 0.0004763917086691978, + "loss": 3.0708789825439453, + "step": 6226, + "token_acc": 0.29053344258506403 + }, + { + "epoch": 3.6502491937848136, + "grad_norm": 0.30496142217560235, + "learning_rate": 0.00047638142906140403, + "loss": 3.1105563640594482, + "step": 6227, + "token_acc": 0.285320124212932 + }, + { + "epoch": 3.6508355321020227, + "grad_norm": 0.22565563687211634, + "learning_rate": 0.00047637114732706186, + "loss": 3.0886902809143066, + "step": 6228, + "token_acc": 0.28961035509249416 + }, + { + "epoch": 3.651421870419232, + "grad_norm": 0.18822991769045821, + "learning_rate": 0.00047636086346626783, + "loss": 3.027623176574707, + "step": 6229, + "token_acc": 0.2981633530059425 + }, + { + "epoch": 3.652008208736441, + "grad_norm": 0.23974902357576536, + "learning_rate": 0.0004763505774791184, + "loss": 3.1236419677734375, + "step": 6230, + "token_acc": 0.2841485020938473 + }, + { + "epoch": 3.65259454705365, + "grad_norm": 0.21181182226205186, + "learning_rate": 0.0004763402893657104, + "loss": 3.094027042388916, + "step": 6231, + "token_acc": 0.28781114371087463 + }, + { + "epoch": 3.653180885370859, + "grad_norm": 0.26768787816589984, + "learning_rate": 0.00047632999912614033, + "loss": 3.114208221435547, + "step": 6232, + "token_acc": 0.2845583187072404 + }, + { + "epoch": 3.6537672236880683, + "grad_norm": 0.2890370560076679, + "learning_rate": 0.00047631970676050494, + "loss": 3.102231979370117, + "step": 6233, + "token_acc": 0.28813852721621974 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.22943613032760116, + "learning_rate": 0.00047630941226890083, + "loss": 3.092461109161377, + "step": 6234, + "token_acc": 0.28880832964294173 + }, + { + "epoch": 3.654939900322486, + "grad_norm": 0.2472866075783113, + "learning_rate": 0.0004762991156514248, + "loss": 3.107983350753784, + "step": 6235, + "token_acc": 0.28601362646334627 + }, + { + "epoch": 3.655526238639695, + "grad_norm": 0.3046132094720925, + "learning_rate": 0.0004762888169081735, + "loss": 3.151815414428711, + "step": 6236, + "token_acc": 0.2816000657336145 + }, + { + "epoch": 3.6561125769569043, + "grad_norm": 0.24462702280019508, + "learning_rate": 0.0004762785160392437, + "loss": 3.0504300594329834, + "step": 6237, + "token_acc": 0.2928942364049131 + }, + { + "epoch": 3.656698915274113, + "grad_norm": 0.25652431828056904, + "learning_rate": 0.0004762682130447321, + "loss": 3.0745091438293457, + "step": 6238, + "token_acc": 0.2898492358529533 + }, + { + "epoch": 3.657285253591322, + "grad_norm": 0.2925398133522588, + "learning_rate": 0.00047625790792473556, + "loss": 3.0625357627868652, + "step": 6239, + "token_acc": 0.2931582539557371 + }, + { + "epoch": 3.657871591908531, + "grad_norm": 0.2369871157977074, + "learning_rate": 0.0004762476006793509, + "loss": 3.048499345779419, + "step": 6240, + "token_acc": 0.2934155913370694 + }, + { + "epoch": 3.6584579302257403, + "grad_norm": 0.23558119800398816, + "learning_rate": 0.0004762372913086749, + "loss": 3.099194288253784, + "step": 6241, + "token_acc": 0.2885295956271731 + }, + { + "epoch": 3.6590442685429494, + "grad_norm": 0.23242298993515245, + "learning_rate": 0.0004762269798128044, + "loss": 3.086749315261841, + "step": 6242, + "token_acc": 0.28819099846184715 + }, + { + "epoch": 3.6596306068601585, + "grad_norm": 0.2658606701628042, + "learning_rate": 0.00047621666619183624, + "loss": 3.108912229537964, + "step": 6243, + "token_acc": 0.2866268014539078 + }, + { + "epoch": 3.660216945177367, + "grad_norm": 0.20165901670828362, + "learning_rate": 0.0004762063504458673, + "loss": 3.088257312774658, + "step": 6244, + "token_acc": 0.28996089647055845 + }, + { + "epoch": 3.6608032834945763, + "grad_norm": 0.21964186599107574, + "learning_rate": 0.0004761960325749946, + "loss": 3.076192855834961, + "step": 6245, + "token_acc": 0.2898234789520489 + }, + { + "epoch": 3.6613896218117854, + "grad_norm": 0.1983458800912109, + "learning_rate": 0.00047618571257931495, + "loss": 3.0533924102783203, + "step": 6246, + "token_acc": 0.29369114925855117 + }, + { + "epoch": 3.6619759601289945, + "grad_norm": 0.2233203065309482, + "learning_rate": 0.00047617539045892535, + "loss": 3.121384620666504, + "step": 6247, + "token_acc": 0.28413885590704774 + }, + { + "epoch": 3.6625622984462036, + "grad_norm": 0.2615998448871885, + "learning_rate": 0.00047616506621392266, + "loss": 3.098362445831299, + "step": 6248, + "token_acc": 0.28746183604919645 + }, + { + "epoch": 3.6631486367634123, + "grad_norm": 0.21387160312354464, + "learning_rate": 0.00047615473984440396, + "loss": 3.1184983253479004, + "step": 6249, + "token_acc": 0.2831478582172352 + }, + { + "epoch": 3.6637349750806214, + "grad_norm": 0.22303363105330418, + "learning_rate": 0.0004761444113504663, + "loss": 3.048077344894409, + "step": 6250, + "token_acc": 0.2935597682991048 + }, + { + "epoch": 3.6643213133978305, + "grad_norm": 0.18200892173872896, + "learning_rate": 0.00047613408073220653, + "loss": 3.102721691131592, + "step": 6251, + "token_acc": 0.28736738284373836 + }, + { + "epoch": 3.6649076517150396, + "grad_norm": 0.24624543516368888, + "learning_rate": 0.0004761237479897218, + "loss": 3.066155433654785, + "step": 6252, + "token_acc": 0.29176031562144644 + }, + { + "epoch": 3.6654939900322487, + "grad_norm": 0.2536493957241849, + "learning_rate": 0.0004761134131231092, + "loss": 3.112647294998169, + "step": 6253, + "token_acc": 0.2854767511089418 + }, + { + "epoch": 3.666080328349458, + "grad_norm": 0.21241022032794418, + "learning_rate": 0.00047610307613246575, + "loss": 3.0763943195343018, + "step": 6254, + "token_acc": 0.29107534151082864 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.23362248433459845, + "learning_rate": 0.00047609273701788857, + "loss": 3.0946872234344482, + "step": 6255, + "token_acc": 0.2873993673765168 + }, + { + "epoch": 3.6672530049838756, + "grad_norm": 0.21881371002325514, + "learning_rate": 0.0004760823957794748, + "loss": 3.0392847061157227, + "step": 6256, + "token_acc": 0.2959683198785911 + }, + { + "epoch": 3.6678393433010847, + "grad_norm": 0.1899075179973683, + "learning_rate": 0.00047607205241732164, + "loss": 3.080958843231201, + "step": 6257, + "token_acc": 0.2889517285327386 + }, + { + "epoch": 3.668425681618294, + "grad_norm": 0.20643395051812038, + "learning_rate": 0.0004760617069315261, + "loss": 3.106217384338379, + "step": 6258, + "token_acc": 0.2872088770303944 + }, + { + "epoch": 3.6690120199355025, + "grad_norm": 0.20507092832618257, + "learning_rate": 0.00047605135932218546, + "loss": 3.0806360244750977, + "step": 6259, + "token_acc": 0.2894196357856811 + }, + { + "epoch": 3.6695983582527116, + "grad_norm": 0.17100317733580359, + "learning_rate": 0.0004760410095893969, + "loss": 3.0370893478393555, + "step": 6260, + "token_acc": 0.29525727775034194 + }, + { + "epoch": 3.6701846965699207, + "grad_norm": 0.20698216792346336, + "learning_rate": 0.00047603065773325773, + "loss": 3.1035714149475098, + "step": 6261, + "token_acc": 0.2866073456134352 + }, + { + "epoch": 3.67077103488713, + "grad_norm": 0.20582652474136504, + "learning_rate": 0.000476020303753865, + "loss": 3.119476556777954, + "step": 6262, + "token_acc": 0.28628544935508743 + }, + { + "epoch": 3.671357373204339, + "grad_norm": 0.22798684487631307, + "learning_rate": 0.0004760099476513161, + "loss": 3.105592966079712, + "step": 6263, + "token_acc": 0.28439875332929865 + }, + { + "epoch": 3.671943711521548, + "grad_norm": 0.20290258734572972, + "learning_rate": 0.0004759995894257084, + "loss": 3.038015365600586, + "step": 6264, + "token_acc": 0.29646191177561376 + }, + { + "epoch": 3.672530049838757, + "grad_norm": 0.2443659943765557, + "learning_rate": 0.000475989229077139, + "loss": 3.0509090423583984, + "step": 6265, + "token_acc": 0.29322042655484765 + }, + { + "epoch": 3.673116388155966, + "grad_norm": 0.24525846602370088, + "learning_rate": 0.00047597886660570533, + "loss": 3.0995731353759766, + "step": 6266, + "token_acc": 0.2867245708012726 + }, + { + "epoch": 3.673702726473175, + "grad_norm": 0.19504477506266588, + "learning_rate": 0.0004759685020115047, + "loss": 3.1039326190948486, + "step": 6267, + "token_acc": 0.2870681362617757 + }, + { + "epoch": 3.674289064790384, + "grad_norm": 0.21086440987066887, + "learning_rate": 0.0004759581352946345, + "loss": 3.068416118621826, + "step": 6268, + "token_acc": 0.2908765454228721 + }, + { + "epoch": 3.674875403107593, + "grad_norm": 0.21902279984431536, + "learning_rate": 0.0004759477664551921, + "loss": 3.0437686443328857, + "step": 6269, + "token_acc": 0.29653908933917555 + }, + { + "epoch": 3.675461741424802, + "grad_norm": 0.18404177002741884, + "learning_rate": 0.00047593739549327496, + "loss": 3.0717930793762207, + "step": 6270, + "token_acc": 0.2906671721777623 + }, + { + "epoch": 3.676048079742011, + "grad_norm": 0.18090798508197553, + "learning_rate": 0.00047592702240898045, + "loss": 3.117943286895752, + "step": 6271, + "token_acc": 0.2842980246654179 + }, + { + "epoch": 3.67663441805922, + "grad_norm": 0.23377700145256902, + "learning_rate": 0.00047591664720240593, + "loss": 3.101257562637329, + "step": 6272, + "token_acc": 0.2874908117189961 + }, + { + "epoch": 3.677220756376429, + "grad_norm": 0.22691808097231375, + "learning_rate": 0.00047590626987364893, + "loss": 3.1025478839874268, + "step": 6273, + "token_acc": 0.28647035745073807 + }, + { + "epoch": 3.6778070946936383, + "grad_norm": 0.22916477012381561, + "learning_rate": 0.000475895890422807, + "loss": 3.062260627746582, + "step": 6274, + "token_acc": 0.2907589368035616 + }, + { + "epoch": 3.6783934330108474, + "grad_norm": 0.23994674069313526, + "learning_rate": 0.0004758855088499775, + "loss": 3.0703911781311035, + "step": 6275, + "token_acc": 0.2906898722535551 + }, + { + "epoch": 3.6789797713280565, + "grad_norm": 0.2379897405007281, + "learning_rate": 0.0004758751251552581, + "loss": 3.0789647102355957, + "step": 6276, + "token_acc": 0.28963149570008484 + }, + { + "epoch": 3.679566109645265, + "grad_norm": 0.19730749848966164, + "learning_rate": 0.0004758647393387463, + "loss": 3.0743894577026367, + "step": 6277, + "token_acc": 0.29058828674251874 + }, + { + "epoch": 3.6801524479624743, + "grad_norm": 0.22335253272557967, + "learning_rate": 0.0004758543514005396, + "loss": 3.0588574409484863, + "step": 6278, + "token_acc": 0.29139148378609203 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.23368121401051467, + "learning_rate": 0.0004758439613407355, + "loss": 3.053405523300171, + "step": 6279, + "token_acc": 0.29412319544396603 + }, + { + "epoch": 3.6813251245968925, + "grad_norm": 0.22202402098337143, + "learning_rate": 0.0004758335691594318, + "loss": 3.0849609375, + "step": 6280, + "token_acc": 0.2893810896258389 + }, + { + "epoch": 3.681911462914101, + "grad_norm": 0.20135257202952184, + "learning_rate": 0.0004758231748567261, + "loss": 3.0706231594085693, + "step": 6281, + "token_acc": 0.29125868961961293 + }, + { + "epoch": 3.6824978012313103, + "grad_norm": 0.21289311571550834, + "learning_rate": 0.00047581277843271585, + "loss": 3.054109573364258, + "step": 6282, + "token_acc": 0.29351748529243354 + }, + { + "epoch": 3.6830841395485194, + "grad_norm": 0.2573553424762857, + "learning_rate": 0.0004758023798874989, + "loss": 3.0890324115753174, + "step": 6283, + "token_acc": 0.28887107913372434 + }, + { + "epoch": 3.6836704778657285, + "grad_norm": 0.2283806847761128, + "learning_rate": 0.0004757919792211729, + "loss": 3.1533498764038086, + "step": 6284, + "token_acc": 0.27820880661347586 + }, + { + "epoch": 3.6842568161829377, + "grad_norm": 0.20540577972612378, + "learning_rate": 0.00047578157643383544, + "loss": 3.075993537902832, + "step": 6285, + "token_acc": 0.2906262835035839 + }, + { + "epoch": 3.6848431545001468, + "grad_norm": 0.22375276760162746, + "learning_rate": 0.00047577117152558437, + "loss": 3.0647780895233154, + "step": 6286, + "token_acc": 0.2925636988979915 + }, + { + "epoch": 3.685429492817356, + "grad_norm": 0.21415046372869917, + "learning_rate": 0.0004757607644965173, + "loss": 3.0991854667663574, + "step": 6287, + "token_acc": 0.2854994673232099 + }, + { + "epoch": 3.6860158311345645, + "grad_norm": 0.21465970960962477, + "learning_rate": 0.0004757503553467321, + "loss": 3.0805742740631104, + "step": 6288, + "token_acc": 0.2893275300614402 + }, + { + "epoch": 3.6866021694517737, + "grad_norm": 0.21714624925360393, + "learning_rate": 0.00047573994407632653, + "loss": 3.0888969898223877, + "step": 6289, + "token_acc": 0.2888452482192804 + }, + { + "epoch": 3.6871885077689828, + "grad_norm": 0.34475094524051425, + "learning_rate": 0.0004757295306853984, + "loss": 3.130868434906006, + "step": 6290, + "token_acc": 0.28242955288108407 + }, + { + "epoch": 3.687774846086192, + "grad_norm": 0.32778502380039076, + "learning_rate": 0.0004757191151740454, + "loss": 3.0563433170318604, + "step": 6291, + "token_acc": 0.292867452051083 + }, + { + "epoch": 3.6883611844034006, + "grad_norm": 0.23288229488820736, + "learning_rate": 0.0004757086975423656, + "loss": 3.0520567893981934, + "step": 6292, + "token_acc": 0.2950680778847275 + }, + { + "epoch": 3.6889475227206097, + "grad_norm": 0.2983543937554026, + "learning_rate": 0.00047569827779045664, + "loss": 3.070394992828369, + "step": 6293, + "token_acc": 0.2911868819395137 + }, + { + "epoch": 3.6895338610378188, + "grad_norm": 0.34117789944213417, + "learning_rate": 0.0004756878559184166, + "loss": 3.057326316833496, + "step": 6294, + "token_acc": 0.29340623332872967 + }, + { + "epoch": 3.690120199355028, + "grad_norm": 0.23148547060191585, + "learning_rate": 0.00047567743192634314, + "loss": 3.100137233734131, + "step": 6295, + "token_acc": 0.287123777614479 + }, + { + "epoch": 3.690706537672237, + "grad_norm": 0.28121143603649723, + "learning_rate": 0.0004756670058143344, + "loss": 3.065199136734009, + "step": 6296, + "token_acc": 0.29168741445610374 + }, + { + "epoch": 3.691292875989446, + "grad_norm": 0.22718486945526592, + "learning_rate": 0.00047565657758248816, + "loss": 3.0887603759765625, + "step": 6297, + "token_acc": 0.2873286165611868 + }, + { + "epoch": 3.6918792143066548, + "grad_norm": 0.2506051681794216, + "learning_rate": 0.00047564614723090247, + "loss": 3.0363707542419434, + "step": 6298, + "token_acc": 0.2942847230133496 + }, + { + "epoch": 3.692465552623864, + "grad_norm": 0.21521561727648317, + "learning_rate": 0.00047563571475967535, + "loss": 3.105945110321045, + "step": 6299, + "token_acc": 0.28972472547412337 + }, + { + "epoch": 3.693051890941073, + "grad_norm": 0.21361156627561806, + "learning_rate": 0.0004756252801689047, + "loss": 3.061645984649658, + "step": 6300, + "token_acc": 0.29263675220190993 + }, + { + "epoch": 3.693638229258282, + "grad_norm": 0.2069030764472745, + "learning_rate": 0.00047561484345868854, + "loss": 3.0578322410583496, + "step": 6301, + "token_acc": 0.29353489200884036 + }, + { + "epoch": 3.6942245675754912, + "grad_norm": 0.20670603616125363, + "learning_rate": 0.00047560440462912504, + "loss": 3.099064826965332, + "step": 6302, + "token_acc": 0.28871987818757244 + }, + { + "epoch": 3.6948109058927, + "grad_norm": 0.21841739846028618, + "learning_rate": 0.00047559396368031205, + "loss": 3.078824281692505, + "step": 6303, + "token_acc": 0.2903975589354999 + }, + { + "epoch": 3.695397244209909, + "grad_norm": 0.19856069888976535, + "learning_rate": 0.00047558352061234785, + "loss": 3.140575885772705, + "step": 6304, + "token_acc": 0.2827258443598966 + }, + { + "epoch": 3.695983582527118, + "grad_norm": 0.20165550675336996, + "learning_rate": 0.0004755730754253304, + "loss": 3.0617616176605225, + "step": 6305, + "token_acc": 0.29107232207647954 + }, + { + "epoch": 3.6965699208443272, + "grad_norm": 0.20947075345087968, + "learning_rate": 0.0004755626281193579, + "loss": 3.060178279876709, + "step": 6306, + "token_acc": 0.2916798309447722 + }, + { + "epoch": 3.6971562591615363, + "grad_norm": 0.19413655033577182, + "learning_rate": 0.00047555217869452843, + "loss": 3.1074752807617188, + "step": 6307, + "token_acc": 0.2858410441624947 + }, + { + "epoch": 3.6977425974787455, + "grad_norm": 0.20651578958433153, + "learning_rate": 0.0004755417271509402, + "loss": 3.1317760944366455, + "step": 6308, + "token_acc": 0.2822121026941752 + }, + { + "epoch": 3.698328935795954, + "grad_norm": 0.1851037953116679, + "learning_rate": 0.0004755312734886914, + "loss": 3.0649638175964355, + "step": 6309, + "token_acc": 0.29192491363543993 + }, + { + "epoch": 3.6989152741131632, + "grad_norm": 0.21068140323826287, + "learning_rate": 0.00047552081770788014, + "loss": 3.0676498413085938, + "step": 6310, + "token_acc": 0.29133634355906485 + }, + { + "epoch": 3.6995016124303723, + "grad_norm": 0.22738852852724492, + "learning_rate": 0.0004755103598086047, + "loss": 3.07861590385437, + "step": 6311, + "token_acc": 0.29074357232592235 + }, + { + "epoch": 3.7000879507475815, + "grad_norm": 0.17778649446636097, + "learning_rate": 0.0004754998997909633, + "loss": 3.0292582511901855, + "step": 6312, + "token_acc": 0.2948299685770333 + }, + { + "epoch": 3.70067428906479, + "grad_norm": 0.2146787775697283, + "learning_rate": 0.0004754894376550542, + "loss": 3.072861433029175, + "step": 6313, + "token_acc": 0.2900795934975487 + }, + { + "epoch": 3.7012606273819992, + "grad_norm": 0.2664391773091337, + "learning_rate": 0.0004754789734009757, + "loss": 3.096806764602661, + "step": 6314, + "token_acc": 0.2867016180820937 + }, + { + "epoch": 3.7018469656992083, + "grad_norm": 0.25718888422964237, + "learning_rate": 0.0004754685070288261, + "loss": 3.047502040863037, + "step": 6315, + "token_acc": 0.2929279636146373 + }, + { + "epoch": 3.7024333040164175, + "grad_norm": 0.20189826627025353, + "learning_rate": 0.0004754580385387037, + "loss": 3.112776279449463, + "step": 6316, + "token_acc": 0.28554291602372517 + }, + { + "epoch": 3.7030196423336266, + "grad_norm": 0.22236618456791737, + "learning_rate": 0.00047544756793070674, + "loss": 3.084738254547119, + "step": 6317, + "token_acc": 0.28979864578309356 + }, + { + "epoch": 3.7036059806508357, + "grad_norm": 0.2724813211451575, + "learning_rate": 0.00047543709520493383, + "loss": 3.090397834777832, + "step": 6318, + "token_acc": 0.28971236408241335 + }, + { + "epoch": 3.704192318968045, + "grad_norm": 0.2551393406451765, + "learning_rate": 0.0004754266203614831, + "loss": 3.0775043964385986, + "step": 6319, + "token_acc": 0.2916023043233322 + }, + { + "epoch": 3.7047786572852535, + "grad_norm": 0.22539730988524811, + "learning_rate": 0.000475416143400453, + "loss": 3.0729165077209473, + "step": 6320, + "token_acc": 0.29112627801915136 + }, + { + "epoch": 3.7053649956024626, + "grad_norm": 0.28555473781419266, + "learning_rate": 0.0004754056643219421, + "loss": 3.109708309173584, + "step": 6321, + "token_acc": 0.28676166073065096 + }, + { + "epoch": 3.7059513339196717, + "grad_norm": 0.1966779139163532, + "learning_rate": 0.00047539518312604865, + "loss": 3.096301555633545, + "step": 6322, + "token_acc": 0.2870080579072935 + }, + { + "epoch": 3.706537672236881, + "grad_norm": 0.2932845820030136, + "learning_rate": 0.00047538469981287115, + "loss": 3.041316509246826, + "step": 6323, + "token_acc": 0.2964478087561542 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.28659920298395736, + "learning_rate": 0.00047537421438250814, + "loss": 3.0733954906463623, + "step": 6324, + "token_acc": 0.29084740434116246 + }, + { + "epoch": 3.7077103488712986, + "grad_norm": 0.22756561576942325, + "learning_rate": 0.00047536372683505815, + "loss": 3.0740504264831543, + "step": 6325, + "token_acc": 0.2897523999370508 + }, + { + "epoch": 3.7082966871885077, + "grad_norm": 0.3631635284043102, + "learning_rate": 0.0004753532371706195, + "loss": 3.0869903564453125, + "step": 6326, + "token_acc": 0.2899041188030017 + }, + { + "epoch": 3.708883025505717, + "grad_norm": 0.22212766138999201, + "learning_rate": 0.0004753427453892909, + "loss": 3.0890285968780518, + "step": 6327, + "token_acc": 0.28805227409690076 + }, + { + "epoch": 3.709469363822926, + "grad_norm": 0.238277483012778, + "learning_rate": 0.0004753322514911709, + "loss": 3.1144447326660156, + "step": 6328, + "token_acc": 0.2848909809402926 + }, + { + "epoch": 3.710055702140135, + "grad_norm": 0.21968159059481188, + "learning_rate": 0.000475321755476358, + "loss": 3.0980639457702637, + "step": 6329, + "token_acc": 0.28712004781420764 + }, + { + "epoch": 3.710642040457344, + "grad_norm": 0.21928506862340844, + "learning_rate": 0.00047531125734495087, + "loss": 3.0754799842834473, + "step": 6330, + "token_acc": 0.2914571151375305 + }, + { + "epoch": 3.711228378774553, + "grad_norm": 0.19981257691269322, + "learning_rate": 0.000475300757097048, + "loss": 3.059030532836914, + "step": 6331, + "token_acc": 0.29247898094667985 + }, + { + "epoch": 3.711814717091762, + "grad_norm": 0.21537483668662177, + "learning_rate": 0.00047529025473274825, + "loss": 3.0732533931732178, + "step": 6332, + "token_acc": 0.2909539894569835 + }, + { + "epoch": 3.712401055408971, + "grad_norm": 0.1978371814252081, + "learning_rate": 0.00047527975025214996, + "loss": 3.1028800010681152, + "step": 6333, + "token_acc": 0.285731436460772 + }, + { + "epoch": 3.71298739372618, + "grad_norm": 0.23446813288548501, + "learning_rate": 0.0004752692436553522, + "loss": 3.0716171264648438, + "step": 6334, + "token_acc": 0.290743820872282 + }, + { + "epoch": 3.713573732043389, + "grad_norm": 0.2336705396491335, + "learning_rate": 0.0004752587349424533, + "loss": 3.0415682792663574, + "step": 6335, + "token_acc": 0.2934679407564221 + }, + { + "epoch": 3.714160070360598, + "grad_norm": 0.2182677270496905, + "learning_rate": 0.00047524822411355215, + "loss": 3.0925354957580566, + "step": 6336, + "token_acc": 0.28842309001191085 + }, + { + "epoch": 3.714746408677807, + "grad_norm": 0.22765144769728646, + "learning_rate": 0.00047523771116874747, + "loss": 3.054443359375, + "step": 6337, + "token_acc": 0.2943773349450056 + }, + { + "epoch": 3.715332746995016, + "grad_norm": 0.19958336277078395, + "learning_rate": 0.000475227196108138, + "loss": 3.1000285148620605, + "step": 6338, + "token_acc": 0.2850800267310533 + }, + { + "epoch": 3.7159190853122253, + "grad_norm": 0.22876066559832609, + "learning_rate": 0.00047521667893182253, + "loss": 3.144841432571411, + "step": 6339, + "token_acc": 0.28191783512241003 + }, + { + "epoch": 3.7165054236294344, + "grad_norm": 0.22160308200784715, + "learning_rate": 0.00047520615963989987, + "loss": 3.0947482585906982, + "step": 6340, + "token_acc": 0.2872743775180933 + }, + { + "epoch": 3.7170917619466435, + "grad_norm": 0.19663828098623984, + "learning_rate": 0.00047519563823246875, + "loss": 3.066298007965088, + "step": 6341, + "token_acc": 0.292177660782492 + }, + { + "epoch": 3.717678100263852, + "grad_norm": 0.20398305968807984, + "learning_rate": 0.0004751851147096281, + "loss": 3.0842764377593994, + "step": 6342, + "token_acc": 0.29026055841805126 + }, + { + "epoch": 3.7182644385810613, + "grad_norm": 0.1896376488946728, + "learning_rate": 0.0004751745890714767, + "loss": 3.115405797958374, + "step": 6343, + "token_acc": 0.2853894346444416 + }, + { + "epoch": 3.7188507768982704, + "grad_norm": 0.23692828993406395, + "learning_rate": 0.00047516406131811354, + "loss": 3.128922939300537, + "step": 6344, + "token_acc": 0.2841535617649423 + }, + { + "epoch": 3.7194371152154795, + "grad_norm": 0.22687820994633073, + "learning_rate": 0.00047515353144963736, + "loss": 3.0918853282928467, + "step": 6345, + "token_acc": 0.28732471748766936 + }, + { + "epoch": 3.720023453532688, + "grad_norm": 0.20137883907480006, + "learning_rate": 0.00047514299946614717, + "loss": 3.0686206817626953, + "step": 6346, + "token_acc": 0.29174380634906505 + }, + { + "epoch": 3.7206097918498973, + "grad_norm": 0.17200122758836944, + "learning_rate": 0.0004751324653677419, + "loss": 3.0734782218933105, + "step": 6347, + "token_acc": 0.29060088224669683 + }, + { + "epoch": 3.7211961301671064, + "grad_norm": 0.19134925356395815, + "learning_rate": 0.00047512192915452053, + "loss": 3.045711040496826, + "step": 6348, + "token_acc": 0.29364700620928313 + }, + { + "epoch": 3.7217824684843155, + "grad_norm": 0.18758235668477113, + "learning_rate": 0.00047511139082658194, + "loss": 3.0867111682891846, + "step": 6349, + "token_acc": 0.2886962329338933 + }, + { + "epoch": 3.7223688068015246, + "grad_norm": 0.20260956495466897, + "learning_rate": 0.0004751008503840252, + "loss": 3.1001393795013428, + "step": 6350, + "token_acc": 0.28853601460642375 + }, + { + "epoch": 3.7229551451187337, + "grad_norm": 0.22310748227684707, + "learning_rate": 0.00047509030782694927, + "loss": 3.0518672466278076, + "step": 6351, + "token_acc": 0.29270070522449254 + }, + { + "epoch": 3.7235414834359424, + "grad_norm": 0.23294080017392524, + "learning_rate": 0.0004750797631554532, + "loss": 3.143362522125244, + "step": 6352, + "token_acc": 0.2823193673710059 + }, + { + "epoch": 3.7241278217531515, + "grad_norm": 0.23264506010627564, + "learning_rate": 0.0004750692163696361, + "loss": 3.128628730773926, + "step": 6353, + "token_acc": 0.28241240821765917 + }, + { + "epoch": 3.7247141600703606, + "grad_norm": 0.24198188370330811, + "learning_rate": 0.0004750586674695969, + "loss": 3.1108243465423584, + "step": 6354, + "token_acc": 0.28610885752820137 + }, + { + "epoch": 3.7253004983875697, + "grad_norm": 0.3137847620825557, + "learning_rate": 0.00047504811645543486, + "loss": 3.104112148284912, + "step": 6355, + "token_acc": 0.28617102487784507 + }, + { + "epoch": 3.7258868367047784, + "grad_norm": 0.3303062147803285, + "learning_rate": 0.0004750375633272491, + "loss": 3.0763165950775146, + "step": 6356, + "token_acc": 0.2890334387123154 + }, + { + "epoch": 3.7264731750219875, + "grad_norm": 0.19696538558008614, + "learning_rate": 0.00047502700808513856, + "loss": 3.057522773742676, + "step": 6357, + "token_acc": 0.2935974087356371 + }, + { + "epoch": 3.7270595133391966, + "grad_norm": 0.2997035596768689, + "learning_rate": 0.00047501645072920253, + "loss": 3.1123204231262207, + "step": 6358, + "token_acc": 0.28498380553175073 + }, + { + "epoch": 3.7276458516564057, + "grad_norm": 0.22792941995398375, + "learning_rate": 0.0004750058912595402, + "loss": 3.084667921066284, + "step": 6359, + "token_acc": 0.28957525942776235 + }, + { + "epoch": 3.728232189973615, + "grad_norm": 0.23187680012319573, + "learning_rate": 0.0004749953296762507, + "loss": 3.1029927730560303, + "step": 6360, + "token_acc": 0.2879277559101331 + }, + { + "epoch": 3.728818528290824, + "grad_norm": 0.2599375000107801, + "learning_rate": 0.00047498476597943317, + "loss": 3.0861122608184814, + "step": 6361, + "token_acc": 0.2884346557283376 + }, + { + "epoch": 3.729404866608033, + "grad_norm": 0.2452764840818546, + "learning_rate": 0.0004749742001691871, + "loss": 3.088015556335449, + "step": 6362, + "token_acc": 0.2898343796018472 + }, + { + "epoch": 3.7299912049252417, + "grad_norm": 0.2363777536075646, + "learning_rate": 0.00047496363224561145, + "loss": 3.0809226036071777, + "step": 6363, + "token_acc": 0.29010584140605494 + }, + { + "epoch": 3.730577543242451, + "grad_norm": 0.2224000507144588, + "learning_rate": 0.00047495306220880567, + "loss": 3.1168951988220215, + "step": 6364, + "token_acc": 0.28508976310083367 + }, + { + "epoch": 3.73116388155966, + "grad_norm": 0.27244797013263033, + "learning_rate": 0.00047494249005886903, + "loss": 3.0753300189971924, + "step": 6365, + "token_acc": 0.29095698475619913 + }, + { + "epoch": 3.731750219876869, + "grad_norm": 0.20947797041117322, + "learning_rate": 0.00047493191579590075, + "loss": 3.1048426628112793, + "step": 6366, + "token_acc": 0.286545908423578 + }, + { + "epoch": 3.7323365581940777, + "grad_norm": 0.2449995968578766, + "learning_rate": 0.00047492133942000025, + "loss": 3.071350574493408, + "step": 6367, + "token_acc": 0.2913103653466583 + }, + { + "epoch": 3.732922896511287, + "grad_norm": 0.2326212799842961, + "learning_rate": 0.0004749107609312668, + "loss": 3.084408760070801, + "step": 6368, + "token_acc": 0.2890459751505434 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.20377827842068108, + "learning_rate": 0.00047490018032979986, + "loss": 3.0904946327209473, + "step": 6369, + "token_acc": 0.28881516489866677 + }, + { + "epoch": 3.734095573145705, + "grad_norm": 0.22610124446801713, + "learning_rate": 0.00047488959761569883, + "loss": 3.143413543701172, + "step": 6370, + "token_acc": 0.28060643422111237 + }, + { + "epoch": 3.734681911462914, + "grad_norm": 0.19149977516521693, + "learning_rate": 0.000474879012789063, + "loss": 3.045419692993164, + "step": 6371, + "token_acc": 0.29487237466913185 + }, + { + "epoch": 3.7352682497801233, + "grad_norm": 0.2156397829702423, + "learning_rate": 0.0004748684258499919, + "loss": 3.081057071685791, + "step": 6372, + "token_acc": 0.29122793211090997 + }, + { + "epoch": 3.7358545880973324, + "grad_norm": 0.22736878276091305, + "learning_rate": 0.000474857836798585, + "loss": 3.024683952331543, + "step": 6373, + "token_acc": 0.29650535617738893 + }, + { + "epoch": 3.736440926414541, + "grad_norm": 0.22701789248966978, + "learning_rate": 0.0004748472456349416, + "loss": 3.0468409061431885, + "step": 6374, + "token_acc": 0.29384342277907843 + }, + { + "epoch": 3.73702726473175, + "grad_norm": 0.20546274159111727, + "learning_rate": 0.0004748366523591614, + "loss": 3.07374906539917, + "step": 6375, + "token_acc": 0.29122983431366767 + }, + { + "epoch": 3.7376136030489593, + "grad_norm": 0.1922325029554844, + "learning_rate": 0.00047482605697134385, + "loss": 3.0682168006896973, + "step": 6376, + "token_acc": 0.2902951561839889 + }, + { + "epoch": 3.7381999413661684, + "grad_norm": 0.20137358024658844, + "learning_rate": 0.00047481545947158845, + "loss": 3.137650728225708, + "step": 6377, + "token_acc": 0.28087518020026014 + }, + { + "epoch": 3.738786279683377, + "grad_norm": 0.2294557176990818, + "learning_rate": 0.0004748048598599947, + "loss": 3.0798189640045166, + "step": 6378, + "token_acc": 0.29042616146846445 + }, + { + "epoch": 3.739372618000586, + "grad_norm": 0.2118823668288472, + "learning_rate": 0.00047479425813666223, + "loss": 3.0516233444213867, + "step": 6379, + "token_acc": 0.293272564883892 + }, + { + "epoch": 3.7399589563177953, + "grad_norm": 0.1998996152742327, + "learning_rate": 0.0004747836543016906, + "loss": 3.0955705642700195, + "step": 6380, + "token_acc": 0.2899631948059422 + }, + { + "epoch": 3.7405452946350044, + "grad_norm": 0.23288761918033363, + "learning_rate": 0.00047477304835517946, + "loss": 3.0891377925872803, + "step": 6381, + "token_acc": 0.28923892420951386 + }, + { + "epoch": 3.7411316329522135, + "grad_norm": 0.25974493784458447, + "learning_rate": 0.00047476244029722846, + "loss": 3.1461148262023926, + "step": 6382, + "token_acc": 0.28120567470610075 + }, + { + "epoch": 3.7417179712694226, + "grad_norm": 0.21724643186351109, + "learning_rate": 0.0004747518301279372, + "loss": 3.0851516723632812, + "step": 6383, + "token_acc": 0.2884287441462655 + }, + { + "epoch": 3.7423043095866317, + "grad_norm": 0.21108497092543624, + "learning_rate": 0.0004747412178474053, + "loss": 3.036337375640869, + "step": 6384, + "token_acc": 0.2965106584404198 + }, + { + "epoch": 3.7428906479038404, + "grad_norm": 0.25328917846710813, + "learning_rate": 0.0004747306034557325, + "loss": 3.0986037254333496, + "step": 6385, + "token_acc": 0.28756795303747495 + }, + { + "epoch": 3.7434769862210495, + "grad_norm": 0.25887677769530903, + "learning_rate": 0.00047471998695301857, + "loss": 3.1008682250976562, + "step": 6386, + "token_acc": 0.28516363961659996 + }, + { + "epoch": 3.7440633245382586, + "grad_norm": 0.25135231797596536, + "learning_rate": 0.00047470936833936305, + "loss": 3.062150478363037, + "step": 6387, + "token_acc": 0.2920213879710537 + }, + { + "epoch": 3.7446496628554677, + "grad_norm": 0.19991561481775152, + "learning_rate": 0.0004746987476148659, + "loss": 3.102640390396118, + "step": 6388, + "token_acc": 0.2881951906620025 + }, + { + "epoch": 3.7452360011726764, + "grad_norm": 0.2545281507767261, + "learning_rate": 0.00047468812477962686, + "loss": 3.0894246101379395, + "step": 6389, + "token_acc": 0.28666467764814263 + }, + { + "epoch": 3.7458223394898855, + "grad_norm": 0.27288966421379557, + "learning_rate": 0.00047467749983374553, + "loss": 3.07578706741333, + "step": 6390, + "token_acc": 0.28974354353271936 + }, + { + "epoch": 3.7464086778070946, + "grad_norm": 0.19786237631646636, + "learning_rate": 0.00047466687277732193, + "loss": 3.0883891582489014, + "step": 6391, + "token_acc": 0.28828307144554977 + }, + { + "epoch": 3.7469950161243037, + "grad_norm": 0.21565778977889355, + "learning_rate": 0.00047465624361045576, + "loss": 3.1305129528045654, + "step": 6392, + "token_acc": 0.2834539327954463 + }, + { + "epoch": 3.747581354441513, + "grad_norm": 0.24551120635081713, + "learning_rate": 0.000474645612333247, + "loss": 3.0904908180236816, + "step": 6393, + "token_acc": 0.29071121303822717 + }, + { + "epoch": 3.748167692758722, + "grad_norm": 0.19047104739032295, + "learning_rate": 0.00047463497894579534, + "loss": 3.0701749324798584, + "step": 6394, + "token_acc": 0.29226095654428924 + }, + { + "epoch": 3.748754031075931, + "grad_norm": 0.22056156495431772, + "learning_rate": 0.00047462434344820075, + "loss": 3.0653555393218994, + "step": 6395, + "token_acc": 0.29207595403684433 + }, + { + "epoch": 3.7493403693931397, + "grad_norm": 0.25326340255258445, + "learning_rate": 0.00047461370584056317, + "loss": 3.082517623901367, + "step": 6396, + "token_acc": 0.2895502774512471 + }, + { + "epoch": 3.749926707710349, + "grad_norm": 0.19065836356036706, + "learning_rate": 0.0004746030661229825, + "loss": 3.0944652557373047, + "step": 6397, + "token_acc": 0.28873820361199726 + }, + { + "epoch": 3.750513046027558, + "grad_norm": 0.2523395435345909, + "learning_rate": 0.0004745924242955587, + "loss": 3.109856605529785, + "step": 6398, + "token_acc": 0.28643605561569263 + }, + { + "epoch": 3.751099384344767, + "grad_norm": 0.2832290168847538, + "learning_rate": 0.00047458178035839164, + "loss": 3.085594654083252, + "step": 6399, + "token_acc": 0.2886416117619643 + }, + { + "epoch": 3.7516857226619758, + "grad_norm": 0.18623456571495328, + "learning_rate": 0.00047457113431158146, + "loss": 3.051520586013794, + "step": 6400, + "token_acc": 0.29476584022038566 + }, + { + "epoch": 3.752272060979185, + "grad_norm": 0.2528214702796898, + "learning_rate": 0.000474560486155228, + "loss": 3.0712080001831055, + "step": 6401, + "token_acc": 0.2902262736038465 + }, + { + "epoch": 3.752858399296394, + "grad_norm": 0.21497797853552386, + "learning_rate": 0.00047454983588943146, + "loss": 3.09531307220459, + "step": 6402, + "token_acc": 0.2902139357248528 + }, + { + "epoch": 3.753444737613603, + "grad_norm": 0.21596729792076638, + "learning_rate": 0.0004745391835142917, + "loss": 3.0975565910339355, + "step": 6403, + "token_acc": 0.2857240535956135 + }, + { + "epoch": 3.754031075930812, + "grad_norm": 0.21625966222461265, + "learning_rate": 0.00047452852902990896, + "loss": 3.069629192352295, + "step": 6404, + "token_acc": 0.29146853071533585 + }, + { + "epoch": 3.7546174142480213, + "grad_norm": 0.19659421613237166, + "learning_rate": 0.0004745178724363832, + "loss": 3.067108392715454, + "step": 6405, + "token_acc": 0.29222581770882905 + }, + { + "epoch": 3.75520375256523, + "grad_norm": 0.25160829218458997, + "learning_rate": 0.00047450721373381465, + "loss": 3.0912842750549316, + "step": 6406, + "token_acc": 0.28888556191695536 + }, + { + "epoch": 3.755790090882439, + "grad_norm": 0.18700214518917274, + "learning_rate": 0.0004744965529223033, + "loss": 3.0798068046569824, + "step": 6407, + "token_acc": 0.2893945584645525 + }, + { + "epoch": 3.756376429199648, + "grad_norm": 0.24660516983618272, + "learning_rate": 0.00047448589000194933, + "loss": 3.0610501766204834, + "step": 6408, + "token_acc": 0.2923797729398758 + }, + { + "epoch": 3.7569627675168573, + "grad_norm": 0.22821101279186334, + "learning_rate": 0.00047447522497285293, + "loss": 3.068467378616333, + "step": 6409, + "token_acc": 0.2908708526031471 + }, + { + "epoch": 3.757549105834066, + "grad_norm": 0.21073411015822335, + "learning_rate": 0.0004744645578351143, + "loss": 3.0901501178741455, + "step": 6410, + "token_acc": 0.2897971497306761 + }, + { + "epoch": 3.758135444151275, + "grad_norm": 0.2386400768893923, + "learning_rate": 0.00047445388858883365, + "loss": 3.0981240272521973, + "step": 6411, + "token_acc": 0.28805684893728917 + }, + { + "epoch": 3.758721782468484, + "grad_norm": 0.18918978538703182, + "learning_rate": 0.0004744432172341111, + "loss": 3.0546884536743164, + "step": 6412, + "token_acc": 0.2939091992608808 + }, + { + "epoch": 3.7593081207856933, + "grad_norm": 0.22136922806183532, + "learning_rate": 0.00047443254377104696, + "loss": 3.0653867721557617, + "step": 6413, + "token_acc": 0.2926235496743938 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.23143306661745558, + "learning_rate": 0.00047442186819974153, + "loss": 3.071683406829834, + "step": 6414, + "token_acc": 0.29083673474597255 + }, + { + "epoch": 3.7604807974201115, + "grad_norm": 0.19948785699427196, + "learning_rate": 0.00047441119052029506, + "loss": 3.1045453548431396, + "step": 6415, + "token_acc": 0.28719680603790093 + }, + { + "epoch": 3.7610671357373207, + "grad_norm": 0.22361873721942832, + "learning_rate": 0.00047440051073280786, + "loss": 3.0922703742980957, + "step": 6416, + "token_acc": 0.2884903137288043 + }, + { + "epoch": 3.7616534740545293, + "grad_norm": 0.25970467791759433, + "learning_rate": 0.00047438982883738027, + "loss": 3.0833253860473633, + "step": 6417, + "token_acc": 0.28823003101547645 + }, + { + "epoch": 3.7622398123717384, + "grad_norm": 0.2119238599325054, + "learning_rate": 0.00047437914483411256, + "loss": 3.1086997985839844, + "step": 6418, + "token_acc": 0.28556019394819404 + }, + { + "epoch": 3.7628261506889475, + "grad_norm": 0.22027045724409705, + "learning_rate": 0.00047436845872310515, + "loss": 3.0741004943847656, + "step": 6419, + "token_acc": 0.28981572669198075 + }, + { + "epoch": 3.7634124890061567, + "grad_norm": 0.3317358814104254, + "learning_rate": 0.00047435777050445837, + "loss": 3.130650281906128, + "step": 6420, + "token_acc": 0.28348512022221 + }, + { + "epoch": 3.7639988273233653, + "grad_norm": 0.30222877529653575, + "learning_rate": 0.0004743470801782728, + "loss": 3.079010248184204, + "step": 6421, + "token_acc": 0.29010901143420664 + }, + { + "epoch": 3.7645851656405744, + "grad_norm": 0.19628348017431524, + "learning_rate": 0.0004743363877446486, + "loss": 3.09476900100708, + "step": 6422, + "token_acc": 0.2883022733332805 + }, + { + "epoch": 3.7651715039577835, + "grad_norm": 0.26057818981682146, + "learning_rate": 0.00047432569320368634, + "loss": 3.0595431327819824, + "step": 6423, + "token_acc": 0.29281757611582604 + }, + { + "epoch": 3.7657578422749927, + "grad_norm": 0.17855140797385025, + "learning_rate": 0.0004743149965554865, + "loss": 3.045809268951416, + "step": 6424, + "token_acc": 0.29422164249697486 + }, + { + "epoch": 3.7663441805922018, + "grad_norm": 0.2415857122525715, + "learning_rate": 0.0004743042978001495, + "loss": 3.0594403743743896, + "step": 6425, + "token_acc": 0.29193847765309344 + }, + { + "epoch": 3.766930518909411, + "grad_norm": 0.18545875682024152, + "learning_rate": 0.00047429359693777594, + "loss": 3.1017227172851562, + "step": 6426, + "token_acc": 0.28646010519980114 + }, + { + "epoch": 3.76751685722662, + "grad_norm": 0.24179820240991265, + "learning_rate": 0.0004742828939684662, + "loss": 3.0467870235443115, + "step": 6427, + "token_acc": 0.29418214248242575 + }, + { + "epoch": 3.7681031955438287, + "grad_norm": 0.24844337877755568, + "learning_rate": 0.000474272188892321, + "loss": 3.058894634246826, + "step": 6428, + "token_acc": 0.29369662228731896 + }, + { + "epoch": 3.7686895338610378, + "grad_norm": 0.19880144880719225, + "learning_rate": 0.00047426148170944075, + "loss": 3.068110704421997, + "step": 6429, + "token_acc": 0.29187325720622626 + }, + { + "epoch": 3.769275872178247, + "grad_norm": 0.2244749915506914, + "learning_rate": 0.0004742507724199261, + "loss": 3.1019911766052246, + "step": 6430, + "token_acc": 0.2860349432984469 + }, + { + "epoch": 3.769862210495456, + "grad_norm": 0.2127816384180002, + "learning_rate": 0.00047424006102387753, + "loss": 3.074827194213867, + "step": 6431, + "token_acc": 0.29164547686620623 + }, + { + "epoch": 3.7704485488126647, + "grad_norm": 0.2502440320856774, + "learning_rate": 0.0004742293475213958, + "loss": 3.106818199157715, + "step": 6432, + "token_acc": 0.2859799743688592 + }, + { + "epoch": 3.771034887129874, + "grad_norm": 0.21142667348637867, + "learning_rate": 0.0004742186319125815, + "loss": 3.056962251663208, + "step": 6433, + "token_acc": 0.2914921739867505 + }, + { + "epoch": 3.771621225447083, + "grad_norm": 0.23724458027285325, + "learning_rate": 0.00047420791419753536, + "loss": 3.102792739868164, + "step": 6434, + "token_acc": 0.2866748665457308 + }, + { + "epoch": 3.772207563764292, + "grad_norm": 0.21776103294564458, + "learning_rate": 0.0004741971943763579, + "loss": 3.0702145099639893, + "step": 6435, + "token_acc": 0.29200041749518946 + }, + { + "epoch": 3.772793902081501, + "grad_norm": 0.2079951551446765, + "learning_rate": 0.00047418647244915, + "loss": 3.0796360969543457, + "step": 6436, + "token_acc": 0.2909337385895468 + }, + { + "epoch": 3.7733802403987102, + "grad_norm": 0.22327151160181286, + "learning_rate": 0.0004741757484160122, + "loss": 3.109640598297119, + "step": 6437, + "token_acc": 0.2856802150598958 + }, + { + "epoch": 3.7739665787159193, + "grad_norm": 0.21142913617489068, + "learning_rate": 0.0004741650222770454, + "loss": 3.0803279876708984, + "step": 6438, + "token_acc": 0.2881597604752542 + }, + { + "epoch": 3.774552917033128, + "grad_norm": 0.21314802950040948, + "learning_rate": 0.00047415429403235024, + "loss": 3.0818324089050293, + "step": 6439, + "token_acc": 0.2883151509414614 + }, + { + "epoch": 3.775139255350337, + "grad_norm": 0.20301888300831222, + "learning_rate": 0.0004741435636820275, + "loss": 3.088108539581299, + "step": 6440, + "token_acc": 0.28967277778366857 + }, + { + "epoch": 3.7757255936675462, + "grad_norm": 0.2032300205702744, + "learning_rate": 0.000474132831226178, + "loss": 3.0700416564941406, + "step": 6441, + "token_acc": 0.2909704760338088 + }, + { + "epoch": 3.7763119319847553, + "grad_norm": 0.22832430446296118, + "learning_rate": 0.0004741220966649027, + "loss": 3.0908679962158203, + "step": 6442, + "token_acc": 0.28638404355181585 + }, + { + "epoch": 3.776898270301964, + "grad_norm": 0.2042237975450884, + "learning_rate": 0.00047411135999830226, + "loss": 3.100489377975464, + "step": 6443, + "token_acc": 0.2858693717743371 + }, + { + "epoch": 3.777484608619173, + "grad_norm": 0.2039097274087466, + "learning_rate": 0.0004741006212264775, + "loss": 3.081758975982666, + "step": 6444, + "token_acc": 0.2884134408417561 + }, + { + "epoch": 3.7780709469363822, + "grad_norm": 0.23049677035636082, + "learning_rate": 0.0004740898803495295, + "loss": 3.1115474700927734, + "step": 6445, + "token_acc": 0.28419950706278324 + }, + { + "epoch": 3.7786572852535913, + "grad_norm": 0.24178161173432677, + "learning_rate": 0.0004740791373675589, + "loss": 3.0616371631622314, + "step": 6446, + "token_acc": 0.29035810233875503 + }, + { + "epoch": 3.7792436235708005, + "grad_norm": 0.22665243486901956, + "learning_rate": 0.0004740683922806669, + "loss": 3.1025235652923584, + "step": 6447, + "token_acc": 0.2864875354687115 + }, + { + "epoch": 3.7798299618880096, + "grad_norm": 0.20542575778739877, + "learning_rate": 0.0004740576450889542, + "loss": 3.0961828231811523, + "step": 6448, + "token_acc": 0.2876179284788418 + }, + { + "epoch": 3.7804163002052187, + "grad_norm": 0.19107041311984885, + "learning_rate": 0.0004740468957925219, + "loss": 3.1155200004577637, + "step": 6449, + "token_acc": 0.28589409907411356 + }, + { + "epoch": 3.7810026385224274, + "grad_norm": 0.2247804287160029, + "learning_rate": 0.0004740361443914709, + "loss": 3.0904228687286377, + "step": 6450, + "token_acc": 0.28790667654722346 + }, + { + "epoch": 3.7815889768396365, + "grad_norm": 0.23899753968672394, + "learning_rate": 0.00047402539088590225, + "loss": 3.044929027557373, + "step": 6451, + "token_acc": 0.29441679674038634 + }, + { + "epoch": 3.7821753151568456, + "grad_norm": 0.2408823728109118, + "learning_rate": 0.00047401463527591687, + "loss": 3.0716428756713867, + "step": 6452, + "token_acc": 0.2925693322861847 + }, + { + "epoch": 3.7827616534740547, + "grad_norm": 0.20254956700910243, + "learning_rate": 0.0004740038775616159, + "loss": 3.0462749004364014, + "step": 6453, + "token_acc": 0.29393332596452176 + }, + { + "epoch": 3.7833479917912634, + "grad_norm": 0.2471268919872209, + "learning_rate": 0.0004739931177431003, + "loss": 3.0643296241760254, + "step": 6454, + "token_acc": 0.2912173373391048 + }, + { + "epoch": 3.7839343301084725, + "grad_norm": 0.21312562596555237, + "learning_rate": 0.00047398235582047125, + "loss": 3.134032726287842, + "step": 6455, + "token_acc": 0.2825558718935436 + }, + { + "epoch": 3.7845206684256816, + "grad_norm": 0.21554699290830556, + "learning_rate": 0.00047397159179382977, + "loss": 3.068225622177124, + "step": 6456, + "token_acc": 0.2912419383107729 + }, + { + "epoch": 3.7851070067428907, + "grad_norm": 0.1895343276454945, + "learning_rate": 0.0004739608256632769, + "loss": 3.0592827796936035, + "step": 6457, + "token_acc": 0.29140623544844724 + }, + { + "epoch": 3.7856933450601, + "grad_norm": 0.21787192590093057, + "learning_rate": 0.00047395005742891395, + "loss": 3.063180685043335, + "step": 6458, + "token_acc": 0.291065568369028 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.20634922715191942, + "learning_rate": 0.000473939287090842, + "loss": 3.0723299980163574, + "step": 6459, + "token_acc": 0.2908520807645281 + }, + { + "epoch": 3.7868660216945176, + "grad_norm": 0.19582456614616287, + "learning_rate": 0.0004739285146491622, + "loss": 3.0688743591308594, + "step": 6460, + "token_acc": 0.2917738851857593 + }, + { + "epoch": 3.7874523600117267, + "grad_norm": 0.21620687349128492, + "learning_rate": 0.00047391774010397574, + "loss": 3.0666770935058594, + "step": 6461, + "token_acc": 0.29123699326161606 + }, + { + "epoch": 3.788038698328936, + "grad_norm": 0.20895968063006662, + "learning_rate": 0.00047390696345538385, + "loss": 3.082728385925293, + "step": 6462, + "token_acc": 0.2874165174108898 + }, + { + "epoch": 3.788625036646145, + "grad_norm": 0.19742636998017737, + "learning_rate": 0.00047389618470348777, + "loss": 3.1156840324401855, + "step": 6463, + "token_acc": 0.2852937095573859 + }, + { + "epoch": 3.7892113749633536, + "grad_norm": 0.19149777652974273, + "learning_rate": 0.00047388540384838877, + "loss": 3.0431203842163086, + "step": 6464, + "token_acc": 0.29476156438272233 + }, + { + "epoch": 3.7897977132805627, + "grad_norm": 0.20829704014445763, + "learning_rate": 0.000473874620890188, + "loss": 3.1093311309814453, + "step": 6465, + "token_acc": 0.28736350480199974 + }, + { + "epoch": 3.790384051597772, + "grad_norm": 0.19347030684133373, + "learning_rate": 0.00047386383582898685, + "loss": 3.098940372467041, + "step": 6466, + "token_acc": 0.28861558588538233 + }, + { + "epoch": 3.790970389914981, + "grad_norm": 0.18723717596538453, + "learning_rate": 0.0004738530486648867, + "loss": 3.0609235763549805, + "step": 6467, + "token_acc": 0.29272230436849833 + }, + { + "epoch": 3.79155672823219, + "grad_norm": 0.2127756737010464, + "learning_rate": 0.00047384225939798875, + "loss": 3.038447856903076, + "step": 6468, + "token_acc": 0.2954359274429491 + }, + { + "epoch": 3.792143066549399, + "grad_norm": 0.19724976284962384, + "learning_rate": 0.0004738314680283944, + "loss": 3.0177078247070312, + "step": 6469, + "token_acc": 0.29762839975371025 + }, + { + "epoch": 3.7927294048666083, + "grad_norm": 0.2296480241058165, + "learning_rate": 0.000473820674556205, + "loss": 3.0890488624572754, + "step": 6470, + "token_acc": 0.29041833476589823 + }, + { + "epoch": 3.793315743183817, + "grad_norm": 0.3002770702722004, + "learning_rate": 0.00047380987898152207, + "loss": 3.0694825649261475, + "step": 6471, + "token_acc": 0.292006917090979 + }, + { + "epoch": 3.793902081501026, + "grad_norm": 0.5011007192051179, + "learning_rate": 0.0004737990813044468, + "loss": 3.0852646827697754, + "step": 6472, + "token_acc": 0.28961570149224625 + }, + { + "epoch": 3.794488419818235, + "grad_norm": 0.44467004824027656, + "learning_rate": 0.0004737882815250808, + "loss": 3.063803195953369, + "step": 6473, + "token_acc": 0.2918106986294631 + }, + { + "epoch": 3.7950747581354443, + "grad_norm": 0.269909424964999, + "learning_rate": 0.00047377747964352546, + "loss": 3.090294599533081, + "step": 6474, + "token_acc": 0.28807092329080797 + }, + { + "epoch": 3.795661096452653, + "grad_norm": 0.27585784792092855, + "learning_rate": 0.0004737666756598822, + "loss": 3.07590389251709, + "step": 6475, + "token_acc": 0.2892968926960855 + }, + { + "epoch": 3.796247434769862, + "grad_norm": 0.2599556050868869, + "learning_rate": 0.0004737558695742526, + "loss": 3.1060409545898438, + "step": 6476, + "token_acc": 0.28603567659399215 + }, + { + "epoch": 3.796833773087071, + "grad_norm": 0.21055743592283394, + "learning_rate": 0.0004737450613867381, + "loss": 3.086583137512207, + "step": 6477, + "token_acc": 0.2880370040182436 + }, + { + "epoch": 3.7974201114042803, + "grad_norm": 0.23334809332973078, + "learning_rate": 0.0004737342510974402, + "loss": 3.067042350769043, + "step": 6478, + "token_acc": 0.2917798184339272 + }, + { + "epoch": 3.7980064497214894, + "grad_norm": 0.20076907788305, + "learning_rate": 0.00047372343870646054, + "loss": 3.078303098678589, + "step": 6479, + "token_acc": 0.2908017994046746 + }, + { + "epoch": 3.7985927880386985, + "grad_norm": 0.19508491214327245, + "learning_rate": 0.00047371262421390067, + "loss": 3.0965092182159424, + "step": 6480, + "token_acc": 0.2875405176082633 + }, + { + "epoch": 3.7991791263559076, + "grad_norm": 0.2377572860038879, + "learning_rate": 0.00047370180761986214, + "loss": 3.0716793537139893, + "step": 6481, + "token_acc": 0.2893204686708239 + }, + { + "epoch": 3.7997654646731163, + "grad_norm": 0.20704379405681425, + "learning_rate": 0.0004736909889244465, + "loss": 3.0547895431518555, + "step": 6482, + "token_acc": 0.29292929292929293 + }, + { + "epoch": 3.8003518029903254, + "grad_norm": 0.21962113699705885, + "learning_rate": 0.0004736801681277555, + "loss": 3.093174934387207, + "step": 6483, + "token_acc": 0.28729403437020723 + }, + { + "epoch": 3.8009381413075345, + "grad_norm": 0.20243848394263678, + "learning_rate": 0.00047366934522989076, + "loss": 3.061690330505371, + "step": 6484, + "token_acc": 0.29224018216349784 + }, + { + "epoch": 3.8015244796247436, + "grad_norm": 0.21849963634397454, + "learning_rate": 0.0004736585202309539, + "loss": 3.118579387664795, + "step": 6485, + "token_acc": 0.2847922819960533 + }, + { + "epoch": 3.8021108179419523, + "grad_norm": 0.2230702448403142, + "learning_rate": 0.0004736476931310466, + "loss": 3.1301307678222656, + "step": 6486, + "token_acc": 0.2817265838260558 + }, + { + "epoch": 3.8026971562591614, + "grad_norm": 0.22013017880315414, + "learning_rate": 0.00047363686393027063, + "loss": 3.0648999214172363, + "step": 6487, + "token_acc": 0.2918241609691424 + }, + { + "epoch": 3.8032834945763705, + "grad_norm": 0.23059748764978694, + "learning_rate": 0.0004736260326287276, + "loss": 3.096896171569824, + "step": 6488, + "token_acc": 0.2884981139252544 + }, + { + "epoch": 3.8038698328935796, + "grad_norm": 0.19967932274814715, + "learning_rate": 0.00047361519922651943, + "loss": 3.072824239730835, + "step": 6489, + "token_acc": 0.29092197609964804 + }, + { + "epoch": 3.8044561712107887, + "grad_norm": 0.2074536231331756, + "learning_rate": 0.00047360436372374776, + "loss": 3.0848183631896973, + "step": 6490, + "token_acc": 0.2897681732866636 + }, + { + "epoch": 3.805042509527998, + "grad_norm": 0.2328604757541851, + "learning_rate": 0.0004735935261205144, + "loss": 3.0796430110931396, + "step": 6491, + "token_acc": 0.28943674055124863 + }, + { + "epoch": 3.805628847845207, + "grad_norm": 0.2645724881621736, + "learning_rate": 0.00047358268641692114, + "loss": 3.0948429107666016, + "step": 6492, + "token_acc": 0.2874423172367184 + }, + { + "epoch": 3.8062151861624156, + "grad_norm": 0.2283438658322231, + "learning_rate": 0.00047357184461306986, + "loss": 3.1003668308258057, + "step": 6493, + "token_acc": 0.28553309471686916 + }, + { + "epoch": 3.8068015244796247, + "grad_norm": 0.21773807805972234, + "learning_rate": 0.0004735610007090623, + "loss": 3.0615758895874023, + "step": 6494, + "token_acc": 0.2921427667917528 + }, + { + "epoch": 3.807387862796834, + "grad_norm": 0.21669628908162353, + "learning_rate": 0.0004735501547050005, + "loss": 3.0590713024139404, + "step": 6495, + "token_acc": 0.29293807839086244 + }, + { + "epoch": 3.807974201114043, + "grad_norm": 0.21885551846769633, + "learning_rate": 0.0004735393066009861, + "loss": 3.0455322265625, + "step": 6496, + "token_acc": 0.2948628939083981 + }, + { + "epoch": 3.8085605394312516, + "grad_norm": 0.23225083455986617, + "learning_rate": 0.00047352845639712124, + "loss": 3.1016952991485596, + "step": 6497, + "token_acc": 0.286676443703354 + }, + { + "epoch": 3.8091468777484607, + "grad_norm": 0.1831727015065005, + "learning_rate": 0.0004735176040935077, + "loss": 3.0895891189575195, + "step": 6498, + "token_acc": 0.2894352227497856 + }, + { + "epoch": 3.80973321606567, + "grad_norm": 0.22051646500312877, + "learning_rate": 0.00047350674969024744, + "loss": 3.062586784362793, + "step": 6499, + "token_acc": 0.2925988560595452 + }, + { + "epoch": 3.810319554382879, + "grad_norm": 0.18062216756519453, + "learning_rate": 0.00047349589318744246, + "loss": 3.084185838699341, + "step": 6500, + "token_acc": 0.2876098957268452 + }, + { + "epoch": 3.810905892700088, + "grad_norm": 0.2258568872461433, + "learning_rate": 0.0004734850345851948, + "loss": 3.1014063358306885, + "step": 6501, + "token_acc": 0.2865528095907843 + }, + { + "epoch": 3.811492231017297, + "grad_norm": 0.19692213567732791, + "learning_rate": 0.00047347417388360623, + "loss": 3.0633296966552734, + "step": 6502, + "token_acc": 0.291835774289748 + }, + { + "epoch": 3.8120785693345063, + "grad_norm": 0.23134858135010314, + "learning_rate": 0.0004734633110827791, + "loss": 3.0930023193359375, + "step": 6503, + "token_acc": 0.2886753879670835 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.19789121786995423, + "learning_rate": 0.00047345244618281527, + "loss": 3.0659589767456055, + "step": 6504, + "token_acc": 0.2923300924082196 + }, + { + "epoch": 3.813251245968924, + "grad_norm": 0.23491367590369172, + "learning_rate": 0.0004734415791838167, + "loss": 3.092395544052124, + "step": 6505, + "token_acc": 0.289600235628029 + }, + { + "epoch": 3.813837584286133, + "grad_norm": 0.21326482938713123, + "learning_rate": 0.00047343071008588565, + "loss": 3.1044740676879883, + "step": 6506, + "token_acc": 0.2871264075861485 + }, + { + "epoch": 3.8144239226033423, + "grad_norm": 0.18654998183816066, + "learning_rate": 0.0004734198388891241, + "loss": 3.039532423019409, + "step": 6507, + "token_acc": 0.2936162147004144 + }, + { + "epoch": 3.815010260920551, + "grad_norm": 0.19397634738348812, + "learning_rate": 0.0004734089655936343, + "loss": 3.059903144836426, + "step": 6508, + "token_acc": 0.2942210043560794 + }, + { + "epoch": 3.81559659923776, + "grad_norm": 0.19844929419940502, + "learning_rate": 0.0004733980901995183, + "loss": 3.107282876968384, + "step": 6509, + "token_acc": 0.28612871093950565 + }, + { + "epoch": 3.816182937554969, + "grad_norm": 0.18815098749156828, + "learning_rate": 0.00047338721270687823, + "loss": 3.0847420692443848, + "step": 6510, + "token_acc": 0.29008106625490154 + }, + { + "epoch": 3.8167692758721783, + "grad_norm": 0.19994557916635425, + "learning_rate": 0.0004733763331158164, + "loss": 3.082430839538574, + "step": 6511, + "token_acc": 0.28864543489106936 + }, + { + "epoch": 3.8173556141893874, + "grad_norm": 0.22114834635248928, + "learning_rate": 0.0004733654514264348, + "loss": 3.0741260051727295, + "step": 6512, + "token_acc": 0.2899823313458219 + }, + { + "epoch": 3.8179419525065965, + "grad_norm": 0.21245523930134028, + "learning_rate": 0.0004733545676388359, + "loss": 3.074143886566162, + "step": 6513, + "token_acc": 0.28932991414873455 + }, + { + "epoch": 3.818528290823805, + "grad_norm": 0.20524472016876247, + "learning_rate": 0.0004733436817531218, + "loss": 3.069697856903076, + "step": 6514, + "token_acc": 0.2906101816455664 + }, + { + "epoch": 3.8191146291410143, + "grad_norm": 0.2047996736642631, + "learning_rate": 0.0004733327937693947, + "loss": 3.0679783821105957, + "step": 6515, + "token_acc": 0.29301700080067783 + }, + { + "epoch": 3.8197009674582234, + "grad_norm": 0.20203877463515277, + "learning_rate": 0.00047332190368775697, + "loss": 3.036182403564453, + "step": 6516, + "token_acc": 0.2948909535875945 + }, + { + "epoch": 3.8202873057754325, + "grad_norm": 0.21285683850501752, + "learning_rate": 0.00047331101150831093, + "loss": 3.114842176437378, + "step": 6517, + "token_acc": 0.28343772679342233 + }, + { + "epoch": 3.820873644092641, + "grad_norm": 0.2033666317722894, + "learning_rate": 0.0004733001172311587, + "loss": 3.068603515625, + "step": 6518, + "token_acc": 0.2915196086052936 + }, + { + "epoch": 3.8214599824098503, + "grad_norm": 0.19465744217443753, + "learning_rate": 0.00047328922085640294, + "loss": 3.059149980545044, + "step": 6519, + "token_acc": 0.28956177841132774 + }, + { + "epoch": 3.8220463207270594, + "grad_norm": 0.21655080022368167, + "learning_rate": 0.0004732783223841458, + "loss": 3.090820074081421, + "step": 6520, + "token_acc": 0.28815267119341453 + }, + { + "epoch": 3.8226326590442685, + "grad_norm": 0.2320489651383197, + "learning_rate": 0.0004732674218144897, + "loss": 3.0795536041259766, + "step": 6521, + "token_acc": 0.2905472324790747 + }, + { + "epoch": 3.8232189973614776, + "grad_norm": 0.23110586104112646, + "learning_rate": 0.000473256519147537, + "loss": 3.0489437580108643, + "step": 6522, + "token_acc": 0.2935650760405758 + }, + { + "epoch": 3.8238053356786867, + "grad_norm": 0.20862953240589457, + "learning_rate": 0.0004732456143833901, + "loss": 3.104154586791992, + "step": 6523, + "token_acc": 0.28494704464568493 + }, + { + "epoch": 3.824391673995896, + "grad_norm": 0.23405414922468074, + "learning_rate": 0.00047323470752215155, + "loss": 3.0823616981506348, + "step": 6524, + "token_acc": 0.2902636870545099 + }, + { + "epoch": 3.8249780123131045, + "grad_norm": 0.25539323628937666, + "learning_rate": 0.00047322379856392375, + "loss": 3.1174960136413574, + "step": 6525, + "token_acc": 0.2827509546476958 + }, + { + "epoch": 3.8255643506303136, + "grad_norm": 0.2671591046573203, + "learning_rate": 0.0004732128875088091, + "loss": 3.0943658351898193, + "step": 6526, + "token_acc": 0.28748294733293656 + }, + { + "epoch": 3.8261506889475227, + "grad_norm": 0.22001468083453776, + "learning_rate": 0.0004732019743569101, + "loss": 3.075218915939331, + "step": 6527, + "token_acc": 0.2895261355719754 + }, + { + "epoch": 3.826737027264732, + "grad_norm": 0.2478147625609848, + "learning_rate": 0.00047319105910832937, + "loss": 3.14341402053833, + "step": 6528, + "token_acc": 0.28149194657579246 + }, + { + "epoch": 3.8273233655819405, + "grad_norm": 0.2301154088190999, + "learning_rate": 0.0004731801417631695, + "loss": 3.0601937770843506, + "step": 6529, + "token_acc": 0.29323342986792117 + }, + { + "epoch": 3.8279097038991496, + "grad_norm": 0.2034744061773359, + "learning_rate": 0.00047316922232153283, + "loss": 3.0827488899230957, + "step": 6530, + "token_acc": 0.28918183774701356 + }, + { + "epoch": 3.8284960422163588, + "grad_norm": 0.22890530251673363, + "learning_rate": 0.00047315830078352206, + "loss": 3.0690417289733887, + "step": 6531, + "token_acc": 0.28920489635623686 + }, + { + "epoch": 3.829082380533568, + "grad_norm": 0.2996797010799379, + "learning_rate": 0.0004731473771492397, + "loss": 3.116313934326172, + "step": 6532, + "token_acc": 0.28445849818789787 + }, + { + "epoch": 3.829668718850777, + "grad_norm": 0.230619664187333, + "learning_rate": 0.00047313645141878856, + "loss": 3.0436506271362305, + "step": 6533, + "token_acc": 0.2950313994900217 + }, + { + "epoch": 3.830255057167986, + "grad_norm": 0.21000301867424817, + "learning_rate": 0.00047312552359227107, + "loss": 3.0266683101654053, + "step": 6534, + "token_acc": 0.2978534411167993 + }, + { + "epoch": 3.830841395485195, + "grad_norm": 0.24053419385368047, + "learning_rate": 0.00047311459366978993, + "loss": 3.120404005050659, + "step": 6535, + "token_acc": 0.2826958726316137 + }, + { + "epoch": 3.831427733802404, + "grad_norm": 0.19532967110024352, + "learning_rate": 0.00047310366165144793, + "loss": 3.077125072479248, + "step": 6536, + "token_acc": 0.29160118781699207 + }, + { + "epoch": 3.832014072119613, + "grad_norm": 0.2793489616461439, + "learning_rate": 0.0004730927275373476, + "loss": 3.0455527305603027, + "step": 6537, + "token_acc": 0.2933173607324871 + }, + { + "epoch": 3.832600410436822, + "grad_norm": 0.2370049855899673, + "learning_rate": 0.00047308179132759165, + "loss": 3.047926187515259, + "step": 6538, + "token_acc": 0.2907201490515255 + }, + { + "epoch": 3.833186748754031, + "grad_norm": 0.22055934054741122, + "learning_rate": 0.00047307085302228293, + "loss": 3.1643500328063965, + "step": 6539, + "token_acc": 0.27806982179605755 + }, + { + "epoch": 3.83377308707124, + "grad_norm": 0.23909417091680155, + "learning_rate": 0.00047305991262152415, + "loss": 3.111429214477539, + "step": 6540, + "token_acc": 0.284384459095244 + }, + { + "epoch": 3.834359425388449, + "grad_norm": 0.18388801127496174, + "learning_rate": 0.00047304897012541804, + "loss": 3.0450832843780518, + "step": 6541, + "token_acc": 0.29290060131726775 + }, + { + "epoch": 3.834945763705658, + "grad_norm": 0.2773612622918213, + "learning_rate": 0.00047303802553406743, + "loss": 3.096778392791748, + "step": 6542, + "token_acc": 0.285249784338647 + }, + { + "epoch": 3.835532102022867, + "grad_norm": 0.25169660268409194, + "learning_rate": 0.0004730270788475751, + "loss": 3.1223127841949463, + "step": 6543, + "token_acc": 0.2845925330767889 + }, + { + "epoch": 3.8361184403400763, + "grad_norm": 0.20499225958037362, + "learning_rate": 0.0004730161300660439, + "loss": 3.1041648387908936, + "step": 6544, + "token_acc": 0.28591700819672133 + }, + { + "epoch": 3.8367047786572854, + "grad_norm": 0.25350700847426316, + "learning_rate": 0.0004730051791895767, + "loss": 3.098144054412842, + "step": 6545, + "token_acc": 0.28842203970941216 + }, + { + "epoch": 3.8372911169744945, + "grad_norm": 0.17981883159819664, + "learning_rate": 0.00047299422621827644, + "loss": 3.081315040588379, + "step": 6546, + "token_acc": 0.2910331669295521 + }, + { + "epoch": 3.837877455291703, + "grad_norm": 0.23043088958405447, + "learning_rate": 0.00047298327115224585, + "loss": 3.1111674308776855, + "step": 6547, + "token_acc": 0.2841437827356066 + }, + { + "epoch": 3.8384637936089123, + "grad_norm": 0.18195702762820298, + "learning_rate": 0.0004729723139915878, + "loss": 3.1058735847473145, + "step": 6548, + "token_acc": 0.28672605005772656 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.1961388587724037, + "learning_rate": 0.0004729613547364054, + "loss": 3.093325614929199, + "step": 6549, + "token_acc": 0.28827461053815384 + }, + { + "epoch": 3.8396364702433305, + "grad_norm": 0.205869630926981, + "learning_rate": 0.0004729503933868015, + "loss": 3.0579991340637207, + "step": 6550, + "token_acc": 0.2918355589925108 + }, + { + "epoch": 3.840222808560539, + "grad_norm": 0.19026519566255432, + "learning_rate": 0.00047293942994287906, + "loss": 3.1072189807891846, + "step": 6551, + "token_acc": 0.2878851114020194 + }, + { + "epoch": 3.8408091468777483, + "grad_norm": 0.19030965677443198, + "learning_rate": 0.0004729284644047411, + "loss": 3.0612363815307617, + "step": 6552, + "token_acc": 0.29345670679705116 + }, + { + "epoch": 3.8413954851949574, + "grad_norm": 0.19645996335555976, + "learning_rate": 0.0004729174967724907, + "loss": 3.098538875579834, + "step": 6553, + "token_acc": 0.2879619081617118 + }, + { + "epoch": 3.8419818235121665, + "grad_norm": 0.2046378485035818, + "learning_rate": 0.0004729065270462307, + "loss": 3.106729507446289, + "step": 6554, + "token_acc": 0.2858357212042941 + }, + { + "epoch": 3.8425681618293757, + "grad_norm": 0.2205577951262936, + "learning_rate": 0.0004728955552260643, + "loss": 3.081890106201172, + "step": 6555, + "token_acc": 0.2882507317254391 + }, + { + "epoch": 3.8431545001465848, + "grad_norm": 0.21198051209583782, + "learning_rate": 0.0004728845813120945, + "loss": 3.060849189758301, + "step": 6556, + "token_acc": 0.2910310688464795 + }, + { + "epoch": 3.843740838463794, + "grad_norm": 0.226368656236823, + "learning_rate": 0.00047287360530442443, + "loss": 3.038071632385254, + "step": 6557, + "token_acc": 0.29528084337575805 + }, + { + "epoch": 3.8443271767810026, + "grad_norm": 0.18747209968932652, + "learning_rate": 0.0004728626272031571, + "loss": 3.047152280807495, + "step": 6558, + "token_acc": 0.294955650313103 + }, + { + "epoch": 3.8449135150982117, + "grad_norm": 0.19978446971679106, + "learning_rate": 0.0004728516470083958, + "loss": 3.114880323410034, + "step": 6559, + "token_acc": 0.28567120211325064 + }, + { + "epoch": 3.8454998534154208, + "grad_norm": 0.22774678731462242, + "learning_rate": 0.00047284066472024345, + "loss": 3.097235679626465, + "step": 6560, + "token_acc": 0.28715421389053014 + }, + { + "epoch": 3.84608619173263, + "grad_norm": 0.26506366283926763, + "learning_rate": 0.0004728296803388034, + "loss": 3.068356513977051, + "step": 6561, + "token_acc": 0.29219619318808676 + }, + { + "epoch": 3.8466725300498386, + "grad_norm": 0.23603790353083118, + "learning_rate": 0.00047281869386417875, + "loss": 3.095968723297119, + "step": 6562, + "token_acc": 0.2864278909287535 + }, + { + "epoch": 3.8472588683670477, + "grad_norm": 0.18312453819402238, + "learning_rate": 0.00047280770529647276, + "loss": 3.0661392211914062, + "step": 6563, + "token_acc": 0.2942461014983282 + }, + { + "epoch": 3.847845206684257, + "grad_norm": 0.23315685128152697, + "learning_rate": 0.0004727967146357885, + "loss": 3.050487518310547, + "step": 6564, + "token_acc": 0.29299231371153256 + }, + { + "epoch": 3.848431545001466, + "grad_norm": 0.20862164346485312, + "learning_rate": 0.0004727857218822295, + "loss": 3.0750577449798584, + "step": 6565, + "token_acc": 0.2884535644825305 + }, + { + "epoch": 3.849017883318675, + "grad_norm": 0.20626026952355725, + "learning_rate": 0.00047277472703589874, + "loss": 3.107205629348755, + "step": 6566, + "token_acc": 0.2853362428407178 + }, + { + "epoch": 3.849604221635884, + "grad_norm": 0.26122708225945274, + "learning_rate": 0.00047276373009689967, + "loss": 3.0458974838256836, + "step": 6567, + "token_acc": 0.29335063570616826 + }, + { + "epoch": 3.850190559953093, + "grad_norm": 0.2638426466898938, + "learning_rate": 0.0004727527310653355, + "loss": 3.071479558944702, + "step": 6568, + "token_acc": 0.2896536302012049 + }, + { + "epoch": 3.850776898270302, + "grad_norm": 0.19358775029135447, + "learning_rate": 0.0004727417299413096, + "loss": 3.1301212310791016, + "step": 6569, + "token_acc": 0.2827535450973177 + }, + { + "epoch": 3.851363236587511, + "grad_norm": 0.30818062047642736, + "learning_rate": 0.0004727307267249253, + "loss": 3.040440559387207, + "step": 6570, + "token_acc": 0.29502927665121365 + }, + { + "epoch": 3.85194957490472, + "grad_norm": 0.2805472515363853, + "learning_rate": 0.00047271972141628595, + "loss": 3.0688462257385254, + "step": 6571, + "token_acc": 0.29137527361743837 + }, + { + "epoch": 3.852535913221929, + "grad_norm": 0.2248302058377536, + "learning_rate": 0.00047270871401549486, + "loss": 3.1054718494415283, + "step": 6572, + "token_acc": 0.2837317915121461 + }, + { + "epoch": 3.853122251539138, + "grad_norm": 0.282010960062093, + "learning_rate": 0.0004726977045226556, + "loss": 3.0875115394592285, + "step": 6573, + "token_acc": 0.2897645362806343 + }, + { + "epoch": 3.853708589856347, + "grad_norm": 0.20278909014136015, + "learning_rate": 0.00047268669293787146, + "loss": 3.0533828735351562, + "step": 6574, + "token_acc": 0.293884980223319 + }, + { + "epoch": 3.854294928173556, + "grad_norm": 0.2322292968067153, + "learning_rate": 0.0004726756792612459, + "loss": 3.092543840408325, + "step": 6575, + "token_acc": 0.2870271025499754 + }, + { + "epoch": 3.8548812664907652, + "grad_norm": 0.20052266194500518, + "learning_rate": 0.00047266466349288246, + "loss": 3.095717430114746, + "step": 6576, + "token_acc": 0.2881497202818433 + }, + { + "epoch": 3.8554676048079743, + "grad_norm": 0.27372754082444634, + "learning_rate": 0.00047265364563288447, + "loss": 3.0760579109191895, + "step": 6577, + "token_acc": 0.29151160048800995 + }, + { + "epoch": 3.8560539431251835, + "grad_norm": 0.22984783695492186, + "learning_rate": 0.00047264262568135553, + "loss": 3.063521385192871, + "step": 6578, + "token_acc": 0.29177435929374396 + }, + { + "epoch": 3.856640281442392, + "grad_norm": 0.20161924254011016, + "learning_rate": 0.0004726316036383992, + "loss": 3.0870859622955322, + "step": 6579, + "token_acc": 0.2887328250770495 + }, + { + "epoch": 3.8572266197596012, + "grad_norm": 0.2632783487658834, + "learning_rate": 0.00047262057950411883, + "loss": 3.059124231338501, + "step": 6580, + "token_acc": 0.2925985187656033 + }, + { + "epoch": 3.8578129580768104, + "grad_norm": 0.22564075988844529, + "learning_rate": 0.0004726095532786182, + "loss": 3.086447238922119, + "step": 6581, + "token_acc": 0.2890474520488818 + }, + { + "epoch": 3.8583992963940195, + "grad_norm": 0.26638932945937466, + "learning_rate": 0.0004725985249620008, + "loss": 3.0860652923583984, + "step": 6582, + "token_acc": 0.2888075649398318 + }, + { + "epoch": 3.858985634711228, + "grad_norm": 0.21260337165845883, + "learning_rate": 0.0004725874945543702, + "loss": 3.06182861328125, + "step": 6583, + "token_acc": 0.2926865382337422 + }, + { + "epoch": 3.8595719730284372, + "grad_norm": 0.23834517377292713, + "learning_rate": 0.00047257646205582995, + "loss": 3.0690417289733887, + "step": 6584, + "token_acc": 0.2918016885411135 + }, + { + "epoch": 3.8601583113456464, + "grad_norm": 0.1994599165650327, + "learning_rate": 0.00047256542746648385, + "loss": 3.0980920791625977, + "step": 6585, + "token_acc": 0.28810130235964154 + }, + { + "epoch": 3.8607446496628555, + "grad_norm": 0.21346721053231602, + "learning_rate": 0.0004725543907864354, + "loss": 3.0762486457824707, + "step": 6586, + "token_acc": 0.2911234339060318 + }, + { + "epoch": 3.8613309879800646, + "grad_norm": 0.18167562928045092, + "learning_rate": 0.0004725433520157884, + "loss": 3.0722007751464844, + "step": 6587, + "token_acc": 0.291313488705997 + }, + { + "epoch": 3.8619173262972737, + "grad_norm": 0.2314529977260273, + "learning_rate": 0.00047253231115464644, + "loss": 3.0874311923980713, + "step": 6588, + "token_acc": 0.2885219368889976 + }, + { + "epoch": 3.862503664614483, + "grad_norm": 0.19894689811674457, + "learning_rate": 0.00047252126820311336, + "loss": 3.1121835708618164, + "step": 6589, + "token_acc": 0.28474272079317897 + }, + { + "epoch": 3.8630900029316915, + "grad_norm": 0.21301419417547374, + "learning_rate": 0.00047251022316129276, + "loss": 3.080428123474121, + "step": 6590, + "token_acc": 0.28893414171333365 + }, + { + "epoch": 3.8636763412489006, + "grad_norm": 0.24213155116539303, + "learning_rate": 0.0004724991760292885, + "loss": 3.0988974571228027, + "step": 6591, + "token_acc": 0.2875001645581285 + }, + { + "epoch": 3.8642626795661097, + "grad_norm": 0.1813269200159823, + "learning_rate": 0.0004724881268072042, + "loss": 3.0734262466430664, + "step": 6592, + "token_acc": 0.2902757034951811 + }, + { + "epoch": 3.864849017883319, + "grad_norm": 0.23740536213107427, + "learning_rate": 0.00047247707549514384, + "loss": 3.0526695251464844, + "step": 6593, + "token_acc": 0.29156929134070025 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.222665593870044, + "learning_rate": 0.0004724660220932111, + "loss": 3.048471212387085, + "step": 6594, + "token_acc": 0.29532492881111216 + }, + { + "epoch": 3.8660216945177366, + "grad_norm": 0.24227285623643968, + "learning_rate": 0.0004724549666015099, + "loss": 3.0528934001922607, + "step": 6595, + "token_acc": 0.2916868835997644 + }, + { + "epoch": 3.8666080328349457, + "grad_norm": 0.2625336529629332, + "learning_rate": 0.00047244390902014406, + "loss": 3.090010166168213, + "step": 6596, + "token_acc": 0.2877753058377543 + }, + { + "epoch": 3.867194371152155, + "grad_norm": 0.20896348809560866, + "learning_rate": 0.0004724328493492174, + "loss": 3.082413911819458, + "step": 6597, + "token_acc": 0.28744775373938647 + }, + { + "epoch": 3.867780709469364, + "grad_norm": 0.22484978918597365, + "learning_rate": 0.0004724217875888339, + "loss": 3.065814256668091, + "step": 6598, + "token_acc": 0.29059318667452494 + }, + { + "epoch": 3.868367047786573, + "grad_norm": 0.26274849125285965, + "learning_rate": 0.0004724107237390974, + "loss": 3.0292577743530273, + "step": 6599, + "token_acc": 0.2969701901144676 + }, + { + "epoch": 3.868953386103782, + "grad_norm": 0.21049425378758016, + "learning_rate": 0.0004723996578001118, + "loss": 3.115936040878296, + "step": 6600, + "token_acc": 0.28475933354311056 + }, + { + "epoch": 3.869539724420991, + "grad_norm": 0.22910782562178267, + "learning_rate": 0.00047238858977198116, + "loss": 3.0805931091308594, + "step": 6601, + "token_acc": 0.28930428191688107 + }, + { + "epoch": 3.8701260627382, + "grad_norm": 0.27888540159794756, + "learning_rate": 0.00047237751965480937, + "loss": 3.0922207832336426, + "step": 6602, + "token_acc": 0.2892300266020226 + }, + { + "epoch": 3.870712401055409, + "grad_norm": 0.190703335552826, + "learning_rate": 0.00047236644744870043, + "loss": 3.054598808288574, + "step": 6603, + "token_acc": 0.293918493039673 + }, + { + "epoch": 3.871298739372618, + "grad_norm": 0.22559187780460158, + "learning_rate": 0.0004723553731537584, + "loss": 3.0338029861450195, + "step": 6604, + "token_acc": 0.2950637235195297 + }, + { + "epoch": 3.871885077689827, + "grad_norm": 0.1950661274654235, + "learning_rate": 0.00047234429677008727, + "loss": 3.074355125427246, + "step": 6605, + "token_acc": 0.291761875156438 + }, + { + "epoch": 3.872471416007036, + "grad_norm": 0.25366873194543654, + "learning_rate": 0.00047233321829779105, + "loss": 3.054440975189209, + "step": 6606, + "token_acc": 0.29201840399432516 + }, + { + "epoch": 3.873057754324245, + "grad_norm": 0.20492641193232897, + "learning_rate": 0.00047232213773697385, + "loss": 3.0460822582244873, + "step": 6607, + "token_acc": 0.2945879268343036 + }, + { + "epoch": 3.873644092641454, + "grad_norm": 0.33904724963182903, + "learning_rate": 0.00047231105508773976, + "loss": 3.125011920928955, + "step": 6608, + "token_acc": 0.28189973614775726 + }, + { + "epoch": 3.8742304309586633, + "grad_norm": 0.2904293628297362, + "learning_rate": 0.00047229997035019286, + "loss": 3.0500335693359375, + "step": 6609, + "token_acc": 0.2941685198912775 + }, + { + "epoch": 3.8748167692758724, + "grad_norm": 0.26096504833578144, + "learning_rate": 0.0004722888835244373, + "loss": 3.072169065475464, + "step": 6610, + "token_acc": 0.2909984600787071 + }, + { + "epoch": 3.875403107593081, + "grad_norm": 0.21706438398825628, + "learning_rate": 0.00047227779461057716, + "loss": 3.1308698654174805, + "step": 6611, + "token_acc": 0.2819087091180627 + }, + { + "epoch": 3.87598944591029, + "grad_norm": 0.2932457600914625, + "learning_rate": 0.0004722667036087167, + "loss": 3.1182332038879395, + "step": 6612, + "token_acc": 0.28420195017150923 + }, + { + "epoch": 3.8765757842274993, + "grad_norm": 0.2203285559027362, + "learning_rate": 0.00047225561051896013, + "loss": 3.1260557174682617, + "step": 6613, + "token_acc": 0.28366002873269447 + }, + { + "epoch": 3.8771621225447084, + "grad_norm": 0.2879904457535955, + "learning_rate": 0.00047224451534141155, + "loss": 3.0877788066864014, + "step": 6614, + "token_acc": 0.2880016908070393 + }, + { + "epoch": 3.8777484608619175, + "grad_norm": 0.2136169308749751, + "learning_rate": 0.0004722334180761752, + "loss": 3.121661424636841, + "step": 6615, + "token_acc": 0.2827520355101356 + }, + { + "epoch": 3.878334799179126, + "grad_norm": 0.24961190861872978, + "learning_rate": 0.00047222231872335544, + "loss": 3.0897679328918457, + "step": 6616, + "token_acc": 0.28644471020922546 + }, + { + "epoch": 3.8789211374963353, + "grad_norm": 0.21968356416794158, + "learning_rate": 0.00047221121728305634, + "loss": 3.0652503967285156, + "step": 6617, + "token_acc": 0.29132681881292133 + }, + { + "epoch": 3.8795074758135444, + "grad_norm": 0.228852570329781, + "learning_rate": 0.0004722001137553823, + "loss": 3.076927661895752, + "step": 6618, + "token_acc": 0.28941257527814634 + }, + { + "epoch": 3.8800938141307535, + "grad_norm": 0.18276597737584713, + "learning_rate": 0.0004721890081404376, + "loss": 3.0959906578063965, + "step": 6619, + "token_acc": 0.28629836094585237 + }, + { + "epoch": 3.8806801524479626, + "grad_norm": 0.2107761990609545, + "learning_rate": 0.00047217790043832666, + "loss": 3.0804977416992188, + "step": 6620, + "token_acc": 0.28996709778985885 + }, + { + "epoch": 3.8812664907651717, + "grad_norm": 0.18060748877366736, + "learning_rate": 0.00047216679064915367, + "loss": 3.065890312194824, + "step": 6621, + "token_acc": 0.29160001230542054 + }, + { + "epoch": 3.8818528290823804, + "grad_norm": 0.2097903401983441, + "learning_rate": 0.00047215567877302307, + "loss": 3.116609573364258, + "step": 6622, + "token_acc": 0.2834456233143385 + }, + { + "epoch": 3.8824391673995895, + "grad_norm": 0.2140896843330736, + "learning_rate": 0.0004721445648100392, + "loss": 3.0928001403808594, + "step": 6623, + "token_acc": 0.2887632620327886 + }, + { + "epoch": 3.8830255057167986, + "grad_norm": 0.23193868868851975, + "learning_rate": 0.0004721334487603065, + "loss": 3.081294059753418, + "step": 6624, + "token_acc": 0.2885319060956676 + }, + { + "epoch": 3.8836118440340077, + "grad_norm": 0.23821497865142813, + "learning_rate": 0.0004721223306239294, + "loss": 3.06607985496521, + "step": 6625, + "token_acc": 0.2908129125140673 + }, + { + "epoch": 3.8841981823512164, + "grad_norm": 0.2242050472440741, + "learning_rate": 0.00047211121040101236, + "loss": 3.0999040603637695, + "step": 6626, + "token_acc": 0.2860724946250026 + }, + { + "epoch": 3.8847845206684255, + "grad_norm": 0.18887521783447783, + "learning_rate": 0.0004721000880916597, + "loss": 3.111179828643799, + "step": 6627, + "token_acc": 0.28545246754328646 + }, + { + "epoch": 3.8853708589856346, + "grad_norm": 0.26754673878816626, + "learning_rate": 0.00047208896369597606, + "loss": 3.1077122688293457, + "step": 6628, + "token_acc": 0.2852491088896631 + }, + { + "epoch": 3.8859571973028437, + "grad_norm": 0.1955636962346107, + "learning_rate": 0.0004720778372140658, + "loss": 3.0622434616088867, + "step": 6629, + "token_acc": 0.29373639062571494 + }, + { + "epoch": 3.886543535620053, + "grad_norm": 0.26239880346720634, + "learning_rate": 0.00047206670864603355, + "loss": 3.108665704727173, + "step": 6630, + "token_acc": 0.28626711181814374 + }, + { + "epoch": 3.887129873937262, + "grad_norm": 0.25029463222093773, + "learning_rate": 0.00047205557799198384, + "loss": 3.09226131439209, + "step": 6631, + "token_acc": 0.2878963485953682 + }, + { + "epoch": 3.887716212254471, + "grad_norm": 0.18668712804382437, + "learning_rate": 0.00047204444525202115, + "loss": 3.0789241790771484, + "step": 6632, + "token_acc": 0.289724443848765 + }, + { + "epoch": 3.8883025505716797, + "grad_norm": 0.21656257362302248, + "learning_rate": 0.0004720333104262502, + "loss": 3.0601563453674316, + "step": 6633, + "token_acc": 0.29314690672285004 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.21055114654414064, + "learning_rate": 0.0004720221735147754, + "loss": 3.051942825317383, + "step": 6634, + "token_acc": 0.29334907979394226 + }, + { + "epoch": 3.889475227206098, + "grad_norm": 0.22650309723016437, + "learning_rate": 0.0004720110345177015, + "loss": 3.146289825439453, + "step": 6635, + "token_acc": 0.28176377157643395 + }, + { + "epoch": 3.890061565523307, + "grad_norm": 0.22260194201710884, + "learning_rate": 0.00047199989343513313, + "loss": 3.05391263961792, + "step": 6636, + "token_acc": 0.2921983101013782 + }, + { + "epoch": 3.8906479038405157, + "grad_norm": 0.2311845208979968, + "learning_rate": 0.0004719887502671748, + "loss": 3.0894980430603027, + "step": 6637, + "token_acc": 0.28979457179076873 + }, + { + "epoch": 3.891234242157725, + "grad_norm": 0.21589241681931762, + "learning_rate": 0.00047197760501393137, + "loss": 3.1009857654571533, + "step": 6638, + "token_acc": 0.2871004505131141 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 0.1973022616475723, + "learning_rate": 0.0004719664576755075, + "loss": 3.087038278579712, + "step": 6639, + "token_acc": 0.28951127008891764 + }, + { + "epoch": 3.892406918792143, + "grad_norm": 0.24438284751710712, + "learning_rate": 0.00047195530825200777, + "loss": 3.077129364013672, + "step": 6640, + "token_acc": 0.28843225798055777 + }, + { + "epoch": 3.892993257109352, + "grad_norm": 0.22235985272724695, + "learning_rate": 0.00047194415674353706, + "loss": 3.0899295806884766, + "step": 6641, + "token_acc": 0.28781617357207934 + }, + { + "epoch": 3.8935795954265613, + "grad_norm": 0.21728930185112144, + "learning_rate": 0.00047193300315020005, + "loss": 3.063229560852051, + "step": 6642, + "token_acc": 0.2920231135881848 + }, + { + "epoch": 3.8941659337437704, + "grad_norm": 0.20445500399529878, + "learning_rate": 0.00047192184747210154, + "loss": 3.1142172813415527, + "step": 6643, + "token_acc": 0.28463625994729735 + }, + { + "epoch": 3.894752272060979, + "grad_norm": 0.224680959905295, + "learning_rate": 0.00047191068970934636, + "loss": 3.04136323928833, + "step": 6644, + "token_acc": 0.2942601804838942 + }, + { + "epoch": 3.895338610378188, + "grad_norm": 0.2301327263517826, + "learning_rate": 0.0004718995298620392, + "loss": 3.056760787963867, + "step": 6645, + "token_acc": 0.2930707821115284 + }, + { + "epoch": 3.8959249486953973, + "grad_norm": 0.2114690610331847, + "learning_rate": 0.000471888367930285, + "loss": 3.062340259552002, + "step": 6646, + "token_acc": 0.2911979309039679 + }, + { + "epoch": 3.8965112870126064, + "grad_norm": 0.19176960364255283, + "learning_rate": 0.00047187720391418864, + "loss": 3.077188014984131, + "step": 6647, + "token_acc": 0.2890829650157272 + }, + { + "epoch": 3.897097625329815, + "grad_norm": 0.1934051542163186, + "learning_rate": 0.00047186603781385484, + "loss": 3.085202693939209, + "step": 6648, + "token_acc": 0.2897958028135665 + }, + { + "epoch": 3.897683963647024, + "grad_norm": 0.20521224089781526, + "learning_rate": 0.00047185486962938864, + "loss": 3.1122426986694336, + "step": 6649, + "token_acc": 0.28319009221542046 + }, + { + "epoch": 3.8982703019642333, + "grad_norm": 0.23689514530386674, + "learning_rate": 0.0004718436993608949, + "loss": 3.054687023162842, + "step": 6650, + "token_acc": 0.29526064294276216 + }, + { + "epoch": 3.8988566402814424, + "grad_norm": 0.24640207631283378, + "learning_rate": 0.00047183252700847846, + "loss": 3.1443819999694824, + "step": 6651, + "token_acc": 0.2806144912995098 + }, + { + "epoch": 3.8994429785986515, + "grad_norm": 0.2172801293456974, + "learning_rate": 0.00047182135257224444, + "loss": 3.0730905532836914, + "step": 6652, + "token_acc": 0.2901232624213109 + }, + { + "epoch": 3.9000293169158606, + "grad_norm": 0.31618468315127146, + "learning_rate": 0.0004718101760522977, + "loss": 3.084804058074951, + "step": 6653, + "token_acc": 0.28772287812826064 + }, + { + "epoch": 3.9006156552330697, + "grad_norm": 0.3115187915492716, + "learning_rate": 0.00047179899744874323, + "loss": 3.061718702316284, + "step": 6654, + "token_acc": 0.291010838468874 + }, + { + "epoch": 3.9012019935502784, + "grad_norm": 0.1972923325538355, + "learning_rate": 0.0004717878167616861, + "loss": 3.0482168197631836, + "step": 6655, + "token_acc": 0.2950436670513188 + }, + { + "epoch": 3.9017883318674875, + "grad_norm": 0.2406114219717918, + "learning_rate": 0.0004717766339912313, + "loss": 3.070314884185791, + "step": 6656, + "token_acc": 0.2913224322146255 + }, + { + "epoch": 3.9023746701846966, + "grad_norm": 0.23090866711494307, + "learning_rate": 0.00047176544913748374, + "loss": 3.109677791595459, + "step": 6657, + "token_acc": 0.28549114664005915 + }, + { + "epoch": 3.9029610085019057, + "grad_norm": 0.20389602364079093, + "learning_rate": 0.0004717542622005487, + "loss": 3.088420867919922, + "step": 6658, + "token_acc": 0.2884563597339057 + }, + { + "epoch": 3.9035473468191144, + "grad_norm": 0.22644741911959915, + "learning_rate": 0.00047174307318053124, + "loss": 3.0168306827545166, + "step": 6659, + "token_acc": 0.2983666408414081 + }, + { + "epoch": 3.9041336851363235, + "grad_norm": 0.18561159863861174, + "learning_rate": 0.00047173188207753637, + "loss": 3.0381031036376953, + "step": 6660, + "token_acc": 0.2962874368690301 + }, + { + "epoch": 3.9047200234535326, + "grad_norm": 0.22997217115589738, + "learning_rate": 0.0004717206888916693, + "loss": 3.058438777923584, + "step": 6661, + "token_acc": 0.29140573555248084 + }, + { + "epoch": 3.9053063617707418, + "grad_norm": 0.1961829691021727, + "learning_rate": 0.00047170949362303503, + "loss": 3.084615707397461, + "step": 6662, + "token_acc": 0.2887559142777623 + }, + { + "epoch": 3.905892700087951, + "grad_norm": 0.24062653256321054, + "learning_rate": 0.00047169829627173885, + "loss": 3.0795886516571045, + "step": 6663, + "token_acc": 0.2888947914905947 + }, + { + "epoch": 3.90647903840516, + "grad_norm": 0.19869335816548647, + "learning_rate": 0.000471687096837886, + "loss": 3.0801472663879395, + "step": 6664, + "token_acc": 0.291593985074664 + }, + { + "epoch": 3.9070653767223686, + "grad_norm": 0.1844416488017622, + "learning_rate": 0.00047167589532158153, + "loss": 3.0913257598876953, + "step": 6665, + "token_acc": 0.2882596024650267 + }, + { + "epoch": 3.9076517150395778, + "grad_norm": 0.19289318339101477, + "learning_rate": 0.0004716646917229308, + "loss": 3.097916841506958, + "step": 6666, + "token_acc": 0.2879944866335058 + }, + { + "epoch": 3.908238053356787, + "grad_norm": 0.19245358137353094, + "learning_rate": 0.00047165348604203897, + "loss": 3.0841991901397705, + "step": 6667, + "token_acc": 0.2881381636104728 + }, + { + "epoch": 3.908824391673996, + "grad_norm": 0.20040711571177733, + "learning_rate": 0.00047164227827901125, + "loss": 3.0799715518951416, + "step": 6668, + "token_acc": 0.28834200243252334 + }, + { + "epoch": 3.909410729991205, + "grad_norm": 0.19668353139336017, + "learning_rate": 0.00047163106843395303, + "loss": 3.07839298248291, + "step": 6669, + "token_acc": 0.2892456514587114 + }, + { + "epoch": 3.9099970683084138, + "grad_norm": 0.1986881082462936, + "learning_rate": 0.00047161985650696957, + "loss": 3.1098709106445312, + "step": 6670, + "token_acc": 0.2855512325762848 + }, + { + "epoch": 3.910583406625623, + "grad_norm": 0.17520790625033966, + "learning_rate": 0.0004716086424981662, + "loss": 3.1099987030029297, + "step": 6671, + "token_acc": 0.28613849039545675 + }, + { + "epoch": 3.911169744942832, + "grad_norm": 0.18384721935889844, + "learning_rate": 0.00047159742640764826, + "loss": 3.0757803916931152, + "step": 6672, + "token_acc": 0.2894037955781979 + }, + { + "epoch": 3.911756083260041, + "grad_norm": 0.18310671805957543, + "learning_rate": 0.00047158620823552113, + "loss": 3.096815586090088, + "step": 6673, + "token_acc": 0.28833197792543713 + }, + { + "epoch": 3.91234242157725, + "grad_norm": 0.19080713888393488, + "learning_rate": 0.00047157498798189014, + "loss": 3.0587055683135986, + "step": 6674, + "token_acc": 0.2923268690791718 + }, + { + "epoch": 3.9129287598944593, + "grad_norm": 0.1926956903404848, + "learning_rate": 0.00047156376564686073, + "loss": 3.0705649852752686, + "step": 6675, + "token_acc": 0.2913416011881339 + }, + { + "epoch": 3.913515098211668, + "grad_norm": 0.17238272652262301, + "learning_rate": 0.0004715525412305383, + "loss": 3.089484691619873, + "step": 6676, + "token_acc": 0.28848837357619356 + }, + { + "epoch": 3.914101436528877, + "grad_norm": 0.17690938059305153, + "learning_rate": 0.0004715413147330282, + "loss": 3.0427112579345703, + "step": 6677, + "token_acc": 0.29562446132569 + }, + { + "epoch": 3.914687774846086, + "grad_norm": 0.19549704463123602, + "learning_rate": 0.000471530086154436, + "loss": 3.02778697013855, + "step": 6678, + "token_acc": 0.29731599927329466 + }, + { + "epoch": 3.9152741131632953, + "grad_norm": 0.20729477990740128, + "learning_rate": 0.00047151885549486726, + "loss": 3.084157943725586, + "step": 6679, + "token_acc": 0.28948299294398006 + }, + { + "epoch": 3.915860451480504, + "grad_norm": 0.23673077945689236, + "learning_rate": 0.00047150762275442737, + "loss": 3.0899147987365723, + "step": 6680, + "token_acc": 0.2878961967140095 + }, + { + "epoch": 3.916446789797713, + "grad_norm": 0.267893165845178, + "learning_rate": 0.0004714963879332218, + "loss": 3.0904488563537598, + "step": 6681, + "token_acc": 0.2869433251622813 + }, + { + "epoch": 3.917033128114922, + "grad_norm": 0.23562682574382318, + "learning_rate": 0.00047148515103135615, + "loss": 3.1031858921051025, + "step": 6682, + "token_acc": 0.28527509060260176 + }, + { + "epoch": 3.9176194664321313, + "grad_norm": 0.19066435141456595, + "learning_rate": 0.00047147391204893597, + "loss": 3.1164937019348145, + "step": 6683, + "token_acc": 0.2850263950135685 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.16830948674467125, + "learning_rate": 0.00047146267098606675, + "loss": 3.1011576652526855, + "step": 6684, + "token_acc": 0.28734436867432916 + }, + { + "epoch": 3.9187921430665495, + "grad_norm": 0.18699898519794278, + "learning_rate": 0.0004714514278428542, + "loss": 3.0827369689941406, + "step": 6685, + "token_acc": 0.28942741818030504 + }, + { + "epoch": 3.9193784813837587, + "grad_norm": 0.1932032427962044, + "learning_rate": 0.000471440182619404, + "loss": 3.0541751384735107, + "step": 6686, + "token_acc": 0.29583533814488105 + }, + { + "epoch": 3.9199648197009673, + "grad_norm": 0.20142223281160146, + "learning_rate": 0.00047142893531582156, + "loss": 3.0695598125457764, + "step": 6687, + "token_acc": 0.29027564885297863 + }, + { + "epoch": 3.9205511580181764, + "grad_norm": 0.2168516087780529, + "learning_rate": 0.0004714176859322127, + "loss": 3.0429539680480957, + "step": 6688, + "token_acc": 0.29459460830825906 + }, + { + "epoch": 3.9211374963353856, + "grad_norm": 0.27909404390062775, + "learning_rate": 0.00047140643446868304, + "loss": 3.0664031505584717, + "step": 6689, + "token_acc": 0.29037023701099274 + }, + { + "epoch": 3.9217238346525947, + "grad_norm": 0.2264269048157196, + "learning_rate": 0.00047139518092533824, + "loss": 3.0987868309020996, + "step": 6690, + "token_acc": 0.28385851823877245 + }, + { + "epoch": 3.9223101729698033, + "grad_norm": 0.1750092583580957, + "learning_rate": 0.00047138392530228404, + "loss": 3.110069751739502, + "step": 6691, + "token_acc": 0.28680917373813397 + }, + { + "epoch": 3.9228965112870124, + "grad_norm": 0.2504495721367425, + "learning_rate": 0.00047137266759962626, + "loss": 3.111448287963867, + "step": 6692, + "token_acc": 0.28527862419322947 + }, + { + "epoch": 3.9234828496042216, + "grad_norm": 0.21087969825901873, + "learning_rate": 0.0004713614078174705, + "loss": 3.0733635425567627, + "step": 6693, + "token_acc": 0.28992635461949007 + }, + { + "epoch": 3.9240691879214307, + "grad_norm": 0.23317930476711, + "learning_rate": 0.00047135014595592263, + "loss": 3.0887603759765625, + "step": 6694, + "token_acc": 0.2877039509654398 + }, + { + "epoch": 3.92465552623864, + "grad_norm": 0.29169340309731334, + "learning_rate": 0.00047133888201508837, + "loss": 3.1063990592956543, + "step": 6695, + "token_acc": 0.2845426268012442 + }, + { + "epoch": 3.925241864555849, + "grad_norm": 0.19998608384548078, + "learning_rate": 0.0004713276159950737, + "loss": 3.0589139461517334, + "step": 6696, + "token_acc": 0.29228350170997286 + }, + { + "epoch": 3.925828202873058, + "grad_norm": 0.21339411901816419, + "learning_rate": 0.0004713163478959842, + "loss": 3.0778064727783203, + "step": 6697, + "token_acc": 0.289695921062147 + }, + { + "epoch": 3.9264145411902667, + "grad_norm": 0.2408294773973168, + "learning_rate": 0.00047130507771792583, + "loss": 3.0697922706604004, + "step": 6698, + "token_acc": 0.28958039007027137 + }, + { + "epoch": 3.927000879507476, + "grad_norm": 0.1901120536550186, + "learning_rate": 0.00047129380546100455, + "loss": 3.0792102813720703, + "step": 6699, + "token_acc": 0.2903775738999856 + }, + { + "epoch": 3.927587217824685, + "grad_norm": 0.20833930039569232, + "learning_rate": 0.00047128253112532607, + "loss": 3.0582709312438965, + "step": 6700, + "token_acc": 0.2941672634678125 + }, + { + "epoch": 3.928173556141894, + "grad_norm": 0.19728328258371358, + "learning_rate": 0.0004712712547109965, + "loss": 3.1104788780212402, + "step": 6701, + "token_acc": 0.2825135660142311 + }, + { + "epoch": 3.9287598944591027, + "grad_norm": 0.1864014634407815, + "learning_rate": 0.00047125997621812155, + "loss": 3.1384544372558594, + "step": 6702, + "token_acc": 0.2813310686017579 + }, + { + "epoch": 3.929346232776312, + "grad_norm": 0.19204618316889274, + "learning_rate": 0.0004712486956468073, + "loss": 3.1339359283447266, + "step": 6703, + "token_acc": 0.28313880447971135 + }, + { + "epoch": 3.929932571093521, + "grad_norm": 0.19061345588990544, + "learning_rate": 0.0004712374129971598, + "loss": 3.090369701385498, + "step": 6704, + "token_acc": 0.2881379738800427 + }, + { + "epoch": 3.93051890941073, + "grad_norm": 0.18025205903266708, + "learning_rate": 0.0004712261282692848, + "loss": 3.0653324127197266, + "step": 6705, + "token_acc": 0.29257056292103645 + }, + { + "epoch": 3.931105247727939, + "grad_norm": 0.18393953680834926, + "learning_rate": 0.0004712148414632885, + "loss": 3.0820493698120117, + "step": 6706, + "token_acc": 0.2885569581201635 + }, + { + "epoch": 3.9316915860451482, + "grad_norm": 0.17902241570905314, + "learning_rate": 0.00047120355257927683, + "loss": 3.0954947471618652, + "step": 6707, + "token_acc": 0.28560615682909823 + }, + { + "epoch": 3.9322779243623573, + "grad_norm": 0.2072126774081727, + "learning_rate": 0.00047119226161735587, + "loss": 3.1053380966186523, + "step": 6708, + "token_acc": 0.28607547595139043 + }, + { + "epoch": 3.932864262679566, + "grad_norm": 0.22218547408543435, + "learning_rate": 0.0004711809685776316, + "loss": 3.085787773132324, + "step": 6709, + "token_acc": 0.28791855027784613 + }, + { + "epoch": 3.933450600996775, + "grad_norm": 0.22864912477353647, + "learning_rate": 0.0004711696734602103, + "loss": 3.1162140369415283, + "step": 6710, + "token_acc": 0.2849529153529111 + }, + { + "epoch": 3.9340369393139842, + "grad_norm": 0.2284841786202647, + "learning_rate": 0.0004711583762651979, + "loss": 3.09869122505188, + "step": 6711, + "token_acc": 0.28728169667766984 + }, + { + "epoch": 3.9346232776311933, + "grad_norm": 0.21334651403748614, + "learning_rate": 0.00047114707699270057, + "loss": 3.0433740615844727, + "step": 6712, + "token_acc": 0.2946870030729585 + }, + { + "epoch": 3.935209615948402, + "grad_norm": 0.18712087450012704, + "learning_rate": 0.0004711357756428244, + "loss": 3.052543878555298, + "step": 6713, + "token_acc": 0.2936395098920748 + }, + { + "epoch": 3.935795954265611, + "grad_norm": 0.20664953464546496, + "learning_rate": 0.0004711244722156756, + "loss": 3.1054911613464355, + "step": 6714, + "token_acc": 0.286683716236779 + }, + { + "epoch": 3.9363822925828202, + "grad_norm": 0.27965961443222476, + "learning_rate": 0.0004711131667113604, + "loss": 3.066488742828369, + "step": 6715, + "token_acc": 0.290917580497182 + }, + { + "epoch": 3.9369686309000294, + "grad_norm": 0.3294615803960452, + "learning_rate": 0.00047110185912998496, + "loss": 3.098814010620117, + "step": 6716, + "token_acc": 0.2860231841067874 + }, + { + "epoch": 3.9375549692172385, + "grad_norm": 0.30108595466984267, + "learning_rate": 0.0004710905494716555, + "loss": 3.068279266357422, + "step": 6717, + "token_acc": 0.29092357680394354 + }, + { + "epoch": 3.9381413075344476, + "grad_norm": 0.21202038541135715, + "learning_rate": 0.0004710792377364782, + "loss": 3.126347780227661, + "step": 6718, + "token_acc": 0.28441802007287337 + }, + { + "epoch": 3.9387276458516562, + "grad_norm": 0.2818223415135372, + "learning_rate": 0.0004710679239245594, + "loss": 3.0310590267181396, + "step": 6719, + "token_acc": 0.2964558670247819 + }, + { + "epoch": 3.9393139841688654, + "grad_norm": 0.21207809280685008, + "learning_rate": 0.0004710566080360053, + "loss": 3.0929980278015137, + "step": 6720, + "token_acc": 0.2853044711816319 + }, + { + "epoch": 3.9399003224860745, + "grad_norm": 0.264754360033794, + "learning_rate": 0.0004710452900709223, + "loss": 3.1281919479370117, + "step": 6721, + "token_acc": 0.2829935386068905 + }, + { + "epoch": 3.9404866608032836, + "grad_norm": 0.2337898588152632, + "learning_rate": 0.0004710339700294167, + "loss": 3.1238808631896973, + "step": 6722, + "token_acc": 0.28449085426296733 + }, + { + "epoch": 3.9410729991204922, + "grad_norm": 0.2607217411675205, + "learning_rate": 0.00047102264791159474, + "loss": 3.078977108001709, + "step": 6723, + "token_acc": 0.29015102897598416 + }, + { + "epoch": 3.9416593374377014, + "grad_norm": 0.22530027258467475, + "learning_rate": 0.0004710113237175628, + "loss": 3.1104190349578857, + "step": 6724, + "token_acc": 0.2849929465203283 + }, + { + "epoch": 3.9422456757549105, + "grad_norm": 0.21371778336103323, + "learning_rate": 0.00047099999744742733, + "loss": 3.0787291526794434, + "step": 6725, + "token_acc": 0.2910668828515029 + }, + { + "epoch": 3.9428320140721196, + "grad_norm": 0.2018423944719916, + "learning_rate": 0.0004709886691012947, + "loss": 3.065816879272461, + "step": 6726, + "token_acc": 0.29119826014848194 + }, + { + "epoch": 3.9434183523893287, + "grad_norm": 0.16915372324712086, + "learning_rate": 0.0004709773386792713, + "loss": 3.109806537628174, + "step": 6727, + "token_acc": 0.2851704836183772 + }, + { + "epoch": 3.944004690706538, + "grad_norm": 0.18813686835216847, + "learning_rate": 0.00047096600618146356, + "loss": 3.0517992973327637, + "step": 6728, + "token_acc": 0.29324062791197764 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.19255171692010578, + "learning_rate": 0.000470954671607978, + "loss": 3.0422275066375732, + "step": 6729, + "token_acc": 0.2946406188215409 + }, + { + "epoch": 3.9451773673409556, + "grad_norm": 0.20497771430921866, + "learning_rate": 0.00047094333495892093, + "loss": 3.0861008167266846, + "step": 6730, + "token_acc": 0.28956807012539065 + }, + { + "epoch": 3.9457637056581647, + "grad_norm": 0.1743050333160798, + "learning_rate": 0.0004709319962343991, + "loss": 3.0805163383483887, + "step": 6731, + "token_acc": 0.2893749725706644 + }, + { + "epoch": 3.946350043975374, + "grad_norm": 0.20666380378787028, + "learning_rate": 0.0004709206554345188, + "loss": 3.0975799560546875, + "step": 6732, + "token_acc": 0.28546363650054973 + }, + { + "epoch": 3.946936382292583, + "grad_norm": 0.217184219916497, + "learning_rate": 0.0004709093125593866, + "loss": 3.103559732437134, + "step": 6733, + "token_acc": 0.2869196325821773 + }, + { + "epoch": 3.9475227206097916, + "grad_norm": 0.18637974034769236, + "learning_rate": 0.0004708979676091091, + "loss": 3.063138008117676, + "step": 6734, + "token_acc": 0.2917454825552763 + }, + { + "epoch": 3.9481090589270007, + "grad_norm": 0.2253756441975982, + "learning_rate": 0.0004708866205837929, + "loss": 3.0463500022888184, + "step": 6735, + "token_acc": 0.2941732241669874 + }, + { + "epoch": 3.94869539724421, + "grad_norm": 0.18825481872750727, + "learning_rate": 0.0004708752714835445, + "loss": 3.06581974029541, + "step": 6736, + "token_acc": 0.2916749156069951 + }, + { + "epoch": 3.949281735561419, + "grad_norm": 0.20139931050227008, + "learning_rate": 0.00047086392030847057, + "loss": 3.0891823768615723, + "step": 6737, + "token_acc": 0.28980619062276103 + }, + { + "epoch": 3.949868073878628, + "grad_norm": 0.21386398880494314, + "learning_rate": 0.0004708525670586778, + "loss": 3.1262433528900146, + "step": 6738, + "token_acc": 0.2829819986005979 + }, + { + "epoch": 3.950454412195837, + "grad_norm": 0.22337405754738288, + "learning_rate": 0.0004708412117342727, + "loss": 3.0991740226745605, + "step": 6739, + "token_acc": 0.2864382899618747 + }, + { + "epoch": 3.9510407505130463, + "grad_norm": 0.23147787570889594, + "learning_rate": 0.000470829854335362, + "loss": 3.1200380325317383, + "step": 6740, + "token_acc": 0.28283799903555745 + }, + { + "epoch": 3.951627088830255, + "grad_norm": 0.22148111901866932, + "learning_rate": 0.0004708184948620524, + "loss": 3.125523805618286, + "step": 6741, + "token_acc": 0.2820963581649573 + }, + { + "epoch": 3.952213427147464, + "grad_norm": 0.21309266386302453, + "learning_rate": 0.0004708071333144506, + "loss": 3.0885539054870605, + "step": 6742, + "token_acc": 0.2885979125457683 + }, + { + "epoch": 3.952799765464673, + "grad_norm": 0.3642746592695076, + "learning_rate": 0.00047079576969266337, + "loss": 3.0962767601013184, + "step": 6743, + "token_acc": 0.28905832836008416 + }, + { + "epoch": 3.9533861037818823, + "grad_norm": 0.4193620614459352, + "learning_rate": 0.00047078440399679736, + "loss": 3.120298385620117, + "step": 6744, + "token_acc": 0.28218164521840233 + }, + { + "epoch": 3.953972442099091, + "grad_norm": 0.19240679293953977, + "learning_rate": 0.0004707730362269593, + "loss": 3.061475992202759, + "step": 6745, + "token_acc": 0.2909522862721706 + }, + { + "epoch": 3.9545587804163, + "grad_norm": 0.32427776247222884, + "learning_rate": 0.0004707616663832562, + "loss": 3.0943586826324463, + "step": 6746, + "token_acc": 0.28898758270642616 + }, + { + "epoch": 3.955145118733509, + "grad_norm": 0.24097182711340676, + "learning_rate": 0.00047075029446579466, + "loss": 3.089096784591675, + "step": 6747, + "token_acc": 0.2896286352988994 + }, + { + "epoch": 3.9557314570507183, + "grad_norm": 0.24937544750280952, + "learning_rate": 0.00047073892047468156, + "loss": 3.0960211753845215, + "step": 6748, + "token_acc": 0.28681687376503506 + }, + { + "epoch": 3.9563177953679274, + "grad_norm": 0.16521302171777708, + "learning_rate": 0.00047072754441002373, + "loss": 3.0952634811401367, + "step": 6749, + "token_acc": 0.28801745642686727 + }, + { + "epoch": 3.9569041336851365, + "grad_norm": 0.24804867504687367, + "learning_rate": 0.0004707161662719281, + "loss": 3.0993199348449707, + "step": 6750, + "token_acc": 0.28588505061618585 + }, + { + "epoch": 3.9574904720023456, + "grad_norm": 0.21366757986462223, + "learning_rate": 0.00047070478606050153, + "loss": 3.1269121170043945, + "step": 6751, + "token_acc": 0.28364530085751855 + }, + { + "epoch": 3.9580768103195543, + "grad_norm": 0.23719457887400647, + "learning_rate": 0.0004706934037758509, + "loss": 3.071544885635376, + "step": 6752, + "token_acc": 0.2888691991334231 + }, + { + "epoch": 3.9586631486367634, + "grad_norm": 0.18695313419638268, + "learning_rate": 0.00047068201941808307, + "loss": 3.0645318031311035, + "step": 6753, + "token_acc": 0.29317200246795094 + }, + { + "epoch": 3.9592494869539725, + "grad_norm": 0.20038386258425406, + "learning_rate": 0.00047067063298730506, + "loss": 3.0458388328552246, + "step": 6754, + "token_acc": 0.29502805390824094 + }, + { + "epoch": 3.9598358252711816, + "grad_norm": 0.22383552498805998, + "learning_rate": 0.00047065924448362384, + "loss": 3.119743824005127, + "step": 6755, + "token_acc": 0.28255059594370324 + }, + { + "epoch": 3.9604221635883903, + "grad_norm": 0.2151942809580209, + "learning_rate": 0.00047064785390714636, + "loss": 3.0881083011627197, + "step": 6756, + "token_acc": 0.2888687023967577 + }, + { + "epoch": 3.9610085019055994, + "grad_norm": 0.221555034772601, + "learning_rate": 0.0004706364612579796, + "loss": 3.0781450271606445, + "step": 6757, + "token_acc": 0.2892811857479881 + }, + { + "epoch": 3.9615948402228085, + "grad_norm": 0.194438632915114, + "learning_rate": 0.0004706250665362306, + "loss": 3.048779010772705, + "step": 6758, + "token_acc": 0.29320898150257885 + }, + { + "epoch": 3.9621811785400176, + "grad_norm": 0.23622624132996456, + "learning_rate": 0.00047061366974200636, + "loss": 3.067814826965332, + "step": 6759, + "token_acc": 0.2906316948230569 + }, + { + "epoch": 3.9627675168572267, + "grad_norm": 0.2005839072078433, + "learning_rate": 0.0004706022708754141, + "loss": 3.0776000022888184, + "step": 6760, + "token_acc": 0.28964207366956557 + }, + { + "epoch": 3.963353855174436, + "grad_norm": 0.25833832365958964, + "learning_rate": 0.0004705908699365606, + "loss": 3.0757744312286377, + "step": 6761, + "token_acc": 0.2906780718451912 + }, + { + "epoch": 3.963940193491645, + "grad_norm": 0.20596521905517404, + "learning_rate": 0.0004705794669255532, + "loss": 3.0538580417633057, + "step": 6762, + "token_acc": 0.2933118959273293 + }, + { + "epoch": 3.9645265318088536, + "grad_norm": 0.2781311027725708, + "learning_rate": 0.00047056806184249893, + "loss": 3.0851030349731445, + "step": 6763, + "token_acc": 0.289389573242414 + }, + { + "epoch": 3.9651128701260627, + "grad_norm": 0.26257372782398836, + "learning_rate": 0.00047055665468750496, + "loss": 3.0718703269958496, + "step": 6764, + "token_acc": 0.2888256227758007 + }, + { + "epoch": 3.965699208443272, + "grad_norm": 0.23733675173602187, + "learning_rate": 0.0004705452454606785, + "loss": 3.0409162044525146, + "step": 6765, + "token_acc": 0.29440663558655356 + }, + { + "epoch": 3.966285546760481, + "grad_norm": 0.22161716716514832, + "learning_rate": 0.00047053383416212645, + "loss": 3.015023708343506, + "step": 6766, + "token_acc": 0.2993078864517472 + }, + { + "epoch": 3.9668718850776896, + "grad_norm": 0.22152293597944567, + "learning_rate": 0.0004705224207919564, + "loss": 3.107854127883911, + "step": 6767, + "token_acc": 0.2845793659098043 + }, + { + "epoch": 3.9674582233948987, + "grad_norm": 0.24453977971760799, + "learning_rate": 0.00047051100535027524, + "loss": 3.119394302368164, + "step": 6768, + "token_acc": 0.28394418300453217 + }, + { + "epoch": 3.968044561712108, + "grad_norm": 0.21700064635332458, + "learning_rate": 0.00047049958783719037, + "loss": 3.1139888763427734, + "step": 6769, + "token_acc": 0.28529597459994127 + }, + { + "epoch": 3.968630900029317, + "grad_norm": 0.24974585073482308, + "learning_rate": 0.00047048816825280906, + "loss": 3.0812134742736816, + "step": 6770, + "token_acc": 0.2896397440969049 + }, + { + "epoch": 3.969217238346526, + "grad_norm": 0.22022902088480992, + "learning_rate": 0.0004704767465972384, + "loss": 3.0303401947021484, + "step": 6771, + "token_acc": 0.2948009923417107 + }, + { + "epoch": 3.969803576663735, + "grad_norm": 0.22068937344967937, + "learning_rate": 0.00047046532287058583, + "loss": 3.0402743816375732, + "step": 6772, + "token_acc": 0.2952332460224559 + }, + { + "epoch": 3.970389914980944, + "grad_norm": 0.22039579356326613, + "learning_rate": 0.00047045389707295874, + "loss": 3.080740213394165, + "step": 6773, + "token_acc": 0.28981163513705727 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.2606676557694435, + "learning_rate": 0.0004704424692044642, + "loss": 3.03606915473938, + "step": 6774, + "token_acc": 0.29545551772471396 + }, + { + "epoch": 3.971562591615362, + "grad_norm": 0.21043388557794823, + "learning_rate": 0.0004704310392652098, + "loss": 3.0216474533081055, + "step": 6775, + "token_acc": 0.29878209977201997 + }, + { + "epoch": 3.972148929932571, + "grad_norm": 0.25791143071491207, + "learning_rate": 0.00047041960725530287, + "loss": 3.078902244567871, + "step": 6776, + "token_acc": 0.290062340518225 + }, + { + "epoch": 3.97273526824978, + "grad_norm": 0.19272324955183281, + "learning_rate": 0.0004704081731748506, + "loss": 3.126561403274536, + "step": 6777, + "token_acc": 0.28126656033584685 + }, + { + "epoch": 3.973321606566989, + "grad_norm": 0.29041820023243975, + "learning_rate": 0.0004703967370239607, + "loss": 3.0903172492980957, + "step": 6778, + "token_acc": 0.2903547974345865 + }, + { + "epoch": 3.973907944884198, + "grad_norm": 0.19181091816741253, + "learning_rate": 0.0004703852988027404, + "loss": 3.0639896392822266, + "step": 6779, + "token_acc": 0.29090201209321787 + }, + { + "epoch": 3.974494283201407, + "grad_norm": 0.23078965544665764, + "learning_rate": 0.0004703738585112971, + "loss": 3.1098828315734863, + "step": 6780, + "token_acc": 0.28441182772659096 + }, + { + "epoch": 3.9750806215186163, + "grad_norm": 0.17526521185359176, + "learning_rate": 0.0004703624161497384, + "loss": 3.049208641052246, + "step": 6781, + "token_acc": 0.2939239083220482 + }, + { + "epoch": 3.9756669598358254, + "grad_norm": 0.25784573234595515, + "learning_rate": 0.0004703509717181718, + "loss": 3.161747455596924, + "step": 6782, + "token_acc": 0.2798242044020569 + }, + { + "epoch": 3.9762532981530345, + "grad_norm": 0.19943207011031283, + "learning_rate": 0.0004703395252167048, + "loss": 3.075469493865967, + "step": 6783, + "token_acc": 0.28981551131825267 + }, + { + "epoch": 3.976839636470243, + "grad_norm": 0.23025895079809544, + "learning_rate": 0.00047032807664544477, + "loss": 3.0728116035461426, + "step": 6784, + "token_acc": 0.2889932158389606 + }, + { + "epoch": 3.9774259747874523, + "grad_norm": 0.2037055254242142, + "learning_rate": 0.0004703166260044993, + "loss": 3.1167564392089844, + "step": 6785, + "token_acc": 0.28381103058487617 + }, + { + "epoch": 3.9780123131046614, + "grad_norm": 0.23559765457279833, + "learning_rate": 0.0004703051732939761, + "loss": 3.0963549613952637, + "step": 6786, + "token_acc": 0.2870102681686771 + }, + { + "epoch": 3.9785986514218705, + "grad_norm": 0.21976918916749097, + "learning_rate": 0.0004702937185139826, + "loss": 3.068171977996826, + "step": 6787, + "token_acc": 0.28993141390446336 + }, + { + "epoch": 3.979184989739079, + "grad_norm": 0.2061312057577153, + "learning_rate": 0.0004702822616646265, + "loss": 3.087235927581787, + "step": 6788, + "token_acc": 0.28833241505484625 + }, + { + "epoch": 3.9797713280562883, + "grad_norm": 0.22580428526740257, + "learning_rate": 0.0004702708027460154, + "loss": 3.0525827407836914, + "step": 6789, + "token_acc": 0.29281732523030274 + }, + { + "epoch": 3.9803576663734974, + "grad_norm": 0.19661689762957427, + "learning_rate": 0.00047025934175825695, + "loss": 3.065704822540283, + "step": 6790, + "token_acc": 0.29177415572232646 + }, + { + "epoch": 3.9809440046907065, + "grad_norm": 0.19885967618202607, + "learning_rate": 0.0004702478787014588, + "loss": 3.0729615688323975, + "step": 6791, + "token_acc": 0.2898957642327846 + }, + { + "epoch": 3.9815303430079156, + "grad_norm": 0.21685628523747688, + "learning_rate": 0.00047023641357572853, + "loss": 3.087904453277588, + "step": 6792, + "token_acc": 0.2873452241358908 + }, + { + "epoch": 3.9821166813251248, + "grad_norm": 0.19689846946269074, + "learning_rate": 0.000470224946381174, + "loss": 3.068155288696289, + "step": 6793, + "token_acc": 0.2914459558946129 + }, + { + "epoch": 3.982703019642334, + "grad_norm": 0.18966152394711594, + "learning_rate": 0.0004702134771179028, + "loss": 3.107975959777832, + "step": 6794, + "token_acc": 0.2882968528385786 + }, + { + "epoch": 3.9832893579595425, + "grad_norm": 0.20914988185962483, + "learning_rate": 0.0004702020057860228, + "loss": 3.064915180206299, + "step": 6795, + "token_acc": 0.2908122308983434 + }, + { + "epoch": 3.9838756962767516, + "grad_norm": 0.1944886030313796, + "learning_rate": 0.0004701905323856416, + "loss": 3.0708730220794678, + "step": 6796, + "token_acc": 0.29118811358598123 + }, + { + "epoch": 3.9844620345939608, + "grad_norm": 0.2371724988431959, + "learning_rate": 0.0004701790569168671, + "loss": 3.069450855255127, + "step": 6797, + "token_acc": 0.2915478681309151 + }, + { + "epoch": 3.98504837291117, + "grad_norm": 0.22705857027517132, + "learning_rate": 0.00047016757937980706, + "loss": 3.084702491760254, + "step": 6798, + "token_acc": 0.29040560833043855 + }, + { + "epoch": 3.9856347112283785, + "grad_norm": 0.2573477318946264, + "learning_rate": 0.00047015609977456925, + "loss": 3.1077802181243896, + "step": 6799, + "token_acc": 0.2852639290370868 + }, + { + "epoch": 3.9862210495455876, + "grad_norm": 0.2162666094342554, + "learning_rate": 0.0004701446181012615, + "loss": 3.0677242279052734, + "step": 6800, + "token_acc": 0.2913358303465076 + }, + { + "epoch": 3.9868073878627968, + "grad_norm": 0.2195683330686813, + "learning_rate": 0.00047013313435999185, + "loss": 3.1398673057556152, + "step": 6801, + "token_acc": 0.28236704127812573 + }, + { + "epoch": 3.987393726180006, + "grad_norm": 0.1856475485866107, + "learning_rate": 0.00047012164855086795, + "loss": 3.1162350177764893, + "step": 6802, + "token_acc": 0.2846689500183677 + }, + { + "epoch": 3.987980064497215, + "grad_norm": 0.20089717677320684, + "learning_rate": 0.0004701101606739978, + "loss": 3.0914692878723145, + "step": 6803, + "token_acc": 0.2884379928890315 + }, + { + "epoch": 3.988566402814424, + "grad_norm": 0.2482854516135699, + "learning_rate": 0.0004700986707294893, + "loss": 3.0586884021759033, + "step": 6804, + "token_acc": 0.2918337332610883 + }, + { + "epoch": 3.989152741131633, + "grad_norm": 0.20052617053776917, + "learning_rate": 0.0004700871787174504, + "loss": 2.9862594604492188, + "step": 6805, + "token_acc": 0.30325846234855014 + }, + { + "epoch": 3.989739079448842, + "grad_norm": 0.20058594257541457, + "learning_rate": 0.000470075684637989, + "loss": 3.0808024406433105, + "step": 6806, + "token_acc": 0.2891893821921162 + }, + { + "epoch": 3.990325417766051, + "grad_norm": 0.16283201469828032, + "learning_rate": 0.00047006418849121305, + "loss": 3.0789926052093506, + "step": 6807, + "token_acc": 0.2884958781558867 + }, + { + "epoch": 3.99091175608326, + "grad_norm": 0.204195383943405, + "learning_rate": 0.00047005269027723066, + "loss": 3.13724422454834, + "step": 6808, + "token_acc": 0.28270051005225816 + }, + { + "epoch": 3.991498094400469, + "grad_norm": 0.18817203242199967, + "learning_rate": 0.00047004118999614976, + "loss": 3.068647861480713, + "step": 6809, + "token_acc": 0.2913964300063135 + }, + { + "epoch": 3.992084432717678, + "grad_norm": 0.23832667940108215, + "learning_rate": 0.00047002968764807835, + "loss": 3.067481279373169, + "step": 6810, + "token_acc": 0.2912233357989168 + }, + { + "epoch": 3.992670771034887, + "grad_norm": 0.26409117100894863, + "learning_rate": 0.0004700181832331245, + "loss": 3.0568323135375977, + "step": 6811, + "token_acc": 0.29265277608250206 + }, + { + "epoch": 3.993257109352096, + "grad_norm": 0.20437327768526597, + "learning_rate": 0.00047000667675139643, + "loss": 3.0584967136383057, + "step": 6812, + "token_acc": 0.2944079284149313 + }, + { + "epoch": 3.993843447669305, + "grad_norm": 0.20678632204099828, + "learning_rate": 0.00046999516820300194, + "loss": 3.0637078285217285, + "step": 6813, + "token_acc": 0.2916622487955375 + }, + { + "epoch": 3.9944297859865143, + "grad_norm": 0.22612083609515282, + "learning_rate": 0.0004699836575880494, + "loss": 3.064908742904663, + "step": 6814, + "token_acc": 0.2922555640035923 + }, + { + "epoch": 3.9950161243037234, + "grad_norm": 0.1843691557535468, + "learning_rate": 0.0004699721449066468, + "loss": 3.0699219703674316, + "step": 6815, + "token_acc": 0.2931559206413617 + }, + { + "epoch": 3.9956024626209325, + "grad_norm": 0.19022653115325808, + "learning_rate": 0.0004699606301589022, + "loss": 3.0397629737854004, + "step": 6816, + "token_acc": 0.2974636737121637 + }, + { + "epoch": 3.996188800938141, + "grad_norm": 0.21282574312559707, + "learning_rate": 0.000469949113344924, + "loss": 3.0314745903015137, + "step": 6817, + "token_acc": 0.29501121538596625 + }, + { + "epoch": 3.9967751392553503, + "grad_norm": 0.21333608177095004, + "learning_rate": 0.0004699375944648203, + "loss": 3.09308123588562, + "step": 6818, + "token_acc": 0.2859053517283609 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.19307933035407412, + "learning_rate": 0.0004699260735186992, + "loss": 3.114429473876953, + "step": 6819, + "token_acc": 0.28392725876489444 + }, + { + "epoch": 3.9979478158897686, + "grad_norm": 0.2531383379432577, + "learning_rate": 0.000469914550506669, + "loss": 3.096020221710205, + "step": 6820, + "token_acc": 0.2849527158987264 + }, + { + "epoch": 3.998534154206977, + "grad_norm": 0.22700210949037603, + "learning_rate": 0.0004699030254288379, + "loss": 3.0789248943328857, + "step": 6821, + "token_acc": 0.2893205798244749 + }, + { + "epoch": 3.9991204925241863, + "grad_norm": 0.2120301985441921, + "learning_rate": 0.0004698914982853142, + "loss": 3.077389717102051, + "step": 6822, + "token_acc": 0.2903556090918613 + }, + { + "epoch": 3.9997068308413954, + "grad_norm": 0.24120719983359898, + "learning_rate": 0.0004698799690762063, + "loss": 3.0784013271331787, + "step": 6823, + "token_acc": 0.29167256869966407 + }, + { + "epoch": 4.0, + "grad_norm": 0.2698352357714906, + "learning_rate": 0.00046986843780162223, + "loss": 3.1038906574249268, + "step": 6824, + "token_acc": 0.2831471489728196 + }, + { + "epoch": 4.0, + "eval_loss": 3.089488983154297, + "eval_runtime": 6.5439, + "eval_samples_per_second": 39.121, + "eval_steps_per_second": 4.89, + "eval_token_acc": 0.2882056610713456, + "step": 6824 + }, + { + "epoch": 4.000586338317209, + "grad_norm": 0.22260690071440387, + "learning_rate": 0.0004698569044616706, + "loss": 3.0069327354431152, + "step": 6825, + "token_acc": 0.298171431771723 + }, + { + "epoch": 4.001172676634418, + "grad_norm": 0.2655844088085972, + "learning_rate": 0.00046984536905645953, + "loss": 3.030156135559082, + "step": 6826, + "token_acc": 0.2951415601891875 + }, + { + "epoch": 4.001759014951627, + "grad_norm": 0.268079784754666, + "learning_rate": 0.00046983383158609747, + "loss": 2.9917855262756348, + "step": 6827, + "token_acc": 0.3012787279564827 + }, + { + "epoch": 4.0023453532688364, + "grad_norm": 0.20488151570251312, + "learning_rate": 0.0004698222920506928, + "loss": 3.061344623565674, + "step": 6828, + "token_acc": 0.29035067229963035 + }, + { + "epoch": 4.002931691586046, + "grad_norm": 0.23190414435654447, + "learning_rate": 0.0004698107504503539, + "loss": 3.0224814414978027, + "step": 6829, + "token_acc": 0.2965082577122922 + }, + { + "epoch": 4.003518029903254, + "grad_norm": 0.2534718398324014, + "learning_rate": 0.0004697992067851893, + "loss": 2.957138776779175, + "step": 6830, + "token_acc": 0.3075330827729512 + }, + { + "epoch": 4.004104368220463, + "grad_norm": 0.18137690513984497, + "learning_rate": 0.0004697876610553072, + "loss": 2.981412649154663, + "step": 6831, + "token_acc": 0.30177488602664543 + }, + { + "epoch": 4.004690706537672, + "grad_norm": 0.22079347468450836, + "learning_rate": 0.00046977611326081624, + "loss": 3.004589080810547, + "step": 6832, + "token_acc": 0.29844672013202495 + }, + { + "epoch": 4.005277044854881, + "grad_norm": 0.1813443221535867, + "learning_rate": 0.0004697645634018248, + "loss": 2.9939393997192383, + "step": 6833, + "token_acc": 0.2994442928318364 + }, + { + "epoch": 4.00586338317209, + "grad_norm": 0.2529627652999093, + "learning_rate": 0.0004697530114784415, + "loss": 3.054389715194702, + "step": 6834, + "token_acc": 0.29264113875534936 + }, + { + "epoch": 4.006449721489299, + "grad_norm": 0.23235954691165187, + "learning_rate": 0.0004697414574907748, + "loss": 3.0710723400115967, + "step": 6835, + "token_acc": 0.2920870531901194 + }, + { + "epoch": 4.0070360598065085, + "grad_norm": 0.20611726338978273, + "learning_rate": 0.0004697299014389331, + "loss": 2.97102689743042, + "step": 6836, + "token_acc": 0.3029358014804776 + }, + { + "epoch": 4.007622398123718, + "grad_norm": 0.27159685179124016, + "learning_rate": 0.0004697183433230252, + "loss": 3.025338649749756, + "step": 6837, + "token_acc": 0.2946773093330347 + }, + { + "epoch": 4.008208736440927, + "grad_norm": 0.19514061563943455, + "learning_rate": 0.00046970678314315944, + "loss": 3.015861749649048, + "step": 6838, + "token_acc": 0.29786056615720413 + }, + { + "epoch": 4.008795074758136, + "grad_norm": 0.22763630656875633, + "learning_rate": 0.0004696952208994446, + "loss": 2.989206075668335, + "step": 6839, + "token_acc": 0.3004810393446307 + }, + { + "epoch": 4.009381413075345, + "grad_norm": 0.20991918224800166, + "learning_rate": 0.0004696836565919891, + "loss": 3.0400190353393555, + "step": 6840, + "token_acc": 0.2946971602264031 + }, + { + "epoch": 4.009967751392553, + "grad_norm": 0.2245715301180232, + "learning_rate": 0.00046967209022090174, + "loss": 3.0174708366394043, + "step": 6841, + "token_acc": 0.29670940401849466 + }, + { + "epoch": 4.010554089709762, + "grad_norm": 0.23340546889696623, + "learning_rate": 0.00046966052178629104, + "loss": 3.0225179195404053, + "step": 6842, + "token_acc": 0.296744903911174 + }, + { + "epoch": 4.011140428026971, + "grad_norm": 0.1753424505645774, + "learning_rate": 0.0004696489512882658, + "loss": 3.0634422302246094, + "step": 6843, + "token_acc": 0.2912659258365157 + }, + { + "epoch": 4.0117267663441805, + "grad_norm": 0.18240601105511994, + "learning_rate": 0.0004696373787269346, + "loss": 2.9633193016052246, + "step": 6844, + "token_acc": 0.304967215783787 + }, + { + "epoch": 4.01231310466139, + "grad_norm": 0.18507481245384436, + "learning_rate": 0.0004696258041024062, + "loss": 3.045401096343994, + "step": 6845, + "token_acc": 0.2939066720067367 + }, + { + "epoch": 4.012899442978599, + "grad_norm": 0.1792942214824011, + "learning_rate": 0.00046961422741478935, + "loss": 2.9911468029022217, + "step": 6846, + "token_acc": 0.30138896366083445 + }, + { + "epoch": 4.013485781295808, + "grad_norm": 0.20668944854740126, + "learning_rate": 0.0004696026486641928, + "loss": 3.0343358516693115, + "step": 6847, + "token_acc": 0.2941754755545524 + }, + { + "epoch": 4.014072119613017, + "grad_norm": 0.1994219588846641, + "learning_rate": 0.00046959106785072514, + "loss": 3.062152624130249, + "step": 6848, + "token_acc": 0.2924575678086985 + }, + { + "epoch": 4.014658457930226, + "grad_norm": 0.1785525985589352, + "learning_rate": 0.00046957948497449543, + "loss": 3.0140161514282227, + "step": 6849, + "token_acc": 0.2967031013048901 + }, + { + "epoch": 4.015244796247435, + "grad_norm": 0.18257138021147615, + "learning_rate": 0.0004695679000356123, + "loss": 3.0283353328704834, + "step": 6850, + "token_acc": 0.29580266924615367 + }, + { + "epoch": 4.015831134564644, + "grad_norm": 0.198966707388853, + "learning_rate": 0.00046955631303418466, + "loss": 3.016545295715332, + "step": 6851, + "token_acc": 0.29634337463181054 + }, + { + "epoch": 4.0164174728818525, + "grad_norm": 0.18327198870616615, + "learning_rate": 0.0004695447239703212, + "loss": 3.009713888168335, + "step": 6852, + "token_acc": 0.29964154593828946 + }, + { + "epoch": 4.017003811199062, + "grad_norm": 0.2168668193096582, + "learning_rate": 0.000469533132844131, + "loss": 3.051368474960327, + "step": 6853, + "token_acc": 0.2934052877563573 + }, + { + "epoch": 4.017590149516271, + "grad_norm": 0.24085387395127703, + "learning_rate": 0.00046952153965572287, + "loss": 3.037550210952759, + "step": 6854, + "token_acc": 0.29297129872932454 + }, + { + "epoch": 4.01817648783348, + "grad_norm": 0.18981365391191862, + "learning_rate": 0.00046950994440520563, + "loss": 3.0249505043029785, + "step": 6855, + "token_acc": 0.29575096425681285 + }, + { + "epoch": 4.018762826150689, + "grad_norm": 0.19621160798260576, + "learning_rate": 0.00046949834709268825, + "loss": 3.012082576751709, + "step": 6856, + "token_acc": 0.2975827049689125 + }, + { + "epoch": 4.019349164467898, + "grad_norm": 0.2565647922835809, + "learning_rate": 0.00046948674771827973, + "loss": 3.0411696434020996, + "step": 6857, + "token_acc": 0.2940476996627707 + }, + { + "epoch": 4.019935502785107, + "grad_norm": 0.21604388463244778, + "learning_rate": 0.0004694751462820889, + "loss": 3.024933338165283, + "step": 6858, + "token_acc": 0.2972631805119115 + }, + { + "epoch": 4.020521841102316, + "grad_norm": 0.18801112042431076, + "learning_rate": 0.0004694635427842249, + "loss": 3.0511531829833984, + "step": 6859, + "token_acc": 0.2917154490918698 + }, + { + "epoch": 4.021108179419525, + "grad_norm": 0.2210499019985458, + "learning_rate": 0.0004694519372247965, + "loss": 2.994570255279541, + "step": 6860, + "token_acc": 0.3013219154232064 + }, + { + "epoch": 4.0216945177367345, + "grad_norm": 0.16799811551388025, + "learning_rate": 0.000469440329603913, + "loss": 3.0238254070281982, + "step": 6861, + "token_acc": 0.29599403763741383 + }, + { + "epoch": 4.022280856053943, + "grad_norm": 0.19582023390415662, + "learning_rate": 0.0004694287199216833, + "loss": 3.0360264778137207, + "step": 6862, + "token_acc": 0.29391213439336034 + }, + { + "epoch": 4.022867194371152, + "grad_norm": 0.2260617593650311, + "learning_rate": 0.0004694171081782164, + "loss": 3.0060901641845703, + "step": 6863, + "token_acc": 0.2989718281996033 + }, + { + "epoch": 4.023453532688361, + "grad_norm": 0.21839909724678994, + "learning_rate": 0.00046940549437362146, + "loss": 3.000239849090576, + "step": 6864, + "token_acc": 0.2988024769691664 + }, + { + "epoch": 4.02403987100557, + "grad_norm": 0.23015586094481597, + "learning_rate": 0.0004693938785080076, + "loss": 3.0355401039123535, + "step": 6865, + "token_acc": 0.29435923529839353 + }, + { + "epoch": 4.024626209322779, + "grad_norm": 0.2102867049700502, + "learning_rate": 0.0004693822605814838, + "loss": 3.021564245223999, + "step": 6866, + "token_acc": 0.2970450480691062 + }, + { + "epoch": 4.025212547639988, + "grad_norm": 0.2960152409563197, + "learning_rate": 0.00046937064059415936, + "loss": 3.0535082817077637, + "step": 6867, + "token_acc": 0.2923429562631644 + }, + { + "epoch": 4.025798885957197, + "grad_norm": 0.2861481980028111, + "learning_rate": 0.00046935901854614333, + "loss": 3.020144462585449, + "step": 6868, + "token_acc": 0.29523312839874344 + }, + { + "epoch": 4.0263852242744065, + "grad_norm": 0.20496457856794262, + "learning_rate": 0.00046934739443754486, + "loss": 3.0126473903656006, + "step": 6869, + "token_acc": 0.2960968938425475 + }, + { + "epoch": 4.026971562591616, + "grad_norm": 0.22771350289531408, + "learning_rate": 0.00046933576826847323, + "loss": 2.9915900230407715, + "step": 6870, + "token_acc": 0.30009344188344467 + }, + { + "epoch": 4.027557900908825, + "grad_norm": 0.19861864730551765, + "learning_rate": 0.0004693241400390377, + "loss": 2.9985527992248535, + "step": 6871, + "token_acc": 0.2989354973629714 + }, + { + "epoch": 4.028144239226034, + "grad_norm": 0.3103334622836779, + "learning_rate": 0.00046931250974934733, + "loss": 3.0265254974365234, + "step": 6872, + "token_acc": 0.2956640482547322 + }, + { + "epoch": 4.028730577543242, + "grad_norm": 0.25773326840042843, + "learning_rate": 0.0004693008773995114, + "loss": 3.0148262977600098, + "step": 6873, + "token_acc": 0.2984690942184378 + }, + { + "epoch": 4.029316915860451, + "grad_norm": 0.2649035462786855, + "learning_rate": 0.0004692892429896394, + "loss": 3.0531787872314453, + "step": 6874, + "token_acc": 0.29167547952571105 + }, + { + "epoch": 4.02990325417766, + "grad_norm": 0.278222421407958, + "learning_rate": 0.0004692776065198403, + "loss": 3.005423069000244, + "step": 6875, + "token_acc": 0.29826530234133897 + }, + { + "epoch": 4.030489592494869, + "grad_norm": 0.24209249537953462, + "learning_rate": 0.00046926596799022364, + "loss": 3.0543293952941895, + "step": 6876, + "token_acc": 0.2919332278853318 + }, + { + "epoch": 4.0310759308120785, + "grad_norm": 0.2978605655946107, + "learning_rate": 0.00046925432740089866, + "loss": 2.9957046508789062, + "step": 6877, + "token_acc": 0.29955352697965193 + }, + { + "epoch": 4.031662269129288, + "grad_norm": 0.18059172446797572, + "learning_rate": 0.0004692426847519747, + "loss": 3.038783073425293, + "step": 6878, + "token_acc": 0.29602598444556594 + }, + { + "epoch": 4.032248607446497, + "grad_norm": 0.21756620978469665, + "learning_rate": 0.0004692310400435612, + "loss": 3.0213725566864014, + "step": 6879, + "token_acc": 0.29742703734387615 + }, + { + "epoch": 4.032834945763706, + "grad_norm": 0.21685630435933803, + "learning_rate": 0.0004692193932757676, + "loss": 3.037707567214966, + "step": 6880, + "token_acc": 0.29347050211276043 + }, + { + "epoch": 4.033421284080915, + "grad_norm": 0.21359526064619144, + "learning_rate": 0.000469207744448703, + "loss": 3.0247395038604736, + "step": 6881, + "token_acc": 0.2965832871782505 + }, + { + "epoch": 4.034007622398124, + "grad_norm": 0.22520771511111365, + "learning_rate": 0.00046919609356247717, + "loss": 3.0610897541046143, + "step": 6882, + "token_acc": 0.29090531007896603 + }, + { + "epoch": 4.034593960715333, + "grad_norm": 0.20290081837452523, + "learning_rate": 0.00046918444061719934, + "loss": 2.9987564086914062, + "step": 6883, + "token_acc": 0.29979073384058197 + }, + { + "epoch": 4.035180299032541, + "grad_norm": 0.2451934884490686, + "learning_rate": 0.00046917278561297904, + "loss": 3.043059825897217, + "step": 6884, + "token_acc": 0.292794214786145 + }, + { + "epoch": 4.0357666373497505, + "grad_norm": 0.22227876744965416, + "learning_rate": 0.00046916112854992575, + "loss": 2.9726884365081787, + "step": 6885, + "token_acc": 0.30394178741136824 + }, + { + "epoch": 4.03635297566696, + "grad_norm": 0.2576036925090188, + "learning_rate": 0.000469149469428149, + "loss": 3.020151376724243, + "step": 6886, + "token_acc": 0.29739871353971087 + }, + { + "epoch": 4.036939313984169, + "grad_norm": 0.1998107795688115, + "learning_rate": 0.00046913780824775833, + "loss": 3.0467567443847656, + "step": 6887, + "token_acc": 0.2910934332431466 + }, + { + "epoch": 4.037525652301378, + "grad_norm": 0.23337729140136795, + "learning_rate": 0.0004691261450088632, + "loss": 3.01007342338562, + "step": 6888, + "token_acc": 0.29989450085718056 + }, + { + "epoch": 4.038111990618587, + "grad_norm": 0.19617003535014987, + "learning_rate": 0.00046911447971157317, + "loss": 3.027754783630371, + "step": 6889, + "token_acc": 0.2948005332786381 + }, + { + "epoch": 4.038698328935796, + "grad_norm": 0.18972949392499389, + "learning_rate": 0.00046910281235599795, + "loss": 3.038025379180908, + "step": 6890, + "token_acc": 0.2946342430105396 + }, + { + "epoch": 4.039284667253005, + "grad_norm": 0.20987356005107077, + "learning_rate": 0.00046909114294224705, + "loss": 3.0281636714935303, + "step": 6891, + "token_acc": 0.29545131724047097 + }, + { + "epoch": 4.039871005570214, + "grad_norm": 0.20144081539719003, + "learning_rate": 0.00046907947147043004, + "loss": 3.0218405723571777, + "step": 6892, + "token_acc": 0.29522746489039203 + }, + { + "epoch": 4.040457343887423, + "grad_norm": 0.17852191284728103, + "learning_rate": 0.0004690677979406566, + "loss": 3.032956600189209, + "step": 6893, + "token_acc": 0.29439937922192044 + }, + { + "epoch": 4.0410436822046325, + "grad_norm": 0.206433891291475, + "learning_rate": 0.0004690561223530364, + "loss": 3.0238828659057617, + "step": 6894, + "token_acc": 0.2953048607508236 + }, + { + "epoch": 4.041630020521841, + "grad_norm": 0.22214700093232284, + "learning_rate": 0.0004690444447076792, + "loss": 3.0033318996429443, + "step": 6895, + "token_acc": 0.2993316463979356 + }, + { + "epoch": 4.04221635883905, + "grad_norm": 0.17651990440778284, + "learning_rate": 0.0004690327650046945, + "loss": 3.054837226867676, + "step": 6896, + "token_acc": 0.2911309698899148 + }, + { + "epoch": 4.042802697156259, + "grad_norm": 0.24616805696331384, + "learning_rate": 0.0004690210832441922, + "loss": 3.043307304382324, + "step": 6897, + "token_acc": 0.2925626425989864 + }, + { + "epoch": 4.043389035473468, + "grad_norm": 0.2441945725380238, + "learning_rate": 0.000469009399426282, + "loss": 3.0131282806396484, + "step": 6898, + "token_acc": 0.2974898018180066 + }, + { + "epoch": 4.043975373790677, + "grad_norm": 0.175712746485911, + "learning_rate": 0.00046899771355107345, + "loss": 2.9759538173675537, + "step": 6899, + "token_acc": 0.3021990449542427 + }, + { + "epoch": 4.044561712107886, + "grad_norm": 0.24009947878923604, + "learning_rate": 0.00046898602561867657, + "loss": 3.000570297241211, + "step": 6900, + "token_acc": 0.29952857427842466 + }, + { + "epoch": 4.045148050425095, + "grad_norm": 0.2324834753336778, + "learning_rate": 0.0004689743356292011, + "loss": 3.0600855350494385, + "step": 6901, + "token_acc": 0.29177262843092344 + }, + { + "epoch": 4.0457343887423045, + "grad_norm": 0.183741354758939, + "learning_rate": 0.00046896264358275676, + "loss": 2.99874210357666, + "step": 6902, + "token_acc": 0.2999199936650274 + }, + { + "epoch": 4.046320727059514, + "grad_norm": 0.26727244927978455, + "learning_rate": 0.0004689509494794535, + "loss": 3.0150675773620605, + "step": 6903, + "token_acc": 0.29761275062625375 + }, + { + "epoch": 4.046907065376723, + "grad_norm": 0.25032008672551587, + "learning_rate": 0.0004689392533194011, + "loss": 3.0060577392578125, + "step": 6904, + "token_acc": 0.29764915118869306 + }, + { + "epoch": 4.047493403693931, + "grad_norm": 0.18609127997940217, + "learning_rate": 0.00046892755510270935, + "loss": 2.997628927230835, + "step": 6905, + "token_acc": 0.2988568791614477 + }, + { + "epoch": 4.04807974201114, + "grad_norm": 0.2354162334673799, + "learning_rate": 0.00046891585482948826, + "loss": 3.01798677444458, + "step": 6906, + "token_acc": 0.2965549797217386 + }, + { + "epoch": 4.048666080328349, + "grad_norm": 0.24765067703019808, + "learning_rate": 0.0004689041524998478, + "loss": 3.042515277862549, + "step": 6907, + "token_acc": 0.29236058516881774 + }, + { + "epoch": 4.049252418645558, + "grad_norm": 0.19434951343111195, + "learning_rate": 0.00046889244811389776, + "loss": 3.006464958190918, + "step": 6908, + "token_acc": 0.29977633958313504 + }, + { + "epoch": 4.049838756962767, + "grad_norm": 0.21516852405378936, + "learning_rate": 0.0004688807416717481, + "loss": 3.0086557865142822, + "step": 6909, + "token_acc": 0.29720220116237567 + }, + { + "epoch": 4.0504250952799765, + "grad_norm": 0.1889444532997563, + "learning_rate": 0.0004688690331735088, + "loss": 3.012896776199341, + "step": 6910, + "token_acc": 0.298572178612132 + }, + { + "epoch": 4.051011433597186, + "grad_norm": 0.19430157426751662, + "learning_rate": 0.00046885732261928994, + "loss": 3.023897409439087, + "step": 6911, + "token_acc": 0.29463307222130825 + }, + { + "epoch": 4.051597771914395, + "grad_norm": 0.22192942669809293, + "learning_rate": 0.0004688456100092015, + "loss": 3.054649829864502, + "step": 6912, + "token_acc": 0.2913921472684714 + }, + { + "epoch": 4.052184110231604, + "grad_norm": 0.19648577572836648, + "learning_rate": 0.00046883389534335327, + "loss": 3.0402438640594482, + "step": 6913, + "token_acc": 0.295317558497618 + }, + { + "epoch": 4.052770448548813, + "grad_norm": 0.2226932974454573, + "learning_rate": 0.00046882217862185557, + "loss": 3.0250353813171387, + "step": 6914, + "token_acc": 0.29640651755874853 + }, + { + "epoch": 4.053356786866022, + "grad_norm": 0.2060785075528739, + "learning_rate": 0.00046881045984481837, + "loss": 3.006999969482422, + "step": 6915, + "token_acc": 0.29778659851071165 + }, + { + "epoch": 4.05394312518323, + "grad_norm": 0.19013061283197213, + "learning_rate": 0.00046879873901235173, + "loss": 2.949985980987549, + "step": 6916, + "token_acc": 0.30683229477546314 + }, + { + "epoch": 4.054529463500439, + "grad_norm": 0.22219645702940816, + "learning_rate": 0.00046878701612456574, + "loss": 3.033959150314331, + "step": 6917, + "token_acc": 0.29532168252842006 + }, + { + "epoch": 4.0551158018176485, + "grad_norm": 0.19965449878892974, + "learning_rate": 0.0004687752911815706, + "loss": 3.0350377559661865, + "step": 6918, + "token_acc": 0.293743314583287 + }, + { + "epoch": 4.055702140134858, + "grad_norm": 0.19549447919118917, + "learning_rate": 0.00046876356418347644, + "loss": 3.0276265144348145, + "step": 6919, + "token_acc": 0.29501917717719295 + }, + { + "epoch": 4.056288478452067, + "grad_norm": 0.18212262711844912, + "learning_rate": 0.00046875183513039333, + "loss": 3.014735221862793, + "step": 6920, + "token_acc": 0.29793146753711786 + }, + { + "epoch": 4.056874816769276, + "grad_norm": 0.18243033166319184, + "learning_rate": 0.0004687401040224315, + "loss": 3.0087013244628906, + "step": 6921, + "token_acc": 0.29906942969297123 + }, + { + "epoch": 4.057461155086485, + "grad_norm": 0.19626669876873382, + "learning_rate": 0.0004687283708597012, + "loss": 3.0438191890716553, + "step": 6922, + "token_acc": 0.2931826073536788 + }, + { + "epoch": 4.058047493403694, + "grad_norm": 0.22150554911158846, + "learning_rate": 0.00046871663564231254, + "loss": 3.0479063987731934, + "step": 6923, + "token_acc": 0.2920175835010887 + }, + { + "epoch": 4.058633831720903, + "grad_norm": 0.22735947314716726, + "learning_rate": 0.00046870489837037583, + "loss": 3.063234329223633, + "step": 6924, + "token_acc": 0.28998517458207657 + }, + { + "epoch": 4.059220170038112, + "grad_norm": 0.22102872460792763, + "learning_rate": 0.0004686931590440013, + "loss": 3.068086624145508, + "step": 6925, + "token_acc": 0.2878088784921083 + }, + { + "epoch": 4.059806508355321, + "grad_norm": 0.19411663310453908, + "learning_rate": 0.00046868141766329927, + "loss": 3.0074307918548584, + "step": 6926, + "token_acc": 0.29875983600295397 + }, + { + "epoch": 4.06039284667253, + "grad_norm": 0.17444384316267805, + "learning_rate": 0.0004686696742283799, + "loss": 2.984783172607422, + "step": 6927, + "token_acc": 0.3023880326092179 + }, + { + "epoch": 4.060979184989739, + "grad_norm": 0.19375440610173533, + "learning_rate": 0.0004686579287393537, + "loss": 3.0217232704162598, + "step": 6928, + "token_acc": 0.2973614188444067 + }, + { + "epoch": 4.061565523306948, + "grad_norm": 0.17997940963603093, + "learning_rate": 0.0004686461811963309, + "loss": 3.051995277404785, + "step": 6929, + "token_acc": 0.2927326467586518 + }, + { + "epoch": 4.062151861624157, + "grad_norm": 0.2288387475781029, + "learning_rate": 0.00046863443159942184, + "loss": 2.970278263092041, + "step": 6930, + "token_acc": 0.30419211444258354 + }, + { + "epoch": 4.062738199941366, + "grad_norm": 0.2366988918868131, + "learning_rate": 0.0004686226799487369, + "loss": 3.0362985134124756, + "step": 6931, + "token_acc": 0.29453910708291375 + }, + { + "epoch": 4.063324538258575, + "grad_norm": 0.18709339768117442, + "learning_rate": 0.0004686109262443865, + "loss": 3.0148675441741943, + "step": 6932, + "token_acc": 0.2977506943243436 + }, + { + "epoch": 4.063910876575784, + "grad_norm": 0.19991857977372324, + "learning_rate": 0.00046859917048648093, + "loss": 2.981264114379883, + "step": 6933, + "token_acc": 0.300840048358559 + }, + { + "epoch": 4.064497214892993, + "grad_norm": 0.21963186365571524, + "learning_rate": 0.00046858741267513085, + "loss": 3.0716636180877686, + "step": 6934, + "token_acc": 0.28713313762203063 + }, + { + "epoch": 4.0650835532102025, + "grad_norm": 0.2070139372725275, + "learning_rate": 0.0004685756528104465, + "loss": 2.9902968406677246, + "step": 6935, + "token_acc": 0.2996819271682306 + }, + { + "epoch": 4.065669891527412, + "grad_norm": 0.20291388398468088, + "learning_rate": 0.00046856389089253847, + "loss": 3.016145706176758, + "step": 6936, + "token_acc": 0.29670733619119183 + }, + { + "epoch": 4.066256229844621, + "grad_norm": 0.2087869225600144, + "learning_rate": 0.00046855212692151714, + "loss": 3.0176455974578857, + "step": 6937, + "token_acc": 0.29703672375761964 + }, + { + "epoch": 4.066842568161829, + "grad_norm": 0.2625364593450508, + "learning_rate": 0.0004685403608974932, + "loss": 3.0330357551574707, + "step": 6938, + "token_acc": 0.29709014992177546 + }, + { + "epoch": 4.067428906479038, + "grad_norm": 0.24001909845134076, + "learning_rate": 0.000468528592820577, + "loss": 3.039304733276367, + "step": 6939, + "token_acc": 0.29373897683558337 + }, + { + "epoch": 4.068015244796247, + "grad_norm": 0.18300065820668468, + "learning_rate": 0.0004685168226908791, + "loss": 3.045700788497925, + "step": 6940, + "token_acc": 0.2937364389721068 + }, + { + "epoch": 4.068601583113456, + "grad_norm": 0.2226301329618281, + "learning_rate": 0.0004685050505085101, + "loss": 3.0371899604797363, + "step": 6941, + "token_acc": 0.2920033768357562 + }, + { + "epoch": 4.069187921430665, + "grad_norm": 0.24995116450718977, + "learning_rate": 0.0004684932762735806, + "loss": 3.035311698913574, + "step": 6942, + "token_acc": 0.29509077037182757 + }, + { + "epoch": 4.0697742597478745, + "grad_norm": 0.21497200172938793, + "learning_rate": 0.0004684814999862013, + "loss": 3.020979404449463, + "step": 6943, + "token_acc": 0.29604287568821225 + }, + { + "epoch": 4.070360598065084, + "grad_norm": 0.19988610615377478, + "learning_rate": 0.0004684697216464826, + "loss": 2.9862046241760254, + "step": 6944, + "token_acc": 0.3006142949677721 + }, + { + "epoch": 4.070946936382293, + "grad_norm": 0.22694268163892065, + "learning_rate": 0.0004684579412545353, + "loss": 2.987281322479248, + "step": 6945, + "token_acc": 0.30181648812296225 + }, + { + "epoch": 4.071533274699502, + "grad_norm": 0.18285552289015322, + "learning_rate": 0.00046844615881047, + "loss": 2.980762243270874, + "step": 6946, + "token_acc": 0.3022050025639565 + }, + { + "epoch": 4.072119613016711, + "grad_norm": 0.2054839152647431, + "learning_rate": 0.0004684343743143974, + "loss": 3.023253917694092, + "step": 6947, + "token_acc": 0.2975228989944751 + }, + { + "epoch": 4.07270595133392, + "grad_norm": 0.1736171316412354, + "learning_rate": 0.0004684225877664282, + "loss": 3.0151586532592773, + "step": 6948, + "token_acc": 0.29745865161076057 + }, + { + "epoch": 4.073292289651128, + "grad_norm": 0.22389806306314175, + "learning_rate": 0.0004684107991666731, + "loss": 3.0390310287475586, + "step": 6949, + "token_acc": 0.2945940274800726 + }, + { + "epoch": 4.073878627968337, + "grad_norm": 0.21184294745580443, + "learning_rate": 0.00046839900851524286, + "loss": 2.989704132080078, + "step": 6950, + "token_acc": 0.30116297356587474 + }, + { + "epoch": 4.0744649662855466, + "grad_norm": 0.20746140767417867, + "learning_rate": 0.00046838721581224824, + "loss": 3.040753126144409, + "step": 6951, + "token_acc": 0.29387476718868866 + }, + { + "epoch": 4.075051304602756, + "grad_norm": 0.29710964299337256, + "learning_rate": 0.0004683754210578, + "loss": 3.0222530364990234, + "step": 6952, + "token_acc": 0.2965594844159549 + }, + { + "epoch": 4.075637642919965, + "grad_norm": 0.18860422709293817, + "learning_rate": 0.00046836362425200893, + "loss": 3.063882827758789, + "step": 6953, + "token_acc": 0.29041648997473846 + }, + { + "epoch": 4.076223981237174, + "grad_norm": 0.276509277095217, + "learning_rate": 0.0004683518253949859, + "loss": 2.996743679046631, + "step": 6954, + "token_acc": 0.3003923220727244 + }, + { + "epoch": 4.076810319554383, + "grad_norm": 0.26764628636433185, + "learning_rate": 0.00046834002448684173, + "loss": 3.0060930252075195, + "step": 6955, + "token_acc": 0.29875557158683946 + }, + { + "epoch": 4.077396657871592, + "grad_norm": 0.1932403849426315, + "learning_rate": 0.00046832822152768715, + "loss": 2.9837794303894043, + "step": 6956, + "token_acc": 0.30224323337039616 + }, + { + "epoch": 4.077982996188801, + "grad_norm": 0.23859170421072165, + "learning_rate": 0.00046831641651763324, + "loss": 3.035407781600952, + "step": 6957, + "token_acc": 0.2947849489134109 + }, + { + "epoch": 4.07856933450601, + "grad_norm": 0.2052574764075219, + "learning_rate": 0.00046830460945679065, + "loss": 3.0555951595306396, + "step": 6958, + "token_acc": 0.29304932392079996 + }, + { + "epoch": 4.0791556728232194, + "grad_norm": 0.2280025103484422, + "learning_rate": 0.00046829280034527056, + "loss": 3.0079140663146973, + "step": 6959, + "token_acc": 0.2987068084633035 + }, + { + "epoch": 4.079742011140428, + "grad_norm": 0.2635089767326323, + "learning_rate": 0.00046828098918318367, + "loss": 3.005866050720215, + "step": 6960, + "token_acc": 0.299281689315988 + }, + { + "epoch": 4.080328349457637, + "grad_norm": 0.19025951773023753, + "learning_rate": 0.00046826917597064105, + "loss": 3.0000319480895996, + "step": 6961, + "token_acc": 0.3006623697418174 + }, + { + "epoch": 4.080914687774846, + "grad_norm": 0.2016138841141072, + "learning_rate": 0.0004682573607077537, + "loss": 3.0290088653564453, + "step": 6962, + "token_acc": 0.29619968493963134 + }, + { + "epoch": 4.081501026092055, + "grad_norm": 0.20047344519557195, + "learning_rate": 0.00046824554339463246, + "loss": 3.019904136657715, + "step": 6963, + "token_acc": 0.29873826306209683 + }, + { + "epoch": 4.082087364409264, + "grad_norm": 0.1820617028158984, + "learning_rate": 0.0004682337240313884, + "loss": 3.007856845855713, + "step": 6964, + "token_acc": 0.2977153723639424 + }, + { + "epoch": 4.082673702726473, + "grad_norm": 0.19729080801125376, + "learning_rate": 0.0004682219026181327, + "loss": 3.0237984657287598, + "step": 6965, + "token_acc": 0.29659095075235814 + }, + { + "epoch": 4.083260041043682, + "grad_norm": 0.18015351188053685, + "learning_rate": 0.00046821007915497624, + "loss": 2.9836111068725586, + "step": 6966, + "token_acc": 0.3020407948608047 + }, + { + "epoch": 4.0838463793608915, + "grad_norm": 0.21627990643049952, + "learning_rate": 0.0004681982536420301, + "loss": 3.01456356048584, + "step": 6967, + "token_acc": 0.2977896329492328 + }, + { + "epoch": 4.084432717678101, + "grad_norm": 0.18436026984896042, + "learning_rate": 0.0004681864260794054, + "loss": 3.0086987018585205, + "step": 6968, + "token_acc": 0.29730919068635697 + }, + { + "epoch": 4.08501905599531, + "grad_norm": 0.1944022629982658, + "learning_rate": 0.00046817459646721326, + "loss": 3.0141165256500244, + "step": 6969, + "token_acc": 0.2971052221941911 + }, + { + "epoch": 4.085605394312518, + "grad_norm": 0.1918470188300405, + "learning_rate": 0.00046816276480556474, + "loss": 3.0507915019989014, + "step": 6970, + "token_acc": 0.29311558603364973 + }, + { + "epoch": 4.086191732629727, + "grad_norm": 0.1740556453643377, + "learning_rate": 0.0004681509310945711, + "loss": 3.0208933353424072, + "step": 6971, + "token_acc": 0.29482196426242335 + }, + { + "epoch": 4.086778070946936, + "grad_norm": 0.19032179316072878, + "learning_rate": 0.00046813909533434335, + "loss": 3.01246976852417, + "step": 6972, + "token_acc": 0.2965107650488066 + }, + { + "epoch": 4.087364409264145, + "grad_norm": 0.21728345570591184, + "learning_rate": 0.00046812725752499274, + "loss": 3.0037271976470947, + "step": 6973, + "token_acc": 0.2987636518999172 + }, + { + "epoch": 4.087950747581354, + "grad_norm": 0.2524870985994271, + "learning_rate": 0.0004681154176666305, + "loss": 3.028642177581787, + "step": 6974, + "token_acc": 0.2961405392507755 + }, + { + "epoch": 4.0885370858985635, + "grad_norm": 0.18524157869353852, + "learning_rate": 0.00046810357575936785, + "loss": 3.0291829109191895, + "step": 6975, + "token_acc": 0.2956931013254993 + }, + { + "epoch": 4.089123424215773, + "grad_norm": 0.17659972104074667, + "learning_rate": 0.000468091731803316, + "loss": 3.018859624862671, + "step": 6976, + "token_acc": 0.29561614632574973 + }, + { + "epoch": 4.089709762532982, + "grad_norm": 0.22539728764593853, + "learning_rate": 0.00046807988579858616, + "loss": 2.992833137512207, + "step": 6977, + "token_acc": 0.2999649081763949 + }, + { + "epoch": 4.090296100850191, + "grad_norm": 0.30569418253105196, + "learning_rate": 0.00046806803774528973, + "loss": 2.9887354373931885, + "step": 6978, + "token_acc": 0.3009159266741334 + }, + { + "epoch": 4.0908824391674, + "grad_norm": 0.2621945734935609, + "learning_rate": 0.00046805618764353787, + "loss": 3.0711047649383545, + "step": 6979, + "token_acc": 0.2881126315707986 + }, + { + "epoch": 4.091468777484609, + "grad_norm": 0.20531612116435735, + "learning_rate": 0.000468044335493442, + "loss": 3.019559860229492, + "step": 6980, + "token_acc": 0.29795473704688413 + }, + { + "epoch": 4.092055115801817, + "grad_norm": 0.22166480049157664, + "learning_rate": 0.0004680324812951134, + "loss": 3.047178030014038, + "step": 6981, + "token_acc": 0.2938977152407935 + }, + { + "epoch": 4.092641454119026, + "grad_norm": 0.2121211297742218, + "learning_rate": 0.00046802062504866347, + "loss": 3.0059547424316406, + "step": 6982, + "token_acc": 0.2987444083228393 + }, + { + "epoch": 4.0932277924362355, + "grad_norm": 0.18462009581558061, + "learning_rate": 0.00046800876675420354, + "loss": 3.023655414581299, + "step": 6983, + "token_acc": 0.2969322815833917 + }, + { + "epoch": 4.093814130753445, + "grad_norm": 0.21173537085481642, + "learning_rate": 0.000467996906411845, + "loss": 3.017996072769165, + "step": 6984, + "token_acc": 0.2972103843806563 + }, + { + "epoch": 4.094400469070654, + "grad_norm": 0.20464311722094766, + "learning_rate": 0.0004679850440216993, + "loss": 3.0208888053894043, + "step": 6985, + "token_acc": 0.2974608540494931 + }, + { + "epoch": 4.094986807387863, + "grad_norm": 0.1949518577052013, + "learning_rate": 0.0004679731795838779, + "loss": 3.0158894062042236, + "step": 6986, + "token_acc": 0.29754929260761226 + }, + { + "epoch": 4.095573145705072, + "grad_norm": 0.2683485014340852, + "learning_rate": 0.00046796131309849204, + "loss": 2.9732635021209717, + "step": 6987, + "token_acc": 0.30163195490339517 + }, + { + "epoch": 4.096159484022281, + "grad_norm": 0.19544464452482588, + "learning_rate": 0.00046794944456565344, + "loss": 3.0257205963134766, + "step": 6988, + "token_acc": 0.2946038875513207 + }, + { + "epoch": 4.09674582233949, + "grad_norm": 0.23640717720282306, + "learning_rate": 0.00046793757398547355, + "loss": 3.0117897987365723, + "step": 6989, + "token_acc": 0.2983151154517819 + }, + { + "epoch": 4.097332160656699, + "grad_norm": 0.2707466030490996, + "learning_rate": 0.0004679257013580638, + "loss": 3.0323681831359863, + "step": 6990, + "token_acc": 0.29341487952858564 + }, + { + "epoch": 4.097918498973908, + "grad_norm": 0.1836026728924073, + "learning_rate": 0.0004679138266835357, + "loss": 2.995007038116455, + "step": 6991, + "token_acc": 0.3000697303927806 + }, + { + "epoch": 4.098504837291117, + "grad_norm": 0.2678182527150261, + "learning_rate": 0.0004679019499620009, + "loss": 3.0316834449768066, + "step": 6992, + "token_acc": 0.29499220623026834 + }, + { + "epoch": 4.099091175608326, + "grad_norm": 0.2301636007508659, + "learning_rate": 0.00046789007119357084, + "loss": 3.0489447116851807, + "step": 6993, + "token_acc": 0.29207601971565733 + }, + { + "epoch": 4.099677513925535, + "grad_norm": 0.21162966333927336, + "learning_rate": 0.0004678781903783572, + "loss": 3.0323922634124756, + "step": 6994, + "token_acc": 0.29403135457397767 + }, + { + "epoch": 4.100263852242744, + "grad_norm": 0.3333828792077415, + "learning_rate": 0.0004678663075164716, + "loss": 3.0274620056152344, + "step": 6995, + "token_acc": 0.29581163590056564 + }, + { + "epoch": 4.100850190559953, + "grad_norm": 0.1681058846455195, + "learning_rate": 0.0004678544226080256, + "loss": 3.010754108428955, + "step": 6996, + "token_acc": 0.2970315127586459 + }, + { + "epoch": 4.101436528877162, + "grad_norm": 0.267796528364808, + "learning_rate": 0.00046784253565313084, + "loss": 3.020566940307617, + "step": 6997, + "token_acc": 0.29677590623714145 + }, + { + "epoch": 4.102022867194371, + "grad_norm": 0.23824032227667175, + "learning_rate": 0.000467830646651899, + "loss": 3.040004253387451, + "step": 6998, + "token_acc": 0.29354891818114226 + }, + { + "epoch": 4.10260920551158, + "grad_norm": 0.23235772502746654, + "learning_rate": 0.00046781875560444175, + "loss": 3.039081573486328, + "step": 6999, + "token_acc": 0.2932035022768879 + }, + { + "epoch": 4.1031955438287895, + "grad_norm": 0.22219244939826613, + "learning_rate": 0.00046780686251087086, + "loss": 3.031935691833496, + "step": 7000, + "token_acc": 0.29485961799394633 + }, + { + "epoch": 4.103781882145999, + "grad_norm": 0.18284440256275608, + "learning_rate": 0.0004677949673712979, + "loss": 3.0427985191345215, + "step": 7001, + "token_acc": 0.29369891272108317 + }, + { + "epoch": 4.104368220463208, + "grad_norm": 0.21760512164415677, + "learning_rate": 0.0004677830701858348, + "loss": 3.030557155609131, + "step": 7002, + "token_acc": 0.2954305546258297 + }, + { + "epoch": 4.104954558780416, + "grad_norm": 0.1733596256474521, + "learning_rate": 0.0004677711709545932, + "loss": 3.024519443511963, + "step": 7003, + "token_acc": 0.2940926970405417 + }, + { + "epoch": 4.105540897097625, + "grad_norm": 0.25388201387907494, + "learning_rate": 0.0004677592696776849, + "loss": 3.040799379348755, + "step": 7004, + "token_acc": 0.2931021595925974 + }, + { + "epoch": 4.106127235414834, + "grad_norm": 0.2078727164356633, + "learning_rate": 0.0004677473663552217, + "loss": 3.069350242614746, + "step": 7005, + "token_acc": 0.2919334588006818 + }, + { + "epoch": 4.106713573732043, + "grad_norm": 0.20615525699195483, + "learning_rate": 0.0004677354609873153, + "loss": 3.0088462829589844, + "step": 7006, + "token_acc": 0.2999498756967656 + }, + { + "epoch": 4.107299912049252, + "grad_norm": 0.21538929123990208, + "learning_rate": 0.0004677235535740778, + "loss": 3.0167691707611084, + "step": 7007, + "token_acc": 0.29699274894030075 + }, + { + "epoch": 4.1078862503664615, + "grad_norm": 0.2331754784166338, + "learning_rate": 0.00046771164411562086, + "loss": 3.0611116886138916, + "step": 7008, + "token_acc": 0.29120035903216074 + }, + { + "epoch": 4.108472588683671, + "grad_norm": 0.245061151178424, + "learning_rate": 0.00046769973261205633, + "loss": 2.997422218322754, + "step": 7009, + "token_acc": 0.29964422325504875 + }, + { + "epoch": 4.10905892700088, + "grad_norm": 0.21695938857743266, + "learning_rate": 0.0004676878190634963, + "loss": 3.0566868782043457, + "step": 7010, + "token_acc": 0.2918127681372863 + }, + { + "epoch": 4.109645265318089, + "grad_norm": 0.23389780342873942, + "learning_rate": 0.0004676759034700524, + "loss": 3.025515079498291, + "step": 7011, + "token_acc": 0.2969554855078035 + }, + { + "epoch": 4.110231603635298, + "grad_norm": 0.21530993723750178, + "learning_rate": 0.0004676639858318368, + "loss": 3.0238678455352783, + "step": 7012, + "token_acc": 0.2953092622668832 + }, + { + "epoch": 4.110817941952506, + "grad_norm": 0.2092232079236262, + "learning_rate": 0.0004676520661489613, + "loss": 2.995426654815674, + "step": 7013, + "token_acc": 0.30052392801218386 + }, + { + "epoch": 4.111404280269715, + "grad_norm": 0.22172985277176535, + "learning_rate": 0.00046764014442153795, + "loss": 3.0704421997070312, + "step": 7014, + "token_acc": 0.290688261338855 + }, + { + "epoch": 4.111990618586924, + "grad_norm": 0.1970558360186922, + "learning_rate": 0.00046762822064967875, + "loss": 3.048707962036133, + "step": 7015, + "token_acc": 0.29292835890713476 + }, + { + "epoch": 4.1125769569041335, + "grad_norm": 0.2358926141520634, + "learning_rate": 0.00046761629483349563, + "loss": 3.0217857360839844, + "step": 7016, + "token_acc": 0.29722266626101573 + }, + { + "epoch": 4.113163295221343, + "grad_norm": 0.2029791189883375, + "learning_rate": 0.0004676043669731007, + "loss": 3.0441980361938477, + "step": 7017, + "token_acc": 0.29347729864557354 + }, + { + "epoch": 4.113749633538552, + "grad_norm": 0.19396491898229826, + "learning_rate": 0.00046759243706860594, + "loss": 3.020205020904541, + "step": 7018, + "token_acc": 0.2940790661610851 + }, + { + "epoch": 4.114335971855761, + "grad_norm": 0.21385889856847917, + "learning_rate": 0.00046758050512012346, + "loss": 3.0264432430267334, + "step": 7019, + "token_acc": 0.2973115362123375 + }, + { + "epoch": 4.11492231017297, + "grad_norm": 0.21052951369067843, + "learning_rate": 0.00046756857112776527, + "loss": 3.0004782676696777, + "step": 7020, + "token_acc": 0.2981847379604314 + }, + { + "epoch": 4.115508648490179, + "grad_norm": 0.2276907459739103, + "learning_rate": 0.0004675566350916436, + "loss": 3.0024402141571045, + "step": 7021, + "token_acc": 0.3000336590692036 + }, + { + "epoch": 4.116094986807388, + "grad_norm": 0.17915423417343723, + "learning_rate": 0.0004675446970118705, + "loss": 3.0221309661865234, + "step": 7022, + "token_acc": 0.29533178398225673 + }, + { + "epoch": 4.116681325124597, + "grad_norm": 0.2214559681794065, + "learning_rate": 0.0004675327568885581, + "loss": 2.99198579788208, + "step": 7023, + "token_acc": 0.30182483857600695 + }, + { + "epoch": 4.1172676634418055, + "grad_norm": 0.1704442329849183, + "learning_rate": 0.0004675208147218186, + "loss": 3.045954465866089, + "step": 7024, + "token_acc": 0.2944355650576526 + }, + { + "epoch": 4.117854001759015, + "grad_norm": 0.20964180464879437, + "learning_rate": 0.0004675088705117642, + "loss": 2.953301429748535, + "step": 7025, + "token_acc": 0.3070579103117392 + }, + { + "epoch": 4.118440340076224, + "grad_norm": 0.2237840178751719, + "learning_rate": 0.00046749692425850694, + "loss": 3.055589199066162, + "step": 7026, + "token_acc": 0.29000910147223524 + }, + { + "epoch": 4.119026678393433, + "grad_norm": 0.20040454860254014, + "learning_rate": 0.00046748497596215923, + "loss": 3.0536410808563232, + "step": 7027, + "token_acc": 0.29176031449787554 + }, + { + "epoch": 4.119613016710642, + "grad_norm": 0.22960995696203798, + "learning_rate": 0.0004674730256228332, + "loss": 3.030405044555664, + "step": 7028, + "token_acc": 0.295961574259788 + }, + { + "epoch": 4.120199355027851, + "grad_norm": 0.24602932714665643, + "learning_rate": 0.0004674610732406411, + "loss": 3.0607151985168457, + "step": 7029, + "token_acc": 0.2915940105653545 + }, + { + "epoch": 4.12078569334506, + "grad_norm": 0.21155961842110704, + "learning_rate": 0.0004674491188156954, + "loss": 2.997239828109741, + "step": 7030, + "token_acc": 0.29983294129190347 + }, + { + "epoch": 4.121372031662269, + "grad_norm": 0.18861416969373043, + "learning_rate": 0.0004674371623481081, + "loss": 3.010213851928711, + "step": 7031, + "token_acc": 0.2983006104603201 + }, + { + "epoch": 4.121958369979478, + "grad_norm": 0.21770415666674248, + "learning_rate": 0.00046742520383799183, + "loss": 2.9993700981140137, + "step": 7032, + "token_acc": 0.29977478067303914 + }, + { + "epoch": 4.1225447082966875, + "grad_norm": 0.1922978996459676, + "learning_rate": 0.00046741324328545853, + "loss": 2.988560676574707, + "step": 7033, + "token_acc": 0.30240451702234267 + }, + { + "epoch": 4.123131046613897, + "grad_norm": 0.2106380183464224, + "learning_rate": 0.0004674012806906209, + "loss": 3.0431597232818604, + "step": 7034, + "token_acc": 0.29275645190565003 + }, + { + "epoch": 4.123717384931105, + "grad_norm": 0.2521084749734736, + "learning_rate": 0.0004673893160535912, + "loss": 3.027672290802002, + "step": 7035, + "token_acc": 0.2934456837460867 + }, + { + "epoch": 4.124303723248314, + "grad_norm": 0.3203196909258401, + "learning_rate": 0.00046737734937448176, + "loss": 3.071023941040039, + "step": 7036, + "token_acc": 0.28911407835364134 + }, + { + "epoch": 4.124890061565523, + "grad_norm": 0.22570587478439844, + "learning_rate": 0.00046736538065340514, + "loss": 3.0172128677368164, + "step": 7037, + "token_acc": 0.2966379727850771 + }, + { + "epoch": 4.125476399882732, + "grad_norm": 0.24113098085033072, + "learning_rate": 0.0004673534098904736, + "loss": 3.0400915145874023, + "step": 7038, + "token_acc": 0.2960601091014103 + }, + { + "epoch": 4.126062738199941, + "grad_norm": 0.26401919596547946, + "learning_rate": 0.0004673414370857997, + "loss": 3.007387161254883, + "step": 7039, + "token_acc": 0.29966184856119876 + }, + { + "epoch": 4.12664907651715, + "grad_norm": 0.18716402660398995, + "learning_rate": 0.0004673294622394958, + "loss": 3.003145217895508, + "step": 7040, + "token_acc": 0.2997829803163932 + }, + { + "epoch": 4.1272354148343595, + "grad_norm": 0.2716082962372313, + "learning_rate": 0.0004673174853516745, + "loss": 3.008039712905884, + "step": 7041, + "token_acc": 0.2982904955524663 + }, + { + "epoch": 4.127821753151569, + "grad_norm": 0.18188170870095105, + "learning_rate": 0.00046730550642244825, + "loss": 3.0684940814971924, + "step": 7042, + "token_acc": 0.2893988849151192 + }, + { + "epoch": 4.128408091468778, + "grad_norm": 0.2125056665408206, + "learning_rate": 0.0004672935254519296, + "loss": 3.0241904258728027, + "step": 7043, + "token_acc": 0.29768451519536904 + }, + { + "epoch": 4.128994429785987, + "grad_norm": 0.18690014248234924, + "learning_rate": 0.000467281542440231, + "loss": 3.008909225463867, + "step": 7044, + "token_acc": 0.2992523205675567 + }, + { + "epoch": 4.129580768103196, + "grad_norm": 0.21416336044173756, + "learning_rate": 0.00046726955738746526, + "loss": 3.0233187675476074, + "step": 7045, + "token_acc": 0.2950428836288544 + }, + { + "epoch": 4.130167106420404, + "grad_norm": 0.1846573960326613, + "learning_rate": 0.00046725757029374474, + "loss": 2.9919557571411133, + "step": 7046, + "token_acc": 0.30108723905343776 + }, + { + "epoch": 4.130753444737613, + "grad_norm": 0.20571309395340406, + "learning_rate": 0.000467245581159182, + "loss": 3.003404378890991, + "step": 7047, + "token_acc": 0.29898791080082937 + }, + { + "epoch": 4.131339783054822, + "grad_norm": 0.18782420717819068, + "learning_rate": 0.0004672335899838899, + "loss": 3.0275511741638184, + "step": 7048, + "token_acc": 0.29568912852931023 + }, + { + "epoch": 4.1319261213720315, + "grad_norm": 0.2154965341010058, + "learning_rate": 0.00046722159676798093, + "loss": 3.103482246398926, + "step": 7049, + "token_acc": 0.28561526489936306 + }, + { + "epoch": 4.132512459689241, + "grad_norm": 0.18755705767215644, + "learning_rate": 0.0004672096015115678, + "loss": 3.064627170562744, + "step": 7050, + "token_acc": 0.2891123361867217 + }, + { + "epoch": 4.13309879800645, + "grad_norm": 0.21327065598708966, + "learning_rate": 0.00046719760421476305, + "loss": 3.0556278228759766, + "step": 7051, + "token_acc": 0.29301282518779365 + }, + { + "epoch": 4.133685136323659, + "grad_norm": 0.19109782160511382, + "learning_rate": 0.0004671856048776795, + "loss": 3.037001609802246, + "step": 7052, + "token_acc": 0.292851874168883 + }, + { + "epoch": 4.134271474640868, + "grad_norm": 0.19425716655822833, + "learning_rate": 0.00046717360350042995, + "loss": 3.078153371810913, + "step": 7053, + "token_acc": 0.2885069093360297 + }, + { + "epoch": 4.134857812958077, + "grad_norm": 0.19570132400152884, + "learning_rate": 0.0004671616000831269, + "loss": 3.007575750350952, + "step": 7054, + "token_acc": 0.3003465111767369 + }, + { + "epoch": 4.135444151275286, + "grad_norm": 0.18609806584699723, + "learning_rate": 0.00046714959462588335, + "loss": 3.0401804447174072, + "step": 7055, + "token_acc": 0.2939394256804529 + }, + { + "epoch": 4.136030489592494, + "grad_norm": 0.1965627896488159, + "learning_rate": 0.00046713758712881194, + "loss": 3.090794086456299, + "step": 7056, + "token_acc": 0.2864947534795053 + }, + { + "epoch": 4.1366168279097035, + "grad_norm": 0.1876064957362089, + "learning_rate": 0.00046712557759202557, + "loss": 2.988605499267578, + "step": 7057, + "token_acc": 0.3021640287459731 + }, + { + "epoch": 4.137203166226913, + "grad_norm": 0.19365874979699654, + "learning_rate": 0.0004671135660156369, + "loss": 3.0327749252319336, + "step": 7058, + "token_acc": 0.2958143847057601 + }, + { + "epoch": 4.137789504544122, + "grad_norm": 0.17517389753847912, + "learning_rate": 0.00046710155239975884, + "loss": 3.0815582275390625, + "step": 7059, + "token_acc": 0.28933563457413514 + }, + { + "epoch": 4.138375842861331, + "grad_norm": 0.20282444417773865, + "learning_rate": 0.00046708953674450427, + "loss": 3.0498476028442383, + "step": 7060, + "token_acc": 0.29249855181867984 + }, + { + "epoch": 4.13896218117854, + "grad_norm": 0.19830425263450888, + "learning_rate": 0.000467077519049986, + "loss": 2.9706664085388184, + "step": 7061, + "token_acc": 0.3044392066985272 + }, + { + "epoch": 4.139548519495749, + "grad_norm": 0.17767338353855427, + "learning_rate": 0.000467065499316317, + "loss": 3.0146965980529785, + "step": 7062, + "token_acc": 0.2995778556351987 + }, + { + "epoch": 4.140134857812958, + "grad_norm": 0.20777925721691495, + "learning_rate": 0.00046705347754361006, + "loss": 3.0099711418151855, + "step": 7063, + "token_acc": 0.29751095883508843 + }, + { + "epoch": 4.140721196130167, + "grad_norm": 0.22985953041035598, + "learning_rate": 0.0004670414537319783, + "loss": 3.0828890800476074, + "step": 7064, + "token_acc": 0.2888711591282717 + }, + { + "epoch": 4.141307534447376, + "grad_norm": 0.2043645571169984, + "learning_rate": 0.00046702942788153445, + "loss": 3.066344976425171, + "step": 7065, + "token_acc": 0.29120096442598004 + }, + { + "epoch": 4.1418938727645855, + "grad_norm": 0.2160954783105623, + "learning_rate": 0.00046701739999239155, + "loss": 3.0320119857788086, + "step": 7066, + "token_acc": 0.2943595892196215 + }, + { + "epoch": 4.142480211081795, + "grad_norm": 0.2503227641533591, + "learning_rate": 0.0004670053700646627, + "loss": 3.0036163330078125, + "step": 7067, + "token_acc": 0.2991095165578298 + }, + { + "epoch": 4.143066549399003, + "grad_norm": 0.23345882267355128, + "learning_rate": 0.00046699333809846076, + "loss": 3.035076379776001, + "step": 7068, + "token_acc": 0.2954892258412869 + }, + { + "epoch": 4.143652887716212, + "grad_norm": 0.19076298634248695, + "learning_rate": 0.0004669813040938988, + "loss": 3.04569673538208, + "step": 7069, + "token_acc": 0.29291245929452975 + }, + { + "epoch": 4.144239226033421, + "grad_norm": 0.1773726042855761, + "learning_rate": 0.00046696926805108997, + "loss": 3.0345423221588135, + "step": 7070, + "token_acc": 0.2948005656510018 + }, + { + "epoch": 4.14482556435063, + "grad_norm": 0.21569106611411484, + "learning_rate": 0.00046695722997014725, + "loss": 3.0541744232177734, + "step": 7071, + "token_acc": 0.2906861043619114 + }, + { + "epoch": 4.145411902667839, + "grad_norm": 0.20188609011027933, + "learning_rate": 0.00046694518985118357, + "loss": 3.021017551422119, + "step": 7072, + "token_acc": 0.29739049564647213 + }, + { + "epoch": 4.145998240985048, + "grad_norm": 0.23569023334218667, + "learning_rate": 0.00046693314769431223, + "loss": 3.00738525390625, + "step": 7073, + "token_acc": 0.29773470565602156 + }, + { + "epoch": 4.1465845793022575, + "grad_norm": 0.1960733928332109, + "learning_rate": 0.00046692110349964627, + "loss": 3.0366311073303223, + "step": 7074, + "token_acc": 0.2959329238114647 + }, + { + "epoch": 4.147170917619467, + "grad_norm": 0.18832683100035275, + "learning_rate": 0.00046690905726729887, + "loss": 3.0316362380981445, + "step": 7075, + "token_acc": 0.2950518953956778 + }, + { + "epoch": 4.147757255936676, + "grad_norm": 0.2179836225519626, + "learning_rate": 0.0004668970089973832, + "loss": 3.041081190109253, + "step": 7076, + "token_acc": 0.2932194132334582 + }, + { + "epoch": 4.148343594253885, + "grad_norm": 0.22847330709256103, + "learning_rate": 0.00046688495869001234, + "loss": 3.0492234230041504, + "step": 7077, + "token_acc": 0.2919259295491465 + }, + { + "epoch": 4.148929932571093, + "grad_norm": 0.1904728149121686, + "learning_rate": 0.00046687290634529955, + "loss": 3.019866466522217, + "step": 7078, + "token_acc": 0.2959847791732513 + }, + { + "epoch": 4.149516270888302, + "grad_norm": 0.27838198988043333, + "learning_rate": 0.0004668608519633581, + "loss": 3.0496110916137695, + "step": 7079, + "token_acc": 0.2933138531013415 + }, + { + "epoch": 4.150102609205511, + "grad_norm": 0.2767449097879659, + "learning_rate": 0.00046684879554430113, + "loss": 3.046294689178467, + "step": 7080, + "token_acc": 0.29320437770425045 + }, + { + "epoch": 4.15068894752272, + "grad_norm": 0.19460546435211695, + "learning_rate": 0.00046683673708824194, + "loss": 3.066836357116699, + "step": 7081, + "token_acc": 0.29037366385479796 + }, + { + "epoch": 4.1512752858399296, + "grad_norm": 0.29151963843918316, + "learning_rate": 0.0004668246765952938, + "loss": 3.0040342807769775, + "step": 7082, + "token_acc": 0.30145942575398404 + }, + { + "epoch": 4.151861624157139, + "grad_norm": 0.1929812410587892, + "learning_rate": 0.00046681261406557003, + "loss": 3.041063070297241, + "step": 7083, + "token_acc": 0.29300945325202726 + }, + { + "epoch": 4.152447962474348, + "grad_norm": 0.2536166281757521, + "learning_rate": 0.0004668005494991838, + "loss": 3.023494243621826, + "step": 7084, + "token_acc": 0.2951332657688249 + }, + { + "epoch": 4.153034300791557, + "grad_norm": 0.21373170274499917, + "learning_rate": 0.00046678848289624864, + "loss": 3.047337055206299, + "step": 7085, + "token_acc": 0.29454262446589813 + }, + { + "epoch": 4.153620639108766, + "grad_norm": 0.23353646972111555, + "learning_rate": 0.00046677641425687784, + "loss": 3.0630502700805664, + "step": 7086, + "token_acc": 0.29203935254736096 + }, + { + "epoch": 4.154206977425975, + "grad_norm": 0.22203336151340522, + "learning_rate": 0.0004667643435811847, + "loss": 2.994072675704956, + "step": 7087, + "token_acc": 0.3005077729144656 + }, + { + "epoch": 4.154793315743184, + "grad_norm": 0.20928959904605512, + "learning_rate": 0.00046675227086928264, + "loss": 3.023728847503662, + "step": 7088, + "token_acc": 0.29455682188212307 + }, + { + "epoch": 4.1553796540603924, + "grad_norm": 0.24338924103312093, + "learning_rate": 0.00046674019612128506, + "loss": 2.979199171066284, + "step": 7089, + "token_acc": 0.30235288530541604 + }, + { + "epoch": 4.155965992377602, + "grad_norm": 0.1736921816887968, + "learning_rate": 0.0004667281193373054, + "loss": 3.018723487854004, + "step": 7090, + "token_acc": 0.29572209611741046 + }, + { + "epoch": 4.156552330694811, + "grad_norm": 0.22540200906335275, + "learning_rate": 0.0004667160405174571, + "loss": 3.010500431060791, + "step": 7091, + "token_acc": 0.29890123550203757 + }, + { + "epoch": 4.15713866901202, + "grad_norm": 0.19587881689285558, + "learning_rate": 0.0004667039596618536, + "loss": 3.0391292572021484, + "step": 7092, + "token_acc": 0.2947037821894736 + }, + { + "epoch": 4.157725007329229, + "grad_norm": 0.21982108721514407, + "learning_rate": 0.0004666918767706085, + "loss": 3.0136666297912598, + "step": 7093, + "token_acc": 0.29763448975260015 + }, + { + "epoch": 4.158311345646438, + "grad_norm": 0.20567943770917999, + "learning_rate": 0.0004666797918438352, + "loss": 3.0618886947631836, + "step": 7094, + "token_acc": 0.29082116268928365 + }, + { + "epoch": 4.158897683963647, + "grad_norm": 0.19781248613019312, + "learning_rate": 0.00046666770488164723, + "loss": 3.0552635192871094, + "step": 7095, + "token_acc": 0.29241186048371903 + }, + { + "epoch": 4.159484022280856, + "grad_norm": 0.19881244603717121, + "learning_rate": 0.0004666556158841581, + "loss": 3.0654823780059814, + "step": 7096, + "token_acc": 0.29076996264521604 + }, + { + "epoch": 4.160070360598065, + "grad_norm": 0.17761953760147184, + "learning_rate": 0.00046664352485148143, + "loss": 3.0315709114074707, + "step": 7097, + "token_acc": 0.2944346580662697 + }, + { + "epoch": 4.1606566989152745, + "grad_norm": 0.1978985990562107, + "learning_rate": 0.0004666314317837307, + "loss": 3.014829635620117, + "step": 7098, + "token_acc": 0.29787647538965834 + }, + { + "epoch": 4.161243037232484, + "grad_norm": 0.1674233731702359, + "learning_rate": 0.00046661933668101964, + "loss": 3.0356192588806152, + "step": 7099, + "token_acc": 0.29375117559440594 + }, + { + "epoch": 4.161829375549692, + "grad_norm": 0.20094298637790062, + "learning_rate": 0.00046660723954346185, + "loss": 3.0140483379364014, + "step": 7100, + "token_acc": 0.2983582694337694 + }, + { + "epoch": 4.162415713866901, + "grad_norm": 0.19347567926127904, + "learning_rate": 0.0004665951403711709, + "loss": 3.027526378631592, + "step": 7101, + "token_acc": 0.2953698009011814 + }, + { + "epoch": 4.16300205218411, + "grad_norm": 0.18566759674894454, + "learning_rate": 0.00046658303916426045, + "loss": 3.0216879844665527, + "step": 7102, + "token_acc": 0.29696816612704463 + }, + { + "epoch": 4.163588390501319, + "grad_norm": 0.17922073572211084, + "learning_rate": 0.0004665709359228442, + "loss": 3.0508852005004883, + "step": 7103, + "token_acc": 0.29356801446509706 + }, + { + "epoch": 4.164174728818528, + "grad_norm": 0.21629321281797556, + "learning_rate": 0.00046655883064703586, + "loss": 3.0434796810150146, + "step": 7104, + "token_acc": 0.2948495562527062 + }, + { + "epoch": 4.164761067135737, + "grad_norm": 0.2015907229433968, + "learning_rate": 0.00046654672333694907, + "loss": 3.0592262744903564, + "step": 7105, + "token_acc": 0.2897117433006819 + }, + { + "epoch": 4.1653474054529465, + "grad_norm": 0.1931371653354883, + "learning_rate": 0.0004665346139926977, + "loss": 3.0275039672851562, + "step": 7106, + "token_acc": 0.29688584577746463 + }, + { + "epoch": 4.165933743770156, + "grad_norm": 0.17821961288112476, + "learning_rate": 0.00046652250261439534, + "loss": 3.042079210281372, + "step": 7107, + "token_acc": 0.2939778858865489 + }, + { + "epoch": 4.166520082087365, + "grad_norm": 0.20033317855731325, + "learning_rate": 0.0004665103892021558, + "loss": 3.016655921936035, + "step": 7108, + "token_acc": 0.29708529414793383 + }, + { + "epoch": 4.167106420404574, + "grad_norm": 0.18525657417228694, + "learning_rate": 0.00046649827375609297, + "loss": 3.030959367752075, + "step": 7109, + "token_acc": 0.29468318558720774 + }, + { + "epoch": 4.167692758721783, + "grad_norm": 0.18083164013408412, + "learning_rate": 0.0004664861562763206, + "loss": 3.0290822982788086, + "step": 7110, + "token_acc": 0.2948144123519374 + }, + { + "epoch": 4.168279097038991, + "grad_norm": 0.17839237629758914, + "learning_rate": 0.0004664740367629525, + "loss": 3.0278515815734863, + "step": 7111, + "token_acc": 0.2947525082488638 + }, + { + "epoch": 4.1688654353562, + "grad_norm": 0.19663337642508003, + "learning_rate": 0.0004664619152161026, + "loss": 3.042304039001465, + "step": 7112, + "token_acc": 0.29429628803576846 + }, + { + "epoch": 4.169451773673409, + "grad_norm": 0.21056685268497685, + "learning_rate": 0.00046644979163588463, + "loss": 3.0030055046081543, + "step": 7113, + "token_acc": 0.2994610327748917 + }, + { + "epoch": 4.1700381119906185, + "grad_norm": 0.26487086465418436, + "learning_rate": 0.00046643766602241256, + "loss": 3.047046661376953, + "step": 7114, + "token_acc": 0.29298914799351833 + }, + { + "epoch": 4.170624450307828, + "grad_norm": 0.32481344596130146, + "learning_rate": 0.00046642553837580023, + "loss": 3.0168404579162598, + "step": 7115, + "token_acc": 0.2981899785073117 + }, + { + "epoch": 4.171210788625037, + "grad_norm": 0.272552402031005, + "learning_rate": 0.00046641340869616154, + "loss": 3.038266181945801, + "step": 7116, + "token_acc": 0.2937408621261285 + }, + { + "epoch": 4.171797126942246, + "grad_norm": 0.17470484100002326, + "learning_rate": 0.0004664012769836106, + "loss": 3.0391368865966797, + "step": 7117, + "token_acc": 0.293547087939927 + }, + { + "epoch": 4.172383465259455, + "grad_norm": 0.21957628140690602, + "learning_rate": 0.00046638914323826125, + "loss": 3.051262378692627, + "step": 7118, + "token_acc": 0.29323716209746914 + }, + { + "epoch": 4.172969803576664, + "grad_norm": 0.18030448739028657, + "learning_rate": 0.0004663770074602275, + "loss": 3.036210775375366, + "step": 7119, + "token_acc": 0.29414135185425594 + }, + { + "epoch": 4.173556141893873, + "grad_norm": 0.19974349358637666, + "learning_rate": 0.00046636486964962335, + "loss": 3.0618629455566406, + "step": 7120, + "token_acc": 0.2913911763328877 + }, + { + "epoch": 4.174142480211081, + "grad_norm": 0.18861560344242462, + "learning_rate": 0.00046635272980656274, + "loss": 3.043172836303711, + "step": 7121, + "token_acc": 0.29310756205503175 + }, + { + "epoch": 4.1747288185282905, + "grad_norm": 0.20889792893229253, + "learning_rate": 0.00046634058793115974, + "loss": 3.0008678436279297, + "step": 7122, + "token_acc": 0.300017460147864 + }, + { + "epoch": 4.1753151568455, + "grad_norm": 0.22265419460742056, + "learning_rate": 0.0004663284440235285, + "loss": 3.0406363010406494, + "step": 7123, + "token_acc": 0.2936238716725119 + }, + { + "epoch": 4.175901495162709, + "grad_norm": 0.1951362379442764, + "learning_rate": 0.00046631629808378307, + "loss": 3.0189366340637207, + "step": 7124, + "token_acc": 0.29693681535455996 + }, + { + "epoch": 4.176487833479918, + "grad_norm": 0.18581102207241978, + "learning_rate": 0.00046630415011203745, + "loss": 3.053821325302124, + "step": 7125, + "token_acc": 0.29320057474543654 + }, + { + "epoch": 4.177074171797127, + "grad_norm": 0.22939984417511636, + "learning_rate": 0.0004662920001084059, + "loss": 3.0178940296173096, + "step": 7126, + "token_acc": 0.2960817797643874 + }, + { + "epoch": 4.177660510114336, + "grad_norm": 0.18712370128417088, + "learning_rate": 0.0004662798480730024, + "loss": 3.011500597000122, + "step": 7127, + "token_acc": 0.29905122108818927 + }, + { + "epoch": 4.178246848431545, + "grad_norm": 0.2380529736075332, + "learning_rate": 0.0004662676940059412, + "loss": 3.0373952388763428, + "step": 7128, + "token_acc": 0.29279643973225794 + }, + { + "epoch": 4.178833186748754, + "grad_norm": 0.18541249519328332, + "learning_rate": 0.00046625553790733635, + "loss": 2.9727001190185547, + "step": 7129, + "token_acc": 0.30440173428434775 + }, + { + "epoch": 4.179419525065963, + "grad_norm": 0.20667337262809457, + "learning_rate": 0.0004662433797773022, + "loss": 3.0099425315856934, + "step": 7130, + "token_acc": 0.29811988236495895 + }, + { + "epoch": 4.1800058633831725, + "grad_norm": 0.2039504039482547, + "learning_rate": 0.0004662312196159528, + "loss": 3.020404815673828, + "step": 7131, + "token_acc": 0.29621003161538323 + }, + { + "epoch": 4.180592201700381, + "grad_norm": 0.2057135749111697, + "learning_rate": 0.0004662190574234026, + "loss": 3.0423197746276855, + "step": 7132, + "token_acc": 0.29315194392683824 + }, + { + "epoch": 4.18117854001759, + "grad_norm": 0.23412507896396903, + "learning_rate": 0.00046620689319976566, + "loss": 3.0398218631744385, + "step": 7133, + "token_acc": 0.29487629861911135 + }, + { + "epoch": 4.181764878334799, + "grad_norm": 0.17660227602515383, + "learning_rate": 0.0004661947269451563, + "loss": 2.978708505630493, + "step": 7134, + "token_acc": 0.30259389807749415 + }, + { + "epoch": 4.182351216652008, + "grad_norm": 0.23714571563875728, + "learning_rate": 0.0004661825586596888, + "loss": 3.0459704399108887, + "step": 7135, + "token_acc": 0.29141898107729874 + }, + { + "epoch": 4.182937554969217, + "grad_norm": 0.23386796260139422, + "learning_rate": 0.00046617038834347746, + "loss": 2.98483943939209, + "step": 7136, + "token_acc": 0.30154882762721946 + }, + { + "epoch": 4.183523893286426, + "grad_norm": 0.16988005683655433, + "learning_rate": 0.00046615821599663664, + "loss": 3.0512070655822754, + "step": 7137, + "token_acc": 0.29314000773369797 + }, + { + "epoch": 4.184110231603635, + "grad_norm": 0.21942275141055242, + "learning_rate": 0.00046614604161928063, + "loss": 3.0374364852905273, + "step": 7138, + "token_acc": 0.2942677130894467 + }, + { + "epoch": 4.1846965699208445, + "grad_norm": 0.20198881199602095, + "learning_rate": 0.0004661338652115239, + "loss": 3.001643657684326, + "step": 7139, + "token_acc": 0.2995079069323014 + }, + { + "epoch": 4.185282908238054, + "grad_norm": 0.20345284515402953, + "learning_rate": 0.00046612168677348065, + "loss": 3.0409388542175293, + "step": 7140, + "token_acc": 0.29268401764738383 + }, + { + "epoch": 4.185869246555263, + "grad_norm": 0.19391012433348753, + "learning_rate": 0.00046610950630526543, + "loss": 3.0369417667388916, + "step": 7141, + "token_acc": 0.2938946158167732 + }, + { + "epoch": 4.186455584872472, + "grad_norm": 0.19226466492808938, + "learning_rate": 0.0004660973238069926, + "loss": 3.012075901031494, + "step": 7142, + "token_acc": 0.2986068955583221 + }, + { + "epoch": 4.18704192318968, + "grad_norm": 0.2528403017866318, + "learning_rate": 0.0004660851392787766, + "loss": 3.0370397567749023, + "step": 7143, + "token_acc": 0.2939726342537161 + }, + { + "epoch": 4.187628261506889, + "grad_norm": 0.23609561243943936, + "learning_rate": 0.0004660729527207319, + "loss": 3.0025415420532227, + "step": 7144, + "token_acc": 0.2994409318643597 + }, + { + "epoch": 4.188214599824098, + "grad_norm": 0.1965381717174718, + "learning_rate": 0.000466060764132973, + "loss": 3.0507707595825195, + "step": 7145, + "token_acc": 0.2920774872856449 + }, + { + "epoch": 4.188800938141307, + "grad_norm": 0.22691902109601858, + "learning_rate": 0.00046604857351561433, + "loss": 3.0100255012512207, + "step": 7146, + "token_acc": 0.298947072266356 + }, + { + "epoch": 4.1893872764585165, + "grad_norm": 0.20072408231028657, + "learning_rate": 0.0004660363808687704, + "loss": 3.0749642848968506, + "step": 7147, + "token_acc": 0.2882328750431551 + }, + { + "epoch": 4.189973614775726, + "grad_norm": 0.21423720638904545, + "learning_rate": 0.0004660241861925559, + "loss": 3.0191421508789062, + "step": 7148, + "token_acc": 0.2981584586832364 + }, + { + "epoch": 4.190559953092935, + "grad_norm": 0.2076223938319146, + "learning_rate": 0.00046601198948708516, + "loss": 3.026064395904541, + "step": 7149, + "token_acc": 0.2944992306148391 + }, + { + "epoch": 4.191146291410144, + "grad_norm": 0.1902893680957885, + "learning_rate": 0.0004659997907524729, + "loss": 3.045395851135254, + "step": 7150, + "token_acc": 0.2932133960553693 + }, + { + "epoch": 4.191732629727353, + "grad_norm": 0.23110492311367747, + "learning_rate": 0.00046598758998883373, + "loss": 3.0631494522094727, + "step": 7151, + "token_acc": 0.29036563768486934 + }, + { + "epoch": 4.192318968044562, + "grad_norm": 0.19852537666000913, + "learning_rate": 0.00046597538719628207, + "loss": 2.9920687675476074, + "step": 7152, + "token_acc": 0.3016063965486845 + }, + { + "epoch": 4.192905306361771, + "grad_norm": 0.18116172925316323, + "learning_rate": 0.00046596318237493277, + "loss": 3.076284170150757, + "step": 7153, + "token_acc": 0.2881908571821139 + }, + { + "epoch": 4.193491644678979, + "grad_norm": 0.21801417753415978, + "learning_rate": 0.0004659509755249004, + "loss": 3.077207088470459, + "step": 7154, + "token_acc": 0.29055617767823816 + }, + { + "epoch": 4.1940779829961885, + "grad_norm": 0.21016189565742255, + "learning_rate": 0.00046593876664629955, + "loss": 3.0335421562194824, + "step": 7155, + "token_acc": 0.29657758771870657 + }, + { + "epoch": 4.194664321313398, + "grad_norm": 0.16630249379307907, + "learning_rate": 0.000465926555739245, + "loss": 3.034965753555298, + "step": 7156, + "token_acc": 0.29499131599647965 + }, + { + "epoch": 4.195250659630607, + "grad_norm": 0.21604587693350585, + "learning_rate": 0.00046591434280385144, + "loss": 3.0059666633605957, + "step": 7157, + "token_acc": 0.29904675608047765 + }, + { + "epoch": 4.195836997947816, + "grad_norm": 0.17651302496647003, + "learning_rate": 0.00046590212784023354, + "loss": 2.9971914291381836, + "step": 7158, + "token_acc": 0.299662286082898 + }, + { + "epoch": 4.196423336265025, + "grad_norm": 0.17693190856646454, + "learning_rate": 0.0004658899108485061, + "loss": 3.041050434112549, + "step": 7159, + "token_acc": 0.2941525293281957 + }, + { + "epoch": 4.197009674582234, + "grad_norm": 0.20259167037250458, + "learning_rate": 0.00046587769182878383, + "loss": 3.0409457683563232, + "step": 7160, + "token_acc": 0.2942173379859894 + }, + { + "epoch": 4.197596012899443, + "grad_norm": 0.16577071297280604, + "learning_rate": 0.0004658654707811816, + "loss": 3.0118017196655273, + "step": 7161, + "token_acc": 0.2980252185975467 + }, + { + "epoch": 4.198182351216652, + "grad_norm": 0.18552614042436216, + "learning_rate": 0.0004658532477058141, + "loss": 3.0386929512023926, + "step": 7162, + "token_acc": 0.2937376260646567 + }, + { + "epoch": 4.198768689533861, + "grad_norm": 0.22671153390412097, + "learning_rate": 0.00046584102260279625, + "loss": 3.0002856254577637, + "step": 7163, + "token_acc": 0.29869861489386507 + }, + { + "epoch": 4.19935502785107, + "grad_norm": 0.2824981426324045, + "learning_rate": 0.00046582879547224276, + "loss": 3.0433058738708496, + "step": 7164, + "token_acc": 0.29393711606049444 + }, + { + "epoch": 4.199941366168279, + "grad_norm": 0.411495209009468, + "learning_rate": 0.00046581656631426873, + "loss": 3.017425060272217, + "step": 7165, + "token_acc": 0.2975307670104175 + }, + { + "epoch": 4.200527704485488, + "grad_norm": 0.3118122567759677, + "learning_rate": 0.00046580433512898874, + "loss": 2.9966278076171875, + "step": 7166, + "token_acc": 0.30098733577314973 + }, + { + "epoch": 4.201114042802697, + "grad_norm": 0.17435651759247894, + "learning_rate": 0.0004657921019165179, + "loss": 2.9982504844665527, + "step": 7167, + "token_acc": 0.2999541551410631 + }, + { + "epoch": 4.201700381119906, + "grad_norm": 0.2626497240931547, + "learning_rate": 0.0004657798666769709, + "loss": 3.087343454360962, + "step": 7168, + "token_acc": 0.287779965234467 + }, + { + "epoch": 4.202286719437115, + "grad_norm": 0.19462075163709555, + "learning_rate": 0.00046576762941046295, + "loss": 2.9727554321289062, + "step": 7169, + "token_acc": 0.30430436423746243 + }, + { + "epoch": 4.202873057754324, + "grad_norm": 0.20068935494331192, + "learning_rate": 0.00046575539011710885, + "loss": 3.0082504749298096, + "step": 7170, + "token_acc": 0.29929280460834234 + }, + { + "epoch": 4.203459396071533, + "grad_norm": 0.1701033985192429, + "learning_rate": 0.0004657431487970236, + "loss": 3.035154104232788, + "step": 7171, + "token_acc": 0.29428575944032365 + }, + { + "epoch": 4.2040457343887425, + "grad_norm": 0.21203657352301916, + "learning_rate": 0.00046573090545032217, + "loss": 3.006300926208496, + "step": 7172, + "token_acc": 0.30026322187807225 + }, + { + "epoch": 4.204632072705952, + "grad_norm": 0.21393399496336932, + "learning_rate": 0.0004657186600771196, + "loss": 3.055222511291504, + "step": 7173, + "token_acc": 0.29272289773045224 + }, + { + "epoch": 4.205218411023161, + "grad_norm": 0.21800821552289154, + "learning_rate": 0.0004657064126775309, + "loss": 3.0469555854797363, + "step": 7174, + "token_acc": 0.292700488206281 + }, + { + "epoch": 4.205804749340369, + "grad_norm": 0.19224082996588798, + "learning_rate": 0.00046569416325167114, + "loss": 3.017199754714966, + "step": 7175, + "token_acc": 0.2973582216346439 + }, + { + "epoch": 4.206391087657578, + "grad_norm": 0.21649697547673905, + "learning_rate": 0.0004656819117996553, + "loss": 3.0250484943389893, + "step": 7176, + "token_acc": 0.2947412513639443 + }, + { + "epoch": 4.206977425974787, + "grad_norm": 0.2293386470284171, + "learning_rate": 0.00046566965832159856, + "loss": 3.0057473182678223, + "step": 7177, + "token_acc": 0.2987094255120537 + }, + { + "epoch": 4.207563764291996, + "grad_norm": 0.1840261651221997, + "learning_rate": 0.00046565740281761603, + "loss": 3.0113186836242676, + "step": 7178, + "token_acc": 0.29763357312332095 + }, + { + "epoch": 4.208150102609205, + "grad_norm": 0.2523318353567239, + "learning_rate": 0.0004656451452878228, + "loss": 3.0586607456207275, + "step": 7179, + "token_acc": 0.2908959859237039 + }, + { + "epoch": 4.2087364409264145, + "grad_norm": 0.2092421207723749, + "learning_rate": 0.000465632885732334, + "loss": 3.036681652069092, + "step": 7180, + "token_acc": 0.2953302538317887 + }, + { + "epoch": 4.209322779243624, + "grad_norm": 0.2213120261401445, + "learning_rate": 0.00046562062415126483, + "loss": 3.007185459136963, + "step": 7181, + "token_acc": 0.2978972121780217 + }, + { + "epoch": 4.209909117560833, + "grad_norm": 0.19155800810470536, + "learning_rate": 0.0004656083605447304, + "loss": 3.0147643089294434, + "step": 7182, + "token_acc": 0.2986389237781517 + }, + { + "epoch": 4.210495455878042, + "grad_norm": 0.21149699446447984, + "learning_rate": 0.000465596094912846, + "loss": 3.034579277038574, + "step": 7183, + "token_acc": 0.2950187123914462 + }, + { + "epoch": 4.211081794195251, + "grad_norm": 0.27381916078877117, + "learning_rate": 0.0004655838272557268, + "loss": 3.0246386528015137, + "step": 7184, + "token_acc": 0.2962732011751261 + }, + { + "epoch": 4.21166813251246, + "grad_norm": 0.19809543772936736, + "learning_rate": 0.000465571557573488, + "loss": 3.0541319847106934, + "step": 7185, + "token_acc": 0.29117324274888945 + }, + { + "epoch": 4.212254470829668, + "grad_norm": 0.22581517657389624, + "learning_rate": 0.00046555928586624495, + "loss": 3.05717396736145, + "step": 7186, + "token_acc": 0.29128791852043256 + }, + { + "epoch": 4.212840809146877, + "grad_norm": 0.217388709016104, + "learning_rate": 0.00046554701213411285, + "loss": 3.045132637023926, + "step": 7187, + "token_acc": 0.29182146728347225 + }, + { + "epoch": 4.2134271474640865, + "grad_norm": 0.18840122464008327, + "learning_rate": 0.00046553473637720713, + "loss": 3.102694511413574, + "step": 7188, + "token_acc": 0.2832544304327789 + }, + { + "epoch": 4.214013485781296, + "grad_norm": 0.24554499698267465, + "learning_rate": 0.0004655224585956429, + "loss": 3.002741813659668, + "step": 7189, + "token_acc": 0.2994943103236521 + }, + { + "epoch": 4.214599824098505, + "grad_norm": 0.19128149306691974, + "learning_rate": 0.0004655101787895356, + "loss": 3.063619613647461, + "step": 7190, + "token_acc": 0.2886796035392853 + }, + { + "epoch": 4.215186162415714, + "grad_norm": 0.26583750266094613, + "learning_rate": 0.00046549789695900056, + "loss": 2.9953603744506836, + "step": 7191, + "token_acc": 0.3001553314472275 + }, + { + "epoch": 4.215772500732923, + "grad_norm": 0.20798661582231734, + "learning_rate": 0.0004654856131041532, + "loss": 3.0, + "step": 7192, + "token_acc": 0.299153776136946 + }, + { + "epoch": 4.216358839050132, + "grad_norm": 0.22306383487478446, + "learning_rate": 0.0004654733272251088, + "loss": 3.0378551483154297, + "step": 7193, + "token_acc": 0.2944513283683211 + }, + { + "epoch": 4.216945177367341, + "grad_norm": 0.1912486582995136, + "learning_rate": 0.0004654610393219829, + "loss": 2.9882826805114746, + "step": 7194, + "token_acc": 0.30115435321962813 + }, + { + "epoch": 4.21753151568455, + "grad_norm": 0.2684242131620467, + "learning_rate": 0.0004654487493948909, + "loss": 3.072085380554199, + "step": 7195, + "token_acc": 0.289867864147827 + }, + { + "epoch": 4.218117854001759, + "grad_norm": 0.21315851180081472, + "learning_rate": 0.00046543645744394823, + "loss": 3.0188755989074707, + "step": 7196, + "token_acc": 0.2970566837111133 + }, + { + "epoch": 4.218704192318968, + "grad_norm": 0.2336857103000578, + "learning_rate": 0.00046542416346927033, + "loss": 3.0823090076446533, + "step": 7197, + "token_acc": 0.2874185242867106 + }, + { + "epoch": 4.219290530636177, + "grad_norm": 0.23008886321897284, + "learning_rate": 0.00046541186747097265, + "loss": 3.012960910797119, + "step": 7198, + "token_acc": 0.2974197192980646 + }, + { + "epoch": 4.219876868953386, + "grad_norm": 0.21535649589772046, + "learning_rate": 0.00046539956944917084, + "loss": 3.0952091217041016, + "step": 7199, + "token_acc": 0.2888128943322199 + }, + { + "epoch": 4.220463207270595, + "grad_norm": 0.18874393717942592, + "learning_rate": 0.00046538726940398024, + "loss": 3.0290207862854004, + "step": 7200, + "token_acc": 0.2953556783277116 + }, + { + "epoch": 4.221049545587804, + "grad_norm": 0.20581388503280557, + "learning_rate": 0.0004653749673355165, + "loss": 3.0229454040527344, + "step": 7201, + "token_acc": 0.29560466356369797 + }, + { + "epoch": 4.221635883905013, + "grad_norm": 0.2056077547446293, + "learning_rate": 0.0004653626632438952, + "loss": 3.067230224609375, + "step": 7202, + "token_acc": 0.2909809757618472 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.21441276898880496, + "learning_rate": 0.00046535035712923185, + "loss": 3.0767927169799805, + "step": 7203, + "token_acc": 0.2883587184939103 + }, + { + "epoch": 4.222808560539431, + "grad_norm": 0.204499742479287, + "learning_rate": 0.0004653380489916421, + "loss": 3.006409168243408, + "step": 7204, + "token_acc": 0.29920727224116034 + }, + { + "epoch": 4.2233948988566405, + "grad_norm": 0.16815923724321757, + "learning_rate": 0.0004653257388312415, + "loss": 3.044400215148926, + "step": 7205, + "token_acc": 0.29545357762612984 + }, + { + "epoch": 4.22398123717385, + "grad_norm": 0.18524566030904668, + "learning_rate": 0.0004653134266481458, + "loss": 3.0381197929382324, + "step": 7206, + "token_acc": 0.2931384324448647 + }, + { + "epoch": 4.224567575491059, + "grad_norm": 0.17668464946370718, + "learning_rate": 0.0004653011124424706, + "loss": 3.013251543045044, + "step": 7207, + "token_acc": 0.2979389715371891 + }, + { + "epoch": 4.225153913808267, + "grad_norm": 0.18257376746229081, + "learning_rate": 0.0004652887962143315, + "loss": 3.023289203643799, + "step": 7208, + "token_acc": 0.29631769831433763 + }, + { + "epoch": 4.225740252125476, + "grad_norm": 0.1788219354723936, + "learning_rate": 0.0004652764779638444, + "loss": 3.045978307723999, + "step": 7209, + "token_acc": 0.29395063780502845 + }, + { + "epoch": 4.226326590442685, + "grad_norm": 0.1823665469096321, + "learning_rate": 0.0004652641576911247, + "loss": 3.07065486907959, + "step": 7210, + "token_acc": 0.2900328454368022 + }, + { + "epoch": 4.226912928759894, + "grad_norm": 0.20739412835154172, + "learning_rate": 0.00046525183539628846, + "loss": 3.0098862648010254, + "step": 7211, + "token_acc": 0.2977733136797068 + }, + { + "epoch": 4.227499267077103, + "grad_norm": 0.23144851817703418, + "learning_rate": 0.0004652395110794512, + "loss": 3.062488555908203, + "step": 7212, + "token_acc": 0.29075703128245606 + }, + { + "epoch": 4.2280856053943126, + "grad_norm": 0.19903605197128096, + "learning_rate": 0.0004652271847407288, + "loss": 3.0978357791900635, + "step": 7213, + "token_acc": 0.28609121174645175 + }, + { + "epoch": 4.228671943711522, + "grad_norm": 0.17601060419982678, + "learning_rate": 0.0004652148563802371, + "loss": 3.0160069465637207, + "step": 7214, + "token_acc": 0.29667864250820647 + }, + { + "epoch": 4.229258282028731, + "grad_norm": 0.22273856148661494, + "learning_rate": 0.00046520252599809166, + "loss": 3.066481590270996, + "step": 7215, + "token_acc": 0.29091505247428845 + }, + { + "epoch": 4.22984462034594, + "grad_norm": 0.165558317848903, + "learning_rate": 0.0004651901935944086, + "loss": 3.001032590866089, + "step": 7216, + "token_acc": 0.2988250310841701 + }, + { + "epoch": 4.230430958663149, + "grad_norm": 0.17574694925998402, + "learning_rate": 0.0004651778591693036, + "loss": 3.060455799102783, + "step": 7217, + "token_acc": 0.2919625692278312 + }, + { + "epoch": 4.231017296980358, + "grad_norm": 0.20723927960812277, + "learning_rate": 0.0004651655227228926, + "loss": 3.0199244022369385, + "step": 7218, + "token_acc": 0.2973019356583423 + }, + { + "epoch": 4.231603635297566, + "grad_norm": 0.19253394581645475, + "learning_rate": 0.0004651531842552914, + "loss": 3.007469892501831, + "step": 7219, + "token_acc": 0.29877539274020704 + }, + { + "epoch": 4.2321899736147754, + "grad_norm": 0.19161325732394313, + "learning_rate": 0.00046514084376661605, + "loss": 3.0038843154907227, + "step": 7220, + "token_acc": 0.2987262775589544 + }, + { + "epoch": 4.232776311931985, + "grad_norm": 0.22692899923408688, + "learning_rate": 0.00046512850125698225, + "loss": 3.0106732845306396, + "step": 7221, + "token_acc": 0.2992220305498503 + }, + { + "epoch": 4.233362650249194, + "grad_norm": 0.2115732935318443, + "learning_rate": 0.0004651161567265062, + "loss": 3.077622413635254, + "step": 7222, + "token_acc": 0.288922710322971 + }, + { + "epoch": 4.233948988566403, + "grad_norm": 0.2019073345427456, + "learning_rate": 0.0004651038101753036, + "loss": 3.0127577781677246, + "step": 7223, + "token_acc": 0.29833541318555806 + }, + { + "epoch": 4.234535326883612, + "grad_norm": 0.22052979661675895, + "learning_rate": 0.00046509146160349067, + "loss": 3.059021472930908, + "step": 7224, + "token_acc": 0.29122310496211345 + }, + { + "epoch": 4.235121665200821, + "grad_norm": 0.2146427398563426, + "learning_rate": 0.0004650791110111833, + "loss": 3.079977512359619, + "step": 7225, + "token_acc": 0.2885198773409265 + }, + { + "epoch": 4.23570800351803, + "grad_norm": 0.23425564957326006, + "learning_rate": 0.0004650667583984974, + "loss": 3.056211233139038, + "step": 7226, + "token_acc": 0.28999268765697694 + }, + { + "epoch": 4.236294341835239, + "grad_norm": 0.2735447098311677, + "learning_rate": 0.0004650544037655492, + "loss": 3.0535433292388916, + "step": 7227, + "token_acc": 0.2917613560765321 + }, + { + "epoch": 4.236880680152448, + "grad_norm": 0.21842977066231503, + "learning_rate": 0.00046504204711245455, + "loss": 3.0150058269500732, + "step": 7228, + "token_acc": 0.29675714810572495 + }, + { + "epoch": 4.237467018469657, + "grad_norm": 0.19820564950678218, + "learning_rate": 0.0004650296884393298, + "loss": 3.0259552001953125, + "step": 7229, + "token_acc": 0.2947137685913417 + }, + { + "epoch": 4.238053356786866, + "grad_norm": 0.21132207864794167, + "learning_rate": 0.0004650173277462908, + "loss": 3.0346803665161133, + "step": 7230, + "token_acc": 0.29386813297310743 + }, + { + "epoch": 4.238639695104075, + "grad_norm": 0.20254279478224177, + "learning_rate": 0.0004650049650334538, + "loss": 3.0663952827453613, + "step": 7231, + "token_acc": 0.29137044791724714 + }, + { + "epoch": 4.239226033421284, + "grad_norm": 0.2551141040057678, + "learning_rate": 0.00046499260030093484, + "loss": 3.0588698387145996, + "step": 7232, + "token_acc": 0.291092350103377 + }, + { + "epoch": 4.239812371738493, + "grad_norm": 0.25184660175377055, + "learning_rate": 0.00046498023354885, + "loss": 3.086754322052002, + "step": 7233, + "token_acc": 0.28723888281356014 + }, + { + "epoch": 4.240398710055702, + "grad_norm": 0.23249709531231724, + "learning_rate": 0.00046496786477731567, + "loss": 3.0265302658081055, + "step": 7234, + "token_acc": 0.29619402312022197 + }, + { + "epoch": 4.240985048372911, + "grad_norm": 0.21436659470740724, + "learning_rate": 0.0004649554939864479, + "loss": 3.075906276702881, + "step": 7235, + "token_acc": 0.28844102621680795 + }, + { + "epoch": 4.24157138669012, + "grad_norm": 0.1997818984105397, + "learning_rate": 0.00046494312117636294, + "loss": 3.0163071155548096, + "step": 7236, + "token_acc": 0.2964620237315317 + }, + { + "epoch": 4.2421577250073295, + "grad_norm": 0.2194907664669766, + "learning_rate": 0.00046493074634717705, + "loss": 3.060027599334717, + "step": 7237, + "token_acc": 0.29129932222981136 + }, + { + "epoch": 4.242744063324539, + "grad_norm": 0.17815121209963833, + "learning_rate": 0.0004649183694990063, + "loss": 3.0438966751098633, + "step": 7238, + "token_acc": 0.2943165939789135 + }, + { + "epoch": 4.243330401641748, + "grad_norm": 0.22647371250225418, + "learning_rate": 0.00046490599063196713, + "loss": 3.0117921829223633, + "step": 7239, + "token_acc": 0.29971508595711804 + }, + { + "epoch": 4.243916739958956, + "grad_norm": 0.20886039102782136, + "learning_rate": 0.0004648936097461758, + "loss": 3.072829484939575, + "step": 7240, + "token_acc": 0.28895028818054636 + }, + { + "epoch": 4.244503078276165, + "grad_norm": 0.19316258779186285, + "learning_rate": 0.0004648812268417485, + "loss": 3.003109931945801, + "step": 7241, + "token_acc": 0.29721121374464204 + }, + { + "epoch": 4.245089416593374, + "grad_norm": 0.19067020562495265, + "learning_rate": 0.00046486884191880167, + "loss": 3.0332727432250977, + "step": 7242, + "token_acc": 0.29396787837249727 + }, + { + "epoch": 4.245675754910583, + "grad_norm": 0.17649926805330626, + "learning_rate": 0.00046485645497745164, + "loss": 2.9963645935058594, + "step": 7243, + "token_acc": 0.30061032212631106 + }, + { + "epoch": 4.246262093227792, + "grad_norm": 0.18658002375145616, + "learning_rate": 0.0004648440660178147, + "loss": 3.020477771759033, + "step": 7244, + "token_acc": 0.29699250056657805 + }, + { + "epoch": 4.2468484315450015, + "grad_norm": 0.20006045232623287, + "learning_rate": 0.00046483167504000726, + "loss": 3.0562543869018555, + "step": 7245, + "token_acc": 0.2918381580863474 + }, + { + "epoch": 4.247434769862211, + "grad_norm": 0.1834211176446755, + "learning_rate": 0.0004648192820441458, + "loss": 3.000281810760498, + "step": 7246, + "token_acc": 0.29908064703828696 + }, + { + "epoch": 4.24802110817942, + "grad_norm": 0.1838794358666694, + "learning_rate": 0.0004648068870303466, + "loss": 3.0101988315582275, + "step": 7247, + "token_acc": 0.29780088068996063 + }, + { + "epoch": 4.248607446496629, + "grad_norm": 0.21910430028999436, + "learning_rate": 0.0004647944899987261, + "loss": 3.032127857208252, + "step": 7248, + "token_acc": 0.2949095220201015 + }, + { + "epoch": 4.249193784813838, + "grad_norm": 0.2604695523252541, + "learning_rate": 0.0004647820909494009, + "loss": 3.0543711185455322, + "step": 7249, + "token_acc": 0.29206657339550235 + }, + { + "epoch": 4.249780123131047, + "grad_norm": 0.26027715891762565, + "learning_rate": 0.0004647696898824873, + "loss": 3.0225749015808105, + "step": 7250, + "token_acc": 0.29730030762702875 + }, + { + "epoch": 4.250366461448255, + "grad_norm": 0.31807102803223836, + "learning_rate": 0.0004647572867981019, + "loss": 3.0529260635375977, + "step": 7251, + "token_acc": 0.29128068097010057 + }, + { + "epoch": 4.250952799765464, + "grad_norm": 0.2326010368280377, + "learning_rate": 0.0004647448816963612, + "loss": 3.0328445434570312, + "step": 7252, + "token_acc": 0.2962837510887448 + }, + { + "epoch": 4.2515391380826735, + "grad_norm": 0.18287497552776938, + "learning_rate": 0.00046473247457738166, + "loss": 3.0036303997039795, + "step": 7253, + "token_acc": 0.3008589236653279 + }, + { + "epoch": 4.252125476399883, + "grad_norm": 0.18327713223996892, + "learning_rate": 0.00046472006544128, + "loss": 3.0194225311279297, + "step": 7254, + "token_acc": 0.2944594335936988 + }, + { + "epoch": 4.252711814717092, + "grad_norm": 0.19120016504664566, + "learning_rate": 0.00046470765428817255, + "loss": 3.0305886268615723, + "step": 7255, + "token_acc": 0.29578205477171404 + }, + { + "epoch": 4.253298153034301, + "grad_norm": 0.1741665719739343, + "learning_rate": 0.000464695241118176, + "loss": 3.0565361976623535, + "step": 7256, + "token_acc": 0.29156514157317576 + }, + { + "epoch": 4.25388449135151, + "grad_norm": 0.20598949173902928, + "learning_rate": 0.000464682825931407, + "loss": 3.0029425621032715, + "step": 7257, + "token_acc": 0.2985429229943485 + }, + { + "epoch": 4.254470829668719, + "grad_norm": 0.308150823920553, + "learning_rate": 0.00046467040872798216, + "loss": 3.017829418182373, + "step": 7258, + "token_acc": 0.2955879402026688 + }, + { + "epoch": 4.255057167985928, + "grad_norm": 0.28887097728298866, + "learning_rate": 0.00046465798950801805, + "loss": 2.993651866912842, + "step": 7259, + "token_acc": 0.30148115790382635 + }, + { + "epoch": 4.255643506303137, + "grad_norm": 0.18842538286960578, + "learning_rate": 0.0004646455682716314, + "loss": 3.062232494354248, + "step": 7260, + "token_acc": 0.2922357083537459 + }, + { + "epoch": 4.256229844620346, + "grad_norm": 0.2779440427377432, + "learning_rate": 0.00046463314501893896, + "loss": 3.022719383239746, + "step": 7261, + "token_acc": 0.29545203983599416 + }, + { + "epoch": 4.256816182937555, + "grad_norm": 0.22232304462280986, + "learning_rate": 0.0004646207197500572, + "loss": 3.01255202293396, + "step": 7262, + "token_acc": 0.2969424489293757 + }, + { + "epoch": 4.257402521254764, + "grad_norm": 0.2330451779201088, + "learning_rate": 0.00046460829246510306, + "loss": 2.9859397411346436, + "step": 7263, + "token_acc": 0.30138488098088384 + }, + { + "epoch": 4.257988859571973, + "grad_norm": 0.2645558763049161, + "learning_rate": 0.0004645958631641932, + "loss": 3.0558505058288574, + "step": 7264, + "token_acc": 0.2907449990580086 + }, + { + "epoch": 4.258575197889182, + "grad_norm": 0.19444701069906487, + "learning_rate": 0.0004645834318474443, + "loss": 3.103001117706299, + "step": 7265, + "token_acc": 0.284434247324346 + }, + { + "epoch": 4.259161536206391, + "grad_norm": 0.21392301788358983, + "learning_rate": 0.0004645709985149734, + "loss": 3.0365567207336426, + "step": 7266, + "token_acc": 0.2956237338605355 + }, + { + "epoch": 4.2597478745236, + "grad_norm": 0.16570621180879094, + "learning_rate": 0.0004645585631668969, + "loss": 3.029478073120117, + "step": 7267, + "token_acc": 0.29352355663338525 + }, + { + "epoch": 4.260334212840809, + "grad_norm": 0.19392138266077966, + "learning_rate": 0.000464546125803332, + "loss": 3.0014162063598633, + "step": 7268, + "token_acc": 0.30090012419045276 + }, + { + "epoch": 4.260920551158018, + "grad_norm": 0.22477921693453845, + "learning_rate": 0.00046453368642439524, + "loss": 3.0667591094970703, + "step": 7269, + "token_acc": 0.29108717831237363 + }, + { + "epoch": 4.2615068894752275, + "grad_norm": 0.22039136420494712, + "learning_rate": 0.00046452124503020367, + "loss": 2.995938301086426, + "step": 7270, + "token_acc": 0.3007941206925391 + }, + { + "epoch": 4.262093227792437, + "grad_norm": 0.19878033778551069, + "learning_rate": 0.00046450880162087393, + "loss": 3.042675495147705, + "step": 7271, + "token_acc": 0.29312096440559915 + }, + { + "epoch": 4.262679566109645, + "grad_norm": 0.2805607964746652, + "learning_rate": 0.00046449635619652313, + "loss": 3.011254072189331, + "step": 7272, + "token_acc": 0.2982654186460963 + }, + { + "epoch": 4.263265904426854, + "grad_norm": 0.18583625651976154, + "learning_rate": 0.00046448390875726813, + "loss": 3.0606212615966797, + "step": 7273, + "token_acc": 0.29005361078519915 + }, + { + "epoch": 4.263852242744063, + "grad_norm": 0.22311917847378054, + "learning_rate": 0.0004644714593032258, + "loss": 3.053183078765869, + "step": 7274, + "token_acc": 0.29212003465041636 + }, + { + "epoch": 4.264438581061272, + "grad_norm": 0.19137704478020862, + "learning_rate": 0.0004644590078345131, + "loss": 3.0386698246002197, + "step": 7275, + "token_acc": 0.29410758955932526 + }, + { + "epoch": 4.265024919378481, + "grad_norm": 0.2652698207684537, + "learning_rate": 0.00046444655435124707, + "loss": 3.031611919403076, + "step": 7276, + "token_acc": 0.29370177980977263 + }, + { + "epoch": 4.26561125769569, + "grad_norm": 0.1910222691437405, + "learning_rate": 0.0004644340988535446, + "loss": 3.0476956367492676, + "step": 7277, + "token_acc": 0.2931088137559998 + }, + { + "epoch": 4.2661975960128995, + "grad_norm": 0.2018811217732455, + "learning_rate": 0.0004644216413415227, + "loss": 3.0440738201141357, + "step": 7278, + "token_acc": 0.2950478908692701 + }, + { + "epoch": 4.266783934330109, + "grad_norm": 0.18914238463092745, + "learning_rate": 0.00046440918181529843, + "loss": 3.030407667160034, + "step": 7279, + "token_acc": 0.29616783733446894 + }, + { + "epoch": 4.267370272647318, + "grad_norm": 0.19186678296370016, + "learning_rate": 0.0004643967202749888, + "loss": 2.9866089820861816, + "step": 7280, + "token_acc": 0.30093894693820483 + }, + { + "epoch": 4.267956610964527, + "grad_norm": 0.18825563741892448, + "learning_rate": 0.00046438425672071096, + "loss": 3.0327978134155273, + "step": 7281, + "token_acc": 0.2959708997305935 + }, + { + "epoch": 4.268542949281736, + "grad_norm": 0.19522793126427482, + "learning_rate": 0.00046437179115258175, + "loss": 3.0397255420684814, + "step": 7282, + "token_acc": 0.29447283225506143 + }, + { + "epoch": 4.269129287598945, + "grad_norm": 0.20642936480255422, + "learning_rate": 0.00046435932357071863, + "loss": 3.0638389587402344, + "step": 7283, + "token_acc": 0.2904871857685672 + }, + { + "epoch": 4.269715625916153, + "grad_norm": 0.19911454429565026, + "learning_rate": 0.0004643468539752384, + "loss": 3.03776478767395, + "step": 7284, + "token_acc": 0.293763730177161 + }, + { + "epoch": 4.270301964233362, + "grad_norm": 0.20976080990143434, + "learning_rate": 0.00046433438236625834, + "loss": 3.0086519718170166, + "step": 7285, + "token_acc": 0.29792241026791016 + }, + { + "epoch": 4.2708883025505715, + "grad_norm": 0.17352337568920734, + "learning_rate": 0.0004643219087438956, + "loss": 2.997100591659546, + "step": 7286, + "token_acc": 0.3018106177823775 + }, + { + "epoch": 4.271474640867781, + "grad_norm": 0.17357494714334998, + "learning_rate": 0.0004643094331082673, + "loss": 3.0867109298706055, + "step": 7287, + "token_acc": 0.2877382832658295 + }, + { + "epoch": 4.27206097918499, + "grad_norm": 0.19556261316209386, + "learning_rate": 0.0004642969554594907, + "loss": 3.0226526260375977, + "step": 7288, + "token_acc": 0.2971607747068783 + }, + { + "epoch": 4.272647317502199, + "grad_norm": 0.18168954928689307, + "learning_rate": 0.0004642844757976829, + "loss": 3.010441780090332, + "step": 7289, + "token_acc": 0.2985246062433212 + }, + { + "epoch": 4.273233655819408, + "grad_norm": 0.17607367496516332, + "learning_rate": 0.0004642719941229613, + "loss": 3.0098719596862793, + "step": 7290, + "token_acc": 0.29916682266837286 + }, + { + "epoch": 4.273819994136617, + "grad_norm": 0.1849178085101402, + "learning_rate": 0.00046425951043544295, + "loss": 3.0792784690856934, + "step": 7291, + "token_acc": 0.2887901663082132 + }, + { + "epoch": 4.274406332453826, + "grad_norm": 0.17569223705055148, + "learning_rate": 0.00046424702473524525, + "loss": 3.052720069885254, + "step": 7292, + "token_acc": 0.29230357587542455 + }, + { + "epoch": 4.274992670771035, + "grad_norm": 0.22752647269424886, + "learning_rate": 0.00046423453702248545, + "loss": 3.0107295513153076, + "step": 7293, + "token_acc": 0.298837892370955 + }, + { + "epoch": 4.2755790090882435, + "grad_norm": 0.2717712867275703, + "learning_rate": 0.0004642220472972809, + "loss": 3.056264877319336, + "step": 7294, + "token_acc": 0.2919780932772723 + }, + { + "epoch": 4.276165347405453, + "grad_norm": 0.26113764586484167, + "learning_rate": 0.00046420955555974885, + "loss": 3.042703628540039, + "step": 7295, + "token_acc": 0.2931754118285966 + }, + { + "epoch": 4.276751685722662, + "grad_norm": 0.1753930968064935, + "learning_rate": 0.0004641970618100067, + "loss": 3.0342659950256348, + "step": 7296, + "token_acc": 0.2958851986183953 + }, + { + "epoch": 4.277338024039871, + "grad_norm": 0.2384081498373174, + "learning_rate": 0.00046418456604817174, + "loss": 3.0522491931915283, + "step": 7297, + "token_acc": 0.29252802371862563 + }, + { + "epoch": 4.27792436235708, + "grad_norm": 0.2041932751150039, + "learning_rate": 0.0004641720682743614, + "loss": 3.047947883605957, + "step": 7298, + "token_acc": 0.29237264756960407 + }, + { + "epoch": 4.278510700674289, + "grad_norm": 0.18322360249718328, + "learning_rate": 0.00046415956848869314, + "loss": 3.0624027252197266, + "step": 7299, + "token_acc": 0.2902337048380333 + }, + { + "epoch": 4.279097038991498, + "grad_norm": 0.253297719892055, + "learning_rate": 0.0004641470666912843, + "loss": 3.070003032684326, + "step": 7300, + "token_acc": 0.2896063627633273 + }, + { + "epoch": 4.279683377308707, + "grad_norm": 0.17833470294833959, + "learning_rate": 0.0004641345628822523, + "loss": 3.0455422401428223, + "step": 7301, + "token_acc": 0.2916853773846705 + }, + { + "epoch": 4.280269715625916, + "grad_norm": 0.2805479688463854, + "learning_rate": 0.00046412205706171475, + "loss": 3.0642621517181396, + "step": 7302, + "token_acc": 0.29094388122367676 + }, + { + "epoch": 4.2808560539431255, + "grad_norm": 0.24582430513601233, + "learning_rate": 0.00046410954922978886, + "loss": 3.051823616027832, + "step": 7303, + "token_acc": 0.2926300925851639 + }, + { + "epoch": 4.281442392260335, + "grad_norm": 0.24180166479787638, + "learning_rate": 0.0004640970393865923, + "loss": 3.0173614025115967, + "step": 7304, + "token_acc": 0.2968326510536482 + }, + { + "epoch": 4.282028730577543, + "grad_norm": 0.23451020379998191, + "learning_rate": 0.00046408452753224263, + "loss": 3.057274580001831, + "step": 7305, + "token_acc": 0.2923452419674082 + }, + { + "epoch": 4.282615068894752, + "grad_norm": 0.1912025874681136, + "learning_rate": 0.00046407201366685725, + "loss": 3.0389208793640137, + "step": 7306, + "token_acc": 0.29443759111808904 + }, + { + "epoch": 4.283201407211961, + "grad_norm": 0.19723245996826194, + "learning_rate": 0.0004640594977905538, + "loss": 2.961979389190674, + "step": 7307, + "token_acc": 0.3041204429362024 + }, + { + "epoch": 4.28378774552917, + "grad_norm": 0.18488005292196993, + "learning_rate": 0.00046404697990344975, + "loss": 3.015009880065918, + "step": 7308, + "token_acc": 0.2982548721543547 + }, + { + "epoch": 4.284374083846379, + "grad_norm": 0.17611423045426525, + "learning_rate": 0.00046403446000566273, + "loss": 3.046461582183838, + "step": 7309, + "token_acc": 0.29312512229182247 + }, + { + "epoch": 4.284960422163588, + "grad_norm": 0.20805243030340356, + "learning_rate": 0.0004640219380973105, + "loss": 3.0603318214416504, + "step": 7310, + "token_acc": 0.2900340411821718 + }, + { + "epoch": 4.2855467604807975, + "grad_norm": 0.17134574390579776, + "learning_rate": 0.0004640094141785105, + "loss": 3.0567173957824707, + "step": 7311, + "token_acc": 0.2927464796094379 + }, + { + "epoch": 4.286133098798007, + "grad_norm": 0.1993838388091723, + "learning_rate": 0.00046399688824938046, + "loss": 3.025550127029419, + "step": 7312, + "token_acc": 0.2964498219609892 + }, + { + "epoch": 4.286719437115216, + "grad_norm": 0.17916713057511413, + "learning_rate": 0.000463984360310038, + "loss": 3.0443339347839355, + "step": 7313, + "token_acc": 0.2928862964476596 + }, + { + "epoch": 4.287305775432425, + "grad_norm": 0.19834512359026277, + "learning_rate": 0.00046397183036060073, + "loss": 2.9818224906921387, + "step": 7314, + "token_acc": 0.302160169987504 + }, + { + "epoch": 4.287892113749633, + "grad_norm": 0.20869768346536827, + "learning_rate": 0.0004639592984011866, + "loss": 3.0265040397644043, + "step": 7315, + "token_acc": 0.2967787499509554 + }, + { + "epoch": 4.288478452066842, + "grad_norm": 0.20297974577995187, + "learning_rate": 0.0004639467644319131, + "loss": 3.04345440864563, + "step": 7316, + "token_acc": 0.2930626048937289 + }, + { + "epoch": 4.289064790384051, + "grad_norm": 0.19962836702406517, + "learning_rate": 0.00046393422845289803, + "loss": 3.066373348236084, + "step": 7317, + "token_acc": 0.29207596767467764 + }, + { + "epoch": 4.28965112870126, + "grad_norm": 0.16984820983701024, + "learning_rate": 0.0004639216904642592, + "loss": 3.055436134338379, + "step": 7318, + "token_acc": 0.2916192038206241 + }, + { + "epoch": 4.2902374670184695, + "grad_norm": 0.1768482049248973, + "learning_rate": 0.0004639091504661143, + "loss": 3.0117740631103516, + "step": 7319, + "token_acc": 0.29911648043707256 + }, + { + "epoch": 4.290823805335679, + "grad_norm": 0.17060784803186016, + "learning_rate": 0.0004638966084585812, + "loss": 3.0297725200653076, + "step": 7320, + "token_acc": 0.2950180001718021 + }, + { + "epoch": 4.291410143652888, + "grad_norm": 0.1726061282150735, + "learning_rate": 0.0004638840644417777, + "loss": 3.042325019836426, + "step": 7321, + "token_acc": 0.2941343895386727 + }, + { + "epoch": 4.291996481970097, + "grad_norm": 0.19375187834065757, + "learning_rate": 0.00046387151841582164, + "loss": 3.041559934616089, + "step": 7322, + "token_acc": 0.29206977095634146 + }, + { + "epoch": 4.292582820287306, + "grad_norm": 0.19113190939270544, + "learning_rate": 0.00046385897038083085, + "loss": 3.053023338317871, + "step": 7323, + "token_acc": 0.2910645939385639 + }, + { + "epoch": 4.293169158604515, + "grad_norm": 0.15489284377181867, + "learning_rate": 0.00046384642033692327, + "loss": 3.041614532470703, + "step": 7324, + "token_acc": 0.2926240639640475 + }, + { + "epoch": 4.293755496921724, + "grad_norm": 0.21746515007652106, + "learning_rate": 0.00046383386828421664, + "loss": 3.053802728652954, + "step": 7325, + "token_acc": 0.2934623613512772 + }, + { + "epoch": 4.294341835238933, + "grad_norm": 0.22551872754465307, + "learning_rate": 0.000463821314222829, + "loss": 3.0038957595825195, + "step": 7326, + "token_acc": 0.29875068727472665 + }, + { + "epoch": 4.2949281735561415, + "grad_norm": 0.1941942717225257, + "learning_rate": 0.0004638087581528783, + "loss": 3.0205600261688232, + "step": 7327, + "token_acc": 0.2966617359406579 + }, + { + "epoch": 4.295514511873351, + "grad_norm": 0.19776246537358266, + "learning_rate": 0.0004637962000744823, + "loss": 3.0114293098449707, + "step": 7328, + "token_acc": 0.29730630850306794 + }, + { + "epoch": 4.29610085019056, + "grad_norm": 0.20152552678980784, + "learning_rate": 0.0004637836399877592, + "loss": 3.028628349304199, + "step": 7329, + "token_acc": 0.2949789348324391 + }, + { + "epoch": 4.296687188507769, + "grad_norm": 0.19760176639140292, + "learning_rate": 0.0004637710778928268, + "loss": 3.0333213806152344, + "step": 7330, + "token_acc": 0.295072634941916 + }, + { + "epoch": 4.297273526824978, + "grad_norm": 0.19297214435437715, + "learning_rate": 0.0004637585137898033, + "loss": 3.0465164184570312, + "step": 7331, + "token_acc": 0.2923753420214185 + }, + { + "epoch": 4.297859865142187, + "grad_norm": 0.25139937937946333, + "learning_rate": 0.0004637459476788065, + "loss": 3.0764904022216797, + "step": 7332, + "token_acc": 0.2906442415196889 + }, + { + "epoch": 4.298446203459396, + "grad_norm": 0.2593749680843242, + "learning_rate": 0.0004637333795599545, + "loss": 3.058875799179077, + "step": 7333, + "token_acc": 0.2911292875715568 + }, + { + "epoch": 4.299032541776605, + "grad_norm": 0.23887168492587066, + "learning_rate": 0.00046372080943336553, + "loss": 3.045907497406006, + "step": 7334, + "token_acc": 0.2941029604721456 + }, + { + "epoch": 4.299618880093814, + "grad_norm": 0.21006450653436493, + "learning_rate": 0.0004637082372991575, + "loss": 3.013392925262451, + "step": 7335, + "token_acc": 0.2981420176046398 + }, + { + "epoch": 4.3002052184110235, + "grad_norm": 0.18718388217301946, + "learning_rate": 0.0004636956631574486, + "loss": 3.0517544746398926, + "step": 7336, + "token_acc": 0.2921343732476933 + }, + { + "epoch": 4.300791556728232, + "grad_norm": 0.20754944183084517, + "learning_rate": 0.00046368308700835684, + "loss": 3.035308361053467, + "step": 7337, + "token_acc": 0.29397371514050646 + }, + { + "epoch": 4.301377895045441, + "grad_norm": 0.19462959098141244, + "learning_rate": 0.0004636705088520005, + "loss": 3.0191650390625, + "step": 7338, + "token_acc": 0.29612204011168186 + }, + { + "epoch": 4.30196423336265, + "grad_norm": 0.17733369515346875, + "learning_rate": 0.00046365792868849755, + "loss": 3.062168836593628, + "step": 7339, + "token_acc": 0.29056886934528336 + }, + { + "epoch": 4.302550571679859, + "grad_norm": 0.18462095112039786, + "learning_rate": 0.0004636453465179663, + "loss": 3.032287120819092, + "step": 7340, + "token_acc": 0.2946809536593694 + }, + { + "epoch": 4.303136909997068, + "grad_norm": 0.1982254727375965, + "learning_rate": 0.0004636327623405249, + "loss": 3.0676932334899902, + "step": 7341, + "token_acc": 0.2918679242807855 + }, + { + "epoch": 4.303723248314277, + "grad_norm": 0.19974278706109944, + "learning_rate": 0.0004636201761562916, + "loss": 3.004348039627075, + "step": 7342, + "token_acc": 0.29733898958735056 + }, + { + "epoch": 4.304309586631486, + "grad_norm": 0.2191078782981199, + "learning_rate": 0.0004636075879653846, + "loss": 3.075151205062866, + "step": 7343, + "token_acc": 0.2892454777208615 + }, + { + "epoch": 4.3048959249486956, + "grad_norm": 0.19359164903270398, + "learning_rate": 0.00046359499776792216, + "loss": 3.0351898670196533, + "step": 7344, + "token_acc": 0.2935231848825711 + }, + { + "epoch": 4.305482263265905, + "grad_norm": 0.1864356484532815, + "learning_rate": 0.00046358240556402256, + "loss": 3.046383857727051, + "step": 7345, + "token_acc": 0.29286576642219303 + }, + { + "epoch": 4.306068601583114, + "grad_norm": 0.19228299001564558, + "learning_rate": 0.000463569811353804, + "loss": 3.0079708099365234, + "step": 7346, + "token_acc": 0.30021297809144487 + }, + { + "epoch": 4.306654939900323, + "grad_norm": 0.19630039623658913, + "learning_rate": 0.0004635572151373849, + "loss": 3.0555596351623535, + "step": 7347, + "token_acc": 0.2900031040493252 + }, + { + "epoch": 4.307241278217531, + "grad_norm": 0.22452998725573733, + "learning_rate": 0.00046354461691488347, + "loss": 2.9841198921203613, + "step": 7348, + "token_acc": 0.3009481720806281 + }, + { + "epoch": 4.30782761653474, + "grad_norm": 0.29677900505742066, + "learning_rate": 0.0004635320166864182, + "loss": 3.0319552421569824, + "step": 7349, + "token_acc": 0.2944271361015214 + }, + { + "epoch": 4.308413954851949, + "grad_norm": 0.3466598447656047, + "learning_rate": 0.0004635194144521073, + "loss": 3.052605628967285, + "step": 7350, + "token_acc": 0.2914837736162137 + }, + { + "epoch": 4.3090002931691584, + "grad_norm": 0.230542325421348, + "learning_rate": 0.0004635068102120693, + "loss": 3.0765819549560547, + "step": 7351, + "token_acc": 0.2886258562680028 + }, + { + "epoch": 4.309586631486368, + "grad_norm": 0.22935980257355698, + "learning_rate": 0.0004634942039664225, + "loss": 3.0282375812530518, + "step": 7352, + "token_acc": 0.29517580751582795 + }, + { + "epoch": 4.310172969803577, + "grad_norm": 0.2844022599781738, + "learning_rate": 0.00046348159571528533, + "loss": 3.007565975189209, + "step": 7353, + "token_acc": 0.2993737269407518 + }, + { + "epoch": 4.310759308120786, + "grad_norm": 0.1703867614221709, + "learning_rate": 0.00046346898545877624, + "loss": 3.01338267326355, + "step": 7354, + "token_acc": 0.2973343916832808 + }, + { + "epoch": 4.311345646437995, + "grad_norm": 0.2621886210881118, + "learning_rate": 0.00046345637319701366, + "loss": 3.0332469940185547, + "step": 7355, + "token_acc": 0.294285767163453 + }, + { + "epoch": 4.311931984755204, + "grad_norm": 0.22373239822929716, + "learning_rate": 0.00046344375893011614, + "loss": 3.008401393890381, + "step": 7356, + "token_acc": 0.2988404858627301 + }, + { + "epoch": 4.312518323072413, + "grad_norm": 0.2872764483225045, + "learning_rate": 0.0004634311426582021, + "loss": 3.005619764328003, + "step": 7357, + "token_acc": 0.2985731498850093 + }, + { + "epoch": 4.313104661389621, + "grad_norm": 0.19039720073001115, + "learning_rate": 0.00046341852438139004, + "loss": 3.0463571548461914, + "step": 7358, + "token_acc": 0.2945624043034647 + }, + { + "epoch": 4.3136909997068305, + "grad_norm": 0.2408970839291239, + "learning_rate": 0.00046340590409979857, + "loss": 3.0570287704467773, + "step": 7359, + "token_acc": 0.28995665263136117 + }, + { + "epoch": 4.31427733802404, + "grad_norm": 0.24425054183646672, + "learning_rate": 0.00046339328181354617, + "loss": 3.0411548614501953, + "step": 7360, + "token_acc": 0.2918149558437384 + }, + { + "epoch": 4.314863676341249, + "grad_norm": 0.19640617824775172, + "learning_rate": 0.00046338065752275147, + "loss": 3.05983304977417, + "step": 7361, + "token_acc": 0.29158017677089654 + }, + { + "epoch": 4.315450014658458, + "grad_norm": 0.21490540395242239, + "learning_rate": 0.000463368031227533, + "loss": 3.0167531967163086, + "step": 7362, + "token_acc": 0.2981231046635522 + }, + { + "epoch": 4.316036352975667, + "grad_norm": 0.25830799264201687, + "learning_rate": 0.00046335540292800936, + "loss": 3.0580382347106934, + "step": 7363, + "token_acc": 0.2912083976195724 + }, + { + "epoch": 4.316622691292876, + "grad_norm": 0.21051107765455762, + "learning_rate": 0.0004633427726242993, + "loss": 3.011385917663574, + "step": 7364, + "token_acc": 0.29773547050246135 + }, + { + "epoch": 4.317209029610085, + "grad_norm": 0.23722867947038911, + "learning_rate": 0.00046333014031652134, + "loss": 3.0127954483032227, + "step": 7365, + "token_acc": 0.2972987145100692 + }, + { + "epoch": 4.317795367927294, + "grad_norm": 0.17873039234554378, + "learning_rate": 0.00046331750600479415, + "loss": 3.034630537033081, + "step": 7366, + "token_acc": 0.2970629736085707 + }, + { + "epoch": 4.318381706244503, + "grad_norm": 0.24504705505039803, + "learning_rate": 0.0004633048696892365, + "loss": 3.0660440921783447, + "step": 7367, + "token_acc": 0.29079199646035103 + }, + { + "epoch": 4.3189680445617125, + "grad_norm": 0.18589682580352346, + "learning_rate": 0.00046329223136996694, + "loss": 3.082886219024658, + "step": 7368, + "token_acc": 0.28787691125507753 + }, + { + "epoch": 4.319554382878922, + "grad_norm": 0.20957307702994904, + "learning_rate": 0.00046327959104710436, + "loss": 3.0548925399780273, + "step": 7369, + "token_acc": 0.2927629613271152 + }, + { + "epoch": 4.32014072119613, + "grad_norm": 0.18438467602863795, + "learning_rate": 0.00046326694872076736, + "loss": 3.0427393913269043, + "step": 7370, + "token_acc": 0.2932563290421583 + }, + { + "epoch": 4.320727059513339, + "grad_norm": 0.23334771636014218, + "learning_rate": 0.0004632543043910748, + "loss": 3.053088665008545, + "step": 7371, + "token_acc": 0.2922609655229314 + }, + { + "epoch": 4.321313397830548, + "grad_norm": 0.17994176066788198, + "learning_rate": 0.0004632416580581454, + "loss": 3.0342416763305664, + "step": 7372, + "token_acc": 0.2960781853565298 + }, + { + "epoch": 4.321899736147757, + "grad_norm": 0.2136935670673401, + "learning_rate": 0.00046322900972209797, + "loss": 3.0433990955352783, + "step": 7373, + "token_acc": 0.29527102516719 + }, + { + "epoch": 4.322486074464966, + "grad_norm": 0.19449680227748337, + "learning_rate": 0.00046321635938305133, + "loss": 2.990283489227295, + "step": 7374, + "token_acc": 0.30067372075721255 + }, + { + "epoch": 4.323072412782175, + "grad_norm": 0.1959037126312654, + "learning_rate": 0.0004632037070411243, + "loss": 3.0249247550964355, + "step": 7375, + "token_acc": 0.29758399767781796 + }, + { + "epoch": 4.3236587510993845, + "grad_norm": 0.18869596898559512, + "learning_rate": 0.00046319105269643576, + "loss": 3.0392751693725586, + "step": 7376, + "token_acc": 0.2941637705049543 + }, + { + "epoch": 4.324245089416594, + "grad_norm": 0.18300549749946282, + "learning_rate": 0.00046317839634910454, + "loss": 2.992873191833496, + "step": 7377, + "token_acc": 0.30137012556746307 + }, + { + "epoch": 4.324831427733803, + "grad_norm": 0.1891110642875182, + "learning_rate": 0.0004631657379992495, + "loss": 3.002100944519043, + "step": 7378, + "token_acc": 0.2999768011754071 + }, + { + "epoch": 4.325417766051012, + "grad_norm": 0.21096247846318414, + "learning_rate": 0.00046315307764698964, + "loss": 3.0342583656311035, + "step": 7379, + "token_acc": 0.2948329995477744 + }, + { + "epoch": 4.32600410436822, + "grad_norm": 0.175912505571332, + "learning_rate": 0.0004631404152924439, + "loss": 3.012756824493408, + "step": 7380, + "token_acc": 0.2983351757456652 + }, + { + "epoch": 4.326590442685429, + "grad_norm": 0.1978897737387396, + "learning_rate": 0.0004631277509357311, + "loss": 3.0481491088867188, + "step": 7381, + "token_acc": 0.29259827502623686 + }, + { + "epoch": 4.327176781002638, + "grad_norm": 0.1823141016796918, + "learning_rate": 0.00046311508457697026, + "loss": 3.0035839080810547, + "step": 7382, + "token_acc": 0.29976809754864403 + }, + { + "epoch": 4.327763119319847, + "grad_norm": 0.20311035923879286, + "learning_rate": 0.00046310241621628037, + "loss": 3.037753105163574, + "step": 7383, + "token_acc": 0.2949219308251765 + }, + { + "epoch": 4.3283494576370565, + "grad_norm": 0.1717274841777956, + "learning_rate": 0.0004630897458537805, + "loss": 3.05173659324646, + "step": 7384, + "token_acc": 0.2918921656768769 + }, + { + "epoch": 4.328935795954266, + "grad_norm": 0.20656958757902574, + "learning_rate": 0.00046307707348958963, + "loss": 3.0078141689300537, + "step": 7385, + "token_acc": 0.2998708157801897 + }, + { + "epoch": 4.329522134271475, + "grad_norm": 0.19851052682711015, + "learning_rate": 0.0004630643991238267, + "loss": 3.0546536445617676, + "step": 7386, + "token_acc": 0.29180702400142156 + }, + { + "epoch": 4.330108472588684, + "grad_norm": 0.20104521123418023, + "learning_rate": 0.0004630517227566108, + "loss": 3.027625560760498, + "step": 7387, + "token_acc": 0.2946310267609235 + }, + { + "epoch": 4.330694810905893, + "grad_norm": 0.22656090493990086, + "learning_rate": 0.0004630390443880612, + "loss": 2.994175434112549, + "step": 7388, + "token_acc": 0.30018326663425215 + }, + { + "epoch": 4.331281149223102, + "grad_norm": 0.21753634825306367, + "learning_rate": 0.0004630263640182968, + "loss": 3.0565760135650635, + "step": 7389, + "token_acc": 0.292736252673965 + }, + { + "epoch": 4.331867487540311, + "grad_norm": 0.22480229454816908, + "learning_rate": 0.00046301368164743676, + "loss": 3.0724668502807617, + "step": 7390, + "token_acc": 0.2897222995527881 + }, + { + "epoch": 4.33245382585752, + "grad_norm": 0.20059961649897526, + "learning_rate": 0.0004630009972756002, + "loss": 3.069899320602417, + "step": 7391, + "token_acc": 0.2890455707722978 + }, + { + "epoch": 4.3330401641747285, + "grad_norm": 0.21102671325181863, + "learning_rate": 0.0004629883109029063, + "loss": 3.0591177940368652, + "step": 7392, + "token_acc": 0.2909188363896243 + }, + { + "epoch": 4.333626502491938, + "grad_norm": 0.2619960559431775, + "learning_rate": 0.0004629756225294742, + "loss": 3.007251262664795, + "step": 7393, + "token_acc": 0.30001754488566584 + }, + { + "epoch": 4.334212840809147, + "grad_norm": 0.2717205633323569, + "learning_rate": 0.0004629629321554232, + "loss": 3.0278992652893066, + "step": 7394, + "token_acc": 0.2954074799093344 + }, + { + "epoch": 4.334799179126356, + "grad_norm": 0.22727725778816382, + "learning_rate": 0.0004629502397808723, + "loss": 3.0420618057250977, + "step": 7395, + "token_acc": 0.2942933704860016 + }, + { + "epoch": 4.335385517443565, + "grad_norm": 0.1982573513698482, + "learning_rate": 0.0004629375454059409, + "loss": 3.023836135864258, + "step": 7396, + "token_acc": 0.2976283561815654 + }, + { + "epoch": 4.335971855760774, + "grad_norm": 0.2042734423495468, + "learning_rate": 0.0004629248490307483, + "loss": 3.055373191833496, + "step": 7397, + "token_acc": 0.2931242015987779 + }, + { + "epoch": 4.336558194077983, + "grad_norm": 0.23162282968783332, + "learning_rate": 0.00046291215065541355, + "loss": 3.0937328338623047, + "step": 7398, + "token_acc": 0.287022644204512 + }, + { + "epoch": 4.337144532395192, + "grad_norm": 0.1975061969905795, + "learning_rate": 0.0004628994502800561, + "loss": 3.0817885398864746, + "step": 7399, + "token_acc": 0.28821430236800166 + }, + { + "epoch": 4.337730870712401, + "grad_norm": 0.19797168246825958, + "learning_rate": 0.00046288674790479514, + "loss": 3.0889339447021484, + "step": 7400, + "token_acc": 0.2879575763484893 + }, + { + "epoch": 4.3383172090296105, + "grad_norm": 0.266195047381109, + "learning_rate": 0.0004628740435297501, + "loss": 3.0350232124328613, + "step": 7401, + "token_acc": 0.2952281801399562 + }, + { + "epoch": 4.338903547346819, + "grad_norm": 0.24735116569942564, + "learning_rate": 0.0004628613371550402, + "loss": 3.0221309661865234, + "step": 7402, + "token_acc": 0.2957843290861582 + }, + { + "epoch": 4.339489885664028, + "grad_norm": 0.17618476439583544, + "learning_rate": 0.000462848628780785, + "loss": 3.0492477416992188, + "step": 7403, + "token_acc": 0.2930530102402284 + }, + { + "epoch": 4.340076223981237, + "grad_norm": 0.2135099098245777, + "learning_rate": 0.00046283591840710366, + "loss": 3.036322593688965, + "step": 7404, + "token_acc": 0.2939802869233151 + }, + { + "epoch": 4.340662562298446, + "grad_norm": 0.19537418433865283, + "learning_rate": 0.00046282320603411567, + "loss": 3.020264148712158, + "step": 7405, + "token_acc": 0.29657603222557904 + }, + { + "epoch": 4.341248900615655, + "grad_norm": 0.18830810334279963, + "learning_rate": 0.0004628104916619405, + "loss": 3.103588581085205, + "step": 7406, + "token_acc": 0.2872384531719452 + }, + { + "epoch": 4.341835238932864, + "grad_norm": 0.20806976663294371, + "learning_rate": 0.00046279777529069745, + "loss": 3.0320184230804443, + "step": 7407, + "token_acc": 0.294376162560944 + }, + { + "epoch": 4.342421577250073, + "grad_norm": 0.19028785166870088, + "learning_rate": 0.0004627850569205061, + "loss": 3.0100531578063965, + "step": 7408, + "token_acc": 0.2992730782289737 + }, + { + "epoch": 4.3430079155672825, + "grad_norm": 0.19171124445599927, + "learning_rate": 0.00046277233655148587, + "loss": 3.0491299629211426, + "step": 7409, + "token_acc": 0.29164931945556444 + }, + { + "epoch": 4.343594253884492, + "grad_norm": 0.2702892482098735, + "learning_rate": 0.00046275961418375624, + "loss": 3.0463979244232178, + "step": 7410, + "token_acc": 0.292536013175835 + }, + { + "epoch": 4.344180592201701, + "grad_norm": 0.23284447504153385, + "learning_rate": 0.00046274688981743674, + "loss": 2.990690231323242, + "step": 7411, + "token_acc": 0.30113254328458566 + }, + { + "epoch": 4.34476693051891, + "grad_norm": 0.18645159730963004, + "learning_rate": 0.00046273416345264684, + "loss": 3.044921398162842, + "step": 7412, + "token_acc": 0.2930963528927609 + }, + { + "epoch": 4.345353268836118, + "grad_norm": 0.20587907028872307, + "learning_rate": 0.0004627214350895063, + "loss": 3.034092903137207, + "step": 7413, + "token_acc": 0.29246645308651487 + }, + { + "epoch": 4.345939607153327, + "grad_norm": 0.21073741733558052, + "learning_rate": 0.0004627087047281343, + "loss": 3.0388731956481934, + "step": 7414, + "token_acc": 0.29396236386433583 + }, + { + "epoch": 4.346525945470536, + "grad_norm": 0.18052979202304362, + "learning_rate": 0.0004626959723686508, + "loss": 3.0058445930480957, + "step": 7415, + "token_acc": 0.29808360510240467 + }, + { + "epoch": 4.347112283787745, + "grad_norm": 0.17053592469449644, + "learning_rate": 0.0004626832380111752, + "loss": 3.02262282371521, + "step": 7416, + "token_acc": 0.2960323123057195 + }, + { + "epoch": 4.3476986221049545, + "grad_norm": 0.16604459574535513, + "learning_rate": 0.00046267050165582716, + "loss": 2.993769884109497, + "step": 7417, + "token_acc": 0.3013771186440678 + }, + { + "epoch": 4.348284960422164, + "grad_norm": 0.1826125976318458, + "learning_rate": 0.00046265776330272636, + "loss": 2.9973368644714355, + "step": 7418, + "token_acc": 0.3001957862917323 + }, + { + "epoch": 4.348871298739373, + "grad_norm": 0.18444829239719415, + "learning_rate": 0.0004626450229519924, + "loss": 3.031925916671753, + "step": 7419, + "token_acc": 0.2959684655987117 + }, + { + "epoch": 4.349457637056582, + "grad_norm": 0.16675363910608254, + "learning_rate": 0.00046263228060374503, + "loss": 3.0233850479125977, + "step": 7420, + "token_acc": 0.29627331979968796 + }, + { + "epoch": 4.350043975373791, + "grad_norm": 0.1659424896027177, + "learning_rate": 0.0004626195362581039, + "loss": 3.0460872650146484, + "step": 7421, + "token_acc": 0.2947687975007161 + }, + { + "epoch": 4.350630313691, + "grad_norm": 0.15873325434938698, + "learning_rate": 0.0004626067899151887, + "loss": 3.0167508125305176, + "step": 7422, + "token_acc": 0.2968006566913095 + }, + { + "epoch": 4.351216652008208, + "grad_norm": 0.1681809321458993, + "learning_rate": 0.00046259404157511925, + "loss": 3.0244288444519043, + "step": 7423, + "token_acc": 0.2971358285011422 + }, + { + "epoch": 4.351802990325417, + "grad_norm": 0.18082993369166306, + "learning_rate": 0.00046258129123801525, + "loss": 3.069662570953369, + "step": 7424, + "token_acc": 0.28891627922280066 + }, + { + "epoch": 4.3523893286426265, + "grad_norm": 0.18903947782295186, + "learning_rate": 0.0004625685389039964, + "loss": 3.0051651000976562, + "step": 7425, + "token_acc": 0.29768696913729503 + }, + { + "epoch": 4.352975666959836, + "grad_norm": 0.25331623215662735, + "learning_rate": 0.0004625557845731827, + "loss": 3.0654406547546387, + "step": 7426, + "token_acc": 0.2917292079524388 + }, + { + "epoch": 4.353562005277045, + "grad_norm": 0.3447159936341741, + "learning_rate": 0.0004625430282456937, + "loss": 3.027750015258789, + "step": 7427, + "token_acc": 0.29675155461130187 + }, + { + "epoch": 4.354148343594254, + "grad_norm": 0.2704918855141654, + "learning_rate": 0.0004625302699216495, + "loss": 3.0281574726104736, + "step": 7428, + "token_acc": 0.297872684929067 + }, + { + "epoch": 4.354734681911463, + "grad_norm": 0.18803840869986135, + "learning_rate": 0.00046251750960116966, + "loss": 3.0601987838745117, + "step": 7429, + "token_acc": 0.29000721807065216 + }, + { + "epoch": 4.355321020228672, + "grad_norm": 0.2304811079669045, + "learning_rate": 0.0004625047472843742, + "loss": 2.998924970626831, + "step": 7430, + "token_acc": 0.3007969128420545 + }, + { + "epoch": 4.355907358545881, + "grad_norm": 0.17825665131839397, + "learning_rate": 0.00046249198297138304, + "loss": 3.0382819175720215, + "step": 7431, + "token_acc": 0.2959405689358233 + }, + { + "epoch": 4.35649369686309, + "grad_norm": 0.19885623527954244, + "learning_rate": 0.0004624792166623161, + "loss": 3.0109152793884277, + "step": 7432, + "token_acc": 0.2982725635401621 + }, + { + "epoch": 4.357080035180299, + "grad_norm": 0.20673945253928655, + "learning_rate": 0.0004624664483572931, + "loss": 3.0245442390441895, + "step": 7433, + "token_acc": 0.2964558277060774 + }, + { + "epoch": 4.3576663734975085, + "grad_norm": 0.188909268992054, + "learning_rate": 0.0004624536780564342, + "loss": 2.9925849437713623, + "step": 7434, + "token_acc": 0.3013480222113429 + }, + { + "epoch": 4.358252711814717, + "grad_norm": 0.2062115653746405, + "learning_rate": 0.0004624409057598593, + "loss": 3.0509238243103027, + "step": 7435, + "token_acc": 0.29182644001066294 + }, + { + "epoch": 4.358839050131926, + "grad_norm": 0.21702497087612563, + "learning_rate": 0.00046242813146768827, + "loss": 3.063058376312256, + "step": 7436, + "token_acc": 0.29087926938250613 + }, + { + "epoch": 4.359425388449135, + "grad_norm": 0.23843733211278406, + "learning_rate": 0.0004624153551800413, + "loss": 3.0219860076904297, + "step": 7437, + "token_acc": 0.2968000831147243 + }, + { + "epoch": 4.360011726766344, + "grad_norm": 0.18915393427127203, + "learning_rate": 0.0004624025768970382, + "loss": 2.9877147674560547, + "step": 7438, + "token_acc": 0.3002915679384508 + }, + { + "epoch": 4.360598065083553, + "grad_norm": 0.2041851067242998, + "learning_rate": 0.0004623897966187992, + "loss": 3.0817365646362305, + "step": 7439, + "token_acc": 0.2885972888732372 + }, + { + "epoch": 4.361184403400762, + "grad_norm": 0.244161759423216, + "learning_rate": 0.0004623770143454442, + "loss": 3.002307653427124, + "step": 7440, + "token_acc": 0.29759288747346074 + }, + { + "epoch": 4.361770741717971, + "grad_norm": 0.20037149822585196, + "learning_rate": 0.00046236423007709333, + "loss": 3.0436453819274902, + "step": 7441, + "token_acc": 0.2933716033432103 + }, + { + "epoch": 4.3623570800351805, + "grad_norm": 0.19635094778165368, + "learning_rate": 0.00046235144381386674, + "loss": 3.0295345783233643, + "step": 7442, + "token_acc": 0.2978717274470158 + }, + { + "epoch": 4.36294341835239, + "grad_norm": 0.19633615469518834, + "learning_rate": 0.0004623386555558844, + "loss": 3.0125646591186523, + "step": 7443, + "token_acc": 0.29774658486365585 + }, + { + "epoch": 4.363529756669599, + "grad_norm": 0.17908632697662138, + "learning_rate": 0.00046232586530326657, + "loss": 3.0864644050598145, + "step": 7444, + "token_acc": 0.2889055066745159 + }, + { + "epoch": 4.364116094986807, + "grad_norm": 0.19821601778191889, + "learning_rate": 0.00046231307305613336, + "loss": 3.0216212272644043, + "step": 7445, + "token_acc": 0.2968090271913528 + }, + { + "epoch": 4.364702433304016, + "grad_norm": 0.19244222399770558, + "learning_rate": 0.0004623002788146049, + "loss": 3.0228304862976074, + "step": 7446, + "token_acc": 0.29487746145838617 + }, + { + "epoch": 4.365288771621225, + "grad_norm": 0.1850902811456137, + "learning_rate": 0.0004622874825788014, + "loss": 3.024705410003662, + "step": 7447, + "token_acc": 0.29581152526202814 + }, + { + "epoch": 4.365875109938434, + "grad_norm": 0.21606500966413003, + "learning_rate": 0.00046227468434884304, + "loss": 3.0333304405212402, + "step": 7448, + "token_acc": 0.2944822680621888 + }, + { + "epoch": 4.366461448255643, + "grad_norm": 0.2377340161700647, + "learning_rate": 0.00046226188412485015, + "loss": 3.014246940612793, + "step": 7449, + "token_acc": 0.2954652280739237 + }, + { + "epoch": 4.3670477865728525, + "grad_norm": 0.2259790114221647, + "learning_rate": 0.0004622490819069428, + "loss": 3.0706403255462646, + "step": 7450, + "token_acc": 0.28907210824767204 + }, + { + "epoch": 4.367634124890062, + "grad_norm": 0.1952300272455978, + "learning_rate": 0.00046223627769524135, + "loss": 3.0077767372131348, + "step": 7451, + "token_acc": 0.2980014872413731 + }, + { + "epoch": 4.368220463207271, + "grad_norm": 0.20735602647882279, + "learning_rate": 0.0004622234714898661, + "loss": 3.0574991703033447, + "step": 7452, + "token_acc": 0.291812599751729 + }, + { + "epoch": 4.36880680152448, + "grad_norm": 0.24411749543127234, + "learning_rate": 0.0004622106632909373, + "loss": 3.023402214050293, + "step": 7453, + "token_acc": 0.29814502087996964 + }, + { + "epoch": 4.369393139841689, + "grad_norm": 0.22436214858677092, + "learning_rate": 0.0004621978530985752, + "loss": 2.9573724269866943, + "step": 7454, + "token_acc": 0.3053596688845811 + }, + { + "epoch": 4.369979478158898, + "grad_norm": 0.17339411378137617, + "learning_rate": 0.00046218504091290023, + "loss": 3.0339279174804688, + "step": 7455, + "token_acc": 0.2939892686429952 + }, + { + "epoch": 4.370565816476106, + "grad_norm": 0.28667152284942843, + "learning_rate": 0.0004621722267340328, + "loss": 3.0635414123535156, + "step": 7456, + "token_acc": 0.2904077179884861 + }, + { + "epoch": 4.371152154793315, + "grad_norm": 0.26881350239032703, + "learning_rate": 0.0004621594105620932, + "loss": 3.018627882003784, + "step": 7457, + "token_acc": 0.2958344190894539 + }, + { + "epoch": 4.3717384931105245, + "grad_norm": 0.17669170233468612, + "learning_rate": 0.0004621465923972018, + "loss": 3.04150390625, + "step": 7458, + "token_acc": 0.29517203672832726 + }, + { + "epoch": 4.372324831427734, + "grad_norm": 0.22400859644971893, + "learning_rate": 0.00046213377223947906, + "loss": 3.0622315406799316, + "step": 7459, + "token_acc": 0.2910539455457555 + }, + { + "epoch": 4.372911169744943, + "grad_norm": 0.2302023986970875, + "learning_rate": 0.0004621209500890453, + "loss": 3.0642857551574707, + "step": 7460, + "token_acc": 0.29130354114348284 + }, + { + "epoch": 4.373497508062152, + "grad_norm": 0.20886895596562155, + "learning_rate": 0.00046210812594602116, + "loss": 3.043332099914551, + "step": 7461, + "token_acc": 0.29435710131915893 + }, + { + "epoch": 4.374083846379361, + "grad_norm": 0.17952397745575238, + "learning_rate": 0.00046209529981052687, + "loss": 3.0028300285339355, + "step": 7462, + "token_acc": 0.29915414463779044 + }, + { + "epoch": 4.37467018469657, + "grad_norm": 0.20580491871342954, + "learning_rate": 0.00046208247168268314, + "loss": 3.048549175262451, + "step": 7463, + "token_acc": 0.29231760632786635 + }, + { + "epoch": 4.375256523013779, + "grad_norm": 0.20089238126024245, + "learning_rate": 0.00046206964156261034, + "loss": 3.042198896408081, + "step": 7464, + "token_acc": 0.29449159353200244 + }, + { + "epoch": 4.375842861330988, + "grad_norm": 0.2441780075199423, + "learning_rate": 0.00046205680945042907, + "loss": 3.0827395915985107, + "step": 7465, + "token_acc": 0.28691553534548925 + }, + { + "epoch": 4.3764291996481965, + "grad_norm": 0.22016799191874425, + "learning_rate": 0.00046204397534625974, + "loss": 2.9968390464782715, + "step": 7466, + "token_acc": 0.2996711288638453 + }, + { + "epoch": 4.377015537965406, + "grad_norm": 0.20222940892897134, + "learning_rate": 0.0004620311392502231, + "loss": 3.054934024810791, + "step": 7467, + "token_acc": 0.29077670028212205 + }, + { + "epoch": 4.377601876282615, + "grad_norm": 0.2717866536463735, + "learning_rate": 0.0004620183011624395, + "loss": 3.0090277194976807, + "step": 7468, + "token_acc": 0.2979135837505828 + }, + { + "epoch": 4.378188214599824, + "grad_norm": 0.247079871140854, + "learning_rate": 0.00046200546108302975, + "loss": 3.018577814102173, + "step": 7469, + "token_acc": 0.29660258376622395 + }, + { + "epoch": 4.378774552917033, + "grad_norm": 0.22053513633342253, + "learning_rate": 0.0004619926190121143, + "loss": 3.0542314052581787, + "step": 7470, + "token_acc": 0.2919831179549325 + }, + { + "epoch": 4.379360891234242, + "grad_norm": 0.25853138927987507, + "learning_rate": 0.0004619797749498139, + "loss": 3.0643656253814697, + "step": 7471, + "token_acc": 0.29062110695853355 + }, + { + "epoch": 4.379947229551451, + "grad_norm": 0.19452490485520127, + "learning_rate": 0.00046196692889624927, + "loss": 3.0672216415405273, + "step": 7472, + "token_acc": 0.2914582418062375 + }, + { + "epoch": 4.38053356786866, + "grad_norm": 0.2704189465219825, + "learning_rate": 0.0004619540808515408, + "loss": 3.043999195098877, + "step": 7473, + "token_acc": 0.29353675406401675 + }, + { + "epoch": 4.381119906185869, + "grad_norm": 0.23342420395324937, + "learning_rate": 0.0004619412308158094, + "loss": 3.0202512741088867, + "step": 7474, + "token_acc": 0.2971954674220963 + }, + { + "epoch": 4.3817062445030786, + "grad_norm": 0.22168594754530893, + "learning_rate": 0.0004619283787891758, + "loss": 3.03303861618042, + "step": 7475, + "token_acc": 0.29470362644077797 + }, + { + "epoch": 4.382292582820288, + "grad_norm": 0.24261362989054872, + "learning_rate": 0.0004619155247717606, + "loss": 3.0300254821777344, + "step": 7476, + "token_acc": 0.29696056423361633 + }, + { + "epoch": 4.382878921137497, + "grad_norm": 0.19319289097640557, + "learning_rate": 0.0004619026687636846, + "loss": 3.0648913383483887, + "step": 7477, + "token_acc": 0.29075852580892664 + }, + { + "epoch": 4.383465259454705, + "grad_norm": 0.26043536195155187, + "learning_rate": 0.0004618898107650686, + "loss": 3.029869556427002, + "step": 7478, + "token_acc": 0.29468627576905787 + }, + { + "epoch": 4.384051597771914, + "grad_norm": 0.1862942816372656, + "learning_rate": 0.0004618769507760333, + "loss": 3.05059814453125, + "step": 7479, + "token_acc": 0.29489964446397043 + }, + { + "epoch": 4.384637936089123, + "grad_norm": 0.24064883855313304, + "learning_rate": 0.00046186408879669963, + "loss": 3.0370357036590576, + "step": 7480, + "token_acc": 0.2941743837327585 + }, + { + "epoch": 4.385224274406332, + "grad_norm": 0.1964551663450361, + "learning_rate": 0.00046185122482718827, + "loss": 3.0265300273895264, + "step": 7481, + "token_acc": 0.29669174971188395 + }, + { + "epoch": 4.3858106127235414, + "grad_norm": 0.1938626907250114, + "learning_rate": 0.00046183835886762014, + "loss": 3.0124495029449463, + "step": 7482, + "token_acc": 0.29618153533085756 + }, + { + "epoch": 4.386396951040751, + "grad_norm": 0.18284241318335548, + "learning_rate": 0.00046182549091811607, + "loss": 3.065272092819214, + "step": 7483, + "token_acc": 0.29051542173364137 + }, + { + "epoch": 4.38698328935796, + "grad_norm": 0.20076509633656564, + "learning_rate": 0.000461812620978797, + "loss": 3.042503833770752, + "step": 7484, + "token_acc": 0.29422638550650887 + }, + { + "epoch": 4.387569627675169, + "grad_norm": 0.1906941828565981, + "learning_rate": 0.0004617997490497838, + "loss": 3.0366454124450684, + "step": 7485, + "token_acc": 0.2940044810654047 + }, + { + "epoch": 4.388155965992378, + "grad_norm": 0.18685972866946116, + "learning_rate": 0.0004617868751311973, + "loss": 3.038287401199341, + "step": 7486, + "token_acc": 0.29333488056467216 + }, + { + "epoch": 4.388742304309587, + "grad_norm": 0.2183809187724572, + "learning_rate": 0.0004617739992231585, + "loss": 3.0009188652038574, + "step": 7487, + "token_acc": 0.29914816182288095 + }, + { + "epoch": 4.389328642626795, + "grad_norm": 0.17034879116128313, + "learning_rate": 0.0004617611213257883, + "loss": 3.029564619064331, + "step": 7488, + "token_acc": 0.2942860367524086 + }, + { + "epoch": 4.389914980944004, + "grad_norm": 0.1900525296614871, + "learning_rate": 0.0004617482414392078, + "loss": 3.0664420127868652, + "step": 7489, + "token_acc": 0.2895762292700927 + }, + { + "epoch": 4.3905013192612135, + "grad_norm": 0.1926831749226504, + "learning_rate": 0.0004617353595635379, + "loss": 3.0137276649475098, + "step": 7490, + "token_acc": 0.29782754920233934 + }, + { + "epoch": 4.391087657578423, + "grad_norm": 0.1909624701739409, + "learning_rate": 0.00046172247569889956, + "loss": 3.0185084342956543, + "step": 7491, + "token_acc": 0.2971992022815574 + }, + { + "epoch": 4.391673995895632, + "grad_norm": 0.15874226550319134, + "learning_rate": 0.00046170958984541387, + "loss": 3.038285732269287, + "step": 7492, + "token_acc": 0.2934004112618496 + }, + { + "epoch": 4.392260334212841, + "grad_norm": 0.2085482013322463, + "learning_rate": 0.000461696702003202, + "loss": 3.0913572311401367, + "step": 7493, + "token_acc": 0.2875825117420556 + }, + { + "epoch": 4.39284667253005, + "grad_norm": 0.28067362998581946, + "learning_rate": 0.00046168381217238474, + "loss": 3.044562816619873, + "step": 7494, + "token_acc": 0.2949803147068606 + }, + { + "epoch": 4.393433010847259, + "grad_norm": 0.3662581477477971, + "learning_rate": 0.0004616709203530833, + "loss": 3.055656909942627, + "step": 7495, + "token_acc": 0.2920879898332176 + }, + { + "epoch": 4.394019349164468, + "grad_norm": 0.24937378478764022, + "learning_rate": 0.0004616580265454189, + "loss": 3.0151169300079346, + "step": 7496, + "token_acc": 0.2976364085218582 + }, + { + "epoch": 4.394605687481677, + "grad_norm": 0.20363295757192482, + "learning_rate": 0.0004616451307495124, + "loss": 3.104248046875, + "step": 7497, + "token_acc": 0.2856949570827064 + }, + { + "epoch": 4.395192025798886, + "grad_norm": 0.21018321343781718, + "learning_rate": 0.00046163223296548524, + "loss": 3.020953893661499, + "step": 7498, + "token_acc": 0.2963507737954072 + }, + { + "epoch": 4.395778364116095, + "grad_norm": 0.17920563235808587, + "learning_rate": 0.00046161933319345836, + "loss": 3.02058744430542, + "step": 7499, + "token_acc": 0.2973521252610471 + }, + { + "epoch": 4.396364702433304, + "grad_norm": 0.22142518460442173, + "learning_rate": 0.000461606431433553, + "loss": 3.038877487182617, + "step": 7500, + "token_acc": 0.2945777851551761 + }, + { + "epoch": 4.396951040750513, + "grad_norm": 0.2019659188045671, + "learning_rate": 0.00046159352768589037, + "loss": 3.028102159500122, + "step": 7501, + "token_acc": 0.29525295177644495 + }, + { + "epoch": 4.397537379067722, + "grad_norm": 0.1917446154424647, + "learning_rate": 0.0004615806219505917, + "loss": 3.0374135971069336, + "step": 7502, + "token_acc": 0.2940361388905192 + }, + { + "epoch": 4.398123717384931, + "grad_norm": 0.19446824834025309, + "learning_rate": 0.0004615677142277782, + "loss": 3.0670928955078125, + "step": 7503, + "token_acc": 0.2897940074906367 + }, + { + "epoch": 4.39871005570214, + "grad_norm": 0.23448785003383812, + "learning_rate": 0.00046155480451757103, + "loss": 3.0418567657470703, + "step": 7504, + "token_acc": 0.29472074244014984 + }, + { + "epoch": 4.399296394019349, + "grad_norm": 0.1604750626715864, + "learning_rate": 0.0004615418928200916, + "loss": 3.0699028968811035, + "step": 7505, + "token_acc": 0.2888524976746014 + }, + { + "epoch": 4.399882732336558, + "grad_norm": 0.19260752813030613, + "learning_rate": 0.00046152897913546114, + "loss": 3.001244306564331, + "step": 7506, + "token_acc": 0.30016563190682416 + }, + { + "epoch": 4.4004690706537675, + "grad_norm": 0.1809449615076826, + "learning_rate": 0.000461516063463801, + "loss": 3.053567409515381, + "step": 7507, + "token_acc": 0.2920464276774304 + }, + { + "epoch": 4.401055408970977, + "grad_norm": 0.19050727131520465, + "learning_rate": 0.00046150314580523246, + "loss": 3.034363269805908, + "step": 7508, + "token_acc": 0.29392466780111276 + }, + { + "epoch": 4.401641747288186, + "grad_norm": 0.1681378730789792, + "learning_rate": 0.0004614902261598768, + "loss": 3.0069069862365723, + "step": 7509, + "token_acc": 0.29926307894743864 + }, + { + "epoch": 4.402228085605394, + "grad_norm": 0.17451969246407623, + "learning_rate": 0.00046147730452785553, + "loss": 3.067537307739258, + "step": 7510, + "token_acc": 0.28939893922592885 + }, + { + "epoch": 4.402814423922603, + "grad_norm": 0.17058777414392742, + "learning_rate": 0.00046146438090928987, + "loss": 3.073115348815918, + "step": 7511, + "token_acc": 0.29044533049411153 + }, + { + "epoch": 4.403400762239812, + "grad_norm": 0.19937195508768987, + "learning_rate": 0.0004614514553043014, + "loss": 3.08248233795166, + "step": 7512, + "token_acc": 0.2880395563276761 + }, + { + "epoch": 4.403987100557021, + "grad_norm": 0.19833060788006351, + "learning_rate": 0.00046143852771301136, + "loss": 3.064807653427124, + "step": 7513, + "token_acc": 0.2905400454312303 + }, + { + "epoch": 4.40457343887423, + "grad_norm": 0.2180654112977944, + "learning_rate": 0.0004614255981355413, + "loss": 3.053335666656494, + "step": 7514, + "token_acc": 0.29108790210148505 + }, + { + "epoch": 4.4051597771914395, + "grad_norm": 0.19577594974991983, + "learning_rate": 0.00046141266657201267, + "loss": 3.0100479125976562, + "step": 7515, + "token_acc": 0.2977492844995645 + }, + { + "epoch": 4.405746115508649, + "grad_norm": 0.17864755186611248, + "learning_rate": 0.0004613997330225469, + "loss": 2.9719505310058594, + "step": 7516, + "token_acc": 0.304961007482348 + }, + { + "epoch": 4.406332453825858, + "grad_norm": 0.2387633630615382, + "learning_rate": 0.00046138679748726545, + "loss": 3.031125545501709, + "step": 7517, + "token_acc": 0.29523237441073924 + }, + { + "epoch": 4.406918792143067, + "grad_norm": 0.22450527761595424, + "learning_rate": 0.00046137385996628997, + "loss": 3.058399200439453, + "step": 7518, + "token_acc": 0.2922976709762941 + }, + { + "epoch": 4.407505130460276, + "grad_norm": 0.2112571747983414, + "learning_rate": 0.00046136092045974186, + "loss": 3.049314260482788, + "step": 7519, + "token_acc": 0.2924011073802083 + }, + { + "epoch": 4.408091468777485, + "grad_norm": 0.2267099030405212, + "learning_rate": 0.0004613479789677427, + "loss": 3.032458782196045, + "step": 7520, + "token_acc": 0.29258290861922803 + }, + { + "epoch": 4.408677807094693, + "grad_norm": 0.27874556392461547, + "learning_rate": 0.0004613350354904141, + "loss": 3.0328762531280518, + "step": 7521, + "token_acc": 0.29732554985926024 + }, + { + "epoch": 4.409264145411902, + "grad_norm": 0.2134244192194194, + "learning_rate": 0.00046132209002787763, + "loss": 3.0926437377929688, + "step": 7522, + "token_acc": 0.28702195514087664 + }, + { + "epoch": 4.4098504837291115, + "grad_norm": 0.19880226126512426, + "learning_rate": 0.00046130914258025486, + "loss": 3.0537564754486084, + "step": 7523, + "token_acc": 0.2922183694028015 + }, + { + "epoch": 4.410436822046321, + "grad_norm": 0.2622580314408094, + "learning_rate": 0.00046129619314766744, + "loss": 3.05915904045105, + "step": 7524, + "token_acc": 0.2914480197384957 + }, + { + "epoch": 4.41102316036353, + "grad_norm": 0.2122384969223726, + "learning_rate": 0.00046128324173023706, + "loss": 3.0373120307922363, + "step": 7525, + "token_acc": 0.29413602391233556 + }, + { + "epoch": 4.411609498680739, + "grad_norm": 0.2127315936843975, + "learning_rate": 0.00046127028832808526, + "loss": 3.050812005996704, + "step": 7526, + "token_acc": 0.2913184541720087 + }, + { + "epoch": 4.412195836997948, + "grad_norm": 0.23332393311396762, + "learning_rate": 0.0004612573329413338, + "loss": 3.041382312774658, + "step": 7527, + "token_acc": 0.29259515607638475 + }, + { + "epoch": 4.412782175315157, + "grad_norm": 0.21529283590460704, + "learning_rate": 0.0004612443755701044, + "loss": 2.9850082397460938, + "step": 7528, + "token_acc": 0.30377380936644943 + }, + { + "epoch": 4.413368513632366, + "grad_norm": 0.2041303729429446, + "learning_rate": 0.00046123141621451873, + "loss": 3.0363340377807617, + "step": 7529, + "token_acc": 0.2955727775760036 + }, + { + "epoch": 4.413954851949575, + "grad_norm": 0.22530750428748536, + "learning_rate": 0.0004612184548746986, + "loss": 3.0665838718414307, + "step": 7530, + "token_acc": 0.2913468192706947 + }, + { + "epoch": 4.4145411902667835, + "grad_norm": 0.22001839042793542, + "learning_rate": 0.00046120549155076565, + "loss": 3.032942771911621, + "step": 7531, + "token_acc": 0.29565338036323663 + }, + { + "epoch": 4.415127528583993, + "grad_norm": 0.2583822756365117, + "learning_rate": 0.0004611925262428417, + "loss": 3.0183916091918945, + "step": 7532, + "token_acc": 0.2971070735112197 + }, + { + "epoch": 4.415713866901202, + "grad_norm": 0.21185490751462283, + "learning_rate": 0.00046117955895104857, + "loss": 3.003661870956421, + "step": 7533, + "token_acc": 0.2977303174954151 + }, + { + "epoch": 4.416300205218411, + "grad_norm": 0.2663105007930926, + "learning_rate": 0.00046116658967550807, + "loss": 3.045348882675171, + "step": 7534, + "token_acc": 0.29202843221372965 + }, + { + "epoch": 4.41688654353562, + "grad_norm": 0.26219437019213276, + "learning_rate": 0.00046115361841634196, + "loss": 3.0178146362304688, + "step": 7535, + "token_acc": 0.29475757219704823 + }, + { + "epoch": 4.417472881852829, + "grad_norm": 0.17467774884495788, + "learning_rate": 0.00046114064517367216, + "loss": 3.0526294708251953, + "step": 7536, + "token_acc": 0.29370285382363176 + }, + { + "epoch": 4.418059220170038, + "grad_norm": 0.3011361147810642, + "learning_rate": 0.0004611276699476206, + "loss": 3.0759549140930176, + "step": 7537, + "token_acc": 0.28947607608907355 + }, + { + "epoch": 4.418645558487247, + "grad_norm": 0.2462495028347038, + "learning_rate": 0.000461114692738309, + "loss": 3.059208869934082, + "step": 7538, + "token_acc": 0.29187650193292236 + }, + { + "epoch": 4.419231896804456, + "grad_norm": 0.20878714913971583, + "learning_rate": 0.0004611017135458593, + "loss": 3.0137548446655273, + "step": 7539, + "token_acc": 0.29972064094454987 + }, + { + "epoch": 4.4198182351216655, + "grad_norm": 0.2123034709715419, + "learning_rate": 0.0004610887323703935, + "loss": 3.0560672283172607, + "step": 7540, + "token_acc": 0.2914231008369039 + }, + { + "epoch": 4.420404573438875, + "grad_norm": 0.259187211035135, + "learning_rate": 0.0004610757492120335, + "loss": 3.0791683197021484, + "step": 7541, + "token_acc": 0.2891130396177751 + }, + { + "epoch": 4.420990911756084, + "grad_norm": 0.20754974764083525, + "learning_rate": 0.0004610627640709013, + "loss": 3.024458885192871, + "step": 7542, + "token_acc": 0.2980797769258972 + }, + { + "epoch": 4.421577250073292, + "grad_norm": 0.19050257960433375, + "learning_rate": 0.0004610497769471188, + "loss": 3.071842670440674, + "step": 7543, + "token_acc": 0.2898069893899546 + }, + { + "epoch": 4.422163588390501, + "grad_norm": 0.21999382050881797, + "learning_rate": 0.00046103678784080806, + "loss": 3.0235085487365723, + "step": 7544, + "token_acc": 0.29515990636592465 + }, + { + "epoch": 4.42274992670771, + "grad_norm": 0.19931819454019808, + "learning_rate": 0.00046102379675209103, + "loss": 3.0318026542663574, + "step": 7545, + "token_acc": 0.29505515501999846 + }, + { + "epoch": 4.423336265024919, + "grad_norm": 0.17324184639266896, + "learning_rate": 0.00046101080368108986, + "loss": 3.0414838790893555, + "step": 7546, + "token_acc": 0.2943136634426207 + }, + { + "epoch": 4.423922603342128, + "grad_norm": 0.19821724235811047, + "learning_rate": 0.0004609978086279265, + "loss": 3.085927963256836, + "step": 7547, + "token_acc": 0.287089475233894 + }, + { + "epoch": 4.4245089416593375, + "grad_norm": 0.18841893008344263, + "learning_rate": 0.0004609848115927231, + "loss": 3.0053563117980957, + "step": 7548, + "token_acc": 0.3006212083437214 + }, + { + "epoch": 4.425095279976547, + "grad_norm": 0.17069164278988555, + "learning_rate": 0.0004609718125756016, + "loss": 3.1011414527893066, + "step": 7549, + "token_acc": 0.28639622631360967 + }, + { + "epoch": 4.425681618293756, + "grad_norm": 0.21289496093769322, + "learning_rate": 0.0004609588115766843, + "loss": 3.02915096282959, + "step": 7550, + "token_acc": 0.2960657481418605 + }, + { + "epoch": 4.426267956610965, + "grad_norm": 0.17667380727781076, + "learning_rate": 0.0004609458085960931, + "loss": 3.04573917388916, + "step": 7551, + "token_acc": 0.2922632386194547 + }, + { + "epoch": 4.426854294928174, + "grad_norm": 0.16491452943010507, + "learning_rate": 0.00046093280363395045, + "loss": 3.087282180786133, + "step": 7552, + "token_acc": 0.2873702919574479 + }, + { + "epoch": 4.427440633245382, + "grad_norm": 0.1911721354188527, + "learning_rate": 0.00046091979669037834, + "loss": 3.044433355331421, + "step": 7553, + "token_acc": 0.29121318781494515 + }, + { + "epoch": 4.428026971562591, + "grad_norm": 0.1857855981643311, + "learning_rate": 0.00046090678776549885, + "loss": 3.0064706802368164, + "step": 7554, + "token_acc": 0.29731909704987375 + }, + { + "epoch": 4.4286133098798, + "grad_norm": 0.1893392276757729, + "learning_rate": 0.00046089377685943435, + "loss": 3.0068907737731934, + "step": 7555, + "token_acc": 0.29849096323408847 + }, + { + "epoch": 4.4291996481970095, + "grad_norm": 0.16947234966737473, + "learning_rate": 0.00046088076397230696, + "loss": 3.016566753387451, + "step": 7556, + "token_acc": 0.29645059805631696 + }, + { + "epoch": 4.429785986514219, + "grad_norm": 0.22668331160395627, + "learning_rate": 0.00046086774910423893, + "loss": 3.0473833084106445, + "step": 7557, + "token_acc": 0.2916787489125979 + }, + { + "epoch": 4.430372324831428, + "grad_norm": 0.24443664671461063, + "learning_rate": 0.0004608547322553526, + "loss": 3.0571484565734863, + "step": 7558, + "token_acc": 0.2917369952362904 + }, + { + "epoch": 4.430958663148637, + "grad_norm": 0.2040382538332167, + "learning_rate": 0.0004608417134257702, + "loss": 3.013051986694336, + "step": 7559, + "token_acc": 0.29631763241099374 + }, + { + "epoch": 4.431545001465846, + "grad_norm": 0.19142343128363662, + "learning_rate": 0.000460828692615614, + "loss": 3.048431873321533, + "step": 7560, + "token_acc": 0.2923738644678838 + }, + { + "epoch": 4.432131339783055, + "grad_norm": 0.19831279855285922, + "learning_rate": 0.00046081566982500633, + "loss": 3.054229974746704, + "step": 7561, + "token_acc": 0.2908986343048418 + }, + { + "epoch": 4.432717678100264, + "grad_norm": 0.17629069328705005, + "learning_rate": 0.0004608026450540695, + "loss": 3.0721988677978516, + "step": 7562, + "token_acc": 0.2896223219467107 + }, + { + "epoch": 4.433304016417473, + "grad_norm": 0.19204106224098036, + "learning_rate": 0.0004607896183029259, + "loss": 3.0516953468322754, + "step": 7563, + "token_acc": 0.2927323001736223 + }, + { + "epoch": 4.4338903547346815, + "grad_norm": 0.23524025998156667, + "learning_rate": 0.00046077658957169787, + "loss": 3.0560128688812256, + "step": 7564, + "token_acc": 0.29137966856815234 + }, + { + "epoch": 4.434476693051891, + "grad_norm": 0.17342587407626683, + "learning_rate": 0.0004607635588605078, + "loss": 3.047670364379883, + "step": 7565, + "token_acc": 0.29239575552412284 + }, + { + "epoch": 4.4350630313691, + "grad_norm": 0.17894744182875094, + "learning_rate": 0.00046075052616947816, + "loss": 3.0308995246887207, + "step": 7566, + "token_acc": 0.29421973313012323 + }, + { + "epoch": 4.435649369686309, + "grad_norm": 0.2087917073797313, + "learning_rate": 0.0004607374914987312, + "loss": 3.0656702518463135, + "step": 7567, + "token_acc": 0.28901734104046245 + }, + { + "epoch": 4.436235708003518, + "grad_norm": 0.20080034291228516, + "learning_rate": 0.0004607244548483896, + "loss": 2.9949564933776855, + "step": 7568, + "token_acc": 0.29910187576987646 + }, + { + "epoch": 4.436822046320727, + "grad_norm": 0.1813953204199183, + "learning_rate": 0.00046071141621857565, + "loss": 3.0483360290527344, + "step": 7569, + "token_acc": 0.2937331399368298 + }, + { + "epoch": 4.437408384637936, + "grad_norm": 0.18517493390155132, + "learning_rate": 0.00046069837560941185, + "loss": 3.0493578910827637, + "step": 7570, + "token_acc": 0.2917145141172764 + }, + { + "epoch": 4.437994722955145, + "grad_norm": 0.20185606585101415, + "learning_rate": 0.00046068533302102076, + "loss": 3.038222312927246, + "step": 7571, + "token_acc": 0.2930980591985165 + }, + { + "epoch": 4.438581061272354, + "grad_norm": 0.17781904835872908, + "learning_rate": 0.0004606722884535249, + "loss": 3.0672547817230225, + "step": 7572, + "token_acc": 0.29026904466936254 + }, + { + "epoch": 4.4391673995895635, + "grad_norm": 0.23736385628053638, + "learning_rate": 0.0004606592419070468, + "loss": 3.006150722503662, + "step": 7573, + "token_acc": 0.2981968739658768 + }, + { + "epoch": 4.439753737906772, + "grad_norm": 0.2719648388681537, + "learning_rate": 0.00046064619338170886, + "loss": 3.091278553009033, + "step": 7574, + "token_acc": 0.28724266951349076 + }, + { + "epoch": 4.440340076223981, + "grad_norm": 0.19574987469290808, + "learning_rate": 0.00046063314287763394, + "loss": 3.01570463180542, + "step": 7575, + "token_acc": 0.2976582567723025 + }, + { + "epoch": 4.44092641454119, + "grad_norm": 0.2531118030477968, + "learning_rate": 0.00046062009039494437, + "loss": 3.0444788932800293, + "step": 7576, + "token_acc": 0.29264501153916794 + }, + { + "epoch": 4.441512752858399, + "grad_norm": 0.37624361535698847, + "learning_rate": 0.00046060703593376283, + "loss": 3.0619711875915527, + "step": 7577, + "token_acc": 0.2892516944143812 + }, + { + "epoch": 4.442099091175608, + "grad_norm": 0.2257985745639285, + "learning_rate": 0.0004605939794942121, + "loss": 3.058312177658081, + "step": 7578, + "token_acc": 0.2932942178278081 + }, + { + "epoch": 4.442685429492817, + "grad_norm": 0.2547080460667835, + "learning_rate": 0.0004605809210764146, + "loss": 3.04949688911438, + "step": 7579, + "token_acc": 0.29187478851116916 + }, + { + "epoch": 4.443271767810026, + "grad_norm": 0.2788817409800309, + "learning_rate": 0.0004605678606804932, + "loss": 3.0276925563812256, + "step": 7580, + "token_acc": 0.29549438790610844 + }, + { + "epoch": 4.4438581061272355, + "grad_norm": 0.2087876851287658, + "learning_rate": 0.00046055479830657043, + "loss": 3.020184278488159, + "step": 7581, + "token_acc": 0.2978189852995701 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.2214421005231179, + "learning_rate": 0.00046054173395476905, + "loss": 3.056488275527954, + "step": 7582, + "token_acc": 0.2918954649788942 + }, + { + "epoch": 4.445030782761654, + "grad_norm": 0.22878164383851737, + "learning_rate": 0.0004605286676252118, + "loss": 3.03840970993042, + "step": 7583, + "token_acc": 0.29531565497435125 + }, + { + "epoch": 4.445617121078863, + "grad_norm": 0.19363909319283915, + "learning_rate": 0.00046051559931802147, + "loss": 3.032454490661621, + "step": 7584, + "token_acc": 0.2954528743842106 + }, + { + "epoch": 4.446203459396072, + "grad_norm": 0.21859671217780147, + "learning_rate": 0.00046050252903332073, + "loss": 3.0649924278259277, + "step": 7585, + "token_acc": 0.2914438645078824 + }, + { + "epoch": 4.44678979771328, + "grad_norm": 0.19093612809457278, + "learning_rate": 0.00046048945677123234, + "loss": 3.0845353603363037, + "step": 7586, + "token_acc": 0.2882938099412945 + }, + { + "epoch": 4.447376136030489, + "grad_norm": 0.24384448242587697, + "learning_rate": 0.0004604763825318792, + "loss": 3.0703938007354736, + "step": 7587, + "token_acc": 0.28956492472727663 + }, + { + "epoch": 4.447962474347698, + "grad_norm": 0.17252300731934725, + "learning_rate": 0.000460463306315384, + "loss": 3.0647661685943604, + "step": 7588, + "token_acc": 0.29012570355483114 + }, + { + "epoch": 4.4485488126649075, + "grad_norm": 0.244178454899283, + "learning_rate": 0.0004604502281218697, + "loss": 3.079650640487671, + "step": 7589, + "token_acc": 0.28702297505294944 + }, + { + "epoch": 4.449135150982117, + "grad_norm": 0.1721861728363252, + "learning_rate": 0.0004604371479514591, + "loss": 3.0347256660461426, + "step": 7590, + "token_acc": 0.2953310000744657 + }, + { + "epoch": 4.449721489299326, + "grad_norm": 0.2321810275607078, + "learning_rate": 0.000460424065804275, + "loss": 3.104008913040161, + "step": 7591, + "token_acc": 0.2873571216865762 + }, + { + "epoch": 4.450307827616535, + "grad_norm": 0.17731771741092256, + "learning_rate": 0.00046041098168044035, + "loss": 3.0531649589538574, + "step": 7592, + "token_acc": 0.2932982663621811 + }, + { + "epoch": 4.450894165933744, + "grad_norm": 0.19180301755694798, + "learning_rate": 0.00046039789558007817, + "loss": 3.036776065826416, + "step": 7593, + "token_acc": 0.2963379539836992 + }, + { + "epoch": 4.451480504250953, + "grad_norm": 0.21070659281966705, + "learning_rate": 0.0004603848075033111, + "loss": 3.022726535797119, + "step": 7594, + "token_acc": 0.2981204160210995 + }, + { + "epoch": 4.452066842568162, + "grad_norm": 0.19618898554572026, + "learning_rate": 0.0004603717174502624, + "loss": 3.0223283767700195, + "step": 7595, + "token_acc": 0.29647060019363547 + }, + { + "epoch": 4.45265318088537, + "grad_norm": 0.19443355004759513, + "learning_rate": 0.0004603586254210549, + "loss": 3.019186496734619, + "step": 7596, + "token_acc": 0.29905173689663195 + }, + { + "epoch": 4.4532395192025795, + "grad_norm": 0.19261926878416546, + "learning_rate": 0.0004603455314158115, + "loss": 3.0328915119171143, + "step": 7597, + "token_acc": 0.295583789948875 + }, + { + "epoch": 4.453825857519789, + "grad_norm": 0.20681565845253028, + "learning_rate": 0.00046033243543465533, + "loss": 3.036186695098877, + "step": 7598, + "token_acc": 0.2954959399948561 + }, + { + "epoch": 4.454412195836998, + "grad_norm": 0.18015500147747537, + "learning_rate": 0.00046031933747770935, + "loss": 3.045621395111084, + "step": 7599, + "token_acc": 0.29264101024364136 + }, + { + "epoch": 4.454998534154207, + "grad_norm": 0.18553987528731267, + "learning_rate": 0.00046030623754509656, + "loss": 3.0554604530334473, + "step": 7600, + "token_acc": 0.2933688291657423 + }, + { + "epoch": 4.455584872471416, + "grad_norm": 0.1746189110956432, + "learning_rate": 0.00046029313563694007, + "loss": 3.0395989418029785, + "step": 7601, + "token_acc": 0.2941514447822652 + }, + { + "epoch": 4.456171210788625, + "grad_norm": 0.19828341803691518, + "learning_rate": 0.000460280031753363, + "loss": 3.022763729095459, + "step": 7602, + "token_acc": 0.29717530186078195 + }, + { + "epoch": 4.456757549105834, + "grad_norm": 0.19313079957981577, + "learning_rate": 0.00046026692589448833, + "loss": 3.0611565113067627, + "step": 7603, + "token_acc": 0.2916771869798886 + }, + { + "epoch": 4.457343887423043, + "grad_norm": 0.1950737882058072, + "learning_rate": 0.0004602538180604392, + "loss": 3.0042316913604736, + "step": 7604, + "token_acc": 0.29964273366370864 + }, + { + "epoch": 4.457930225740252, + "grad_norm": 0.1744623100650474, + "learning_rate": 0.00046024070825133887, + "loss": 3.0035977363586426, + "step": 7605, + "token_acc": 0.3001417397781824 + }, + { + "epoch": 4.4585165640574616, + "grad_norm": 0.17689431113635637, + "learning_rate": 0.0004602275964673103, + "loss": 3.072335958480835, + "step": 7606, + "token_acc": 0.2890095545301292 + }, + { + "epoch": 4.45910290237467, + "grad_norm": 0.17619544005229673, + "learning_rate": 0.0004602144827084768, + "loss": 3.0310516357421875, + "step": 7607, + "token_acc": 0.29582374494807856 + }, + { + "epoch": 4.459689240691879, + "grad_norm": 0.17805173348495315, + "learning_rate": 0.0004602013669749615, + "loss": 3.0765323638916016, + "step": 7608, + "token_acc": 0.28899109712472093 + }, + { + "epoch": 4.460275579009088, + "grad_norm": 0.20360165757750823, + "learning_rate": 0.00046018824926688756, + "loss": 3.0297372341156006, + "step": 7609, + "token_acc": 0.2954833952226699 + }, + { + "epoch": 4.460861917326297, + "grad_norm": 0.24057609116017406, + "learning_rate": 0.0004601751295843783, + "loss": 3.0555989742279053, + "step": 7610, + "token_acc": 0.29239412142885846 + }, + { + "epoch": 4.461448255643506, + "grad_norm": 0.20937971061391722, + "learning_rate": 0.0004601620079275569, + "loss": 3.031153678894043, + "step": 7611, + "token_acc": 0.2959155453916928 + }, + { + "epoch": 4.462034593960715, + "grad_norm": 0.17822492522673333, + "learning_rate": 0.0004601488842965466, + "loss": 3.020857810974121, + "step": 7612, + "token_acc": 0.2961217814223674 + }, + { + "epoch": 4.4626209322779244, + "grad_norm": 0.1709022609023486, + "learning_rate": 0.00046013575869147073, + "loss": 3.0303144454956055, + "step": 7613, + "token_acc": 0.29555015733099116 + }, + { + "epoch": 4.463207270595134, + "grad_norm": 0.17338764594468506, + "learning_rate": 0.00046012263111245254, + "loss": 2.973038911819458, + "step": 7614, + "token_acc": 0.30378191650688063 + }, + { + "epoch": 4.463793608912343, + "grad_norm": 0.18911517414390897, + "learning_rate": 0.00046010950155961545, + "loss": 3.0565481185913086, + "step": 7615, + "token_acc": 0.2931893580325444 + }, + { + "epoch": 4.464379947229552, + "grad_norm": 0.18840251798153082, + "learning_rate": 0.0004600963700330827, + "loss": 3.0413818359375, + "step": 7616, + "token_acc": 0.2935745746161677 + }, + { + "epoch": 4.464966285546761, + "grad_norm": 0.20457301093121866, + "learning_rate": 0.00046008323653297763, + "loss": 3.0494651794433594, + "step": 7617, + "token_acc": 0.29291603297846797 + }, + { + "epoch": 4.465552623863969, + "grad_norm": 0.18857032788092065, + "learning_rate": 0.00046007010105942367, + "loss": 3.018894672393799, + "step": 7618, + "token_acc": 0.29704119474276125 + }, + { + "epoch": 4.466138962181178, + "grad_norm": 0.24119875757845458, + "learning_rate": 0.0004600569636125441, + "loss": 3.0258312225341797, + "step": 7619, + "token_acc": 0.29576290953692613 + }, + { + "epoch": 4.466725300498387, + "grad_norm": 0.29686425326240484, + "learning_rate": 0.0004600438241924625, + "loss": 3.0597023963928223, + "step": 7620, + "token_acc": 0.2905547415686594 + }, + { + "epoch": 4.4673116388155965, + "grad_norm": 0.19145685364262147, + "learning_rate": 0.00046003068279930223, + "loss": 3.053844928741455, + "step": 7621, + "token_acc": 0.2924921915364823 + }, + { + "epoch": 4.467897977132806, + "grad_norm": 0.22489455372119785, + "learning_rate": 0.0004600175394331867, + "loss": 3.0231003761291504, + "step": 7622, + "token_acc": 0.29773761767764767 + }, + { + "epoch": 4.468484315450015, + "grad_norm": 0.2109328049697858, + "learning_rate": 0.0004600043940942393, + "loss": 3.0442333221435547, + "step": 7623, + "token_acc": 0.2921478089553154 + }, + { + "epoch": 4.469070653767224, + "grad_norm": 0.18363313063692918, + "learning_rate": 0.0004599912467825837, + "loss": 3.0066678524017334, + "step": 7624, + "token_acc": 0.29904314304504587 + }, + { + "epoch": 4.469656992084433, + "grad_norm": 0.21398662392085024, + "learning_rate": 0.0004599780974983432, + "loss": 3.013634204864502, + "step": 7625, + "token_acc": 0.2985257825183919 + }, + { + "epoch": 4.470243330401642, + "grad_norm": 0.20295223182419272, + "learning_rate": 0.0004599649462416415, + "loss": 3.0667214393615723, + "step": 7626, + "token_acc": 0.2906267159024964 + }, + { + "epoch": 4.470829668718851, + "grad_norm": 0.17860655984519236, + "learning_rate": 0.0004599517930126021, + "loss": 3.0143256187438965, + "step": 7627, + "token_acc": 0.2973822979971151 + }, + { + "epoch": 4.47141600703606, + "grad_norm": 0.2351790644713847, + "learning_rate": 0.00045993863781134845, + "loss": 3.0475316047668457, + "step": 7628, + "token_acc": 0.29309907073579444 + }, + { + "epoch": 4.4720023453532685, + "grad_norm": 0.21542947540069318, + "learning_rate": 0.0004599254806380042, + "loss": 3.081923484802246, + "step": 7629, + "token_acc": 0.2872054807792824 + }, + { + "epoch": 4.472588683670478, + "grad_norm": 0.17678907403227306, + "learning_rate": 0.000459912321492693, + "loss": 3.032973051071167, + "step": 7630, + "token_acc": 0.29336243618718255 + }, + { + "epoch": 4.473175021987687, + "grad_norm": 0.21351148519034216, + "learning_rate": 0.00045989916037553837, + "loss": 3.0537538528442383, + "step": 7631, + "token_acc": 0.29228213860393204 + }, + { + "epoch": 4.473761360304896, + "grad_norm": 0.19759561204369963, + "learning_rate": 0.00045988599728666394, + "loss": 2.9968748092651367, + "step": 7632, + "token_acc": 0.3006177058348124 + }, + { + "epoch": 4.474347698622105, + "grad_norm": 0.21578599620471187, + "learning_rate": 0.00045987283222619335, + "loss": 3.0576815605163574, + "step": 7633, + "token_acc": 0.28963389295212766 + }, + { + "epoch": 4.474934036939314, + "grad_norm": 0.20917375035422553, + "learning_rate": 0.0004598596651942504, + "loss": 3.026186943054199, + "step": 7634, + "token_acc": 0.29704727670506686 + }, + { + "epoch": 4.475520375256523, + "grad_norm": 0.21339744656140158, + "learning_rate": 0.00045984649619095863, + "loss": 3.0379202365875244, + "step": 7635, + "token_acc": 0.29579791890030094 + }, + { + "epoch": 4.476106713573732, + "grad_norm": 0.19259239672021897, + "learning_rate": 0.0004598333252164418, + "loss": 2.9974935054779053, + "step": 7636, + "token_acc": 0.29921285667431946 + }, + { + "epoch": 4.476693051890941, + "grad_norm": 0.20314865285111752, + "learning_rate": 0.00045982015227082366, + "loss": 3.027759075164795, + "step": 7637, + "token_acc": 0.29749978911420294 + }, + { + "epoch": 4.4772793902081505, + "grad_norm": 0.22442506785654343, + "learning_rate": 0.0004598069773542279, + "loss": 3.005478858947754, + "step": 7638, + "token_acc": 0.2991695758513285 + }, + { + "epoch": 4.477865728525359, + "grad_norm": 0.1886233409721311, + "learning_rate": 0.0004597938004667783, + "loss": 3.0778331756591797, + "step": 7639, + "token_acc": 0.2886811188699488 + }, + { + "epoch": 4.478452066842568, + "grad_norm": 0.1988770101538601, + "learning_rate": 0.00045978062160859876, + "loss": 3.0884857177734375, + "step": 7640, + "token_acc": 0.2876125080696363 + }, + { + "epoch": 4.479038405159777, + "grad_norm": 0.20425523007572852, + "learning_rate": 0.0004597674407798128, + "loss": 3.0628671646118164, + "step": 7641, + "token_acc": 0.2903683498210738 + }, + { + "epoch": 4.479624743476986, + "grad_norm": 0.22862713093909018, + "learning_rate": 0.00045975425798054447, + "loss": 3.035349130630493, + "step": 7642, + "token_acc": 0.2934626446250491 + }, + { + "epoch": 4.480211081794195, + "grad_norm": 0.22363805270717516, + "learning_rate": 0.0004597410732109176, + "loss": 3.0828819274902344, + "step": 7643, + "token_acc": 0.28832578394525366 + }, + { + "epoch": 4.480797420111404, + "grad_norm": 0.24389988757603698, + "learning_rate": 0.0004597278864710559, + "loss": 3.0328969955444336, + "step": 7644, + "token_acc": 0.29425555276781834 + }, + { + "epoch": 4.481383758428613, + "grad_norm": 0.27192738494728863, + "learning_rate": 0.0004597146977610833, + "loss": 3.0757884979248047, + "step": 7645, + "token_acc": 0.2891024920968596 + }, + { + "epoch": 4.4819700967458225, + "grad_norm": 0.20843095547578383, + "learning_rate": 0.00045970150708112375, + "loss": 3.072634696960449, + "step": 7646, + "token_acc": 0.28930052998720013 + }, + { + "epoch": 4.482556435063032, + "grad_norm": 0.1985178151505475, + "learning_rate": 0.0004596883144313011, + "loss": 3.085329532623291, + "step": 7647, + "token_acc": 0.28829426567089306 + }, + { + "epoch": 4.483142773380241, + "grad_norm": 0.2955113249349143, + "learning_rate": 0.00045967511981173924, + "loss": 2.996570587158203, + "step": 7648, + "token_acc": 0.29940599140419527 + }, + { + "epoch": 4.48372911169745, + "grad_norm": 0.2081176403428122, + "learning_rate": 0.0004596619232225623, + "loss": 3.046630382537842, + "step": 7649, + "token_acc": 0.29375353230885165 + }, + { + "epoch": 4.484315450014659, + "grad_norm": 0.23506372420972854, + "learning_rate": 0.00045964872466389405, + "loss": 3.0310287475585938, + "step": 7650, + "token_acc": 0.2953277813254337 + }, + { + "epoch": 4.484901788331867, + "grad_norm": 0.23271746353133058, + "learning_rate": 0.00045963552413585853, + "loss": 3.0389418601989746, + "step": 7651, + "token_acc": 0.2937115522939102 + }, + { + "epoch": 4.485488126649076, + "grad_norm": 0.1761589432160985, + "learning_rate": 0.00045962232163857973, + "loss": 3.0496087074279785, + "step": 7652, + "token_acc": 0.2926761828176567 + }, + { + "epoch": 4.486074464966285, + "grad_norm": 0.18759810523165646, + "learning_rate": 0.00045960911717218166, + "loss": 3.0675244331359863, + "step": 7653, + "token_acc": 0.29119780629387215 + }, + { + "epoch": 4.4866608032834945, + "grad_norm": 0.21101509564276377, + "learning_rate": 0.0004595959107367885, + "loss": 3.019044876098633, + "step": 7654, + "token_acc": 0.29737900832333036 + }, + { + "epoch": 4.487247141600704, + "grad_norm": 0.26529732338338785, + "learning_rate": 0.0004595827023325241, + "loss": 3.0790176391601562, + "step": 7655, + "token_acc": 0.2889874257518115 + }, + { + "epoch": 4.487833479917913, + "grad_norm": 0.19246519183388464, + "learning_rate": 0.00045956949195951263, + "loss": 3.0009732246398926, + "step": 7656, + "token_acc": 0.3004856939593151 + }, + { + "epoch": 4.488419818235122, + "grad_norm": 0.2051223163386124, + "learning_rate": 0.0004595562796178783, + "loss": 3.0152268409729004, + "step": 7657, + "token_acc": 0.2972520234822931 + }, + { + "epoch": 4.489006156552331, + "grad_norm": 0.17723538594703814, + "learning_rate": 0.0004595430653077449, + "loss": 3.059682607650757, + "step": 7658, + "token_acc": 0.29077984374753724 + }, + { + "epoch": 4.48959249486954, + "grad_norm": 0.20399896300376053, + "learning_rate": 0.0004595298490292369, + "loss": 3.054368257522583, + "step": 7659, + "token_acc": 0.29176005273566247 + }, + { + "epoch": 4.490178833186749, + "grad_norm": 0.19659008619947899, + "learning_rate": 0.0004595166307824783, + "loss": 3.0809693336486816, + "step": 7660, + "token_acc": 0.28822145170191277 + }, + { + "epoch": 4.490765171503957, + "grad_norm": 0.18918408917871415, + "learning_rate": 0.00045950341056759326, + "loss": 3.082911252975464, + "step": 7661, + "token_acc": 0.2868065248640653 + }, + { + "epoch": 4.4913515098211665, + "grad_norm": 0.2054273124442771, + "learning_rate": 0.000459490188384706, + "loss": 3.073209047317505, + "step": 7662, + "token_acc": 0.28910329683153035 + }, + { + "epoch": 4.491937848138376, + "grad_norm": 0.17997147275755476, + "learning_rate": 0.0004594769642339407, + "loss": 3.051784038543701, + "step": 7663, + "token_acc": 0.29516069842519266 + }, + { + "epoch": 4.492524186455585, + "grad_norm": 0.16282336307018755, + "learning_rate": 0.00045946373811542166, + "loss": 3.0419459342956543, + "step": 7664, + "token_acc": 0.2941067324743347 + }, + { + "epoch": 4.493110524772794, + "grad_norm": 0.18096161143455447, + "learning_rate": 0.00045945051002927297, + "loss": 2.9990177154541016, + "step": 7665, + "token_acc": 0.30059398647107166 + }, + { + "epoch": 4.493696863090003, + "grad_norm": 0.17682385862585487, + "learning_rate": 0.00045943727997561895, + "loss": 3.0519909858703613, + "step": 7666, + "token_acc": 0.29240413075339133 + }, + { + "epoch": 4.494283201407212, + "grad_norm": 0.2125860237547938, + "learning_rate": 0.000459424047954584, + "loss": 3.041882038116455, + "step": 7667, + "token_acc": 0.2941605513854162 + }, + { + "epoch": 4.494869539724421, + "grad_norm": 0.1681214479200011, + "learning_rate": 0.00045941081396629226, + "loss": 3.003459930419922, + "step": 7668, + "token_acc": 0.299738332806165 + }, + { + "epoch": 4.49545587804163, + "grad_norm": 0.19237578631808083, + "learning_rate": 0.0004593975780108681, + "loss": 2.994502067565918, + "step": 7669, + "token_acc": 0.3002833843887756 + }, + { + "epoch": 4.496042216358839, + "grad_norm": 0.18309851029699306, + "learning_rate": 0.0004593843400884359, + "loss": 3.04885196685791, + "step": 7670, + "token_acc": 0.29291126664686695 + }, + { + "epoch": 4.4966285546760485, + "grad_norm": 0.1779808166741814, + "learning_rate": 0.00045937110019912, + "loss": 3.028949022293091, + "step": 7671, + "token_acc": 0.29651900170618845 + }, + { + "epoch": 4.497214892993257, + "grad_norm": 0.1745979920444208, + "learning_rate": 0.00045935785834304467, + "loss": 2.9968109130859375, + "step": 7672, + "token_acc": 0.301610053926734 + }, + { + "epoch": 4.497801231310466, + "grad_norm": 0.16727829491908888, + "learning_rate": 0.0004593446145203344, + "loss": 3.0861566066741943, + "step": 7673, + "token_acc": 0.2858729332825957 + }, + { + "epoch": 4.498387569627675, + "grad_norm": 0.1789467100488619, + "learning_rate": 0.00045933136873111356, + "loss": 3.048555850982666, + "step": 7674, + "token_acc": 0.29419937022184056 + }, + { + "epoch": 4.498973907944884, + "grad_norm": 0.22849473559913733, + "learning_rate": 0.00045931812097550664, + "loss": 3.0504724979400635, + "step": 7675, + "token_acc": 0.2921356245931341 + }, + { + "epoch": 4.499560246262093, + "grad_norm": 0.2241713022227615, + "learning_rate": 0.00045930487125363803, + "loss": 3.0393242835998535, + "step": 7676, + "token_acc": 0.29277535797188387 + }, + { + "epoch": 4.500146584579302, + "grad_norm": 0.18230175907619314, + "learning_rate": 0.00045929161956563216, + "loss": 2.9953837394714355, + "step": 7677, + "token_acc": 0.2995537676034234 + }, + { + "epoch": 4.500732922896511, + "grad_norm": 0.1820095338597325, + "learning_rate": 0.00045927836591161354, + "loss": 3.0582523345947266, + "step": 7678, + "token_acc": 0.29255348438405443 + }, + { + "epoch": 4.5013192612137205, + "grad_norm": 0.2512032333928572, + "learning_rate": 0.0004592651102917067, + "loss": 3.0348997116088867, + "step": 7679, + "token_acc": 0.29461697830886585 + }, + { + "epoch": 4.50190559953093, + "grad_norm": 0.2868468636997157, + "learning_rate": 0.00045925185270603614, + "loss": 3.0552401542663574, + "step": 7680, + "token_acc": 0.29216013366786 + }, + { + "epoch": 4.502491937848139, + "grad_norm": 0.17669534218764027, + "learning_rate": 0.0004592385931547264, + "loss": 3.0304698944091797, + "step": 7681, + "token_acc": 0.2956780923994039 + }, + { + "epoch": 4.503078276165347, + "grad_norm": 0.23152504955669662, + "learning_rate": 0.00045922533163790204, + "loss": 3.063931703567505, + "step": 7682, + "token_acc": 0.29083877535492303 + }, + { + "epoch": 4.503664614482556, + "grad_norm": 0.22386396929177713, + "learning_rate": 0.0004592120681556876, + "loss": 2.9869446754455566, + "step": 7683, + "token_acc": 0.30246536620488274 + }, + { + "epoch": 4.504250952799765, + "grad_norm": 0.20879659004422466, + "learning_rate": 0.0004591988027082077, + "loss": 2.991276264190674, + "step": 7684, + "token_acc": 0.3004150650944819 + }, + { + "epoch": 4.504837291116974, + "grad_norm": 0.27811433477695924, + "learning_rate": 0.0004591855352955869, + "loss": 3.015376567840576, + "step": 7685, + "token_acc": 0.2976625774027173 + }, + { + "epoch": 4.505423629434183, + "grad_norm": 0.1707204533158236, + "learning_rate": 0.0004591722659179499, + "loss": 3.071946144104004, + "step": 7686, + "token_acc": 0.2880634835956255 + }, + { + "epoch": 4.5060099677513925, + "grad_norm": 0.23221264426288762, + "learning_rate": 0.0004591589945754214, + "loss": 3.0003676414489746, + "step": 7687, + "token_acc": 0.3000526173878216 + }, + { + "epoch": 4.506596306068602, + "grad_norm": 0.17165889883176683, + "learning_rate": 0.00045914572126812595, + "loss": 3.0096435546875, + "step": 7688, + "token_acc": 0.29698042399158936 + }, + { + "epoch": 4.507182644385811, + "grad_norm": 0.2290507121877382, + "learning_rate": 0.00045913244599618823, + "loss": 3.0237956047058105, + "step": 7689, + "token_acc": 0.2956233299548512 + }, + { + "epoch": 4.50776898270302, + "grad_norm": 0.21658924712637703, + "learning_rate": 0.00045911916875973304, + "loss": 3.04410982131958, + "step": 7690, + "token_acc": 0.2937747922048444 + }, + { + "epoch": 4.508355321020229, + "grad_norm": 0.18584523983268905, + "learning_rate": 0.00045910588955888507, + "loss": 3.0015764236450195, + "step": 7691, + "token_acc": 0.2991690641122487 + }, + { + "epoch": 4.508941659337438, + "grad_norm": 0.25221599568930114, + "learning_rate": 0.000459092608393769, + "loss": 3.036126136779785, + "step": 7692, + "token_acc": 0.2951906540344164 + }, + { + "epoch": 4.509527997654647, + "grad_norm": 0.1751188515357791, + "learning_rate": 0.00045907932526450965, + "loss": 3.034043312072754, + "step": 7693, + "token_acc": 0.2961817361373463 + }, + { + "epoch": 4.510114335971855, + "grad_norm": 0.21835964645741596, + "learning_rate": 0.0004590660401712318, + "loss": 3.080922842025757, + "step": 7694, + "token_acc": 0.28965449242872415 + }, + { + "epoch": 4.5107006742890645, + "grad_norm": 0.19167026498992387, + "learning_rate": 0.00045905275311406015, + "loss": 3.056273937225342, + "step": 7695, + "token_acc": 0.2919000748874675 + }, + { + "epoch": 4.511287012606274, + "grad_norm": 0.19030844965852323, + "learning_rate": 0.0004590394640931196, + "loss": 3.0271034240722656, + "step": 7696, + "token_acc": 0.2959358540995205 + }, + { + "epoch": 4.511873350923483, + "grad_norm": 0.1797431704819906, + "learning_rate": 0.000459026173108535, + "loss": 2.9959287643432617, + "step": 7697, + "token_acc": 0.29931240358645234 + }, + { + "epoch": 4.512459689240692, + "grad_norm": 0.22678782099337802, + "learning_rate": 0.00045901288016043115, + "loss": 3.0537538528442383, + "step": 7698, + "token_acc": 0.29174404380043584 + }, + { + "epoch": 4.513046027557901, + "grad_norm": 0.19356081852668974, + "learning_rate": 0.00045899958524893295, + "loss": 3.025479316711426, + "step": 7699, + "token_acc": 0.2968606140137365 + }, + { + "epoch": 4.51363236587511, + "grad_norm": 0.19817106163271916, + "learning_rate": 0.0004589862883741653, + "loss": 3.041426658630371, + "step": 7700, + "token_acc": 0.2939535364765808 + }, + { + "epoch": 4.514218704192319, + "grad_norm": 0.18803742458370246, + "learning_rate": 0.00045897298953625305, + "loss": 3.0845577716827393, + "step": 7701, + "token_acc": 0.2865216267129613 + }, + { + "epoch": 4.514805042509528, + "grad_norm": 0.2127429259119938, + "learning_rate": 0.0004589596887353211, + "loss": 3.0653514862060547, + "step": 7702, + "token_acc": 0.2905331854075573 + }, + { + "epoch": 4.515391380826737, + "grad_norm": 0.2159512173558895, + "learning_rate": 0.00045894638597149457, + "loss": 3.0213801860809326, + "step": 7703, + "token_acc": 0.2979369255996644 + }, + { + "epoch": 4.515977719143946, + "grad_norm": 0.18218567402970115, + "learning_rate": 0.0004589330812448983, + "loss": 3.0130577087402344, + "step": 7704, + "token_acc": 0.29880119243093306 + }, + { + "epoch": 4.516564057461155, + "grad_norm": 0.20899489687218348, + "learning_rate": 0.0004589197745556572, + "loss": 3.055253744125366, + "step": 7705, + "token_acc": 0.2919411980290752 + }, + { + "epoch": 4.517150395778364, + "grad_norm": 0.23275977744429535, + "learning_rate": 0.0004589064659038964, + "loss": 3.0597341060638428, + "step": 7706, + "token_acc": 0.2922139739581437 + }, + { + "epoch": 4.517736734095573, + "grad_norm": 0.20660279838715576, + "learning_rate": 0.00045889315528974074, + "loss": 3.053131580352783, + "step": 7707, + "token_acc": 0.2899625260019685 + }, + { + "epoch": 4.518323072412782, + "grad_norm": 0.17891155482051155, + "learning_rate": 0.0004588798427133155, + "loss": 3.025653839111328, + "step": 7708, + "token_acc": 0.2966734433190004 + }, + { + "epoch": 4.518909410729991, + "grad_norm": 0.2163608318598857, + "learning_rate": 0.0004588665281747456, + "loss": 3.0432658195495605, + "step": 7709, + "token_acc": 0.29366915647167285 + }, + { + "epoch": 4.5194957490472, + "grad_norm": 0.198861684944232, + "learning_rate": 0.0004588532116741561, + "loss": 3.0471251010894775, + "step": 7710, + "token_acc": 0.29204545163135043 + }, + { + "epoch": 4.520082087364409, + "grad_norm": 0.16259892347042804, + "learning_rate": 0.00045883989321167205, + "loss": 3.0011978149414062, + "step": 7711, + "token_acc": 0.29965453095934097 + }, + { + "epoch": 4.5206684256816185, + "grad_norm": 0.18153699015824948, + "learning_rate": 0.00045882657278741864, + "loss": 3.054030418395996, + "step": 7712, + "token_acc": 0.29132056410465734 + }, + { + "epoch": 4.521254763998828, + "grad_norm": 0.19386590429606068, + "learning_rate": 0.000458813250401521, + "loss": 3.0295534133911133, + "step": 7713, + "token_acc": 0.29564199760856846 + }, + { + "epoch": 4.521841102316037, + "grad_norm": 0.1806088290152924, + "learning_rate": 0.00045879992605410425, + "loss": 3.012301445007324, + "step": 7714, + "token_acc": 0.29913974961042267 + }, + { + "epoch": 4.522427440633246, + "grad_norm": 0.21495100188026894, + "learning_rate": 0.00045878659974529356, + "loss": 3.0546364784240723, + "step": 7715, + "token_acc": 0.2927611468598133 + }, + { + "epoch": 4.523013778950454, + "grad_norm": 0.4024803673868194, + "learning_rate": 0.0004587732714752141, + "loss": 3.058104991912842, + "step": 7716, + "token_acc": 0.292005202436931 + }, + { + "epoch": 4.523600117267663, + "grad_norm": 0.4272667956721717, + "learning_rate": 0.0004587599412439911, + "loss": 3.0296711921691895, + "step": 7717, + "token_acc": 0.2962941331074964 + }, + { + "epoch": 4.524186455584872, + "grad_norm": 0.1810501761750138, + "learning_rate": 0.00045874660905174974, + "loss": 3.0296177864074707, + "step": 7718, + "token_acc": 0.2949934106989777 + }, + { + "epoch": 4.524772793902081, + "grad_norm": 0.22838991700802103, + "learning_rate": 0.0004587332748986153, + "loss": 3.0276174545288086, + "step": 7719, + "token_acc": 0.29666679621209174 + }, + { + "epoch": 4.5253591322192905, + "grad_norm": 0.19426745031098175, + "learning_rate": 0.00045871993878471296, + "loss": 3.0476722717285156, + "step": 7720, + "token_acc": 0.29296903460837886 + }, + { + "epoch": 4.5259454705365, + "grad_norm": 0.19812493437044693, + "learning_rate": 0.0004587066007101681, + "loss": 3.054412841796875, + "step": 7721, + "token_acc": 0.29300589286190437 + }, + { + "epoch": 4.526531808853709, + "grad_norm": 0.18855234007528757, + "learning_rate": 0.000458693260675106, + "loss": 3.047390937805176, + "step": 7722, + "token_acc": 0.29380012663571126 + }, + { + "epoch": 4.527118147170918, + "grad_norm": 0.2101971386185216, + "learning_rate": 0.00045867991867965186, + "loss": 3.0842652320861816, + "step": 7723, + "token_acc": 0.2884389018337628 + }, + { + "epoch": 4.527704485488127, + "grad_norm": 0.20930430793115856, + "learning_rate": 0.00045866657472393103, + "loss": 3.0205554962158203, + "step": 7724, + "token_acc": 0.29626444837401883 + }, + { + "epoch": 4.528290823805335, + "grad_norm": 0.23076508329338388, + "learning_rate": 0.000458653228808069, + "loss": 3.0232300758361816, + "step": 7725, + "token_acc": 0.2967079710010076 + }, + { + "epoch": 4.528877162122544, + "grad_norm": 0.20141783369370414, + "learning_rate": 0.00045863988093219107, + "loss": 3.0242323875427246, + "step": 7726, + "token_acc": 0.2937037914068461 + }, + { + "epoch": 4.529463500439753, + "grad_norm": 0.19090497395877684, + "learning_rate": 0.00045862653109642255, + "loss": 3.0027780532836914, + "step": 7727, + "token_acc": 0.2989830052092503 + }, + { + "epoch": 4.5300498387569625, + "grad_norm": 0.19197377586514258, + "learning_rate": 0.000458613179300889, + "loss": 3.063613176345825, + "step": 7728, + "token_acc": 0.29165483082930055 + }, + { + "epoch": 4.530636177074172, + "grad_norm": 0.19180830330935006, + "learning_rate": 0.0004585998255457156, + "loss": 3.080824851989746, + "step": 7729, + "token_acc": 0.2884022200524711 + }, + { + "epoch": 4.531222515391381, + "grad_norm": 0.17019802054699226, + "learning_rate": 0.00045858646983102795, + "loss": 3.023195743560791, + "step": 7730, + "token_acc": 0.29698061047212043 + }, + { + "epoch": 4.53180885370859, + "grad_norm": 0.17747700102354017, + "learning_rate": 0.0004585731121569515, + "loss": 3.0080957412719727, + "step": 7731, + "token_acc": 0.2987777416205662 + }, + { + "epoch": 4.532395192025799, + "grad_norm": 0.1908854400285193, + "learning_rate": 0.0004585597525236118, + "loss": 3.0752322673797607, + "step": 7732, + "token_acc": 0.2887484552595429 + }, + { + "epoch": 4.532981530343008, + "grad_norm": 0.19326827839285396, + "learning_rate": 0.0004585463909311342, + "loss": 3.0320725440979004, + "step": 7733, + "token_acc": 0.29640988444668703 + }, + { + "epoch": 4.533567868660217, + "grad_norm": 0.19616406765670003, + "learning_rate": 0.0004585330273796443, + "loss": 3.0425262451171875, + "step": 7734, + "token_acc": 0.2942502671566711 + }, + { + "epoch": 4.534154206977426, + "grad_norm": 0.18366474670819022, + "learning_rate": 0.0004585196618692676, + "loss": 3.027400016784668, + "step": 7735, + "token_acc": 0.2951814160594489 + }, + { + "epoch": 4.534740545294635, + "grad_norm": 0.18902503538459364, + "learning_rate": 0.0004585062944001296, + "loss": 3.0546491146087646, + "step": 7736, + "token_acc": 0.2924429304833554 + }, + { + "epoch": 4.535326883611844, + "grad_norm": 0.18439302650081577, + "learning_rate": 0.00045849292497235605, + "loss": 3.064779043197632, + "step": 7737, + "token_acc": 0.2896026755921256 + }, + { + "epoch": 4.535913221929053, + "grad_norm": 0.20476792036272337, + "learning_rate": 0.00045847955358607235, + "loss": 3.0330610275268555, + "step": 7738, + "token_acc": 0.29280855148250523 + }, + { + "epoch": 4.536499560246262, + "grad_norm": 0.21228118470811175, + "learning_rate": 0.0004584661802414042, + "loss": 3.046804428100586, + "step": 7739, + "token_acc": 0.2936720094951936 + }, + { + "epoch": 4.537085898563471, + "grad_norm": 0.18642310165554377, + "learning_rate": 0.0004584528049384771, + "loss": 2.9907026290893555, + "step": 7740, + "token_acc": 0.30123607945294695 + }, + { + "epoch": 4.53767223688068, + "grad_norm": 0.2131402338298228, + "learning_rate": 0.00045843942767741686, + "loss": 3.011488914489746, + "step": 7741, + "token_acc": 0.3002470103901196 + }, + { + "epoch": 4.538258575197889, + "grad_norm": 0.21270579458651356, + "learning_rate": 0.0004584260484583491, + "loss": 3.0763394832611084, + "step": 7742, + "token_acc": 0.2896072496639135 + }, + { + "epoch": 4.538844913515098, + "grad_norm": 0.21310108387558624, + "learning_rate": 0.0004584126672813995, + "loss": 2.997494697570801, + "step": 7743, + "token_acc": 0.29953174036063057 + }, + { + "epoch": 4.5394312518323074, + "grad_norm": 0.2654213540846141, + "learning_rate": 0.00045839928414669366, + "loss": 3.0808634757995605, + "step": 7744, + "token_acc": 0.28987784479699935 + }, + { + "epoch": 4.540017590149517, + "grad_norm": 0.24063047560261408, + "learning_rate": 0.0004583858990543574, + "loss": 3.0638937950134277, + "step": 7745, + "token_acc": 0.2895288752209705 + }, + { + "epoch": 4.540603928466726, + "grad_norm": 0.24776016780815946, + "learning_rate": 0.00045837251200451633, + "loss": 3.0562894344329834, + "step": 7746, + "token_acc": 0.29238145218925454 + }, + { + "epoch": 4.541190266783934, + "grad_norm": 0.192615369918968, + "learning_rate": 0.0004583591229972964, + "loss": 3.066903591156006, + "step": 7747, + "token_acc": 0.2900211724331319 + }, + { + "epoch": 4.541776605101143, + "grad_norm": 0.2140920060614169, + "learning_rate": 0.00045834573203282333, + "loss": 3.0957999229431152, + "step": 7748, + "token_acc": 0.28673020567719065 + }, + { + "epoch": 4.542362943418352, + "grad_norm": 0.24122298067303372, + "learning_rate": 0.00045833233911122276, + "loss": 3.0607690811157227, + "step": 7749, + "token_acc": 0.2913910250837134 + }, + { + "epoch": 4.542949281735561, + "grad_norm": 0.19856602871552073, + "learning_rate": 0.0004583189442326206, + "loss": 3.031538486480713, + "step": 7750, + "token_acc": 0.2948558429959619 + }, + { + "epoch": 4.54353562005277, + "grad_norm": 0.20121596088634475, + "learning_rate": 0.0004583055473971427, + "loss": 3.020907402038574, + "step": 7751, + "token_acc": 0.2978971514495922 + }, + { + "epoch": 4.5441219583699795, + "grad_norm": 0.21288798828894406, + "learning_rate": 0.00045829214860491484, + "loss": 3.051473617553711, + "step": 7752, + "token_acc": 0.29312433464263427 + }, + { + "epoch": 4.544708296687189, + "grad_norm": 0.1805941019037169, + "learning_rate": 0.00045827874785606294, + "loss": 3.0340399742126465, + "step": 7753, + "token_acc": 0.2950359969259319 + }, + { + "epoch": 4.545294635004398, + "grad_norm": 0.2105900560084025, + "learning_rate": 0.0004582653451507129, + "loss": 3.025294303894043, + "step": 7754, + "token_acc": 0.29581852117439444 + }, + { + "epoch": 4.545880973321607, + "grad_norm": 0.18487510097102186, + "learning_rate": 0.0004582519404889906, + "loss": 3.031548023223877, + "step": 7755, + "token_acc": 0.29606893177837046 + }, + { + "epoch": 4.546467311638816, + "grad_norm": 0.18060953686528056, + "learning_rate": 0.0004582385338710218, + "loss": 3.0275983810424805, + "step": 7756, + "token_acc": 0.2953803574661804 + }, + { + "epoch": 4.547053649956025, + "grad_norm": 0.1749221349190226, + "learning_rate": 0.0004582251252969327, + "loss": 3.005312442779541, + "step": 7757, + "token_acc": 0.2974636776017356 + }, + { + "epoch": 4.547639988273234, + "grad_norm": 0.18756230523521286, + "learning_rate": 0.0004582117147668491, + "loss": 3.037630319595337, + "step": 7758, + "token_acc": 0.2939565479814833 + }, + { + "epoch": 4.548226326590442, + "grad_norm": 0.1644114789029569, + "learning_rate": 0.000458198302280897, + "loss": 3.0071287155151367, + "step": 7759, + "token_acc": 0.2985444621332727 + }, + { + "epoch": 4.5488126649076515, + "grad_norm": 0.17893638760164182, + "learning_rate": 0.00045818488783920243, + "loss": 3.0003890991210938, + "step": 7760, + "token_acc": 0.30030078661141185 + }, + { + "epoch": 4.549399003224861, + "grad_norm": 0.18242037025939323, + "learning_rate": 0.0004581714714418914, + "loss": 3.0751843452453613, + "step": 7761, + "token_acc": 0.2889476111533524 + }, + { + "epoch": 4.54998534154207, + "grad_norm": 0.18564942110865656, + "learning_rate": 0.0004581580530890899, + "loss": 3.0438437461853027, + "step": 7762, + "token_acc": 0.2935649247511362 + }, + { + "epoch": 4.550571679859279, + "grad_norm": 0.18099165097538658, + "learning_rate": 0.00045814463278092386, + "loss": 3.028074264526367, + "step": 7763, + "token_acc": 0.29566046349257497 + }, + { + "epoch": 4.551158018176488, + "grad_norm": 0.18097236819219453, + "learning_rate": 0.0004581312105175196, + "loss": 3.0315089225769043, + "step": 7764, + "token_acc": 0.2956141311460523 + }, + { + "epoch": 4.551744356493697, + "grad_norm": 0.19254680790331796, + "learning_rate": 0.0004581177862990031, + "loss": 3.0414376258850098, + "step": 7765, + "token_acc": 0.29523516897847374 + }, + { + "epoch": 4.552330694810906, + "grad_norm": 0.1963984932501173, + "learning_rate": 0.00045810436012550036, + "loss": 3.065140724182129, + "step": 7766, + "token_acc": 0.2916476494751255 + }, + { + "epoch": 4.552917033128115, + "grad_norm": 0.20240325099651418, + "learning_rate": 0.0004580909319971376, + "loss": 3.0250349044799805, + "step": 7767, + "token_acc": 0.29663222359900554 + }, + { + "epoch": 4.5535033714453235, + "grad_norm": 0.19617551799624186, + "learning_rate": 0.00045807750191404097, + "loss": 3.0141658782958984, + "step": 7768, + "token_acc": 0.29564309365013564 + }, + { + "epoch": 4.554089709762533, + "grad_norm": 0.21570344843240088, + "learning_rate": 0.00045806406987633654, + "loss": 3.0346503257751465, + "step": 7769, + "token_acc": 0.2962813225839695 + }, + { + "epoch": 4.554676048079742, + "grad_norm": 0.20665914542151365, + "learning_rate": 0.0004580506358841506, + "loss": 3.0941786766052246, + "step": 7770, + "token_acc": 0.286630640015626 + }, + { + "epoch": 4.555262386396951, + "grad_norm": 0.21125058683192813, + "learning_rate": 0.0004580371999376093, + "loss": 3.0495142936706543, + "step": 7771, + "token_acc": 0.29260867249094397 + }, + { + "epoch": 4.55584872471416, + "grad_norm": 0.248619213700559, + "learning_rate": 0.00045802376203683874, + "loss": 3.054462432861328, + "step": 7772, + "token_acc": 0.2914053224976269 + }, + { + "epoch": 4.556435063031369, + "grad_norm": 0.21562458466427578, + "learning_rate": 0.00045801032218196537, + "loss": 3.0564768314361572, + "step": 7773, + "token_acc": 0.29120048222330325 + }, + { + "epoch": 4.557021401348578, + "grad_norm": 0.18914746405070024, + "learning_rate": 0.00045799688037311524, + "loss": 3.01826548576355, + "step": 7774, + "token_acc": 0.2969143135800292 + }, + { + "epoch": 4.557607739665787, + "grad_norm": 0.2452973916671826, + "learning_rate": 0.00045798343661041473, + "loss": 2.999192476272583, + "step": 7775, + "token_acc": 0.30062212095954516 + }, + { + "epoch": 4.558194077982996, + "grad_norm": 0.24432388075903044, + "learning_rate": 0.00045796999089399004, + "loss": 3.0432722568511963, + "step": 7776, + "token_acc": 0.29412719574861024 + }, + { + "epoch": 4.5587804163002055, + "grad_norm": 0.18643706648495512, + "learning_rate": 0.00045795654322396763, + "loss": 3.035109519958496, + "step": 7777, + "token_acc": 0.2945248053603599 + }, + { + "epoch": 4.559366754617415, + "grad_norm": 0.194972220787616, + "learning_rate": 0.0004579430936004737, + "loss": 3.0296120643615723, + "step": 7778, + "token_acc": 0.2971370332622996 + }, + { + "epoch": 4.559953092934624, + "grad_norm": 0.17548699458488332, + "learning_rate": 0.0004579296420236346, + "loss": 3.049912452697754, + "step": 7779, + "token_acc": 0.2915303972043642 + }, + { + "epoch": 4.560539431251832, + "grad_norm": 0.1820991507949024, + "learning_rate": 0.00045791618849357665, + "loss": 3.0085248947143555, + "step": 7780, + "token_acc": 0.2982165457916047 + }, + { + "epoch": 4.561125769569041, + "grad_norm": 0.21062205360248948, + "learning_rate": 0.0004579027330104263, + "loss": 3.0634238719940186, + "step": 7781, + "token_acc": 0.2902926695842451 + }, + { + "epoch": 4.56171210788625, + "grad_norm": 0.18000310628358196, + "learning_rate": 0.0004578892755743099, + "loss": 3.064964771270752, + "step": 7782, + "token_acc": 0.29101600990075593 + }, + { + "epoch": 4.562298446203459, + "grad_norm": 0.2417925866651743, + "learning_rate": 0.000457875816185354, + "loss": 3.0440642833709717, + "step": 7783, + "token_acc": 0.29445420494299124 + }, + { + "epoch": 4.562884784520668, + "grad_norm": 0.25673945686718974, + "learning_rate": 0.0004578623548436849, + "loss": 3.041970729827881, + "step": 7784, + "token_acc": 0.29396096406022343 + }, + { + "epoch": 4.5634711228378775, + "grad_norm": 0.16908148835360404, + "learning_rate": 0.00045784889154942897, + "loss": 3.030210494995117, + "step": 7785, + "token_acc": 0.2949385145234974 + }, + { + "epoch": 4.564057461155087, + "grad_norm": 0.2641465438307317, + "learning_rate": 0.00045783542630271277, + "loss": 3.0457029342651367, + "step": 7786, + "token_acc": 0.2932319521749162 + }, + { + "epoch": 4.564643799472296, + "grad_norm": 0.21116599667111, + "learning_rate": 0.000457821959103663, + "loss": 3.0689854621887207, + "step": 7787, + "token_acc": 0.28922215395771855 + }, + { + "epoch": 4.565230137789505, + "grad_norm": 0.18972275715627357, + "learning_rate": 0.0004578084899524058, + "loss": 3.0240399837493896, + "step": 7788, + "token_acc": 0.2965556411808468 + }, + { + "epoch": 4.565816476106714, + "grad_norm": 0.21030515877549327, + "learning_rate": 0.0004577950188490679, + "loss": 3.081120014190674, + "step": 7789, + "token_acc": 0.2887685960002914 + }, + { + "epoch": 4.566402814423922, + "grad_norm": 0.1599056253271638, + "learning_rate": 0.0004577815457937758, + "loss": 3.064271926879883, + "step": 7790, + "token_acc": 0.29154171175349103 + }, + { + "epoch": 4.566989152741131, + "grad_norm": 0.19393529607962012, + "learning_rate": 0.00045776807078665605, + "loss": 3.035736083984375, + "step": 7791, + "token_acc": 0.29338998507081887 + }, + { + "epoch": 4.56757549105834, + "grad_norm": 0.18845045061097335, + "learning_rate": 0.00045775459382783537, + "loss": 3.103361129760742, + "step": 7792, + "token_acc": 0.28608454461403215 + }, + { + "epoch": 4.5681618293755495, + "grad_norm": 0.18427864509328393, + "learning_rate": 0.0004577411149174401, + "loss": 3.039100170135498, + "step": 7793, + "token_acc": 0.2945561223305029 + }, + { + "epoch": 4.568748167692759, + "grad_norm": 0.1598271897619845, + "learning_rate": 0.00045772763405559704, + "loss": 3.0363528728485107, + "step": 7794, + "token_acc": 0.293373154278315 + }, + { + "epoch": 4.569334506009968, + "grad_norm": 0.2135086611404317, + "learning_rate": 0.0004577141512424327, + "loss": 3.0375876426696777, + "step": 7795, + "token_acc": 0.29509345371526674 + }, + { + "epoch": 4.569920844327177, + "grad_norm": 0.19849947670627496, + "learning_rate": 0.0004577006664780739, + "loss": 3.039111614227295, + "step": 7796, + "token_acc": 0.29426071999677494 + }, + { + "epoch": 4.570507182644386, + "grad_norm": 0.186675314575643, + "learning_rate": 0.0004576871797626472, + "loss": 3.03743052482605, + "step": 7797, + "token_acc": 0.29343637719476645 + }, + { + "epoch": 4.571093520961595, + "grad_norm": 0.1800593933108539, + "learning_rate": 0.0004576736910962793, + "loss": 3.0046610832214355, + "step": 7798, + "token_acc": 0.29926222509163614 + }, + { + "epoch": 4.571679859278804, + "grad_norm": 0.26045604531164673, + "learning_rate": 0.0004576602004790969, + "loss": 3.0314443111419678, + "step": 7799, + "token_acc": 0.29492697603476736 + }, + { + "epoch": 4.572266197596013, + "grad_norm": 0.31667420401475516, + "learning_rate": 0.00045764670791122674, + "loss": 3.063892126083374, + "step": 7800, + "token_acc": 0.29145895559472934 + }, + { + "epoch": 4.572852535913222, + "grad_norm": 0.22121253578207276, + "learning_rate": 0.00045763321339279555, + "loss": 3.0507192611694336, + "step": 7801, + "token_acc": 0.2930239820807443 + }, + { + "epoch": 4.573438874230431, + "grad_norm": 0.21146686732468206, + "learning_rate": 0.00045761971692393014, + "loss": 3.060736656188965, + "step": 7802, + "token_acc": 0.292124773579342 + }, + { + "epoch": 4.57402521254764, + "grad_norm": 0.332990626583307, + "learning_rate": 0.00045760621850475725, + "loss": 3.0674777030944824, + "step": 7803, + "token_acc": 0.2904646465655201 + }, + { + "epoch": 4.574611550864849, + "grad_norm": 0.2678546140168523, + "learning_rate": 0.00045759271813540373, + "loss": 3.0217394828796387, + "step": 7804, + "token_acc": 0.29794296532104775 + }, + { + "epoch": 4.575197889182058, + "grad_norm": 0.2344257995751777, + "learning_rate": 0.0004575792158159963, + "loss": 3.0525519847869873, + "step": 7805, + "token_acc": 0.29195585361220194 + }, + { + "epoch": 4.575784227499267, + "grad_norm": 0.2292192891417131, + "learning_rate": 0.0004575657115466619, + "loss": 3.0600693225860596, + "step": 7806, + "token_acc": 0.29203474443002037 + }, + { + "epoch": 4.576370565816476, + "grad_norm": 0.245183151581815, + "learning_rate": 0.0004575522053275273, + "loss": 3.0749831199645996, + "step": 7807, + "token_acc": 0.2887324297062794 + }, + { + "epoch": 4.576956904133685, + "grad_norm": 0.20423631980703807, + "learning_rate": 0.00045753869715871944, + "loss": 3.0036659240722656, + "step": 7808, + "token_acc": 0.30051814888235695 + }, + { + "epoch": 4.577543242450894, + "grad_norm": 0.24868295142134825, + "learning_rate": 0.00045752518704036515, + "loss": 3.0619301795959473, + "step": 7809, + "token_acc": 0.2911696776030626 + }, + { + "epoch": 4.5781295807681035, + "grad_norm": 0.18682809427540573, + "learning_rate": 0.0004575116749725914, + "loss": 3.019085168838501, + "step": 7810, + "token_acc": 0.2965460753757619 + }, + { + "epoch": 4.578715919085313, + "grad_norm": 0.25127746898004016, + "learning_rate": 0.0004574981609555251, + "loss": 3.0414505004882812, + "step": 7811, + "token_acc": 0.2945519576920842 + }, + { + "epoch": 4.579302257402521, + "grad_norm": 0.20049475699156788, + "learning_rate": 0.00045748464498929323, + "loss": 3.0160837173461914, + "step": 7812, + "token_acc": 0.29719824642338927 + }, + { + "epoch": 4.57988859571973, + "grad_norm": 0.2524545762848182, + "learning_rate": 0.0004574711270740226, + "loss": 3.013145923614502, + "step": 7813, + "token_acc": 0.29616153991319116 + }, + { + "epoch": 4.580474934036939, + "grad_norm": 0.17402863796781934, + "learning_rate": 0.0004574576072098404, + "loss": 3.0083446502685547, + "step": 7814, + "token_acc": 0.2983774796620568 + }, + { + "epoch": 4.581061272354148, + "grad_norm": 0.20853842734291966, + "learning_rate": 0.00045744408539687343, + "loss": 3.0648179054260254, + "step": 7815, + "token_acc": 0.2925405776714844 + }, + { + "epoch": 4.581647610671357, + "grad_norm": 0.15766727232862143, + "learning_rate": 0.0004574305616352489, + "loss": 3.049630641937256, + "step": 7816, + "token_acc": 0.29218349557671486 + }, + { + "epoch": 4.582233948988566, + "grad_norm": 0.22414556018201928, + "learning_rate": 0.00045741703592509363, + "loss": 3.0668680667877197, + "step": 7817, + "token_acc": 0.29150028813446954 + }, + { + "epoch": 4.5828202873057755, + "grad_norm": 0.19722662033128643, + "learning_rate": 0.000457403508266535, + "loss": 3.011504650115967, + "step": 7818, + "token_acc": 0.29678234639411827 + }, + { + "epoch": 4.583406625622985, + "grad_norm": 0.24578665216183787, + "learning_rate": 0.00045738997865969977, + "loss": 3.045166254043579, + "step": 7819, + "token_acc": 0.2946795547837487 + }, + { + "epoch": 4.583992963940194, + "grad_norm": 0.1995592644066035, + "learning_rate": 0.00045737644710471513, + "loss": 3.0374789237976074, + "step": 7820, + "token_acc": 0.29391765711098944 + }, + { + "epoch": 4.584579302257403, + "grad_norm": 0.26004609162815795, + "learning_rate": 0.0004573629136017083, + "loss": 3.0289974212646484, + "step": 7821, + "token_acc": 0.2964375229245629 + }, + { + "epoch": 4.585165640574612, + "grad_norm": 0.19574788013819414, + "learning_rate": 0.00045734937815080626, + "loss": 3.0600624084472656, + "step": 7822, + "token_acc": 0.2910896685372845 + }, + { + "epoch": 4.585751978891821, + "grad_norm": 0.19044797189742138, + "learning_rate": 0.00045733584075213627, + "loss": 3.0403828620910645, + "step": 7823, + "token_acc": 0.2958146991600987 + }, + { + "epoch": 4.586338317209029, + "grad_norm": 0.18438665072054153, + "learning_rate": 0.00045732230140582534, + "loss": 3.0201401710510254, + "step": 7824, + "token_acc": 0.2961809414091572 + }, + { + "epoch": 4.586924655526238, + "grad_norm": 0.20182846463563778, + "learning_rate": 0.00045730876011200087, + "loss": 3.0704402923583984, + "step": 7825, + "token_acc": 0.2894408525908706 + }, + { + "epoch": 4.5875109938434475, + "grad_norm": 0.16578806136324223, + "learning_rate": 0.0004572952168707899, + "loss": 3.0314865112304688, + "step": 7826, + "token_acc": 0.29559421886920406 + }, + { + "epoch": 4.588097332160657, + "grad_norm": 0.18929741670734776, + "learning_rate": 0.0004572816716823197, + "loss": 3.0050058364868164, + "step": 7827, + "token_acc": 0.2985936064453815 + }, + { + "epoch": 4.588683670477866, + "grad_norm": 0.17837248590461222, + "learning_rate": 0.0004572681245467175, + "loss": 3.052299737930298, + "step": 7828, + "token_acc": 0.2923463210084248 + }, + { + "epoch": 4.589270008795075, + "grad_norm": 0.1713927243254849, + "learning_rate": 0.00045725457546411065, + "loss": 3.0259742736816406, + "step": 7829, + "token_acc": 0.29639184159278636 + }, + { + "epoch": 4.589856347112284, + "grad_norm": 0.18679792610351442, + "learning_rate": 0.00045724102443462625, + "loss": 3.0785880088806152, + "step": 7830, + "token_acc": 0.288134484563056 + }, + { + "epoch": 4.590442685429493, + "grad_norm": 0.1927721388980211, + "learning_rate": 0.00045722747145839174, + "loss": 2.9990954399108887, + "step": 7831, + "token_acc": 0.2989585960146511 + }, + { + "epoch": 4.591029023746702, + "grad_norm": 0.21173834980583617, + "learning_rate": 0.00045721391653553436, + "loss": 3.060718059539795, + "step": 7832, + "token_acc": 0.2898463624736104 + }, + { + "epoch": 4.59161536206391, + "grad_norm": 0.22620019919834827, + "learning_rate": 0.00045720035966618144, + "loss": 2.9942922592163086, + "step": 7833, + "token_acc": 0.30149051852723086 + }, + { + "epoch": 4.5922017003811195, + "grad_norm": 0.17384321562835217, + "learning_rate": 0.0004571868008504603, + "loss": 3.030264377593994, + "step": 7834, + "token_acc": 0.2956553886948319 + }, + { + "epoch": 4.592788038698329, + "grad_norm": 0.21505182300057427, + "learning_rate": 0.00045717324008849846, + "loss": 3.07796049118042, + "step": 7835, + "token_acc": 0.28868085989160597 + }, + { + "epoch": 4.593374377015538, + "grad_norm": 0.18935980133505287, + "learning_rate": 0.0004571596773804232, + "loss": 3.0458874702453613, + "step": 7836, + "token_acc": 0.29333017950258067 + }, + { + "epoch": 4.593960715332747, + "grad_norm": 0.20403231968247543, + "learning_rate": 0.0004571461127263618, + "loss": 3.047337055206299, + "step": 7837, + "token_acc": 0.2933505487411233 + }, + { + "epoch": 4.594547053649956, + "grad_norm": 0.18278261896458015, + "learning_rate": 0.0004571325461264419, + "loss": 3.0264110565185547, + "step": 7838, + "token_acc": 0.29516309305104127 + }, + { + "epoch": 4.595133391967165, + "grad_norm": 0.24379592111093562, + "learning_rate": 0.0004571189775807908, + "loss": 3.0627965927124023, + "step": 7839, + "token_acc": 0.290358055270973 + }, + { + "epoch": 4.595719730284374, + "grad_norm": 0.19741792380508344, + "learning_rate": 0.0004571054070895361, + "loss": 3.013413190841675, + "step": 7840, + "token_acc": 0.29737402955203435 + }, + { + "epoch": 4.596306068601583, + "grad_norm": 0.21073360258642995, + "learning_rate": 0.000457091834652805, + "loss": 3.0569615364074707, + "step": 7841, + "token_acc": 0.29184358048382464 + }, + { + "epoch": 4.596892406918792, + "grad_norm": 0.2312127443162448, + "learning_rate": 0.0004570782602707253, + "loss": 3.041816234588623, + "step": 7842, + "token_acc": 0.29471457709062526 + }, + { + "epoch": 4.5974787452360015, + "grad_norm": 0.17094772193117477, + "learning_rate": 0.0004570646839434244, + "loss": 3.0402138233184814, + "step": 7843, + "token_acc": 0.29391356545670516 + }, + { + "epoch": 4.598065083553211, + "grad_norm": 0.2050003936788715, + "learning_rate": 0.00045705110567102975, + "loss": 3.0385661125183105, + "step": 7844, + "token_acc": 0.2946338829366352 + }, + { + "epoch": 4.598651421870419, + "grad_norm": 0.24621689334207333, + "learning_rate": 0.000457037525453669, + "loss": 3.0482773780822754, + "step": 7845, + "token_acc": 0.29228956973189785 + }, + { + "epoch": 4.599237760187628, + "grad_norm": 0.17848316796310376, + "learning_rate": 0.00045702394329146965, + "loss": 3.0513901710510254, + "step": 7846, + "token_acc": 0.29211475664339714 + }, + { + "epoch": 4.599824098504837, + "grad_norm": 0.22381012855851568, + "learning_rate": 0.00045701035918455936, + "loss": 3.023817777633667, + "step": 7847, + "token_acc": 0.29527217182971865 + }, + { + "epoch": 4.600410436822046, + "grad_norm": 0.20455734716908267, + "learning_rate": 0.00045699677313306575, + "loss": 3.097198486328125, + "step": 7848, + "token_acc": 0.28609896514687994 + }, + { + "epoch": 4.600996775139255, + "grad_norm": 0.18587005632627668, + "learning_rate": 0.0004569831851371163, + "loss": 3.006114959716797, + "step": 7849, + "token_acc": 0.29873031756468255 + }, + { + "epoch": 4.601583113456464, + "grad_norm": 0.22999507208152906, + "learning_rate": 0.0004569695951968388, + "loss": 3.0409340858459473, + "step": 7850, + "token_acc": 0.29468957190366246 + }, + { + "epoch": 4.6021694517736735, + "grad_norm": 0.18745321915896673, + "learning_rate": 0.00045695600331236076, + "loss": 3.044898271560669, + "step": 7851, + "token_acc": 0.2934812183314424 + }, + { + "epoch": 4.602755790090883, + "grad_norm": 0.1958885413413641, + "learning_rate": 0.00045694240948381, + "loss": 3.060129165649414, + "step": 7852, + "token_acc": 0.2913964994057395 + }, + { + "epoch": 4.603342128408092, + "grad_norm": 0.2996371068790844, + "learning_rate": 0.00045692881371131415, + "loss": 3.036195993423462, + "step": 7853, + "token_acc": 0.29451174174360245 + }, + { + "epoch": 4.603928466725301, + "grad_norm": 0.2158338103541941, + "learning_rate": 0.000456915215995001, + "loss": 3.0701870918273926, + "step": 7854, + "token_acc": 0.28948040616583 + }, + { + "epoch": 4.604514805042509, + "grad_norm": 0.22154915762470215, + "learning_rate": 0.0004569016163349982, + "loss": 3.071502208709717, + "step": 7855, + "token_acc": 0.28925082280704034 + }, + { + "epoch": 4.605101143359718, + "grad_norm": 0.2550295610898617, + "learning_rate": 0.0004568880147314334, + "loss": 3.0128469467163086, + "step": 7856, + "token_acc": 0.29724644175911147 + }, + { + "epoch": 4.605687481676927, + "grad_norm": 0.1742770690795189, + "learning_rate": 0.00045687441118443455, + "loss": 3.0335254669189453, + "step": 7857, + "token_acc": 0.2955858178335176 + }, + { + "epoch": 4.606273819994136, + "grad_norm": 0.2895920085185948, + "learning_rate": 0.0004568608056941295, + "loss": 3.077699661254883, + "step": 7858, + "token_acc": 0.2880566480708403 + }, + { + "epoch": 4.6068601583113455, + "grad_norm": 0.17093613758917756, + "learning_rate": 0.00045684719826064567, + "loss": 3.036289691925049, + "step": 7859, + "token_acc": 0.29242059297066375 + }, + { + "epoch": 4.607446496628555, + "grad_norm": 0.2611131720770259, + "learning_rate": 0.00045683358888411136, + "loss": 3.0289061069488525, + "step": 7860, + "token_acc": 0.29745033687258937 + }, + { + "epoch": 4.608032834945764, + "grad_norm": 0.18416645205876153, + "learning_rate": 0.0004568199775646541, + "loss": 3.063633918762207, + "step": 7861, + "token_acc": 0.289769503813153 + }, + { + "epoch": 4.608619173262973, + "grad_norm": 0.23524793088860999, + "learning_rate": 0.00045680636430240186, + "loss": 3.0569167137145996, + "step": 7862, + "token_acc": 0.2922040643500443 + }, + { + "epoch": 4.609205511580182, + "grad_norm": 0.17062133038262667, + "learning_rate": 0.0004567927490974826, + "loss": 3.033095359802246, + "step": 7863, + "token_acc": 0.2949894443157707 + }, + { + "epoch": 4.609791849897391, + "grad_norm": 0.24907997336352825, + "learning_rate": 0.00045677913195002397, + "loss": 3.0427675247192383, + "step": 7864, + "token_acc": 0.2944004503752712 + }, + { + "epoch": 4.6103781882146, + "grad_norm": 0.17532333886551307, + "learning_rate": 0.0004567655128601541, + "loss": 3.025683879852295, + "step": 7865, + "token_acc": 0.29527851212837997 + }, + { + "epoch": 4.610964526531809, + "grad_norm": 0.28415268709081853, + "learning_rate": 0.00045675189182800086, + "loss": 3.0268681049346924, + "step": 7866, + "token_acc": 0.2958615477553579 + }, + { + "epoch": 4.6115508648490176, + "grad_norm": 0.18329562167446117, + "learning_rate": 0.0004567382688536922, + "loss": 3.079500675201416, + "step": 7867, + "token_acc": 0.29093990192372476 + }, + { + "epoch": 4.612137203166227, + "grad_norm": 0.2929201779464907, + "learning_rate": 0.000456724643937356, + "loss": 3.06679630279541, + "step": 7868, + "token_acc": 0.2911921952480173 + }, + { + "epoch": 4.612723541483436, + "grad_norm": 0.19688207280450812, + "learning_rate": 0.00045671101707912045, + "loss": 3.0999252796173096, + "step": 7869, + "token_acc": 0.28521464085588744 + }, + { + "epoch": 4.613309879800645, + "grad_norm": 0.2433088538473592, + "learning_rate": 0.00045669738827911345, + "loss": 3.0678224563598633, + "step": 7870, + "token_acc": 0.2912764277384706 + }, + { + "epoch": 4.613896218117854, + "grad_norm": 0.17938131011122063, + "learning_rate": 0.0004566837575374629, + "loss": 3.015923261642456, + "step": 7871, + "token_acc": 0.29763721190486675 + }, + { + "epoch": 4.614482556435063, + "grad_norm": 0.355606150140134, + "learning_rate": 0.00045667012485429704, + "loss": 3.0801806449890137, + "step": 7872, + "token_acc": 0.2873492467554146 + }, + { + "epoch": 4.615068894752272, + "grad_norm": 0.1667961255781519, + "learning_rate": 0.0004566564902297439, + "loss": 3.0267834663391113, + "step": 7873, + "token_acc": 0.29539316460076687 + }, + { + "epoch": 4.615655233069481, + "grad_norm": 0.22374083841946085, + "learning_rate": 0.0004566428536639314, + "loss": 3.0202596187591553, + "step": 7874, + "token_acc": 0.2984316741672012 + }, + { + "epoch": 4.6162415713866904, + "grad_norm": 0.19967579583702366, + "learning_rate": 0.00045662921515698783, + "loss": 3.031914234161377, + "step": 7875, + "token_acc": 0.29517436801700175 + }, + { + "epoch": 4.616827909703899, + "grad_norm": 0.20627432170159724, + "learning_rate": 0.00045661557470904116, + "loss": 3.0710954666137695, + "step": 7876, + "token_acc": 0.29067888963730854 + }, + { + "epoch": 4.617414248021108, + "grad_norm": 0.21152795050297074, + "learning_rate": 0.0004566019323202196, + "loss": 2.979069232940674, + "step": 7877, + "token_acc": 0.30351272646195926 + }, + { + "epoch": 4.618000586338317, + "grad_norm": 0.17841260157409825, + "learning_rate": 0.00045658828799065125, + "loss": 3.0461244583129883, + "step": 7878, + "token_acc": 0.2916355054330527 + }, + { + "epoch": 4.618586924655526, + "grad_norm": 0.24400830015675043, + "learning_rate": 0.0004565746417204644, + "loss": 3.0102062225341797, + "step": 7879, + "token_acc": 0.2982942980082966 + }, + { + "epoch": 4.619173262972735, + "grad_norm": 0.16365542200693173, + "learning_rate": 0.0004565609935097871, + "loss": 3.024242401123047, + "step": 7880, + "token_acc": 0.29586128449685317 + }, + { + "epoch": 4.619759601289944, + "grad_norm": 0.21296580449234057, + "learning_rate": 0.0004565473433587476, + "loss": 3.0820741653442383, + "step": 7881, + "token_acc": 0.28791384786789004 + }, + { + "epoch": 4.620345939607153, + "grad_norm": 0.1872963026281644, + "learning_rate": 0.0004565336912674741, + "loss": 3.0502171516418457, + "step": 7882, + "token_acc": 0.29294386904069203 + }, + { + "epoch": 4.6209322779243625, + "grad_norm": 0.22810212910547079, + "learning_rate": 0.000456520037236095, + "loss": 3.0690391063690186, + "step": 7883, + "token_acc": 0.28983165088312374 + }, + { + "epoch": 4.621518616241572, + "grad_norm": 0.21666338495556886, + "learning_rate": 0.00045650638126473834, + "loss": 3.068673849105835, + "step": 7884, + "token_acc": 0.28956878234390915 + }, + { + "epoch": 4.622104954558781, + "grad_norm": 0.21799264536862453, + "learning_rate": 0.00045649272335353253, + "loss": 3.0840187072753906, + "step": 7885, + "token_acc": 0.28795465357232125 + }, + { + "epoch": 4.62269129287599, + "grad_norm": 0.22357206252272302, + "learning_rate": 0.0004564790635026058, + "loss": 3.0425257682800293, + "step": 7886, + "token_acc": 0.29420384456552146 + }, + { + "epoch": 4.623277631193199, + "grad_norm": 0.19480698400380914, + "learning_rate": 0.00045646540171208664, + "loss": 3.0728821754455566, + "step": 7887, + "token_acc": 0.28945330325827606 + }, + { + "epoch": 4.623863969510407, + "grad_norm": 0.22484644591645092, + "learning_rate": 0.000456451737982103, + "loss": 3.053065299987793, + "step": 7888, + "token_acc": 0.29279723024378324 + }, + { + "epoch": 4.624450307827616, + "grad_norm": 0.20016241449974334, + "learning_rate": 0.0004564380723127837, + "loss": 3.064450740814209, + "step": 7889, + "token_acc": 0.29046608921100703 + }, + { + "epoch": 4.625036646144825, + "grad_norm": 0.23135679916179752, + "learning_rate": 0.00045642440470425685, + "loss": 3.0510663986206055, + "step": 7890, + "token_acc": 0.29181653386194395 + }, + { + "epoch": 4.6256229844620345, + "grad_norm": 0.16506234400865674, + "learning_rate": 0.00045641073515665075, + "loss": 3.039574146270752, + "step": 7891, + "token_acc": 0.29419240189939333 + }, + { + "epoch": 4.626209322779244, + "grad_norm": 0.22646823073817215, + "learning_rate": 0.0004563970636700941, + "loss": 3.00673246383667, + "step": 7892, + "token_acc": 0.30064467052301497 + }, + { + "epoch": 4.626795661096453, + "grad_norm": 0.18024243954908933, + "learning_rate": 0.0004563833902447151, + "loss": 3.0074453353881836, + "step": 7893, + "token_acc": 0.2990538246841353 + }, + { + "epoch": 4.627381999413662, + "grad_norm": 0.20224201740298728, + "learning_rate": 0.00045636971488064224, + "loss": 3.032456159591675, + "step": 7894, + "token_acc": 0.29520516213580555 + }, + { + "epoch": 4.627968337730871, + "grad_norm": 0.16375196398798933, + "learning_rate": 0.0004563560375780039, + "loss": 3.0559322834014893, + "step": 7895, + "token_acc": 0.2915450771665194 + }, + { + "epoch": 4.62855467604808, + "grad_norm": 0.18210666923371072, + "learning_rate": 0.00045634235833692886, + "loss": 3.018622398376465, + "step": 7896, + "token_acc": 0.2971205811538342 + }, + { + "epoch": 4.629141014365289, + "grad_norm": 0.17284458243342243, + "learning_rate": 0.0004563286771575453, + "loss": 3.0370490550994873, + "step": 7897, + "token_acc": 0.2960802314286019 + }, + { + "epoch": 4.629727352682497, + "grad_norm": 0.16363261089068157, + "learning_rate": 0.00045631499403998177, + "loss": 3.050380229949951, + "step": 7898, + "token_acc": 0.2914851664436687 + }, + { + "epoch": 4.6303136909997065, + "grad_norm": 0.19163305783562323, + "learning_rate": 0.000456301308984367, + "loss": 3.060014486312866, + "step": 7899, + "token_acc": 0.2909861438870166 + }, + { + "epoch": 4.630900029316916, + "grad_norm": 0.1691149577828513, + "learning_rate": 0.0004562876219908294, + "loss": 3.047626495361328, + "step": 7900, + "token_acc": 0.29457769186199084 + }, + { + "epoch": 4.631486367634125, + "grad_norm": 0.17555885152623965, + "learning_rate": 0.00045627393305949754, + "loss": 3.066770553588867, + "step": 7901, + "token_acc": 0.29082377867067577 + }, + { + "epoch": 4.632072705951334, + "grad_norm": 0.191307856764747, + "learning_rate": 0.00045626024219050013, + "loss": 3.025543212890625, + "step": 7902, + "token_acc": 0.29692497704025084 + }, + { + "epoch": 4.632659044268543, + "grad_norm": 0.18382038351374802, + "learning_rate": 0.0004562465493839656, + "loss": 3.025158166885376, + "step": 7903, + "token_acc": 0.2972709424254862 + }, + { + "epoch": 4.633245382585752, + "grad_norm": 0.18080656463741146, + "learning_rate": 0.00045623285464002264, + "loss": 3.042851448059082, + "step": 7904, + "token_acc": 0.292723327997958 + }, + { + "epoch": 4.633831720902961, + "grad_norm": 0.1863702473763146, + "learning_rate": 0.0004562191579587999, + "loss": 3.0448098182678223, + "step": 7905, + "token_acc": 0.29331946394647723 + }, + { + "epoch": 4.63441805922017, + "grad_norm": 0.20571475016046592, + "learning_rate": 0.0004562054593404261, + "loss": 3.029006004333496, + "step": 7906, + "token_acc": 0.29477180002984016 + }, + { + "epoch": 4.635004397537379, + "grad_norm": 0.19023827110172287, + "learning_rate": 0.0004561917587850299, + "loss": 3.0246174335479736, + "step": 7907, + "token_acc": 0.2962522989105136 + }, + { + "epoch": 4.6355907358545885, + "grad_norm": 0.16705615844782765, + "learning_rate": 0.00045617805629273996, + "loss": 2.9670491218566895, + "step": 7908, + "token_acc": 0.3021874776605631 + }, + { + "epoch": 4.636177074171798, + "grad_norm": 0.18374161421009763, + "learning_rate": 0.000456164351863685, + "loss": 3.076702117919922, + "step": 7909, + "token_acc": 0.2900571668496867 + }, + { + "epoch": 4.636763412489006, + "grad_norm": 0.27914898465552157, + "learning_rate": 0.0004561506454979937, + "loss": 3.0288381576538086, + "step": 7910, + "token_acc": 0.2968009842304295 + }, + { + "epoch": 4.637349750806215, + "grad_norm": 0.38585079303278547, + "learning_rate": 0.000456136937195795, + "loss": 3.0211286544799805, + "step": 7911, + "token_acc": 0.2978466708190921 + }, + { + "epoch": 4.637936089123424, + "grad_norm": 0.26664361093245875, + "learning_rate": 0.00045612322695721746, + "loss": 3.002315044403076, + "step": 7912, + "token_acc": 0.2998039296837008 + }, + { + "epoch": 4.638522427440633, + "grad_norm": 0.1955027416202244, + "learning_rate": 0.00045610951478239, + "loss": 3.075575828552246, + "step": 7913, + "token_acc": 0.28842737728744305 + }, + { + "epoch": 4.639108765757842, + "grad_norm": 0.21969880187082347, + "learning_rate": 0.00045609580067144137, + "loss": 3.0196433067321777, + "step": 7914, + "token_acc": 0.29609575522014425 + }, + { + "epoch": 4.639695104075051, + "grad_norm": 0.16791252299413129, + "learning_rate": 0.0004560820846245004, + "loss": 3.035079002380371, + "step": 7915, + "token_acc": 0.29473528239610053 + }, + { + "epoch": 4.6402814423922605, + "grad_norm": 0.2018297573707955, + "learning_rate": 0.0004560683666416959, + "loss": 3.017077922821045, + "step": 7916, + "token_acc": 0.29734695092130903 + }, + { + "epoch": 4.64086778070947, + "grad_norm": 0.16422557662644138, + "learning_rate": 0.00045605464672315686, + "loss": 3.0430288314819336, + "step": 7917, + "token_acc": 0.29398592450415867 + }, + { + "epoch": 4.641454119026679, + "grad_norm": 0.2170352520711253, + "learning_rate": 0.00045604092486901205, + "loss": 3.0379128456115723, + "step": 7918, + "token_acc": 0.29385875342329465 + }, + { + "epoch": 4.642040457343887, + "grad_norm": 0.18728233539796438, + "learning_rate": 0.0004560272010793904, + "loss": 3.047757148742676, + "step": 7919, + "token_acc": 0.2899079976875165 + }, + { + "epoch": 4.642626795661096, + "grad_norm": 0.2208402318133815, + "learning_rate": 0.00045601347535442077, + "loss": 3.096238136291504, + "step": 7920, + "token_acc": 0.2852529191770701 + }, + { + "epoch": 4.643213133978305, + "grad_norm": 0.1747388048875358, + "learning_rate": 0.00045599974769423217, + "loss": 3.0435028076171875, + "step": 7921, + "token_acc": 0.2934999616571515 + }, + { + "epoch": 4.643799472295514, + "grad_norm": 0.1737587310837407, + "learning_rate": 0.00045598601809895356, + "loss": 3.0723254680633545, + "step": 7922, + "token_acc": 0.29071351738974543 + }, + { + "epoch": 4.644385810612723, + "grad_norm": 0.20380677393158111, + "learning_rate": 0.00045597228656871387, + "loss": 3.0363826751708984, + "step": 7923, + "token_acc": 0.29297960165949055 + }, + { + "epoch": 4.6449721489299325, + "grad_norm": 0.1689565559263999, + "learning_rate": 0.0004559585531036421, + "loss": 3.0367989540100098, + "step": 7924, + "token_acc": 0.29468068167877365 + }, + { + "epoch": 4.645558487247142, + "grad_norm": 0.19029537834983035, + "learning_rate": 0.00045594481770386725, + "loss": 3.0834813117980957, + "step": 7925, + "token_acc": 0.2869083526535483 + }, + { + "epoch": 4.646144825564351, + "grad_norm": 0.18508468190135086, + "learning_rate": 0.00045593108036951836, + "loss": 3.064253330230713, + "step": 7926, + "token_acc": 0.29141578644527555 + }, + { + "epoch": 4.64673116388156, + "grad_norm": 0.1897579673008172, + "learning_rate": 0.00045591734110072445, + "loss": 3.0779192447662354, + "step": 7927, + "token_acc": 0.28720325925152124 + }, + { + "epoch": 4.647317502198769, + "grad_norm": 0.17887400008863225, + "learning_rate": 0.0004559035998976146, + "loss": 3.045581579208374, + "step": 7928, + "token_acc": 0.2917418955653832 + }, + { + "epoch": 4.647903840515978, + "grad_norm": 0.17211340359954436, + "learning_rate": 0.0004558898567603179, + "loss": 3.0282535552978516, + "step": 7929, + "token_acc": 0.2951539143721084 + }, + { + "epoch": 4.648490178833187, + "grad_norm": 0.1729761969683447, + "learning_rate": 0.0004558761116889634, + "loss": 3.1143007278442383, + "step": 7930, + "token_acc": 0.28410785599183275 + }, + { + "epoch": 4.649076517150396, + "grad_norm": 0.18520001643031803, + "learning_rate": 0.00045586236468368025, + "loss": 3.1094765663146973, + "step": 7931, + "token_acc": 0.2844304967390566 + }, + { + "epoch": 4.6496628554676045, + "grad_norm": 0.231702725486281, + "learning_rate": 0.0004558486157445977, + "loss": 3.0570220947265625, + "step": 7932, + "token_acc": 0.29154021295298205 + }, + { + "epoch": 4.650249193784814, + "grad_norm": 0.2630793672819458, + "learning_rate": 0.0004558348648718447, + "loss": 3.0372509956359863, + "step": 7933, + "token_acc": 0.29441440614667697 + }, + { + "epoch": 4.650835532102023, + "grad_norm": 0.20091752943460603, + "learning_rate": 0.00045582111206555044, + "loss": 3.027027130126953, + "step": 7934, + "token_acc": 0.2954581870047044 + }, + { + "epoch": 4.651421870419232, + "grad_norm": 0.21700402526138166, + "learning_rate": 0.0004558073573258443, + "loss": 3.010606527328491, + "step": 7935, + "token_acc": 0.2982048437639171 + }, + { + "epoch": 4.652008208736441, + "grad_norm": 0.2566924474291895, + "learning_rate": 0.0004557936006528553, + "loss": 3.042030096054077, + "step": 7936, + "token_acc": 0.29335111633015637 + }, + { + "epoch": 4.65259454705365, + "grad_norm": 0.17364982250510116, + "learning_rate": 0.00045577984204671275, + "loss": 3.056523323059082, + "step": 7937, + "token_acc": 0.2916415561585002 + }, + { + "epoch": 4.653180885370859, + "grad_norm": 0.21585042843398988, + "learning_rate": 0.0004557660815075459, + "loss": 3.0384583473205566, + "step": 7938, + "token_acc": 0.29403689403689404 + }, + { + "epoch": 4.653767223688068, + "grad_norm": 0.18253060876833255, + "learning_rate": 0.000455752319035484, + "loss": 3.0295188426971436, + "step": 7939, + "token_acc": 0.29612860313975015 + }, + { + "epoch": 4.654353562005277, + "grad_norm": 0.19532156889457608, + "learning_rate": 0.0004557385546306562, + "loss": 3.0767831802368164, + "step": 7940, + "token_acc": 0.28948370242986726 + }, + { + "epoch": 4.654939900322486, + "grad_norm": 0.18711516343176132, + "learning_rate": 0.000455724788293192, + "loss": 3.0394368171691895, + "step": 7941, + "token_acc": 0.2931789336717825 + }, + { + "epoch": 4.655526238639695, + "grad_norm": 0.17877288996359258, + "learning_rate": 0.00045571102002322063, + "loss": 3.0784835815429688, + "step": 7942, + "token_acc": 0.286616542177456 + }, + { + "epoch": 4.656112576956904, + "grad_norm": 0.19452783072630556, + "learning_rate": 0.0004556972498208715, + "loss": 3.046689987182617, + "step": 7943, + "token_acc": 0.29318849163841665 + }, + { + "epoch": 4.656698915274113, + "grad_norm": 0.18372136955374005, + "learning_rate": 0.00045568347768627375, + "loss": 3.056165933609009, + "step": 7944, + "token_acc": 0.2933575753759615 + }, + { + "epoch": 4.657285253591322, + "grad_norm": 0.17967535725800876, + "learning_rate": 0.00045566970361955695, + "loss": 3.0811591148376465, + "step": 7945, + "token_acc": 0.2884608936313996 + }, + { + "epoch": 4.657871591908531, + "grad_norm": 0.18651299542920274, + "learning_rate": 0.0004556559276208504, + "loss": 3.041147232055664, + "step": 7946, + "token_acc": 0.29506214543855697 + }, + { + "epoch": 4.65845793022574, + "grad_norm": 0.19665698338050452, + "learning_rate": 0.00045564214969028363, + "loss": 3.0251402854919434, + "step": 7947, + "token_acc": 0.29764220683215314 + }, + { + "epoch": 4.659044268542949, + "grad_norm": 0.16507049214442232, + "learning_rate": 0.00045562836982798597, + "loss": 3.0489554405212402, + "step": 7948, + "token_acc": 0.2923183175742111 + }, + { + "epoch": 4.6596306068601585, + "grad_norm": 0.25184623437633835, + "learning_rate": 0.0004556145880340867, + "loss": 3.0518088340759277, + "step": 7949, + "token_acc": 0.2906663467381637 + }, + { + "epoch": 4.660216945177368, + "grad_norm": 0.24003682477099095, + "learning_rate": 0.00045560080430871557, + "loss": 3.0464553833007812, + "step": 7950, + "token_acc": 0.2935356564253511 + }, + { + "epoch": 4.660803283494577, + "grad_norm": 0.19114426666923445, + "learning_rate": 0.0004555870186520019, + "loss": 3.099944591522217, + "step": 7951, + "token_acc": 0.2836475124511327 + }, + { + "epoch": 4.661389621811786, + "grad_norm": 0.1833926153591248, + "learning_rate": 0.00045557323106407523, + "loss": 3.057783603668213, + "step": 7952, + "token_acc": 0.2921743555575375 + }, + { + "epoch": 4.661975960128994, + "grad_norm": 0.1766573805747921, + "learning_rate": 0.0004555594415450651, + "loss": 3.047929525375366, + "step": 7953, + "token_acc": 0.2924752077554809 + }, + { + "epoch": 4.662562298446203, + "grad_norm": 0.16241575140511705, + "learning_rate": 0.000455545650095101, + "loss": 3.0324549674987793, + "step": 7954, + "token_acc": 0.2939304717516735 + }, + { + "epoch": 4.663148636763412, + "grad_norm": 0.19158533723766624, + "learning_rate": 0.0004555318567143124, + "loss": 3.039604902267456, + "step": 7955, + "token_acc": 0.29435390003248857 + }, + { + "epoch": 4.663734975080621, + "grad_norm": 0.17666676190524033, + "learning_rate": 0.0004555180614028291, + "loss": 3.053238868713379, + "step": 7956, + "token_acc": 0.2924349273165928 + }, + { + "epoch": 4.6643213133978305, + "grad_norm": 0.18959895030074603, + "learning_rate": 0.0004555042641607805, + "loss": 3.017399549484253, + "step": 7957, + "token_acc": 0.29614065934065936 + }, + { + "epoch": 4.66490765171504, + "grad_norm": 0.19466719233473379, + "learning_rate": 0.0004554904649882962, + "loss": 3.0840866565704346, + "step": 7958, + "token_acc": 0.288114284529284 + }, + { + "epoch": 4.665493990032249, + "grad_norm": 0.17665803180044912, + "learning_rate": 0.0004554766638855059, + "loss": 3.0078773498535156, + "step": 7959, + "token_acc": 0.29760857307077304 + }, + { + "epoch": 4.666080328349458, + "grad_norm": 0.19179519989684782, + "learning_rate": 0.0004554628608525393, + "loss": 3.0187392234802246, + "step": 7960, + "token_acc": 0.29850843212599737 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.20598931669329482, + "learning_rate": 0.00045544905588952594, + "loss": 3.0235226154327393, + "step": 7961, + "token_acc": 0.29678976762818426 + }, + { + "epoch": 4.667253004983876, + "grad_norm": 0.19616889802378318, + "learning_rate": 0.00045543524899659555, + "loss": 3.0523688793182373, + "step": 7962, + "token_acc": 0.29227009653112773 + }, + { + "epoch": 4.667839343301084, + "grad_norm": 0.1762435468858557, + "learning_rate": 0.00045542144017387783, + "loss": 3.0494980812072754, + "step": 7963, + "token_acc": 0.29309676021783265 + }, + { + "epoch": 4.668425681618293, + "grad_norm": 0.19397939343755546, + "learning_rate": 0.0004554076294215025, + "loss": 3.036929130554199, + "step": 7964, + "token_acc": 0.29460000895040683 + }, + { + "epoch": 4.6690120199355025, + "grad_norm": 0.19928160891966196, + "learning_rate": 0.00045539381673959925, + "loss": 3.026571750640869, + "step": 7965, + "token_acc": 0.2967803710353082 + }, + { + "epoch": 4.669598358252712, + "grad_norm": 0.20083328752098722, + "learning_rate": 0.0004553800021282979, + "loss": 3.075126886367798, + "step": 7966, + "token_acc": 0.2919400505460439 + }, + { + "epoch": 4.670184696569921, + "grad_norm": 0.19828025977232586, + "learning_rate": 0.00045536618558772815, + "loss": 3.0255038738250732, + "step": 7967, + "token_acc": 0.29656638515925354 + }, + { + "epoch": 4.67077103488713, + "grad_norm": 0.23931765458144427, + "learning_rate": 0.0004553523671180198, + "loss": 3.0303902626037598, + "step": 7968, + "token_acc": 0.2946499158335171 + }, + { + "epoch": 4.671357373204339, + "grad_norm": 0.23160710522101882, + "learning_rate": 0.00045533854671930276, + "loss": 3.004765510559082, + "step": 7969, + "token_acc": 0.2994257522698485 + }, + { + "epoch": 4.671943711521548, + "grad_norm": 0.17454445338296665, + "learning_rate": 0.0004553247243917067, + "loss": 2.994231939315796, + "step": 7970, + "token_acc": 0.30172474775924274 + }, + { + "epoch": 4.672530049838757, + "grad_norm": 0.19136040893728268, + "learning_rate": 0.00045531090013536154, + "loss": 3.095249652862549, + "step": 7971, + "token_acc": 0.2870891465540238 + }, + { + "epoch": 4.673116388155966, + "grad_norm": 0.21271905979598432, + "learning_rate": 0.0004552970739503971, + "loss": 3.039263963699341, + "step": 7972, + "token_acc": 0.2957374732423052 + }, + { + "epoch": 4.673702726473175, + "grad_norm": 0.18717369407524545, + "learning_rate": 0.00045528324583694337, + "loss": 3.0271553993225098, + "step": 7973, + "token_acc": 0.2952407367717697 + }, + { + "epoch": 4.6742890647903845, + "grad_norm": 0.23895039032356719, + "learning_rate": 0.00045526941579513016, + "loss": 3.087270498275757, + "step": 7974, + "token_acc": 0.28823357395443544 + }, + { + "epoch": 4.674875403107593, + "grad_norm": 0.22780370556280186, + "learning_rate": 0.00045525558382508735, + "loss": 3.078446388244629, + "step": 7975, + "token_acc": 0.29006199712624453 + }, + { + "epoch": 4.675461741424802, + "grad_norm": 0.17099853006182997, + "learning_rate": 0.00045524174992694487, + "loss": 3.058983325958252, + "step": 7976, + "token_acc": 0.290619735008424 + }, + { + "epoch": 4.676048079742011, + "grad_norm": 0.193058552086614, + "learning_rate": 0.00045522791410083284, + "loss": 3.0329530239105225, + "step": 7977, + "token_acc": 0.293404875463699 + }, + { + "epoch": 4.67663441805922, + "grad_norm": 0.18315649280201907, + "learning_rate": 0.0004552140763468811, + "loss": 3.0383377075195312, + "step": 7978, + "token_acc": 0.2963654922500046 + }, + { + "epoch": 4.677220756376429, + "grad_norm": 0.1951195549944361, + "learning_rate": 0.0004552002366652195, + "loss": 3.0981011390686035, + "step": 7979, + "token_acc": 0.28623494696546775 + }, + { + "epoch": 4.677807094693638, + "grad_norm": 0.23991718787673869, + "learning_rate": 0.0004551863950559783, + "loss": 3.0166354179382324, + "step": 7980, + "token_acc": 0.2987220048635354 + }, + { + "epoch": 4.678393433010847, + "grad_norm": 0.2864690018755094, + "learning_rate": 0.0004551725515192874, + "loss": 2.9783473014831543, + "step": 7981, + "token_acc": 0.3018044315424703 + }, + { + "epoch": 4.6789797713280565, + "grad_norm": 0.24496585215119268, + "learning_rate": 0.00045515870605527674, + "loss": 3.055659055709839, + "step": 7982, + "token_acc": 0.292387446515628 + }, + { + "epoch": 4.679566109645266, + "grad_norm": 0.1906869296410676, + "learning_rate": 0.00045514485866407663, + "loss": 3.052459239959717, + "step": 7983, + "token_acc": 0.2929727940133166 + }, + { + "epoch": 4.680152447962474, + "grad_norm": 0.2711644118935266, + "learning_rate": 0.000455131009345817, + "loss": 3.1056301593780518, + "step": 7984, + "token_acc": 0.2854237357952351 + }, + { + "epoch": 4.680738786279683, + "grad_norm": 0.18580643623762055, + "learning_rate": 0.0004551171581006279, + "loss": 3.039222240447998, + "step": 7985, + "token_acc": 0.29235740599593496 + }, + { + "epoch": 4.681325124596892, + "grad_norm": 0.20545877839320048, + "learning_rate": 0.0004551033049286395, + "loss": 3.036125421524048, + "step": 7986, + "token_acc": 0.2946557985131618 + }, + { + "epoch": 4.681911462914101, + "grad_norm": 0.19072946114544526, + "learning_rate": 0.00045508944982998196, + "loss": 3.0507709980010986, + "step": 7987, + "token_acc": 0.2917572596324713 + }, + { + "epoch": 4.68249780123131, + "grad_norm": 0.1800013035477047, + "learning_rate": 0.0004550755928047854, + "loss": 3.065943717956543, + "step": 7988, + "token_acc": 0.2915095113151853 + }, + { + "epoch": 4.683084139548519, + "grad_norm": 0.24032943331994272, + "learning_rate": 0.0004550617338531799, + "loss": 3.059300422668457, + "step": 7989, + "token_acc": 0.2914615165510366 + }, + { + "epoch": 4.6836704778657285, + "grad_norm": 0.17912214434369897, + "learning_rate": 0.00045504787297529593, + "loss": 3.000694513320923, + "step": 7990, + "token_acc": 0.29870037188870296 + }, + { + "epoch": 4.684256816182938, + "grad_norm": 0.23300606223592524, + "learning_rate": 0.00045503401017126335, + "loss": 3.005760669708252, + "step": 7991, + "token_acc": 0.29930079069463505 + }, + { + "epoch": 4.684843154500147, + "grad_norm": 0.18061912017548534, + "learning_rate": 0.00045502014544121256, + "loss": 2.9905614852905273, + "step": 7992, + "token_acc": 0.30144510889275383 + }, + { + "epoch": 4.685429492817356, + "grad_norm": 0.2323223308557717, + "learning_rate": 0.00045500627878527377, + "loss": 3.0289628505706787, + "step": 7993, + "token_acc": 0.2964127637119442 + }, + { + "epoch": 4.686015831134565, + "grad_norm": 0.19494810061687487, + "learning_rate": 0.00045499241020357725, + "loss": 3.0282387733459473, + "step": 7994, + "token_acc": 0.29615759826289456 + }, + { + "epoch": 4.686602169451774, + "grad_norm": 0.18641150233663467, + "learning_rate": 0.00045497853969625327, + "loss": 3.0627260208129883, + "step": 7995, + "token_acc": 0.29029246711283013 + }, + { + "epoch": 4.687188507768982, + "grad_norm": 0.2641989914205502, + "learning_rate": 0.0004549646672634321, + "loss": 3.0462896823883057, + "step": 7996, + "token_acc": 0.29244388969789425 + }, + { + "epoch": 4.687774846086191, + "grad_norm": 0.19881687849755472, + "learning_rate": 0.0004549507929052441, + "loss": 3.046302556991577, + "step": 7997, + "token_acc": 0.2915923981277607 + }, + { + "epoch": 4.6883611844034006, + "grad_norm": 0.25521549583996905, + "learning_rate": 0.0004549369166218196, + "loss": 3.0197601318359375, + "step": 7998, + "token_acc": 0.2967985360036189 + }, + { + "epoch": 4.68894752272061, + "grad_norm": 0.2189742797385495, + "learning_rate": 0.00045492303841328886, + "loss": 3.028839588165283, + "step": 7999, + "token_acc": 0.29546595540359977 + }, + { + "epoch": 4.689533861037819, + "grad_norm": 0.2298528077790898, + "learning_rate": 0.0004549091582797823, + "loss": 3.069355010986328, + "step": 8000, + "token_acc": 0.28953341740226984 + }, + { + "epoch": 4.690120199355028, + "grad_norm": 0.25191456665912076, + "learning_rate": 0.00045489527622143036, + "loss": 3.1318163871765137, + "step": 8001, + "token_acc": 0.2814052123444389 + }, + { + "epoch": 4.690706537672237, + "grad_norm": 0.17006869245891404, + "learning_rate": 0.0004548813922383634, + "loss": 3.0846824645996094, + "step": 8002, + "token_acc": 0.28713435596649717 + }, + { + "epoch": 4.691292875989446, + "grad_norm": 0.2035730204122709, + "learning_rate": 0.0004548675063307118, + "loss": 3.017592430114746, + "step": 8003, + "token_acc": 0.2971963110747034 + }, + { + "epoch": 4.691879214306655, + "grad_norm": 0.17235350820108492, + "learning_rate": 0.0004548536184986061, + "loss": 3.01029109954834, + "step": 8004, + "token_acc": 0.2991354596961694 + }, + { + "epoch": 4.692465552623864, + "grad_norm": 0.19768502274816915, + "learning_rate": 0.0004548397287421767, + "loss": 3.0274598598480225, + "step": 8005, + "token_acc": 0.2956225782804482 + }, + { + "epoch": 4.693051890941073, + "grad_norm": 0.21678663790588082, + "learning_rate": 0.000454825837061554, + "loss": 3.079550266265869, + "step": 8006, + "token_acc": 0.28935440047719024 + }, + { + "epoch": 4.693638229258282, + "grad_norm": 0.20483762918553944, + "learning_rate": 0.0004548119434568686, + "loss": 3.072958469390869, + "step": 8007, + "token_acc": 0.2895830115418826 + }, + { + "epoch": 4.694224567575491, + "grad_norm": 0.16952649683925472, + "learning_rate": 0.000454798047928251, + "loss": 3.0399582386016846, + "step": 8008, + "token_acc": 0.294707421022865 + }, + { + "epoch": 4.6948109058927, + "grad_norm": 0.17145382482469476, + "learning_rate": 0.0004547841504758318, + "loss": 3.0316524505615234, + "step": 8009, + "token_acc": 0.2953471109895691 + }, + { + "epoch": 4.695397244209909, + "grad_norm": 0.17465604321263767, + "learning_rate": 0.00045477025109974127, + "loss": 3.032283067703247, + "step": 8010, + "token_acc": 0.294787049756504 + }, + { + "epoch": 4.695983582527118, + "grad_norm": 0.1611856137648926, + "learning_rate": 0.00045475634980011027, + "loss": 3.094754695892334, + "step": 8011, + "token_acc": 0.28731652713195266 + }, + { + "epoch": 4.696569920844327, + "grad_norm": 0.18956354868649164, + "learning_rate": 0.0004547424465770693, + "loss": 3.093773365020752, + "step": 8012, + "token_acc": 0.2847676109902721 + }, + { + "epoch": 4.697156259161536, + "grad_norm": 0.21644635686943645, + "learning_rate": 0.0004547285414307488, + "loss": 3.0396718978881836, + "step": 8013, + "token_acc": 0.2941452958343018 + }, + { + "epoch": 4.6977425974787455, + "grad_norm": 0.1689955887525404, + "learning_rate": 0.0004547146343612797, + "loss": 3.0749409198760986, + "step": 8014, + "token_acc": 0.28851665599311294 + }, + { + "epoch": 4.698328935795955, + "grad_norm": 0.1843501858755421, + "learning_rate": 0.00045470072536879237, + "loss": 3.05033540725708, + "step": 8015, + "token_acc": 0.2917679487480102 + }, + { + "epoch": 4.698915274113164, + "grad_norm": 0.1833778505990386, + "learning_rate": 0.00045468681445341757, + "loss": 3.0323336124420166, + "step": 8016, + "token_acc": 0.2962058353889674 + }, + { + "epoch": 4.699501612430373, + "grad_norm": 0.1911092911292201, + "learning_rate": 0.00045467290161528594, + "loss": 3.057612657546997, + "step": 8017, + "token_acc": 0.29140716117737 + }, + { + "epoch": 4.700087950747581, + "grad_norm": 0.23715853464414993, + "learning_rate": 0.00045465898685452825, + "loss": 3.026954412460327, + "step": 8018, + "token_acc": 0.29868579232738246 + }, + { + "epoch": 4.70067428906479, + "grad_norm": 0.25408290664332184, + "learning_rate": 0.0004546450701712752, + "loss": 2.987659454345703, + "step": 8019, + "token_acc": 0.3015023240371846 + }, + { + "epoch": 4.701260627381999, + "grad_norm": 0.1938798581551351, + "learning_rate": 0.0004546311515656574, + "loss": 3.0442681312561035, + "step": 8020, + "token_acc": 0.29505844520578944 + }, + { + "epoch": 4.701846965699208, + "grad_norm": 0.2266523611047431, + "learning_rate": 0.00045461723103780574, + "loss": 3.0231189727783203, + "step": 8021, + "token_acc": 0.29540756857564693 + }, + { + "epoch": 4.7024333040164175, + "grad_norm": 0.27228014043134574, + "learning_rate": 0.0004546033085878509, + "loss": 3.065335273742676, + "step": 8022, + "token_acc": 0.29120917008794545 + }, + { + "epoch": 4.703019642333627, + "grad_norm": 0.18182965437059484, + "learning_rate": 0.00045458938421592365, + "loss": 3.0405826568603516, + "step": 8023, + "token_acc": 0.29229593009590377 + }, + { + "epoch": 4.703605980650836, + "grad_norm": 0.24000589736865774, + "learning_rate": 0.0004545754579221548, + "loss": 2.9833414554595947, + "step": 8024, + "token_acc": 0.3014126755818865 + }, + { + "epoch": 4.704192318968045, + "grad_norm": 0.2779801861967681, + "learning_rate": 0.00045456152970667525, + "loss": 3.055398464202881, + "step": 8025, + "token_acc": 0.2918309182151375 + }, + { + "epoch": 4.704778657285254, + "grad_norm": 0.18690144458034458, + "learning_rate": 0.00045454759956961577, + "loss": 3.090012788772583, + "step": 8026, + "token_acc": 0.28691562143671606 + }, + { + "epoch": 4.705364995602462, + "grad_norm": 0.24947655291435297, + "learning_rate": 0.00045453366751110717, + "loss": 3.0479936599731445, + "step": 8027, + "token_acc": 0.29308369985516297 + }, + { + "epoch": 4.705951333919671, + "grad_norm": 0.170181804790625, + "learning_rate": 0.0004545197335312804, + "loss": 3.0843465328216553, + "step": 8028, + "token_acc": 0.2863211620904557 + }, + { + "epoch": 4.70653767223688, + "grad_norm": 0.268625241087238, + "learning_rate": 0.0004545057976302664, + "loss": 3.07427978515625, + "step": 8029, + "token_acc": 0.2882187609179699 + }, + { + "epoch": 4.7071240105540895, + "grad_norm": 0.1875515664450732, + "learning_rate": 0.00045449185980819595, + "loss": 3.0231497287750244, + "step": 8030, + "token_acc": 0.2960921105176124 + }, + { + "epoch": 4.707710348871299, + "grad_norm": 0.25436079627153424, + "learning_rate": 0.00045447792006520003, + "loss": 3.0118207931518555, + "step": 8031, + "token_acc": 0.2972007257377717 + }, + { + "epoch": 4.708296687188508, + "grad_norm": 0.1882886205226228, + "learning_rate": 0.0004544639784014096, + "loss": 3.020613193511963, + "step": 8032, + "token_acc": 0.2973006173452586 + }, + { + "epoch": 4.708883025505717, + "grad_norm": 0.23872501615242225, + "learning_rate": 0.0004544500348169556, + "loss": 3.0313796997070312, + "step": 8033, + "token_acc": 0.2948937386014991 + }, + { + "epoch": 4.709469363822926, + "grad_norm": 0.18599198001046352, + "learning_rate": 0.000454436089311969, + "loss": 3.0512845516204834, + "step": 8034, + "token_acc": 0.29296380228096663 + }, + { + "epoch": 4.710055702140135, + "grad_norm": 0.22547387328639848, + "learning_rate": 0.0004544221418865809, + "loss": 3.032609462738037, + "step": 8035, + "token_acc": 0.2962689113866454 + }, + { + "epoch": 4.710642040457344, + "grad_norm": 0.17047763979688688, + "learning_rate": 0.0004544081925409221, + "loss": 3.025763511657715, + "step": 8036, + "token_acc": 0.2959154232956887 + }, + { + "epoch": 4.711228378774553, + "grad_norm": 0.21569515342633444, + "learning_rate": 0.0004543942412751239, + "loss": 3.12408447265625, + "step": 8037, + "token_acc": 0.28217110275118396 + }, + { + "epoch": 4.711814717091762, + "grad_norm": 0.2045754005974824, + "learning_rate": 0.0004543802880893172, + "loss": 3.060786247253418, + "step": 8038, + "token_acc": 0.2890259212381276 + }, + { + "epoch": 4.7124010554089715, + "grad_norm": 0.2336719882980407, + "learning_rate": 0.00045436633298363306, + "loss": 3.0467286109924316, + "step": 8039, + "token_acc": 0.2926053566490097 + }, + { + "epoch": 4.71298739372618, + "grad_norm": 0.20423093338516535, + "learning_rate": 0.00045435237595820266, + "loss": 3.0679893493652344, + "step": 8040, + "token_acc": 0.28922366008060757 + }, + { + "epoch": 4.713573732043389, + "grad_norm": 0.24072165882048077, + "learning_rate": 0.000454338417013157, + "loss": 3.126478672027588, + "step": 8041, + "token_acc": 0.2820371577237791 + }, + { + "epoch": 4.714160070360598, + "grad_norm": 0.16930982297968283, + "learning_rate": 0.00045432445614862737, + "loss": 2.9863779544830322, + "step": 8042, + "token_acc": 0.29998009335170334 + }, + { + "epoch": 4.714746408677807, + "grad_norm": 0.2538909878412488, + "learning_rate": 0.0004543104933647447, + "loss": 3.013861656188965, + "step": 8043, + "token_acc": 0.29724163913227203 + }, + { + "epoch": 4.715332746995016, + "grad_norm": 0.16732184573237052, + "learning_rate": 0.00045429652866164026, + "loss": 3.018298387527466, + "step": 8044, + "token_acc": 0.2970602479270293 + }, + { + "epoch": 4.715919085312225, + "grad_norm": 0.22296941846645618, + "learning_rate": 0.0004542825620394453, + "loss": 3.0446643829345703, + "step": 8045, + "token_acc": 0.2944485846732938 + }, + { + "epoch": 4.716505423629434, + "grad_norm": 0.1654179798220177, + "learning_rate": 0.00045426859349829085, + "loss": 3.099398612976074, + "step": 8046, + "token_acc": 0.2870337914077152 + }, + { + "epoch": 4.7170917619466435, + "grad_norm": 0.24734720906805324, + "learning_rate": 0.00045425462303830835, + "loss": 3.021082878112793, + "step": 8047, + "token_acc": 0.2953848363850146 + }, + { + "epoch": 4.717678100263853, + "grad_norm": 0.18730242600767572, + "learning_rate": 0.0004542406506596288, + "loss": 3.0749564170837402, + "step": 8048, + "token_acc": 0.2890789883176898 + }, + { + "epoch": 4.718264438581061, + "grad_norm": 0.2050954047392026, + "learning_rate": 0.0004542266763623836, + "loss": 3.036804437637329, + "step": 8049, + "token_acc": 0.2942367114910477 + }, + { + "epoch": 4.71885077689827, + "grad_norm": 0.16926769691804688, + "learning_rate": 0.0004542127001467039, + "loss": 3.0559167861938477, + "step": 8050, + "token_acc": 0.29131803059690947 + }, + { + "epoch": 4.719437115215479, + "grad_norm": 0.20232710448139002, + "learning_rate": 0.0004541987220127211, + "loss": 3.0608155727386475, + "step": 8051, + "token_acc": 0.29205413052070983 + }, + { + "epoch": 4.720023453532688, + "grad_norm": 0.1997354098802789, + "learning_rate": 0.00045418474196056645, + "loss": 3.0422816276550293, + "step": 8052, + "token_acc": 0.29329789753440516 + }, + { + "epoch": 4.720609791849897, + "grad_norm": 0.18972683442226776, + "learning_rate": 0.00045417075999037136, + "loss": 3.064742088317871, + "step": 8053, + "token_acc": 0.290101933739221 + }, + { + "epoch": 4.721196130167106, + "grad_norm": 0.17770215793419686, + "learning_rate": 0.0004541567761022671, + "loss": 3.043896198272705, + "step": 8054, + "token_acc": 0.2926433882451854 + }, + { + "epoch": 4.7217824684843155, + "grad_norm": 0.17708788766762143, + "learning_rate": 0.00045414279029638496, + "loss": 3.012117862701416, + "step": 8055, + "token_acc": 0.2992648970619519 + }, + { + "epoch": 4.722368806801525, + "grad_norm": 0.20225351446492323, + "learning_rate": 0.0004541288025728564, + "loss": 3.025546073913574, + "step": 8056, + "token_acc": 0.2972456649172472 + }, + { + "epoch": 4.722955145118734, + "grad_norm": 0.17179949720160054, + "learning_rate": 0.0004541148129318129, + "loss": 3.007215976715088, + "step": 8057, + "token_acc": 0.29916697228480116 + }, + { + "epoch": 4.723541483435943, + "grad_norm": 0.17614345092399, + "learning_rate": 0.00045410082137338574, + "loss": 3.034902811050415, + "step": 8058, + "token_acc": 0.2942907337571108 + }, + { + "epoch": 4.724127821753152, + "grad_norm": 0.19580686581221662, + "learning_rate": 0.0004540868278977063, + "loss": 3.0407934188842773, + "step": 8059, + "token_acc": 0.29171732354826585 + }, + { + "epoch": 4.724714160070361, + "grad_norm": 0.2228798767607216, + "learning_rate": 0.00045407283250490624, + "loss": 3.0703625679016113, + "step": 8060, + "token_acc": 0.29044849586932325 + }, + { + "epoch": 4.725300498387569, + "grad_norm": 0.1840369219953886, + "learning_rate": 0.0004540588351951168, + "loss": 3.016350507736206, + "step": 8061, + "token_acc": 0.29696571424028917 + }, + { + "epoch": 4.725886836704778, + "grad_norm": 0.182385789924304, + "learning_rate": 0.0004540448359684697, + "loss": 3.0543291568756104, + "step": 8062, + "token_acc": 0.2927285872905208 + }, + { + "epoch": 4.7264731750219875, + "grad_norm": 0.2201999128320826, + "learning_rate": 0.0004540308348250962, + "loss": 3.0787839889526367, + "step": 8063, + "token_acc": 0.2894356155591958 + }, + { + "epoch": 4.727059513339197, + "grad_norm": 0.17325986175214206, + "learning_rate": 0.00045401683176512803, + "loss": 3.042766571044922, + "step": 8064, + "token_acc": 0.2946817488481316 + }, + { + "epoch": 4.727645851656406, + "grad_norm": 0.2061233502749803, + "learning_rate": 0.00045400282678869665, + "loss": 3.054119348526001, + "step": 8065, + "token_acc": 0.29192980957617753 + }, + { + "epoch": 4.728232189973615, + "grad_norm": 0.2962575454414807, + "learning_rate": 0.00045398881989593354, + "loss": 3.067460536956787, + "step": 8066, + "token_acc": 0.2888792002903099 + }, + { + "epoch": 4.728818528290824, + "grad_norm": 0.265365653712062, + "learning_rate": 0.0004539748110869704, + "loss": 3.083008289337158, + "step": 8067, + "token_acc": 0.2890790054397359 + }, + { + "epoch": 4.729404866608033, + "grad_norm": 0.16617020537490407, + "learning_rate": 0.0004539608003619387, + "loss": 3.030125141143799, + "step": 8068, + "token_acc": 0.29540970053995874 + }, + { + "epoch": 4.729991204925242, + "grad_norm": 0.20107062891692165, + "learning_rate": 0.0004539467877209702, + "loss": 3.0496387481689453, + "step": 8069, + "token_acc": 0.29399727873801473 + }, + { + "epoch": 4.730577543242451, + "grad_norm": 0.18657548935829188, + "learning_rate": 0.0004539327731641964, + "loss": 3.071558952331543, + "step": 8070, + "token_acc": 0.29039538180119256 + }, + { + "epoch": 4.7311638815596595, + "grad_norm": 0.18021357722852507, + "learning_rate": 0.000453918756691749, + "loss": 3.0159921646118164, + "step": 8071, + "token_acc": 0.2958962893047701 + }, + { + "epoch": 4.731750219876869, + "grad_norm": 0.20441379033500795, + "learning_rate": 0.0004539047383037597, + "loss": 3.0333118438720703, + "step": 8072, + "token_acc": 0.29511471322996363 + }, + { + "epoch": 4.732336558194078, + "grad_norm": 0.20617762251715366, + "learning_rate": 0.00045389071800036016, + "loss": 3.039820671081543, + "step": 8073, + "token_acc": 0.29275729951116397 + }, + { + "epoch": 4.732922896511287, + "grad_norm": 0.17431825120320568, + "learning_rate": 0.00045387669578168203, + "loss": 3.0579569339752197, + "step": 8074, + "token_acc": 0.2910923535518236 + }, + { + "epoch": 4.733509234828496, + "grad_norm": 0.21101568332120535, + "learning_rate": 0.0004538626716478571, + "loss": 3.083423614501953, + "step": 8075, + "token_acc": 0.2892961347003274 + }, + { + "epoch": 4.734095573145705, + "grad_norm": 0.21652294443018716, + "learning_rate": 0.0004538486455990171, + "loss": 3.033292293548584, + "step": 8076, + "token_acc": 0.2943113025026524 + }, + { + "epoch": 4.734681911462914, + "grad_norm": 0.20568841019636736, + "learning_rate": 0.0004538346176352937, + "loss": 3.0142390727996826, + "step": 8077, + "token_acc": 0.29823160477635935 + }, + { + "epoch": 4.735268249780123, + "grad_norm": 0.1738444287100953, + "learning_rate": 0.0004538205877568187, + "loss": 3.0962371826171875, + "step": 8078, + "token_acc": 0.2856865604313754 + }, + { + "epoch": 4.735854588097332, + "grad_norm": 0.17370104473910622, + "learning_rate": 0.000453806555963724, + "loss": 3.018662929534912, + "step": 8079, + "token_acc": 0.2981563274884874 + }, + { + "epoch": 4.7364409264145415, + "grad_norm": 0.20344592926770422, + "learning_rate": 0.00045379252225614134, + "loss": 3.0663442611694336, + "step": 8080, + "token_acc": 0.2899164132122424 + }, + { + "epoch": 4.737027264731751, + "grad_norm": 0.19208033314034037, + "learning_rate": 0.0004537784866342025, + "loss": 3.0330026149749756, + "step": 8081, + "token_acc": 0.29588172381251965 + }, + { + "epoch": 4.73761360304896, + "grad_norm": 0.1630891015780073, + "learning_rate": 0.00045376444909803947, + "loss": 3.041818857192993, + "step": 8082, + "token_acc": 0.29393436611266544 + }, + { + "epoch": 4.738199941366168, + "grad_norm": 0.1639706936578414, + "learning_rate": 0.0004537504096477839, + "loss": 3.0361692905426025, + "step": 8083, + "token_acc": 0.2946450738610502 + }, + { + "epoch": 4.738786279683377, + "grad_norm": 0.16488294429107092, + "learning_rate": 0.0004537363682835679, + "loss": 3.023923873901367, + "step": 8084, + "token_acc": 0.29831159351296455 + }, + { + "epoch": 4.739372618000586, + "grad_norm": 0.18713907803508287, + "learning_rate": 0.0004537223250055232, + "loss": 3.045753002166748, + "step": 8085, + "token_acc": 0.29341514048093775 + }, + { + "epoch": 4.739958956317795, + "grad_norm": 0.17128204876607012, + "learning_rate": 0.0004537082798137818, + "loss": 3.054975986480713, + "step": 8086, + "token_acc": 0.29001805285534216 + }, + { + "epoch": 4.740545294635004, + "grad_norm": 0.1914184695653119, + "learning_rate": 0.0004536942327084755, + "loss": 3.0377347469329834, + "step": 8087, + "token_acc": 0.2940427542285795 + }, + { + "epoch": 4.7411316329522135, + "grad_norm": 0.2163891735271536, + "learning_rate": 0.0004536801836897365, + "loss": 3.063438653945923, + "step": 8088, + "token_acc": 0.289966543298252 + }, + { + "epoch": 4.741717971269423, + "grad_norm": 0.1882115107797921, + "learning_rate": 0.00045366613275769663, + "loss": 3.0624465942382812, + "step": 8089, + "token_acc": 0.2902184467915058 + }, + { + "epoch": 4.742304309586632, + "grad_norm": 0.18990157234270733, + "learning_rate": 0.0004536520799124878, + "loss": 3.0651159286499023, + "step": 8090, + "token_acc": 0.29020775612231076 + }, + { + "epoch": 4.742890647903841, + "grad_norm": 0.21256366216363703, + "learning_rate": 0.0004536380251542422, + "loss": 3.044762134552002, + "step": 8091, + "token_acc": 0.2933580380765901 + }, + { + "epoch": 4.743476986221049, + "grad_norm": 0.15986233911456202, + "learning_rate": 0.00045362396848309174, + "loss": 3.041992664337158, + "step": 8092, + "token_acc": 0.29472166701431335 + }, + { + "epoch": 4.744063324538258, + "grad_norm": 0.20744147783222688, + "learning_rate": 0.0004536099098991684, + "loss": 3.0738940238952637, + "step": 8093, + "token_acc": 0.28922035530821916 + }, + { + "epoch": 4.744649662855467, + "grad_norm": 0.3042606379603551, + "learning_rate": 0.0004535958494026044, + "loss": 3.033778190612793, + "step": 8094, + "token_acc": 0.2963875947335409 + }, + { + "epoch": 4.745236001172676, + "grad_norm": 0.18330114323403998, + "learning_rate": 0.0004535817869935317, + "loss": 2.9897680282592773, + "step": 8095, + "token_acc": 0.3026168808135995 + }, + { + "epoch": 4.7458223394898855, + "grad_norm": 0.22140905450212867, + "learning_rate": 0.0004535677226720825, + "loss": 3.077850818634033, + "step": 8096, + "token_acc": 0.2886442440166147 + }, + { + "epoch": 4.746408677807095, + "grad_norm": 0.2612011598179863, + "learning_rate": 0.0004535536564383888, + "loss": 3.0762956142425537, + "step": 8097, + "token_acc": 0.2892958083085683 + }, + { + "epoch": 4.746995016124304, + "grad_norm": 0.1725996598608016, + "learning_rate": 0.0004535395882925828, + "loss": 3.040656566619873, + "step": 8098, + "token_acc": 0.29342643567856963 + }, + { + "epoch": 4.747581354441513, + "grad_norm": 0.25189283725576816, + "learning_rate": 0.0004535255182347967, + "loss": 3.068525791168213, + "step": 8099, + "token_acc": 0.29079293165228237 + }, + { + "epoch": 4.748167692758722, + "grad_norm": 0.20699869487012704, + "learning_rate": 0.00045351144626516255, + "loss": 3.0454187393188477, + "step": 8100, + "token_acc": 0.29314732370069163 + }, + { + "epoch": 4.748754031075931, + "grad_norm": 0.1940087213995694, + "learning_rate": 0.0004534973723838126, + "loss": 3.0376486778259277, + "step": 8101, + "token_acc": 0.29510017925856163 + }, + { + "epoch": 4.74934036939314, + "grad_norm": 0.261808181763256, + "learning_rate": 0.00045348329659087905, + "loss": 3.0620203018188477, + "step": 8102, + "token_acc": 0.28954506744609043 + }, + { + "epoch": 4.749926707710349, + "grad_norm": 0.16918116494961782, + "learning_rate": 0.0004534692188864942, + "loss": 3.0556859970092773, + "step": 8103, + "token_acc": 0.2895942246106545 + }, + { + "epoch": 4.7505130460275575, + "grad_norm": 0.23963193894105225, + "learning_rate": 0.00045345513927079006, + "loss": 3.0365610122680664, + "step": 8104, + "token_acc": 0.2955468913585679 + }, + { + "epoch": 4.751099384344767, + "grad_norm": 0.18925023481667588, + "learning_rate": 0.0004534410577438992, + "loss": 3.0576319694519043, + "step": 8105, + "token_acc": 0.29056269195953355 + }, + { + "epoch": 4.751685722661976, + "grad_norm": 0.22301856758163083, + "learning_rate": 0.0004534269743059537, + "loss": 3.036783218383789, + "step": 8106, + "token_acc": 0.2944473406421259 + }, + { + "epoch": 4.752272060979185, + "grad_norm": 0.182574319055091, + "learning_rate": 0.0004534128889570859, + "loss": 3.048043727874756, + "step": 8107, + "token_acc": 0.2928452107930576 + }, + { + "epoch": 4.752858399296394, + "grad_norm": 0.22520703911979323, + "learning_rate": 0.0004533988016974281, + "loss": 3.028371810913086, + "step": 8108, + "token_acc": 0.29737944415007894 + }, + { + "epoch": 4.753444737613603, + "grad_norm": 0.2028047805188071, + "learning_rate": 0.00045338471252711254, + "loss": 3.048466682434082, + "step": 8109, + "token_acc": 0.2934945034275205 + }, + { + "epoch": 4.754031075930812, + "grad_norm": 0.21838439149603234, + "learning_rate": 0.0004533706214462718, + "loss": 3.054708480834961, + "step": 8110, + "token_acc": 0.29312563649168316 + }, + { + "epoch": 4.754617414248021, + "grad_norm": 0.21685423156794087, + "learning_rate": 0.00045335652845503806, + "loss": 3.051158905029297, + "step": 8111, + "token_acc": 0.29283435621705556 + }, + { + "epoch": 4.75520375256523, + "grad_norm": 0.20278054603934012, + "learning_rate": 0.00045334243355354384, + "loss": 3.0235133171081543, + "step": 8112, + "token_acc": 0.29648049470429577 + }, + { + "epoch": 4.7557900908824395, + "grad_norm": 0.1623626028787463, + "learning_rate": 0.00045332833674192137, + "loss": 3.0675442218780518, + "step": 8113, + "token_acc": 0.29116748702390904 + }, + { + "epoch": 4.756376429199648, + "grad_norm": 0.19889567768394695, + "learning_rate": 0.00045331423802030325, + "loss": 3.0920228958129883, + "step": 8114, + "token_acc": 0.2871832322306208 + }, + { + "epoch": 4.756962767516857, + "grad_norm": 0.19585973363158735, + "learning_rate": 0.0004533001373888217, + "loss": 3.024867534637451, + "step": 8115, + "token_acc": 0.29599368597147674 + }, + { + "epoch": 4.757549105834066, + "grad_norm": 0.24806897942540174, + "learning_rate": 0.00045328603484760945, + "loss": 3.022796869277954, + "step": 8116, + "token_acc": 0.29643810308585766 + }, + { + "epoch": 4.758135444151275, + "grad_norm": 0.22466475504730887, + "learning_rate": 0.00045327193039679877, + "loss": 3.0358481407165527, + "step": 8117, + "token_acc": 0.29646413815022776 + }, + { + "epoch": 4.758721782468484, + "grad_norm": 0.1693903426671957, + "learning_rate": 0.0004532578240365222, + "loss": 3.0672290325164795, + "step": 8118, + "token_acc": 0.2913440565657401 + }, + { + "epoch": 4.759308120785693, + "grad_norm": 0.2255760642157718, + "learning_rate": 0.0004532437157669123, + "loss": 3.0258865356445312, + "step": 8119, + "token_acc": 0.2964518186676649 + }, + { + "epoch": 4.759894459102902, + "grad_norm": 0.18568045481514533, + "learning_rate": 0.00045322960558810156, + "loss": 3.054309129714966, + "step": 8120, + "token_acc": 0.2926073344555795 + }, + { + "epoch": 4.7604807974201115, + "grad_norm": 0.1533671248873215, + "learning_rate": 0.0004532154935002225, + "loss": 3.072385311126709, + "step": 8121, + "token_acc": 0.2886014066282661 + }, + { + "epoch": 4.761067135737321, + "grad_norm": 0.18205365805874582, + "learning_rate": 0.0004532013795034078, + "loss": 3.0103516578674316, + "step": 8122, + "token_acc": 0.29778995522626006 + }, + { + "epoch": 4.76165347405453, + "grad_norm": 0.20593692336336708, + "learning_rate": 0.00045318726359778983, + "loss": 3.027998924255371, + "step": 8123, + "token_acc": 0.2978503300570159 + }, + { + "epoch": 4.762239812371739, + "grad_norm": 0.16898501082119435, + "learning_rate": 0.00045317314578350145, + "loss": 3.071122169494629, + "step": 8124, + "token_acc": 0.2890321379535035 + }, + { + "epoch": 4.762826150688948, + "grad_norm": 0.26975581748527555, + "learning_rate": 0.00045315902606067506, + "loss": 3.0170068740844727, + "step": 8125, + "token_acc": 0.29790003777594787 + }, + { + "epoch": 4.763412489006156, + "grad_norm": 0.18523025922802763, + "learning_rate": 0.0004531449044294434, + "loss": 3.067138195037842, + "step": 8126, + "token_acc": 0.2904073621079136 + }, + { + "epoch": 4.763998827323365, + "grad_norm": 0.26183581725698735, + "learning_rate": 0.0004531307808899391, + "loss": 3.0312914848327637, + "step": 8127, + "token_acc": 0.2961015665274432 + }, + { + "epoch": 4.764585165640574, + "grad_norm": 0.24413504086240792, + "learning_rate": 0.00045311665544229483, + "loss": 3.0271964073181152, + "step": 8128, + "token_acc": 0.2956888521952665 + }, + { + "epoch": 4.7651715039577835, + "grad_norm": 0.20036577418287554, + "learning_rate": 0.0004531025280866433, + "loss": 3.060777187347412, + "step": 8129, + "token_acc": 0.29321908655789536 + }, + { + "epoch": 4.765757842274993, + "grad_norm": 0.29147273222544895, + "learning_rate": 0.0004530883988231172, + "loss": 3.001120090484619, + "step": 8130, + "token_acc": 0.3002125899600417 + }, + { + "epoch": 4.766344180592202, + "grad_norm": 0.1809316619435025, + "learning_rate": 0.00045307426765184923, + "loss": 2.9764904975891113, + "step": 8131, + "token_acc": 0.3048796502497825 + }, + { + "epoch": 4.766930518909411, + "grad_norm": 0.24624316595138132, + "learning_rate": 0.0004530601345729722, + "loss": 3.014728307723999, + "step": 8132, + "token_acc": 0.2974370256297437 + }, + { + "epoch": 4.76751685722662, + "grad_norm": 0.2101937227298329, + "learning_rate": 0.00045304599958661884, + "loss": 3.0165584087371826, + "step": 8133, + "token_acc": 0.2975139579018655 + }, + { + "epoch": 4.768103195543829, + "grad_norm": 0.22798483227277452, + "learning_rate": 0.0004530318626929219, + "loss": 3.0670084953308105, + "step": 8134, + "token_acc": 0.2887177210893235 + }, + { + "epoch": 4.768689533861037, + "grad_norm": 0.2102260759242806, + "learning_rate": 0.00045301772389201426, + "loss": 3.062410593032837, + "step": 8135, + "token_acc": 0.29182172266765033 + }, + { + "epoch": 4.7692758721782464, + "grad_norm": 0.19254990672662414, + "learning_rate": 0.0004530035831840286, + "loss": 3.0415048599243164, + "step": 8136, + "token_acc": 0.2938247142478973 + }, + { + "epoch": 4.769862210495456, + "grad_norm": 0.21991333772621247, + "learning_rate": 0.00045298944056909775, + "loss": 3.1133790016174316, + "step": 8137, + "token_acc": 0.28484917526013415 + }, + { + "epoch": 4.770448548812665, + "grad_norm": 0.1867215356631029, + "learning_rate": 0.00045297529604735475, + "loss": 3.0927886962890625, + "step": 8138, + "token_acc": 0.2852247736920166 + }, + { + "epoch": 4.771034887129874, + "grad_norm": 0.21214681675154018, + "learning_rate": 0.0004529611496189323, + "loss": 3.014636993408203, + "step": 8139, + "token_acc": 0.299808054506345 + }, + { + "epoch": 4.771621225447083, + "grad_norm": 0.1975757320931099, + "learning_rate": 0.0004529470012839634, + "loss": 3.045872211456299, + "step": 8140, + "token_acc": 0.2944522074096454 + }, + { + "epoch": 4.772207563764292, + "grad_norm": 0.19785471692496498, + "learning_rate": 0.00045293285104258094, + "loss": 3.0008597373962402, + "step": 8141, + "token_acc": 0.29874825357226786 + }, + { + "epoch": 4.772793902081501, + "grad_norm": 0.18860934253743214, + "learning_rate": 0.00045291869889491765, + "loss": 3.0473556518554688, + "step": 8142, + "token_acc": 0.29156361566720135 + }, + { + "epoch": 4.77338024039871, + "grad_norm": 0.19997628761448896, + "learning_rate": 0.00045290454484110676, + "loss": 3.0097289085388184, + "step": 8143, + "token_acc": 0.2989769644897903 + }, + { + "epoch": 4.773966578715919, + "grad_norm": 0.17370910403847412, + "learning_rate": 0.00045289038888128103, + "loss": 3.0415279865264893, + "step": 8144, + "token_acc": 0.29445170513091407 + }, + { + "epoch": 4.7745529170331285, + "grad_norm": 0.2010236338999224, + "learning_rate": 0.00045287623101557354, + "loss": 3.0398483276367188, + "step": 8145, + "token_acc": 0.29441102971960237 + }, + { + "epoch": 4.775139255350338, + "grad_norm": 0.17060059987578335, + "learning_rate": 0.00045286207124411716, + "loss": 3.04191255569458, + "step": 8146, + "token_acc": 0.292796216751181 + }, + { + "epoch": 4.775725593667546, + "grad_norm": 0.20614815249002116, + "learning_rate": 0.00045284790956704504, + "loss": 3.0324440002441406, + "step": 8147, + "token_acc": 0.297024289242299 + }, + { + "epoch": 4.776311931984755, + "grad_norm": 0.2024526324918473, + "learning_rate": 0.00045283374598449014, + "loss": 3.0417675971984863, + "step": 8148, + "token_acc": 0.2946282303313265 + }, + { + "epoch": 4.776898270301964, + "grad_norm": 0.19714847562096544, + "learning_rate": 0.00045281958049658545, + "loss": 3.061729907989502, + "step": 8149, + "token_acc": 0.28997016224280475 + }, + { + "epoch": 4.777484608619173, + "grad_norm": 0.19341047965954822, + "learning_rate": 0.00045280541310346417, + "loss": 3.0443923473358154, + "step": 8150, + "token_acc": 0.29369112851159634 + }, + { + "epoch": 4.778070946936382, + "grad_norm": 0.16219366507663802, + "learning_rate": 0.0004527912438052593, + "loss": 3.0006027221679688, + "step": 8151, + "token_acc": 0.29967843039405645 + }, + { + "epoch": 4.778657285253591, + "grad_norm": 0.18374607898979095, + "learning_rate": 0.000452777072602104, + "loss": 3.0871877670288086, + "step": 8152, + "token_acc": 0.2887636686242276 + }, + { + "epoch": 4.7792436235708005, + "grad_norm": 0.1794223544060227, + "learning_rate": 0.0004527628994941313, + "loss": 2.9948267936706543, + "step": 8153, + "token_acc": 0.2999787293826006 + }, + { + "epoch": 4.77982996188801, + "grad_norm": 0.17348813035371666, + "learning_rate": 0.00045274872448147443, + "loss": 3.055471897125244, + "step": 8154, + "token_acc": 0.2917260285516402 + }, + { + "epoch": 4.780416300205219, + "grad_norm": 0.22187357998242838, + "learning_rate": 0.0004527345475642665, + "loss": 3.077023983001709, + "step": 8155, + "token_acc": 0.28943039249276187 + }, + { + "epoch": 4.781002638522428, + "grad_norm": 0.1798485718682848, + "learning_rate": 0.00045272036874264063, + "loss": 3.043170213699341, + "step": 8156, + "token_acc": 0.2944234564714655 + }, + { + "epoch": 4.781588976839636, + "grad_norm": 0.18884024698368765, + "learning_rate": 0.00045270618801673005, + "loss": 3.071423053741455, + "step": 8157, + "token_acc": 0.28846863371388215 + }, + { + "epoch": 4.782175315156845, + "grad_norm": 0.17262680775682385, + "learning_rate": 0.00045269200538666804, + "loss": 3.0400078296661377, + "step": 8158, + "token_acc": 0.2937985813849268 + }, + { + "epoch": 4.782761653474054, + "grad_norm": 0.19535359249509412, + "learning_rate": 0.00045267782085258774, + "loss": 3.061572551727295, + "step": 8159, + "token_acc": 0.291996396044333 + }, + { + "epoch": 4.783347991791263, + "grad_norm": 0.20809406727162685, + "learning_rate": 0.00045266363441462247, + "loss": 3.092970371246338, + "step": 8160, + "token_acc": 0.2865207239058092 + }, + { + "epoch": 4.7839343301084725, + "grad_norm": 0.22206959342920374, + "learning_rate": 0.00045264944607290535, + "loss": 3.0564486980438232, + "step": 8161, + "token_acc": 0.2924214652415752 + }, + { + "epoch": 4.784520668425682, + "grad_norm": 0.22461891425871752, + "learning_rate": 0.00045263525582756985, + "loss": 3.0112791061401367, + "step": 8162, + "token_acc": 0.2982998579178507 + }, + { + "epoch": 4.785107006742891, + "grad_norm": 0.18417368350067845, + "learning_rate": 0.0004526210636787492, + "loss": 3.0508201122283936, + "step": 8163, + "token_acc": 0.29390767822064173 + }, + { + "epoch": 4.7856933450601, + "grad_norm": 0.18823157095266424, + "learning_rate": 0.0004526068696265766, + "loss": 3.050769329071045, + "step": 8164, + "token_acc": 0.28980519530776355 + }, + { + "epoch": 4.786279683377309, + "grad_norm": 0.2190995218699739, + "learning_rate": 0.0004525926736711855, + "loss": 3.0141258239746094, + "step": 8165, + "token_acc": 0.2951894153097384 + }, + { + "epoch": 4.786866021694518, + "grad_norm": 0.15959001488573019, + "learning_rate": 0.0004525784758127093, + "loss": 3.0545763969421387, + "step": 8166, + "token_acc": 0.29337849360994445 + }, + { + "epoch": 4.787452360011727, + "grad_norm": 0.18967654705886594, + "learning_rate": 0.00045256427605128125, + "loss": 3.002711296081543, + "step": 8167, + "token_acc": 0.30008689075900685 + }, + { + "epoch": 4.788038698328936, + "grad_norm": 0.18138179995080322, + "learning_rate": 0.00045255007438703475, + "loss": 3.016913414001465, + "step": 8168, + "token_acc": 0.2981850876629632 + }, + { + "epoch": 4.7886250366461445, + "grad_norm": 0.16774213377523908, + "learning_rate": 0.00045253587082010326, + "loss": 3.0596976280212402, + "step": 8169, + "token_acc": 0.29020961351875807 + }, + { + "epoch": 4.789211374963354, + "grad_norm": 0.2369984921873028, + "learning_rate": 0.00045252166535062025, + "loss": 2.9944701194763184, + "step": 8170, + "token_acc": 0.30022385158773773 + }, + { + "epoch": 4.789797713280563, + "grad_norm": 0.2788138952444477, + "learning_rate": 0.00045250745797871896, + "loss": 3.0271573066711426, + "step": 8171, + "token_acc": 0.29560932937105217 + }, + { + "epoch": 4.790384051597772, + "grad_norm": 0.18487157099135848, + "learning_rate": 0.00045249324870453314, + "loss": 2.9938201904296875, + "step": 8172, + "token_acc": 0.29896432976973686 + }, + { + "epoch": 4.790970389914981, + "grad_norm": 0.2744353114017672, + "learning_rate": 0.000452479037528196, + "loss": 3.0438570976257324, + "step": 8173, + "token_acc": 0.2941330925517578 + }, + { + "epoch": 4.79155672823219, + "grad_norm": 0.18442693313922134, + "learning_rate": 0.00045246482444984116, + "loss": 3.039293050765991, + "step": 8174, + "token_acc": 0.2922290914434461 + }, + { + "epoch": 4.792143066549399, + "grad_norm": 0.27110192332513094, + "learning_rate": 0.00045245060946960214, + "loss": 3.028707504272461, + "step": 8175, + "token_acc": 0.29495353375092176 + }, + { + "epoch": 4.792729404866608, + "grad_norm": 0.26498931255971836, + "learning_rate": 0.0004524363925876125, + "loss": 3.061530590057373, + "step": 8176, + "token_acc": 0.29229383563955014 + }, + { + "epoch": 4.793315743183817, + "grad_norm": 0.19185896721747667, + "learning_rate": 0.00045242217380400565, + "loss": 3.0260987281799316, + "step": 8177, + "token_acc": 0.29495470831948367 + }, + { + "epoch": 4.7939020815010265, + "grad_norm": 0.21726649549631039, + "learning_rate": 0.00045240795311891533, + "loss": 3.035027265548706, + "step": 8178, + "token_acc": 0.2940701044514113 + }, + { + "epoch": 4.794488419818235, + "grad_norm": 0.17469648816618108, + "learning_rate": 0.00045239373053247494, + "loss": 3.041036367416382, + "step": 8179, + "token_acc": 0.2952447544866687 + }, + { + "epoch": 4.795074758135444, + "grad_norm": 0.2127138688452311, + "learning_rate": 0.00045237950604481823, + "loss": 3.0151727199554443, + "step": 8180, + "token_acc": 0.2955632617675557 + }, + { + "epoch": 4.795661096452653, + "grad_norm": 0.17420725086065458, + "learning_rate": 0.00045236527965607877, + "loss": 3.1032662391662598, + "step": 8181, + "token_acc": 0.2856022240051512 + }, + { + "epoch": 4.796247434769862, + "grad_norm": 0.20271027198464547, + "learning_rate": 0.00045235105136639023, + "loss": 3.096384286880493, + "step": 8182, + "token_acc": 0.28719787774976274 + }, + { + "epoch": 4.796833773087071, + "grad_norm": 0.19099116246628006, + "learning_rate": 0.0004523368211758862, + "loss": 3.09938383102417, + "step": 8183, + "token_acc": 0.28452029854242306 + }, + { + "epoch": 4.79742011140428, + "grad_norm": 0.21006969439211812, + "learning_rate": 0.00045232258908470036, + "loss": 3.039801597595215, + "step": 8184, + "token_acc": 0.2937149077575341 + }, + { + "epoch": 4.798006449721489, + "grad_norm": 0.20888067473454164, + "learning_rate": 0.00045230835509296654, + "loss": 3.06699275970459, + "step": 8185, + "token_acc": 0.29119142094635925 + }, + { + "epoch": 4.7985927880386985, + "grad_norm": 0.21996725334814787, + "learning_rate": 0.0004522941192008182, + "loss": 3.061077356338501, + "step": 8186, + "token_acc": 0.29046982128196347 + }, + { + "epoch": 4.799179126355908, + "grad_norm": 0.17691418287945837, + "learning_rate": 0.0004522798814083893, + "loss": 3.0206832885742188, + "step": 8187, + "token_acc": 0.29874474556937963 + }, + { + "epoch": 4.799765464673117, + "grad_norm": 0.21226660149099696, + "learning_rate": 0.0004522656417158134, + "loss": 3.036076068878174, + "step": 8188, + "token_acc": 0.29257932699185596 + }, + { + "epoch": 4.800351802990326, + "grad_norm": 0.21309178625271893, + "learning_rate": 0.0004522514001232244, + "loss": 3.027238368988037, + "step": 8189, + "token_acc": 0.296584487673151 + }, + { + "epoch": 4.800938141307535, + "grad_norm": 0.18131793264879323, + "learning_rate": 0.0004522371566307561, + "loss": 2.99778151512146, + "step": 8190, + "token_acc": 0.2997679873867176 + }, + { + "epoch": 4.801524479624743, + "grad_norm": 0.1901875517328491, + "learning_rate": 0.00045222291123854215, + "loss": 2.9859070777893066, + "step": 8191, + "token_acc": 0.30013573720413744 + }, + { + "epoch": 4.802110817941952, + "grad_norm": 0.1819594070891484, + "learning_rate": 0.00045220866394671646, + "loss": 3.0240635871887207, + "step": 8192, + "token_acc": 0.296548807394612 + }, + { + "epoch": 4.802697156259161, + "grad_norm": 0.2343654360576861, + "learning_rate": 0.0004521944147554129, + "loss": 3.055288314819336, + "step": 8193, + "token_acc": 0.2927331309443154 + }, + { + "epoch": 4.8032834945763705, + "grad_norm": 0.20147362533730825, + "learning_rate": 0.0004521801636647652, + "loss": 3.0612642765045166, + "step": 8194, + "token_acc": 0.2904154286584595 + }, + { + "epoch": 4.80386983289358, + "grad_norm": 0.19282735560067807, + "learning_rate": 0.0004521659106749073, + "loss": 3.074296712875366, + "step": 8195, + "token_acc": 0.28973154136786927 + }, + { + "epoch": 4.804456171210789, + "grad_norm": 0.17765576316857848, + "learning_rate": 0.0004521516557859732, + "loss": 3.0530242919921875, + "step": 8196, + "token_acc": 0.29366405731145684 + }, + { + "epoch": 4.805042509527998, + "grad_norm": 0.18331584291285877, + "learning_rate": 0.0004521373989980966, + "loss": 3.002938747406006, + "step": 8197, + "token_acc": 0.29919208429089356 + }, + { + "epoch": 4.805628847845207, + "grad_norm": 0.2152110024918458, + "learning_rate": 0.00045212314031141155, + "loss": 3.0662336349487305, + "step": 8198, + "token_acc": 0.28877844414166015 + }, + { + "epoch": 4.806215186162416, + "grad_norm": 0.1759259362179971, + "learning_rate": 0.000452108879726052, + "loss": 3.037829637527466, + "step": 8199, + "token_acc": 0.2949736690473957 + }, + { + "epoch": 4.806801524479624, + "grad_norm": 0.18484937935006487, + "learning_rate": 0.0004520946172421518, + "loss": 3.087880849838257, + "step": 8200, + "token_acc": 0.2887245850842565 + }, + { + "epoch": 4.807387862796833, + "grad_norm": 0.20921574199576717, + "learning_rate": 0.000452080352859845, + "loss": 3.0339581966400146, + "step": 8201, + "token_acc": 0.2952201231450813 + }, + { + "epoch": 4.8079742011140425, + "grad_norm": 0.172132749850909, + "learning_rate": 0.0004520660865792656, + "loss": 3.0557827949523926, + "step": 8202, + "token_acc": 0.2917493454503807 + }, + { + "epoch": 4.808560539431252, + "grad_norm": 0.2081584948715618, + "learning_rate": 0.00045205181840054766, + "loss": 3.0303049087524414, + "step": 8203, + "token_acc": 0.29475954662797393 + }, + { + "epoch": 4.809146877748461, + "grad_norm": 0.20778819285555394, + "learning_rate": 0.0004520375483238251, + "loss": 3.0470879077911377, + "step": 8204, + "token_acc": 0.2916244336789122 + }, + { + "epoch": 4.80973321606567, + "grad_norm": 0.1669762076257186, + "learning_rate": 0.00045202327634923204, + "loss": 3.0396885871887207, + "step": 8205, + "token_acc": 0.2959067814184985 + }, + { + "epoch": 4.810319554382879, + "grad_norm": 0.18624737149698814, + "learning_rate": 0.0004520090024769025, + "loss": 3.0731754302978516, + "step": 8206, + "token_acc": 0.2907396876997167 + }, + { + "epoch": 4.810905892700088, + "grad_norm": 0.18949841536672962, + "learning_rate": 0.00045199472670697064, + "loss": 3.0350866317749023, + "step": 8207, + "token_acc": 0.29409631936533875 + }, + { + "epoch": 4.811492231017297, + "grad_norm": 0.1847181395678015, + "learning_rate": 0.0004519804490395705, + "loss": 3.0365662574768066, + "step": 8208, + "token_acc": 0.29595431090420327 + }, + { + "epoch": 4.812078569334506, + "grad_norm": 0.20350569033264362, + "learning_rate": 0.00045196616947483617, + "loss": 3.030475378036499, + "step": 8209, + "token_acc": 0.29753115177148304 + }, + { + "epoch": 4.812664907651715, + "grad_norm": 0.17775173772891112, + "learning_rate": 0.0004519518880129019, + "loss": 3.04473876953125, + "step": 8210, + "token_acc": 0.2923811557549764 + }, + { + "epoch": 4.8132512459689245, + "grad_norm": 0.22266242487145857, + "learning_rate": 0.00045193760465390173, + "loss": 3.0540578365325928, + "step": 8211, + "token_acc": 0.2926404643951535 + }, + { + "epoch": 4.813837584286133, + "grad_norm": 0.1816731747013646, + "learning_rate": 0.0004519233193979699, + "loss": 3.0376217365264893, + "step": 8212, + "token_acc": 0.2953960178380755 + }, + { + "epoch": 4.814423922603342, + "grad_norm": 0.18542101540997544, + "learning_rate": 0.0004519090322452405, + "loss": 3.030982255935669, + "step": 8213, + "token_acc": 0.29635619332916263 + }, + { + "epoch": 4.815010260920551, + "grad_norm": 0.24811290343673947, + "learning_rate": 0.00045189474319584796, + "loss": 3.081815719604492, + "step": 8214, + "token_acc": 0.28862605648319933 + }, + { + "epoch": 4.81559659923776, + "grad_norm": 0.2081511746404867, + "learning_rate": 0.00045188045224992625, + "loss": 3.0511112213134766, + "step": 8215, + "token_acc": 0.29156705513014475 + }, + { + "epoch": 4.816182937554969, + "grad_norm": 0.1963137924701037, + "learning_rate": 0.00045186615940760977, + "loss": 3.0530755519866943, + "step": 8216, + "token_acc": 0.2918859626267574 + }, + { + "epoch": 4.816769275872178, + "grad_norm": 0.19799720595699524, + "learning_rate": 0.00045185186466903274, + "loss": 3.0408077239990234, + "step": 8217, + "token_acc": 0.29368820130730666 + }, + { + "epoch": 4.817355614189387, + "grad_norm": 0.23603834475105998, + "learning_rate": 0.0004518375680343294, + "loss": 3.0679879188537598, + "step": 8218, + "token_acc": 0.2906354215566295 + }, + { + "epoch": 4.8179419525065965, + "grad_norm": 0.28441550401993465, + "learning_rate": 0.00045182326950363416, + "loss": 3.0513529777526855, + "step": 8219, + "token_acc": 0.2924095771777891 + }, + { + "epoch": 4.818528290823806, + "grad_norm": 0.1913758323227508, + "learning_rate": 0.00045180896907708127, + "loss": 3.0455126762390137, + "step": 8220, + "token_acc": 0.2927460269462084 + }, + { + "epoch": 4.819114629141015, + "grad_norm": 0.21072786446966812, + "learning_rate": 0.00045179466675480495, + "loss": 3.0480964183807373, + "step": 8221, + "token_acc": 0.29236188975812266 + }, + { + "epoch": 4.819700967458223, + "grad_norm": 0.24379155056108015, + "learning_rate": 0.00045178036253693975, + "loss": 3.042239189147949, + "step": 8222, + "token_acc": 0.2947335000611567 + }, + { + "epoch": 4.820287305775432, + "grad_norm": 0.19739827143635144, + "learning_rate": 0.0004517660564236199, + "loss": 3.0239672660827637, + "step": 8223, + "token_acc": 0.2958702041857692 + }, + { + "epoch": 4.820873644092641, + "grad_norm": 0.2555757429696557, + "learning_rate": 0.0004517517484149799, + "loss": 3.031019687652588, + "step": 8224, + "token_acc": 0.2956024582303159 + }, + { + "epoch": 4.82145998240985, + "grad_norm": 0.22245925970434688, + "learning_rate": 0.000451737438511154, + "loss": 3.0054047107696533, + "step": 8225, + "token_acc": 0.29983288814032133 + }, + { + "epoch": 4.822046320727059, + "grad_norm": 0.2032970500041507, + "learning_rate": 0.00045172312671227675, + "loss": 3.075723886489868, + "step": 8226, + "token_acc": 0.28908207181551504 + }, + { + "epoch": 4.8226326590442685, + "grad_norm": 0.21796064979669263, + "learning_rate": 0.0004517088130184826, + "loss": 3.0939009189605713, + "step": 8227, + "token_acc": 0.28656539883891496 + }, + { + "epoch": 4.823218997361478, + "grad_norm": 0.16445290626758913, + "learning_rate": 0.00045169449742990585, + "loss": 3.0636043548583984, + "step": 8228, + "token_acc": 0.2899118587863573 + }, + { + "epoch": 4.823805335678687, + "grad_norm": 0.19598423088415046, + "learning_rate": 0.0004516801799466812, + "loss": 3.0252432823181152, + "step": 8229, + "token_acc": 0.2951720127818749 + }, + { + "epoch": 4.824391673995896, + "grad_norm": 0.1687893885374334, + "learning_rate": 0.000451665860568943, + "loss": 3.0518088340759277, + "step": 8230, + "token_acc": 0.2929648631840796 + }, + { + "epoch": 4.824978012313105, + "grad_norm": 0.20259312588275857, + "learning_rate": 0.00045165153929682575, + "loss": 3.0011117458343506, + "step": 8231, + "token_acc": 0.29926516548221216 + }, + { + "epoch": 4.825564350630314, + "grad_norm": 0.1672952011168371, + "learning_rate": 0.000451637216130464, + "loss": 3.0376741886138916, + "step": 8232, + "token_acc": 0.29391056535769666 + }, + { + "epoch": 4.826150688947523, + "grad_norm": 0.18548770455887906, + "learning_rate": 0.0004516228910699923, + "loss": 3.0299410820007324, + "step": 8233, + "token_acc": 0.2949940735617498 + }, + { + "epoch": 4.826737027264731, + "grad_norm": 0.18770707113638524, + "learning_rate": 0.0004516085641155453, + "loss": 3.0248372554779053, + "step": 8234, + "token_acc": 0.29551460814720215 + }, + { + "epoch": 4.8273233655819405, + "grad_norm": 0.2653974354565007, + "learning_rate": 0.0004515942352672575, + "loss": 3.05552339553833, + "step": 8235, + "token_acc": 0.2907236212942302 + }, + { + "epoch": 4.82790970389915, + "grad_norm": 0.198611981178459, + "learning_rate": 0.0004515799045252634, + "loss": 3.047046184539795, + "step": 8236, + "token_acc": 0.2924780460395236 + }, + { + "epoch": 4.828496042216359, + "grad_norm": 0.18607962794455024, + "learning_rate": 0.0004515655718896979, + "loss": 3.046444892883301, + "step": 8237, + "token_acc": 0.2921292344643189 + }, + { + "epoch": 4.829082380533568, + "grad_norm": 0.20294213635802263, + "learning_rate": 0.00045155123736069535, + "loss": 3.0549964904785156, + "step": 8238, + "token_acc": 0.29127074720799656 + }, + { + "epoch": 4.829668718850777, + "grad_norm": 0.1861743916212333, + "learning_rate": 0.0004515369009383906, + "loss": 3.0308034420013428, + "step": 8239, + "token_acc": 0.29709506341276964 + }, + { + "epoch": 4.830255057167986, + "grad_norm": 0.21265727641343649, + "learning_rate": 0.00045152256262291813, + "loss": 3.055305004119873, + "step": 8240, + "token_acc": 0.29209571352257446 + }, + { + "epoch": 4.830841395485195, + "grad_norm": 0.18001984024110576, + "learning_rate": 0.00045150822241441283, + "loss": 3.064025402069092, + "step": 8241, + "token_acc": 0.29079793030437157 + }, + { + "epoch": 4.831427733802404, + "grad_norm": 0.20289555218635544, + "learning_rate": 0.0004514938803130093, + "loss": 3.0849294662475586, + "step": 8242, + "token_acc": 0.285881336648266 + }, + { + "epoch": 4.8320140721196125, + "grad_norm": 0.1642261230291842, + "learning_rate": 0.0004514795363188423, + "loss": 3.041956663131714, + "step": 8243, + "token_acc": 0.29481787835838813 + }, + { + "epoch": 4.832600410436822, + "grad_norm": 0.1960141560385876, + "learning_rate": 0.0004514651904320465, + "loss": 3.0624804496765137, + "step": 8244, + "token_acc": 0.2904554927695547 + }, + { + "epoch": 4.833186748754031, + "grad_norm": 0.19378282849160142, + "learning_rate": 0.0004514508426527567, + "loss": 3.0687248706817627, + "step": 8245, + "token_acc": 0.28916678115851663 + }, + { + "epoch": 4.83377308707124, + "grad_norm": 0.17614857788405613, + "learning_rate": 0.0004514364929811078, + "loss": 3.0405640602111816, + "step": 8246, + "token_acc": 0.2956237891775696 + }, + { + "epoch": 4.834359425388449, + "grad_norm": 0.21554557132842408, + "learning_rate": 0.00045142214141723436, + "loss": 3.0461223125457764, + "step": 8247, + "token_acc": 0.2941286401456058 + }, + { + "epoch": 4.834945763705658, + "grad_norm": 0.1848103303675837, + "learning_rate": 0.00045140778796127134, + "loss": 3.0603389739990234, + "step": 8248, + "token_acc": 0.29088755197324107 + }, + { + "epoch": 4.835532102022867, + "grad_norm": 0.16538575200036382, + "learning_rate": 0.00045139343261335363, + "loss": 3.0047850608825684, + "step": 8249, + "token_acc": 0.29904514327908316 + }, + { + "epoch": 4.836118440340076, + "grad_norm": 0.18695626308268848, + "learning_rate": 0.00045137907537361595, + "loss": 3.053900957107544, + "step": 8250, + "token_acc": 0.2920105026256564 + }, + { + "epoch": 4.836704778657285, + "grad_norm": 0.17796180889392935, + "learning_rate": 0.0004513647162421932, + "loss": 3.0795178413391113, + "step": 8251, + "token_acc": 0.28806730407916165 + }, + { + "epoch": 4.8372911169744945, + "grad_norm": 0.21399315031831678, + "learning_rate": 0.0004513503552192203, + "loss": 3.0447440147399902, + "step": 8252, + "token_acc": 0.2921939170213454 + }, + { + "epoch": 4.837877455291704, + "grad_norm": 0.23217207060786355, + "learning_rate": 0.00045133599230483224, + "loss": 3.0184664726257324, + "step": 8253, + "token_acc": 0.2983459732372046 + }, + { + "epoch": 4.838463793608913, + "grad_norm": 0.1860159529148665, + "learning_rate": 0.0004513216274991637, + "loss": 3.050820827484131, + "step": 8254, + "token_acc": 0.2931625750591988 + }, + { + "epoch": 4.839050131926121, + "grad_norm": 0.2258048963594445, + "learning_rate": 0.0004513072608023498, + "loss": 3.0180134773254395, + "step": 8255, + "token_acc": 0.29864361089767966 + }, + { + "epoch": 4.83963647024333, + "grad_norm": 0.24163014506769456, + "learning_rate": 0.00045129289221452546, + "loss": 3.06150484085083, + "step": 8256, + "token_acc": 0.29170875787767997 + }, + { + "epoch": 4.840222808560539, + "grad_norm": 0.17968296213181534, + "learning_rate": 0.0004512785217358256, + "loss": 2.997995376586914, + "step": 8257, + "token_acc": 0.2969716894589765 + }, + { + "epoch": 4.840809146877748, + "grad_norm": 0.20175018047072732, + "learning_rate": 0.0004512641493663853, + "loss": 3.022812843322754, + "step": 8258, + "token_acc": 0.2954193298469882 + }, + { + "epoch": 4.841395485194957, + "grad_norm": 0.20540612165688657, + "learning_rate": 0.0004512497751063395, + "loss": 3.0291857719421387, + "step": 8259, + "token_acc": 0.29613671258352614 + }, + { + "epoch": 4.8419818235121665, + "grad_norm": 0.18518194037587807, + "learning_rate": 0.00045123539895582326, + "loss": 3.036161422729492, + "step": 8260, + "token_acc": 0.29468225239890333 + }, + { + "epoch": 4.842568161829376, + "grad_norm": 0.19798072263496472, + "learning_rate": 0.00045122102091497165, + "loss": 3.037208080291748, + "step": 8261, + "token_acc": 0.29427519162946864 + }, + { + "epoch": 4.843154500146585, + "grad_norm": 0.23858512454360034, + "learning_rate": 0.0004512066409839196, + "loss": 3.0512466430664062, + "step": 8262, + "token_acc": 0.29195662904161895 + }, + { + "epoch": 4.843740838463794, + "grad_norm": 0.1775303186718838, + "learning_rate": 0.0004511922591628024, + "loss": 3.0311052799224854, + "step": 8263, + "token_acc": 0.29518913333837354 + }, + { + "epoch": 4.844327176781003, + "grad_norm": 0.1686940828604769, + "learning_rate": 0.0004511778754517549, + "loss": 3.0203659534454346, + "step": 8264, + "token_acc": 0.29614443662785206 + }, + { + "epoch": 4.844913515098211, + "grad_norm": 0.21986746919202957, + "learning_rate": 0.00045116348985091247, + "loss": 3.0562288761138916, + "step": 8265, + "token_acc": 0.2913219780338924 + }, + { + "epoch": 4.84549985341542, + "grad_norm": 0.1658507770588668, + "learning_rate": 0.0004511491023604101, + "loss": 3.034461498260498, + "step": 8266, + "token_acc": 0.294979401094509 + }, + { + "epoch": 4.8460861917326294, + "grad_norm": 0.2854573165424869, + "learning_rate": 0.000451134712980383, + "loss": 3.0482699871063232, + "step": 8267, + "token_acc": 0.29244876028520106 + }, + { + "epoch": 4.846672530049839, + "grad_norm": 0.40747935345055003, + "learning_rate": 0.00045112032171096625, + "loss": 3.0340206623077393, + "step": 8268, + "token_acc": 0.2935023397209433 + }, + { + "epoch": 4.847258868367048, + "grad_norm": 0.1714339875190739, + "learning_rate": 0.00045110592855229504, + "loss": 3.0818023681640625, + "step": 8269, + "token_acc": 0.2880880466488184 + }, + { + "epoch": 4.847845206684257, + "grad_norm": 0.28611751791994827, + "learning_rate": 0.0004510915335045047, + "loss": 3.025582790374756, + "step": 8270, + "token_acc": 0.29641714266276975 + }, + { + "epoch": 4.848431545001466, + "grad_norm": 0.20687179007412346, + "learning_rate": 0.0004510771365677304, + "loss": 3.0459704399108887, + "step": 8271, + "token_acc": 0.29262902906861776 + }, + { + "epoch": 4.849017883318675, + "grad_norm": 0.2540211393347662, + "learning_rate": 0.00045106273774210727, + "loss": 3.122156858444214, + "step": 8272, + "token_acc": 0.283641586611018 + }, + { + "epoch": 4.849604221635884, + "grad_norm": 0.16729101340677663, + "learning_rate": 0.0004510483370277707, + "loss": 3.0336146354675293, + "step": 8273, + "token_acc": 0.29466767231136415 + }, + { + "epoch": 4.850190559953093, + "grad_norm": 0.22966833244692123, + "learning_rate": 0.00045103393442485595, + "loss": 3.016200304031372, + "step": 8274, + "token_acc": 0.29795679616238346 + }, + { + "epoch": 4.850776898270302, + "grad_norm": 0.19149761144065783, + "learning_rate": 0.0004510195299334983, + "loss": 3.0628151893615723, + "step": 8275, + "token_acc": 0.29037685588834056 + }, + { + "epoch": 4.8513632365875115, + "grad_norm": 0.19339022669337116, + "learning_rate": 0.00045100512355383295, + "loss": 3.0466232299804688, + "step": 8276, + "token_acc": 0.2924061979095684 + }, + { + "epoch": 4.85194957490472, + "grad_norm": 0.254629534085646, + "learning_rate": 0.00045099071528599533, + "loss": 3.0473294258117676, + "step": 8277, + "token_acc": 0.29009173832361934 + }, + { + "epoch": 4.852535913221929, + "grad_norm": 0.1771371554988384, + "learning_rate": 0.0004509763051301209, + "loss": 3.0144152641296387, + "step": 8278, + "token_acc": 0.2977145651515818 + }, + { + "epoch": 4.853122251539138, + "grad_norm": 0.1982112277988596, + "learning_rate": 0.0004509618930863448, + "loss": 3.0239410400390625, + "step": 8279, + "token_acc": 0.29586634309841986 + }, + { + "epoch": 4.853708589856347, + "grad_norm": 0.1711864941313512, + "learning_rate": 0.0004509474791548025, + "loss": 3.039302110671997, + "step": 8280, + "token_acc": 0.2947339220740047 + }, + { + "epoch": 4.854294928173556, + "grad_norm": 0.20694308895413283, + "learning_rate": 0.0004509330633356294, + "loss": 3.0800561904907227, + "step": 8281, + "token_acc": 0.2875939316602864 + }, + { + "epoch": 4.854881266490765, + "grad_norm": 0.16532086307565366, + "learning_rate": 0.000450918645628961, + "loss": 3.0677409172058105, + "step": 8282, + "token_acc": 0.29062026791683954 + }, + { + "epoch": 4.855467604807974, + "grad_norm": 0.1937538227513767, + "learning_rate": 0.0004509042260349327, + "loss": 3.0369300842285156, + "step": 8283, + "token_acc": 0.2940584247452165 + }, + { + "epoch": 4.8560539431251835, + "grad_norm": 0.1627952674649333, + "learning_rate": 0.00045088980455367986, + "loss": 3.0220439434051514, + "step": 8284, + "token_acc": 0.29744786746600166 + }, + { + "epoch": 4.856640281442393, + "grad_norm": 0.21819419703387286, + "learning_rate": 0.000450875381185338, + "loss": 3.028167247772217, + "step": 8285, + "token_acc": 0.2945527040079206 + }, + { + "epoch": 4.857226619759601, + "grad_norm": 0.16456155947307877, + "learning_rate": 0.0004508609559300426, + "loss": 3.105139970779419, + "step": 8286, + "token_acc": 0.28367108064300733 + }, + { + "epoch": 4.85781295807681, + "grad_norm": 0.2268339870359941, + "learning_rate": 0.0004508465287879292, + "loss": 3.05922269821167, + "step": 8287, + "token_acc": 0.29059741904667347 + }, + { + "epoch": 4.858399296394019, + "grad_norm": 0.1781345172138699, + "learning_rate": 0.0004508320997591334, + "loss": 3.049931049346924, + "step": 8288, + "token_acc": 0.292901339728101 + }, + { + "epoch": 4.858985634711228, + "grad_norm": 0.18781316937135573, + "learning_rate": 0.0004508176688437905, + "loss": 3.0567965507507324, + "step": 8289, + "token_acc": 0.2920211536775897 + }, + { + "epoch": 4.859571973028437, + "grad_norm": 0.18221711413082994, + "learning_rate": 0.00045080323604203634, + "loss": 3.0802602767944336, + "step": 8290, + "token_acc": 0.2882619069753029 + }, + { + "epoch": 4.860158311345646, + "grad_norm": 0.18969375410013278, + "learning_rate": 0.00045078880135400625, + "loss": 3.0400662422180176, + "step": 8291, + "token_acc": 0.2929359782490714 + }, + { + "epoch": 4.8607446496628555, + "grad_norm": 0.21968750432244005, + "learning_rate": 0.000450774364779836, + "loss": 3.0454132556915283, + "step": 8292, + "token_acc": 0.2932408477438989 + }, + { + "epoch": 4.861330987980065, + "grad_norm": 0.16812389983138806, + "learning_rate": 0.0004507599263196611, + "loss": 3.0248780250549316, + "step": 8293, + "token_acc": 0.2972627222774831 + }, + { + "epoch": 4.861917326297274, + "grad_norm": 0.19806776583358124, + "learning_rate": 0.0004507454859736172, + "loss": 3.021618366241455, + "step": 8294, + "token_acc": 0.29810110859236716 + }, + { + "epoch": 4.862503664614483, + "grad_norm": 0.1587022058730712, + "learning_rate": 0.00045073104374184, + "loss": 2.9967684745788574, + "step": 8295, + "token_acc": 0.30186555629826706 + }, + { + "epoch": 4.863090002931692, + "grad_norm": 0.18583149112933373, + "learning_rate": 0.00045071659962446514, + "loss": 3.0586111545562744, + "step": 8296, + "token_acc": 0.29162186839844617 + }, + { + "epoch": 4.863676341248901, + "grad_norm": 0.1426706138827495, + "learning_rate": 0.0004507021536216283, + "loss": 3.0647144317626953, + "step": 8297, + "token_acc": 0.2904473311963871 + }, + { + "epoch": 4.86426267956611, + "grad_norm": 0.19014343454350513, + "learning_rate": 0.0004506877057334652, + "loss": 3.049382209777832, + "step": 8298, + "token_acc": 0.2905761107219427 + }, + { + "epoch": 4.864849017883318, + "grad_norm": 0.17736289521929513, + "learning_rate": 0.0004506732559601115, + "loss": 3.039126396179199, + "step": 8299, + "token_acc": 0.2933060522586501 + }, + { + "epoch": 4.8654353562005275, + "grad_norm": 0.15575164779609152, + "learning_rate": 0.000450658804301703, + "loss": 3.022524356842041, + "step": 8300, + "token_acc": 0.2986661016106712 + }, + { + "epoch": 4.866021694517737, + "grad_norm": 0.17094831537689123, + "learning_rate": 0.0004506443507583754, + "loss": 3.058741569519043, + "step": 8301, + "token_acc": 0.291617371682872 + }, + { + "epoch": 4.866608032834946, + "grad_norm": 0.20711076560870667, + "learning_rate": 0.00045062989533026443, + "loss": 3.028104305267334, + "step": 8302, + "token_acc": 0.29600004277457265 + }, + { + "epoch": 4.867194371152155, + "grad_norm": 0.16370778000011313, + "learning_rate": 0.000450615438017506, + "loss": 3.0587964057922363, + "step": 8303, + "token_acc": 0.2905246305546748 + }, + { + "epoch": 4.867780709469364, + "grad_norm": 0.18332750971283526, + "learning_rate": 0.0004506009788202359, + "loss": 3.0489614009857178, + "step": 8304, + "token_acc": 0.29359181851909605 + }, + { + "epoch": 4.868367047786573, + "grad_norm": 0.2004335696244491, + "learning_rate": 0.00045058651773858995, + "loss": 3.041551113128662, + "step": 8305, + "token_acc": 0.2934721051436565 + }, + { + "epoch": 4.868953386103782, + "grad_norm": 0.16675080067893025, + "learning_rate": 0.0004505720547727039, + "loss": 3.0671846866607666, + "step": 8306, + "token_acc": 0.28879350490033023 + }, + { + "epoch": 4.869539724420991, + "grad_norm": 0.1827681620566002, + "learning_rate": 0.0004505575899227137, + "loss": 3.073871612548828, + "step": 8307, + "token_acc": 0.2885374478980741 + }, + { + "epoch": 4.8701260627381995, + "grad_norm": 0.2854804878055513, + "learning_rate": 0.0004505431231887551, + "loss": 3.0441880226135254, + "step": 8308, + "token_acc": 0.2926706482689183 + }, + { + "epoch": 4.870712401055409, + "grad_norm": 0.26288764627398586, + "learning_rate": 0.00045052865457096417, + "loss": 3.0122766494750977, + "step": 8309, + "token_acc": 0.298260049007634 + }, + { + "epoch": 4.871298739372618, + "grad_norm": 0.1667566720385298, + "learning_rate": 0.00045051418406947673, + "loss": 3.0150389671325684, + "step": 8310, + "token_acc": 0.2979929896400228 + }, + { + "epoch": 4.871885077689827, + "grad_norm": 0.1901123541683121, + "learning_rate": 0.0004504997116844287, + "loss": 3.0392978191375732, + "step": 8311, + "token_acc": 0.29520058136473265 + }, + { + "epoch": 4.872471416007036, + "grad_norm": 0.17376535300026733, + "learning_rate": 0.0004504852374159561, + "loss": 3.0811309814453125, + "step": 8312, + "token_acc": 0.28645390165508594 + }, + { + "epoch": 4.873057754324245, + "grad_norm": 0.18689811189213562, + "learning_rate": 0.0004504707612641948, + "loss": 3.0977230072021484, + "step": 8313, + "token_acc": 0.28651914148230284 + }, + { + "epoch": 4.873644092641454, + "grad_norm": 0.2114725174105723, + "learning_rate": 0.00045045628322928097, + "loss": 3.05631685256958, + "step": 8314, + "token_acc": 0.2919886294330096 + }, + { + "epoch": 4.874230430958663, + "grad_norm": 0.16258329420094583, + "learning_rate": 0.00045044180331135043, + "loss": 3.0620105266571045, + "step": 8315, + "token_acc": 0.2900593272663821 + }, + { + "epoch": 4.874816769275872, + "grad_norm": 0.187609521637445, + "learning_rate": 0.0004504273215105391, + "loss": 2.9953415393829346, + "step": 8316, + "token_acc": 0.3011473400867997 + }, + { + "epoch": 4.8754031075930815, + "grad_norm": 0.2081663268046037, + "learning_rate": 0.0004504128378269833, + "loss": 3.0131211280822754, + "step": 8317, + "token_acc": 0.2985325034434905 + }, + { + "epoch": 4.875989445910291, + "grad_norm": 0.24492182453415076, + "learning_rate": 0.00045039835226081895, + "loss": 3.065298080444336, + "step": 8318, + "token_acc": 0.2905076163640601 + }, + { + "epoch": 4.8765757842275, + "grad_norm": 0.25909647115920226, + "learning_rate": 0.0004503838648121821, + "loss": 3.0742697715759277, + "step": 8319, + "token_acc": 0.28982599762172484 + }, + { + "epoch": 4.877162122544708, + "grad_norm": 0.19140658749870088, + "learning_rate": 0.00045036937548120884, + "loss": 3.087031841278076, + "step": 8320, + "token_acc": 0.2890294795713615 + }, + { + "epoch": 4.877748460861917, + "grad_norm": 0.19781086809894965, + "learning_rate": 0.0004503548842680353, + "loss": 3.0446617603302, + "step": 8321, + "token_acc": 0.2913714036571465 + }, + { + "epoch": 4.878334799179126, + "grad_norm": 0.2470620086619544, + "learning_rate": 0.0004503403911727976, + "loss": 2.9955687522888184, + "step": 8322, + "token_acc": 0.3003205216558866 + }, + { + "epoch": 4.878921137496335, + "grad_norm": 0.1884856833319998, + "learning_rate": 0.00045032589619563193, + "loss": 3.050199508666992, + "step": 8323, + "token_acc": 0.2936966449884616 + }, + { + "epoch": 4.879507475813544, + "grad_norm": 0.18945612134356593, + "learning_rate": 0.0004503113993366743, + "loss": 3.0228309631347656, + "step": 8324, + "token_acc": 0.29616642594484444 + }, + { + "epoch": 4.8800938141307535, + "grad_norm": 0.22093690226448756, + "learning_rate": 0.0004502969005960611, + "loss": 3.0488386154174805, + "step": 8325, + "token_acc": 0.29358465454939375 + }, + { + "epoch": 4.880680152447963, + "grad_norm": 0.19358001192148847, + "learning_rate": 0.0004502823999739284, + "loss": 3.105501890182495, + "step": 8326, + "token_acc": 0.28564545272033726 + }, + { + "epoch": 4.881266490765172, + "grad_norm": 0.19056204384865041, + "learning_rate": 0.0004502678974704124, + "loss": 3.0377073287963867, + "step": 8327, + "token_acc": 0.2954767761237633 + }, + { + "epoch": 4.881852829082381, + "grad_norm": 0.18688258208672415, + "learning_rate": 0.0004502533930856494, + "loss": 3.05466890335083, + "step": 8328, + "token_acc": 0.29204103689126193 + }, + { + "epoch": 4.88243916739959, + "grad_norm": 0.1938227048346595, + "learning_rate": 0.0004502388868197756, + "loss": 3.0520758628845215, + "step": 8329, + "token_acc": 0.29202354020255644 + }, + { + "epoch": 4.883025505716798, + "grad_norm": 0.20166338096370262, + "learning_rate": 0.0004502243786729273, + "loss": 3.0213375091552734, + "step": 8330, + "token_acc": 0.29698114413891213 + }, + { + "epoch": 4.883611844034007, + "grad_norm": 0.19640365465814036, + "learning_rate": 0.0004502098686452408, + "loss": 3.0327935218811035, + "step": 8331, + "token_acc": 0.2969744925960857 + }, + { + "epoch": 4.884198182351216, + "grad_norm": 0.17461322311797992, + "learning_rate": 0.0004501953567368523, + "loss": 3.060469627380371, + "step": 8332, + "token_acc": 0.29185001257229065 + }, + { + "epoch": 4.8847845206684255, + "grad_norm": 0.17606568801465003, + "learning_rate": 0.00045018084294789817, + "loss": 2.9896316528320312, + "step": 8333, + "token_acc": 0.30154963004327795 + }, + { + "epoch": 4.885370858985635, + "grad_norm": 0.19634284557164672, + "learning_rate": 0.0004501663272785149, + "loss": 3.009902238845825, + "step": 8334, + "token_acc": 0.2979915670897055 + }, + { + "epoch": 4.885957197302844, + "grad_norm": 0.20945982070195124, + "learning_rate": 0.00045015180972883865, + "loss": 3.0668418407440186, + "step": 8335, + "token_acc": 0.2903726263454505 + }, + { + "epoch": 4.886543535620053, + "grad_norm": 0.22550718491910798, + "learning_rate": 0.0004501372902990058, + "loss": 3.0472984313964844, + "step": 8336, + "token_acc": 0.29208705804450485 + }, + { + "epoch": 4.887129873937262, + "grad_norm": 0.16972176005510667, + "learning_rate": 0.00045012276898915286, + "loss": 3.057155132293701, + "step": 8337, + "token_acc": 0.29188087765878906 + }, + { + "epoch": 4.887716212254471, + "grad_norm": 0.20458552388360932, + "learning_rate": 0.0004501082457994161, + "loss": 3.046921730041504, + "step": 8338, + "token_acc": 0.29151475310358776 + }, + { + "epoch": 4.88830255057168, + "grad_norm": 0.25106052448466787, + "learning_rate": 0.0004500937207299321, + "loss": 3.018035411834717, + "step": 8339, + "token_acc": 0.2961233446745001 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.22364060941116343, + "learning_rate": 0.0004500791937808372, + "loss": 3.0083279609680176, + "step": 8340, + "token_acc": 0.2989196800991874 + }, + { + "epoch": 4.889475227206098, + "grad_norm": 0.17354286492597026, + "learning_rate": 0.00045006466495226786, + "loss": 2.984035015106201, + "step": 8341, + "token_acc": 0.30236009855767304 + }, + { + "epoch": 4.890061565523307, + "grad_norm": 0.23114899301911834, + "learning_rate": 0.0004500501342443606, + "loss": 3.058978796005249, + "step": 8342, + "token_acc": 0.29212389774067904 + }, + { + "epoch": 4.890647903840516, + "grad_norm": 0.2026893390288088, + "learning_rate": 0.0004500356016572519, + "loss": 3.0550312995910645, + "step": 8343, + "token_acc": 0.29288573987251754 + }, + { + "epoch": 4.891234242157725, + "grad_norm": 0.17149250991408108, + "learning_rate": 0.00045002106719107827, + "loss": 3.052339553833008, + "step": 8344, + "token_acc": 0.29255295666603476 + }, + { + "epoch": 4.891820580474934, + "grad_norm": 0.2197789867818995, + "learning_rate": 0.00045000653084597625, + "loss": 3.0772218704223633, + "step": 8345, + "token_acc": 0.29062587930494427 + }, + { + "epoch": 4.892406918792143, + "grad_norm": 0.18326869940463486, + "learning_rate": 0.0004499919926220824, + "loss": 2.9842095375061035, + "step": 8346, + "token_acc": 0.3022970780309227 + }, + { + "epoch": 4.892993257109352, + "grad_norm": 0.20745186260908338, + "learning_rate": 0.00044997745251953324, + "loss": 3.100586414337158, + "step": 8347, + "token_acc": 0.28470819275378667 + }, + { + "epoch": 4.893579595426561, + "grad_norm": 0.1883665072814455, + "learning_rate": 0.0004499629105384654, + "loss": 3.0712223052978516, + "step": 8348, + "token_acc": 0.2903380147645806 + }, + { + "epoch": 4.89416593374377, + "grad_norm": 0.18348760315422052, + "learning_rate": 0.00044994836667901553, + "loss": 3.09987735748291, + "step": 8349, + "token_acc": 0.2860487974098057 + }, + { + "epoch": 4.8947522720609795, + "grad_norm": 0.2027696144139854, + "learning_rate": 0.0004499338209413201, + "loss": 3.0382657051086426, + "step": 8350, + "token_acc": 0.2935398679532758 + }, + { + "epoch": 4.895338610378188, + "grad_norm": 0.19965863286683658, + "learning_rate": 0.0004499192733255159, + "loss": 3.0535008907318115, + "step": 8351, + "token_acc": 0.29194557053518116 + }, + { + "epoch": 4.895924948695397, + "grad_norm": 0.18190603494018373, + "learning_rate": 0.00044990472383173944, + "loss": 3.0735087394714355, + "step": 8352, + "token_acc": 0.2896419852061718 + }, + { + "epoch": 4.896511287012606, + "grad_norm": 0.17736298587099053, + "learning_rate": 0.00044989017246012755, + "loss": 3.0627965927124023, + "step": 8353, + "token_acc": 0.2904934873222063 + }, + { + "epoch": 4.897097625329815, + "grad_norm": 0.17534962681540694, + "learning_rate": 0.0004498756192108168, + "loss": 3.0565590858459473, + "step": 8354, + "token_acc": 0.29150343653689925 + }, + { + "epoch": 4.897683963647024, + "grad_norm": 0.17658781455656447, + "learning_rate": 0.00044986106408394396, + "loss": 3.021911382675171, + "step": 8355, + "token_acc": 0.2955750776141438 + }, + { + "epoch": 4.898270301964233, + "grad_norm": 0.18073190375119705, + "learning_rate": 0.00044984650707964574, + "loss": 3.048750400543213, + "step": 8356, + "token_acc": 0.29211815124743906 + }, + { + "epoch": 4.898856640281442, + "grad_norm": 0.21931290289951014, + "learning_rate": 0.0004498319481980589, + "loss": 3.0696911811828613, + "step": 8357, + "token_acc": 0.29000563867668755 + }, + { + "epoch": 4.8994429785986515, + "grad_norm": 0.20681385333499622, + "learning_rate": 0.00044981738743932013, + "loss": 3.043977737426758, + "step": 8358, + "token_acc": 0.2936123307162976 + }, + { + "epoch": 4.900029316915861, + "grad_norm": 0.2195677624572636, + "learning_rate": 0.0004498028248035664, + "loss": 2.9817733764648438, + "step": 8359, + "token_acc": 0.3024633979321101 + }, + { + "epoch": 4.90061565523307, + "grad_norm": 0.19002182913900476, + "learning_rate": 0.0004497882602909342, + "loss": 3.0093250274658203, + "step": 8360, + "token_acc": 0.29686161575288117 + }, + { + "epoch": 4.901201993550279, + "grad_norm": 0.18070873848864266, + "learning_rate": 0.00044977369390156055, + "loss": 3.0158705711364746, + "step": 8361, + "token_acc": 0.2980548084044751 + }, + { + "epoch": 4.901788331867488, + "grad_norm": 0.2158714788937025, + "learning_rate": 0.0004497591256355823, + "loss": 3.0400524139404297, + "step": 8362, + "token_acc": 0.2940378935503338 + }, + { + "epoch": 4.902374670184696, + "grad_norm": 0.1853061038186991, + "learning_rate": 0.0004497445554931362, + "loss": 3.075366973876953, + "step": 8363, + "token_acc": 0.2903067615468671 + }, + { + "epoch": 4.902961008501905, + "grad_norm": 0.1776333558850975, + "learning_rate": 0.00044972998347435925, + "loss": 3.1197237968444824, + "step": 8364, + "token_acc": 0.28254274144194585 + }, + { + "epoch": 4.903547346819114, + "grad_norm": 0.24706654707522493, + "learning_rate": 0.0004497154095793881, + "loss": 3.059683322906494, + "step": 8365, + "token_acc": 0.29093332403665395 + }, + { + "epoch": 4.9041336851363235, + "grad_norm": 0.21923055812986283, + "learning_rate": 0.0004497008338083599, + "loss": 3.052083969116211, + "step": 8366, + "token_acc": 0.2920365006695942 + }, + { + "epoch": 4.904720023453533, + "grad_norm": 0.1762778878808655, + "learning_rate": 0.00044968625616141145, + "loss": 3.047804355621338, + "step": 8367, + "token_acc": 0.2921642116272989 + }, + { + "epoch": 4.905306361770742, + "grad_norm": 0.23502957404280328, + "learning_rate": 0.00044967167663867967, + "loss": 3.0428833961486816, + "step": 8368, + "token_acc": 0.2928335959117207 + }, + { + "epoch": 4.905892700087951, + "grad_norm": 0.18944999126709933, + "learning_rate": 0.00044965709524030155, + "loss": 3.050354480743408, + "step": 8369, + "token_acc": 0.2933137078958278 + }, + { + "epoch": 4.90647903840516, + "grad_norm": 0.190684901994884, + "learning_rate": 0.0004496425119664141, + "loss": 3.0448107719421387, + "step": 8370, + "token_acc": 0.294128662991586 + }, + { + "epoch": 4.907065376722369, + "grad_norm": 0.20525728708595328, + "learning_rate": 0.00044962792681715424, + "loss": 3.0587708950042725, + "step": 8371, + "token_acc": 0.29055325261325404 + }, + { + "epoch": 4.907651715039578, + "grad_norm": 0.17082799780944988, + "learning_rate": 0.00044961333979265904, + "loss": 3.040635347366333, + "step": 8372, + "token_acc": 0.2939021096695786 + }, + { + "epoch": 4.908238053356786, + "grad_norm": 0.2253900685153574, + "learning_rate": 0.00044959875089306545, + "loss": 3.011045455932617, + "step": 8373, + "token_acc": 0.29766916818406675 + }, + { + "epoch": 4.9088243916739955, + "grad_norm": 0.18480579918933207, + "learning_rate": 0.0004495841601185106, + "loss": 3.058290958404541, + "step": 8374, + "token_acc": 0.2927882877296129 + }, + { + "epoch": 4.909410729991205, + "grad_norm": 0.1855846789344752, + "learning_rate": 0.00044956956746913145, + "loss": 3.056124210357666, + "step": 8375, + "token_acc": 0.2904010920685734 + }, + { + "epoch": 4.909997068308414, + "grad_norm": 0.18212893314110834, + "learning_rate": 0.00044955497294506524, + "loss": 3.076988935470581, + "step": 8376, + "token_acc": 0.28899212854012823 + }, + { + "epoch": 4.910583406625623, + "grad_norm": 0.16780978357273071, + "learning_rate": 0.00044954037654644887, + "loss": 3.0520377159118652, + "step": 8377, + "token_acc": 0.2937051469267592 + }, + { + "epoch": 4.911169744942832, + "grad_norm": 0.18441248686934283, + "learning_rate": 0.0004495257782734196, + "loss": 3.0441391468048096, + "step": 8378, + "token_acc": 0.2921616187820222 + }, + { + "epoch": 4.911756083260041, + "grad_norm": 0.18279474749233285, + "learning_rate": 0.00044951117812611454, + "loss": 3.0296130180358887, + "step": 8379, + "token_acc": 0.29467209180789455 + }, + { + "epoch": 4.91234242157725, + "grad_norm": 0.20790273114563643, + "learning_rate": 0.0004494965761046707, + "loss": 3.0382368564605713, + "step": 8380, + "token_acc": 0.29467800489408685 + }, + { + "epoch": 4.912928759894459, + "grad_norm": 0.23227007734565808, + "learning_rate": 0.00044948197220922545, + "loss": 3.037198066711426, + "step": 8381, + "token_acc": 0.29350065177542617 + }, + { + "epoch": 4.913515098211668, + "grad_norm": 0.20031712333540394, + "learning_rate": 0.0004494673664399158, + "loss": 3.0394139289855957, + "step": 8382, + "token_acc": 0.29347569713032123 + }, + { + "epoch": 4.9141014365288775, + "grad_norm": 0.24428980594366806, + "learning_rate": 0.0004494527587968791, + "loss": 3.0533525943756104, + "step": 8383, + "token_acc": 0.2946481249547153 + }, + { + "epoch": 4.914687774846087, + "grad_norm": 0.2562895142625386, + "learning_rate": 0.0004494381492802524, + "loss": 3.0395450592041016, + "step": 8384, + "token_acc": 0.2949973395546824 + }, + { + "epoch": 4.915274113163295, + "grad_norm": 0.18126657386303482, + "learning_rate": 0.0004494235378901731, + "loss": 2.9943675994873047, + "step": 8385, + "token_acc": 0.30025745604186144 + }, + { + "epoch": 4.915860451480504, + "grad_norm": 0.22415670999480147, + "learning_rate": 0.0004494089246267784, + "loss": 3.038001537322998, + "step": 8386, + "token_acc": 0.29297859603427995 + }, + { + "epoch": 4.916446789797713, + "grad_norm": 0.20437596850740655, + "learning_rate": 0.00044939430949020553, + "loss": 3.0979714393615723, + "step": 8387, + "token_acc": 0.2866406809408699 + }, + { + "epoch": 4.917033128114922, + "grad_norm": 0.16243876416894382, + "learning_rate": 0.0004493796924805918, + "loss": 3.051877975463867, + "step": 8388, + "token_acc": 0.2939679664889969 + }, + { + "epoch": 4.917619466432131, + "grad_norm": 0.1787584852061425, + "learning_rate": 0.00044936507359807454, + "loss": 3.0205438137054443, + "step": 8389, + "token_acc": 0.29548352830124186 + }, + { + "epoch": 4.91820580474934, + "grad_norm": 0.18996676743273985, + "learning_rate": 0.0004493504528427911, + "loss": 3.0381569862365723, + "step": 8390, + "token_acc": 0.2957881999232377 + }, + { + "epoch": 4.9187921430665495, + "grad_norm": 0.18363858065242186, + "learning_rate": 0.00044933583021487876, + "loss": 3.0685129165649414, + "step": 8391, + "token_acc": 0.2898426901129856 + }, + { + "epoch": 4.919378481383759, + "grad_norm": 0.17598128747187944, + "learning_rate": 0.0004493212057144749, + "loss": 3.0886130332946777, + "step": 8392, + "token_acc": 0.2866092542711534 + }, + { + "epoch": 4.919964819700968, + "grad_norm": 0.21282002634211616, + "learning_rate": 0.0004493065793417169, + "loss": 3.0419044494628906, + "step": 8393, + "token_acc": 0.2938258699316599 + }, + { + "epoch": 4.920551158018176, + "grad_norm": 0.18309837619247443, + "learning_rate": 0.00044929195109674215, + "loss": 3.040395736694336, + "step": 8394, + "token_acc": 0.2934880668164144 + }, + { + "epoch": 4.921137496335385, + "grad_norm": 0.21511746839746823, + "learning_rate": 0.0004492773209796881, + "loss": 2.9878604412078857, + "step": 8395, + "token_acc": 0.3020455307762728 + }, + { + "epoch": 4.921723834652594, + "grad_norm": 0.19210670608248406, + "learning_rate": 0.0004492626889906921, + "loss": 3.0452492237091064, + "step": 8396, + "token_acc": 0.2952712776891632 + }, + { + "epoch": 4.922310172969803, + "grad_norm": 0.17184419021105332, + "learning_rate": 0.00044924805512989167, + "loss": 3.051713228225708, + "step": 8397, + "token_acc": 0.29180905871542584 + }, + { + "epoch": 4.9228965112870124, + "grad_norm": 0.17294304880120165, + "learning_rate": 0.00044923341939742423, + "loss": 3.0094029903411865, + "step": 8398, + "token_acc": 0.300253817120162 + }, + { + "epoch": 4.923482849604222, + "grad_norm": 0.20626087898336617, + "learning_rate": 0.0004492187817934273, + "loss": 3.0717616081237793, + "step": 8399, + "token_acc": 0.2888186792016645 + }, + { + "epoch": 4.924069187921431, + "grad_norm": 0.2585863341558831, + "learning_rate": 0.00044920414231803835, + "loss": 2.993663787841797, + "step": 8400, + "token_acc": 0.299494367352537 + }, + { + "epoch": 4.92465552623864, + "grad_norm": 0.23513481016981097, + "learning_rate": 0.0004491895009713949, + "loss": 3.086247444152832, + "step": 8401, + "token_acc": 0.28693371163960973 + }, + { + "epoch": 4.925241864555849, + "grad_norm": 0.18735763527652263, + "learning_rate": 0.0004491748577536346, + "loss": 3.0475821495056152, + "step": 8402, + "token_acc": 0.2915089632978876 + }, + { + "epoch": 4.925828202873058, + "grad_norm": 0.1943066655460094, + "learning_rate": 0.0004491602126648948, + "loss": 3.042724609375, + "step": 8403, + "token_acc": 0.2942265611849857 + }, + { + "epoch": 4.926414541190267, + "grad_norm": 0.24815254749000878, + "learning_rate": 0.00044914556570531324, + "loss": 3.0480923652648926, + "step": 8404, + "token_acc": 0.2925051872042762 + }, + { + "epoch": 4.927000879507476, + "grad_norm": 0.20154569368434377, + "learning_rate": 0.0004491309168750274, + "loss": 3.031181573867798, + "step": 8405, + "token_acc": 0.2967442472201976 + }, + { + "epoch": 4.927587217824685, + "grad_norm": 0.16863884816826494, + "learning_rate": 0.00044911626617417493, + "loss": 3.032581329345703, + "step": 8406, + "token_acc": 0.2944937481614154 + }, + { + "epoch": 4.928173556141894, + "grad_norm": 0.19524799470391807, + "learning_rate": 0.00044910161360289347, + "loss": 3.026651382446289, + "step": 8407, + "token_acc": 0.29686870220216816 + }, + { + "epoch": 4.928759894459103, + "grad_norm": 0.17926796040222812, + "learning_rate": 0.0004490869591613207, + "loss": 3.0227315425872803, + "step": 8408, + "token_acc": 0.296401677615864 + }, + { + "epoch": 4.929346232776312, + "grad_norm": 0.1980439797916102, + "learning_rate": 0.0004490723028495941, + "loss": 3.0682120323181152, + "step": 8409, + "token_acc": 0.29048740104516996 + }, + { + "epoch": 4.929932571093521, + "grad_norm": 0.2049980627739991, + "learning_rate": 0.0004490576446678515, + "loss": 3.073478937149048, + "step": 8410, + "token_acc": 0.28903153158865136 + }, + { + "epoch": 4.93051890941073, + "grad_norm": 0.1725936879650742, + "learning_rate": 0.0004490429846162306, + "loss": 3.0452475547790527, + "step": 8411, + "token_acc": 0.29442753053056475 + }, + { + "epoch": 4.931105247727939, + "grad_norm": 0.20511734453037678, + "learning_rate": 0.00044902832269486906, + "loss": 3.0509352684020996, + "step": 8412, + "token_acc": 0.2913506236053026 + }, + { + "epoch": 4.931691586045148, + "grad_norm": 0.19243723442620878, + "learning_rate": 0.00044901365890390464, + "loss": 3.032176971435547, + "step": 8413, + "token_acc": 0.2955386243829845 + }, + { + "epoch": 4.932277924362357, + "grad_norm": 0.17401087709542093, + "learning_rate": 0.00044899899324347503, + "loss": 3.02199649810791, + "step": 8414, + "token_acc": 0.2973910120494389 + }, + { + "epoch": 4.9328642626795665, + "grad_norm": 0.17387788225458967, + "learning_rate": 0.0004489843257137181, + "loss": 3.0637059211730957, + "step": 8415, + "token_acc": 0.2921510978688968 + }, + { + "epoch": 4.933450600996775, + "grad_norm": 0.15565234178560305, + "learning_rate": 0.0004489696563147715, + "loss": 3.0710649490356445, + "step": 8416, + "token_acc": 0.28911078527812323 + }, + { + "epoch": 4.934036939313984, + "grad_norm": 0.21407191867385925, + "learning_rate": 0.0004489549850467731, + "loss": 3.0545597076416016, + "step": 8417, + "token_acc": 0.29166995782057153 + }, + { + "epoch": 4.934623277631193, + "grad_norm": 0.18146528789328698, + "learning_rate": 0.0004489403119098607, + "loss": 3.027956962585449, + "step": 8418, + "token_acc": 0.29537296803898144 + }, + { + "epoch": 4.935209615948402, + "grad_norm": 0.1909178526414011, + "learning_rate": 0.0004489256369041722, + "loss": 3.0069451332092285, + "step": 8419, + "token_acc": 0.2975427972516595 + }, + { + "epoch": 4.935795954265611, + "grad_norm": 0.2680027144177886, + "learning_rate": 0.00044891096002984534, + "loss": 3.0462937355041504, + "step": 8420, + "token_acc": 0.2944131534789339 + }, + { + "epoch": 4.93638229258282, + "grad_norm": 0.18486298008565794, + "learning_rate": 0.0004488962812870181, + "loss": 3.0483810901641846, + "step": 8421, + "token_acc": 0.29285166267288854 + }, + { + "epoch": 4.936968630900029, + "grad_norm": 0.21398703856565707, + "learning_rate": 0.0004488816006758283, + "loss": 3.0646841526031494, + "step": 8422, + "token_acc": 0.29133070211546686 + }, + { + "epoch": 4.9375549692172385, + "grad_norm": 0.25834315137136266, + "learning_rate": 0.0004488669181964138, + "loss": 3.059004783630371, + "step": 8423, + "token_acc": 0.29103705860500606 + }, + { + "epoch": 4.938141307534448, + "grad_norm": 0.1938012713376633, + "learning_rate": 0.0004488522338489126, + "loss": 3.0573654174804688, + "step": 8424, + "token_acc": 0.2920598909903723 + }, + { + "epoch": 4.938727645851657, + "grad_norm": 0.27086501898896626, + "learning_rate": 0.00044883754763346267, + "loss": 3.051037311553955, + "step": 8425, + "token_acc": 0.2919254658385093 + }, + { + "epoch": 4.939313984168866, + "grad_norm": 0.1750272176683943, + "learning_rate": 0.00044882285955020194, + "loss": 3.026841163635254, + "step": 8426, + "token_acc": 0.29719359025237846 + }, + { + "epoch": 4.939900322486075, + "grad_norm": 0.21844577399094703, + "learning_rate": 0.0004488081695992683, + "loss": 3.041978597640991, + "step": 8427, + "token_acc": 0.2940970953961473 + }, + { + "epoch": 4.940486660803283, + "grad_norm": 0.21203858606336096, + "learning_rate": 0.0004487934777807998, + "loss": 3.0423355102539062, + "step": 8428, + "token_acc": 0.29265449095991763 + }, + { + "epoch": 4.941072999120492, + "grad_norm": 0.22009538051523297, + "learning_rate": 0.0004487787840949344, + "loss": 3.0473384857177734, + "step": 8429, + "token_acc": 0.29266988527228366 + }, + { + "epoch": 4.941659337437701, + "grad_norm": 0.16780131165031184, + "learning_rate": 0.00044876408854181026, + "loss": 3.0276124477386475, + "step": 8430, + "token_acc": 0.2964890381335818 + }, + { + "epoch": 4.9422456757549105, + "grad_norm": 0.22262253304883894, + "learning_rate": 0.0004487493911215654, + "loss": 3.0023674964904785, + "step": 8431, + "token_acc": 0.2978106923567745 + }, + { + "epoch": 4.94283201407212, + "grad_norm": 0.2524767640831319, + "learning_rate": 0.00044873469183433776, + "loss": 3.076324224472046, + "step": 8432, + "token_acc": 0.2874964148861678 + }, + { + "epoch": 4.943418352389329, + "grad_norm": 0.19902012390108612, + "learning_rate": 0.00044871999068026545, + "loss": 3.091171979904175, + "step": 8433, + "token_acc": 0.28730561742648714 + }, + { + "epoch": 4.944004690706538, + "grad_norm": 0.22287508172812212, + "learning_rate": 0.0004487052876594867, + "loss": 3.052544355392456, + "step": 8434, + "token_acc": 0.29219458412603183 + }, + { + "epoch": 4.944591029023747, + "grad_norm": 0.19229753311911305, + "learning_rate": 0.0004486905827721395, + "loss": 3.0671579837799072, + "step": 8435, + "token_acc": 0.29092119648288334 + }, + { + "epoch": 4.945177367340956, + "grad_norm": 0.19403177690163473, + "learning_rate": 0.00044867587601836196, + "loss": 3.0288949012756348, + "step": 8436, + "token_acc": 0.296413758206598 + }, + { + "epoch": 4.945763705658165, + "grad_norm": 0.16584018438217044, + "learning_rate": 0.0004486611673982923, + "loss": 2.999239444732666, + "step": 8437, + "token_acc": 0.2976825803127993 + }, + { + "epoch": 4.946350043975373, + "grad_norm": 0.17676784923185568, + "learning_rate": 0.00044864645691206875, + "loss": 3.0527517795562744, + "step": 8438, + "token_acc": 0.2905328926517861 + }, + { + "epoch": 4.9469363822925825, + "grad_norm": 0.17727265723119953, + "learning_rate": 0.0004486317445598293, + "loss": 3.0688486099243164, + "step": 8439, + "token_acc": 0.2888766490233483 + }, + { + "epoch": 4.947522720609792, + "grad_norm": 0.19900594804450059, + "learning_rate": 0.00044861703034171233, + "loss": 3.031952142715454, + "step": 8440, + "token_acc": 0.2955611812617907 + }, + { + "epoch": 4.948109058927001, + "grad_norm": 0.23370636817737342, + "learning_rate": 0.00044860231425785605, + "loss": 3.0551018714904785, + "step": 8441, + "token_acc": 0.29185408371892824 + }, + { + "epoch": 4.94869539724421, + "grad_norm": 0.17542344362997406, + "learning_rate": 0.0004485875963083985, + "loss": 3.0284972190856934, + "step": 8442, + "token_acc": 0.2941579600054223 + }, + { + "epoch": 4.949281735561419, + "grad_norm": 0.2165878522868259, + "learning_rate": 0.0004485728764934782, + "loss": 3.058781147003174, + "step": 8443, + "token_acc": 0.29124957260461326 + }, + { + "epoch": 4.949868073878628, + "grad_norm": 0.2795911496589915, + "learning_rate": 0.0004485581548132333, + "loss": 3.048804521560669, + "step": 8444, + "token_acc": 0.29366312420669494 + }, + { + "epoch": 4.950454412195837, + "grad_norm": 0.2341740046727812, + "learning_rate": 0.00044854343126780205, + "loss": 3.023752212524414, + "step": 8445, + "token_acc": 0.2948938825535808 + }, + { + "epoch": 4.951040750513046, + "grad_norm": 0.17603102858021652, + "learning_rate": 0.00044852870585732285, + "loss": 3.08232045173645, + "step": 8446, + "token_acc": 0.2891856876520713 + }, + { + "epoch": 4.951627088830255, + "grad_norm": 0.22981735979831966, + "learning_rate": 0.000448513978581934, + "loss": 3.035311222076416, + "step": 8447, + "token_acc": 0.2942813590740689 + }, + { + "epoch": 4.9522134271474645, + "grad_norm": 0.1935709290079015, + "learning_rate": 0.00044849924944177376, + "loss": 3.0725202560424805, + "step": 8448, + "token_acc": 0.2883824070325297 + }, + { + "epoch": 4.952799765464674, + "grad_norm": 0.19138312580724745, + "learning_rate": 0.00044848451843698054, + "loss": 3.099198579788208, + "step": 8449, + "token_acc": 0.28619535197200535 + }, + { + "epoch": 4.953386103781882, + "grad_norm": 0.1954819700702093, + "learning_rate": 0.0004484697855676928, + "loss": 3.038224458694458, + "step": 8450, + "token_acc": 0.2943162460930906 + }, + { + "epoch": 4.953972442099091, + "grad_norm": 0.18478407726279455, + "learning_rate": 0.00044845505083404883, + "loss": 3.0504660606384277, + "step": 8451, + "token_acc": 0.2914338553036333 + }, + { + "epoch": 4.9545587804163, + "grad_norm": 0.21982256431040356, + "learning_rate": 0.0004484403142361871, + "loss": 3.073812484741211, + "step": 8452, + "token_acc": 0.28880761549036393 + }, + { + "epoch": 4.955145118733509, + "grad_norm": 0.15150478860054567, + "learning_rate": 0.000448425575774246, + "loss": 3.0459184646606445, + "step": 8453, + "token_acc": 0.29251594992805025 + }, + { + "epoch": 4.955731457050718, + "grad_norm": 0.25279749122182954, + "learning_rate": 0.000448410835448364, + "loss": 3.0496554374694824, + "step": 8454, + "token_acc": 0.29421970768505423 + }, + { + "epoch": 4.956317795367927, + "grad_norm": 0.185920640670047, + "learning_rate": 0.0004483960932586796, + "loss": 3.043316125869751, + "step": 8455, + "token_acc": 0.2925231327408219 + }, + { + "epoch": 4.9569041336851365, + "grad_norm": 0.21280155707736984, + "learning_rate": 0.00044838134920533113, + "loss": 3.0720698833465576, + "step": 8456, + "token_acc": 0.2875225577555225 + }, + { + "epoch": 4.957490472002346, + "grad_norm": 0.1635273802547219, + "learning_rate": 0.00044836660328845734, + "loss": 3.001539468765259, + "step": 8457, + "token_acc": 0.3003304689305863 + }, + { + "epoch": 4.958076810319555, + "grad_norm": 0.23599160678515302, + "learning_rate": 0.00044835185550819656, + "loss": 3.0260205268859863, + "step": 8458, + "token_acc": 0.2951222933377665 + }, + { + "epoch": 4.958663148636763, + "grad_norm": 0.17374224714152364, + "learning_rate": 0.00044833710586468734, + "loss": 3.0334410667419434, + "step": 8459, + "token_acc": 0.29562775250132745 + }, + { + "epoch": 4.959249486953972, + "grad_norm": 0.18994841096833417, + "learning_rate": 0.00044832235435806836, + "loss": 3.0350823402404785, + "step": 8460, + "token_acc": 0.2939990953544578 + }, + { + "epoch": 4.959835825271181, + "grad_norm": 0.20373880902726077, + "learning_rate": 0.000448307600988478, + "loss": 3.0292789936065674, + "step": 8461, + "token_acc": 0.29658730675699146 + }, + { + "epoch": 4.96042216358839, + "grad_norm": 0.19160635095151796, + "learning_rate": 0.000448292845756055, + "loss": 3.0338363647460938, + "step": 8462, + "token_acc": 0.29387773853033056 + }, + { + "epoch": 4.961008501905599, + "grad_norm": 0.18814175247275405, + "learning_rate": 0.00044827808866093795, + "loss": 3.0172781944274902, + "step": 8463, + "token_acc": 0.2958845948978906 + }, + { + "epoch": 4.9615948402228085, + "grad_norm": 0.15790413504583187, + "learning_rate": 0.00044826332970326546, + "loss": 3.082944631576538, + "step": 8464, + "token_acc": 0.29010316783380147 + }, + { + "epoch": 4.962181178540018, + "grad_norm": 0.19753814863712896, + "learning_rate": 0.0004482485688831761, + "loss": 3.0133426189422607, + "step": 8465, + "token_acc": 0.2969017040230418 + }, + { + "epoch": 4.962767516857227, + "grad_norm": 0.1783821299043102, + "learning_rate": 0.00044823380620080856, + "loss": 3.0817551612854004, + "step": 8466, + "token_acc": 0.2869654310574562 + }, + { + "epoch": 4.963353855174436, + "grad_norm": 0.154776354743497, + "learning_rate": 0.0004482190416563016, + "loss": 3.047630786895752, + "step": 8467, + "token_acc": 0.292778070365666 + }, + { + "epoch": 4.963940193491645, + "grad_norm": 0.18721079148633007, + "learning_rate": 0.0004482042752497937, + "loss": 3.0521769523620605, + "step": 8468, + "token_acc": 0.2925576606157613 + }, + { + "epoch": 4.964526531808854, + "grad_norm": 0.16812589410650702, + "learning_rate": 0.00044818950698142384, + "loss": 3.0386338233947754, + "step": 8469, + "token_acc": 0.29484617263153695 + }, + { + "epoch": 4.965112870126063, + "grad_norm": 0.16126588716704487, + "learning_rate": 0.00044817473685133057, + "loss": 3.016763925552368, + "step": 8470, + "token_acc": 0.29731926951977494 + }, + { + "epoch": 4.965699208443271, + "grad_norm": 0.1861073486753079, + "learning_rate": 0.0004481599648596528, + "loss": 3.029355764389038, + "step": 8471, + "token_acc": 0.2959844679997769 + }, + { + "epoch": 4.9662855467604805, + "grad_norm": 0.15954852221429763, + "learning_rate": 0.00044814519100652906, + "loss": 2.9987335205078125, + "step": 8472, + "token_acc": 0.2990952068524661 + }, + { + "epoch": 4.96687188507769, + "grad_norm": 0.1508549938328051, + "learning_rate": 0.0004481304152920983, + "loss": 3.091033935546875, + "step": 8473, + "token_acc": 0.2885059395234034 + }, + { + "epoch": 4.967458223394899, + "grad_norm": 0.1940092176792674, + "learning_rate": 0.0004481156377164993, + "loss": 3.0040957927703857, + "step": 8474, + "token_acc": 0.2998250481654207 + }, + { + "epoch": 4.968044561712108, + "grad_norm": 0.15989476810568515, + "learning_rate": 0.00044810085827987084, + "loss": 3.0267391204833984, + "step": 8475, + "token_acc": 0.2978285471043834 + }, + { + "epoch": 4.968630900029317, + "grad_norm": 0.1607073915134726, + "learning_rate": 0.00044808607698235175, + "loss": 3.027008295059204, + "step": 8476, + "token_acc": 0.29731806510860803 + }, + { + "epoch": 4.969217238346526, + "grad_norm": 0.18141320557326815, + "learning_rate": 0.0004480712938240809, + "loss": 3.039196491241455, + "step": 8477, + "token_acc": 0.29435533621221466 + }, + { + "epoch": 4.969803576663735, + "grad_norm": 0.16751714163675135, + "learning_rate": 0.0004480565088051971, + "loss": 3.0556774139404297, + "step": 8478, + "token_acc": 0.2915108168875951 + }, + { + "epoch": 4.970389914980944, + "grad_norm": 0.19943968867100728, + "learning_rate": 0.00044804172192583936, + "loss": 3.064584732055664, + "step": 8479, + "token_acc": 0.2908790668259434 + }, + { + "epoch": 4.970976253298153, + "grad_norm": 0.20134586599079454, + "learning_rate": 0.00044802693318614644, + "loss": 3.0471487045288086, + "step": 8480, + "token_acc": 0.2918769141776977 + }, + { + "epoch": 4.971562591615362, + "grad_norm": 0.22889246757065373, + "learning_rate": 0.0004480121425862574, + "loss": 3.017061710357666, + "step": 8481, + "token_acc": 0.29851934322518886 + }, + { + "epoch": 4.972148929932571, + "grad_norm": 0.251719543799185, + "learning_rate": 0.00044799735012631103, + "loss": 3.0218594074249268, + "step": 8482, + "token_acc": 0.29698731737039635 + }, + { + "epoch": 4.97273526824978, + "grad_norm": 0.31610246255507385, + "learning_rate": 0.0004479825558064464, + "loss": 3.0540881156921387, + "step": 8483, + "token_acc": 0.2905552298281024 + }, + { + "epoch": 4.973321606566989, + "grad_norm": 0.31995209516591117, + "learning_rate": 0.00044796775962680245, + "loss": 3.0387189388275146, + "step": 8484, + "token_acc": 0.29417174238121785 + }, + { + "epoch": 4.973907944884198, + "grad_norm": 0.19001130653655088, + "learning_rate": 0.00044795296158751816, + "loss": 3.0949974060058594, + "step": 8485, + "token_acc": 0.28550962762369864 + }, + { + "epoch": 4.974494283201407, + "grad_norm": 0.338893150316457, + "learning_rate": 0.0004479381616887325, + "loss": 3.0726876258850098, + "step": 8486, + "token_acc": 0.2897717698601412 + }, + { + "epoch": 4.975080621518616, + "grad_norm": 0.2488511223537283, + "learning_rate": 0.0004479233599305846, + "loss": 3.054105758666992, + "step": 8487, + "token_acc": 0.2928103917294209 + }, + { + "epoch": 4.975666959835825, + "grad_norm": 0.2031860193183852, + "learning_rate": 0.0004479085563132134, + "loss": 3.0231683254241943, + "step": 8488, + "token_acc": 0.2981127383255043 + }, + { + "epoch": 4.9762532981530345, + "grad_norm": 0.19638629244786235, + "learning_rate": 0.000447893750836758, + "loss": 3.010496139526367, + "step": 8489, + "token_acc": 0.29813276712670955 + }, + { + "epoch": 4.976839636470244, + "grad_norm": 0.20089546929108917, + "learning_rate": 0.00044787894350135747, + "loss": 3.0292534828186035, + "step": 8490, + "token_acc": 0.29662090238766775 + }, + { + "epoch": 4.977425974787453, + "grad_norm": 0.1738929127512271, + "learning_rate": 0.00044786413430715087, + "loss": 3.0659685134887695, + "step": 8491, + "token_acc": 0.29235555749119435 + }, + { + "epoch": 4.978012313104662, + "grad_norm": 0.23466258092596798, + "learning_rate": 0.0004478493232542774, + "loss": 2.99967098236084, + "step": 8492, + "token_acc": 0.2996582449056971 + }, + { + "epoch": 4.97859865142187, + "grad_norm": 0.1971653290746943, + "learning_rate": 0.0004478345103428761, + "loss": 3.011901617050171, + "step": 8493, + "token_acc": 0.2988665145870601 + }, + { + "epoch": 4.979184989739079, + "grad_norm": 0.19890948915571374, + "learning_rate": 0.00044781969557308634, + "loss": 3.0285441875457764, + "step": 8494, + "token_acc": 0.29512861079922426 + }, + { + "epoch": 4.979771328056288, + "grad_norm": 0.18631944461211597, + "learning_rate": 0.00044780487894504695, + "loss": 2.9871339797973633, + "step": 8495, + "token_acc": 0.3008657274693057 + }, + { + "epoch": 4.980357666373497, + "grad_norm": 0.19414259449493296, + "learning_rate": 0.00044779006045889727, + "loss": 3.06040358543396, + "step": 8496, + "token_acc": 0.29125743828827144 + }, + { + "epoch": 4.9809440046907065, + "grad_norm": 0.18315458347144067, + "learning_rate": 0.0004477752401147765, + "loss": 3.048372983932495, + "step": 8497, + "token_acc": 0.29278189782679964 + }, + { + "epoch": 4.981530343007916, + "grad_norm": 0.19955835358100418, + "learning_rate": 0.0004477604179128238, + "loss": 3.068040370941162, + "step": 8498, + "token_acc": 0.29015434546668895 + }, + { + "epoch": 4.982116681325125, + "grad_norm": 0.20553149716816185, + "learning_rate": 0.0004477455938531786, + "loss": 3.0309255123138428, + "step": 8499, + "token_acc": 0.29595542198847985 + }, + { + "epoch": 4.982703019642334, + "grad_norm": 0.2061509054318814, + "learning_rate": 0.00044773076793597997, + "loss": 3.1177492141723633, + "step": 8500, + "token_acc": 0.28390821851157877 + }, + { + "epoch": 4.983289357959543, + "grad_norm": 0.1966155042669467, + "learning_rate": 0.00044771594016136717, + "loss": 3.032310724258423, + "step": 8501, + "token_acc": 0.2932850585318934 + }, + { + "epoch": 4.983875696276751, + "grad_norm": 0.19039385888055835, + "learning_rate": 0.0004477011105294796, + "loss": 3.033277988433838, + "step": 8502, + "token_acc": 0.2946482450294638 + }, + { + "epoch": 4.98446203459396, + "grad_norm": 0.22163328495939572, + "learning_rate": 0.00044768627904045647, + "loss": 3.1107263565063477, + "step": 8503, + "token_acc": 0.2842375717017208 + }, + { + "epoch": 4.985048372911169, + "grad_norm": 0.16008311321290122, + "learning_rate": 0.00044767144569443705, + "loss": 3.102465867996216, + "step": 8504, + "token_acc": 0.28646643658371573 + }, + { + "epoch": 4.9856347112283785, + "grad_norm": 0.1812685556242146, + "learning_rate": 0.0004476566104915609, + "loss": 3.0310678482055664, + "step": 8505, + "token_acc": 0.29558499180972775 + }, + { + "epoch": 4.986221049545588, + "grad_norm": 0.1606980837395255, + "learning_rate": 0.00044764177343196716, + "loss": 3.022752523422241, + "step": 8506, + "token_acc": 0.2974833614882055 + }, + { + "epoch": 4.986807387862797, + "grad_norm": 0.1745925139035418, + "learning_rate": 0.0004476269345157953, + "loss": 3.089613437652588, + "step": 8507, + "token_acc": 0.2873136485215372 + }, + { + "epoch": 4.987393726180006, + "grad_norm": 0.17011879919816955, + "learning_rate": 0.00044761209374318467, + "loss": 3.005892515182495, + "step": 8508, + "token_acc": 0.298244565739805 + }, + { + "epoch": 4.987980064497215, + "grad_norm": 0.17569017120956915, + "learning_rate": 0.00044759725111427476, + "loss": 3.094682455062866, + "step": 8509, + "token_acc": 0.28639099286793285 + }, + { + "epoch": 4.988566402814424, + "grad_norm": 0.21673600884941047, + "learning_rate": 0.0004475824066292049, + "loss": 3.0469343662261963, + "step": 8510, + "token_acc": 0.2944642465525396 + }, + { + "epoch": 4.989152741131633, + "grad_norm": 0.21086889051491386, + "learning_rate": 0.00044756756028811463, + "loss": 3.049027919769287, + "step": 8511, + "token_acc": 0.29565028250635705 + }, + { + "epoch": 4.989739079448842, + "grad_norm": 0.18365389741113794, + "learning_rate": 0.00044755271209114336, + "loss": 3.021958827972412, + "step": 8512, + "token_acc": 0.2965382455006425 + }, + { + "epoch": 4.990325417766051, + "grad_norm": 0.23840216935179523, + "learning_rate": 0.0004475378620384305, + "loss": 3.031259059906006, + "step": 8513, + "token_acc": 0.29362008254136535 + }, + { + "epoch": 4.99091175608326, + "grad_norm": 0.21317469902881198, + "learning_rate": 0.00044752301013011557, + "loss": 3.0104594230651855, + "step": 8514, + "token_acc": 0.2964538236458325 + }, + { + "epoch": 4.991498094400469, + "grad_norm": 0.16443007184496214, + "learning_rate": 0.0004475081563663382, + "loss": 3.0685951709747314, + "step": 8515, + "token_acc": 0.2900460483228155 + }, + { + "epoch": 4.992084432717678, + "grad_norm": 0.24284295559354518, + "learning_rate": 0.00044749330074723786, + "loss": 3.0139474868774414, + "step": 8516, + "token_acc": 0.2965865043971158 + }, + { + "epoch": 4.992670771034887, + "grad_norm": 0.22586884255268, + "learning_rate": 0.00044747844327295406, + "loss": 3.054715871810913, + "step": 8517, + "token_acc": 0.29274067784155394 + }, + { + "epoch": 4.993257109352096, + "grad_norm": 0.17839396651015887, + "learning_rate": 0.0004474635839436264, + "loss": 3.0711936950683594, + "step": 8518, + "token_acc": 0.29162238867032214 + }, + { + "epoch": 4.993843447669305, + "grad_norm": 0.24230121325702328, + "learning_rate": 0.0004474487227593944, + "loss": 3.076457977294922, + "step": 8519, + "token_acc": 0.2886245466571637 + }, + { + "epoch": 4.994429785986514, + "grad_norm": 0.26819486543111354, + "learning_rate": 0.00044743385972039786, + "loss": 3.0565733909606934, + "step": 8520, + "token_acc": 0.29193786090030366 + }, + { + "epoch": 4.995016124303723, + "grad_norm": 0.23128099413474382, + "learning_rate": 0.0004474189948267761, + "loss": 3.0763425827026367, + "step": 8521, + "token_acc": 0.2890938324007975 + }, + { + "epoch": 4.9956024626209325, + "grad_norm": 0.18968856147067403, + "learning_rate": 0.00044740412807866897, + "loss": 3.008232831954956, + "step": 8522, + "token_acc": 0.2992816946993663 + }, + { + "epoch": 4.996188800938142, + "grad_norm": 0.21347237809077374, + "learning_rate": 0.00044738925947621603, + "loss": 3.0376553535461426, + "step": 8523, + "token_acc": 0.29335778002516966 + }, + { + "epoch": 4.99677513925535, + "grad_norm": 0.22796372417114907, + "learning_rate": 0.0004473743890195571, + "loss": 3.0593647956848145, + "step": 8524, + "token_acc": 0.2915933351974604 + }, + { + "epoch": 4.997361477572559, + "grad_norm": 0.18002496742290655, + "learning_rate": 0.0004473595167088316, + "loss": 3.0697476863861084, + "step": 8525, + "token_acc": 0.2898295616768251 + }, + { + "epoch": 4.997947815889768, + "grad_norm": 0.18725066759491407, + "learning_rate": 0.00044734464254417945, + "loss": 3.061734676361084, + "step": 8526, + "token_acc": 0.2913460570047336 + }, + { + "epoch": 4.998534154206977, + "grad_norm": 0.20795528649445905, + "learning_rate": 0.0004473297665257403, + "loss": 3.073535442352295, + "step": 8527, + "token_acc": 0.28885737470338124 + }, + { + "epoch": 4.999120492524186, + "grad_norm": 0.1965688890500879, + "learning_rate": 0.0004473148886536539, + "loss": 3.06752872467041, + "step": 8528, + "token_acc": 0.29109355184860014 + }, + { + "epoch": 4.999706830841395, + "grad_norm": 0.21026564864050137, + "learning_rate": 0.00044730000892806, + "loss": 3.078951597213745, + "step": 8529, + "token_acc": 0.2876585085842509 + }, + { + "epoch": 5.0, + "grad_norm": 0.2651592235412727, + "learning_rate": 0.00044728512734909845, + "loss": 3.0628275871276855, + "step": 8530, + "token_acc": 0.2918714146552657 + }, + { + "epoch": 5.0, + "eval_loss": 3.0703072547912598, + "eval_runtime": 6.5296, + "eval_samples_per_second": 39.206, + "eval_steps_per_second": 4.901, + "eval_token_acc": 0.29058256050797504, + "step": 8530 + }, + { + "epoch": 5.000586338317209, + "grad_norm": 0.21651653435947588, + "learning_rate": 0.00044727024391690885, + "loss": 2.9531311988830566, + "step": 8531, + "token_acc": 0.3049516339869281 + }, + { + "epoch": 5.001172676634418, + "grad_norm": 0.22572002856123086, + "learning_rate": 0.00044725535863163125, + "loss": 2.9991421699523926, + "step": 8532, + "token_acc": 0.2985009804482495 + }, + { + "epoch": 5.001759014951627, + "grad_norm": 0.24281064474440098, + "learning_rate": 0.0004472404714934053, + "loss": 3.0120253562927246, + "step": 8533, + "token_acc": 0.29733722155780185 + }, + { + "epoch": 5.0023453532688364, + "grad_norm": 0.21846299797166016, + "learning_rate": 0.00044722558250237087, + "loss": 3.0014803409576416, + "step": 8534, + "token_acc": 0.2964582164818813 + }, + { + "epoch": 5.002931691586046, + "grad_norm": 0.26891261717815335, + "learning_rate": 0.0004472106916586679, + "loss": 3.0168392658233643, + "step": 8535, + "token_acc": 0.29485815513858293 + }, + { + "epoch": 5.003518029903254, + "grad_norm": 0.29370236936690214, + "learning_rate": 0.00044719579896243625, + "loss": 2.9636363983154297, + "step": 8536, + "token_acc": 0.30344725248297355 + }, + { + "epoch": 5.004104368220463, + "grad_norm": 0.23663267126185183, + "learning_rate": 0.00044718090441381574, + "loss": 2.9611024856567383, + "step": 8537, + "token_acc": 0.3056797550680636 + }, + { + "epoch": 5.004690706537672, + "grad_norm": 0.23502856030757432, + "learning_rate": 0.00044716600801294635, + "loss": 2.9888916015625, + "step": 8538, + "token_acc": 0.2995657480357863 + }, + { + "epoch": 5.005277044854881, + "grad_norm": 0.2785406970987534, + "learning_rate": 0.0004471511097599681, + "loss": 2.949319839477539, + "step": 8539, + "token_acc": 0.30699204811367703 + }, + { + "epoch": 5.00586338317209, + "grad_norm": 0.2104951929700274, + "learning_rate": 0.0004471362096550207, + "loss": 2.9499707221984863, + "step": 8540, + "token_acc": 0.30632604347393666 + }, + { + "epoch": 5.006449721489299, + "grad_norm": 0.23910648249618885, + "learning_rate": 0.0004471213076982443, + "loss": 3.025766611099243, + "step": 8541, + "token_acc": 0.29633677725319163 + }, + { + "epoch": 5.0070360598065085, + "grad_norm": 0.18387754474718215, + "learning_rate": 0.0004471064038897789, + "loss": 2.986368417739868, + "step": 8542, + "token_acc": 0.2996799440574487 + }, + { + "epoch": 5.007622398123718, + "grad_norm": 0.2511605384709285, + "learning_rate": 0.00044709149822976435, + "loss": 2.977336883544922, + "step": 8543, + "token_acc": 0.30226935850791464 + }, + { + "epoch": 5.008208736440927, + "grad_norm": 0.17679463789528796, + "learning_rate": 0.00044707659071834086, + "loss": 2.9798312187194824, + "step": 8544, + "token_acc": 0.3011056879708022 + }, + { + "epoch": 5.008795074758136, + "grad_norm": 0.23623273149663992, + "learning_rate": 0.0004470616813556483, + "loss": 2.9449620246887207, + "step": 8545, + "token_acc": 0.30611298525260144 + }, + { + "epoch": 5.009381413075345, + "grad_norm": 0.1718111370840815, + "learning_rate": 0.00044704677014182676, + "loss": 2.9174575805664062, + "step": 8546, + "token_acc": 0.3086948525982212 + }, + { + "epoch": 5.009967751392553, + "grad_norm": 0.20379763075645269, + "learning_rate": 0.00044703185707701637, + "loss": 2.99887752532959, + "step": 8547, + "token_acc": 0.2984795981054149 + }, + { + "epoch": 5.010554089709762, + "grad_norm": 0.1741586693323542, + "learning_rate": 0.0004470169421613572, + "loss": 2.9578797817230225, + "step": 8548, + "token_acc": 0.3043419157340528 + }, + { + "epoch": 5.011140428026971, + "grad_norm": 0.19970138004436996, + "learning_rate": 0.00044700202539498933, + "loss": 2.9282913208007812, + "step": 8549, + "token_acc": 0.30906297369479363 + }, + { + "epoch": 5.0117267663441805, + "grad_norm": 0.1728795553689899, + "learning_rate": 0.00044698710677805285, + "loss": 2.9607439041137695, + "step": 8550, + "token_acc": 0.30377471854248167 + }, + { + "epoch": 5.01231310466139, + "grad_norm": 0.21480135240567783, + "learning_rate": 0.00044697218631068803, + "loss": 3.0034103393554688, + "step": 8551, + "token_acc": 0.2972327044025157 + }, + { + "epoch": 5.012899442978599, + "grad_norm": 0.1783247378388419, + "learning_rate": 0.0004469572639930349, + "loss": 3.0158700942993164, + "step": 8552, + "token_acc": 0.2955437111047521 + }, + { + "epoch": 5.013485781295808, + "grad_norm": 0.20283435136031588, + "learning_rate": 0.0004469423398252337, + "loss": 2.9621500968933105, + "step": 8553, + "token_acc": 0.3042521271294611 + }, + { + "epoch": 5.014072119613017, + "grad_norm": 0.19167515732003332, + "learning_rate": 0.00044692741380742454, + "loss": 2.9783835411071777, + "step": 8554, + "token_acc": 0.3015620935613269 + }, + { + "epoch": 5.014658457930226, + "grad_norm": 0.26863259409179513, + "learning_rate": 0.00044691248593974774, + "loss": 2.9493565559387207, + "step": 8555, + "token_acc": 0.30656807751147375 + }, + { + "epoch": 5.015244796247435, + "grad_norm": 0.2375637162298047, + "learning_rate": 0.00044689755622234344, + "loss": 2.97379732131958, + "step": 8556, + "token_acc": 0.301923826855583 + }, + { + "epoch": 5.015831134564644, + "grad_norm": 0.1840828937392791, + "learning_rate": 0.000446882624655352, + "loss": 2.9432685375213623, + "step": 8557, + "token_acc": 0.3075826990368313 + }, + { + "epoch": 5.0164174728818525, + "grad_norm": 0.2436454050862875, + "learning_rate": 0.00044686769123891354, + "loss": 3.0069971084594727, + "step": 8558, + "token_acc": 0.29715413054960077 + }, + { + "epoch": 5.017003811199062, + "grad_norm": 0.1661001629265361, + "learning_rate": 0.0004468527559731684, + "loss": 2.9651999473571777, + "step": 8559, + "token_acc": 0.3025444503214257 + }, + { + "epoch": 5.017590149516271, + "grad_norm": 0.1949425722001224, + "learning_rate": 0.0004468378188582569, + "loss": 2.979250907897949, + "step": 8560, + "token_acc": 0.30091147694872367 + }, + { + "epoch": 5.01817648783348, + "grad_norm": 0.1913962085446768, + "learning_rate": 0.00044682287989431934, + "loss": 2.978869915008545, + "step": 8561, + "token_acc": 0.3041833384290215 + }, + { + "epoch": 5.018762826150689, + "grad_norm": 0.19531430551856582, + "learning_rate": 0.00044680793908149596, + "loss": 3.003464698791504, + "step": 8562, + "token_acc": 0.2973874303408747 + }, + { + "epoch": 5.019349164467898, + "grad_norm": 0.20629626729860764, + "learning_rate": 0.0004467929964199273, + "loss": 2.987518310546875, + "step": 8563, + "token_acc": 0.2981450631628971 + }, + { + "epoch": 5.019935502785107, + "grad_norm": 0.20383607435168705, + "learning_rate": 0.0004467780519097536, + "loss": 2.96308970451355, + "step": 8564, + "token_acc": 0.3037341846218325 + }, + { + "epoch": 5.020521841102316, + "grad_norm": 0.17526559712721163, + "learning_rate": 0.00044676310555111524, + "loss": 2.9877824783325195, + "step": 8565, + "token_acc": 0.29959882726096576 + }, + { + "epoch": 5.021108179419525, + "grad_norm": 0.19633770639790632, + "learning_rate": 0.0004467481573441527, + "loss": 2.978888750076294, + "step": 8566, + "token_acc": 0.3016042345069544 + }, + { + "epoch": 5.0216945177367345, + "grad_norm": 0.1774046697756081, + "learning_rate": 0.0004467332072890062, + "loss": 2.9549670219421387, + "step": 8567, + "token_acc": 0.30421705034151275 + }, + { + "epoch": 5.022280856053943, + "grad_norm": 0.16027953312480472, + "learning_rate": 0.0004467182553858165, + "loss": 3.004926919937134, + "step": 8568, + "token_acc": 0.29784870559935295 + }, + { + "epoch": 5.022867194371152, + "grad_norm": 0.18253541399362658, + "learning_rate": 0.0004467033016347238, + "loss": 2.9257240295410156, + "step": 8569, + "token_acc": 0.3101169147820566 + }, + { + "epoch": 5.023453532688361, + "grad_norm": 0.16859814205712167, + "learning_rate": 0.0004466883460358686, + "loss": 2.9686279296875, + "step": 8570, + "token_acc": 0.30419109774129405 + }, + { + "epoch": 5.02403987100557, + "grad_norm": 0.19645762569258512, + "learning_rate": 0.0004466733885893915, + "loss": 2.975919723510742, + "step": 8571, + "token_acc": 0.3015831839971624 + }, + { + "epoch": 5.024626209322779, + "grad_norm": 0.2081157755620637, + "learning_rate": 0.00044665842929543287, + "loss": 2.998379707336426, + "step": 8572, + "token_acc": 0.29943394340486196 + }, + { + "epoch": 5.025212547639988, + "grad_norm": 0.20566448724773456, + "learning_rate": 0.0004466434681541334, + "loss": 2.9780659675598145, + "step": 8573, + "token_acc": 0.3004716055249646 + }, + { + "epoch": 5.025798885957197, + "grad_norm": 0.2012513031776449, + "learning_rate": 0.0004466285051656334, + "loss": 2.9635045528411865, + "step": 8574, + "token_acc": 0.3028517599995959 + }, + { + "epoch": 5.0263852242744065, + "grad_norm": 0.1864288736354299, + "learning_rate": 0.0004466135403300736, + "loss": 2.9872422218322754, + "step": 8575, + "token_acc": 0.30066036079759606 + }, + { + "epoch": 5.026971562591616, + "grad_norm": 0.19095984973709618, + "learning_rate": 0.0004465985736475946, + "loss": 2.9934439659118652, + "step": 8576, + "token_acc": 0.29861516313556624 + }, + { + "epoch": 5.027557900908825, + "grad_norm": 0.25724294608878046, + "learning_rate": 0.0004465836051183368, + "loss": 2.990786552429199, + "step": 8577, + "token_acc": 0.3000929178590243 + }, + { + "epoch": 5.028144239226034, + "grad_norm": 0.24779323184876134, + "learning_rate": 0.000446568634742441, + "loss": 2.9668638706207275, + "step": 8578, + "token_acc": 0.3024170853757125 + }, + { + "epoch": 5.028730577543242, + "grad_norm": 0.21021481836487746, + "learning_rate": 0.00044655366252004775, + "loss": 2.95998477935791, + "step": 8579, + "token_acc": 0.30392259771068086 + }, + { + "epoch": 5.029316915860451, + "grad_norm": 0.18225132934618143, + "learning_rate": 0.00044653868845129767, + "loss": 2.9551820755004883, + "step": 8580, + "token_acc": 0.303647634650001 + }, + { + "epoch": 5.02990325417766, + "grad_norm": 0.23513417208296597, + "learning_rate": 0.0004465237125363315, + "loss": 2.8932971954345703, + "step": 8581, + "token_acc": 0.315813628445965 + }, + { + "epoch": 5.030489592494869, + "grad_norm": 0.18365637324019077, + "learning_rate": 0.00044650873477528985, + "loss": 3.0181431770324707, + "step": 8582, + "token_acc": 0.29653990699712335 + }, + { + "epoch": 5.0310759308120785, + "grad_norm": 0.23236298813488257, + "learning_rate": 0.0004464937551683134, + "loss": 2.992175579071045, + "step": 8583, + "token_acc": 0.30067313073048946 + }, + { + "epoch": 5.031662269129288, + "grad_norm": 0.21459267018536887, + "learning_rate": 0.0004464787737155429, + "loss": 2.9586470127105713, + "step": 8584, + "token_acc": 0.30397193004605555 + }, + { + "epoch": 5.032248607446497, + "grad_norm": 0.19500355535004943, + "learning_rate": 0.00044646379041711915, + "loss": 2.9836082458496094, + "step": 8585, + "token_acc": 0.29986052261752555 + }, + { + "epoch": 5.032834945763706, + "grad_norm": 0.2225562938573435, + "learning_rate": 0.0004464488052731828, + "loss": 2.9796080589294434, + "step": 8586, + "token_acc": 0.30089126745311995 + }, + { + "epoch": 5.033421284080915, + "grad_norm": 0.16930396836052575, + "learning_rate": 0.0004464338182838746, + "loss": 2.990751266479492, + "step": 8587, + "token_acc": 0.2982945003545349 + }, + { + "epoch": 5.034007622398124, + "grad_norm": 0.20601011127730254, + "learning_rate": 0.0004464188294493354, + "loss": 2.994412899017334, + "step": 8588, + "token_acc": 0.29891394845418046 + }, + { + "epoch": 5.034593960715333, + "grad_norm": 0.18553739779654535, + "learning_rate": 0.000446403838769706, + "loss": 2.961716651916504, + "step": 8589, + "token_acc": 0.30247082671341696 + }, + { + "epoch": 5.035180299032541, + "grad_norm": 0.16914231102705346, + "learning_rate": 0.00044638884624512723, + "loss": 2.9398536682128906, + "step": 8590, + "token_acc": 0.30648004064911555 + }, + { + "epoch": 5.0357666373497505, + "grad_norm": 0.1904742495773429, + "learning_rate": 0.0004463738518757398, + "loss": 2.9772722721099854, + "step": 8591, + "token_acc": 0.3009807586760951 + }, + { + "epoch": 5.03635297566696, + "grad_norm": 0.16086845018883977, + "learning_rate": 0.0004463588556616847, + "loss": 2.981649398803711, + "step": 8592, + "token_acc": 0.2995121369542 + }, + { + "epoch": 5.036939313984169, + "grad_norm": 0.19569017217664667, + "learning_rate": 0.0004463438576031027, + "loss": 2.9715065956115723, + "step": 8593, + "token_acc": 0.3019302854957473 + }, + { + "epoch": 5.037525652301378, + "grad_norm": 0.18410594082517043, + "learning_rate": 0.0004463288577001347, + "loss": 3.00132417678833, + "step": 8594, + "token_acc": 0.29803745560725103 + }, + { + "epoch": 5.038111990618587, + "grad_norm": 0.16008511298314984, + "learning_rate": 0.0004463138559529217, + "loss": 2.993732452392578, + "step": 8595, + "token_acc": 0.2983107098316894 + }, + { + "epoch": 5.038698328935796, + "grad_norm": 0.19574003776501467, + "learning_rate": 0.0004462988523616046, + "loss": 2.980804920196533, + "step": 8596, + "token_acc": 0.2987962090813361 + }, + { + "epoch": 5.039284667253005, + "grad_norm": 0.18464562336370913, + "learning_rate": 0.0004462838469263242, + "loss": 2.982194423675537, + "step": 8597, + "token_acc": 0.2996981998824849 + }, + { + "epoch": 5.039871005570214, + "grad_norm": 0.1571993673061543, + "learning_rate": 0.00044626883964722164, + "loss": 2.926562786102295, + "step": 8598, + "token_acc": 0.30940352355891787 + }, + { + "epoch": 5.040457343887423, + "grad_norm": 0.1944283114064994, + "learning_rate": 0.0004462538305244378, + "loss": 2.9606873989105225, + "step": 8599, + "token_acc": 0.3037853127984594 + }, + { + "epoch": 5.0410436822046325, + "grad_norm": 0.2092103508225096, + "learning_rate": 0.00044623881955811365, + "loss": 2.959688186645508, + "step": 8600, + "token_acc": 0.3035425635982235 + }, + { + "epoch": 5.041630020521841, + "grad_norm": 0.17326912832992516, + "learning_rate": 0.00044622380674839025, + "loss": 2.9784727096557617, + "step": 8601, + "token_acc": 0.3003358461886079 + }, + { + "epoch": 5.04221635883905, + "grad_norm": 0.19295960945849924, + "learning_rate": 0.00044620879209540855, + "loss": 2.9652857780456543, + "step": 8602, + "token_acc": 0.3036766155582984 + }, + { + "epoch": 5.042802697156259, + "grad_norm": 0.20865610294849113, + "learning_rate": 0.0004461937755993096, + "loss": 2.995967388153076, + "step": 8603, + "token_acc": 0.29896122206856574 + }, + { + "epoch": 5.043389035473468, + "grad_norm": 0.20856190984697506, + "learning_rate": 0.0004461787572602346, + "loss": 2.997234344482422, + "step": 8604, + "token_acc": 0.298446230426761 + }, + { + "epoch": 5.043975373790677, + "grad_norm": 0.1700081823953103, + "learning_rate": 0.00044616373707832455, + "loss": 2.971435546875, + "step": 8605, + "token_acc": 0.3016519742203126 + }, + { + "epoch": 5.044561712107886, + "grad_norm": 0.16944168943075683, + "learning_rate": 0.00044614871505372044, + "loss": 2.975215196609497, + "step": 8606, + "token_acc": 0.30201850944806785 + }, + { + "epoch": 5.045148050425095, + "grad_norm": 0.20181168820853396, + "learning_rate": 0.00044613369118656353, + "loss": 2.976470947265625, + "step": 8607, + "token_acc": 0.3008271029725142 + }, + { + "epoch": 5.0457343887423045, + "grad_norm": 0.2714293895545918, + "learning_rate": 0.00044611866547699487, + "loss": 3.0086488723754883, + "step": 8608, + "token_acc": 0.2963525163185124 + }, + { + "epoch": 5.046320727059514, + "grad_norm": 0.22079845886327976, + "learning_rate": 0.0004461036379251556, + "loss": 2.9424355030059814, + "step": 8609, + "token_acc": 0.30691079574252406 + }, + { + "epoch": 5.046907065376723, + "grad_norm": 0.1842420869462325, + "learning_rate": 0.00044608860853118695, + "loss": 2.938812017440796, + "step": 8610, + "token_acc": 0.3072638157999543 + }, + { + "epoch": 5.047493403693931, + "grad_norm": 0.18522440045263966, + "learning_rate": 0.00044607357729522997, + "loss": 2.9694409370422363, + "step": 8611, + "token_acc": 0.3023615559217686 + }, + { + "epoch": 5.04807974201114, + "grad_norm": 0.23819309698712512, + "learning_rate": 0.000446058544217426, + "loss": 2.9925127029418945, + "step": 8612, + "token_acc": 0.29953019161161826 + }, + { + "epoch": 5.048666080328349, + "grad_norm": 0.22080586186445192, + "learning_rate": 0.0004460435092979162, + "loss": 2.9974308013916016, + "step": 8613, + "token_acc": 0.2993419714827196 + }, + { + "epoch": 5.049252418645558, + "grad_norm": 0.19473636781924344, + "learning_rate": 0.00044602847253684175, + "loss": 2.9720683097839355, + "step": 8614, + "token_acc": 0.3031263901676995 + }, + { + "epoch": 5.049838756962767, + "grad_norm": 0.2770266735643184, + "learning_rate": 0.000446013433934344, + "loss": 2.958238124847412, + "step": 8615, + "token_acc": 0.3034786927370683 + }, + { + "epoch": 5.0504250952799765, + "grad_norm": 0.16672914554842258, + "learning_rate": 0.0004459983934905642, + "loss": 2.960379123687744, + "step": 8616, + "token_acc": 0.3038039766604258 + }, + { + "epoch": 5.051011433597186, + "grad_norm": 0.219613956351533, + "learning_rate": 0.0004459833512056436, + "loss": 2.944220781326294, + "step": 8617, + "token_acc": 0.3077087648911027 + }, + { + "epoch": 5.051597771914395, + "grad_norm": 0.20154021212726264, + "learning_rate": 0.00044596830707972345, + "loss": 2.962651491165161, + "step": 8618, + "token_acc": 0.3034662593922553 + }, + { + "epoch": 5.052184110231604, + "grad_norm": 0.22355544683121295, + "learning_rate": 0.00044595326111294514, + "loss": 2.9767870903015137, + "step": 8619, + "token_acc": 0.30070841073947596 + }, + { + "epoch": 5.052770448548813, + "grad_norm": 0.23896038261285166, + "learning_rate": 0.00044593821330545, + "loss": 2.9746227264404297, + "step": 8620, + "token_acc": 0.3017857050106737 + }, + { + "epoch": 5.053356786866022, + "grad_norm": 0.17937357659973965, + "learning_rate": 0.0004459231636573794, + "loss": 2.9359049797058105, + "step": 8621, + "token_acc": 0.30662470039498685 + }, + { + "epoch": 5.05394312518323, + "grad_norm": 0.21005904788769492, + "learning_rate": 0.0004459081121688747, + "loss": 2.97896146774292, + "step": 8622, + "token_acc": 0.3007855594263292 + }, + { + "epoch": 5.054529463500439, + "grad_norm": 0.1868839888522517, + "learning_rate": 0.00044589305884007723, + "loss": 2.9834413528442383, + "step": 8623, + "token_acc": 0.29919687476739143 + }, + { + "epoch": 5.0551158018176485, + "grad_norm": 0.2396666052058184, + "learning_rate": 0.0004458780036711285, + "loss": 3.0094852447509766, + "step": 8624, + "token_acc": 0.2980510122628723 + }, + { + "epoch": 5.055702140134858, + "grad_norm": 0.17911013713198776, + "learning_rate": 0.00044586294666216976, + "loss": 3.035132884979248, + "step": 8625, + "token_acc": 0.2944158102275903 + }, + { + "epoch": 5.056288478452067, + "grad_norm": 0.2439051766555579, + "learning_rate": 0.0004458478878133426, + "loss": 2.957828998565674, + "step": 8626, + "token_acc": 0.30341192585532517 + }, + { + "epoch": 5.056874816769276, + "grad_norm": 0.17113305704283974, + "learning_rate": 0.00044583282712478854, + "loss": 2.998318672180176, + "step": 8627, + "token_acc": 0.2997714996863221 + }, + { + "epoch": 5.057461155086485, + "grad_norm": 0.24652215906221567, + "learning_rate": 0.00044581776459664884, + "loss": 2.976487636566162, + "step": 8628, + "token_acc": 0.30149553300775755 + }, + { + "epoch": 5.058047493403694, + "grad_norm": 0.17612599844843735, + "learning_rate": 0.00044580270022906524, + "loss": 2.991102933883667, + "step": 8629, + "token_acc": 0.2987881999270417 + }, + { + "epoch": 5.058633831720903, + "grad_norm": 0.21805217067953936, + "learning_rate": 0.00044578763402217906, + "loss": 2.986628532409668, + "step": 8630, + "token_acc": 0.2996015498337501 + }, + { + "epoch": 5.059220170038112, + "grad_norm": 0.1829586784045076, + "learning_rate": 0.0004457725659761318, + "loss": 2.9965732097625732, + "step": 8631, + "token_acc": 0.29890647935364 + }, + { + "epoch": 5.059806508355321, + "grad_norm": 0.18973636372920044, + "learning_rate": 0.0004457574960910652, + "loss": 2.953490734100342, + "step": 8632, + "token_acc": 0.30494309777131856 + }, + { + "epoch": 5.06039284667253, + "grad_norm": 0.2078350464978439, + "learning_rate": 0.00044574242436712066, + "loss": 2.955385446548462, + "step": 8633, + "token_acc": 0.30463890353189244 + }, + { + "epoch": 5.060979184989739, + "grad_norm": 0.18312587414734494, + "learning_rate": 0.00044572735080443984, + "loss": 2.985891103744507, + "step": 8634, + "token_acc": 0.2997699762434481 + }, + { + "epoch": 5.061565523306948, + "grad_norm": 0.18014611302782282, + "learning_rate": 0.00044571227540316427, + "loss": 2.977421760559082, + "step": 8635, + "token_acc": 0.3014093044400667 + }, + { + "epoch": 5.062151861624157, + "grad_norm": 0.15619999693082243, + "learning_rate": 0.0004456971981634356, + "loss": 2.956023693084717, + "step": 8636, + "token_acc": 0.3049728543525121 + }, + { + "epoch": 5.062738199941366, + "grad_norm": 0.2371325572934786, + "learning_rate": 0.00044568211908539544, + "loss": 2.952258825302124, + "step": 8637, + "token_acc": 0.30422057877647385 + }, + { + "epoch": 5.063324538258575, + "grad_norm": 0.1941031941816454, + "learning_rate": 0.00044566703816918555, + "loss": 2.9754738807678223, + "step": 8638, + "token_acc": 0.30187768446343644 + }, + { + "epoch": 5.063910876575784, + "grad_norm": 0.1588576861776656, + "learning_rate": 0.0004456519554149474, + "loss": 2.9642601013183594, + "step": 8639, + "token_acc": 0.3041995697475097 + }, + { + "epoch": 5.064497214892993, + "grad_norm": 0.20147915381564294, + "learning_rate": 0.0004456368708228228, + "loss": 3.0021042823791504, + "step": 8640, + "token_acc": 0.29708337053747286 + }, + { + "epoch": 5.0650835532102025, + "grad_norm": 0.17684925628371811, + "learning_rate": 0.0004456217843929534, + "loss": 3.0264391899108887, + "step": 8641, + "token_acc": 0.2945059279993948 + }, + { + "epoch": 5.065669891527412, + "grad_norm": 0.2139818903243028, + "learning_rate": 0.000445606696125481, + "loss": 3.0014138221740723, + "step": 8642, + "token_acc": 0.2979231801643147 + }, + { + "epoch": 5.066256229844621, + "grad_norm": 0.17499910993485562, + "learning_rate": 0.0004455916060205473, + "loss": 2.952077627182007, + "step": 8643, + "token_acc": 0.30543145118250714 + }, + { + "epoch": 5.066842568161829, + "grad_norm": 0.17815844920170426, + "learning_rate": 0.0004455765140782939, + "loss": 2.9972472190856934, + "step": 8644, + "token_acc": 0.2997598832615728 + }, + { + "epoch": 5.067428906479038, + "grad_norm": 0.18917003560753806, + "learning_rate": 0.0004455614202988628, + "loss": 2.95031476020813, + "step": 8645, + "token_acc": 0.30563063950510266 + }, + { + "epoch": 5.068015244796247, + "grad_norm": 0.1620532305910785, + "learning_rate": 0.00044554632468239567, + "loss": 2.9436497688293457, + "step": 8646, + "token_acc": 0.3063058509293967 + }, + { + "epoch": 5.068601583113456, + "grad_norm": 0.19191543765156602, + "learning_rate": 0.0004455312272290343, + "loss": 3.000683307647705, + "step": 8647, + "token_acc": 0.29723079065184327 + }, + { + "epoch": 5.069187921430665, + "grad_norm": 0.18859357856291722, + "learning_rate": 0.0004455161279389205, + "loss": 2.9740800857543945, + "step": 8648, + "token_acc": 0.3018820250328866 + }, + { + "epoch": 5.0697742597478745, + "grad_norm": 0.18570935491209128, + "learning_rate": 0.00044550102681219613, + "loss": 2.968301296234131, + "step": 8649, + "token_acc": 0.3021979305957651 + }, + { + "epoch": 5.070360598065084, + "grad_norm": 0.18548660058762137, + "learning_rate": 0.0004454859238490031, + "loss": 2.9676902294158936, + "step": 8650, + "token_acc": 0.30386588143474014 + }, + { + "epoch": 5.070946936382293, + "grad_norm": 0.24655278539567257, + "learning_rate": 0.00044547081904948324, + "loss": 2.969587802886963, + "step": 8651, + "token_acc": 0.302777904044793 + }, + { + "epoch": 5.071533274699502, + "grad_norm": 0.20740986710794485, + "learning_rate": 0.00044545571241377834, + "loss": 2.9880752563476562, + "step": 8652, + "token_acc": 0.2991293450473766 + }, + { + "epoch": 5.072119613016711, + "grad_norm": 0.17033005067289558, + "learning_rate": 0.0004454406039420305, + "loss": 2.950756788253784, + "step": 8653, + "token_acc": 0.30603406466361344 + }, + { + "epoch": 5.07270595133392, + "grad_norm": 0.2327896766457918, + "learning_rate": 0.0004454254936343816, + "loss": 2.9522461891174316, + "step": 8654, + "token_acc": 0.3041601602017047 + }, + { + "epoch": 5.073292289651128, + "grad_norm": 0.20808401365117496, + "learning_rate": 0.0004454103814909734, + "loss": 3.0387561321258545, + "step": 8655, + "token_acc": 0.2925784306797631 + }, + { + "epoch": 5.073878627968337, + "grad_norm": 0.17229705792926722, + "learning_rate": 0.00044539526751194805, + "loss": 2.992068290710449, + "step": 8656, + "token_acc": 0.299857431696822 + }, + { + "epoch": 5.0744649662855466, + "grad_norm": 0.20329672513826672, + "learning_rate": 0.00044538015169744746, + "loss": 2.9810991287231445, + "step": 8657, + "token_acc": 0.30157830519148265 + }, + { + "epoch": 5.075051304602756, + "grad_norm": 0.16240154419798736, + "learning_rate": 0.0004453650340476136, + "loss": 2.9683871269226074, + "step": 8658, + "token_acc": 0.30217312252559553 + }, + { + "epoch": 5.075637642919965, + "grad_norm": 0.20096924901561874, + "learning_rate": 0.0004453499145625885, + "loss": 3.014518976211548, + "step": 8659, + "token_acc": 0.29682944558938856 + }, + { + "epoch": 5.076223981237174, + "grad_norm": 0.17393855382729279, + "learning_rate": 0.0004453347932425142, + "loss": 3.005685329437256, + "step": 8660, + "token_acc": 0.2980601592202827 + }, + { + "epoch": 5.076810319554383, + "grad_norm": 0.2158725285054191, + "learning_rate": 0.0004453196700875327, + "loss": 2.9822983741760254, + "step": 8661, + "token_acc": 0.3007618468352555 + }, + { + "epoch": 5.077396657871592, + "grad_norm": 0.21044087792938376, + "learning_rate": 0.0004453045450977862, + "loss": 2.981567621231079, + "step": 8662, + "token_acc": 0.3014756010726135 + }, + { + "epoch": 5.077982996188801, + "grad_norm": 0.17054576406253677, + "learning_rate": 0.0004452894182734166, + "loss": 2.9881153106689453, + "step": 8663, + "token_acc": 0.2996795216484202 + }, + { + "epoch": 5.07856933450601, + "grad_norm": 0.19032653784141515, + "learning_rate": 0.00044527428961456606, + "loss": 2.939809560775757, + "step": 8664, + "token_acc": 0.30707649111507507 + }, + { + "epoch": 5.0791556728232194, + "grad_norm": 0.1923725972280376, + "learning_rate": 0.0004452591591213767, + "loss": 2.9823546409606934, + "step": 8665, + "token_acc": 0.30085225796393494 + }, + { + "epoch": 5.079742011140428, + "grad_norm": 0.1631994160241602, + "learning_rate": 0.00044524402679399066, + "loss": 2.9885072708129883, + "step": 8666, + "token_acc": 0.30099487084223453 + }, + { + "epoch": 5.080328349457637, + "grad_norm": 0.19004242145009445, + "learning_rate": 0.00044522889263255016, + "loss": 2.9761857986450195, + "step": 8667, + "token_acc": 0.29988298440640027 + }, + { + "epoch": 5.080914687774846, + "grad_norm": 0.1672695325002176, + "learning_rate": 0.0004452137566371972, + "loss": 3.0061936378479004, + "step": 8668, + "token_acc": 0.29717524317956284 + }, + { + "epoch": 5.081501026092055, + "grad_norm": 0.16657383551583727, + "learning_rate": 0.0004451986188080741, + "loss": 2.9993398189544678, + "step": 8669, + "token_acc": 0.2985622849571517 + }, + { + "epoch": 5.082087364409264, + "grad_norm": 0.18469211377288927, + "learning_rate": 0.00044518347914532296, + "loss": 2.9781854152679443, + "step": 8670, + "token_acc": 0.3016966476967222 + }, + { + "epoch": 5.082673702726473, + "grad_norm": 0.1831007185012431, + "learning_rate": 0.0004451683376490861, + "loss": 2.979910135269165, + "step": 8671, + "token_acc": 0.3009580999932528 + }, + { + "epoch": 5.083260041043682, + "grad_norm": 0.16628204293285825, + "learning_rate": 0.0004451531943195057, + "loss": 2.9878482818603516, + "step": 8672, + "token_acc": 0.29845324222247377 + }, + { + "epoch": 5.0838463793608915, + "grad_norm": 0.19004530046751614, + "learning_rate": 0.00044513804915672397, + "loss": 2.947545289993286, + "step": 8673, + "token_acc": 0.3059995131233139 + }, + { + "epoch": 5.084432717678101, + "grad_norm": 0.2495446346866189, + "learning_rate": 0.00044512290216088327, + "loss": 3.015436887741089, + "step": 8674, + "token_acc": 0.2961701676214725 + }, + { + "epoch": 5.08501905599531, + "grad_norm": 0.2917824388042082, + "learning_rate": 0.0004451077533321257, + "loss": 2.9743292331695557, + "step": 8675, + "token_acc": 0.3021603858956939 + }, + { + "epoch": 5.085605394312518, + "grad_norm": 0.190728223341427, + "learning_rate": 0.0004450926026705939, + "loss": 3.047908067703247, + "step": 8676, + "token_acc": 0.29266282379769903 + }, + { + "epoch": 5.086191732629727, + "grad_norm": 0.25614514439328046, + "learning_rate": 0.0004450774501764299, + "loss": 3.012049674987793, + "step": 8677, + "token_acc": 0.2975385315731287 + }, + { + "epoch": 5.086778070946936, + "grad_norm": 0.2555025238293874, + "learning_rate": 0.0004450622958497761, + "loss": 2.9370622634887695, + "step": 8678, + "token_acc": 0.30546119141648453 + }, + { + "epoch": 5.087364409264145, + "grad_norm": 0.1895178296392627, + "learning_rate": 0.00044504713969077485, + "loss": 2.9910037517547607, + "step": 8679, + "token_acc": 0.29987682113579406 + }, + { + "epoch": 5.087950747581354, + "grad_norm": 0.30336301251298026, + "learning_rate": 0.0004450319816995686, + "loss": 3.0123772621154785, + "step": 8680, + "token_acc": 0.2978076880056672 + }, + { + "epoch": 5.0885370858985635, + "grad_norm": 0.17058034493622912, + "learning_rate": 0.0004450168218762997, + "loss": 2.972095251083374, + "step": 8681, + "token_acc": 0.3031841932505351 + }, + { + "epoch": 5.089123424215773, + "grad_norm": 0.2590703595789448, + "learning_rate": 0.0004450016602211106, + "loss": 2.9533019065856934, + "step": 8682, + "token_acc": 0.30543957173147 + }, + { + "epoch": 5.089709762532982, + "grad_norm": 0.16398296780915148, + "learning_rate": 0.0004449864967341436, + "loss": 2.998281955718994, + "step": 8683, + "token_acc": 0.29859552042160736 + }, + { + "epoch": 5.090296100850191, + "grad_norm": 0.2832100719416877, + "learning_rate": 0.0004449713314155412, + "loss": 2.966888189315796, + "step": 8684, + "token_acc": 0.30374322361337475 + }, + { + "epoch": 5.0908824391674, + "grad_norm": 0.17066370113299584, + "learning_rate": 0.00044495616426544585, + "loss": 2.9454736709594727, + "step": 8685, + "token_acc": 0.3054903367751262 + }, + { + "epoch": 5.091468777484609, + "grad_norm": 0.21428016867978947, + "learning_rate": 0.00044494099528400013, + "loss": 3.000596761703491, + "step": 8686, + "token_acc": 0.2964813930342949 + }, + { + "epoch": 5.092055115801817, + "grad_norm": 0.1786485249308624, + "learning_rate": 0.0004449258244713463, + "loss": 2.9337856769561768, + "step": 8687, + "token_acc": 0.30664656474336427 + }, + { + "epoch": 5.092641454119026, + "grad_norm": 0.18949454231162494, + "learning_rate": 0.0004449106518276272, + "loss": 2.9337942600250244, + "step": 8688, + "token_acc": 0.30810018435137426 + }, + { + "epoch": 5.0932277924362355, + "grad_norm": 0.17691387774158146, + "learning_rate": 0.0004448954773529851, + "loss": 3.0088260173797607, + "step": 8689, + "token_acc": 0.2970852939363429 + }, + { + "epoch": 5.093814130753445, + "grad_norm": 0.1813394671176994, + "learning_rate": 0.0004448803010475625, + "loss": 2.963430404663086, + "step": 8690, + "token_acc": 0.30339260577018107 + }, + { + "epoch": 5.094400469070654, + "grad_norm": 0.18306315964868977, + "learning_rate": 0.00044486512291150223, + "loss": 2.9731225967407227, + "step": 8691, + "token_acc": 0.30617767057895307 + }, + { + "epoch": 5.094986807387863, + "grad_norm": 0.166233084742953, + "learning_rate": 0.0004448499429449466, + "loss": 3.001830816268921, + "step": 8692, + "token_acc": 0.2983365934282552 + }, + { + "epoch": 5.095573145705072, + "grad_norm": 0.19167196129571717, + "learning_rate": 0.00044483476114803844, + "loss": 2.974398374557495, + "step": 8693, + "token_acc": 0.30307335754339954 + }, + { + "epoch": 5.096159484022281, + "grad_norm": 0.16214186091038377, + "learning_rate": 0.0004448195775209202, + "loss": 2.9722065925598145, + "step": 8694, + "token_acc": 0.3036610642586429 + }, + { + "epoch": 5.09674582233949, + "grad_norm": 0.18606379590160094, + "learning_rate": 0.0004448043920637345, + "loss": 2.9871020317077637, + "step": 8695, + "token_acc": 0.29935455616612616 + }, + { + "epoch": 5.097332160656699, + "grad_norm": 0.182354034894133, + "learning_rate": 0.0004447892047766241, + "loss": 2.9713146686553955, + "step": 8696, + "token_acc": 0.301366809097906 + }, + { + "epoch": 5.097918498973908, + "grad_norm": 0.1799178693649738, + "learning_rate": 0.00044477401565973154, + "loss": 3.0149006843566895, + "step": 8697, + "token_acc": 0.2964379991745855 + }, + { + "epoch": 5.098504837291117, + "grad_norm": 0.1782470615321636, + "learning_rate": 0.0004447588247131996, + "loss": 2.9822301864624023, + "step": 8698, + "token_acc": 0.3016054597618781 + }, + { + "epoch": 5.099091175608326, + "grad_norm": 0.17662041488022265, + "learning_rate": 0.00044474363193717093, + "loss": 2.9736571311950684, + "step": 8699, + "token_acc": 0.3024353593450438 + }, + { + "epoch": 5.099677513925535, + "grad_norm": 0.1751931352238748, + "learning_rate": 0.00044472843733178827, + "loss": 3.0043129920959473, + "step": 8700, + "token_acc": 0.29886601743586405 + }, + { + "epoch": 5.100263852242744, + "grad_norm": 0.16441625263120016, + "learning_rate": 0.00044471324089719435, + "loss": 3.0036861896514893, + "step": 8701, + "token_acc": 0.2967620787926918 + }, + { + "epoch": 5.100850190559953, + "grad_norm": 0.17447517813921465, + "learning_rate": 0.00044469804263353184, + "loss": 2.9923291206359863, + "step": 8702, + "token_acc": 0.301533872009827 + }, + { + "epoch": 5.101436528877162, + "grad_norm": 0.17112936552337116, + "learning_rate": 0.0004446828425409437, + "loss": 2.9982962608337402, + "step": 8703, + "token_acc": 0.2995148130394313 + }, + { + "epoch": 5.102022867194371, + "grad_norm": 0.22457709184497177, + "learning_rate": 0.00044466764061957257, + "loss": 2.9788668155670166, + "step": 8704, + "token_acc": 0.3018351275563241 + }, + { + "epoch": 5.10260920551158, + "grad_norm": 0.21002594504636185, + "learning_rate": 0.0004446524368695611, + "loss": 3.007920265197754, + "step": 8705, + "token_acc": 0.2978220953171503 + }, + { + "epoch": 5.1031955438287895, + "grad_norm": 0.1656793860678139, + "learning_rate": 0.0004446372312910525, + "loss": 2.9555277824401855, + "step": 8706, + "token_acc": 0.3039694082823599 + }, + { + "epoch": 5.103781882145999, + "grad_norm": 0.2062391618323717, + "learning_rate": 0.00044462202388418926, + "loss": 3.0100295543670654, + "step": 8707, + "token_acc": 0.29648936225946554 + }, + { + "epoch": 5.104368220463208, + "grad_norm": 0.2098514263289546, + "learning_rate": 0.00044460681464911445, + "loss": 3.019043207168579, + "step": 8708, + "token_acc": 0.29410315766054484 + }, + { + "epoch": 5.104954558780416, + "grad_norm": 0.16575664721545047, + "learning_rate": 0.00044459160358597085, + "loss": 2.9941768646240234, + "step": 8709, + "token_acc": 0.30066963050415985 + }, + { + "epoch": 5.105540897097625, + "grad_norm": 0.2748136893708205, + "learning_rate": 0.0004445763906949013, + "loss": 2.967991590499878, + "step": 8710, + "token_acc": 0.30203086791791567 + }, + { + "epoch": 5.106127235414834, + "grad_norm": 0.3746450659734796, + "learning_rate": 0.00044456117597604873, + "loss": 3.0214946269989014, + "step": 8711, + "token_acc": 0.29649813831394567 + }, + { + "epoch": 5.106713573732043, + "grad_norm": 0.19248521139432512, + "learning_rate": 0.0004445459594295561, + "loss": 2.9820380210876465, + "step": 8712, + "token_acc": 0.3006762811389776 + }, + { + "epoch": 5.107299912049252, + "grad_norm": 0.21525298970307108, + "learning_rate": 0.0004445307410555664, + "loss": 2.984111785888672, + "step": 8713, + "token_acc": 0.3010132222084552 + }, + { + "epoch": 5.1078862503664615, + "grad_norm": 0.20275811644700503, + "learning_rate": 0.0004445155208542224, + "loss": 2.9863650798797607, + "step": 8714, + "token_acc": 0.30061141304347827 + }, + { + "epoch": 5.108472588683671, + "grad_norm": 0.20061241646584638, + "learning_rate": 0.00044450029882566735, + "loss": 2.9884612560272217, + "step": 8715, + "token_acc": 0.3009622214359632 + }, + { + "epoch": 5.10905892700088, + "grad_norm": 0.2219182911114369, + "learning_rate": 0.000444485074970044, + "loss": 2.932551383972168, + "step": 8716, + "token_acc": 0.30795195855247665 + }, + { + "epoch": 5.109645265318089, + "grad_norm": 0.1696554940962451, + "learning_rate": 0.00044446984928749544, + "loss": 2.975181818008423, + "step": 8717, + "token_acc": 0.3025253013028074 + }, + { + "epoch": 5.110231603635298, + "grad_norm": 0.1896026199259381, + "learning_rate": 0.00044445462177816476, + "loss": 2.984372615814209, + "step": 8718, + "token_acc": 0.30086946615089044 + }, + { + "epoch": 5.110817941952506, + "grad_norm": 0.19518374712631986, + "learning_rate": 0.0004444393924421948, + "loss": 2.985424041748047, + "step": 8719, + "token_acc": 0.30239633673179317 + }, + { + "epoch": 5.111404280269715, + "grad_norm": 0.19871318273017644, + "learning_rate": 0.0004444241612797289, + "loss": 2.965365171432495, + "step": 8720, + "token_acc": 0.30210786165637665 + }, + { + "epoch": 5.111990618586924, + "grad_norm": 0.17348778319980182, + "learning_rate": 0.00044440892829090994, + "loss": 2.9832210540771484, + "step": 8721, + "token_acc": 0.3014860509317962 + }, + { + "epoch": 5.1125769569041335, + "grad_norm": 0.20023844893961287, + "learning_rate": 0.00044439369347588106, + "loss": 2.9795336723327637, + "step": 8722, + "token_acc": 0.30259952008049584 + }, + { + "epoch": 5.113163295221343, + "grad_norm": 0.16923407908976956, + "learning_rate": 0.0004443784568347854, + "loss": 2.934471845626831, + "step": 8723, + "token_acc": 0.30796485961274817 + }, + { + "epoch": 5.113749633538552, + "grad_norm": 0.18246011096612696, + "learning_rate": 0.00044436321836776606, + "loss": 2.9944136142730713, + "step": 8724, + "token_acc": 0.297609207180037 + }, + { + "epoch": 5.114335971855761, + "grad_norm": 0.16949756168186397, + "learning_rate": 0.00044434797807496617, + "loss": 3.020205020904541, + "step": 8725, + "token_acc": 0.29478818956877423 + }, + { + "epoch": 5.11492231017297, + "grad_norm": 0.19699919015133432, + "learning_rate": 0.0004443327359565289, + "loss": 2.981968402862549, + "step": 8726, + "token_acc": 0.3023252321853579 + }, + { + "epoch": 5.115508648490179, + "grad_norm": 0.18498674873559284, + "learning_rate": 0.0004443174920125975, + "loss": 3.033045530319214, + "step": 8727, + "token_acc": 0.29232261126127057 + }, + { + "epoch": 5.116094986807388, + "grad_norm": 0.18455677225860928, + "learning_rate": 0.00044430224624331506, + "loss": 2.974199056625366, + "step": 8728, + "token_acc": 0.3019660968901706 + }, + { + "epoch": 5.116681325124597, + "grad_norm": 0.1759017776492813, + "learning_rate": 0.00044428699864882483, + "loss": 3.020569324493408, + "step": 8729, + "token_acc": 0.2958360552256663 + }, + { + "epoch": 5.1172676634418055, + "grad_norm": 0.18322128721256692, + "learning_rate": 0.00044427174922927014, + "loss": 2.961002826690674, + "step": 8730, + "token_acc": 0.30525251893761784 + }, + { + "epoch": 5.117854001759015, + "grad_norm": 0.2505976846360587, + "learning_rate": 0.00044425649798479405, + "loss": 2.9866740703582764, + "step": 8731, + "token_acc": 0.3010071466313694 + }, + { + "epoch": 5.118440340076224, + "grad_norm": 0.2115810775356059, + "learning_rate": 0.00044424124491553995, + "loss": 2.9912612438201904, + "step": 8732, + "token_acc": 0.2994038515239318 + }, + { + "epoch": 5.119026678393433, + "grad_norm": 0.1965034297047565, + "learning_rate": 0.0004442259900216512, + "loss": 2.9565072059631348, + "step": 8733, + "token_acc": 0.3062210525473758 + }, + { + "epoch": 5.119613016710642, + "grad_norm": 0.25963934167700076, + "learning_rate": 0.00044421073330327087, + "loss": 2.981520652770996, + "step": 8734, + "token_acc": 0.30099062082657485 + }, + { + "epoch": 5.120199355027851, + "grad_norm": 0.16998169640142108, + "learning_rate": 0.00044419547476054244, + "loss": 2.996492385864258, + "step": 8735, + "token_acc": 0.29904945311368475 + }, + { + "epoch": 5.12078569334506, + "grad_norm": 0.21578283434751916, + "learning_rate": 0.0004441802143936092, + "loss": 3.0027551651000977, + "step": 8736, + "token_acc": 0.296802316918217 + }, + { + "epoch": 5.121372031662269, + "grad_norm": 0.17813585143368707, + "learning_rate": 0.00044416495220261454, + "loss": 2.9835734367370605, + "step": 8737, + "token_acc": 0.3010338112477618 + }, + { + "epoch": 5.121958369979478, + "grad_norm": 0.19881562054319968, + "learning_rate": 0.0004441496881877017, + "loss": 3.0267527103424072, + "step": 8738, + "token_acc": 0.2962101504718184 + }, + { + "epoch": 5.1225447082966875, + "grad_norm": 0.20660763304233926, + "learning_rate": 0.0004441344223490143, + "loss": 2.977837085723877, + "step": 8739, + "token_acc": 0.3009801461513645 + }, + { + "epoch": 5.123131046613897, + "grad_norm": 0.17158880776176214, + "learning_rate": 0.00044411915468669556, + "loss": 2.996511697769165, + "step": 8740, + "token_acc": 0.2993599224637937 + }, + { + "epoch": 5.123717384931105, + "grad_norm": 0.21452063140028457, + "learning_rate": 0.0004441038852008889, + "loss": 2.9855563640594482, + "step": 8741, + "token_acc": 0.3006892202351457 + }, + { + "epoch": 5.124303723248314, + "grad_norm": 0.17079051588619126, + "learning_rate": 0.0004440886138917378, + "loss": 2.928725242614746, + "step": 8742, + "token_acc": 0.3088905440984022 + }, + { + "epoch": 5.124890061565523, + "grad_norm": 0.18292682922830103, + "learning_rate": 0.00044407334075938573, + "loss": 2.976896047592163, + "step": 8743, + "token_acc": 0.30177201386454805 + }, + { + "epoch": 5.125476399882732, + "grad_norm": 0.18563912676883831, + "learning_rate": 0.00044405806580397614, + "loss": 2.9855833053588867, + "step": 8744, + "token_acc": 0.29957146258233475 + }, + { + "epoch": 5.126062738199941, + "grad_norm": 0.18408442047945414, + "learning_rate": 0.00044404278902565256, + "loss": 2.965723991394043, + "step": 8745, + "token_acc": 0.3026603587109736 + }, + { + "epoch": 5.12664907651715, + "grad_norm": 0.18114602512018754, + "learning_rate": 0.00044402751042455836, + "loss": 2.9670424461364746, + "step": 8746, + "token_acc": 0.30280305114289313 + }, + { + "epoch": 5.1272354148343595, + "grad_norm": 0.17609532361280458, + "learning_rate": 0.00044401223000083725, + "loss": 2.9774763584136963, + "step": 8747, + "token_acc": 0.3030851253374998 + }, + { + "epoch": 5.127821753151569, + "grad_norm": 0.19086479124439165, + "learning_rate": 0.0004439969477546326, + "loss": 2.9790468215942383, + "step": 8748, + "token_acc": 0.2999583157491385 + }, + { + "epoch": 5.128408091468778, + "grad_norm": 0.1712440520593261, + "learning_rate": 0.000443981663686088, + "loss": 2.962512969970703, + "step": 8749, + "token_acc": 0.3022198925435723 + }, + { + "epoch": 5.128994429785987, + "grad_norm": 0.18097582819288932, + "learning_rate": 0.00044396637779534716, + "loss": 2.993630886077881, + "step": 8750, + "token_acc": 0.30078204517051677 + }, + { + "epoch": 5.129580768103196, + "grad_norm": 0.17713790717679545, + "learning_rate": 0.0004439510900825535, + "loss": 2.957152843475342, + "step": 8751, + "token_acc": 0.30460715075337297 + }, + { + "epoch": 5.130167106420404, + "grad_norm": 0.18507534220347635, + "learning_rate": 0.0004439358005478508, + "loss": 2.991044044494629, + "step": 8752, + "token_acc": 0.29998440763054773 + }, + { + "epoch": 5.130753444737613, + "grad_norm": 0.1941744986195138, + "learning_rate": 0.0004439205091913825, + "loss": 2.980870246887207, + "step": 8753, + "token_acc": 0.29917432582567416 + }, + { + "epoch": 5.131339783054822, + "grad_norm": 0.17934373854653354, + "learning_rate": 0.00044390521601329235, + "loss": 2.979238510131836, + "step": 8754, + "token_acc": 0.3008060295195227 + }, + { + "epoch": 5.1319261213720315, + "grad_norm": 0.18935686712937697, + "learning_rate": 0.000443889921013724, + "loss": 2.961998224258423, + "step": 8755, + "token_acc": 0.30451088600728166 + }, + { + "epoch": 5.132512459689241, + "grad_norm": 0.1635256496647054, + "learning_rate": 0.0004438746241928211, + "loss": 2.937725305557251, + "step": 8756, + "token_acc": 0.3063446571308372 + }, + { + "epoch": 5.13309879800645, + "grad_norm": 0.1805589924140241, + "learning_rate": 0.0004438593255507274, + "loss": 2.9922070503234863, + "step": 8757, + "token_acc": 0.29834255568218737 + }, + { + "epoch": 5.133685136323659, + "grad_norm": 0.2014278226288138, + "learning_rate": 0.00044384402508758646, + "loss": 3.0203442573547363, + "step": 8758, + "token_acc": 0.296373181581373 + }, + { + "epoch": 5.134271474640868, + "grad_norm": 0.19606642172806865, + "learning_rate": 0.0004438287228035421, + "loss": 2.9669575691223145, + "step": 8759, + "token_acc": 0.3038575572799706 + }, + { + "epoch": 5.134857812958077, + "grad_norm": 0.17193300018633415, + "learning_rate": 0.00044381341869873827, + "loss": 3.0092475414276123, + "step": 8760, + "token_acc": 0.2968296321646089 + }, + { + "epoch": 5.135444151275286, + "grad_norm": 0.19259356140749984, + "learning_rate": 0.0004437981127733184, + "loss": 2.958791971206665, + "step": 8761, + "token_acc": 0.30542714977250535 + }, + { + "epoch": 5.136030489592494, + "grad_norm": 0.1625306921850991, + "learning_rate": 0.0004437828050274264, + "loss": 3.019230842590332, + "step": 8762, + "token_acc": 0.29588822970439427 + }, + { + "epoch": 5.1366168279097035, + "grad_norm": 0.19129821411572573, + "learning_rate": 0.0004437674954612061, + "loss": 2.9478063583374023, + "step": 8763, + "token_acc": 0.3046184320266889 + }, + { + "epoch": 5.137203166226913, + "grad_norm": 0.19429898068825613, + "learning_rate": 0.0004437521840748013, + "loss": 2.954503059387207, + "step": 8764, + "token_acc": 0.3052622437459451 + }, + { + "epoch": 5.137789504544122, + "grad_norm": 0.29201981464710863, + "learning_rate": 0.0004437368708683558, + "loss": 2.9911608695983887, + "step": 8765, + "token_acc": 0.30020483913197493 + }, + { + "epoch": 5.138375842861331, + "grad_norm": 0.3240815323999452, + "learning_rate": 0.0004437215558420135, + "loss": 3.0255513191223145, + "step": 8766, + "token_acc": 0.29469124997363816 + }, + { + "epoch": 5.13896218117854, + "grad_norm": 0.33238639344521165, + "learning_rate": 0.0004437062389959181, + "loss": 2.995145559310913, + "step": 8767, + "token_acc": 0.2994174827692491 + }, + { + "epoch": 5.139548519495749, + "grad_norm": 0.2516811538929707, + "learning_rate": 0.00044369092033021373, + "loss": 2.9815473556518555, + "step": 8768, + "token_acc": 0.30134539073245536 + }, + { + "epoch": 5.140134857812958, + "grad_norm": 0.16500870732976872, + "learning_rate": 0.0004436755998450441, + "loss": 2.966063976287842, + "step": 8769, + "token_acc": 0.3051165689475931 + }, + { + "epoch": 5.140721196130167, + "grad_norm": 0.1824959402155693, + "learning_rate": 0.0004436602775405533, + "loss": 2.9898056983947754, + "step": 8770, + "token_acc": 0.3005512259034415 + }, + { + "epoch": 5.141307534447376, + "grad_norm": 0.18657288282759818, + "learning_rate": 0.00044364495341688503, + "loss": 3.0107436180114746, + "step": 8771, + "token_acc": 0.297682422593849 + }, + { + "epoch": 5.1418938727645855, + "grad_norm": 0.2033234731112678, + "learning_rate": 0.0004436296274741834, + "loss": 3.000121593475342, + "step": 8772, + "token_acc": 0.29768572334354854 + }, + { + "epoch": 5.142480211081795, + "grad_norm": 0.18308175038365282, + "learning_rate": 0.0004436142997125923, + "loss": 2.959010124206543, + "step": 8773, + "token_acc": 0.3038789398013062 + }, + { + "epoch": 5.143066549399003, + "grad_norm": 0.1695435572624618, + "learning_rate": 0.0004435989701322558, + "loss": 2.948486804962158, + "step": 8774, + "token_acc": 0.30577777320774896 + }, + { + "epoch": 5.143652887716212, + "grad_norm": 0.17129029861989012, + "learning_rate": 0.00044358363873331786, + "loss": 2.980088710784912, + "step": 8775, + "token_acc": 0.30152320207849737 + }, + { + "epoch": 5.144239226033421, + "grad_norm": 0.15855817207467682, + "learning_rate": 0.00044356830551592254, + "loss": 2.9855031967163086, + "step": 8776, + "token_acc": 0.30069962866711913 + }, + { + "epoch": 5.14482556435063, + "grad_norm": 0.17003999805084932, + "learning_rate": 0.0004435529704802137, + "loss": 3.011748790740967, + "step": 8777, + "token_acc": 0.29669308036285963 + }, + { + "epoch": 5.145411902667839, + "grad_norm": 0.17948421936435988, + "learning_rate": 0.00044353763362633557, + "loss": 2.9702816009521484, + "step": 8778, + "token_acc": 0.30203123697591067 + }, + { + "epoch": 5.145998240985048, + "grad_norm": 0.20017730236600856, + "learning_rate": 0.00044352229495443226, + "loss": 2.980778217315674, + "step": 8779, + "token_acc": 0.30084952127225484 + }, + { + "epoch": 5.1465845793022575, + "grad_norm": 0.19341715004494375, + "learning_rate": 0.0004435069544646476, + "loss": 3.001784324645996, + "step": 8780, + "token_acc": 0.29766664926658665 + }, + { + "epoch": 5.147170917619467, + "grad_norm": 0.1645253423288801, + "learning_rate": 0.00044349161215712595, + "loss": 2.9548749923706055, + "step": 8781, + "token_acc": 0.3050363481910589 + }, + { + "epoch": 5.147757255936676, + "grad_norm": 0.20518528355313262, + "learning_rate": 0.00044347626803201135, + "loss": 3.0156431198120117, + "step": 8782, + "token_acc": 0.2966124348270663 + }, + { + "epoch": 5.148343594253885, + "grad_norm": 0.23441674911064184, + "learning_rate": 0.0004434609220894479, + "loss": 2.993480682373047, + "step": 8783, + "token_acc": 0.299571681828181 + }, + { + "epoch": 5.148929932571093, + "grad_norm": 0.20569072116922216, + "learning_rate": 0.0004434455743295798, + "loss": 2.9530177116394043, + "step": 8784, + "token_acc": 0.3036298284585956 + }, + { + "epoch": 5.149516270888302, + "grad_norm": 0.1587091362918602, + "learning_rate": 0.0004434302247525512, + "loss": 3.0004758834838867, + "step": 8785, + "token_acc": 0.2969946980676645 + }, + { + "epoch": 5.150102609205511, + "grad_norm": 0.22384193720168497, + "learning_rate": 0.0004434148733585063, + "loss": 2.9857680797576904, + "step": 8786, + "token_acc": 0.2996169889031716 + }, + { + "epoch": 5.15068894752272, + "grad_norm": 0.24629592166074957, + "learning_rate": 0.0004433995201475892, + "loss": 3.0081186294555664, + "step": 8787, + "token_acc": 0.29775586097937995 + }, + { + "epoch": 5.1512752858399296, + "grad_norm": 0.18742951773209884, + "learning_rate": 0.00044338416511994426, + "loss": 2.9570837020874023, + "step": 8788, + "token_acc": 0.30387830131476945 + }, + { + "epoch": 5.151861624157139, + "grad_norm": 0.19646515708892104, + "learning_rate": 0.0004433688082757158, + "loss": 3.026297092437744, + "step": 8789, + "token_acc": 0.2940225329962323 + }, + { + "epoch": 5.152447962474348, + "grad_norm": 0.20051299088990934, + "learning_rate": 0.0004433534496150478, + "loss": 2.987358570098877, + "step": 8790, + "token_acc": 0.3001125246561379 + }, + { + "epoch": 5.153034300791557, + "grad_norm": 0.15457959234496235, + "learning_rate": 0.00044333808913808476, + "loss": 2.959099769592285, + "step": 8791, + "token_acc": 0.3042096669192431 + }, + { + "epoch": 5.153620639108766, + "grad_norm": 0.1838312972855188, + "learning_rate": 0.0004433227268449709, + "loss": 2.9745407104492188, + "step": 8792, + "token_acc": 0.3020439101868741 + }, + { + "epoch": 5.154206977425975, + "grad_norm": 0.17796718775698855, + "learning_rate": 0.00044330736273585046, + "loss": 2.971836566925049, + "step": 8793, + "token_acc": 0.3024766009788569 + }, + { + "epoch": 5.154793315743184, + "grad_norm": 0.224051263765041, + "learning_rate": 0.00044329199681086796, + "loss": 3.021592378616333, + "step": 8794, + "token_acc": 0.2952031088550839 + }, + { + "epoch": 5.1553796540603924, + "grad_norm": 0.1775266659273417, + "learning_rate": 0.00044327662907016753, + "loss": 2.9839346408843994, + "step": 8795, + "token_acc": 0.3005945614875003 + }, + { + "epoch": 5.155965992377602, + "grad_norm": 0.19104162559981613, + "learning_rate": 0.0004432612595138936, + "loss": 3.0073342323303223, + "step": 8796, + "token_acc": 0.29680753508342755 + }, + { + "epoch": 5.156552330694811, + "grad_norm": 0.19157912912753658, + "learning_rate": 0.0004432458881421906, + "loss": 2.9781274795532227, + "step": 8797, + "token_acc": 0.3018584978555794 + }, + { + "epoch": 5.15713866901202, + "grad_norm": 0.20015034902580847, + "learning_rate": 0.00044323051495520285, + "loss": 2.9905576705932617, + "step": 8798, + "token_acc": 0.300727431195736 + }, + { + "epoch": 5.157725007329229, + "grad_norm": 0.19837806833311086, + "learning_rate": 0.0004432151399530748, + "loss": 2.9781527519226074, + "step": 8799, + "token_acc": 0.3013042730788081 + }, + { + "epoch": 5.158311345646438, + "grad_norm": 0.19962525418409838, + "learning_rate": 0.00044319976313595083, + "loss": 2.9713144302368164, + "step": 8800, + "token_acc": 0.3012242292914294 + }, + { + "epoch": 5.158897683963647, + "grad_norm": 0.23789679591304785, + "learning_rate": 0.0004431843845039755, + "loss": 2.9699878692626953, + "step": 8801, + "token_acc": 0.3012037079739888 + }, + { + "epoch": 5.159484022280856, + "grad_norm": 0.2048395340654621, + "learning_rate": 0.00044316900405729317, + "loss": 2.9315085411071777, + "step": 8802, + "token_acc": 0.3082174919600675 + }, + { + "epoch": 5.160070360598065, + "grad_norm": 0.16372085563401884, + "learning_rate": 0.00044315362179604836, + "loss": 2.999645233154297, + "step": 8803, + "token_acc": 0.299346276164543 + }, + { + "epoch": 5.1606566989152745, + "grad_norm": 0.18957966346949431, + "learning_rate": 0.0004431382377203855, + "loss": 3.01366925239563, + "step": 8804, + "token_acc": 0.29710502564494096 + }, + { + "epoch": 5.161243037232484, + "grad_norm": 0.1976172691767416, + "learning_rate": 0.0004431228518304492, + "loss": 2.998316764831543, + "step": 8805, + "token_acc": 0.29986448195145987 + }, + { + "epoch": 5.161829375549692, + "grad_norm": 0.18530247330629188, + "learning_rate": 0.0004431074641263839, + "loss": 2.966491460800171, + "step": 8806, + "token_acc": 0.30217870244318124 + }, + { + "epoch": 5.162415713866901, + "grad_norm": 0.19169113547699163, + "learning_rate": 0.00044309207460833423, + "loss": 3.0257625579833984, + "step": 8807, + "token_acc": 0.2947127294036172 + }, + { + "epoch": 5.16300205218411, + "grad_norm": 0.17714026416745718, + "learning_rate": 0.0004430766832764447, + "loss": 2.9855637550354004, + "step": 8808, + "token_acc": 0.30169432113506095 + }, + { + "epoch": 5.163588390501319, + "grad_norm": 0.18939479033801662, + "learning_rate": 0.00044306129013085995, + "loss": 3.060751438140869, + "step": 8809, + "token_acc": 0.2904538324955869 + }, + { + "epoch": 5.164174728818528, + "grad_norm": 0.1720470249802947, + "learning_rate": 0.00044304589517172444, + "loss": 3.0255823135375977, + "step": 8810, + "token_acc": 0.29640227246762146 + }, + { + "epoch": 5.164761067135737, + "grad_norm": 0.16863651620345926, + "learning_rate": 0.00044303049839918295, + "loss": 2.9605231285095215, + "step": 8811, + "token_acc": 0.30332551932698065 + }, + { + "epoch": 5.1653474054529465, + "grad_norm": 0.17915145685804046, + "learning_rate": 0.00044301509981338005, + "loss": 3.0141773223876953, + "step": 8812, + "token_acc": 0.2975359092829027 + }, + { + "epoch": 5.165933743770156, + "grad_norm": 0.16283239190205318, + "learning_rate": 0.00044299969941446034, + "loss": 2.9851694107055664, + "step": 8813, + "token_acc": 0.3007235762451293 + }, + { + "epoch": 5.166520082087365, + "grad_norm": 0.18237965713611776, + "learning_rate": 0.0004429842972025685, + "loss": 2.9824442863464355, + "step": 8814, + "token_acc": 0.2993478860026853 + }, + { + "epoch": 5.167106420404574, + "grad_norm": 0.15731697620541296, + "learning_rate": 0.0004429688931778493, + "loss": 2.97575044631958, + "step": 8815, + "token_acc": 0.3009870621777124 + }, + { + "epoch": 5.167692758721783, + "grad_norm": 0.19106150923677062, + "learning_rate": 0.0004429534873404474, + "loss": 2.9616377353668213, + "step": 8816, + "token_acc": 0.30445221325381727 + }, + { + "epoch": 5.168279097038991, + "grad_norm": 0.19014711845616725, + "learning_rate": 0.0004429380796905074, + "loss": 2.982950448989868, + "step": 8817, + "token_acc": 0.3007281814475076 + }, + { + "epoch": 5.1688654353562, + "grad_norm": 0.17713231803324647, + "learning_rate": 0.0004429226702281741, + "loss": 3.018231153488159, + "step": 8818, + "token_acc": 0.2940174744818733 + }, + { + "epoch": 5.169451773673409, + "grad_norm": 0.16605211019691218, + "learning_rate": 0.0004429072589535924, + "loss": 3.0333194732666016, + "step": 8819, + "token_acc": 0.2949870093355757 + }, + { + "epoch": 5.1700381119906185, + "grad_norm": 0.22078888940285127, + "learning_rate": 0.0004428918458669069, + "loss": 2.9845995903015137, + "step": 8820, + "token_acc": 0.3003319458348701 + }, + { + "epoch": 5.170624450307828, + "grad_norm": 0.1980923427841761, + "learning_rate": 0.00044287643096826247, + "loss": 2.957256317138672, + "step": 8821, + "token_acc": 0.3030481030422556 + }, + { + "epoch": 5.171210788625037, + "grad_norm": 0.16410361144003952, + "learning_rate": 0.0004428610142578038, + "loss": 3.0132460594177246, + "step": 8822, + "token_acc": 0.295656487736714 + }, + { + "epoch": 5.171797126942246, + "grad_norm": 0.17138479508595136, + "learning_rate": 0.00044284559573567583, + "loss": 2.945361614227295, + "step": 8823, + "token_acc": 0.3072721001307518 + }, + { + "epoch": 5.172383465259455, + "grad_norm": 0.1995769737744185, + "learning_rate": 0.0004428301754020233, + "loss": 3.0178587436676025, + "step": 8824, + "token_acc": 0.29459409136047665 + }, + { + "epoch": 5.172969803576664, + "grad_norm": 0.23002664600489142, + "learning_rate": 0.00044281475325699116, + "loss": 2.949838638305664, + "step": 8825, + "token_acc": 0.3056479350384029 + }, + { + "epoch": 5.173556141893873, + "grad_norm": 0.21468769051597134, + "learning_rate": 0.00044279932930072424, + "loss": 2.96749210357666, + "step": 8826, + "token_acc": 0.3028843479021118 + }, + { + "epoch": 5.174142480211081, + "grad_norm": 0.17038943244702934, + "learning_rate": 0.0004427839035333674, + "loss": 2.9870846271514893, + "step": 8827, + "token_acc": 0.2995438255120174 + }, + { + "epoch": 5.1747288185282905, + "grad_norm": 0.21917612113587115, + "learning_rate": 0.0004427684759550656, + "loss": 2.9842967987060547, + "step": 8828, + "token_acc": 0.30077621305499996 + }, + { + "epoch": 5.1753151568455, + "grad_norm": 0.23661548471761087, + "learning_rate": 0.0004427530465659637, + "loss": 2.9789273738861084, + "step": 8829, + "token_acc": 0.2999345953517684 + }, + { + "epoch": 5.175901495162709, + "grad_norm": 0.17502682394592814, + "learning_rate": 0.00044273761536620673, + "loss": 2.9573769569396973, + "step": 8830, + "token_acc": 0.3050305909059106 + }, + { + "epoch": 5.176487833479918, + "grad_norm": 0.2327482210417244, + "learning_rate": 0.0004427221823559395, + "loss": 3.0415148735046387, + "step": 8831, + "token_acc": 0.2930228878810252 + }, + { + "epoch": 5.177074171797127, + "grad_norm": 0.25092826963169784, + "learning_rate": 0.00044270674753530705, + "loss": 2.9905500411987305, + "step": 8832, + "token_acc": 0.2994847393107199 + }, + { + "epoch": 5.177660510114336, + "grad_norm": 0.15462550598691166, + "learning_rate": 0.0004426913109044544, + "loss": 2.9697647094726562, + "step": 8833, + "token_acc": 0.3044459055652667 + }, + { + "epoch": 5.178246848431545, + "grad_norm": 0.20374395545410434, + "learning_rate": 0.00044267587246352657, + "loss": 2.9857914447784424, + "step": 8834, + "token_acc": 0.299605878770167 + }, + { + "epoch": 5.178833186748754, + "grad_norm": 0.18287376638783923, + "learning_rate": 0.0004426604322126685, + "loss": 2.984575033187866, + "step": 8835, + "token_acc": 0.3018724869021955 + }, + { + "epoch": 5.179419525065963, + "grad_norm": 0.1599217214744648, + "learning_rate": 0.0004426449901520254, + "loss": 2.9735639095306396, + "step": 8836, + "token_acc": 0.3014944119758023 + }, + { + "epoch": 5.1800058633831725, + "grad_norm": 0.18801290230886053, + "learning_rate": 0.0004426295462817421, + "loss": 2.996950149536133, + "step": 8837, + "token_acc": 0.2996473884201713 + }, + { + "epoch": 5.180592201700381, + "grad_norm": 0.16971393864212894, + "learning_rate": 0.00044261410060196385, + "loss": 3.0004491806030273, + "step": 8838, + "token_acc": 0.29815328080720005 + }, + { + "epoch": 5.18117854001759, + "grad_norm": 0.2110796833092154, + "learning_rate": 0.0004425986531128356, + "loss": 2.984823226928711, + "step": 8839, + "token_acc": 0.3024030106234238 + }, + { + "epoch": 5.181764878334799, + "grad_norm": 0.21010269020015432, + "learning_rate": 0.0004425832038145026, + "loss": 3.0057268142700195, + "step": 8840, + "token_acc": 0.2955416517408647 + }, + { + "epoch": 5.182351216652008, + "grad_norm": 0.2089935370215622, + "learning_rate": 0.00044256775270710985, + "loss": 2.954976797103882, + "step": 8841, + "token_acc": 0.30377144962627645 + }, + { + "epoch": 5.182937554969217, + "grad_norm": 0.17218165629041793, + "learning_rate": 0.00044255229979080256, + "loss": 3.0092296600341797, + "step": 8842, + "token_acc": 0.2978609045151622 + }, + { + "epoch": 5.183523893286426, + "grad_norm": 0.20326666788450598, + "learning_rate": 0.0004425368450657259, + "loss": 3.0203351974487305, + "step": 8843, + "token_acc": 0.29590390983188997 + }, + { + "epoch": 5.184110231603635, + "grad_norm": 0.18783590937165168, + "learning_rate": 0.00044252138853202505, + "loss": 3.0002493858337402, + "step": 8844, + "token_acc": 0.2984321888886888 + }, + { + "epoch": 5.1846965699208445, + "grad_norm": 0.20995669748011578, + "learning_rate": 0.00044250593018984517, + "loss": 3.0225157737731934, + "step": 8845, + "token_acc": 0.29629019268032714 + }, + { + "epoch": 5.185282908238054, + "grad_norm": 0.2770379681723397, + "learning_rate": 0.00044249047003933154, + "loss": 2.970541477203369, + "step": 8846, + "token_acc": 0.3037223453404951 + }, + { + "epoch": 5.185869246555263, + "grad_norm": 0.18594168630015998, + "learning_rate": 0.0004424750080806292, + "loss": 2.9690101146698, + "step": 8847, + "token_acc": 0.3037826960743005 + }, + { + "epoch": 5.186455584872472, + "grad_norm": 0.2621826405552439, + "learning_rate": 0.0004424595443138836, + "loss": 2.958822011947632, + "step": 8848, + "token_acc": 0.3035287414371366 + }, + { + "epoch": 5.18704192318968, + "grad_norm": 0.29547647906371594, + "learning_rate": 0.00044244407873923985, + "loss": 2.981954574584961, + "step": 8849, + "token_acc": 0.3022268224475623 + }, + { + "epoch": 5.187628261506889, + "grad_norm": 0.16182079894335188, + "learning_rate": 0.00044242861135684343, + "loss": 2.9978561401367188, + "step": 8850, + "token_acc": 0.2995199304416057 + }, + { + "epoch": 5.188214599824098, + "grad_norm": 0.26586322426428327, + "learning_rate": 0.0004424131421668394, + "loss": 2.974959135055542, + "step": 8851, + "token_acc": 0.30220256140567964 + }, + { + "epoch": 5.188800938141307, + "grad_norm": 0.18208648661996515, + "learning_rate": 0.00044239767116937325, + "loss": 2.978020191192627, + "step": 8852, + "token_acc": 0.3027106188788786 + }, + { + "epoch": 5.1893872764585165, + "grad_norm": 0.2195109012234581, + "learning_rate": 0.00044238219836459017, + "loss": 2.957524299621582, + "step": 8853, + "token_acc": 0.3047144459256438 + }, + { + "epoch": 5.189973614775726, + "grad_norm": 0.18567011613468803, + "learning_rate": 0.00044236672375263565, + "loss": 2.978147506713867, + "step": 8854, + "token_acc": 0.30193847276998786 + }, + { + "epoch": 5.190559953092935, + "grad_norm": 0.21921101979344748, + "learning_rate": 0.000442351247333655, + "loss": 3.0142345428466797, + "step": 8855, + "token_acc": 0.2960742541716494 + }, + { + "epoch": 5.191146291410144, + "grad_norm": 0.18643333502722184, + "learning_rate": 0.0004423357691077934, + "loss": 3.047701358795166, + "step": 8856, + "token_acc": 0.291195281824107 + }, + { + "epoch": 5.191732629727353, + "grad_norm": 0.19416886735007322, + "learning_rate": 0.00044232028907519657, + "loss": 2.9476370811462402, + "step": 8857, + "token_acc": 0.3083077483129947 + }, + { + "epoch": 5.192318968044562, + "grad_norm": 0.16444472799054644, + "learning_rate": 0.0004423048072360097, + "loss": 2.968719482421875, + "step": 8858, + "token_acc": 0.30315532387042554 + }, + { + "epoch": 5.192905306361771, + "grad_norm": 0.22440563144353334, + "learning_rate": 0.00044228932359037846, + "loss": 3.003300428390503, + "step": 8859, + "token_acc": 0.29841229592416213 + }, + { + "epoch": 5.193491644678979, + "grad_norm": 0.17034459721299242, + "learning_rate": 0.00044227383813844794, + "loss": 2.944145441055298, + "step": 8860, + "token_acc": 0.3054725770699171 + }, + { + "epoch": 5.1940779829961885, + "grad_norm": 0.2639884836063497, + "learning_rate": 0.00044225835088036394, + "loss": 2.998051166534424, + "step": 8861, + "token_acc": 0.298192902690213 + }, + { + "epoch": 5.194664321313398, + "grad_norm": 0.2555542330292339, + "learning_rate": 0.0004422428618162717, + "loss": 3.016000747680664, + "step": 8862, + "token_acc": 0.29538192109036915 + }, + { + "epoch": 5.195250659630607, + "grad_norm": 0.1750001256941667, + "learning_rate": 0.0004422273709463169, + "loss": 2.9487268924713135, + "step": 8863, + "token_acc": 0.3062791546876366 + }, + { + "epoch": 5.195836997947816, + "grad_norm": 0.20599213843054098, + "learning_rate": 0.000442211878270645, + "loss": 3.002913236618042, + "step": 8864, + "token_acc": 0.2964381339945955 + }, + { + "epoch": 5.196423336265025, + "grad_norm": 0.1716775549776159, + "learning_rate": 0.0004421963837894014, + "loss": 2.9811151027679443, + "step": 8865, + "token_acc": 0.3003942058004568 + }, + { + "epoch": 5.197009674582234, + "grad_norm": 0.17518677003637623, + "learning_rate": 0.00044218088750273187, + "loss": 2.995006561279297, + "step": 8866, + "token_acc": 0.2992382829080563 + }, + { + "epoch": 5.197596012899443, + "grad_norm": 0.19537416401865554, + "learning_rate": 0.00044216538941078184, + "loss": 3.0042848587036133, + "step": 8867, + "token_acc": 0.29889121427363435 + }, + { + "epoch": 5.198182351216652, + "grad_norm": 0.19672864276154467, + "learning_rate": 0.00044214988951369694, + "loss": 2.952193260192871, + "step": 8868, + "token_acc": 0.3071014224035176 + }, + { + "epoch": 5.198768689533861, + "grad_norm": 0.19838105460970937, + "learning_rate": 0.00044213438781162266, + "loss": 3.0012454986572266, + "step": 8869, + "token_acc": 0.29818024421597183 + }, + { + "epoch": 5.19935502785107, + "grad_norm": 0.19112045839496805, + "learning_rate": 0.0004421188843047048, + "loss": 2.9869096279144287, + "step": 8870, + "token_acc": 0.3008218522123387 + }, + { + "epoch": 5.199941366168279, + "grad_norm": 0.18063861123395328, + "learning_rate": 0.00044210337899308887, + "loss": 3.007112979888916, + "step": 8871, + "token_acc": 0.29686860601880327 + }, + { + "epoch": 5.200527704485488, + "grad_norm": 0.19092886981454182, + "learning_rate": 0.00044208787187692057, + "loss": 2.970839262008667, + "step": 8872, + "token_acc": 0.30168692311011064 + }, + { + "epoch": 5.201114042802697, + "grad_norm": 0.18618914737788142, + "learning_rate": 0.00044207236295634554, + "loss": 3.0074117183685303, + "step": 8873, + "token_acc": 0.2970973308405062 + }, + { + "epoch": 5.201700381119906, + "grad_norm": 0.17198400316198803, + "learning_rate": 0.00044205685223150956, + "loss": 2.963113307952881, + "step": 8874, + "token_acc": 0.30570938870798636 + }, + { + "epoch": 5.202286719437115, + "grad_norm": 0.16754572948298388, + "learning_rate": 0.0004420413397025582, + "loss": 3.021437644958496, + "step": 8875, + "token_acc": 0.29643082712176977 + }, + { + "epoch": 5.202873057754324, + "grad_norm": 0.182910722509467, + "learning_rate": 0.0004420258253696372, + "loss": 2.9806041717529297, + "step": 8876, + "token_acc": 0.29928805287867555 + }, + { + "epoch": 5.203459396071533, + "grad_norm": 0.1608751009547726, + "learning_rate": 0.00044201030923289234, + "loss": 2.9849867820739746, + "step": 8877, + "token_acc": 0.30099584124644246 + }, + { + "epoch": 5.2040457343887425, + "grad_norm": 0.17162172630345301, + "learning_rate": 0.0004419947912924694, + "loss": 2.9662232398986816, + "step": 8878, + "token_acc": 0.30181101250398756 + }, + { + "epoch": 5.204632072705952, + "grad_norm": 0.1812353648798643, + "learning_rate": 0.00044197927154851404, + "loss": 2.9623656272888184, + "step": 8879, + "token_acc": 0.3032062071085775 + }, + { + "epoch": 5.205218411023161, + "grad_norm": 0.1823312978791871, + "learning_rate": 0.0004419637500011722, + "loss": 2.986788511276245, + "step": 8880, + "token_acc": 0.30140712359573957 + }, + { + "epoch": 5.205804749340369, + "grad_norm": 0.1697103545320523, + "learning_rate": 0.0004419482266505895, + "loss": 2.9684510231018066, + "step": 8881, + "token_acc": 0.3030649594247828 + }, + { + "epoch": 5.206391087657578, + "grad_norm": 0.18503390846837403, + "learning_rate": 0.000441932701496912, + "loss": 2.9884328842163086, + "step": 8882, + "token_acc": 0.30068852200498014 + }, + { + "epoch": 5.206977425974787, + "grad_norm": 0.1838669596996058, + "learning_rate": 0.00044191717454028533, + "loss": 2.9981112480163574, + "step": 8883, + "token_acc": 0.298980003908031 + }, + { + "epoch": 5.207563764291996, + "grad_norm": 0.16383488848890274, + "learning_rate": 0.0004419016457808554, + "loss": 3.0084104537963867, + "step": 8884, + "token_acc": 0.29859226316994214 + }, + { + "epoch": 5.208150102609205, + "grad_norm": 0.1782335823502246, + "learning_rate": 0.00044188611521876807, + "loss": 3.0121350288391113, + "step": 8885, + "token_acc": 0.2961333942488869 + }, + { + "epoch": 5.2087364409264145, + "grad_norm": 0.1607014009834479, + "learning_rate": 0.00044187058285416936, + "loss": 2.967247486114502, + "step": 8886, + "token_acc": 0.30333722761212983 + }, + { + "epoch": 5.209322779243624, + "grad_norm": 0.17535083303147614, + "learning_rate": 0.00044185504868720504, + "loss": 3.0083847045898438, + "step": 8887, + "token_acc": 0.2994988005483208 + }, + { + "epoch": 5.209909117560833, + "grad_norm": 0.20255302695839197, + "learning_rate": 0.00044183951271802106, + "loss": 3.0134835243225098, + "step": 8888, + "token_acc": 0.2964723190250621 + }, + { + "epoch": 5.210495455878042, + "grad_norm": 0.16376752223380722, + "learning_rate": 0.0004418239749467633, + "loss": 2.950873374938965, + "step": 8889, + "token_acc": 0.3054603000153231 + }, + { + "epoch": 5.211081794195251, + "grad_norm": 0.19316398726621323, + "learning_rate": 0.00044180843537357787, + "loss": 3.043321132659912, + "step": 8890, + "token_acc": 0.2929329915781765 + }, + { + "epoch": 5.21166813251246, + "grad_norm": 0.19767689159108195, + "learning_rate": 0.00044179289399861053, + "loss": 2.9896275997161865, + "step": 8891, + "token_acc": 0.29901280984519296 + }, + { + "epoch": 5.212254470829668, + "grad_norm": 0.23145129203068807, + "learning_rate": 0.00044177735082200753, + "loss": 2.986607313156128, + "step": 8892, + "token_acc": 0.29773632546313844 + }, + { + "epoch": 5.212840809146877, + "grad_norm": 0.16424175028920202, + "learning_rate": 0.0004417618058439147, + "loss": 3.0072226524353027, + "step": 8893, + "token_acc": 0.29735308691700474 + }, + { + "epoch": 5.2134271474640865, + "grad_norm": 0.16600318774178754, + "learning_rate": 0.00044174625906447806, + "loss": 3.036065101623535, + "step": 8894, + "token_acc": 0.2934841558844038 + }, + { + "epoch": 5.214013485781296, + "grad_norm": 0.17662863720933394, + "learning_rate": 0.0004417307104838438, + "loss": 2.970824956893921, + "step": 8895, + "token_acc": 0.3025347094270194 + }, + { + "epoch": 5.214599824098505, + "grad_norm": 0.20370745434552467, + "learning_rate": 0.0004417151601021578, + "loss": 3.003786325454712, + "step": 8896, + "token_acc": 0.2956347967276588 + }, + { + "epoch": 5.215186162415714, + "grad_norm": 0.23450451072540401, + "learning_rate": 0.0004416996079195662, + "loss": 2.9558749198913574, + "step": 8897, + "token_acc": 0.30644454868462995 + }, + { + "epoch": 5.215772500732923, + "grad_norm": 0.21807072201181496, + "learning_rate": 0.00044168405393621516, + "loss": 2.9484782218933105, + "step": 8898, + "token_acc": 0.30556206208245074 + }, + { + "epoch": 5.216358839050132, + "grad_norm": 0.1645682979787322, + "learning_rate": 0.00044166849815225066, + "loss": 2.9944329261779785, + "step": 8899, + "token_acc": 0.2993786516310395 + }, + { + "epoch": 5.216945177367341, + "grad_norm": 0.25450867419561446, + "learning_rate": 0.00044165294056781895, + "loss": 2.9656882286071777, + "step": 8900, + "token_acc": 0.3034319047923187 + }, + { + "epoch": 5.21753151568455, + "grad_norm": 0.18298559766996206, + "learning_rate": 0.00044163738118306606, + "loss": 2.980564594268799, + "step": 8901, + "token_acc": 0.3022960717346534 + }, + { + "epoch": 5.218117854001759, + "grad_norm": 0.18654995936683463, + "learning_rate": 0.00044162181999813835, + "loss": 2.972099781036377, + "step": 8902, + "token_acc": 0.30253908712646577 + }, + { + "epoch": 5.218704192318968, + "grad_norm": 0.2230033722583745, + "learning_rate": 0.00044160625701318167, + "loss": 3.0208468437194824, + "step": 8903, + "token_acc": 0.29435814611378175 + }, + { + "epoch": 5.219290530636177, + "grad_norm": 0.1702957777606774, + "learning_rate": 0.00044159069222834247, + "loss": 2.9655423164367676, + "step": 8904, + "token_acc": 0.3044766679479735 + }, + { + "epoch": 5.219876868953386, + "grad_norm": 0.20527997158592426, + "learning_rate": 0.0004415751256437669, + "loss": 3.014284372329712, + "step": 8905, + "token_acc": 0.2965131200722733 + }, + { + "epoch": 5.220463207270595, + "grad_norm": 0.17342505245519793, + "learning_rate": 0.00044155955725960115, + "loss": 3.0179245471954346, + "step": 8906, + "token_acc": 0.2947802911083205 + }, + { + "epoch": 5.221049545587804, + "grad_norm": 0.1611146176638883, + "learning_rate": 0.0004415439870759916, + "loss": 2.997894287109375, + "step": 8907, + "token_acc": 0.29835908333355415 + }, + { + "epoch": 5.221635883905013, + "grad_norm": 0.17740540049367778, + "learning_rate": 0.00044152841509308426, + "loss": 2.993295431137085, + "step": 8908, + "token_acc": 0.2987822660098522 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 0.15936008312579414, + "learning_rate": 0.00044151284131102564, + "loss": 3.0057215690612793, + "step": 8909, + "token_acc": 0.29852242572946447 + }, + { + "epoch": 5.222808560539431, + "grad_norm": 0.1802094448567103, + "learning_rate": 0.0004414972657299619, + "loss": 3.030078887939453, + "step": 8910, + "token_acc": 0.29605489386976774 + }, + { + "epoch": 5.2233948988566405, + "grad_norm": 0.18502441182507537, + "learning_rate": 0.00044148168835003943, + "loss": 2.9677929878234863, + "step": 8911, + "token_acc": 0.30523988434837734 + }, + { + "epoch": 5.22398123717385, + "grad_norm": 0.17013372774392807, + "learning_rate": 0.00044146610917140445, + "loss": 3.015394449234009, + "step": 8912, + "token_acc": 0.29783492404752837 + }, + { + "epoch": 5.224567575491059, + "grad_norm": 0.18967484607603383, + "learning_rate": 0.0004414505281942034, + "loss": 2.995748281478882, + "step": 8913, + "token_acc": 0.29927924578687354 + }, + { + "epoch": 5.225153913808267, + "grad_norm": 0.20999488452697718, + "learning_rate": 0.00044143494541858266, + "loss": 2.9627771377563477, + "step": 8914, + "token_acc": 0.30470339625799636 + }, + { + "epoch": 5.225740252125476, + "grad_norm": 0.1813279811064846, + "learning_rate": 0.00044141936084468857, + "loss": 3.0169663429260254, + "step": 8915, + "token_acc": 0.29530421848615357 + }, + { + "epoch": 5.226326590442685, + "grad_norm": 0.19978727939794774, + "learning_rate": 0.0004414037744726675, + "loss": 3.000596523284912, + "step": 8916, + "token_acc": 0.2980014149754277 + }, + { + "epoch": 5.226912928759894, + "grad_norm": 0.22008906941238965, + "learning_rate": 0.00044138818630266586, + "loss": 2.980915069580078, + "step": 8917, + "token_acc": 0.29976783598701584 + }, + { + "epoch": 5.227499267077103, + "grad_norm": 0.17925259503391702, + "learning_rate": 0.0004413725963348302, + "loss": 3.0163800716400146, + "step": 8918, + "token_acc": 0.29651659571646105 + }, + { + "epoch": 5.2280856053943126, + "grad_norm": 0.16288475389997062, + "learning_rate": 0.0004413570045693068, + "loss": 2.9983134269714355, + "step": 8919, + "token_acc": 0.29887425333129114 + }, + { + "epoch": 5.228671943711522, + "grad_norm": 0.16554900427244335, + "learning_rate": 0.00044134141100624227, + "loss": 3.030074119567871, + "step": 8920, + "token_acc": 0.2953712437499836 + }, + { + "epoch": 5.229258282028731, + "grad_norm": 0.19551230125111524, + "learning_rate": 0.0004413258156457829, + "loss": 2.982029438018799, + "step": 8921, + "token_acc": 0.30168336456402894 + }, + { + "epoch": 5.22984462034594, + "grad_norm": 0.17103674797047672, + "learning_rate": 0.0004413102184880754, + "loss": 3.018326759338379, + "step": 8922, + "token_acc": 0.2974741519088631 + }, + { + "epoch": 5.230430958663149, + "grad_norm": 0.1846554707296096, + "learning_rate": 0.00044129461953326617, + "loss": 2.9929158687591553, + "step": 8923, + "token_acc": 0.29943835167982974 + }, + { + "epoch": 5.231017296980358, + "grad_norm": 0.21596057627301538, + "learning_rate": 0.00044127901878150173, + "loss": 3.029737949371338, + "step": 8924, + "token_acc": 0.2959451934506281 + }, + { + "epoch": 5.231603635297566, + "grad_norm": 0.20478043341852908, + "learning_rate": 0.0004412634162329287, + "loss": 3.0275487899780273, + "step": 8925, + "token_acc": 0.2950249086682165 + }, + { + "epoch": 5.2321899736147754, + "grad_norm": 0.18603483725869288, + "learning_rate": 0.0004412478118876936, + "loss": 2.9790334701538086, + "step": 8926, + "token_acc": 0.3020004813992672 + }, + { + "epoch": 5.232776311931985, + "grad_norm": 0.21494013758739133, + "learning_rate": 0.000441232205745943, + "loss": 2.984231948852539, + "step": 8927, + "token_acc": 0.3004269037072689 + }, + { + "epoch": 5.233362650249194, + "grad_norm": 0.1984626802598579, + "learning_rate": 0.0004412165978078235, + "loss": 2.957655429840088, + "step": 8928, + "token_acc": 0.30387090621834706 + }, + { + "epoch": 5.233948988566403, + "grad_norm": 0.16402117223100196, + "learning_rate": 0.00044120098807348175, + "loss": 2.9990286827087402, + "step": 8929, + "token_acc": 0.29905224281170634 + }, + { + "epoch": 5.234535326883612, + "grad_norm": 0.20654602042874198, + "learning_rate": 0.0004411853765430644, + "loss": 2.9942474365234375, + "step": 8930, + "token_acc": 0.2988215302451835 + }, + { + "epoch": 5.235121665200821, + "grad_norm": 0.1756408942352806, + "learning_rate": 0.00044116976321671806, + "loss": 2.9616200923919678, + "step": 8931, + "token_acc": 0.3048046908728404 + }, + { + "epoch": 5.23570800351803, + "grad_norm": 0.2036433542364515, + "learning_rate": 0.0004411541480945893, + "loss": 2.9952173233032227, + "step": 8932, + "token_acc": 0.300897421569825 + }, + { + "epoch": 5.236294341835239, + "grad_norm": 0.178910159319396, + "learning_rate": 0.000441138531176825, + "loss": 3.0330865383148193, + "step": 8933, + "token_acc": 0.2943139031836964 + }, + { + "epoch": 5.236880680152448, + "grad_norm": 0.17842817647427806, + "learning_rate": 0.0004411229124635717, + "loss": 3.020102024078369, + "step": 8934, + "token_acc": 0.2958466112406758 + }, + { + "epoch": 5.237467018469657, + "grad_norm": 0.2439334544518674, + "learning_rate": 0.00044110729195497624, + "loss": 2.9836511611938477, + "step": 8935, + "token_acc": 0.3001236250673598 + }, + { + "epoch": 5.238053356786866, + "grad_norm": 0.20768452885111913, + "learning_rate": 0.0004410916696511853, + "loss": 2.9697375297546387, + "step": 8936, + "token_acc": 0.3034466440571023 + }, + { + "epoch": 5.238639695104075, + "grad_norm": 0.16610944527275565, + "learning_rate": 0.0004410760455523456, + "loss": 2.97253155708313, + "step": 8937, + "token_acc": 0.30274868831913676 + }, + { + "epoch": 5.239226033421284, + "grad_norm": 0.26550199089875925, + "learning_rate": 0.0004410604196586039, + "loss": 2.98433518409729, + "step": 8938, + "token_acc": 0.301486028734215 + }, + { + "epoch": 5.239812371738493, + "grad_norm": 0.23148763380201653, + "learning_rate": 0.00044104479197010704, + "loss": 3.0180928707122803, + "step": 8939, + "token_acc": 0.29548702087141016 + }, + { + "epoch": 5.240398710055702, + "grad_norm": 0.1689519885389431, + "learning_rate": 0.0004410291624870018, + "loss": 2.970151901245117, + "step": 8940, + "token_acc": 0.3029989658738366 + }, + { + "epoch": 5.240985048372911, + "grad_norm": 0.2566793729561637, + "learning_rate": 0.000441013531209435, + "loss": 2.960186243057251, + "step": 8941, + "token_acc": 0.3059482881643985 + }, + { + "epoch": 5.24157138669012, + "grad_norm": 0.1480607027245773, + "learning_rate": 0.0004409978981375534, + "loss": 2.9820432662963867, + "step": 8942, + "token_acc": 0.3007441695412346 + }, + { + "epoch": 5.2421577250073295, + "grad_norm": 0.2813427192433209, + "learning_rate": 0.000440982263271504, + "loss": 2.9836249351501465, + "step": 8943, + "token_acc": 0.30075547428497346 + }, + { + "epoch": 5.242744063324539, + "grad_norm": 0.18084629297810112, + "learning_rate": 0.0004409666266114336, + "loss": 3.0156726837158203, + "step": 8944, + "token_acc": 0.2977267806884607 + }, + { + "epoch": 5.243330401641748, + "grad_norm": 0.2590930547130295, + "learning_rate": 0.000440950988157489, + "loss": 3.019381523132324, + "step": 8945, + "token_acc": 0.29628158969835383 + }, + { + "epoch": 5.243916739958956, + "grad_norm": 0.21238467189924815, + "learning_rate": 0.0004409353479098172, + "loss": 2.9954915046691895, + "step": 8946, + "token_acc": 0.2999702526400782 + }, + { + "epoch": 5.244503078276165, + "grad_norm": 0.21909336429574225, + "learning_rate": 0.0004409197058685651, + "loss": 2.9572606086730957, + "step": 8947, + "token_acc": 0.3049026728272011 + }, + { + "epoch": 5.245089416593374, + "grad_norm": 0.18403618719413634, + "learning_rate": 0.00044090406203387965, + "loss": 3.008298873901367, + "step": 8948, + "token_acc": 0.2969490476238799 + }, + { + "epoch": 5.245675754910583, + "grad_norm": 0.20885422417421476, + "learning_rate": 0.00044088841640590776, + "loss": 2.993046283721924, + "step": 8949, + "token_acc": 0.29913778788304574 + }, + { + "epoch": 5.246262093227792, + "grad_norm": 0.16845572934840913, + "learning_rate": 0.00044087276898479644, + "loss": 2.9792304039001465, + "step": 8950, + "token_acc": 0.3001878658282922 + }, + { + "epoch": 5.2468484315450015, + "grad_norm": 0.25750556366653965, + "learning_rate": 0.00044085711977069267, + "loss": 2.963646650314331, + "step": 8951, + "token_acc": 0.3031053964543152 + }, + { + "epoch": 5.247434769862211, + "grad_norm": 0.1715417793121581, + "learning_rate": 0.0004408414687637434, + "loss": 3.00197696685791, + "step": 8952, + "token_acc": 0.2970641673483221 + }, + { + "epoch": 5.24802110817942, + "grad_norm": 0.21375651044166544, + "learning_rate": 0.0004408258159640957, + "loss": 3.0023531913757324, + "step": 8953, + "token_acc": 0.2989688588653051 + }, + { + "epoch": 5.248607446496629, + "grad_norm": 0.15817136100485737, + "learning_rate": 0.00044081016137189667, + "loss": 3.018862247467041, + "step": 8954, + "token_acc": 0.2944671701279631 + }, + { + "epoch": 5.249193784813838, + "grad_norm": 0.22573715221745091, + "learning_rate": 0.00044079450498729324, + "loss": 2.9969987869262695, + "step": 8955, + "token_acc": 0.29888204365606247 + }, + { + "epoch": 5.249780123131047, + "grad_norm": 0.15697446680234783, + "learning_rate": 0.0004407788468104325, + "loss": 3.039738655090332, + "step": 8956, + "token_acc": 0.29376139494715486 + }, + { + "epoch": 5.250366461448255, + "grad_norm": 0.19854052336317432, + "learning_rate": 0.0004407631868414616, + "loss": 2.9562487602233887, + "step": 8957, + "token_acc": 0.3041206195526397 + }, + { + "epoch": 5.250952799765464, + "grad_norm": 0.15438002685162322, + "learning_rate": 0.00044074752508052763, + "loss": 2.9413490295410156, + "step": 8958, + "token_acc": 0.3079357478216513 + }, + { + "epoch": 5.2515391380826735, + "grad_norm": 0.20102382666314142, + "learning_rate": 0.00044073186152777764, + "loss": 3.018157482147217, + "step": 8959, + "token_acc": 0.295324852375443 + }, + { + "epoch": 5.252125476399883, + "grad_norm": 0.17257851015975986, + "learning_rate": 0.00044071619618335896, + "loss": 2.957867383956909, + "step": 8960, + "token_acc": 0.30593616033640064 + }, + { + "epoch": 5.252711814717092, + "grad_norm": 0.2049432821868892, + "learning_rate": 0.00044070052904741854, + "loss": 2.9966557025909424, + "step": 8961, + "token_acc": 0.2991678138552783 + }, + { + "epoch": 5.253298153034301, + "grad_norm": 0.17512283238701976, + "learning_rate": 0.00044068486012010356, + "loss": 3.0010833740234375, + "step": 8962, + "token_acc": 0.2994248168663683 + }, + { + "epoch": 5.25388449135151, + "grad_norm": 0.23746866372384712, + "learning_rate": 0.0004406691894015613, + "loss": 2.9798941612243652, + "step": 8963, + "token_acc": 0.3015160022459292 + }, + { + "epoch": 5.254470829668719, + "grad_norm": 0.1783355414211252, + "learning_rate": 0.000440653516891939, + "loss": 3.007066249847412, + "step": 8964, + "token_acc": 0.2958862356432046 + }, + { + "epoch": 5.255057167985928, + "grad_norm": 0.20807832795203093, + "learning_rate": 0.0004406378425913837, + "loss": 2.976379156112671, + "step": 8965, + "token_acc": 0.30120299633536185 + }, + { + "epoch": 5.255643506303137, + "grad_norm": 0.16235502430411747, + "learning_rate": 0.00044062216650004286, + "loss": 2.9832100868225098, + "step": 8966, + "token_acc": 0.30056759615358736 + }, + { + "epoch": 5.256229844620346, + "grad_norm": 0.2473962715130966, + "learning_rate": 0.0004406064886180636, + "loss": 2.9909369945526123, + "step": 8967, + "token_acc": 0.2990887254991841 + }, + { + "epoch": 5.256816182937555, + "grad_norm": 0.20485010882088825, + "learning_rate": 0.0004405908089455932, + "loss": 2.9975104331970215, + "step": 8968, + "token_acc": 0.2990148296802106 + }, + { + "epoch": 5.257402521254764, + "grad_norm": 0.19310738514596032, + "learning_rate": 0.00044057512748277907, + "loss": 2.9998779296875, + "step": 8969, + "token_acc": 0.29703188175250217 + }, + { + "epoch": 5.257988859571973, + "grad_norm": 0.21991828785208, + "learning_rate": 0.00044055944422976835, + "loss": 3.00276255607605, + "step": 8970, + "token_acc": 0.2998272101819561 + }, + { + "epoch": 5.258575197889182, + "grad_norm": 0.17922257979645323, + "learning_rate": 0.0004405437591867084, + "loss": 3.0046751499176025, + "step": 8971, + "token_acc": 0.29758238295623185 + }, + { + "epoch": 5.259161536206391, + "grad_norm": 0.19963587343188294, + "learning_rate": 0.0004405280723537467, + "loss": 2.9805684089660645, + "step": 8972, + "token_acc": 0.30155046056612655 + }, + { + "epoch": 5.2597478745236, + "grad_norm": 0.1566887805441179, + "learning_rate": 0.0004405123837310304, + "loss": 3.0134499073028564, + "step": 8973, + "token_acc": 0.2961902849342508 + }, + { + "epoch": 5.260334212840809, + "grad_norm": 0.19050060204867159, + "learning_rate": 0.000440496693318707, + "loss": 3.0177907943725586, + "step": 8974, + "token_acc": 0.29542935062636944 + }, + { + "epoch": 5.260920551158018, + "grad_norm": 0.16947322314830734, + "learning_rate": 0.0004404810011169238, + "loss": 2.9758706092834473, + "step": 8975, + "token_acc": 0.3005742833378365 + }, + { + "epoch": 5.2615068894752275, + "grad_norm": 0.16374807215274723, + "learning_rate": 0.00044046530712582837, + "loss": 2.969801664352417, + "step": 8976, + "token_acc": 0.30163080044302293 + }, + { + "epoch": 5.262093227792437, + "grad_norm": 0.17665361265205137, + "learning_rate": 0.00044044961134556807, + "loss": 2.9777979850769043, + "step": 8977, + "token_acc": 0.30307320423883005 + }, + { + "epoch": 5.262679566109645, + "grad_norm": 0.18098293093167578, + "learning_rate": 0.00044043391377629015, + "loss": 2.998828411102295, + "step": 8978, + "token_acc": 0.2994901124017739 + }, + { + "epoch": 5.263265904426854, + "grad_norm": 0.1762123359405021, + "learning_rate": 0.00044041821441814235, + "loss": 2.9996912479400635, + "step": 8979, + "token_acc": 0.30041819710012313 + }, + { + "epoch": 5.263852242744063, + "grad_norm": 0.19713829118857354, + "learning_rate": 0.00044040251327127195, + "loss": 2.9943418502807617, + "step": 8980, + "token_acc": 0.29986187410060605 + }, + { + "epoch": 5.264438581061272, + "grad_norm": 0.17523977379726896, + "learning_rate": 0.00044038681033582656, + "loss": 3.0196452140808105, + "step": 8981, + "token_acc": 0.29542695799028923 + }, + { + "epoch": 5.265024919378481, + "grad_norm": 0.20990598627151094, + "learning_rate": 0.0004403711056119536, + "loss": 2.9773290157318115, + "step": 8982, + "token_acc": 0.3026007372985186 + }, + { + "epoch": 5.26561125769569, + "grad_norm": 0.1680517606360513, + "learning_rate": 0.00044035539909980065, + "loss": 2.9834506511688232, + "step": 8983, + "token_acc": 0.30214787332366083 + }, + { + "epoch": 5.2661975960128995, + "grad_norm": 0.1885221434714052, + "learning_rate": 0.0004403396907995152, + "loss": 2.9803786277770996, + "step": 8984, + "token_acc": 0.3029879139102925 + }, + { + "epoch": 5.266783934330109, + "grad_norm": 0.1614341785092233, + "learning_rate": 0.0004403239807112449, + "loss": 2.9949960708618164, + "step": 8985, + "token_acc": 0.2997863567920408 + }, + { + "epoch": 5.267370272647318, + "grad_norm": 0.19464696906147128, + "learning_rate": 0.0004403082688351372, + "loss": 3.0128018856048584, + "step": 8986, + "token_acc": 0.2963412745433046 + }, + { + "epoch": 5.267956610964527, + "grad_norm": 0.16907962032436916, + "learning_rate": 0.00044029255517133984, + "loss": 2.973231077194214, + "step": 8987, + "token_acc": 0.30173226475472714 + }, + { + "epoch": 5.268542949281736, + "grad_norm": 0.1751836547814023, + "learning_rate": 0.00044027683972000026, + "loss": 2.9882876873016357, + "step": 8988, + "token_acc": 0.2996053588119223 + }, + { + "epoch": 5.269129287598945, + "grad_norm": 0.16348007087086996, + "learning_rate": 0.00044026112248126624, + "loss": 2.986239194869995, + "step": 8989, + "token_acc": 0.30013280910991597 + }, + { + "epoch": 5.269715625916153, + "grad_norm": 0.16097578088024656, + "learning_rate": 0.0004402454034552853, + "loss": 2.9820144176483154, + "step": 8990, + "token_acc": 0.3003335074954705 + }, + { + "epoch": 5.270301964233362, + "grad_norm": 0.18900533189058852, + "learning_rate": 0.0004402296826422052, + "loss": 2.9793245792388916, + "step": 8991, + "token_acc": 0.3009920481259509 + }, + { + "epoch": 5.2708883025505715, + "grad_norm": 0.1606179658613837, + "learning_rate": 0.00044021396004217353, + "loss": 2.995755195617676, + "step": 8992, + "token_acc": 0.2992480379059865 + }, + { + "epoch": 5.271474640867781, + "grad_norm": 0.18573600184805467, + "learning_rate": 0.000440198235655338, + "loss": 2.9625377655029297, + "step": 8993, + "token_acc": 0.30563432052089895 + }, + { + "epoch": 5.27206097918499, + "grad_norm": 0.249366232030939, + "learning_rate": 0.0004401825094818465, + "loss": 3.0017154216766357, + "step": 8994, + "token_acc": 0.2982271864839338 + }, + { + "epoch": 5.272647317502199, + "grad_norm": 0.19587143004766586, + "learning_rate": 0.0004401667815218464, + "loss": 3.040799617767334, + "step": 8995, + "token_acc": 0.292670654364538 + }, + { + "epoch": 5.273233655819408, + "grad_norm": 0.17018425703692142, + "learning_rate": 0.0004401510517754857, + "loss": 3.045501232147217, + "step": 8996, + "token_acc": 0.2927826011648299 + }, + { + "epoch": 5.273819994136617, + "grad_norm": 0.19111976408633977, + "learning_rate": 0.00044013532024291213, + "loss": 2.998466730117798, + "step": 8997, + "token_acc": 0.2992639759516392 + }, + { + "epoch": 5.274406332453826, + "grad_norm": 0.19311184305396234, + "learning_rate": 0.00044011958692427344, + "loss": 3.0144405364990234, + "step": 8998, + "token_acc": 0.29683231344418404 + }, + { + "epoch": 5.274992670771035, + "grad_norm": 0.20402747957454556, + "learning_rate": 0.00044010385181971737, + "loss": 2.973325729370117, + "step": 8999, + "token_acc": 0.30287773596904977 + }, + { + "epoch": 5.2755790090882435, + "grad_norm": 0.21138347478060932, + "learning_rate": 0.00044008811492939184, + "loss": 2.994374990463257, + "step": 9000, + "token_acc": 0.3006901449683932 + }, + { + "epoch": 5.276165347405453, + "grad_norm": 0.17444842050541656, + "learning_rate": 0.0004400723762534446, + "loss": 2.9771082401275635, + "step": 9001, + "token_acc": 0.30027888144799214 + }, + { + "epoch": 5.276751685722662, + "grad_norm": 0.1793392358979344, + "learning_rate": 0.0004400566357920235, + "loss": 2.985520601272583, + "step": 9002, + "token_acc": 0.300737859041691 + }, + { + "epoch": 5.277338024039871, + "grad_norm": 0.23198400222942003, + "learning_rate": 0.00044004089354527645, + "loss": 3.0121638774871826, + "step": 9003, + "token_acc": 0.298374639892676 + }, + { + "epoch": 5.27792436235708, + "grad_norm": 0.19786139974896397, + "learning_rate": 0.0004400251495133512, + "loss": 2.980691909790039, + "step": 9004, + "token_acc": 0.3015544482546709 + }, + { + "epoch": 5.278510700674289, + "grad_norm": 0.16310231858834656, + "learning_rate": 0.0004400094036963958, + "loss": 3.0186262130737305, + "step": 9005, + "token_acc": 0.295912923391883 + }, + { + "epoch": 5.279097038991498, + "grad_norm": 0.2226138662969139, + "learning_rate": 0.0004399936560945581, + "loss": 3.0005500316619873, + "step": 9006, + "token_acc": 0.29972588308095144 + }, + { + "epoch": 5.279683377308707, + "grad_norm": 0.2086619740968379, + "learning_rate": 0.00043997790670798596, + "loss": 2.9874324798583984, + "step": 9007, + "token_acc": 0.30122326149264456 + }, + { + "epoch": 5.280269715625916, + "grad_norm": 0.2018533613990206, + "learning_rate": 0.00043996215553682746, + "loss": 3.01945161819458, + "step": 9008, + "token_acc": 0.29521037504527486 + }, + { + "epoch": 5.2808560539431255, + "grad_norm": 0.29030530268818694, + "learning_rate": 0.00043994640258123043, + "loss": 3.0150246620178223, + "step": 9009, + "token_acc": 0.294196137681103 + }, + { + "epoch": 5.281442392260335, + "grad_norm": 0.22786349535474545, + "learning_rate": 0.00043993064784134296, + "loss": 2.9889488220214844, + "step": 9010, + "token_acc": 0.29927417600066314 + }, + { + "epoch": 5.282028730577543, + "grad_norm": 0.184376233021802, + "learning_rate": 0.0004399148913173129, + "loss": 3.003593921661377, + "step": 9011, + "token_acc": 0.2980827694547884 + }, + { + "epoch": 5.282615068894752, + "grad_norm": 0.23196170676110875, + "learning_rate": 0.0004398991330092884, + "loss": 2.987874984741211, + "step": 9012, + "token_acc": 0.30077175970867276 + }, + { + "epoch": 5.283201407211961, + "grad_norm": 0.1678465415284779, + "learning_rate": 0.00043988337291741745, + "loss": 2.9983150959014893, + "step": 9013, + "token_acc": 0.29832164295316305 + }, + { + "epoch": 5.28378774552917, + "grad_norm": 0.26437958153503277, + "learning_rate": 0.00043986761104184807, + "loss": 3.0335121154785156, + "step": 9014, + "token_acc": 0.2949338983447906 + }, + { + "epoch": 5.284374083846379, + "grad_norm": 0.19310288010014848, + "learning_rate": 0.00043985184738272833, + "loss": 2.9948813915252686, + "step": 9015, + "token_acc": 0.30107556716997697 + }, + { + "epoch": 5.284960422163588, + "grad_norm": 0.21055216629100604, + "learning_rate": 0.0004398360819402063, + "loss": 3.0090439319610596, + "step": 9016, + "token_acc": 0.298008533003716 + }, + { + "epoch": 5.2855467604807975, + "grad_norm": 0.20108333680752324, + "learning_rate": 0.00043982031471443014, + "loss": 2.9878458976745605, + "step": 9017, + "token_acc": 0.300442637688266 + }, + { + "epoch": 5.286133098798007, + "grad_norm": 0.18783309491288122, + "learning_rate": 0.0004398045457055478, + "loss": 2.951472759246826, + "step": 9018, + "token_acc": 0.30635349120528127 + }, + { + "epoch": 5.286719437115216, + "grad_norm": 0.18916680988470463, + "learning_rate": 0.0004397887749137076, + "loss": 2.9853310585021973, + "step": 9019, + "token_acc": 0.30047829982710533 + }, + { + "epoch": 5.287305775432425, + "grad_norm": 0.16452639685793738, + "learning_rate": 0.0004397730023390576, + "loss": 2.9758191108703613, + "step": 9020, + "token_acc": 0.3014142738296317 + }, + { + "epoch": 5.287892113749633, + "grad_norm": 0.19148341471042932, + "learning_rate": 0.000439757227981746, + "loss": 2.998140811920166, + "step": 9021, + "token_acc": 0.299276188097431 + }, + { + "epoch": 5.288478452066842, + "grad_norm": 0.17677828606576196, + "learning_rate": 0.0004397414518419209, + "loss": 2.9652743339538574, + "step": 9022, + "token_acc": 0.3043837618762873 + }, + { + "epoch": 5.289064790384051, + "grad_norm": 0.18182722780218183, + "learning_rate": 0.00043972567391973054, + "loss": 2.9920711517333984, + "step": 9023, + "token_acc": 0.2978256366731837 + }, + { + "epoch": 5.28965112870126, + "grad_norm": 0.190047599058359, + "learning_rate": 0.00043970989421532314, + "loss": 3.048684597015381, + "step": 9024, + "token_acc": 0.2915355610453777 + }, + { + "epoch": 5.2902374670184695, + "grad_norm": 0.2128984464923535, + "learning_rate": 0.00043969411272884695, + "loss": 3.0161798000335693, + "step": 9025, + "token_acc": 0.2956715846200584 + }, + { + "epoch": 5.290823805335679, + "grad_norm": 0.2076837400338364, + "learning_rate": 0.0004396783294604502, + "loss": 2.9954628944396973, + "step": 9026, + "token_acc": 0.29907776197882346 + }, + { + "epoch": 5.291410143652888, + "grad_norm": 0.17352002175367803, + "learning_rate": 0.0004396625444102811, + "loss": 2.988490343093872, + "step": 9027, + "token_acc": 0.300507614213198 + }, + { + "epoch": 5.291996481970097, + "grad_norm": 0.16357906684042695, + "learning_rate": 0.0004396467575784879, + "loss": 3.040863513946533, + "step": 9028, + "token_acc": 0.29191863517745986 + }, + { + "epoch": 5.292582820287306, + "grad_norm": 0.16871807293084873, + "learning_rate": 0.00043963096896521903, + "loss": 2.991218090057373, + "step": 9029, + "token_acc": 0.29855056006618597 + }, + { + "epoch": 5.293169158604515, + "grad_norm": 0.15837612569619025, + "learning_rate": 0.0004396151785706227, + "loss": 3.007265329360962, + "step": 9030, + "token_acc": 0.2961740328211461 + }, + { + "epoch": 5.293755496921724, + "grad_norm": 0.1693331351156298, + "learning_rate": 0.00043959938639484737, + "loss": 2.9969210624694824, + "step": 9031, + "token_acc": 0.29905991868572585 + }, + { + "epoch": 5.294341835238933, + "grad_norm": 0.15884172461139925, + "learning_rate": 0.00043958359243804123, + "loss": 2.961029052734375, + "step": 9032, + "token_acc": 0.3057396982778822 + }, + { + "epoch": 5.2949281735561415, + "grad_norm": 0.1562626247102422, + "learning_rate": 0.00043956779670035275, + "loss": 3.016420364379883, + "step": 9033, + "token_acc": 0.2973867256727824 + }, + { + "epoch": 5.295514511873351, + "grad_norm": 0.16909214382619767, + "learning_rate": 0.00043955199918193017, + "loss": 2.964839458465576, + "step": 9034, + "token_acc": 0.3024468323609612 + }, + { + "epoch": 5.29610085019056, + "grad_norm": 0.16909519755189947, + "learning_rate": 0.00043953619988292204, + "loss": 2.970378875732422, + "step": 9035, + "token_acc": 0.3040509468571576 + }, + { + "epoch": 5.296687188507769, + "grad_norm": 0.16752595514924642, + "learning_rate": 0.0004395203988034767, + "loss": 2.984152317047119, + "step": 9036, + "token_acc": 0.30099869401551815 + }, + { + "epoch": 5.297273526824978, + "grad_norm": 0.17199017639625228, + "learning_rate": 0.00043950459594374266, + "loss": 2.991776466369629, + "step": 9037, + "token_acc": 0.29917025401297476 + }, + { + "epoch": 5.297859865142187, + "grad_norm": 0.16622895257561343, + "learning_rate": 0.0004394887913038682, + "loss": 2.9983131885528564, + "step": 9038, + "token_acc": 0.29719993535505196 + }, + { + "epoch": 5.298446203459396, + "grad_norm": 0.15446779685894196, + "learning_rate": 0.00043947298488400193, + "loss": 2.983732223510742, + "step": 9039, + "token_acc": 0.3007544608579208 + }, + { + "epoch": 5.299032541776605, + "grad_norm": 0.16341371397975465, + "learning_rate": 0.0004394571766842923, + "loss": 2.9753713607788086, + "step": 9040, + "token_acc": 0.3033618070771543 + }, + { + "epoch": 5.299618880093814, + "grad_norm": 0.1582565193448118, + "learning_rate": 0.00043944136670488775, + "loss": 2.9925074577331543, + "step": 9041, + "token_acc": 0.2988138734166553 + }, + { + "epoch": 5.3002052184110235, + "grad_norm": 0.2158839265510315, + "learning_rate": 0.00043942555494593693, + "loss": 3.013381004333496, + "step": 9042, + "token_acc": 0.29659904260264947 + }, + { + "epoch": 5.300791556728232, + "grad_norm": 0.2955990565102046, + "learning_rate": 0.0004394097414075882, + "loss": 2.9416894912719727, + "step": 9043, + "token_acc": 0.30741741596830147 + }, + { + "epoch": 5.301377895045441, + "grad_norm": 0.41102855052490916, + "learning_rate": 0.0004393939260899902, + "loss": 3.014218330383301, + "step": 9044, + "token_acc": 0.29890883937788265 + }, + { + "epoch": 5.30196423336265, + "grad_norm": 0.24742143524284269, + "learning_rate": 0.0004393781089932915, + "loss": 2.9374709129333496, + "step": 9045, + "token_acc": 0.306773796104543 + }, + { + "epoch": 5.302550571679859, + "grad_norm": 0.19923540453812247, + "learning_rate": 0.00043936229011764063, + "loss": 2.993546485900879, + "step": 9046, + "token_acc": 0.2991383160116921 + }, + { + "epoch": 5.303136909997068, + "grad_norm": 0.18734599103105803, + "learning_rate": 0.0004393464694631862, + "loss": 3.005157470703125, + "step": 9047, + "token_acc": 0.29651369997819066 + }, + { + "epoch": 5.303723248314277, + "grad_norm": 0.19786661598284475, + "learning_rate": 0.00043933064703007685, + "loss": 3.0181193351745605, + "step": 9048, + "token_acc": 0.29609156895589056 + }, + { + "epoch": 5.304309586631486, + "grad_norm": 0.1849158239155117, + "learning_rate": 0.0004393148228184612, + "loss": 2.9698729515075684, + "step": 9049, + "token_acc": 0.30432419903162666 + }, + { + "epoch": 5.3048959249486956, + "grad_norm": 0.22765277315197124, + "learning_rate": 0.0004392989968284879, + "loss": 2.9522218704223633, + "step": 9050, + "token_acc": 0.30494219586338744 + }, + { + "epoch": 5.305482263265905, + "grad_norm": 0.179471174950499, + "learning_rate": 0.0004392831690603056, + "loss": 3.0216903686523438, + "step": 9051, + "token_acc": 0.29581894408864573 + }, + { + "epoch": 5.306068601583114, + "grad_norm": 0.21619072079289595, + "learning_rate": 0.00043926733951406294, + "loss": 2.97031831741333, + "step": 9052, + "token_acc": 0.30289786121123663 + }, + { + "epoch": 5.306654939900323, + "grad_norm": 0.1862303326501486, + "learning_rate": 0.00043925150818990876, + "loss": 3.005505084991455, + "step": 9053, + "token_acc": 0.29801860394766694 + }, + { + "epoch": 5.307241278217531, + "grad_norm": 0.2451071186387691, + "learning_rate": 0.00043923567508799164, + "loss": 3.001654624938965, + "step": 9054, + "token_acc": 0.29795734210519415 + }, + { + "epoch": 5.30782761653474, + "grad_norm": 0.18561520212786708, + "learning_rate": 0.00043921984020846034, + "loss": 3.005300998687744, + "step": 9055, + "token_acc": 0.2967561583612746 + }, + { + "epoch": 5.308413954851949, + "grad_norm": 0.17125981233882595, + "learning_rate": 0.0004392040035514636, + "loss": 2.987459182739258, + "step": 9056, + "token_acc": 0.30125220509557826 + }, + { + "epoch": 5.3090002931691584, + "grad_norm": 0.16146577639858442, + "learning_rate": 0.0004391881651171503, + "loss": 2.992894172668457, + "step": 9057, + "token_acc": 0.2997772475365521 + }, + { + "epoch": 5.309586631486368, + "grad_norm": 0.17722148214195074, + "learning_rate": 0.000439172324905669, + "loss": 2.9894490242004395, + "step": 9058, + "token_acc": 0.30025777544721316 + }, + { + "epoch": 5.310172969803577, + "grad_norm": 0.16434965200368964, + "learning_rate": 0.0004391564829171687, + "loss": 3.050797462463379, + "step": 9059, + "token_acc": 0.2911740963791437 + }, + { + "epoch": 5.310759308120786, + "grad_norm": 0.1801426116645019, + "learning_rate": 0.0004391406391517981, + "loss": 3.003714084625244, + "step": 9060, + "token_acc": 0.29981185863860704 + }, + { + "epoch": 5.311345646437995, + "grad_norm": 0.17193464635070643, + "learning_rate": 0.00043912479360970605, + "loss": 2.9819345474243164, + "step": 9061, + "token_acc": 0.30195319040905727 + }, + { + "epoch": 5.311931984755204, + "grad_norm": 0.16890188659143965, + "learning_rate": 0.0004391089462910414, + "loss": 2.9901323318481445, + "step": 9062, + "token_acc": 0.29958821971247623 + }, + { + "epoch": 5.312518323072413, + "grad_norm": 0.1588347044790321, + "learning_rate": 0.0004390930971959531, + "loss": 3.0074849128723145, + "step": 9063, + "token_acc": 0.2977312790220008 + }, + { + "epoch": 5.313104661389621, + "grad_norm": 0.20184595665893776, + "learning_rate": 0.0004390772463245899, + "loss": 3.038729429244995, + "step": 9064, + "token_acc": 0.2940239404069497 + }, + { + "epoch": 5.3136909997068305, + "grad_norm": 0.1853595761019235, + "learning_rate": 0.0004390613936771007, + "loss": 3.005371570587158, + "step": 9065, + "token_acc": 0.2974896062283187 + }, + { + "epoch": 5.31427733802404, + "grad_norm": 0.17181148626626036, + "learning_rate": 0.00043904553925363463, + "loss": 2.994385004043579, + "step": 9066, + "token_acc": 0.29981219327796194 + }, + { + "epoch": 5.314863676341249, + "grad_norm": 0.16695814851275453, + "learning_rate": 0.00043902968305434037, + "loss": 2.994967460632324, + "step": 9067, + "token_acc": 0.29825487668290734 + }, + { + "epoch": 5.315450014658458, + "grad_norm": 0.17091380657950658, + "learning_rate": 0.00043901382507936695, + "loss": 3.0300703048706055, + "step": 9068, + "token_acc": 0.2957530805735992 + }, + { + "epoch": 5.316036352975667, + "grad_norm": 0.16914799738598335, + "learning_rate": 0.0004389979653288634, + "loss": 2.9900264739990234, + "step": 9069, + "token_acc": 0.29975252007001874 + }, + { + "epoch": 5.316622691292876, + "grad_norm": 0.17175649558983785, + "learning_rate": 0.00043898210380297856, + "loss": 2.979917526245117, + "step": 9070, + "token_acc": 0.302007556154995 + }, + { + "epoch": 5.317209029610085, + "grad_norm": 0.19096726336641856, + "learning_rate": 0.00043896624050186153, + "loss": 2.973921298980713, + "step": 9071, + "token_acc": 0.30105595583902744 + }, + { + "epoch": 5.317795367927294, + "grad_norm": 0.19104873045880574, + "learning_rate": 0.00043895037542566133, + "loss": 3.014247417449951, + "step": 9072, + "token_acc": 0.2959433058998717 + }, + { + "epoch": 5.318381706244503, + "grad_norm": 0.17981280137888261, + "learning_rate": 0.000438934508574527, + "loss": 2.9670023918151855, + "step": 9073, + "token_acc": 0.30469999606035536 + }, + { + "epoch": 5.3189680445617125, + "grad_norm": 0.2031681524840766, + "learning_rate": 0.00043891863994860745, + "loss": 3.0167717933654785, + "step": 9074, + "token_acc": 0.29542234418536056 + }, + { + "epoch": 5.319554382878922, + "grad_norm": 0.2029206731745043, + "learning_rate": 0.000438902769548052, + "loss": 2.9869422912597656, + "step": 9075, + "token_acc": 0.3006023337149339 + }, + { + "epoch": 5.32014072119613, + "grad_norm": 0.1665393521932243, + "learning_rate": 0.0004388868973730095, + "loss": 2.9725286960601807, + "step": 9076, + "token_acc": 0.30255266049769436 + }, + { + "epoch": 5.320727059513339, + "grad_norm": 0.2326136375892318, + "learning_rate": 0.00043887102342362905, + "loss": 2.991028308868408, + "step": 9077, + "token_acc": 0.2987586264608433 + }, + { + "epoch": 5.321313397830548, + "grad_norm": 0.35654014872848244, + "learning_rate": 0.00043885514770005996, + "loss": 2.9992032051086426, + "step": 9078, + "token_acc": 0.2984378666572466 + }, + { + "epoch": 5.321899736147757, + "grad_norm": 0.43352016405479826, + "learning_rate": 0.0004388392702024512, + "loss": 3.036896228790283, + "step": 9079, + "token_acc": 0.2930976126794435 + }, + { + "epoch": 5.322486074464966, + "grad_norm": 0.22709263576587976, + "learning_rate": 0.0004388233909309519, + "loss": 3.0080347061157227, + "step": 9080, + "token_acc": 0.29669356503021166 + }, + { + "epoch": 5.323072412782175, + "grad_norm": 0.3360876699048018, + "learning_rate": 0.0004388075098857114, + "loss": 3.030996084213257, + "step": 9081, + "token_acc": 0.29507307071053557 + }, + { + "epoch": 5.3236587510993845, + "grad_norm": 0.1713611819860275, + "learning_rate": 0.0004387916270668787, + "loss": 2.9721522331237793, + "step": 9082, + "token_acc": 0.30292086206851493 + }, + { + "epoch": 5.324245089416594, + "grad_norm": 0.27410192847292403, + "learning_rate": 0.00043877574247460304, + "loss": 3.0104458332061768, + "step": 9083, + "token_acc": 0.29868537886705276 + }, + { + "epoch": 5.324831427733803, + "grad_norm": 0.18501325999842583, + "learning_rate": 0.0004387598561090337, + "loss": 2.9424381256103516, + "step": 9084, + "token_acc": 0.30765649332926615 + }, + { + "epoch": 5.325417766051012, + "grad_norm": 0.2243954211447583, + "learning_rate": 0.0004387439679703199, + "loss": 3.0208492279052734, + "step": 9085, + "token_acc": 0.29631062516065876 + }, + { + "epoch": 5.32600410436822, + "grad_norm": 0.2142991068401102, + "learning_rate": 0.00043872807805861084, + "loss": 3.006753921508789, + "step": 9086, + "token_acc": 0.29839390276938466 + }, + { + "epoch": 5.326590442685429, + "grad_norm": 0.18678241223278796, + "learning_rate": 0.00043871218637405574, + "loss": 2.9700675010681152, + "step": 9087, + "token_acc": 0.304498306944872 + }, + { + "epoch": 5.327176781002638, + "grad_norm": 0.21918982552121088, + "learning_rate": 0.00043869629291680396, + "loss": 3.0078067779541016, + "step": 9088, + "token_acc": 0.29691341953773615 + }, + { + "epoch": 5.327763119319847, + "grad_norm": 0.1647629923234581, + "learning_rate": 0.00043868039768700485, + "loss": 2.9874019622802734, + "step": 9089, + "token_acc": 0.2998272637870504 + }, + { + "epoch": 5.3283494576370565, + "grad_norm": 0.23216085744773965, + "learning_rate": 0.0004386645006848076, + "loss": 2.9952993392944336, + "step": 9090, + "token_acc": 0.2996796118014837 + }, + { + "epoch": 5.328935795954266, + "grad_norm": 0.1845954321114459, + "learning_rate": 0.0004386486019103616, + "loss": 3.0147762298583984, + "step": 9091, + "token_acc": 0.29721920606584834 + }, + { + "epoch": 5.329522134271475, + "grad_norm": 0.18160347893725096, + "learning_rate": 0.00043863270136381614, + "loss": 2.9723920822143555, + "step": 9092, + "token_acc": 0.3020542979134778 + }, + { + "epoch": 5.330108472588684, + "grad_norm": 0.1921174722640265, + "learning_rate": 0.0004386167990453207, + "loss": 3.005222797393799, + "step": 9093, + "token_acc": 0.2964201099499321 + }, + { + "epoch": 5.330694810905893, + "grad_norm": 0.16919726479823421, + "learning_rate": 0.00043860089495502457, + "loss": 2.9306654930114746, + "step": 9094, + "token_acc": 0.30764397263842985 + }, + { + "epoch": 5.331281149223102, + "grad_norm": 0.22246804601587652, + "learning_rate": 0.0004385849890930772, + "loss": 3.024413585662842, + "step": 9095, + "token_acc": 0.29546120748317123 + }, + { + "epoch": 5.331867487540311, + "grad_norm": 0.16348867951802154, + "learning_rate": 0.0004385690814596279, + "loss": 3.0298447608947754, + "step": 9096, + "token_acc": 0.29299322607959355 + }, + { + "epoch": 5.33245382585752, + "grad_norm": 0.19417277952104597, + "learning_rate": 0.00043855317205482633, + "loss": 2.9752049446105957, + "step": 9097, + "token_acc": 0.3016191123767261 + }, + { + "epoch": 5.3330401641747285, + "grad_norm": 0.17432869043149182, + "learning_rate": 0.00043853726087882166, + "loss": 2.951829433441162, + "step": 9098, + "token_acc": 0.3064632413199307 + }, + { + "epoch": 5.333626502491938, + "grad_norm": 0.19529975466077387, + "learning_rate": 0.00043852134793176357, + "loss": 2.9743847846984863, + "step": 9099, + "token_acc": 0.30033352478260433 + }, + { + "epoch": 5.334212840809147, + "grad_norm": 0.15902717560952997, + "learning_rate": 0.00043850543321380137, + "loss": 2.9616200923919678, + "step": 9100, + "token_acc": 0.3040839599258503 + }, + { + "epoch": 5.334799179126356, + "grad_norm": 0.1711795980729838, + "learning_rate": 0.00043848951672508467, + "loss": 2.9648969173431396, + "step": 9101, + "token_acc": 0.3021213100127679 + }, + { + "epoch": 5.335385517443565, + "grad_norm": 0.17683121016681247, + "learning_rate": 0.00043847359846576294, + "loss": 2.9416234493255615, + "step": 9102, + "token_acc": 0.3071028697900199 + }, + { + "epoch": 5.335971855760774, + "grad_norm": 0.17082426975900597, + "learning_rate": 0.00043845767843598573, + "loss": 2.97518253326416, + "step": 9103, + "token_acc": 0.3020241852036798 + }, + { + "epoch": 5.336558194077983, + "grad_norm": 0.17732045007247624, + "learning_rate": 0.0004384417566359026, + "loss": 2.9920921325683594, + "step": 9104, + "token_acc": 0.29968072609712754 + }, + { + "epoch": 5.337144532395192, + "grad_norm": 0.18732295502292526, + "learning_rate": 0.0004384258330656631, + "loss": 3.0182032585144043, + "step": 9105, + "token_acc": 0.2962369624790306 + }, + { + "epoch": 5.337730870712401, + "grad_norm": 0.16235865257575285, + "learning_rate": 0.0004384099077254168, + "loss": 2.997974395751953, + "step": 9106, + "token_acc": 0.2998850106481171 + }, + { + "epoch": 5.3383172090296105, + "grad_norm": 0.17724697534035994, + "learning_rate": 0.0004383939806153133, + "loss": 2.9932146072387695, + "step": 9107, + "token_acc": 0.2999349755201958 + }, + { + "epoch": 5.338903547346819, + "grad_norm": 0.1505514019047103, + "learning_rate": 0.0004383780517355023, + "loss": 3.039780616760254, + "step": 9108, + "token_acc": 0.29069773432106905 + }, + { + "epoch": 5.339489885664028, + "grad_norm": 0.1787163038516192, + "learning_rate": 0.0004383621210861332, + "loss": 2.9905641078948975, + "step": 9109, + "token_acc": 0.30066747652764636 + }, + { + "epoch": 5.340076223981237, + "grad_norm": 0.16815565094377585, + "learning_rate": 0.0004383461886673558, + "loss": 2.951796531677246, + "step": 9110, + "token_acc": 0.30569369168016985 + }, + { + "epoch": 5.340662562298446, + "grad_norm": 0.16998321049608223, + "learning_rate": 0.00043833025447931985, + "loss": 3.0325381755828857, + "step": 9111, + "token_acc": 0.2933731175803017 + }, + { + "epoch": 5.341248900615655, + "grad_norm": 0.17016929803646744, + "learning_rate": 0.00043831431852217487, + "loss": 2.99227237701416, + "step": 9112, + "token_acc": 0.2998074009398834 + }, + { + "epoch": 5.341835238932864, + "grad_norm": 0.17890421546780835, + "learning_rate": 0.00043829838079607067, + "loss": 2.996424674987793, + "step": 9113, + "token_acc": 0.29883602976948315 + }, + { + "epoch": 5.342421577250073, + "grad_norm": 0.17656611181399984, + "learning_rate": 0.00043828244130115693, + "loss": 2.9825751781463623, + "step": 9114, + "token_acc": 0.29957749741094036 + }, + { + "epoch": 5.3430079155672825, + "grad_norm": 0.1835139930759743, + "learning_rate": 0.00043826650003758326, + "loss": 3.011472463607788, + "step": 9115, + "token_acc": 0.2957167065049242 + }, + { + "epoch": 5.343594253884492, + "grad_norm": 0.16232520937558356, + "learning_rate": 0.00043825055700549967, + "loss": 2.989093542098999, + "step": 9116, + "token_acc": 0.3002224371635333 + }, + { + "epoch": 5.344180592201701, + "grad_norm": 0.18884799020404452, + "learning_rate": 0.00043823461220505566, + "loss": 2.939085006713867, + "step": 9117, + "token_acc": 0.3080376668475884 + }, + { + "epoch": 5.34476693051891, + "grad_norm": 0.16289743138637722, + "learning_rate": 0.0004382186656364011, + "loss": 2.998645305633545, + "step": 9118, + "token_acc": 0.2987240533465257 + }, + { + "epoch": 5.345353268836118, + "grad_norm": 0.18348018468514293, + "learning_rate": 0.0004382027172996859, + "loss": 2.973112106323242, + "step": 9119, + "token_acc": 0.3037733776398539 + }, + { + "epoch": 5.345939607153327, + "grad_norm": 0.17111898173355553, + "learning_rate": 0.0004381867671950597, + "loss": 3.0142009258270264, + "step": 9120, + "token_acc": 0.295798492559428 + }, + { + "epoch": 5.346525945470536, + "grad_norm": 0.18493794820902176, + "learning_rate": 0.00043817081532267243, + "loss": 3.020679473876953, + "step": 9121, + "token_acc": 0.29420273558095317 + }, + { + "epoch": 5.347112283787745, + "grad_norm": 0.1900104764779965, + "learning_rate": 0.00043815486168267395, + "loss": 3.007984161376953, + "step": 9122, + "token_acc": 0.2980666413277588 + }, + { + "epoch": 5.3476986221049545, + "grad_norm": 0.20891142572759944, + "learning_rate": 0.000438138906275214, + "loss": 2.977004051208496, + "step": 9123, + "token_acc": 0.30262108736331444 + }, + { + "epoch": 5.348284960422164, + "grad_norm": 0.18171597979722, + "learning_rate": 0.00043812294910044265, + "loss": 2.9668478965759277, + "step": 9124, + "token_acc": 0.3027530770626728 + }, + { + "epoch": 5.348871298739373, + "grad_norm": 0.1819966298816744, + "learning_rate": 0.0004381069901585096, + "loss": 3.017306327819824, + "step": 9125, + "token_acc": 0.2964429197870244 + }, + { + "epoch": 5.349457637056582, + "grad_norm": 0.19670138542979698, + "learning_rate": 0.0004380910294495649, + "loss": 2.9684526920318604, + "step": 9126, + "token_acc": 0.3017075833273572 + }, + { + "epoch": 5.350043975373791, + "grad_norm": 0.24275647021977537, + "learning_rate": 0.0004380750669737584, + "loss": 3.0019752979278564, + "step": 9127, + "token_acc": 0.29845676635320423 + }, + { + "epoch": 5.350630313691, + "grad_norm": 0.34544412020102333, + "learning_rate": 0.00043805910273124016, + "loss": 3.029719829559326, + "step": 9128, + "token_acc": 0.2941714598740071 + }, + { + "epoch": 5.351216652008208, + "grad_norm": 0.39450054477024726, + "learning_rate": 0.00043804313672216, + "loss": 3.0041685104370117, + "step": 9129, + "token_acc": 0.2987318289719459 + }, + { + "epoch": 5.351802990325417, + "grad_norm": 0.21108597823311917, + "learning_rate": 0.000438027168946668, + "loss": 2.9998574256896973, + "step": 9130, + "token_acc": 0.2997330046117385 + }, + { + "epoch": 5.3523893286426265, + "grad_norm": 0.24200172827657124, + "learning_rate": 0.0004380111994049141, + "loss": 3.0410192012786865, + "step": 9131, + "token_acc": 0.29370106621552505 + }, + { + "epoch": 5.352975666959836, + "grad_norm": 0.17676015463625935, + "learning_rate": 0.0004379952280970483, + "loss": 2.993969440460205, + "step": 9132, + "token_acc": 0.29985223415532547 + }, + { + "epoch": 5.353562005277045, + "grad_norm": 0.2422420240392483, + "learning_rate": 0.0004379792550232207, + "loss": 2.9978816509246826, + "step": 9133, + "token_acc": 0.297968403325403 + }, + { + "epoch": 5.354148343594254, + "grad_norm": 0.1955019983002007, + "learning_rate": 0.0004379632801835813, + "loss": 2.9947316646575928, + "step": 9134, + "token_acc": 0.2980802568903255 + }, + { + "epoch": 5.354734681911463, + "grad_norm": 0.21153221776027153, + "learning_rate": 0.0004379473035782802, + "loss": 2.975295066833496, + "step": 9135, + "token_acc": 0.30332398493205254 + }, + { + "epoch": 5.355321020228672, + "grad_norm": 0.17094250122412796, + "learning_rate": 0.00043793132520746737, + "loss": 2.965813636779785, + "step": 9136, + "token_acc": 0.30223850309106665 + }, + { + "epoch": 5.355907358545881, + "grad_norm": 0.24498599400090912, + "learning_rate": 0.000437915345071293, + "loss": 2.9655113220214844, + "step": 9137, + "token_acc": 0.30281790319034746 + }, + { + "epoch": 5.35649369686309, + "grad_norm": 0.1742716541909434, + "learning_rate": 0.0004378993631699072, + "loss": 3.0123283863067627, + "step": 9138, + "token_acc": 0.2960446721836221 + }, + { + "epoch": 5.357080035180299, + "grad_norm": 0.202940611950018, + "learning_rate": 0.00043788337950346004, + "loss": 3.05010986328125, + "step": 9139, + "token_acc": 0.29201542621379095 + }, + { + "epoch": 5.3576663734975085, + "grad_norm": 0.18536594269826112, + "learning_rate": 0.00043786739407210176, + "loss": 3.0031495094299316, + "step": 9140, + "token_acc": 0.29814443910227567 + }, + { + "epoch": 5.358252711814717, + "grad_norm": 0.19895898318579572, + "learning_rate": 0.00043785140687598246, + "loss": 3.028254747390747, + "step": 9141, + "token_acc": 0.2948439971989522 + }, + { + "epoch": 5.358839050131926, + "grad_norm": 0.198364777569017, + "learning_rate": 0.0004378354179152523, + "loss": 2.9692914485931396, + "step": 9142, + "token_acc": 0.3033963210440252 + }, + { + "epoch": 5.359425388449135, + "grad_norm": 0.2046162409832557, + "learning_rate": 0.0004378194271900615, + "loss": 3.018787384033203, + "step": 9143, + "token_acc": 0.2963541029636208 + }, + { + "epoch": 5.360011726766344, + "grad_norm": 0.18350965376498618, + "learning_rate": 0.0004378034347005603, + "loss": 3.045400619506836, + "step": 9144, + "token_acc": 0.29270711800525673 + }, + { + "epoch": 5.360598065083553, + "grad_norm": 0.21239366422091696, + "learning_rate": 0.00043778744044689887, + "loss": 3.012734889984131, + "step": 9145, + "token_acc": 0.2966786874559734 + }, + { + "epoch": 5.361184403400762, + "grad_norm": 0.1785624495469117, + "learning_rate": 0.00043777144442922746, + "loss": 3.0375757217407227, + "step": 9146, + "token_acc": 0.2933921447434961 + }, + { + "epoch": 5.361770741717971, + "grad_norm": 0.1869701850839567, + "learning_rate": 0.0004377554466476964, + "loss": 2.9764602184295654, + "step": 9147, + "token_acc": 0.30308116970283044 + }, + { + "epoch": 5.3623570800351805, + "grad_norm": 0.20678921973343722, + "learning_rate": 0.00043773944710245595, + "loss": 3.0103230476379395, + "step": 9148, + "token_acc": 0.2969260637850056 + }, + { + "epoch": 5.36294341835239, + "grad_norm": 0.21617520499976442, + "learning_rate": 0.0004377234457936563, + "loss": 3.0311481952667236, + "step": 9149, + "token_acc": 0.29324328270640365 + }, + { + "epoch": 5.363529756669599, + "grad_norm": 0.1700281388531636, + "learning_rate": 0.0004377074427214479, + "loss": 3.042182207107544, + "step": 9150, + "token_acc": 0.2936014227689631 + }, + { + "epoch": 5.364116094986807, + "grad_norm": 0.24197773679398205, + "learning_rate": 0.000437691437885981, + "loss": 3.01670241355896, + "step": 9151, + "token_acc": 0.29751541832190337 + }, + { + "epoch": 5.364702433304016, + "grad_norm": 0.16192020141594574, + "learning_rate": 0.00043767543128740596, + "loss": 3.010223388671875, + "step": 9152, + "token_acc": 0.2980938225529976 + }, + { + "epoch": 5.365288771621225, + "grad_norm": 0.2247503009897838, + "learning_rate": 0.0004376594229258731, + "loss": 2.9473280906677246, + "step": 9153, + "token_acc": 0.3057339142986982 + }, + { + "epoch": 5.365875109938434, + "grad_norm": 0.15013193040836287, + "learning_rate": 0.00043764341280153285, + "loss": 2.9669604301452637, + "step": 9154, + "token_acc": 0.3040245085308132 + }, + { + "epoch": 5.366461448255643, + "grad_norm": 0.20336417158191517, + "learning_rate": 0.00043762740091453557, + "loss": 3.0124599933624268, + "step": 9155, + "token_acc": 0.2958797422604289 + }, + { + "epoch": 5.3670477865728525, + "grad_norm": 0.21391847122053625, + "learning_rate": 0.00043761138726503175, + "loss": 3.00254487991333, + "step": 9156, + "token_acc": 0.29970478623173236 + }, + { + "epoch": 5.367634124890062, + "grad_norm": 0.16388504604040585, + "learning_rate": 0.0004375953718531717, + "loss": 2.9890682697296143, + "step": 9157, + "token_acc": 0.2995357698004386 + }, + { + "epoch": 5.368220463207271, + "grad_norm": 0.19317133239185563, + "learning_rate": 0.00043757935467910597, + "loss": 2.9956259727478027, + "step": 9158, + "token_acc": 0.2983599585337976 + }, + { + "epoch": 5.36880680152448, + "grad_norm": 0.16795272970031677, + "learning_rate": 0.0004375633357429849, + "loss": 3.0120320320129395, + "step": 9159, + "token_acc": 0.2968905277472671 + }, + { + "epoch": 5.369393139841689, + "grad_norm": 0.18996447229060334, + "learning_rate": 0.00043754731504495913, + "loss": 2.9786884784698486, + "step": 9160, + "token_acc": 0.3021710997824595 + }, + { + "epoch": 5.369979478158898, + "grad_norm": 0.16044170495543397, + "learning_rate": 0.00043753129258517897, + "loss": 2.9795079231262207, + "step": 9161, + "token_acc": 0.30150728234532703 + }, + { + "epoch": 5.370565816476106, + "grad_norm": 0.17310262078471453, + "learning_rate": 0.0004375152683637951, + "loss": 2.946756362915039, + "step": 9162, + "token_acc": 0.30599011363727113 + }, + { + "epoch": 5.371152154793315, + "grad_norm": 0.18744979720332647, + "learning_rate": 0.00043749924238095796, + "loss": 3.028874397277832, + "step": 9163, + "token_acc": 0.29653462504681277 + }, + { + "epoch": 5.3717384931105245, + "grad_norm": 0.16666887544390915, + "learning_rate": 0.000437483214636818, + "loss": 2.978790521621704, + "step": 9164, + "token_acc": 0.30110063696872413 + }, + { + "epoch": 5.372324831427734, + "grad_norm": 0.16065214634098549, + "learning_rate": 0.00043746718513152597, + "loss": 2.9845144748687744, + "step": 9165, + "token_acc": 0.3002206268247571 + }, + { + "epoch": 5.372911169744943, + "grad_norm": 0.16518584922473464, + "learning_rate": 0.0004374511538652323, + "loss": 3.0337395668029785, + "step": 9166, + "token_acc": 0.2941255475659145 + }, + { + "epoch": 5.373497508062152, + "grad_norm": 0.15247452937110031, + "learning_rate": 0.0004374351208380876, + "loss": 2.99542236328125, + "step": 9167, + "token_acc": 0.29960084465063375 + }, + { + "epoch": 5.374083846379361, + "grad_norm": 0.18220169511160905, + "learning_rate": 0.0004374190860502426, + "loss": 2.979783058166504, + "step": 9168, + "token_acc": 0.3013378975149599 + }, + { + "epoch": 5.37467018469657, + "grad_norm": 0.17866960364153894, + "learning_rate": 0.00043740304950184777, + "loss": 3.079359531402588, + "step": 9169, + "token_acc": 0.2873769296219764 + }, + { + "epoch": 5.375256523013779, + "grad_norm": 0.17465333989636056, + "learning_rate": 0.0004373870111930539, + "loss": 2.9830048084259033, + "step": 9170, + "token_acc": 0.3008378983902508 + }, + { + "epoch": 5.375842861330988, + "grad_norm": 0.15919752440073343, + "learning_rate": 0.00043737097112401147, + "loss": 3.0141806602478027, + "step": 9171, + "token_acc": 0.29873897667532023 + }, + { + "epoch": 5.3764291996481965, + "grad_norm": 0.16926520412220503, + "learning_rate": 0.0004373549292948713, + "loss": 2.9947144985198975, + "step": 9172, + "token_acc": 0.2982845060774494 + }, + { + "epoch": 5.377015537965406, + "grad_norm": 0.175301236507858, + "learning_rate": 0.000437338885705784, + "loss": 3.015355110168457, + "step": 9173, + "token_acc": 0.2939951699110557 + }, + { + "epoch": 5.377601876282615, + "grad_norm": 0.22062452549267494, + "learning_rate": 0.00043732284035690036, + "loss": 3.0321760177612305, + "step": 9174, + "token_acc": 0.2948699058525854 + }, + { + "epoch": 5.378188214599824, + "grad_norm": 0.2028817041924162, + "learning_rate": 0.00043730679324837106, + "loss": 3.0003926753997803, + "step": 9175, + "token_acc": 0.29704682071361604 + }, + { + "epoch": 5.378774552917033, + "grad_norm": 0.178409406933029, + "learning_rate": 0.00043729074438034676, + "loss": 3.047703981399536, + "step": 9176, + "token_acc": 0.292749488798301 + }, + { + "epoch": 5.379360891234242, + "grad_norm": 0.27469746800952244, + "learning_rate": 0.0004372746937529783, + "loss": 3.012335777282715, + "step": 9177, + "token_acc": 0.29676354130824417 + }, + { + "epoch": 5.379947229551451, + "grad_norm": 0.340410924710611, + "learning_rate": 0.00043725864136641657, + "loss": 3.0225911140441895, + "step": 9178, + "token_acc": 0.29576936411100246 + }, + { + "epoch": 5.38053356786866, + "grad_norm": 0.16660309872293755, + "learning_rate": 0.00043724258722081214, + "loss": 3.0373167991638184, + "step": 9179, + "token_acc": 0.29349801653153795 + }, + { + "epoch": 5.381119906185869, + "grad_norm": 0.30377431415155953, + "learning_rate": 0.00043722653131631595, + "loss": 3.007723569869995, + "step": 9180, + "token_acc": 0.2962644766219692 + }, + { + "epoch": 5.3817062445030786, + "grad_norm": 0.18731345524247506, + "learning_rate": 0.0004372104736530788, + "loss": 3.002624750137329, + "step": 9181, + "token_acc": 0.29847645255169675 + }, + { + "epoch": 5.382292582820288, + "grad_norm": 0.22668001439788774, + "learning_rate": 0.00043719441423125153, + "loss": 3.0065250396728516, + "step": 9182, + "token_acc": 0.2991036932002966 + }, + { + "epoch": 5.382878921137497, + "grad_norm": 0.16166152827845828, + "learning_rate": 0.00043717835305098486, + "loss": 2.981966972351074, + "step": 9183, + "token_acc": 0.30158700808824696 + }, + { + "epoch": 5.383465259454705, + "grad_norm": 0.20781390419986828, + "learning_rate": 0.0004371622901124299, + "loss": 3.029080390930176, + "step": 9184, + "token_acc": 0.295977357581597 + }, + { + "epoch": 5.384051597771914, + "grad_norm": 0.15715379094598086, + "learning_rate": 0.0004371462254157375, + "loss": 2.988495349884033, + "step": 9185, + "token_acc": 0.29801257540319254 + }, + { + "epoch": 5.384637936089123, + "grad_norm": 0.17328543448150105, + "learning_rate": 0.0004371301589610583, + "loss": 3.0033321380615234, + "step": 9186, + "token_acc": 0.2986444277376902 + }, + { + "epoch": 5.385224274406332, + "grad_norm": 0.1910908168284505, + "learning_rate": 0.00043711409074854356, + "loss": 3.0306572914123535, + "step": 9187, + "token_acc": 0.2947030138463025 + }, + { + "epoch": 5.3858106127235414, + "grad_norm": 0.20530176067485745, + "learning_rate": 0.00043709802077834397, + "loss": 3.0185861587524414, + "step": 9188, + "token_acc": 0.29735392132131333 + }, + { + "epoch": 5.386396951040751, + "grad_norm": 0.19188222254927378, + "learning_rate": 0.0004370819490506107, + "loss": 3.0221943855285645, + "step": 9189, + "token_acc": 0.2960132166587996 + }, + { + "epoch": 5.38698328935796, + "grad_norm": 0.1918398241835136, + "learning_rate": 0.0004370658755654946, + "loss": 2.975102663040161, + "step": 9190, + "token_acc": 0.3022940654083272 + }, + { + "epoch": 5.387569627675169, + "grad_norm": 0.20112075610067767, + "learning_rate": 0.00043704980032314663, + "loss": 3.000765323638916, + "step": 9191, + "token_acc": 0.29835716546307217 + }, + { + "epoch": 5.388155965992378, + "grad_norm": 0.1801763373500616, + "learning_rate": 0.00043703372332371784, + "loss": 3.014225482940674, + "step": 9192, + "token_acc": 0.2973812137368669 + }, + { + "epoch": 5.388742304309587, + "grad_norm": 0.20236369849502156, + "learning_rate": 0.0004370176445673593, + "loss": 3.0275163650512695, + "step": 9193, + "token_acc": 0.29612129906852963 + }, + { + "epoch": 5.389328642626795, + "grad_norm": 0.16419626339382143, + "learning_rate": 0.0004370015640542219, + "loss": 3.0069777965545654, + "step": 9194, + "token_acc": 0.2981873425610928 + }, + { + "epoch": 5.389914980944004, + "grad_norm": 0.20727541648946754, + "learning_rate": 0.00043698548178445687, + "loss": 3.025744676589966, + "step": 9195, + "token_acc": 0.29461327099195256 + }, + { + "epoch": 5.3905013192612135, + "grad_norm": 0.19665410647308468, + "learning_rate": 0.0004369693977582152, + "loss": 3.0095314979553223, + "step": 9196, + "token_acc": 0.2959869262238341 + }, + { + "epoch": 5.391087657578423, + "grad_norm": 0.17692770078525114, + "learning_rate": 0.000436953311975648, + "loss": 3.0164284706115723, + "step": 9197, + "token_acc": 0.29699208802012245 + }, + { + "epoch": 5.391673995895632, + "grad_norm": 0.19368590935492458, + "learning_rate": 0.0004369372244369063, + "loss": 2.9869956970214844, + "step": 9198, + "token_acc": 0.3004770146481489 + }, + { + "epoch": 5.392260334212841, + "grad_norm": 0.17048860776292715, + "learning_rate": 0.00043692113514214135, + "loss": 2.9936423301696777, + "step": 9199, + "token_acc": 0.2990571415572285 + }, + { + "epoch": 5.39284667253005, + "grad_norm": 0.21351871790536314, + "learning_rate": 0.0004369050440915042, + "loss": 3.000220537185669, + "step": 9200, + "token_acc": 0.2978976453318129 + }, + { + "epoch": 5.393433010847259, + "grad_norm": 0.18031504774370866, + "learning_rate": 0.00043688895128514595, + "loss": 3.0250191688537598, + "step": 9201, + "token_acc": 0.296540668764597 + }, + { + "epoch": 5.394019349164468, + "grad_norm": 0.2072695184927117, + "learning_rate": 0.00043687285672321785, + "loss": 3.021432876586914, + "step": 9202, + "token_acc": 0.29552462494240594 + }, + { + "epoch": 5.394605687481677, + "grad_norm": 0.16515999631271344, + "learning_rate": 0.00043685676040587114, + "loss": 2.9758167266845703, + "step": 9203, + "token_acc": 0.3010076313828045 + }, + { + "epoch": 5.395192025798886, + "grad_norm": 0.2128930276757322, + "learning_rate": 0.0004368406623332569, + "loss": 2.9760899543762207, + "step": 9204, + "token_acc": 0.30306753460165803 + }, + { + "epoch": 5.395778364116095, + "grad_norm": 0.1716019996462372, + "learning_rate": 0.00043682456250552647, + "loss": 2.9838485717773438, + "step": 9205, + "token_acc": 0.29959297827710973 + }, + { + "epoch": 5.396364702433304, + "grad_norm": 0.21367083266072395, + "learning_rate": 0.000436808460922831, + "loss": 2.982295036315918, + "step": 9206, + "token_acc": 0.3008408499443801 + }, + { + "epoch": 5.396951040750513, + "grad_norm": 0.23771484930629602, + "learning_rate": 0.0004367923575853218, + "loss": 3.012080192565918, + "step": 9207, + "token_acc": 0.29722584798267776 + }, + { + "epoch": 5.397537379067722, + "grad_norm": 0.15524494049981094, + "learning_rate": 0.00043677625249315, + "loss": 2.9758782386779785, + "step": 9208, + "token_acc": 0.3019443158692274 + }, + { + "epoch": 5.398123717384931, + "grad_norm": 0.24224484263508103, + "learning_rate": 0.00043676014564646707, + "loss": 3.020840644836426, + "step": 9209, + "token_acc": 0.2950101693223363 + }, + { + "epoch": 5.39871005570214, + "grad_norm": 0.16731381304731427, + "learning_rate": 0.0004367440370454242, + "loss": 2.9955849647521973, + "step": 9210, + "token_acc": 0.2996279032548463 + }, + { + "epoch": 5.399296394019349, + "grad_norm": 0.18402064897074494, + "learning_rate": 0.0004367279266901728, + "loss": 2.963852882385254, + "step": 9211, + "token_acc": 0.3025639309316809 + }, + { + "epoch": 5.399882732336558, + "grad_norm": 0.1717535595453676, + "learning_rate": 0.0004367118145808641, + "loss": 3.0061962604522705, + "step": 9212, + "token_acc": 0.29711706331592863 + }, + { + "epoch": 5.4004690706537675, + "grad_norm": 0.16538817031606068, + "learning_rate": 0.0004366957007176495, + "loss": 3.008577823638916, + "step": 9213, + "token_acc": 0.2990590505753934 + }, + { + "epoch": 5.401055408970977, + "grad_norm": 0.17236010616899708, + "learning_rate": 0.0004366795851006804, + "loss": 2.997645854949951, + "step": 9214, + "token_acc": 0.29857436119621616 + }, + { + "epoch": 5.401641747288186, + "grad_norm": 0.16176818007024776, + "learning_rate": 0.0004366634677301081, + "loss": 2.9997642040252686, + "step": 9215, + "token_acc": 0.29937201117446943 + }, + { + "epoch": 5.402228085605394, + "grad_norm": 0.19191844251632073, + "learning_rate": 0.00043664734860608407, + "loss": 3.0136168003082275, + "step": 9216, + "token_acc": 0.2968503577640633 + }, + { + "epoch": 5.402814423922603, + "grad_norm": 0.17072337754377945, + "learning_rate": 0.0004366312277287597, + "loss": 3.0065536499023438, + "step": 9217, + "token_acc": 0.2967459919107903 + }, + { + "epoch": 5.403400762239812, + "grad_norm": 0.2413245608773218, + "learning_rate": 0.0004366151050982865, + "loss": 3.002647638320923, + "step": 9218, + "token_acc": 0.2985527169969832 + }, + { + "epoch": 5.403987100557021, + "grad_norm": 0.1689021934084639, + "learning_rate": 0.0004365989807148158, + "loss": 2.9973690509796143, + "step": 9219, + "token_acc": 0.2985785132733975 + }, + { + "epoch": 5.40457343887423, + "grad_norm": 0.19824066540803134, + "learning_rate": 0.0004365828545784991, + "loss": 2.9683022499084473, + "step": 9220, + "token_acc": 0.3031374600344811 + }, + { + "epoch": 5.4051597771914395, + "grad_norm": 0.17914715084163957, + "learning_rate": 0.00043656672668948793, + "loss": 2.9808239936828613, + "step": 9221, + "token_acc": 0.30219912444637875 + }, + { + "epoch": 5.405746115508649, + "grad_norm": 0.1944230219706418, + "learning_rate": 0.0004365505970479338, + "loss": 2.983018398284912, + "step": 9222, + "token_acc": 0.3016974100403996 + }, + { + "epoch": 5.406332453825858, + "grad_norm": 0.20422988992206242, + "learning_rate": 0.0004365344656539881, + "loss": 2.992086410522461, + "step": 9223, + "token_acc": 0.29913463095169707 + }, + { + "epoch": 5.406918792143067, + "grad_norm": 0.17372542947531716, + "learning_rate": 0.0004365183325078026, + "loss": 3.002697467803955, + "step": 9224, + "token_acc": 0.29948867786705624 + }, + { + "epoch": 5.407505130460276, + "grad_norm": 0.17849456607175218, + "learning_rate": 0.0004365021976095286, + "loss": 3.034900188446045, + "step": 9225, + "token_acc": 0.292770246871442 + }, + { + "epoch": 5.408091468777485, + "grad_norm": 0.1730381274158474, + "learning_rate": 0.00043648606095931776, + "loss": 2.9906253814697266, + "step": 9226, + "token_acc": 0.3000059074482649 + }, + { + "epoch": 5.408677807094693, + "grad_norm": 0.18590836432215493, + "learning_rate": 0.0004364699225573217, + "loss": 2.988412380218506, + "step": 9227, + "token_acc": 0.30035891363871825 + }, + { + "epoch": 5.409264145411902, + "grad_norm": 0.18214320240758064, + "learning_rate": 0.00043645378240369197, + "loss": 3.0267510414123535, + "step": 9228, + "token_acc": 0.2927593511056003 + }, + { + "epoch": 5.4098504837291115, + "grad_norm": 0.17971269440319435, + "learning_rate": 0.00043643764049858025, + "loss": 2.9992713928222656, + "step": 9229, + "token_acc": 0.3002239620732642 + }, + { + "epoch": 5.410436822046321, + "grad_norm": 0.17506003836245262, + "learning_rate": 0.00043642149684213806, + "loss": 3.0024595260620117, + "step": 9230, + "token_acc": 0.2996928178128943 + }, + { + "epoch": 5.41102316036353, + "grad_norm": 0.17909626918620603, + "learning_rate": 0.0004364053514345172, + "loss": 3.0282022953033447, + "step": 9231, + "token_acc": 0.29547305613597746 + }, + { + "epoch": 5.411609498680739, + "grad_norm": 0.18218685450453048, + "learning_rate": 0.00043638920427586914, + "loss": 2.9802122116088867, + "step": 9232, + "token_acc": 0.30058711672223426 + }, + { + "epoch": 5.412195836997948, + "grad_norm": 0.22286997600339165, + "learning_rate": 0.0004363730553663458, + "loss": 2.9856045246124268, + "step": 9233, + "token_acc": 0.30046031938876067 + }, + { + "epoch": 5.412782175315157, + "grad_norm": 0.1927053476867642, + "learning_rate": 0.0004363569047060986, + "loss": 3.005012035369873, + "step": 9234, + "token_acc": 0.2979531304278118 + }, + { + "epoch": 5.413368513632366, + "grad_norm": 0.19135134004819682, + "learning_rate": 0.00043634075229527947, + "loss": 3.0004217624664307, + "step": 9235, + "token_acc": 0.2980238014280857 + }, + { + "epoch": 5.413954851949575, + "grad_norm": 0.2383198081137073, + "learning_rate": 0.0004363245981340401, + "loss": 3.013132095336914, + "step": 9236, + "token_acc": 0.2977878646143379 + }, + { + "epoch": 5.4145411902667835, + "grad_norm": 0.18864435919583616, + "learning_rate": 0.0004363084422225322, + "loss": 3.0213661193847656, + "step": 9237, + "token_acc": 0.2967907264808319 + }, + { + "epoch": 5.415127528583993, + "grad_norm": 0.21119102406146584, + "learning_rate": 0.00043629228456090746, + "loss": 2.998109817504883, + "step": 9238, + "token_acc": 0.2995388483308461 + }, + { + "epoch": 5.415713866901202, + "grad_norm": 0.2019791506724574, + "learning_rate": 0.0004362761251493178, + "loss": 3.0338542461395264, + "step": 9239, + "token_acc": 0.2938528142930139 + }, + { + "epoch": 5.416300205218411, + "grad_norm": 0.22774311915643164, + "learning_rate": 0.000436259963987915, + "loss": 3.043783187866211, + "step": 9240, + "token_acc": 0.2920801796726226 + }, + { + "epoch": 5.41688654353562, + "grad_norm": 0.30062014050764185, + "learning_rate": 0.00043624380107685075, + "loss": 3.0157740116119385, + "step": 9241, + "token_acc": 0.2968377472360541 + }, + { + "epoch": 5.417472881852829, + "grad_norm": 0.171528258582784, + "learning_rate": 0.00043622763641627696, + "loss": 3.025561809539795, + "step": 9242, + "token_acc": 0.2952885331679027 + }, + { + "epoch": 5.418059220170038, + "grad_norm": 0.224950669205509, + "learning_rate": 0.0004362114700063455, + "loss": 3.0040669441223145, + "step": 9243, + "token_acc": 0.30032943484333186 + }, + { + "epoch": 5.418645558487247, + "grad_norm": 0.17686075053443148, + "learning_rate": 0.0004361953018472082, + "loss": 2.9854679107666016, + "step": 9244, + "token_acc": 0.2999281912107064 + }, + { + "epoch": 5.419231896804456, + "grad_norm": 0.20183812832250378, + "learning_rate": 0.0004361791319390169, + "loss": 3.0033135414123535, + "step": 9245, + "token_acc": 0.2978057568941098 + }, + { + "epoch": 5.4198182351216655, + "grad_norm": 0.19114749915503423, + "learning_rate": 0.0004361629602819236, + "loss": 2.997903823852539, + "step": 9246, + "token_acc": 0.29931048208641364 + }, + { + "epoch": 5.420404573438875, + "grad_norm": 0.1834011025760006, + "learning_rate": 0.00043614678687608013, + "loss": 2.952251672744751, + "step": 9247, + "token_acc": 0.3068261844437789 + }, + { + "epoch": 5.420990911756084, + "grad_norm": 0.1850155134363094, + "learning_rate": 0.00043613061172163836, + "loss": 3.013467311859131, + "step": 9248, + "token_acc": 0.2968805583209424 + }, + { + "epoch": 5.421577250073292, + "grad_norm": 0.20399408835518085, + "learning_rate": 0.00043611443481875043, + "loss": 3.0461878776550293, + "step": 9249, + "token_acc": 0.2913973021769462 + }, + { + "epoch": 5.422163588390501, + "grad_norm": 0.20812083883917137, + "learning_rate": 0.00043609825616756806, + "loss": 3.0102429389953613, + "step": 9250, + "token_acc": 0.2973120388142738 + }, + { + "epoch": 5.42274992670771, + "grad_norm": 0.17338536210648833, + "learning_rate": 0.0004360820757682434, + "loss": 2.9975225925445557, + "step": 9251, + "token_acc": 0.29768570261789107 + }, + { + "epoch": 5.423336265024919, + "grad_norm": 0.2206571098266944, + "learning_rate": 0.0004360658936209284, + "loss": 3.038896083831787, + "step": 9252, + "token_acc": 0.293813964007289 + }, + { + "epoch": 5.423922603342128, + "grad_norm": 0.19092589515622527, + "learning_rate": 0.00043604970972577504, + "loss": 2.994692325592041, + "step": 9253, + "token_acc": 0.2983439760205509 + }, + { + "epoch": 5.4245089416593375, + "grad_norm": 0.2121925931217304, + "learning_rate": 0.00043603352408293537, + "loss": 2.9860029220581055, + "step": 9254, + "token_acc": 0.2991710724185089 + }, + { + "epoch": 5.425095279976547, + "grad_norm": 0.18318351864060545, + "learning_rate": 0.0004360173366925614, + "loss": 3.019348621368408, + "step": 9255, + "token_acc": 0.29682057232220554 + }, + { + "epoch": 5.425681618293756, + "grad_norm": 0.23123159053509515, + "learning_rate": 0.0004360011475548052, + "loss": 3.001685619354248, + "step": 9256, + "token_acc": 0.2986617809084044 + }, + { + "epoch": 5.426267956610965, + "grad_norm": 0.16810121788417504, + "learning_rate": 0.00043598495666981893, + "loss": 3.029845714569092, + "step": 9257, + "token_acc": 0.29408669733687554 + }, + { + "epoch": 5.426854294928174, + "grad_norm": 0.24424652568678923, + "learning_rate": 0.0004359687640377545, + "loss": 2.9915499687194824, + "step": 9258, + "token_acc": 0.29974120752568517 + }, + { + "epoch": 5.427440633245382, + "grad_norm": 0.1731489771855594, + "learning_rate": 0.00043595256965876425, + "loss": 3.022756338119507, + "step": 9259, + "token_acc": 0.29660911424733677 + }, + { + "epoch": 5.428026971562591, + "grad_norm": 0.22888904374953642, + "learning_rate": 0.0004359363735330001, + "loss": 3.0366921424865723, + "step": 9260, + "token_acc": 0.2936887750226628 + }, + { + "epoch": 5.4286133098798, + "grad_norm": 0.1722030270620727, + "learning_rate": 0.0004359201756606143, + "loss": 2.9952399730682373, + "step": 9261, + "token_acc": 0.299448297134879 + }, + { + "epoch": 5.4291996481970095, + "grad_norm": 0.19550145150084677, + "learning_rate": 0.00043590397604175904, + "loss": 3.012085437774658, + "step": 9262, + "token_acc": 0.2988688559383098 + }, + { + "epoch": 5.429785986514219, + "grad_norm": 0.15268101380819202, + "learning_rate": 0.00043588777467658637, + "loss": 3.0059452056884766, + "step": 9263, + "token_acc": 0.29804369193696695 + }, + { + "epoch": 5.430372324831428, + "grad_norm": 0.21377781914596058, + "learning_rate": 0.0004358715715652485, + "loss": 2.9617834091186523, + "step": 9264, + "token_acc": 0.30397697784974365 + }, + { + "epoch": 5.430958663148637, + "grad_norm": 0.15815124395461022, + "learning_rate": 0.00043585536670789774, + "loss": 3.0124738216400146, + "step": 9265, + "token_acc": 0.29803631728485314 + }, + { + "epoch": 5.431545001465846, + "grad_norm": 0.28691000049356535, + "learning_rate": 0.0004358391601046863, + "loss": 3.0200634002685547, + "step": 9266, + "token_acc": 0.2946043860005691 + }, + { + "epoch": 5.432131339783055, + "grad_norm": 0.21717247195790618, + "learning_rate": 0.00043582295175576626, + "loss": 3.0279107093811035, + "step": 9267, + "token_acc": 0.2937962939555578 + }, + { + "epoch": 5.432717678100264, + "grad_norm": 0.18031065853372505, + "learning_rate": 0.00043580674166129006, + "loss": 2.976165771484375, + "step": 9268, + "token_acc": 0.3014500146659355 + }, + { + "epoch": 5.433304016417473, + "grad_norm": 0.18264851342714286, + "learning_rate": 0.00043579052982140986, + "loss": 3.009168863296509, + "step": 9269, + "token_acc": 0.2982893652102226 + }, + { + "epoch": 5.4338903547346815, + "grad_norm": 0.171081317370748, + "learning_rate": 0.000435774316236278, + "loss": 2.9766793251037598, + "step": 9270, + "token_acc": 0.3010177881207117 + }, + { + "epoch": 5.434476693051891, + "grad_norm": 0.17820552169061585, + "learning_rate": 0.00043575810090604677, + "loss": 2.9695844650268555, + "step": 9271, + "token_acc": 0.30336865909660626 + }, + { + "epoch": 5.4350630313691, + "grad_norm": 0.16766333512772605, + "learning_rate": 0.00043574188383086853, + "loss": 3.061079740524292, + "step": 9272, + "token_acc": 0.29130429143916897 + }, + { + "epoch": 5.435649369686309, + "grad_norm": 0.18019955424352724, + "learning_rate": 0.00043572566501089556, + "loss": 3.0243349075317383, + "step": 9273, + "token_acc": 0.29577893275590106 + }, + { + "epoch": 5.436235708003518, + "grad_norm": 0.1700494382420953, + "learning_rate": 0.00043570944444628023, + "loss": 3.0157384872436523, + "step": 9274, + "token_acc": 0.2970691676436108 + }, + { + "epoch": 5.436822046320727, + "grad_norm": 0.19469236522996472, + "learning_rate": 0.0004356932221371749, + "loss": 3.0311927795410156, + "step": 9275, + "token_acc": 0.29395106305827795 + }, + { + "epoch": 5.437408384637936, + "grad_norm": 0.1739583651296284, + "learning_rate": 0.000435676998083732, + "loss": 3.0042123794555664, + "step": 9276, + "token_acc": 0.2994384576441051 + }, + { + "epoch": 5.437994722955145, + "grad_norm": 0.16191266215786504, + "learning_rate": 0.0004356607722861039, + "loss": 3.0321288108825684, + "step": 9277, + "token_acc": 0.29488933810190227 + }, + { + "epoch": 5.438581061272354, + "grad_norm": 0.19423654784793112, + "learning_rate": 0.000435644544744443, + "loss": 3.0031025409698486, + "step": 9278, + "token_acc": 0.2970747732835048 + }, + { + "epoch": 5.4391673995895635, + "grad_norm": 0.22403544521636995, + "learning_rate": 0.0004356283154589018, + "loss": 3.0001492500305176, + "step": 9279, + "token_acc": 0.2995907630936788 + }, + { + "epoch": 5.439753737906772, + "grad_norm": 0.16828275422280975, + "learning_rate": 0.00043561208442963276, + "loss": 2.9616000652313232, + "step": 9280, + "token_acc": 0.30402783597179966 + }, + { + "epoch": 5.440340076223981, + "grad_norm": 0.1845331992738277, + "learning_rate": 0.00043559585165678816, + "loss": 3.0193986892700195, + "step": 9281, + "token_acc": 0.29629318273976124 + }, + { + "epoch": 5.44092641454119, + "grad_norm": 0.20789023389744837, + "learning_rate": 0.0004355796171405207, + "loss": 3.00126314163208, + "step": 9282, + "token_acc": 0.2981333807594632 + }, + { + "epoch": 5.441512752858399, + "grad_norm": 0.16954784224428396, + "learning_rate": 0.00043556338088098287, + "loss": 2.997490882873535, + "step": 9283, + "token_acc": 0.29713143051380375 + }, + { + "epoch": 5.442099091175608, + "grad_norm": 0.1997056145546008, + "learning_rate": 0.00043554714287832706, + "loss": 2.968294143676758, + "step": 9284, + "token_acc": 0.3028045737271407 + }, + { + "epoch": 5.442685429492817, + "grad_norm": 0.2205692575472107, + "learning_rate": 0.0004355309031327059, + "loss": 3.010143280029297, + "step": 9285, + "token_acc": 0.2968342817807521 + }, + { + "epoch": 5.443271767810026, + "grad_norm": 0.16775892558814662, + "learning_rate": 0.0004355146616442719, + "loss": 3.0132737159729004, + "step": 9286, + "token_acc": 0.29593930613486497 + }, + { + "epoch": 5.4438581061272355, + "grad_norm": 0.17157262877560361, + "learning_rate": 0.0004354984184131776, + "loss": 2.9661223888397217, + "step": 9287, + "token_acc": 0.303610413673653 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.16533490426807776, + "learning_rate": 0.00043548217343957564, + "loss": 3.0108532905578613, + "step": 9288, + "token_acc": 0.297357799230755 + }, + { + "epoch": 5.445030782761654, + "grad_norm": 0.16525369286095035, + "learning_rate": 0.00043546592672361864, + "loss": 3.048128604888916, + "step": 9289, + "token_acc": 0.2936736290546731 + }, + { + "epoch": 5.445617121078863, + "grad_norm": 0.1671089805013018, + "learning_rate": 0.0004354496782654591, + "loss": 3.0219802856445312, + "step": 9290, + "token_acc": 0.29415309054544997 + }, + { + "epoch": 5.446203459396072, + "grad_norm": 0.18941501665907617, + "learning_rate": 0.00043543342806524985, + "loss": 3.030090093612671, + "step": 9291, + "token_acc": 0.29489052279625394 + }, + { + "epoch": 5.44678979771328, + "grad_norm": 0.17666970004787222, + "learning_rate": 0.00043541717612314337, + "loss": 3.032655715942383, + "step": 9292, + "token_acc": 0.29356235342628145 + }, + { + "epoch": 5.447376136030489, + "grad_norm": 0.15355495087167506, + "learning_rate": 0.0004354009224392923, + "loss": 2.9572701454162598, + "step": 9293, + "token_acc": 0.30464304107532225 + }, + { + "epoch": 5.447962474347698, + "grad_norm": 0.17084517834543655, + "learning_rate": 0.0004353846670138495, + "loss": 2.9812989234924316, + "step": 9294, + "token_acc": 0.3013412434318867 + }, + { + "epoch": 5.4485488126649075, + "grad_norm": 0.1673863729460517, + "learning_rate": 0.00043536840984696745, + "loss": 3.003182888031006, + "step": 9295, + "token_acc": 0.297100122323711 + }, + { + "epoch": 5.449135150982117, + "grad_norm": 0.1730762621250855, + "learning_rate": 0.000435352150938799, + "loss": 2.9701852798461914, + "step": 9296, + "token_acc": 0.30238794100257943 + }, + { + "epoch": 5.449721489299326, + "grad_norm": 0.16722979845107286, + "learning_rate": 0.00043533589028949693, + "loss": 2.96159029006958, + "step": 9297, + "token_acc": 0.30298906999442643 + }, + { + "epoch": 5.450307827616535, + "grad_norm": 0.20761695612468078, + "learning_rate": 0.0004353196278992139, + "loss": 2.9742608070373535, + "step": 9298, + "token_acc": 0.3024488976178739 + }, + { + "epoch": 5.450894165933744, + "grad_norm": 0.24730649567758445, + "learning_rate": 0.00043530336376810267, + "loss": 3.027625560760498, + "step": 9299, + "token_acc": 0.2932454912381992 + }, + { + "epoch": 5.451480504250953, + "grad_norm": 0.2086871442759235, + "learning_rate": 0.000435287097896316, + "loss": 2.963857889175415, + "step": 9300, + "token_acc": 0.303133837270567 + }, + { + "epoch": 5.452066842568162, + "grad_norm": 0.17552198299056931, + "learning_rate": 0.0004352708302840068, + "loss": 3.0172722339630127, + "step": 9301, + "token_acc": 0.2954539299429642 + }, + { + "epoch": 5.45265318088537, + "grad_norm": 0.20034092909944234, + "learning_rate": 0.00043525456093132774, + "loss": 3.028874158859253, + "step": 9302, + "token_acc": 0.2929356010958453 + }, + { + "epoch": 5.4532395192025795, + "grad_norm": 0.1679772025208035, + "learning_rate": 0.00043523828983843175, + "loss": 3.028562545776367, + "step": 9303, + "token_acc": 0.2949679623202518 + }, + { + "epoch": 5.453825857519789, + "grad_norm": 0.2056386046516587, + "learning_rate": 0.0004352220170054716, + "loss": 2.993312358856201, + "step": 9304, + "token_acc": 0.3008977513613443 + }, + { + "epoch": 5.454412195836998, + "grad_norm": 0.24164607982125028, + "learning_rate": 0.0004352057424326002, + "loss": 2.9532313346862793, + "step": 9305, + "token_acc": 0.303769057068378 + }, + { + "epoch": 5.454998534154207, + "grad_norm": 0.18961125269549683, + "learning_rate": 0.00043518946611997047, + "loss": 3.045319080352783, + "step": 9306, + "token_acc": 0.2936871415286535 + }, + { + "epoch": 5.455584872471416, + "grad_norm": 0.20391789495819038, + "learning_rate": 0.00043517318806773523, + "loss": 2.975623607635498, + "step": 9307, + "token_acc": 0.30348533440944714 + }, + { + "epoch": 5.456171210788625, + "grad_norm": 0.21554786432652587, + "learning_rate": 0.0004351569082760474, + "loss": 2.994164228439331, + "step": 9308, + "token_acc": 0.29931022985391875 + }, + { + "epoch": 5.456757549105834, + "grad_norm": 0.1608018041268314, + "learning_rate": 0.0004351406267450598, + "loss": 2.9838571548461914, + "step": 9309, + "token_acc": 0.3014281185485305 + }, + { + "epoch": 5.457343887423043, + "grad_norm": 0.17861351621622457, + "learning_rate": 0.0004351243434749257, + "loss": 3.0143494606018066, + "step": 9310, + "token_acc": 0.29763984332314986 + }, + { + "epoch": 5.457930225740252, + "grad_norm": 0.19405751891232503, + "learning_rate": 0.0004351080584657977, + "loss": 2.983546733856201, + "step": 9311, + "token_acc": 0.29993846798371365 + }, + { + "epoch": 5.4585165640574616, + "grad_norm": 0.16875607433881457, + "learning_rate": 0.000435091771717829, + "loss": 2.9874768257141113, + "step": 9312, + "token_acc": 0.3021902556180271 + }, + { + "epoch": 5.45910290237467, + "grad_norm": 0.18848562525728227, + "learning_rate": 0.0004350754832311725, + "loss": 3.001331329345703, + "step": 9313, + "token_acc": 0.2978758174173115 + }, + { + "epoch": 5.459689240691879, + "grad_norm": 0.1743419543391133, + "learning_rate": 0.00043505919300598116, + "loss": 3.0114850997924805, + "step": 9314, + "token_acc": 0.29753927525905527 + }, + { + "epoch": 5.460275579009088, + "grad_norm": 0.16160176014485184, + "learning_rate": 0.0004350429010424082, + "loss": 2.9708027839660645, + "step": 9315, + "token_acc": 0.303546845474579 + }, + { + "epoch": 5.460861917326297, + "grad_norm": 0.1679364862816142, + "learning_rate": 0.0004350266073406065, + "loss": 2.9907479286193848, + "step": 9316, + "token_acc": 0.3013923298783718 + }, + { + "epoch": 5.461448255643506, + "grad_norm": 0.16828796381774416, + "learning_rate": 0.0004350103119007291, + "loss": 2.9992048740386963, + "step": 9317, + "token_acc": 0.2986363540301788 + }, + { + "epoch": 5.462034593960715, + "grad_norm": 0.17432363369494414, + "learning_rate": 0.0004349940147229291, + "loss": 2.998342275619507, + "step": 9318, + "token_acc": 0.29761861223295594 + }, + { + "epoch": 5.4626209322779244, + "grad_norm": 0.15796614884459884, + "learning_rate": 0.0004349777158073597, + "loss": 3.0219717025756836, + "step": 9319, + "token_acc": 0.2963373852956559 + }, + { + "epoch": 5.463207270595134, + "grad_norm": 0.17108010254385855, + "learning_rate": 0.0004349614151541739, + "loss": 2.9519553184509277, + "step": 9320, + "token_acc": 0.3052930157880653 + }, + { + "epoch": 5.463793608912343, + "grad_norm": 0.16508200271117104, + "learning_rate": 0.00043494511276352476, + "loss": 2.9958677291870117, + "step": 9321, + "token_acc": 0.29816006516098753 + }, + { + "epoch": 5.464379947229552, + "grad_norm": 0.16089407466310768, + "learning_rate": 0.00043492880863556563, + "loss": 2.9480462074279785, + "step": 9322, + "token_acc": 0.3062120695181084 + }, + { + "epoch": 5.464966285546761, + "grad_norm": 0.17575109116089047, + "learning_rate": 0.0004349125027704495, + "loss": 3.0240964889526367, + "step": 9323, + "token_acc": 0.2949767441860465 + }, + { + "epoch": 5.465552623863969, + "grad_norm": 0.16368225472213027, + "learning_rate": 0.0004348961951683295, + "loss": 2.9775891304016113, + "step": 9324, + "token_acc": 0.302826786614448 + }, + { + "epoch": 5.466138962181178, + "grad_norm": 0.16068471993661282, + "learning_rate": 0.000434879885829359, + "loss": 2.9678049087524414, + "step": 9325, + "token_acc": 0.3033337182257559 + }, + { + "epoch": 5.466725300498387, + "grad_norm": 0.18143339984879545, + "learning_rate": 0.000434863574753691, + "loss": 2.999800682067871, + "step": 9326, + "token_acc": 0.29858578763054866 + }, + { + "epoch": 5.4673116388155965, + "grad_norm": 0.1714597427861293, + "learning_rate": 0.0004348472619414789, + "loss": 2.9835762977600098, + "step": 9327, + "token_acc": 0.3013311567624895 + }, + { + "epoch": 5.467897977132806, + "grad_norm": 0.1606308949179028, + "learning_rate": 0.00043483094739287583, + "loss": 3.0146384239196777, + "step": 9328, + "token_acc": 0.29494197503831837 + }, + { + "epoch": 5.468484315450015, + "grad_norm": 0.22327434064351578, + "learning_rate": 0.0004348146311080351, + "loss": 3.0092413425445557, + "step": 9329, + "token_acc": 0.29714063266568547 + }, + { + "epoch": 5.469070653767224, + "grad_norm": 0.20174208910688704, + "learning_rate": 0.0004347983130871099, + "loss": 3.0328457355499268, + "step": 9330, + "token_acc": 0.29422749303363666 + }, + { + "epoch": 5.469656992084433, + "grad_norm": 0.20139370258095388, + "learning_rate": 0.0004347819933302536, + "loss": 3.0082056522369385, + "step": 9331, + "token_acc": 0.2987750160054218 + }, + { + "epoch": 5.470243330401642, + "grad_norm": 0.21304896528816133, + "learning_rate": 0.0004347656718376195, + "loss": 3.0142111778259277, + "step": 9332, + "token_acc": 0.29565170440228933 + }, + { + "epoch": 5.470829668718851, + "grad_norm": 0.1671499857318402, + "learning_rate": 0.00043474934860936084, + "loss": 3.001624822616577, + "step": 9333, + "token_acc": 0.29842798187004055 + }, + { + "epoch": 5.47141600703606, + "grad_norm": 0.1771438022559278, + "learning_rate": 0.000434733023645631, + "loss": 3.044080972671509, + "step": 9334, + "token_acc": 0.29239327557689954 + }, + { + "epoch": 5.4720023453532685, + "grad_norm": 0.17616371080157178, + "learning_rate": 0.0004347166969465833, + "loss": 3.0142054557800293, + "step": 9335, + "token_acc": 0.297204885947399 + }, + { + "epoch": 5.472588683670478, + "grad_norm": 0.23854516696567812, + "learning_rate": 0.0004347003685123713, + "loss": 2.941039562225342, + "step": 9336, + "token_acc": 0.30678391695821644 + }, + { + "epoch": 5.473175021987687, + "grad_norm": 0.27139848290724333, + "learning_rate": 0.00043468403834314806, + "loss": 3.019939422607422, + "step": 9337, + "token_acc": 0.2958992674332099 + }, + { + "epoch": 5.473761360304896, + "grad_norm": 0.23103085215293517, + "learning_rate": 0.00043466770643906715, + "loss": 3.024759292602539, + "step": 9338, + "token_acc": 0.2953997514188592 + }, + { + "epoch": 5.474347698622105, + "grad_norm": 0.199254949621376, + "learning_rate": 0.0004346513728002821, + "loss": 2.948340892791748, + "step": 9339, + "token_acc": 0.3054001886337216 + }, + { + "epoch": 5.474934036939314, + "grad_norm": 0.24108422225289827, + "learning_rate": 0.0004346350374269461, + "loss": 2.9728355407714844, + "step": 9340, + "token_acc": 0.30249291726650035 + }, + { + "epoch": 5.475520375256523, + "grad_norm": 0.18333449599754734, + "learning_rate": 0.0004346187003192128, + "loss": 3.007983684539795, + "step": 9341, + "token_acc": 0.2965609322945698 + }, + { + "epoch": 5.476106713573732, + "grad_norm": 0.21960325550894513, + "learning_rate": 0.00043460236147723554, + "loss": 2.9771604537963867, + "step": 9342, + "token_acc": 0.3029555330514968 + }, + { + "epoch": 5.476693051890941, + "grad_norm": 0.21545323323924467, + "learning_rate": 0.0004345860209011679, + "loss": 3.042112112045288, + "step": 9343, + "token_acc": 0.2935750961258718 + }, + { + "epoch": 5.4772793902081505, + "grad_norm": 0.22157656146051166, + "learning_rate": 0.00043456967859116323, + "loss": 3.02256178855896, + "step": 9344, + "token_acc": 0.29564296915838995 + }, + { + "epoch": 5.477865728525359, + "grad_norm": 0.22812204607766765, + "learning_rate": 0.0004345533345473753, + "loss": 2.99564528465271, + "step": 9345, + "token_acc": 0.300691012603308 + }, + { + "epoch": 5.478452066842568, + "grad_norm": 0.16539667899202423, + "learning_rate": 0.0004345369887699574, + "loss": 2.991602897644043, + "step": 9346, + "token_acc": 0.30129104943838436 + }, + { + "epoch": 5.479038405159777, + "grad_norm": 0.19475536965149032, + "learning_rate": 0.0004345206412590631, + "loss": 3.02744722366333, + "step": 9347, + "token_acc": 0.2950667609911991 + }, + { + "epoch": 5.479624743476986, + "grad_norm": 0.1600815320060909, + "learning_rate": 0.0004345042920148461, + "loss": 2.9872257709503174, + "step": 9348, + "token_acc": 0.30130164704858464 + }, + { + "epoch": 5.480211081794195, + "grad_norm": 0.1933174638543292, + "learning_rate": 0.0004344879410374598, + "loss": 3.0092458724975586, + "step": 9349, + "token_acc": 0.29812729587577225 + }, + { + "epoch": 5.480797420111404, + "grad_norm": 0.1727257584071273, + "learning_rate": 0.00043447158832705805, + "loss": 3.0022435188293457, + "step": 9350, + "token_acc": 0.300934805340844 + }, + { + "epoch": 5.481383758428613, + "grad_norm": 0.18644676950434388, + "learning_rate": 0.00043445523388379415, + "loss": 3.0150766372680664, + "step": 9351, + "token_acc": 0.2966749374456903 + }, + { + "epoch": 5.4819700967458225, + "grad_norm": 0.15508637663972538, + "learning_rate": 0.000434438877707822, + "loss": 3.0035812854766846, + "step": 9352, + "token_acc": 0.2975214307378772 + }, + { + "epoch": 5.482556435063032, + "grad_norm": 0.169499747956723, + "learning_rate": 0.00043442251979929503, + "loss": 3.0251708030700684, + "step": 9353, + "token_acc": 0.29517203262421055 + }, + { + "epoch": 5.483142773380241, + "grad_norm": 0.15878939296820843, + "learning_rate": 0.00043440616015836707, + "loss": 2.9710445404052734, + "step": 9354, + "token_acc": 0.30166225816799275 + }, + { + "epoch": 5.48372911169745, + "grad_norm": 0.18162086682813014, + "learning_rate": 0.00043438979878519167, + "loss": 3.005713701248169, + "step": 9355, + "token_acc": 0.2981250256284086 + }, + { + "epoch": 5.484315450014659, + "grad_norm": 0.20055223048789003, + "learning_rate": 0.0004343734356799226, + "loss": 3.0156188011169434, + "step": 9356, + "token_acc": 0.2956498232795972 + }, + { + "epoch": 5.484901788331867, + "grad_norm": 0.17553405711785136, + "learning_rate": 0.0004343570708427136, + "loss": 2.993959426879883, + "step": 9357, + "token_acc": 0.29916330909708394 + }, + { + "epoch": 5.485488126649076, + "grad_norm": 0.24411861725253006, + "learning_rate": 0.0004343407042737183, + "loss": 3.007037878036499, + "step": 9358, + "token_acc": 0.29775937639362776 + }, + { + "epoch": 5.486074464966285, + "grad_norm": 0.23974016629393172, + "learning_rate": 0.00043432433597309053, + "loss": 2.9751949310302734, + "step": 9359, + "token_acc": 0.30380451212301995 + }, + { + "epoch": 5.4866608032834945, + "grad_norm": 0.18379043830839406, + "learning_rate": 0.0004343079659409839, + "loss": 2.967970371246338, + "step": 9360, + "token_acc": 0.3029283690067507 + }, + { + "epoch": 5.487247141600704, + "grad_norm": 0.18169451798087394, + "learning_rate": 0.0004342915941775524, + "loss": 2.987395763397217, + "step": 9361, + "token_acc": 0.30139185469139934 + }, + { + "epoch": 5.487833479917913, + "grad_norm": 0.16386457270493635, + "learning_rate": 0.00043427522068294964, + "loss": 3.0131359100341797, + "step": 9362, + "token_acc": 0.29625857197843486 + }, + { + "epoch": 5.488419818235122, + "grad_norm": 0.18017356074422491, + "learning_rate": 0.0004342588454573295, + "loss": 2.94559645652771, + "step": 9363, + "token_acc": 0.3063685557235271 + }, + { + "epoch": 5.489006156552331, + "grad_norm": 0.1915794965924, + "learning_rate": 0.0004342424685008458, + "loss": 3.0099494457244873, + "step": 9364, + "token_acc": 0.29770148969688 + }, + { + "epoch": 5.48959249486954, + "grad_norm": 0.16935458706535836, + "learning_rate": 0.0004342260898136524, + "loss": 2.9863579273223877, + "step": 9365, + "token_acc": 0.29992056489485847 + }, + { + "epoch": 5.490178833186749, + "grad_norm": 0.1775506097557626, + "learning_rate": 0.00043420970939590315, + "loss": 2.9944534301757812, + "step": 9366, + "token_acc": 0.300386918987335 + }, + { + "epoch": 5.490765171503957, + "grad_norm": 0.1974878024972098, + "learning_rate": 0.00043419332724775183, + "loss": 3.0044822692871094, + "step": 9367, + "token_acc": 0.29898780564857524 + }, + { + "epoch": 5.4913515098211665, + "grad_norm": 0.16881773587847546, + "learning_rate": 0.00043417694336935256, + "loss": 3.0309898853302, + "step": 9368, + "token_acc": 0.2934425913853382 + }, + { + "epoch": 5.491937848138376, + "grad_norm": 0.17724400557659958, + "learning_rate": 0.00043416055776085893, + "loss": 3.0080833435058594, + "step": 9369, + "token_acc": 0.2971438302582633 + }, + { + "epoch": 5.492524186455585, + "grad_norm": 0.15701262671466007, + "learning_rate": 0.00043414417042242506, + "loss": 3.0453224182128906, + "step": 9370, + "token_acc": 0.29231416469161803 + }, + { + "epoch": 5.493110524772794, + "grad_norm": 0.19354910230130587, + "learning_rate": 0.0004341277813542049, + "loss": 2.989180564880371, + "step": 9371, + "token_acc": 0.30065705469248777 + }, + { + "epoch": 5.493696863090003, + "grad_norm": 0.23206530054141628, + "learning_rate": 0.0004341113905563523, + "loss": 3.0411672592163086, + "step": 9372, + "token_acc": 0.29412633467530197 + }, + { + "epoch": 5.494283201407212, + "grad_norm": 0.19446118650442637, + "learning_rate": 0.0004340949980290213, + "loss": 3.0418009757995605, + "step": 9373, + "token_acc": 0.2931182966065528 + }, + { + "epoch": 5.494869539724421, + "grad_norm": 0.18023346085427694, + "learning_rate": 0.00043407860377236583, + "loss": 2.970451593399048, + "step": 9374, + "token_acc": 0.3025208310246384 + }, + { + "epoch": 5.49545587804163, + "grad_norm": 0.18846912317183515, + "learning_rate": 0.0004340622077865399, + "loss": 3.0217268466949463, + "step": 9375, + "token_acc": 0.29559524304977525 + }, + { + "epoch": 5.496042216358839, + "grad_norm": 0.18030663470292602, + "learning_rate": 0.00043404581007169764, + "loss": 3.004028081893921, + "step": 9376, + "token_acc": 0.2991030748691432 + }, + { + "epoch": 5.4966285546760485, + "grad_norm": 0.1826128650138986, + "learning_rate": 0.000434029410627993, + "loss": 3.0328335762023926, + "step": 9377, + "token_acc": 0.29389144419967406 + }, + { + "epoch": 5.497214892993257, + "grad_norm": 0.16835067970420253, + "learning_rate": 0.00043401300945558, + "loss": 3.0004849433898926, + "step": 9378, + "token_acc": 0.29846186712240974 + }, + { + "epoch": 5.497801231310466, + "grad_norm": 0.18967900230634488, + "learning_rate": 0.0004339966065546127, + "loss": 3.0322606563568115, + "step": 9379, + "token_acc": 0.2957057353105662 + }, + { + "epoch": 5.498387569627675, + "grad_norm": 0.21495167929653738, + "learning_rate": 0.00043398020192524523, + "loss": 3.0453951358795166, + "step": 9380, + "token_acc": 0.29316311185136074 + }, + { + "epoch": 5.498973907944884, + "grad_norm": 0.1712357227152772, + "learning_rate": 0.0004339637955676318, + "loss": 2.9697022438049316, + "step": 9381, + "token_acc": 0.3007912726279548 + }, + { + "epoch": 5.499560246262093, + "grad_norm": 0.23446145021797088, + "learning_rate": 0.0004339473874819262, + "loss": 2.98172664642334, + "step": 9382, + "token_acc": 0.30057683582863437 + }, + { + "epoch": 5.500146584579302, + "grad_norm": 0.2907211520884201, + "learning_rate": 0.00043393097766828293, + "loss": 2.9900078773498535, + "step": 9383, + "token_acc": 0.29807310104548207 + }, + { + "epoch": 5.500732922896511, + "grad_norm": 0.16959876754183015, + "learning_rate": 0.0004339145661268559, + "loss": 3.022305488586426, + "step": 9384, + "token_acc": 0.29637482795922826 + }, + { + "epoch": 5.5013192612137205, + "grad_norm": 0.21187735366206484, + "learning_rate": 0.0004338981528577994, + "loss": 3.000006675720215, + "step": 9385, + "token_acc": 0.2995254164369933 + }, + { + "epoch": 5.50190559953093, + "grad_norm": 0.2443164023925679, + "learning_rate": 0.0004338817378612675, + "loss": 3.0584471225738525, + "step": 9386, + "token_acc": 0.28929861903019655 + }, + { + "epoch": 5.502491937848139, + "grad_norm": 0.1671842632629723, + "learning_rate": 0.0004338653211374145, + "loss": 3.0012640953063965, + "step": 9387, + "token_acc": 0.2984004728538658 + }, + { + "epoch": 5.503078276165347, + "grad_norm": 0.19813626163432357, + "learning_rate": 0.0004338489026863945, + "loss": 2.9920334815979004, + "step": 9388, + "token_acc": 0.29932451823428746 + }, + { + "epoch": 5.503664614482556, + "grad_norm": 0.17687717320635774, + "learning_rate": 0.00043383248250836187, + "loss": 3.0035901069641113, + "step": 9389, + "token_acc": 0.2986588751510074 + }, + { + "epoch": 5.504250952799765, + "grad_norm": 0.19068261016671093, + "learning_rate": 0.00043381606060347076, + "loss": 2.984023094177246, + "step": 9390, + "token_acc": 0.30226055135020585 + }, + { + "epoch": 5.504837291116974, + "grad_norm": 0.15992504897434573, + "learning_rate": 0.00043379963697187554, + "loss": 3.0117697715759277, + "step": 9391, + "token_acc": 0.2966624106230848 + }, + { + "epoch": 5.505423629434183, + "grad_norm": 0.1896594067260591, + "learning_rate": 0.00043378321161373026, + "loss": 3.0310535430908203, + "step": 9392, + "token_acc": 0.2949860127035132 + }, + { + "epoch": 5.5060099677513925, + "grad_norm": 0.16920094324649537, + "learning_rate": 0.0004337667845291894, + "loss": 2.99334716796875, + "step": 9393, + "token_acc": 0.29846604959701756 + }, + { + "epoch": 5.506596306068602, + "grad_norm": 0.176388198803083, + "learning_rate": 0.00043375035571840726, + "loss": 3.0528552532196045, + "step": 9394, + "token_acc": 0.2911713331620858 + }, + { + "epoch": 5.507182644385811, + "grad_norm": 0.17092608085548297, + "learning_rate": 0.00043373392518153815, + "loss": 2.986557960510254, + "step": 9395, + "token_acc": 0.30249499227305804 + }, + { + "epoch": 5.50776898270302, + "grad_norm": 0.15328530036068685, + "learning_rate": 0.0004337174929187364, + "loss": 3.0155205726623535, + "step": 9396, + "token_acc": 0.2970230022916062 + }, + { + "epoch": 5.508355321020229, + "grad_norm": 0.18237846686239156, + "learning_rate": 0.0004337010589301563, + "loss": 2.9875736236572266, + "step": 9397, + "token_acc": 0.3006323535047714 + }, + { + "epoch": 5.508941659337438, + "grad_norm": 0.15841846383228525, + "learning_rate": 0.00043368462321595236, + "loss": 3.0443122386932373, + "step": 9398, + "token_acc": 0.2914185151423073 + }, + { + "epoch": 5.509527997654647, + "grad_norm": 0.16263065524028364, + "learning_rate": 0.00043366818577627886, + "loss": 2.9979233741760254, + "step": 9399, + "token_acc": 0.29885475272351814 + }, + { + "epoch": 5.510114335971855, + "grad_norm": 0.1723298723366606, + "learning_rate": 0.0004336517466112903, + "loss": 2.995983600616455, + "step": 9400, + "token_acc": 0.2978448076656448 + }, + { + "epoch": 5.5107006742890645, + "grad_norm": 0.20187269414754777, + "learning_rate": 0.000433635305721141, + "loss": 3.0249767303466797, + "step": 9401, + "token_acc": 0.29541429173865136 + }, + { + "epoch": 5.511287012606274, + "grad_norm": 0.18664800798980863, + "learning_rate": 0.0004336188631059855, + "loss": 3.0449013710021973, + "step": 9402, + "token_acc": 0.292352200527398 + }, + { + "epoch": 5.511873350923483, + "grad_norm": 0.15324617081762024, + "learning_rate": 0.00043360241876597817, + "loss": 2.972271203994751, + "step": 9403, + "token_acc": 0.3038447509737675 + }, + { + "epoch": 5.512459689240692, + "grad_norm": 0.17140731816783925, + "learning_rate": 0.00043358597270127353, + "loss": 2.977762222290039, + "step": 9404, + "token_acc": 0.3025508731418675 + }, + { + "epoch": 5.513046027557901, + "grad_norm": 0.18327996559283097, + "learning_rate": 0.0004335695249120261, + "loss": 2.9809370040893555, + "step": 9405, + "token_acc": 0.2999408274968501 + }, + { + "epoch": 5.51363236587511, + "grad_norm": 0.18974578362194042, + "learning_rate": 0.00043355307539839026, + "loss": 3.010877847671509, + "step": 9406, + "token_acc": 0.29699738731304526 + }, + { + "epoch": 5.514218704192319, + "grad_norm": 0.1834888309245768, + "learning_rate": 0.0004335366241605207, + "loss": 2.9924750328063965, + "step": 9407, + "token_acc": 0.29959800924011043 + }, + { + "epoch": 5.514805042509528, + "grad_norm": 0.16288238828971746, + "learning_rate": 0.0004335201711985718, + "loss": 3.0277392864227295, + "step": 9408, + "token_acc": 0.2942145762624038 + }, + { + "epoch": 5.515391380826737, + "grad_norm": 0.16089403547306336, + "learning_rate": 0.00043350371651269825, + "loss": 2.9950387477874756, + "step": 9409, + "token_acc": 0.2991269017109097 + }, + { + "epoch": 5.515977719143946, + "grad_norm": 0.15991453230368985, + "learning_rate": 0.0004334872601030546, + "loss": 2.969026565551758, + "step": 9410, + "token_acc": 0.3028002726214051 + }, + { + "epoch": 5.516564057461155, + "grad_norm": 0.1967749455189207, + "learning_rate": 0.0004334708019697953, + "loss": 3.0168423652648926, + "step": 9411, + "token_acc": 0.2964568616264732 + }, + { + "epoch": 5.517150395778364, + "grad_norm": 0.2366919285871696, + "learning_rate": 0.0004334543421130751, + "loss": 3.01121187210083, + "step": 9412, + "token_acc": 0.2964904170023908 + }, + { + "epoch": 5.517736734095573, + "grad_norm": 0.23840548553057841, + "learning_rate": 0.00043343788053304855, + "loss": 3.015990972518921, + "step": 9413, + "token_acc": 0.29593804861535716 + }, + { + "epoch": 5.518323072412782, + "grad_norm": 0.19128260085810947, + "learning_rate": 0.00043342141722987025, + "loss": 2.993687152862549, + "step": 9414, + "token_acc": 0.2999158970085569 + }, + { + "epoch": 5.518909410729991, + "grad_norm": 0.1638974745229396, + "learning_rate": 0.000433404952203695, + "loss": 2.9645180702209473, + "step": 9415, + "token_acc": 0.30518516606213936 + }, + { + "epoch": 5.5194957490472, + "grad_norm": 0.20631894037930643, + "learning_rate": 0.0004333884854546773, + "loss": 2.9568023681640625, + "step": 9416, + "token_acc": 0.3049634063737341 + }, + { + "epoch": 5.520082087364409, + "grad_norm": 0.19887940177501076, + "learning_rate": 0.0004333720169829719, + "loss": 3.007230281829834, + "step": 9417, + "token_acc": 0.29784604566798684 + }, + { + "epoch": 5.5206684256816185, + "grad_norm": 0.1830773133928135, + "learning_rate": 0.00043335554678873345, + "loss": 3.0155134201049805, + "step": 9418, + "token_acc": 0.2965248363999102 + }, + { + "epoch": 5.521254763998828, + "grad_norm": 0.23703950501463483, + "learning_rate": 0.00043333907487211684, + "loss": 3.0468897819519043, + "step": 9419, + "token_acc": 0.2931239480551567 + }, + { + "epoch": 5.521841102316037, + "grad_norm": 0.20751805438002874, + "learning_rate": 0.0004333226012332766, + "loss": 3.0036652088165283, + "step": 9420, + "token_acc": 0.30087034577609334 + }, + { + "epoch": 5.522427440633246, + "grad_norm": 0.16104949912946592, + "learning_rate": 0.0004333061258723675, + "loss": 2.9873478412628174, + "step": 9421, + "token_acc": 0.29975913415065736 + }, + { + "epoch": 5.523013778950454, + "grad_norm": 0.2109539337701507, + "learning_rate": 0.00043328964878954445, + "loss": 3.0426526069641113, + "step": 9422, + "token_acc": 0.29446116574185255 + }, + { + "epoch": 5.523600117267663, + "grad_norm": 0.16941032704916537, + "learning_rate": 0.00043327316998496206, + "loss": 3.024261474609375, + "step": 9423, + "token_acc": 0.2959658970611777 + }, + { + "epoch": 5.524186455584872, + "grad_norm": 0.20434604066975953, + "learning_rate": 0.00043325668945877526, + "loss": 3.0104238986968994, + "step": 9424, + "token_acc": 0.2969926793337366 + }, + { + "epoch": 5.524772793902081, + "grad_norm": 0.1749752595667825, + "learning_rate": 0.00043324020721113876, + "loss": 2.9500277042388916, + "step": 9425, + "token_acc": 0.3064660501523786 + }, + { + "epoch": 5.5253591322192905, + "grad_norm": 0.16697771530572791, + "learning_rate": 0.0004332237232422075, + "loss": 3.023108959197998, + "step": 9426, + "token_acc": 0.29665352218817626 + }, + { + "epoch": 5.5259454705365, + "grad_norm": 0.2752882037177016, + "learning_rate": 0.0004332072375521362, + "loss": 3.0061073303222656, + "step": 9427, + "token_acc": 0.29803831152612065 + }, + { + "epoch": 5.526531808853709, + "grad_norm": 0.25590083139718867, + "learning_rate": 0.0004331907501410798, + "loss": 3.0086348056793213, + "step": 9428, + "token_acc": 0.29783920980374856 + }, + { + "epoch": 5.527118147170918, + "grad_norm": 0.17319592805934894, + "learning_rate": 0.0004331742610091931, + "loss": 3.012814521789551, + "step": 9429, + "token_acc": 0.2979724496921221 + }, + { + "epoch": 5.527704485488127, + "grad_norm": 0.21454232799656464, + "learning_rate": 0.0004331577701566311, + "loss": 3.007486581802368, + "step": 9430, + "token_acc": 0.29804485831947125 + }, + { + "epoch": 5.528290823805335, + "grad_norm": 0.22447794462132634, + "learning_rate": 0.0004331412775835486, + "loss": 2.9998936653137207, + "step": 9431, + "token_acc": 0.29816750464800895 + }, + { + "epoch": 5.528877162122544, + "grad_norm": 0.2875163596874977, + "learning_rate": 0.00043312478329010065, + "loss": 3.067638635635376, + "step": 9432, + "token_acc": 0.2889450543664271 + }, + { + "epoch": 5.529463500439753, + "grad_norm": 0.17526870198281402, + "learning_rate": 0.00043310828727644214, + "loss": 2.993114709854126, + "step": 9433, + "token_acc": 0.29982795710361665 + }, + { + "epoch": 5.5300498387569625, + "grad_norm": 0.1935005606618356, + "learning_rate": 0.000433091789542728, + "loss": 2.9839954376220703, + "step": 9434, + "token_acc": 0.3010632694176998 + }, + { + "epoch": 5.530636177074172, + "grad_norm": 0.17675171428271097, + "learning_rate": 0.00043307529008911315, + "loss": 3.0044941902160645, + "step": 9435, + "token_acc": 0.29715196856185183 + }, + { + "epoch": 5.531222515391381, + "grad_norm": 0.17227898136409595, + "learning_rate": 0.00043305878891575266, + "loss": 3.031464099884033, + "step": 9436, + "token_acc": 0.29422518696661126 + }, + { + "epoch": 5.53180885370859, + "grad_norm": 0.19061003497288673, + "learning_rate": 0.0004330422860228016, + "loss": 2.971642255783081, + "step": 9437, + "token_acc": 0.30416964608370045 + }, + { + "epoch": 5.532395192025799, + "grad_norm": 0.19336355354909046, + "learning_rate": 0.00043302578141041486, + "loss": 3.00479793548584, + "step": 9438, + "token_acc": 0.29789834512331337 + }, + { + "epoch": 5.532981530343008, + "grad_norm": 0.19053764720440614, + "learning_rate": 0.00043300927507874753, + "loss": 3.0219674110412598, + "step": 9439, + "token_acc": 0.29553795989519616 + }, + { + "epoch": 5.533567868660217, + "grad_norm": 0.1795587059848596, + "learning_rate": 0.0004329927670279547, + "loss": 2.9725608825683594, + "step": 9440, + "token_acc": 0.30276111887691454 + }, + { + "epoch": 5.534154206977426, + "grad_norm": 0.16730239467832667, + "learning_rate": 0.0004329762572581914, + "loss": 3.0156161785125732, + "step": 9441, + "token_acc": 0.29544475151665284 + }, + { + "epoch": 5.534740545294635, + "grad_norm": 0.2471057373874181, + "learning_rate": 0.00043295974576961274, + "loss": 3.0116987228393555, + "step": 9442, + "token_acc": 0.2962087084176546 + }, + { + "epoch": 5.535326883611844, + "grad_norm": 0.15518135666905755, + "learning_rate": 0.0004329432325623738, + "loss": 2.9920949935913086, + "step": 9443, + "token_acc": 0.2996758339433232 + }, + { + "epoch": 5.535913221929053, + "grad_norm": 0.15241851584977692, + "learning_rate": 0.0004329267176366297, + "loss": 2.9840667247772217, + "step": 9444, + "token_acc": 0.300709035563145 + }, + { + "epoch": 5.536499560246262, + "grad_norm": 0.14709230456075592, + "learning_rate": 0.00043291020099253555, + "loss": 3.0079855918884277, + "step": 9445, + "token_acc": 0.29767388925819493 + }, + { + "epoch": 5.537085898563471, + "grad_norm": 0.1491350928490702, + "learning_rate": 0.00043289368263024655, + "loss": 2.9527130126953125, + "step": 9446, + "token_acc": 0.3067292604368832 + }, + { + "epoch": 5.53767223688068, + "grad_norm": 0.15493413931037414, + "learning_rate": 0.0004328771625499179, + "loss": 3.022693634033203, + "step": 9447, + "token_acc": 0.2946309304321768 + }, + { + "epoch": 5.538258575197889, + "grad_norm": 0.1496983759595075, + "learning_rate": 0.0004328606407517047, + "loss": 3.0094494819641113, + "step": 9448, + "token_acc": 0.2970012593845771 + }, + { + "epoch": 5.538844913515098, + "grad_norm": 0.16415841876125883, + "learning_rate": 0.0004328441172357622, + "loss": 2.9764533042907715, + "step": 9449, + "token_acc": 0.3007958346546098 + }, + { + "epoch": 5.5394312518323074, + "grad_norm": 0.15710987994295583, + "learning_rate": 0.00043282759200224556, + "loss": 2.9747698307037354, + "step": 9450, + "token_acc": 0.3032096196329081 + }, + { + "epoch": 5.540017590149517, + "grad_norm": 0.1577542351239319, + "learning_rate": 0.0004328110650513101, + "loss": 3.002152681350708, + "step": 9451, + "token_acc": 0.29886545466241043 + }, + { + "epoch": 5.540603928466726, + "grad_norm": 0.15485076271625242, + "learning_rate": 0.00043279453638311107, + "loss": 3.0010223388671875, + "step": 9452, + "token_acc": 0.2992914516814983 + }, + { + "epoch": 5.541190266783934, + "grad_norm": 0.16379006894422668, + "learning_rate": 0.00043277800599780364, + "loss": 3.0720930099487305, + "step": 9453, + "token_acc": 0.2888835302628406 + }, + { + "epoch": 5.541776605101143, + "grad_norm": 0.15439909680552336, + "learning_rate": 0.0004327614738955431, + "loss": 2.9842398166656494, + "step": 9454, + "token_acc": 0.3001962416905164 + }, + { + "epoch": 5.542362943418352, + "grad_norm": 0.15270878349037145, + "learning_rate": 0.0004327449400764849, + "loss": 3.042994976043701, + "step": 9455, + "token_acc": 0.2923127819233011 + }, + { + "epoch": 5.542949281735561, + "grad_norm": 0.16045093121590084, + "learning_rate": 0.0004327284045407841, + "loss": 3.035616636276245, + "step": 9456, + "token_acc": 0.2930281401939159 + }, + { + "epoch": 5.54353562005277, + "grad_norm": 0.15264748646531065, + "learning_rate": 0.00043271186728859626, + "loss": 3.0118813514709473, + "step": 9457, + "token_acc": 0.29589908205811954 + }, + { + "epoch": 5.5441219583699795, + "grad_norm": 0.17808934519793188, + "learning_rate": 0.0004326953283200766, + "loss": 3.0128226280212402, + "step": 9458, + "token_acc": 0.2940254399926992 + }, + { + "epoch": 5.544708296687189, + "grad_norm": 0.17305300894296483, + "learning_rate": 0.00043267878763538056, + "loss": 2.9983043670654297, + "step": 9459, + "token_acc": 0.299401134884744 + }, + { + "epoch": 5.545294635004398, + "grad_norm": 0.20042670898163922, + "learning_rate": 0.00043266224523466347, + "loss": 2.977257251739502, + "step": 9460, + "token_acc": 0.3009390616705413 + }, + { + "epoch": 5.545880973321607, + "grad_norm": 0.18409919883494427, + "learning_rate": 0.00043264570111808064, + "loss": 3.039703369140625, + "step": 9461, + "token_acc": 0.2931015674264778 + }, + { + "epoch": 5.546467311638816, + "grad_norm": 0.16439806752008956, + "learning_rate": 0.00043262915528578767, + "loss": 3.0225369930267334, + "step": 9462, + "token_acc": 0.29539860576643145 + }, + { + "epoch": 5.547053649956025, + "grad_norm": 0.17286503420805704, + "learning_rate": 0.0004326126077379398, + "loss": 2.992945909500122, + "step": 9463, + "token_acc": 0.300638338290669 + }, + { + "epoch": 5.547639988273234, + "grad_norm": 0.173935008201628, + "learning_rate": 0.00043259605847469263, + "loss": 3.0009875297546387, + "step": 9464, + "token_acc": 0.29795214786851 + }, + { + "epoch": 5.548226326590442, + "grad_norm": 0.16709085431629267, + "learning_rate": 0.00043257950749620147, + "loss": 2.9774985313415527, + "step": 9465, + "token_acc": 0.3012531074019583 + }, + { + "epoch": 5.5488126649076515, + "grad_norm": 0.1934510195318812, + "learning_rate": 0.00043256295480262195, + "loss": 3.0060086250305176, + "step": 9466, + "token_acc": 0.3000656243968346 + }, + { + "epoch": 5.549399003224861, + "grad_norm": 0.2728313701104438, + "learning_rate": 0.00043254640039410943, + "loss": 2.9284114837646484, + "step": 9467, + "token_acc": 0.30796650593664127 + }, + { + "epoch": 5.54998534154207, + "grad_norm": 0.3647029398591561, + "learning_rate": 0.00043252984427081945, + "loss": 3.061497688293457, + "step": 9468, + "token_acc": 0.2908314452312613 + }, + { + "epoch": 5.550571679859279, + "grad_norm": 0.2398856113506125, + "learning_rate": 0.0004325132864329075, + "loss": 3.0241472721099854, + "step": 9469, + "token_acc": 0.2960483742084566 + }, + { + "epoch": 5.551158018176488, + "grad_norm": 0.2214949390067725, + "learning_rate": 0.0004324967268805292, + "loss": 2.998025417327881, + "step": 9470, + "token_acc": 0.2997026147754056 + }, + { + "epoch": 5.551744356493697, + "grad_norm": 0.22839354474485768, + "learning_rate": 0.00043248016561384014, + "loss": 3.0145392417907715, + "step": 9471, + "token_acc": 0.29481959292381493 + }, + { + "epoch": 5.552330694810906, + "grad_norm": 0.20551582814020553, + "learning_rate": 0.00043246360263299576, + "loss": 3.010359287261963, + "step": 9472, + "token_acc": 0.2984882462006645 + }, + { + "epoch": 5.552917033128115, + "grad_norm": 0.18292084583042478, + "learning_rate": 0.00043244703793815167, + "loss": 2.997507095336914, + "step": 9473, + "token_acc": 0.29935713344750364 + }, + { + "epoch": 5.5535033714453235, + "grad_norm": 0.19812503568566023, + "learning_rate": 0.00043243047152946356, + "loss": 2.9943928718566895, + "step": 9474, + "token_acc": 0.30122276785483765 + }, + { + "epoch": 5.554089709762533, + "grad_norm": 0.15795928045482333, + "learning_rate": 0.00043241390340708697, + "loss": 3.0328638553619385, + "step": 9475, + "token_acc": 0.29366938488782995 + }, + { + "epoch": 5.554676048079742, + "grad_norm": 0.213239481451516, + "learning_rate": 0.0004323973335711776, + "loss": 3.0118188858032227, + "step": 9476, + "token_acc": 0.29764269176030755 + }, + { + "epoch": 5.555262386396951, + "grad_norm": 0.15263667282382087, + "learning_rate": 0.000432380762021891, + "loss": 3.031569719314575, + "step": 9477, + "token_acc": 0.2929050498869861 + }, + { + "epoch": 5.55584872471416, + "grad_norm": 0.20638453816711766, + "learning_rate": 0.000432364188759383, + "loss": 3.0327658653259277, + "step": 9478, + "token_acc": 0.2926560625491251 + }, + { + "epoch": 5.556435063031369, + "grad_norm": 0.16573847023168606, + "learning_rate": 0.0004323476137838092, + "loss": 2.987891912460327, + "step": 9479, + "token_acc": 0.30018902504385747 + }, + { + "epoch": 5.557021401348578, + "grad_norm": 0.22255895005456156, + "learning_rate": 0.0004323310370953252, + "loss": 2.985628843307495, + "step": 9480, + "token_acc": 0.3011409692219205 + }, + { + "epoch": 5.557607739665787, + "grad_norm": 0.17078694536071845, + "learning_rate": 0.0004323144586940868, + "loss": 3.0173001289367676, + "step": 9481, + "token_acc": 0.2958549032758368 + }, + { + "epoch": 5.558194077982996, + "grad_norm": 0.22715437180514036, + "learning_rate": 0.00043229787858024973, + "loss": 3.02471661567688, + "step": 9482, + "token_acc": 0.2932128228384664 + }, + { + "epoch": 5.5587804163002055, + "grad_norm": 0.2020690966866365, + "learning_rate": 0.0004322812967539698, + "loss": 3.0398335456848145, + "step": 9483, + "token_acc": 0.29275114619076703 + }, + { + "epoch": 5.559366754617415, + "grad_norm": 0.3352401803029688, + "learning_rate": 0.0004322647132154026, + "loss": 3.02543306350708, + "step": 9484, + "token_acc": 0.2953252669850706 + }, + { + "epoch": 5.559953092934624, + "grad_norm": 0.19594454581560863, + "learning_rate": 0.00043224812796470414, + "loss": 3.0144078731536865, + "step": 9485, + "token_acc": 0.29517604160439953 + }, + { + "epoch": 5.560539431251832, + "grad_norm": 0.23418751638448665, + "learning_rate": 0.00043223154100203, + "loss": 2.95459246635437, + "step": 9486, + "token_acc": 0.3076315339358776 + }, + { + "epoch": 5.561125769569041, + "grad_norm": 0.20373948204869285, + "learning_rate": 0.00043221495232753616, + "loss": 2.9931588172912598, + "step": 9487, + "token_acc": 0.29906892208599717 + }, + { + "epoch": 5.56171210788625, + "grad_norm": 0.24754027517796798, + "learning_rate": 0.0004321983619413784, + "loss": 3.020735263824463, + "step": 9488, + "token_acc": 0.295491859077549 + }, + { + "epoch": 5.562298446203459, + "grad_norm": 0.20078553734281449, + "learning_rate": 0.0004321817698437125, + "loss": 3.0241341590881348, + "step": 9489, + "token_acc": 0.2963542859418717 + }, + { + "epoch": 5.562884784520668, + "grad_norm": 0.20940765083540744, + "learning_rate": 0.0004321651760346944, + "loss": 2.9976284503936768, + "step": 9490, + "token_acc": 0.2978556836902801 + }, + { + "epoch": 5.5634711228378775, + "grad_norm": 0.17133403241427358, + "learning_rate": 0.0004321485805144799, + "loss": 2.993675708770752, + "step": 9491, + "token_acc": 0.2983808830625918 + }, + { + "epoch": 5.564057461155087, + "grad_norm": 0.20110559372543427, + "learning_rate": 0.000432131983283225, + "loss": 2.9991750717163086, + "step": 9492, + "token_acc": 0.29702934750274396 + }, + { + "epoch": 5.564643799472296, + "grad_norm": 0.18754257217321788, + "learning_rate": 0.0004321153843410855, + "loss": 3.0179224014282227, + "step": 9493, + "token_acc": 0.2964063873541159 + }, + { + "epoch": 5.565230137789505, + "grad_norm": 0.203897837846292, + "learning_rate": 0.0004320987836882173, + "loss": 2.994328022003174, + "step": 9494, + "token_acc": 0.2983112743381653 + }, + { + "epoch": 5.565816476106714, + "grad_norm": 0.16036465056483465, + "learning_rate": 0.0004320821813247765, + "loss": 3.0221657752990723, + "step": 9495, + "token_acc": 0.29479546516159166 + }, + { + "epoch": 5.566402814423922, + "grad_norm": 0.2012359393730973, + "learning_rate": 0.0004320655772509189, + "loss": 2.99320125579834, + "step": 9496, + "token_acc": 0.30027345757413576 + }, + { + "epoch": 5.566989152741131, + "grad_norm": 0.183975998874183, + "learning_rate": 0.00043204897146680065, + "loss": 3.008193016052246, + "step": 9497, + "token_acc": 0.2992542140207764 + }, + { + "epoch": 5.56757549105834, + "grad_norm": 0.1972059160150747, + "learning_rate": 0.0004320323639725775, + "loss": 3.022758722305298, + "step": 9498, + "token_acc": 0.2956468972565552 + }, + { + "epoch": 5.5681618293755495, + "grad_norm": 0.17278827872451347, + "learning_rate": 0.0004320157547684057, + "loss": 2.9845824241638184, + "step": 9499, + "token_acc": 0.3009983062156752 + }, + { + "epoch": 5.568748167692759, + "grad_norm": 0.21005859958466666, + "learning_rate": 0.0004319991438544411, + "loss": 3.0067405700683594, + "step": 9500, + "token_acc": 0.2977675209897229 + }, + { + "epoch": 5.569334506009968, + "grad_norm": 0.1743347513588534, + "learning_rate": 0.0004319825312308397, + "loss": 2.9743270874023438, + "step": 9501, + "token_acc": 0.3026966224810234 + }, + { + "epoch": 5.569920844327177, + "grad_norm": 0.16531455874855314, + "learning_rate": 0.0004319659168977578, + "loss": 2.9722275733947754, + "step": 9502, + "token_acc": 0.30333792652820335 + }, + { + "epoch": 5.570507182644386, + "grad_norm": 0.16879334573894708, + "learning_rate": 0.0004319493008553512, + "loss": 3.0240373611450195, + "step": 9503, + "token_acc": 0.29491075686700285 + }, + { + "epoch": 5.571093520961595, + "grad_norm": 0.1635008896473854, + "learning_rate": 0.0004319326831037762, + "loss": 3.0178942680358887, + "step": 9504, + "token_acc": 0.2967016507241287 + }, + { + "epoch": 5.571679859278804, + "grad_norm": 0.18688984143424864, + "learning_rate": 0.0004319160636431887, + "loss": 3.0153112411499023, + "step": 9505, + "token_acc": 0.2972402466417318 + }, + { + "epoch": 5.572266197596013, + "grad_norm": 0.182453126088087, + "learning_rate": 0.00043189944247374495, + "loss": 3.02286434173584, + "step": 9506, + "token_acc": 0.29398997723043013 + }, + { + "epoch": 5.572852535913222, + "grad_norm": 0.19833881459217165, + "learning_rate": 0.00043188281959560105, + "loss": 2.9843196868896484, + "step": 9507, + "token_acc": 0.29977302452454746 + }, + { + "epoch": 5.573438874230431, + "grad_norm": 0.17581081450183797, + "learning_rate": 0.00043186619500891314, + "loss": 2.997870683670044, + "step": 9508, + "token_acc": 0.29883769449394176 + }, + { + "epoch": 5.57402521254764, + "grad_norm": 0.2039416403457468, + "learning_rate": 0.00043184956871383746, + "loss": 3.0138344764709473, + "step": 9509, + "token_acc": 0.2966442987061073 + }, + { + "epoch": 5.574611550864849, + "grad_norm": 0.2072866115869426, + "learning_rate": 0.0004318329407105301, + "loss": 2.9914472103118896, + "step": 9510, + "token_acc": 0.3007728229921276 + }, + { + "epoch": 5.575197889182058, + "grad_norm": 0.15104159368363976, + "learning_rate": 0.0004318163109991472, + "loss": 3.0027198791503906, + "step": 9511, + "token_acc": 0.29881884795386016 + }, + { + "epoch": 5.575784227499267, + "grad_norm": 0.1806939022655288, + "learning_rate": 0.00043179967957984516, + "loss": 2.9996519088745117, + "step": 9512, + "token_acc": 0.2988821421674856 + }, + { + "epoch": 5.576370565816476, + "grad_norm": 0.16043262903584404, + "learning_rate": 0.00043178304645278003, + "loss": 2.9799346923828125, + "step": 9513, + "token_acc": 0.3019449116904962 + }, + { + "epoch": 5.576956904133685, + "grad_norm": 0.1737721818800752, + "learning_rate": 0.00043176641161810824, + "loss": 3.014482259750366, + "step": 9514, + "token_acc": 0.2968615113340873 + }, + { + "epoch": 5.577543242450894, + "grad_norm": 0.18926054639918807, + "learning_rate": 0.0004317497750759859, + "loss": 2.9841599464416504, + "step": 9515, + "token_acc": 0.30318275400748224 + }, + { + "epoch": 5.5781295807681035, + "grad_norm": 0.16322166011277983, + "learning_rate": 0.0004317331368265693, + "loss": 3.021703004837036, + "step": 9516, + "token_acc": 0.29446560026867596 + }, + { + "epoch": 5.578715919085313, + "grad_norm": 0.17428426544483375, + "learning_rate": 0.0004317164968700148, + "loss": 3.018117904663086, + "step": 9517, + "token_acc": 0.2965354941366815 + }, + { + "epoch": 5.579302257402521, + "grad_norm": 0.16268074246702305, + "learning_rate": 0.00043169985520647857, + "loss": 3.01544189453125, + "step": 9518, + "token_acc": 0.294755603733564 + }, + { + "epoch": 5.57988859571973, + "grad_norm": 0.17943860147494728, + "learning_rate": 0.0004316832118361172, + "loss": 3.0336971282958984, + "step": 9519, + "token_acc": 0.2941748835898242 + }, + { + "epoch": 5.580474934036939, + "grad_norm": 0.18477428241520144, + "learning_rate": 0.0004316665667590868, + "loss": 3.0227746963500977, + "step": 9520, + "token_acc": 0.29475153869704296 + }, + { + "epoch": 5.581061272354148, + "grad_norm": 0.2088075111321705, + "learning_rate": 0.0004316499199755438, + "loss": 2.984206199645996, + "step": 9521, + "token_acc": 0.3017601455791507 + }, + { + "epoch": 5.581647610671357, + "grad_norm": 0.21254027026259575, + "learning_rate": 0.00043163327148564457, + "loss": 3.0296449661254883, + "step": 9522, + "token_acc": 0.29570377416251203 + }, + { + "epoch": 5.582233948988566, + "grad_norm": 0.19722027035943757, + "learning_rate": 0.00043161662128954554, + "loss": 3.0179333686828613, + "step": 9523, + "token_acc": 0.2953351197804762 + }, + { + "epoch": 5.5828202873057755, + "grad_norm": 0.22325342548140517, + "learning_rate": 0.0004315999693874031, + "loss": 2.9934096336364746, + "step": 9524, + "token_acc": 0.2991760144536382 + }, + { + "epoch": 5.583406625622985, + "grad_norm": 0.19130693250430392, + "learning_rate": 0.0004315833157793736, + "loss": 3.0450313091278076, + "step": 9525, + "token_acc": 0.292958554193621 + }, + { + "epoch": 5.583992963940194, + "grad_norm": 0.23290702776374, + "learning_rate": 0.0004315666604656136, + "loss": 2.958040475845337, + "step": 9526, + "token_acc": 0.3034703592860798 + }, + { + "epoch": 5.584579302257403, + "grad_norm": 0.23654086113241085, + "learning_rate": 0.0004315500034462795, + "loss": 2.962702751159668, + "step": 9527, + "token_acc": 0.3022096528417662 + }, + { + "epoch": 5.585165640574612, + "grad_norm": 0.18405221620687423, + "learning_rate": 0.00043153334472152764, + "loss": 2.993170976638794, + "step": 9528, + "token_acc": 0.2983177491520436 + }, + { + "epoch": 5.585751978891821, + "grad_norm": 0.2212657501567586, + "learning_rate": 0.00043151668429151473, + "loss": 3.004142999649048, + "step": 9529, + "token_acc": 0.2971003345266435 + }, + { + "epoch": 5.586338317209029, + "grad_norm": 0.20256211746017752, + "learning_rate": 0.0004315000221563972, + "loss": 3.0117411613464355, + "step": 9530, + "token_acc": 0.2984054610557037 + }, + { + "epoch": 5.586924655526238, + "grad_norm": 0.18909968891366608, + "learning_rate": 0.00043148335831633144, + "loss": 2.9673969745635986, + "step": 9531, + "token_acc": 0.30217277433137507 + }, + { + "epoch": 5.5875109938434475, + "grad_norm": 0.19822619812522504, + "learning_rate": 0.0004314666927714741, + "loss": 2.9493350982666016, + "step": 9532, + "token_acc": 0.306335849681107 + }, + { + "epoch": 5.588097332160657, + "grad_norm": 0.1653030354919907, + "learning_rate": 0.0004314500255219817, + "loss": 3.0565433502197266, + "step": 9533, + "token_acc": 0.2892925306085353 + }, + { + "epoch": 5.588683670477866, + "grad_norm": 0.169206849004507, + "learning_rate": 0.00043143335656801085, + "loss": 2.97257661819458, + "step": 9534, + "token_acc": 0.3024045618031257 + }, + { + "epoch": 5.589270008795075, + "grad_norm": 0.18463700508008768, + "learning_rate": 0.0004314166859097181, + "loss": 3.0033555030822754, + "step": 9535, + "token_acc": 0.2975013070756361 + }, + { + "epoch": 5.589856347112284, + "grad_norm": 0.19389542722141503, + "learning_rate": 0.00043140001354726007, + "loss": 3.017164468765259, + "step": 9536, + "token_acc": 0.29687775073784495 + }, + { + "epoch": 5.590442685429493, + "grad_norm": 0.18970568826400527, + "learning_rate": 0.00043138333948079333, + "loss": 2.9916439056396484, + "step": 9537, + "token_acc": 0.2996392563130145 + }, + { + "epoch": 5.591029023746702, + "grad_norm": 0.20000789500609364, + "learning_rate": 0.0004313666637104745, + "loss": 3.020634889602661, + "step": 9538, + "token_acc": 0.2954950942647031 + }, + { + "epoch": 5.59161536206391, + "grad_norm": 0.1613265693315138, + "learning_rate": 0.00043134998623646026, + "loss": 3.0108795166015625, + "step": 9539, + "token_acc": 0.2994221399311984 + }, + { + "epoch": 5.5922017003811195, + "grad_norm": 0.18126871128087596, + "learning_rate": 0.00043133330705890727, + "loss": 2.9905154705047607, + "step": 9540, + "token_acc": 0.30001103851390065 + }, + { + "epoch": 5.592788038698329, + "grad_norm": 0.20689412250807387, + "learning_rate": 0.00043131662617797227, + "loss": 3.0328121185302734, + "step": 9541, + "token_acc": 0.2934050294977063 + }, + { + "epoch": 5.593374377015538, + "grad_norm": 0.16438513533665722, + "learning_rate": 0.00043129994359381187, + "loss": 3.0185208320617676, + "step": 9542, + "token_acc": 0.2951699735382681 + }, + { + "epoch": 5.593960715332747, + "grad_norm": 0.18867560935661773, + "learning_rate": 0.0004312832593065828, + "loss": 2.99249267578125, + "step": 9543, + "token_acc": 0.3017088950043104 + }, + { + "epoch": 5.594547053649956, + "grad_norm": 0.1863607718558092, + "learning_rate": 0.0004312665733164418, + "loss": 3.016849994659424, + "step": 9544, + "token_acc": 0.2948993393396419 + }, + { + "epoch": 5.595133391967165, + "grad_norm": 0.16932417741866526, + "learning_rate": 0.00043124988562354556, + "loss": 2.951427936553955, + "step": 9545, + "token_acc": 0.3044230687561048 + }, + { + "epoch": 5.595719730284374, + "grad_norm": 0.19064838047441154, + "learning_rate": 0.0004312331962280509, + "loss": 2.9883534908294678, + "step": 9546, + "token_acc": 0.29957602518282583 + }, + { + "epoch": 5.596306068601583, + "grad_norm": 0.194961639619161, + "learning_rate": 0.0004312165051301146, + "loss": 2.997401714324951, + "step": 9547, + "token_acc": 0.29868872890232423 + }, + { + "epoch": 5.596892406918792, + "grad_norm": 0.17465762650883984, + "learning_rate": 0.00043119981232989346, + "loss": 2.982698678970337, + "step": 9548, + "token_acc": 0.3013779460084126 + }, + { + "epoch": 5.5974787452360015, + "grad_norm": 0.1835960069757496, + "learning_rate": 0.0004311831178275442, + "loss": 3.0160086154937744, + "step": 9549, + "token_acc": 0.29681308854804495 + }, + { + "epoch": 5.598065083553211, + "grad_norm": 0.18046356377062456, + "learning_rate": 0.00043116642162322366, + "loss": 3.0057034492492676, + "step": 9550, + "token_acc": 0.2979149837679657 + }, + { + "epoch": 5.598651421870419, + "grad_norm": 0.18743938937309376, + "learning_rate": 0.00043114972371708884, + "loss": 2.997492790222168, + "step": 9551, + "token_acc": 0.3004006487717376 + }, + { + "epoch": 5.599237760187628, + "grad_norm": 0.18578818663756563, + "learning_rate": 0.0004311330241092963, + "loss": 3.0247702598571777, + "step": 9552, + "token_acc": 0.2943992919680048 + }, + { + "epoch": 5.599824098504837, + "grad_norm": 0.2146490150787229, + "learning_rate": 0.00043111632280000325, + "loss": 3.0167722702026367, + "step": 9553, + "token_acc": 0.2948207914433943 + }, + { + "epoch": 5.600410436822046, + "grad_norm": 0.20455195149780603, + "learning_rate": 0.0004310996197893663, + "loss": 2.997222661972046, + "step": 9554, + "token_acc": 0.30008608046929797 + }, + { + "epoch": 5.600996775139255, + "grad_norm": 0.17392518613616814, + "learning_rate": 0.00043108291507754244, + "loss": 3.0305111408233643, + "step": 9555, + "token_acc": 0.29682078090082736 + }, + { + "epoch": 5.601583113456464, + "grad_norm": 0.18573324796172533, + "learning_rate": 0.00043106620866468863, + "loss": 3.0287609100341797, + "step": 9556, + "token_acc": 0.29382080915880404 + }, + { + "epoch": 5.6021694517736735, + "grad_norm": 0.14933753101867098, + "learning_rate": 0.0004310495005509618, + "loss": 3.010798454284668, + "step": 9557, + "token_acc": 0.2962245961235133 + }, + { + "epoch": 5.602755790090883, + "grad_norm": 0.1774086041438887, + "learning_rate": 0.00043103279073651894, + "loss": 2.989170789718628, + "step": 9558, + "token_acc": 0.30033399291682944 + }, + { + "epoch": 5.603342128408092, + "grad_norm": 0.19124858814237647, + "learning_rate": 0.0004310160792215168, + "loss": 3.0522878170013428, + "step": 9559, + "token_acc": 0.2932727879799666 + }, + { + "epoch": 5.603928466725301, + "grad_norm": 0.17971728779723478, + "learning_rate": 0.0004309993660061126, + "loss": 2.9710168838500977, + "step": 9560, + "token_acc": 0.30304193919173217 + }, + { + "epoch": 5.604514805042509, + "grad_norm": 0.22346838349322226, + "learning_rate": 0.0004309826510904633, + "loss": 2.9577934741973877, + "step": 9561, + "token_acc": 0.3044612445827372 + }, + { + "epoch": 5.605101143359718, + "grad_norm": 0.30844898390450953, + "learning_rate": 0.00043096593447472574, + "loss": 3.0060253143310547, + "step": 9562, + "token_acc": 0.2979023328704518 + }, + { + "epoch": 5.605687481676927, + "grad_norm": 0.30398529653323336, + "learning_rate": 0.00043094921615905726, + "loss": 3.0262985229492188, + "step": 9563, + "token_acc": 0.2950060010650897 + }, + { + "epoch": 5.606273819994136, + "grad_norm": 0.18426877913750847, + "learning_rate": 0.0004309324961436146, + "loss": 3.0328433513641357, + "step": 9564, + "token_acc": 0.293977275948532 + }, + { + "epoch": 5.6068601583113455, + "grad_norm": 0.1906612566560988, + "learning_rate": 0.000430915774428555, + "loss": 3.029179573059082, + "step": 9565, + "token_acc": 0.29620073820801573 + }, + { + "epoch": 5.607446496628555, + "grad_norm": 0.1656020304966405, + "learning_rate": 0.0004308990510140355, + "loss": 3.026366949081421, + "step": 9566, + "token_acc": 0.2949843415756632 + }, + { + "epoch": 5.608032834945764, + "grad_norm": 0.2119433880332245, + "learning_rate": 0.00043088232590021307, + "loss": 3.0275793075561523, + "step": 9567, + "token_acc": 0.29360575993646515 + }, + { + "epoch": 5.608619173262973, + "grad_norm": 0.1981877850757136, + "learning_rate": 0.0004308655990872451, + "loss": 2.9862585067749023, + "step": 9568, + "token_acc": 0.3010810853393064 + }, + { + "epoch": 5.609205511580182, + "grad_norm": 0.16796205263474484, + "learning_rate": 0.0004308488705752884, + "loss": 3.0349173545837402, + "step": 9569, + "token_acc": 0.29396607257891344 + }, + { + "epoch": 5.609791849897391, + "grad_norm": 0.19379537811112035, + "learning_rate": 0.0004308321403645004, + "loss": 2.985604763031006, + "step": 9570, + "token_acc": 0.30119517341487384 + }, + { + "epoch": 5.6103781882146, + "grad_norm": 0.16759587131431963, + "learning_rate": 0.000430815408455038, + "loss": 2.9972641468048096, + "step": 9571, + "token_acc": 0.29898307583363826 + }, + { + "epoch": 5.610964526531809, + "grad_norm": 0.20723113338273044, + "learning_rate": 0.0004307986748470585, + "loss": 3.0022172927856445, + "step": 9572, + "token_acc": 0.29989593122965746 + }, + { + "epoch": 5.6115508648490176, + "grad_norm": 0.15891757816208982, + "learning_rate": 0.0004307819395407191, + "loss": 2.9610557556152344, + "step": 9573, + "token_acc": 0.3049252963749291 + }, + { + "epoch": 5.612137203166227, + "grad_norm": 0.20498814126873524, + "learning_rate": 0.0004307652025361769, + "loss": 3.014742374420166, + "step": 9574, + "token_acc": 0.29650920581353607 + }, + { + "epoch": 5.612723541483436, + "grad_norm": 0.16549235440141769, + "learning_rate": 0.0004307484638335893, + "loss": 3.035586357116699, + "step": 9575, + "token_acc": 0.2927976931646897 + }, + { + "epoch": 5.613309879800645, + "grad_norm": 0.2063805413379132, + "learning_rate": 0.0004307317234331135, + "loss": 2.9846510887145996, + "step": 9576, + "token_acc": 0.3018079505978217 + }, + { + "epoch": 5.613896218117854, + "grad_norm": 0.18337881699184097, + "learning_rate": 0.00043071498133490663, + "loss": 3.0001320838928223, + "step": 9577, + "token_acc": 0.29861320510528383 + }, + { + "epoch": 5.614482556435063, + "grad_norm": 0.179091746629479, + "learning_rate": 0.00043069823753912604, + "loss": 3.0667128562927246, + "step": 9578, + "token_acc": 0.2906930547187236 + }, + { + "epoch": 5.615068894752272, + "grad_norm": 0.17443620105808696, + "learning_rate": 0.00043068149204592894, + "loss": 2.9557714462280273, + "step": 9579, + "token_acc": 0.30458732948231976 + }, + { + "epoch": 5.615655233069481, + "grad_norm": 0.18183156962717031, + "learning_rate": 0.0004306647448554728, + "loss": 2.9554553031921387, + "step": 9580, + "token_acc": 0.30323421592702593 + }, + { + "epoch": 5.6162415713866904, + "grad_norm": 0.17971486281632484, + "learning_rate": 0.00043064799596791473, + "loss": 3.0134754180908203, + "step": 9581, + "token_acc": 0.2953587800398216 + }, + { + "epoch": 5.616827909703899, + "grad_norm": 0.1680766872328503, + "learning_rate": 0.0004306312453834122, + "loss": 3.0314207077026367, + "step": 9582, + "token_acc": 0.2931591171844893 + }, + { + "epoch": 5.617414248021108, + "grad_norm": 0.1791789724021555, + "learning_rate": 0.0004306144931021225, + "loss": 3.0352745056152344, + "step": 9583, + "token_acc": 0.29327257801377904 + }, + { + "epoch": 5.618000586338317, + "grad_norm": 0.16565237767824229, + "learning_rate": 0.0004305977391242031, + "loss": 2.9725794792175293, + "step": 9584, + "token_acc": 0.30460619709891235 + }, + { + "epoch": 5.618586924655526, + "grad_norm": 0.20220192773069864, + "learning_rate": 0.00043058098344981124, + "loss": 3.034191608428955, + "step": 9585, + "token_acc": 0.29381273439584066 + }, + { + "epoch": 5.619173262972735, + "grad_norm": 0.3133331869981549, + "learning_rate": 0.0004305642260791044, + "loss": 3.021151542663574, + "step": 9586, + "token_acc": 0.2951352677749831 + }, + { + "epoch": 5.619759601289944, + "grad_norm": 0.29715495114804735, + "learning_rate": 0.0004305474670122399, + "loss": 2.986931324005127, + "step": 9587, + "token_acc": 0.30382221083642885 + }, + { + "epoch": 5.620345939607153, + "grad_norm": 0.17416711064100368, + "learning_rate": 0.0004305307062493754, + "loss": 2.989133596420288, + "step": 9588, + "token_acc": 0.3011156075977321 + }, + { + "epoch": 5.6209322779243625, + "grad_norm": 0.2666930558579081, + "learning_rate": 0.000430513943790668, + "loss": 2.996760845184326, + "step": 9589, + "token_acc": 0.298364182550406 + }, + { + "epoch": 5.621518616241572, + "grad_norm": 0.17759517420041773, + "learning_rate": 0.0004304971796362754, + "loss": 2.977205514907837, + "step": 9590, + "token_acc": 0.2996640492694385 + }, + { + "epoch": 5.622104954558781, + "grad_norm": 0.2029505751181237, + "learning_rate": 0.00043048041378635507, + "loss": 3.0145092010498047, + "step": 9591, + "token_acc": 0.296279004496068 + }, + { + "epoch": 5.62269129287599, + "grad_norm": 0.2071340865748021, + "learning_rate": 0.00043046364624106437, + "loss": 3.01771879196167, + "step": 9592, + "token_acc": 0.2958692264263358 + }, + { + "epoch": 5.623277631193199, + "grad_norm": 0.18015656623955906, + "learning_rate": 0.000430446877000561, + "loss": 3.024195432662964, + "step": 9593, + "token_acc": 0.29632518480438214 + }, + { + "epoch": 5.623863969510407, + "grad_norm": 0.20289019911959003, + "learning_rate": 0.0004304301060650023, + "loss": 2.9754247665405273, + "step": 9594, + "token_acc": 0.30220750493256393 + }, + { + "epoch": 5.624450307827616, + "grad_norm": 0.1774965744770958, + "learning_rate": 0.0004304133334345459, + "loss": 2.9952664375305176, + "step": 9595, + "token_acc": 0.2995590168748132 + }, + { + "epoch": 5.625036646144825, + "grad_norm": 0.22040994260841465, + "learning_rate": 0.00043039655910934936, + "loss": 3.048928737640381, + "step": 9596, + "token_acc": 0.2933570798269591 + }, + { + "epoch": 5.6256229844620345, + "grad_norm": 0.1619246695524503, + "learning_rate": 0.0004303797830895703, + "loss": 3.059825897216797, + "step": 9597, + "token_acc": 0.2900392779910852 + }, + { + "epoch": 5.626209322779244, + "grad_norm": 0.18173730652951914, + "learning_rate": 0.0004303630053753661, + "loss": 2.9628822803497314, + "step": 9598, + "token_acc": 0.30443590526874287 + }, + { + "epoch": 5.626795661096453, + "grad_norm": 0.17676495272694956, + "learning_rate": 0.0004303462259668945, + "loss": 3.0075762271881104, + "step": 9599, + "token_acc": 0.29794336611578104 + }, + { + "epoch": 5.627381999413662, + "grad_norm": 0.21949285664824922, + "learning_rate": 0.0004303294448643133, + "loss": 3.0106120109558105, + "step": 9600, + "token_acc": 0.29631546615430865 + }, + { + "epoch": 5.627968337730871, + "grad_norm": 0.14889499934497757, + "learning_rate": 0.00043031266206777985, + "loss": 2.9857020378112793, + "step": 9601, + "token_acc": 0.3005728918015711 + }, + { + "epoch": 5.62855467604808, + "grad_norm": 0.1914237252271556, + "learning_rate": 0.0004302958775774519, + "loss": 3.0286338329315186, + "step": 9602, + "token_acc": 0.29449090091068464 + }, + { + "epoch": 5.629141014365289, + "grad_norm": 0.1514533817159106, + "learning_rate": 0.0004302790913934872, + "loss": 2.979556083679199, + "step": 9603, + "token_acc": 0.3026030864745488 + }, + { + "epoch": 5.629727352682497, + "grad_norm": 0.23294354768241712, + "learning_rate": 0.0004302623035160433, + "loss": 3.0235707759857178, + "step": 9604, + "token_acc": 0.29476490272814926 + }, + { + "epoch": 5.6303136909997065, + "grad_norm": 0.1660070302903849, + "learning_rate": 0.000430245513945278, + "loss": 3.0098559856414795, + "step": 9605, + "token_acc": 0.2977769563839899 + }, + { + "epoch": 5.630900029316916, + "grad_norm": 0.1904031362317103, + "learning_rate": 0.00043022872268134896, + "loss": 2.9822745323181152, + "step": 9606, + "token_acc": 0.2994259573449356 + }, + { + "epoch": 5.631486367634125, + "grad_norm": 0.18577592173023713, + "learning_rate": 0.00043021192972441394, + "loss": 3.000027656555176, + "step": 9607, + "token_acc": 0.2992741347725885 + }, + { + "epoch": 5.632072705951334, + "grad_norm": 0.16806913403330592, + "learning_rate": 0.0004301951350746307, + "loss": 2.967625379562378, + "step": 9608, + "token_acc": 0.30256362757933575 + }, + { + "epoch": 5.632659044268543, + "grad_norm": 0.20525597502535287, + "learning_rate": 0.00043017833873215693, + "loss": 3.0037293434143066, + "step": 9609, + "token_acc": 0.29628084426192425 + }, + { + "epoch": 5.633245382585752, + "grad_norm": 0.17102531424477008, + "learning_rate": 0.0004301615406971505, + "loss": 3.0159807205200195, + "step": 9610, + "token_acc": 0.29815839598465593 + }, + { + "epoch": 5.633831720902961, + "grad_norm": 0.1632568933900903, + "learning_rate": 0.0004301447409697692, + "loss": 2.9904332160949707, + "step": 9611, + "token_acc": 0.298852141454758 + }, + { + "epoch": 5.63441805922017, + "grad_norm": 0.16258987689427026, + "learning_rate": 0.0004301279395501707, + "loss": 3.0202741622924805, + "step": 9612, + "token_acc": 0.2958002543746282 + }, + { + "epoch": 5.635004397537379, + "grad_norm": 0.16705275273409906, + "learning_rate": 0.000430111136438513, + "loss": 3.025203227996826, + "step": 9613, + "token_acc": 0.2960612980830362 + }, + { + "epoch": 5.6355907358545885, + "grad_norm": 0.17253200213145323, + "learning_rate": 0.00043009433163495393, + "loss": 3.0537548065185547, + "step": 9614, + "token_acc": 0.29039515048614517 + }, + { + "epoch": 5.636177074171798, + "grad_norm": 0.15450757489085828, + "learning_rate": 0.0004300775251396513, + "loss": 3.0372791290283203, + "step": 9615, + "token_acc": 0.2938734758374541 + }, + { + "epoch": 5.636763412489006, + "grad_norm": 0.1564287493835027, + "learning_rate": 0.00043006071695276285, + "loss": 3.049679756164551, + "step": 9616, + "token_acc": 0.291018561028399 + }, + { + "epoch": 5.637349750806215, + "grad_norm": 0.164010178808354, + "learning_rate": 0.0004300439070744466, + "loss": 3.0052952766418457, + "step": 9617, + "token_acc": 0.2977842811597144 + }, + { + "epoch": 5.637936089123424, + "grad_norm": 0.18052484177361303, + "learning_rate": 0.0004300270955048605, + "loss": 3.0111279487609863, + "step": 9618, + "token_acc": 0.2971457198982215 + }, + { + "epoch": 5.638522427440633, + "grad_norm": 0.19196036412766956, + "learning_rate": 0.0004300102822441625, + "loss": 3.0343875885009766, + "step": 9619, + "token_acc": 0.29424578446723976 + }, + { + "epoch": 5.639108765757842, + "grad_norm": 0.17243112602710092, + "learning_rate": 0.00042999346729251045, + "loss": 2.997490882873535, + "step": 9620, + "token_acc": 0.2989858074102722 + }, + { + "epoch": 5.639695104075051, + "grad_norm": 0.20113933334743675, + "learning_rate": 0.00042997665065006225, + "loss": 3.047173261642456, + "step": 9621, + "token_acc": 0.29113161188632886 + }, + { + "epoch": 5.6402814423922605, + "grad_norm": 0.2585591125496731, + "learning_rate": 0.000429959832316976, + "loss": 2.9997150897979736, + "step": 9622, + "token_acc": 0.29988833379333396 + }, + { + "epoch": 5.64086778070947, + "grad_norm": 0.20017018742608264, + "learning_rate": 0.0004299430122934096, + "loss": 2.998478889465332, + "step": 9623, + "token_acc": 0.29868727653884447 + }, + { + "epoch": 5.641454119026679, + "grad_norm": 0.24058965700287915, + "learning_rate": 0.00042992619057952104, + "loss": 2.973613739013672, + "step": 9624, + "token_acc": 0.30261002600295167 + }, + { + "epoch": 5.642040457343887, + "grad_norm": 0.26664231546732725, + "learning_rate": 0.00042990936717546844, + "loss": 2.9879512786865234, + "step": 9625, + "token_acc": 0.298649619381812 + }, + { + "epoch": 5.642626795661096, + "grad_norm": 0.17993722326741365, + "learning_rate": 0.00042989254208140973, + "loss": 2.987138032913208, + "step": 9626, + "token_acc": 0.30090425984513874 + }, + { + "epoch": 5.643213133978305, + "grad_norm": 0.28639773475492575, + "learning_rate": 0.000429875715297503, + "loss": 3.0216989517211914, + "step": 9627, + "token_acc": 0.2943093002397818 + }, + { + "epoch": 5.643799472295514, + "grad_norm": 0.19931065816872046, + "learning_rate": 0.0004298588868239063, + "loss": 2.9978439807891846, + "step": 9628, + "token_acc": 0.30077226921214784 + }, + { + "epoch": 5.644385810612723, + "grad_norm": 0.21915779202010707, + "learning_rate": 0.0004298420566607777, + "loss": 3.0021843910217285, + "step": 9629, + "token_acc": 0.2999309392265193 + }, + { + "epoch": 5.6449721489299325, + "grad_norm": 0.20022634930008637, + "learning_rate": 0.00042982522480827535, + "loss": 2.9696850776672363, + "step": 9630, + "token_acc": 0.30302787435261086 + }, + { + "epoch": 5.645558487247142, + "grad_norm": 0.1861986787477895, + "learning_rate": 0.00042980839126655735, + "loss": 3.033020257949829, + "step": 9631, + "token_acc": 0.29264681515214686 + }, + { + "epoch": 5.646144825564351, + "grad_norm": 0.19885598517749872, + "learning_rate": 0.00042979155603578177, + "loss": 3.023491144180298, + "step": 9632, + "token_acc": 0.29548935074963 + }, + { + "epoch": 5.64673116388156, + "grad_norm": 0.2351570344041383, + "learning_rate": 0.0004297747191161068, + "loss": 2.9687154293060303, + "step": 9633, + "token_acc": 0.30320883004121807 + }, + { + "epoch": 5.647317502198769, + "grad_norm": 0.18211110565034438, + "learning_rate": 0.0004297578805076906, + "loss": 3.027240514755249, + "step": 9634, + "token_acc": 0.2960423766523404 + }, + { + "epoch": 5.647903840515978, + "grad_norm": 0.2573021120589888, + "learning_rate": 0.00042974104021069136, + "loss": 2.9699649810791016, + "step": 9635, + "token_acc": 0.3045506690138984 + }, + { + "epoch": 5.648490178833187, + "grad_norm": 0.18323474374008664, + "learning_rate": 0.0004297241982252672, + "loss": 3.007740020751953, + "step": 9636, + "token_acc": 0.2973945260227566 + }, + { + "epoch": 5.649076517150396, + "grad_norm": 0.2126417772933353, + "learning_rate": 0.00042970735455157645, + "loss": 2.9918341636657715, + "step": 9637, + "token_acc": 0.30088923887535746 + }, + { + "epoch": 5.6496628554676045, + "grad_norm": 0.1812814982429344, + "learning_rate": 0.0004296905091897772, + "loss": 3.040578842163086, + "step": 9638, + "token_acc": 0.29455503018533374 + }, + { + "epoch": 5.650249193784814, + "grad_norm": 0.18309232079384316, + "learning_rate": 0.0004296736621400278, + "loss": 2.981400966644287, + "step": 9639, + "token_acc": 0.3022540009608986 + }, + { + "epoch": 5.650835532102023, + "grad_norm": 0.16718615678249557, + "learning_rate": 0.0004296568134024864, + "loss": 2.9951562881469727, + "step": 9640, + "token_acc": 0.3002097317613834 + }, + { + "epoch": 5.651421870419232, + "grad_norm": 0.1853777646607536, + "learning_rate": 0.0004296399629773114, + "loss": 3.020698308944702, + "step": 9641, + "token_acc": 0.2968197221416231 + }, + { + "epoch": 5.652008208736441, + "grad_norm": 0.17698208098032897, + "learning_rate": 0.000429623110864661, + "loss": 2.990372657775879, + "step": 9642, + "token_acc": 0.3002808261264784 + }, + { + "epoch": 5.65259454705365, + "grad_norm": 0.231477921332244, + "learning_rate": 0.0004296062570646935, + "loss": 3.031933307647705, + "step": 9643, + "token_acc": 0.2922380698273227 + }, + { + "epoch": 5.653180885370859, + "grad_norm": 0.16872941623725754, + "learning_rate": 0.00042958940157756723, + "loss": 3.007328748703003, + "step": 9644, + "token_acc": 0.29785110457363956 + }, + { + "epoch": 5.653767223688068, + "grad_norm": 0.19171234055105638, + "learning_rate": 0.0004295725444034405, + "loss": 3.008355140686035, + "step": 9645, + "token_acc": 0.2990646385247595 + }, + { + "epoch": 5.654353562005277, + "grad_norm": 0.18484908502502898, + "learning_rate": 0.0004295556855424717, + "loss": 2.999434232711792, + "step": 9646, + "token_acc": 0.2990242580295433 + }, + { + "epoch": 5.654939900322486, + "grad_norm": 0.2385347917215755, + "learning_rate": 0.00042953882499481924, + "loss": 3.022439479827881, + "step": 9647, + "token_acc": 0.2952998456491868 + }, + { + "epoch": 5.655526238639695, + "grad_norm": 0.17282707392578864, + "learning_rate": 0.00042952196276064143, + "loss": 2.9714865684509277, + "step": 9648, + "token_acc": 0.3018283147818686 + }, + { + "epoch": 5.656112576956904, + "grad_norm": 0.21653907044315882, + "learning_rate": 0.0004295050988400967, + "loss": 2.9956345558166504, + "step": 9649, + "token_acc": 0.3011193373237407 + }, + { + "epoch": 5.656698915274113, + "grad_norm": 0.19938927816284252, + "learning_rate": 0.0004294882332333434, + "loss": 2.9746618270874023, + "step": 9650, + "token_acc": 0.30127286154915506 + }, + { + "epoch": 5.657285253591322, + "grad_norm": 0.1760122764114551, + "learning_rate": 0.0004294713659405401, + "loss": 2.992704391479492, + "step": 9651, + "token_acc": 0.2991779325413082 + }, + { + "epoch": 5.657871591908531, + "grad_norm": 0.20716780027055753, + "learning_rate": 0.0004294544969618451, + "loss": 3.0144577026367188, + "step": 9652, + "token_acc": 0.29784884081009105 + }, + { + "epoch": 5.65845793022574, + "grad_norm": 0.1606883926285121, + "learning_rate": 0.0004294376262974169, + "loss": 2.9628968238830566, + "step": 9653, + "token_acc": 0.30408607420943695 + }, + { + "epoch": 5.659044268542949, + "grad_norm": 0.20776777506456132, + "learning_rate": 0.00042942075394741407, + "loss": 3.0030856132507324, + "step": 9654, + "token_acc": 0.2964939436630601 + }, + { + "epoch": 5.6596306068601585, + "grad_norm": 0.1833194884507952, + "learning_rate": 0.0004294038799119949, + "loss": 2.9778831005096436, + "step": 9655, + "token_acc": 0.30300539827584916 + }, + { + "epoch": 5.660216945177368, + "grad_norm": 0.2307086055516595, + "learning_rate": 0.0004293870041913182, + "loss": 3.0047645568847656, + "step": 9656, + "token_acc": 0.2978107081688882 + }, + { + "epoch": 5.660803283494577, + "grad_norm": 0.1629645673453858, + "learning_rate": 0.00042937012678554223, + "loss": 3.021730899810791, + "step": 9657, + "token_acc": 0.29500829936546624 + }, + { + "epoch": 5.661389621811786, + "grad_norm": 0.19321640486443206, + "learning_rate": 0.00042935324769482564, + "loss": 3.0084288120269775, + "step": 9658, + "token_acc": 0.29875639613964633 + }, + { + "epoch": 5.661975960128994, + "grad_norm": 0.16792054232838532, + "learning_rate": 0.000429336366919327, + "loss": 3.0241122245788574, + "step": 9659, + "token_acc": 0.29619280195364106 + }, + { + "epoch": 5.662562298446203, + "grad_norm": 0.15951579831358248, + "learning_rate": 0.00042931948445920474, + "loss": 2.98006010055542, + "step": 9660, + "token_acc": 0.30314749366245397 + }, + { + "epoch": 5.663148636763412, + "grad_norm": 0.1820638270397434, + "learning_rate": 0.0004293026003146176, + "loss": 3.0115294456481934, + "step": 9661, + "token_acc": 0.29739966140358953 + }, + { + "epoch": 5.663734975080621, + "grad_norm": 0.15257340074744735, + "learning_rate": 0.00042928571448572417, + "loss": 2.9487955570220947, + "step": 9662, + "token_acc": 0.30703578762485967 + }, + { + "epoch": 5.6643213133978305, + "grad_norm": 0.1792191473418344, + "learning_rate": 0.0004292688269726831, + "loss": 3.0508499145507812, + "step": 9663, + "token_acc": 0.29224626207453536 + }, + { + "epoch": 5.66490765171504, + "grad_norm": 0.20980940221123134, + "learning_rate": 0.00042925193777565294, + "loss": 3.053311824798584, + "step": 9664, + "token_acc": 0.2919631929649608 + }, + { + "epoch": 5.665493990032249, + "grad_norm": 0.17087286114275047, + "learning_rate": 0.00042923504689479234, + "loss": 3.009725570678711, + "step": 9665, + "token_acc": 0.2960150387541081 + }, + { + "epoch": 5.666080328349458, + "grad_norm": 0.1698506675528494, + "learning_rate": 0.00042921815433026, + "loss": 3.012098789215088, + "step": 9666, + "token_acc": 0.2967871060095963 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.1903519607869863, + "learning_rate": 0.00042920126008221454, + "loss": 2.9759397506713867, + "step": 9667, + "token_acc": 0.29977653718687175 + }, + { + "epoch": 5.667253004983876, + "grad_norm": 0.16613858970975162, + "learning_rate": 0.00042918436415081474, + "loss": 2.9691386222839355, + "step": 9668, + "token_acc": 0.3035372541140895 + }, + { + "epoch": 5.667839343301084, + "grad_norm": 0.16956420996080152, + "learning_rate": 0.0004291674665362194, + "loss": 3.00679087638855, + "step": 9669, + "token_acc": 0.2971437048225267 + }, + { + "epoch": 5.668425681618293, + "grad_norm": 0.16343544576976046, + "learning_rate": 0.00042915056723858704, + "loss": 2.9896368980407715, + "step": 9670, + "token_acc": 0.30152768853512374 + }, + { + "epoch": 5.6690120199355025, + "grad_norm": 0.1620120027899225, + "learning_rate": 0.0004291336662580766, + "loss": 3.0608201026916504, + "step": 9671, + "token_acc": 0.29073280012975294 + }, + { + "epoch": 5.669598358252712, + "grad_norm": 0.15773349503321887, + "learning_rate": 0.00042911676359484664, + "loss": 3.003131151199341, + "step": 9672, + "token_acc": 0.2984501072080827 + }, + { + "epoch": 5.670184696569921, + "grad_norm": 0.1681830930163674, + "learning_rate": 0.0004290998592490561, + "loss": 3.0266337394714355, + "step": 9673, + "token_acc": 0.2956958028391378 + }, + { + "epoch": 5.67077103488713, + "grad_norm": 0.17026784322134483, + "learning_rate": 0.0004290829532208637, + "loss": 2.9676289558410645, + "step": 9674, + "token_acc": 0.30237150512612176 + }, + { + "epoch": 5.671357373204339, + "grad_norm": 0.14881734179531328, + "learning_rate": 0.00042906604551042835, + "loss": 3.0124878883361816, + "step": 9675, + "token_acc": 0.29810283511329955 + }, + { + "epoch": 5.671943711521548, + "grad_norm": 0.15746213050040386, + "learning_rate": 0.00042904913611790875, + "loss": 3.021151542663574, + "step": 9676, + "token_acc": 0.2953353756937941 + }, + { + "epoch": 5.672530049838757, + "grad_norm": 0.1508119232093372, + "learning_rate": 0.0004290322250434637, + "loss": 2.995553970336914, + "step": 9677, + "token_acc": 0.29879655822092444 + }, + { + "epoch": 5.673116388155966, + "grad_norm": 0.17271149428032068, + "learning_rate": 0.00042901531228725225, + "loss": 2.9819164276123047, + "step": 9678, + "token_acc": 0.30188738260876363 + }, + { + "epoch": 5.673702726473175, + "grad_norm": 0.22317560950998053, + "learning_rate": 0.0004289983978494332, + "loss": 2.9949841499328613, + "step": 9679, + "token_acc": 0.2983412883874897 + }, + { + "epoch": 5.6742890647903845, + "grad_norm": 0.37987441045838943, + "learning_rate": 0.0004289814817301653, + "loss": 3.043926954269409, + "step": 9680, + "token_acc": 0.2929892484369686 + }, + { + "epoch": 5.674875403107593, + "grad_norm": 0.46172890222334556, + "learning_rate": 0.00042896456392960765, + "loss": 3.0338521003723145, + "step": 9681, + "token_acc": 0.2925842754968263 + }, + { + "epoch": 5.675461741424802, + "grad_norm": 0.1653286598332045, + "learning_rate": 0.000428947644447919, + "loss": 2.9987215995788574, + "step": 9682, + "token_acc": 0.2984008878351452 + }, + { + "epoch": 5.676048079742011, + "grad_norm": 0.29159400659256407, + "learning_rate": 0.0004289307232852584, + "loss": 3.0331783294677734, + "step": 9683, + "token_acc": 0.29358516339182517 + }, + { + "epoch": 5.67663441805922, + "grad_norm": 0.18898488147756087, + "learning_rate": 0.00042891380044178474, + "loss": 3.05173397064209, + "step": 9684, + "token_acc": 0.29150446530799173 + }, + { + "epoch": 5.677220756376429, + "grad_norm": 0.22029151321848311, + "learning_rate": 0.0004288968759176571, + "loss": 3.046402931213379, + "step": 9685, + "token_acc": 0.2926949104465516 + }, + { + "epoch": 5.677807094693638, + "grad_norm": 0.16576300669822813, + "learning_rate": 0.00042887994971303435, + "loss": 3.056812047958374, + "step": 9686, + "token_acc": 0.2901332729541733 + }, + { + "epoch": 5.678393433010847, + "grad_norm": 0.22457991502511784, + "learning_rate": 0.00042886302182807546, + "loss": 3.003857374191284, + "step": 9687, + "token_acc": 0.2986999657885734 + }, + { + "epoch": 5.6789797713280565, + "grad_norm": 0.15561880908922285, + "learning_rate": 0.0004288460922629395, + "loss": 3.0109992027282715, + "step": 9688, + "token_acc": 0.29794032628657546 + }, + { + "epoch": 5.679566109645266, + "grad_norm": 0.18152768908813405, + "learning_rate": 0.00042882916101778557, + "loss": 2.9868319034576416, + "step": 9689, + "token_acc": 0.301632402480874 + }, + { + "epoch": 5.680152447962474, + "grad_norm": 0.17653165035954657, + "learning_rate": 0.0004288122280927726, + "loss": 2.988758087158203, + "step": 9690, + "token_acc": 0.3012203805633085 + }, + { + "epoch": 5.680738786279683, + "grad_norm": 0.15868739937927923, + "learning_rate": 0.0004287952934880597, + "loss": 2.97629451751709, + "step": 9691, + "token_acc": 0.3014322694691345 + }, + { + "epoch": 5.681325124596892, + "grad_norm": 0.17053649195480833, + "learning_rate": 0.00042877835720380597, + "loss": 3.0265848636627197, + "step": 9692, + "token_acc": 0.2963419236575372 + }, + { + "epoch": 5.681911462914101, + "grad_norm": 0.155066110951518, + "learning_rate": 0.0004287614192401704, + "loss": 3.077479600906372, + "step": 9693, + "token_acc": 0.2875623630031064 + }, + { + "epoch": 5.68249780123131, + "grad_norm": 0.16396740526187453, + "learning_rate": 0.0004287444795973123, + "loss": 3.0188803672790527, + "step": 9694, + "token_acc": 0.295553803307501 + }, + { + "epoch": 5.683084139548519, + "grad_norm": 0.1483217950896347, + "learning_rate": 0.00042872753827539057, + "loss": 3.0223069190979004, + "step": 9695, + "token_acc": 0.29533774772211796 + }, + { + "epoch": 5.6836704778657285, + "grad_norm": 0.19569883138269079, + "learning_rate": 0.00042871059527456457, + "loss": 3.030677318572998, + "step": 9696, + "token_acc": 0.29351466558408607 + }, + { + "epoch": 5.684256816182938, + "grad_norm": 0.16568520147825302, + "learning_rate": 0.00042869365059499325, + "loss": 2.9931516647338867, + "step": 9697, + "token_acc": 0.30169962125807015 + }, + { + "epoch": 5.684843154500147, + "grad_norm": 0.18911416548021692, + "learning_rate": 0.0004286767042368359, + "loss": 3.007972478866577, + "step": 9698, + "token_acc": 0.2972762811704428 + }, + { + "epoch": 5.685429492817356, + "grad_norm": 0.18121591454233538, + "learning_rate": 0.0004286597562002517, + "loss": 3.0184693336486816, + "step": 9699, + "token_acc": 0.29508342055039094 + }, + { + "epoch": 5.686015831134565, + "grad_norm": 0.1841168351292108, + "learning_rate": 0.0004286428064853998, + "loss": 3.033012866973877, + "step": 9700, + "token_acc": 0.2944825546399884 + }, + { + "epoch": 5.686602169451774, + "grad_norm": 0.17421679504180962, + "learning_rate": 0.00042862585509243953, + "loss": 2.986377239227295, + "step": 9701, + "token_acc": 0.30175944733376486 + }, + { + "epoch": 5.687188507768982, + "grad_norm": 0.23461696561100684, + "learning_rate": 0.00042860890202153003, + "loss": 2.979397773742676, + "step": 9702, + "token_acc": 0.3020695397455961 + }, + { + "epoch": 5.687774846086191, + "grad_norm": 0.23151070460380196, + "learning_rate": 0.0004285919472728305, + "loss": 3.036905527114868, + "step": 9703, + "token_acc": 0.2922642212134788 + }, + { + "epoch": 5.6883611844034006, + "grad_norm": 0.20276624522389372, + "learning_rate": 0.0004285749908465003, + "loss": 3.0349578857421875, + "step": 9704, + "token_acc": 0.2943399281327038 + }, + { + "epoch": 5.68894752272061, + "grad_norm": 0.2052356051437249, + "learning_rate": 0.0004285580327426988, + "loss": 3.0245180130004883, + "step": 9705, + "token_acc": 0.2948254090625125 + }, + { + "epoch": 5.689533861037819, + "grad_norm": 0.21843578573941294, + "learning_rate": 0.00042854107296158507, + "loss": 3.016998767852783, + "step": 9706, + "token_acc": 0.29631629833162537 + }, + { + "epoch": 5.690120199355028, + "grad_norm": 0.17463113248808024, + "learning_rate": 0.00042852411150331863, + "loss": 3.000985622406006, + "step": 9707, + "token_acc": 0.2986096678443723 + }, + { + "epoch": 5.690706537672237, + "grad_norm": 0.18187663569681634, + "learning_rate": 0.00042850714836805873, + "loss": 3.0590057373046875, + "step": 9708, + "token_acc": 0.2917847534553221 + }, + { + "epoch": 5.691292875989446, + "grad_norm": 0.20459450455746692, + "learning_rate": 0.0004284901835559646, + "loss": 2.9940104484558105, + "step": 9709, + "token_acc": 0.29937229161356105 + }, + { + "epoch": 5.691879214306655, + "grad_norm": 0.19848694857738772, + "learning_rate": 0.0004284732170671958, + "loss": 3.0109429359436035, + "step": 9710, + "token_acc": 0.2955458356988835 + }, + { + "epoch": 5.692465552623864, + "grad_norm": 0.1875707159858398, + "learning_rate": 0.0004284562489019116, + "loss": 3.0396504402160645, + "step": 9711, + "token_acc": 0.29188674818155746 + }, + { + "epoch": 5.693051890941073, + "grad_norm": 0.18397003362578113, + "learning_rate": 0.0004284392790602715, + "loss": 3.006220817565918, + "step": 9712, + "token_acc": 0.29829215320131114 + }, + { + "epoch": 5.693638229258282, + "grad_norm": 0.1960355377965541, + "learning_rate": 0.00042842230754243474, + "loss": 3.0132362842559814, + "step": 9713, + "token_acc": 0.29544653066458637 + }, + { + "epoch": 5.694224567575491, + "grad_norm": 0.21111888271164667, + "learning_rate": 0.00042840533434856075, + "loss": 3.0100584030151367, + "step": 9714, + "token_acc": 0.298651075700256 + }, + { + "epoch": 5.6948109058927, + "grad_norm": 0.24987361450120996, + "learning_rate": 0.00042838835947880917, + "loss": 2.991548538208008, + "step": 9715, + "token_acc": 0.30014962885653734 + }, + { + "epoch": 5.695397244209909, + "grad_norm": 0.17244989323190352, + "learning_rate": 0.0004283713829333393, + "loss": 3.0689945220947266, + "step": 9716, + "token_acc": 0.28985194559849287 + }, + { + "epoch": 5.695983582527118, + "grad_norm": 0.1796449712262519, + "learning_rate": 0.00042835440471231063, + "loss": 2.986182451248169, + "step": 9717, + "token_acc": 0.30105001274849913 + }, + { + "epoch": 5.696569920844327, + "grad_norm": 0.17306794291337274, + "learning_rate": 0.00042833742481588266, + "loss": 3.0349810123443604, + "step": 9718, + "token_acc": 0.2946296373532417 + }, + { + "epoch": 5.697156259161536, + "grad_norm": 0.16789664725774142, + "learning_rate": 0.0004283204432442149, + "loss": 3.0230164527893066, + "step": 9719, + "token_acc": 0.2941915291233863 + }, + { + "epoch": 5.6977425974787455, + "grad_norm": 0.1990958888628293, + "learning_rate": 0.00042830345999746687, + "loss": 2.998344898223877, + "step": 9720, + "token_acc": 0.2996156311790466 + }, + { + "epoch": 5.698328935795955, + "grad_norm": 0.19853982693005615, + "learning_rate": 0.00042828647507579814, + "loss": 2.9882311820983887, + "step": 9721, + "token_acc": 0.30040991089929 + }, + { + "epoch": 5.698915274113164, + "grad_norm": 0.18270764614196836, + "learning_rate": 0.0004282694884793682, + "loss": 3.003427028656006, + "step": 9722, + "token_acc": 0.2993785247551643 + }, + { + "epoch": 5.699501612430373, + "grad_norm": 0.22010723512213365, + "learning_rate": 0.0004282525002083366, + "loss": 2.9778850078582764, + "step": 9723, + "token_acc": 0.30137416312160037 + }, + { + "epoch": 5.700087950747581, + "grad_norm": 0.2723642953420089, + "learning_rate": 0.000428235510262863, + "loss": 2.9844465255737305, + "step": 9724, + "token_acc": 0.3017111962487655 + }, + { + "epoch": 5.70067428906479, + "grad_norm": 0.18108927568815072, + "learning_rate": 0.00042821851864310694, + "loss": 3.0325722694396973, + "step": 9725, + "token_acc": 0.29424398400961516 + }, + { + "epoch": 5.701260627381999, + "grad_norm": 0.22271844642587205, + "learning_rate": 0.00042820152534922806, + "loss": 3.0136730670928955, + "step": 9726, + "token_acc": 0.2982234127797068 + }, + { + "epoch": 5.701846965699208, + "grad_norm": 0.24488658869004318, + "learning_rate": 0.00042818453038138596, + "loss": 3.031216621398926, + "step": 9727, + "token_acc": 0.29369836920538234 + }, + { + "epoch": 5.7024333040164175, + "grad_norm": 0.1731579051312719, + "learning_rate": 0.0004281675337397403, + "loss": 2.9910900592803955, + "step": 9728, + "token_acc": 0.2986923084960764 + }, + { + "epoch": 5.703019642333627, + "grad_norm": 0.21383594010860807, + "learning_rate": 0.00042815053542445084, + "loss": 2.9857466220855713, + "step": 9729, + "token_acc": 0.30061557473894307 + }, + { + "epoch": 5.703605980650836, + "grad_norm": 0.18534351668934268, + "learning_rate": 0.000428133535435677, + "loss": 3.0403828620910645, + "step": 9730, + "token_acc": 0.29395040112891 + }, + { + "epoch": 5.704192318968045, + "grad_norm": 0.194058046589898, + "learning_rate": 0.00042811653377357875, + "loss": 3.026549816131592, + "step": 9731, + "token_acc": 0.29632029260009146 + }, + { + "epoch": 5.704778657285254, + "grad_norm": 0.19533187733946833, + "learning_rate": 0.0004280995304383156, + "loss": 3.010878086090088, + "step": 9732, + "token_acc": 0.29721484230999023 + }, + { + "epoch": 5.705364995602462, + "grad_norm": 0.15889017477815057, + "learning_rate": 0.00042808252543004744, + "loss": 2.9961342811584473, + "step": 9733, + "token_acc": 0.29910503748434764 + }, + { + "epoch": 5.705951333919671, + "grad_norm": 0.18710498167806622, + "learning_rate": 0.0004280655187489338, + "loss": 2.9783310890197754, + "step": 9734, + "token_acc": 0.3014467209725438 + }, + { + "epoch": 5.70653767223688, + "grad_norm": 0.1570021766033346, + "learning_rate": 0.00042804851039513473, + "loss": 2.96163010597229, + "step": 9735, + "token_acc": 0.30590998232573907 + }, + { + "epoch": 5.7071240105540895, + "grad_norm": 0.20027907481769028, + "learning_rate": 0.00042803150036880964, + "loss": 2.9812936782836914, + "step": 9736, + "token_acc": 0.3018255844496979 + }, + { + "epoch": 5.707710348871299, + "grad_norm": 0.16080911012704258, + "learning_rate": 0.0004280144886701186, + "loss": 3.0218868255615234, + "step": 9737, + "token_acc": 0.2958968552191588 + }, + { + "epoch": 5.708296687188508, + "grad_norm": 0.1975564427783344, + "learning_rate": 0.00042799747529922137, + "loss": 2.997437000274658, + "step": 9738, + "token_acc": 0.2993607349684892 + }, + { + "epoch": 5.708883025505717, + "grad_norm": 0.1704753932499822, + "learning_rate": 0.0004279804602562777, + "loss": 2.983586072921753, + "step": 9739, + "token_acc": 0.300933417914892 + }, + { + "epoch": 5.709469363822926, + "grad_norm": 0.16751104048917995, + "learning_rate": 0.00042796344354144734, + "loss": 3.0001845359802246, + "step": 9740, + "token_acc": 0.2971088991076718 + }, + { + "epoch": 5.710055702140135, + "grad_norm": 0.238968363089349, + "learning_rate": 0.0004279464251548903, + "loss": 3.0158004760742188, + "step": 9741, + "token_acc": 0.295391095782308 + }, + { + "epoch": 5.710642040457344, + "grad_norm": 0.17578908454802472, + "learning_rate": 0.00042792940509676637, + "loss": 3.036902904510498, + "step": 9742, + "token_acc": 0.29293363578814097 + }, + { + "epoch": 5.711228378774553, + "grad_norm": 0.17691966355134336, + "learning_rate": 0.0004279123833672355, + "loss": 3.0241262912750244, + "step": 9743, + "token_acc": 0.2938525764118015 + }, + { + "epoch": 5.711814717091762, + "grad_norm": 0.19849294832039296, + "learning_rate": 0.00042789535996645743, + "loss": 2.9944612979888916, + "step": 9744, + "token_acc": 0.30042127084704484 + }, + { + "epoch": 5.7124010554089715, + "grad_norm": 0.1810045138690793, + "learning_rate": 0.0004278783348945922, + "loss": 3.0278000831604004, + "step": 9745, + "token_acc": 0.29502459833438727 + }, + { + "epoch": 5.71298739372618, + "grad_norm": 0.18246860490966896, + "learning_rate": 0.0004278613081517997, + "loss": 3.004056215286255, + "step": 9746, + "token_acc": 0.2987751416226092 + }, + { + "epoch": 5.713573732043389, + "grad_norm": 0.1790329437786488, + "learning_rate": 0.00042784427973823993, + "loss": 3.0240578651428223, + "step": 9747, + "token_acc": 0.29692370239869903 + }, + { + "epoch": 5.714160070360598, + "grad_norm": 0.20027441640000265, + "learning_rate": 0.0004278272496540728, + "loss": 2.9672203063964844, + "step": 9748, + "token_acc": 0.30474545269789655 + }, + { + "epoch": 5.714746408677807, + "grad_norm": 0.22349385972403324, + "learning_rate": 0.00042781021789945823, + "loss": 2.992806911468506, + "step": 9749, + "token_acc": 0.2995803835757498 + }, + { + "epoch": 5.715332746995016, + "grad_norm": 0.17219798287010957, + "learning_rate": 0.0004277931844745563, + "loss": 3.0111806392669678, + "step": 9750, + "token_acc": 0.2964327686457248 + }, + { + "epoch": 5.715919085312225, + "grad_norm": 0.21507034256874447, + "learning_rate": 0.00042777614937952697, + "loss": 3.0356366634368896, + "step": 9751, + "token_acc": 0.29383720444481576 + }, + { + "epoch": 5.716505423629434, + "grad_norm": 0.16750405663613704, + "learning_rate": 0.00042775911261453027, + "loss": 2.9886884689331055, + "step": 9752, + "token_acc": 0.298380589983707 + }, + { + "epoch": 5.7170917619466435, + "grad_norm": 0.21003041646855272, + "learning_rate": 0.0004277420741797262, + "loss": 3.0114574432373047, + "step": 9753, + "token_acc": 0.2975876465602114 + }, + { + "epoch": 5.717678100263853, + "grad_norm": 0.20326612970657906, + "learning_rate": 0.000427725034075275, + "loss": 2.983753204345703, + "step": 9754, + "token_acc": 0.30083016302571547 + }, + { + "epoch": 5.718264438581061, + "grad_norm": 0.16083676880496495, + "learning_rate": 0.00042770799230133647, + "loss": 2.9960367679595947, + "step": 9755, + "token_acc": 0.2997880384658947 + }, + { + "epoch": 5.71885077689827, + "grad_norm": 0.2311905509721948, + "learning_rate": 0.00042769094885807083, + "loss": 3.0426080226898193, + "step": 9756, + "token_acc": 0.2931654092203111 + }, + { + "epoch": 5.719437115215479, + "grad_norm": 0.18502198678827375, + "learning_rate": 0.0004276739037456382, + "loss": 3.0202221870422363, + "step": 9757, + "token_acc": 0.29512087069387627 + }, + { + "epoch": 5.720023453532688, + "grad_norm": 0.17968974538929308, + "learning_rate": 0.0004276568569641986, + "loss": 2.999652862548828, + "step": 9758, + "token_acc": 0.2976344528627108 + }, + { + "epoch": 5.720609791849897, + "grad_norm": 0.15756421521310826, + "learning_rate": 0.0004276398085139123, + "loss": 3.016390800476074, + "step": 9759, + "token_acc": 0.29705845061493596 + }, + { + "epoch": 5.721196130167106, + "grad_norm": 0.1857479456575506, + "learning_rate": 0.0004276227583949393, + "loss": 2.991766929626465, + "step": 9760, + "token_acc": 0.2992602430440268 + }, + { + "epoch": 5.7217824684843155, + "grad_norm": 0.18131721528561273, + "learning_rate": 0.00042760570660743994, + "loss": 3.006643295288086, + "step": 9761, + "token_acc": 0.29766950598111913 + }, + { + "epoch": 5.722368806801525, + "grad_norm": 0.15176663597237508, + "learning_rate": 0.0004275886531515741, + "loss": 3.047158718109131, + "step": 9762, + "token_acc": 0.2920587246091153 + }, + { + "epoch": 5.722955145118734, + "grad_norm": 0.16835670084605237, + "learning_rate": 0.0004275715980275023, + "loss": 3.009458541870117, + "step": 9763, + "token_acc": 0.29540245483880695 + }, + { + "epoch": 5.723541483435943, + "grad_norm": 0.2033296025349335, + "learning_rate": 0.0004275545412353846, + "loss": 3.0113673210144043, + "step": 9764, + "token_acc": 0.2974983417037809 + }, + { + "epoch": 5.724127821753152, + "grad_norm": 0.16164466559274118, + "learning_rate": 0.0004275374827753813, + "loss": 3.0099995136260986, + "step": 9765, + "token_acc": 0.29614441520953005 + }, + { + "epoch": 5.724714160070361, + "grad_norm": 0.17865797682836773, + "learning_rate": 0.0004275204226476526, + "loss": 3.024019241333008, + "step": 9766, + "token_acc": 0.29525821645165995 + }, + { + "epoch": 5.725300498387569, + "grad_norm": 0.2593570335125841, + "learning_rate": 0.00042750336085235866, + "loss": 3.0303597450256348, + "step": 9767, + "token_acc": 0.2949683037580621 + }, + { + "epoch": 5.725886836704778, + "grad_norm": 0.2111550684505764, + "learning_rate": 0.0004274862973896598, + "loss": 2.978832721710205, + "step": 9768, + "token_acc": 0.3028105702991447 + }, + { + "epoch": 5.7264731750219875, + "grad_norm": 0.16434690790023865, + "learning_rate": 0.0004274692322597164, + "loss": 3.0385990142822266, + "step": 9769, + "token_acc": 0.2936921899993045 + }, + { + "epoch": 5.727059513339197, + "grad_norm": 0.21630121804126842, + "learning_rate": 0.00042745216546268873, + "loss": 2.9944584369659424, + "step": 9770, + "token_acc": 0.3004844660219101 + }, + { + "epoch": 5.727645851656406, + "grad_norm": 0.1771829562045815, + "learning_rate": 0.00042743509699873705, + "loss": 2.9621706008911133, + "step": 9771, + "token_acc": 0.3042818741943178 + }, + { + "epoch": 5.728232189973615, + "grad_norm": 0.16403831400169688, + "learning_rate": 0.0004274180268680217, + "loss": 3.0051217079162598, + "step": 9772, + "token_acc": 0.2986670889751102 + }, + { + "epoch": 5.728818528290824, + "grad_norm": 0.18986715685086744, + "learning_rate": 0.0004274009550707031, + "loss": 3.0222558975219727, + "step": 9773, + "token_acc": 0.2961210328147539 + }, + { + "epoch": 5.729404866608033, + "grad_norm": 0.1484794289155972, + "learning_rate": 0.00042738388160694157, + "loss": 2.989415407180786, + "step": 9774, + "token_acc": 0.29924619578432754 + }, + { + "epoch": 5.729991204925242, + "grad_norm": 0.23548169241021238, + "learning_rate": 0.0004273668064768975, + "loss": 3.024458885192871, + "step": 9775, + "token_acc": 0.29376841891128674 + }, + { + "epoch": 5.730577543242451, + "grad_norm": 0.16487381177316116, + "learning_rate": 0.00042734972968073125, + "loss": 3.020822525024414, + "step": 9776, + "token_acc": 0.29728829561249376 + }, + { + "epoch": 5.7311638815596595, + "grad_norm": 0.16802247038031837, + "learning_rate": 0.00042733265121860334, + "loss": 3.0019431114196777, + "step": 9777, + "token_acc": 0.2985817297913022 + }, + { + "epoch": 5.731750219876869, + "grad_norm": 0.16497398452427606, + "learning_rate": 0.0004273155710906741, + "loss": 3.002401113510132, + "step": 9778, + "token_acc": 0.2979241874989009 + }, + { + "epoch": 5.732336558194078, + "grad_norm": 0.18918092124654226, + "learning_rate": 0.00042729848929710403, + "loss": 3.0445446968078613, + "step": 9779, + "token_acc": 0.29243978619689226 + }, + { + "epoch": 5.732922896511287, + "grad_norm": 0.21983977584860623, + "learning_rate": 0.00042728140583805354, + "loss": 2.978278875350952, + "step": 9780, + "token_acc": 0.3018021678014853 + }, + { + "epoch": 5.733509234828496, + "grad_norm": 0.17150808864042918, + "learning_rate": 0.00042726432071368316, + "loss": 3.007329225540161, + "step": 9781, + "token_acc": 0.2970757521671952 + }, + { + "epoch": 5.734095573145705, + "grad_norm": 0.3468445096942142, + "learning_rate": 0.00042724723392415333, + "loss": 2.9984984397888184, + "step": 9782, + "token_acc": 0.29936849467231663 + }, + { + "epoch": 5.734681911462914, + "grad_norm": 0.31661307645515097, + "learning_rate": 0.0004272301454696246, + "loss": 3.0195417404174805, + "step": 9783, + "token_acc": 0.29593348466071145 + }, + { + "epoch": 5.735268249780123, + "grad_norm": 0.1893982672655234, + "learning_rate": 0.00042721305535025744, + "loss": 3.026937246322632, + "step": 9784, + "token_acc": 0.29479486044389064 + }, + { + "epoch": 5.735854588097332, + "grad_norm": 0.20813351102625063, + "learning_rate": 0.0004271959635662124, + "loss": 3.011892318725586, + "step": 9785, + "token_acc": 0.2964921403143874 + }, + { + "epoch": 5.7364409264145415, + "grad_norm": 0.19420941936563837, + "learning_rate": 0.0004271788701176501, + "loss": 3.0171279907226562, + "step": 9786, + "token_acc": 0.29744646772308947 + }, + { + "epoch": 5.737027264731751, + "grad_norm": 0.19765457516585758, + "learning_rate": 0.0004271617750047311, + "loss": 3.0471858978271484, + "step": 9787, + "token_acc": 0.2927863593984458 + }, + { + "epoch": 5.73761360304896, + "grad_norm": 0.21551670737881892, + "learning_rate": 0.0004271446782276159, + "loss": 3.04540753364563, + "step": 9788, + "token_acc": 0.29376635298327686 + }, + { + "epoch": 5.738199941366168, + "grad_norm": 0.17335568131810578, + "learning_rate": 0.0004271275797864651, + "loss": 3.0074214935302734, + "step": 9789, + "token_acc": 0.29775162955637113 + }, + { + "epoch": 5.738786279683377, + "grad_norm": 0.2262175397264559, + "learning_rate": 0.00042711047968143945, + "loss": 3.044785976409912, + "step": 9790, + "token_acc": 0.2926703803750759 + }, + { + "epoch": 5.739372618000586, + "grad_norm": 0.16936770866551007, + "learning_rate": 0.00042709337791269953, + "loss": 2.9827089309692383, + "step": 9791, + "token_acc": 0.3009463556082773 + }, + { + "epoch": 5.739958956317795, + "grad_norm": 0.2001771713243243, + "learning_rate": 0.0004270762744804059, + "loss": 2.970305919647217, + "step": 9792, + "token_acc": 0.3033125524117751 + }, + { + "epoch": 5.740545294635004, + "grad_norm": 0.14901210741679927, + "learning_rate": 0.00042705916938471934, + "loss": 2.980947494506836, + "step": 9793, + "token_acc": 0.30191288005906436 + }, + { + "epoch": 5.7411316329522135, + "grad_norm": 0.1860267515424033, + "learning_rate": 0.00042704206262580036, + "loss": 3.050400495529175, + "step": 9794, + "token_acc": 0.29157887185101145 + }, + { + "epoch": 5.741717971269423, + "grad_norm": 0.16210204620671523, + "learning_rate": 0.0004270249542038099, + "loss": 2.991826057434082, + "step": 9795, + "token_acc": 0.29750529672127707 + }, + { + "epoch": 5.742304309586632, + "grad_norm": 0.1666851259163461, + "learning_rate": 0.0004270078441189085, + "loss": 2.9970202445983887, + "step": 9796, + "token_acc": 0.29860864363843653 + }, + { + "epoch": 5.742890647903841, + "grad_norm": 0.1578545090205116, + "learning_rate": 0.0004269907323712569, + "loss": 3.029672384262085, + "step": 9797, + "token_acc": 0.29600779999788046 + }, + { + "epoch": 5.743476986221049, + "grad_norm": 0.18944046093002984, + "learning_rate": 0.00042697361896101583, + "loss": 2.9879541397094727, + "step": 9798, + "token_acc": 0.3003644112611457 + }, + { + "epoch": 5.744063324538258, + "grad_norm": 0.213559691757153, + "learning_rate": 0.0004269565038883462, + "loss": 2.989189386367798, + "step": 9799, + "token_acc": 0.29865113743317395 + }, + { + "epoch": 5.744649662855467, + "grad_norm": 0.1781893738300793, + "learning_rate": 0.0004269393871534086, + "loss": 2.99185848236084, + "step": 9800, + "token_acc": 0.300760268071098 + }, + { + "epoch": 5.745236001172676, + "grad_norm": 0.2662629491516313, + "learning_rate": 0.0004269222687563639, + "loss": 3.0291504859924316, + "step": 9801, + "token_acc": 0.2964964631313853 + }, + { + "epoch": 5.7458223394898855, + "grad_norm": 0.1818353558872522, + "learning_rate": 0.0004269051486973728, + "loss": 3.0381107330322266, + "step": 9802, + "token_acc": 0.29219570885345847 + }, + { + "epoch": 5.746408677807095, + "grad_norm": 0.2576406147144324, + "learning_rate": 0.00042688802697659635, + "loss": 2.9792261123657227, + "step": 9803, + "token_acc": 0.30034735350907704 + }, + { + "epoch": 5.746995016124304, + "grad_norm": 0.1613269765864322, + "learning_rate": 0.00042687090359419517, + "loss": 2.9641284942626953, + "step": 9804, + "token_acc": 0.30337067475661766 + }, + { + "epoch": 5.747581354441513, + "grad_norm": 0.24483848251258744, + "learning_rate": 0.0004268537785503302, + "loss": 2.9684858322143555, + "step": 9805, + "token_acc": 0.3031364927797926 + }, + { + "epoch": 5.748167692758722, + "grad_norm": 0.16307764805023037, + "learning_rate": 0.0004268366518451623, + "loss": 3.0233664512634277, + "step": 9806, + "token_acc": 0.29607322762318194 + }, + { + "epoch": 5.748754031075931, + "grad_norm": 0.19165530216029253, + "learning_rate": 0.00042681952347885233, + "loss": 3.0188350677490234, + "step": 9807, + "token_acc": 0.29867636847199247 + }, + { + "epoch": 5.74934036939314, + "grad_norm": 0.1818282656414819, + "learning_rate": 0.00042680239345156126, + "loss": 2.979160785675049, + "step": 9808, + "token_acc": 0.30208137234928684 + }, + { + "epoch": 5.749926707710349, + "grad_norm": 0.20846138689722382, + "learning_rate": 0.0004267852617634499, + "loss": 2.9984664916992188, + "step": 9809, + "token_acc": 0.2988695581618738 + }, + { + "epoch": 5.7505130460275575, + "grad_norm": 0.16870047331698834, + "learning_rate": 0.00042676812841467923, + "loss": 3.022526264190674, + "step": 9810, + "token_acc": 0.29490765164509175 + }, + { + "epoch": 5.751099384344767, + "grad_norm": 0.1744523519090359, + "learning_rate": 0.00042675099340541017, + "loss": 3.014735698699951, + "step": 9811, + "token_acc": 0.29642171536622225 + }, + { + "epoch": 5.751685722661976, + "grad_norm": 0.15760754157824028, + "learning_rate": 0.00042673385673580366, + "loss": 2.994710922241211, + "step": 9812, + "token_acc": 0.2981859174831423 + }, + { + "epoch": 5.752272060979185, + "grad_norm": 0.17359981542197367, + "learning_rate": 0.00042671671840602087, + "loss": 2.9937655925750732, + "step": 9813, + "token_acc": 0.2978452713076337 + }, + { + "epoch": 5.752858399296394, + "grad_norm": 0.16510595094494643, + "learning_rate": 0.0004266995784162225, + "loss": 3.0381357669830322, + "step": 9814, + "token_acc": 0.29298097890623315 + }, + { + "epoch": 5.753444737613603, + "grad_norm": 0.19059267994434784, + "learning_rate": 0.0004266824367665698, + "loss": 3.007462739944458, + "step": 9815, + "token_acc": 0.29770543698780927 + }, + { + "epoch": 5.754031075930812, + "grad_norm": 0.1589780408348912, + "learning_rate": 0.0004266652934572236, + "loss": 3.002833843231201, + "step": 9816, + "token_acc": 0.299048553833489 + }, + { + "epoch": 5.754617414248021, + "grad_norm": 0.18096755872241815, + "learning_rate": 0.0004266481484883451, + "loss": 2.9881269931793213, + "step": 9817, + "token_acc": 0.30013900346373423 + }, + { + "epoch": 5.75520375256523, + "grad_norm": 0.1621406845412763, + "learning_rate": 0.0004266310018600952, + "loss": 2.9817309379577637, + "step": 9818, + "token_acc": 0.3025529385302529 + }, + { + "epoch": 5.7557900908824395, + "grad_norm": 0.16863611510753743, + "learning_rate": 0.00042661385357263516, + "loss": 3.0115761756896973, + "step": 9819, + "token_acc": 0.2979749429144072 + }, + { + "epoch": 5.756376429199648, + "grad_norm": 0.1791824773520426, + "learning_rate": 0.0004265967036261259, + "loss": 3.0059618949890137, + "step": 9820, + "token_acc": 0.2991214496081466 + }, + { + "epoch": 5.756962767516857, + "grad_norm": 0.15483704425513048, + "learning_rate": 0.00042657955202072856, + "loss": 2.984379291534424, + "step": 9821, + "token_acc": 0.29944282441158093 + }, + { + "epoch": 5.757549105834066, + "grad_norm": 0.17150397239285364, + "learning_rate": 0.00042656239875660425, + "loss": 2.983255624771118, + "step": 9822, + "token_acc": 0.3016053351812012 + }, + { + "epoch": 5.758135444151275, + "grad_norm": 0.15887307691157063, + "learning_rate": 0.0004265452438339141, + "loss": 2.976199150085449, + "step": 9823, + "token_acc": 0.30170426498801567 + }, + { + "epoch": 5.758721782468484, + "grad_norm": 0.16143268127256333, + "learning_rate": 0.00042652808725281946, + "loss": 3.008729934692383, + "step": 9824, + "token_acc": 0.2984476467992138 + }, + { + "epoch": 5.759308120785693, + "grad_norm": 0.17055784408218197, + "learning_rate": 0.00042651092901348117, + "loss": 3.0651113986968994, + "step": 9825, + "token_acc": 0.29029814813812954 + }, + { + "epoch": 5.759894459102902, + "grad_norm": 0.1481536693558484, + "learning_rate": 0.0004264937691160605, + "loss": 3.024775743484497, + "step": 9826, + "token_acc": 0.2945779264413816 + }, + { + "epoch": 5.7604807974201115, + "grad_norm": 0.1601951083400055, + "learning_rate": 0.0004264766075607188, + "loss": 3.0059478282928467, + "step": 9827, + "token_acc": 0.2974255187730803 + }, + { + "epoch": 5.761067135737321, + "grad_norm": 0.17061466215621363, + "learning_rate": 0.0004264594443476172, + "loss": 2.9989688396453857, + "step": 9828, + "token_acc": 0.29873509349423816 + }, + { + "epoch": 5.76165347405453, + "grad_norm": 0.16158903886122816, + "learning_rate": 0.00042644227947691684, + "loss": 3.012486696243286, + "step": 9829, + "token_acc": 0.29633690285214276 + }, + { + "epoch": 5.762239812371739, + "grad_norm": 0.1726561587352895, + "learning_rate": 0.00042642511294877904, + "loss": 2.963540554046631, + "step": 9830, + "token_acc": 0.3035180926016506 + }, + { + "epoch": 5.762826150688948, + "grad_norm": 0.18255360400419068, + "learning_rate": 0.0004264079447633651, + "loss": 3.016244411468506, + "step": 9831, + "token_acc": 0.2952851555550835 + }, + { + "epoch": 5.763412489006156, + "grad_norm": 0.1952540596241095, + "learning_rate": 0.0004263907749208361, + "loss": 2.9847264289855957, + "step": 9832, + "token_acc": 0.2999788117304538 + }, + { + "epoch": 5.763998827323365, + "grad_norm": 0.21577931220425517, + "learning_rate": 0.00042637360342135354, + "loss": 3.010488271713257, + "step": 9833, + "token_acc": 0.2982501565426919 + }, + { + "epoch": 5.764585165640574, + "grad_norm": 0.1728646644286135, + "learning_rate": 0.00042635643026507863, + "loss": 2.9888224601745605, + "step": 9834, + "token_acc": 0.3006070243188329 + }, + { + "epoch": 5.7651715039577835, + "grad_norm": 0.1843046223580123, + "learning_rate": 0.0004263392554521727, + "loss": 2.995633840560913, + "step": 9835, + "token_acc": 0.2980454086519792 + }, + { + "epoch": 5.765757842274993, + "grad_norm": 0.22080286055160886, + "learning_rate": 0.00042632207898279706, + "loss": 3.0202553272247314, + "step": 9836, + "token_acc": 0.29518842317889155 + }, + { + "epoch": 5.766344180592202, + "grad_norm": 0.1942678526814422, + "learning_rate": 0.0004263049008571131, + "loss": 3.0228071212768555, + "step": 9837, + "token_acc": 0.2958257861327328 + }, + { + "epoch": 5.766930518909411, + "grad_norm": 0.19632194336078884, + "learning_rate": 0.00042628772107528224, + "loss": 3.0176773071289062, + "step": 9838, + "token_acc": 0.29681067753001716 + }, + { + "epoch": 5.76751685722662, + "grad_norm": 0.18122861862610815, + "learning_rate": 0.0004262705396374657, + "loss": 2.9950449466705322, + "step": 9839, + "token_acc": 0.30017655521164827 + }, + { + "epoch": 5.768103195543829, + "grad_norm": 0.18450854606986924, + "learning_rate": 0.00042625335654382504, + "loss": 2.9820404052734375, + "step": 9840, + "token_acc": 0.3026783015049113 + }, + { + "epoch": 5.768689533861037, + "grad_norm": 0.1855358493245563, + "learning_rate": 0.00042623617179452156, + "loss": 2.979336738586426, + "step": 9841, + "token_acc": 0.3014573657956873 + }, + { + "epoch": 5.7692758721782464, + "grad_norm": 0.171195440933651, + "learning_rate": 0.0004262189853897167, + "loss": 3.0250351428985596, + "step": 9842, + "token_acc": 0.29585189490932595 + }, + { + "epoch": 5.769862210495456, + "grad_norm": 0.22939461109295775, + "learning_rate": 0.00042620179732957196, + "loss": 2.9973955154418945, + "step": 9843, + "token_acc": 0.3001539627709102 + }, + { + "epoch": 5.770448548812665, + "grad_norm": 0.22404468869983718, + "learning_rate": 0.00042618460761424876, + "loss": 2.97257661819458, + "step": 9844, + "token_acc": 0.3020805431291509 + }, + { + "epoch": 5.771034887129874, + "grad_norm": 0.17821824589940463, + "learning_rate": 0.00042616741624390856, + "loss": 3.0196237564086914, + "step": 9845, + "token_acc": 0.2962530581583035 + }, + { + "epoch": 5.771621225447083, + "grad_norm": 0.2391526302564196, + "learning_rate": 0.0004261502232187129, + "loss": 2.9857356548309326, + "step": 9846, + "token_acc": 0.29999017705200054 + }, + { + "epoch": 5.772207563764292, + "grad_norm": 0.2694231248265357, + "learning_rate": 0.0004261330285388233, + "loss": 3.013343334197998, + "step": 9847, + "token_acc": 0.2982479009928384 + }, + { + "epoch": 5.772793902081501, + "grad_norm": 0.1791459476106535, + "learning_rate": 0.0004261158322044012, + "loss": 2.982802152633667, + "step": 9848, + "token_acc": 0.30107480837579753 + }, + { + "epoch": 5.77338024039871, + "grad_norm": 0.2022295237788424, + "learning_rate": 0.0004260986342156081, + "loss": 3.024275779724121, + "step": 9849, + "token_acc": 0.2961895522311039 + }, + { + "epoch": 5.773966578715919, + "grad_norm": 0.18355964882066209, + "learning_rate": 0.00042608143457260574, + "loss": 3.0137851238250732, + "step": 9850, + "token_acc": 0.29786150319887017 + }, + { + "epoch": 5.7745529170331285, + "grad_norm": 0.17175963604717426, + "learning_rate": 0.00042606423327555553, + "loss": 3.011504650115967, + "step": 9851, + "token_acc": 0.2948839936429236 + }, + { + "epoch": 5.775139255350338, + "grad_norm": 0.25252950666588686, + "learning_rate": 0.0004260470303246191, + "loss": 3.01700758934021, + "step": 9852, + "token_acc": 0.29523926086280267 + }, + { + "epoch": 5.775725593667546, + "grad_norm": 0.2361043160689279, + "learning_rate": 0.0004260298257199581, + "loss": 2.9895331859588623, + "step": 9853, + "token_acc": 0.30056262268729617 + }, + { + "epoch": 5.776311931984755, + "grad_norm": 0.20070251007576217, + "learning_rate": 0.00042601261946173405, + "loss": 3.05723237991333, + "step": 9854, + "token_acc": 0.2910841195351531 + }, + { + "epoch": 5.776898270301964, + "grad_norm": 0.18049050140552117, + "learning_rate": 0.0004259954115501086, + "loss": 3.012880325317383, + "step": 9855, + "token_acc": 0.29730930998049415 + }, + { + "epoch": 5.777484608619173, + "grad_norm": 0.20172210634782356, + "learning_rate": 0.0004259782019852434, + "loss": 3.0195188522338867, + "step": 9856, + "token_acc": 0.295771397038731 + }, + { + "epoch": 5.778070946936382, + "grad_norm": 0.17809974345665736, + "learning_rate": 0.00042596099076730015, + "loss": 2.981475830078125, + "step": 9857, + "token_acc": 0.3019364411755287 + }, + { + "epoch": 5.778657285253591, + "grad_norm": 0.18613424724006009, + "learning_rate": 0.00042594377789644054, + "loss": 3.0010063648223877, + "step": 9858, + "token_acc": 0.2984356229265175 + }, + { + "epoch": 5.7792436235708005, + "grad_norm": 0.18165367219150833, + "learning_rate": 0.00042592656337282623, + "loss": 3.0383360385894775, + "step": 9859, + "token_acc": 0.2929335071707953 + }, + { + "epoch": 5.77982996188801, + "grad_norm": 0.1746716071503972, + "learning_rate": 0.0004259093471966188, + "loss": 2.9904322624206543, + "step": 9860, + "token_acc": 0.30081722155089796 + }, + { + "epoch": 5.780416300205219, + "grad_norm": 0.18130247616422657, + "learning_rate": 0.0004258921293679802, + "loss": 3.0089967250823975, + "step": 9861, + "token_acc": 0.2983422173276554 + }, + { + "epoch": 5.781002638522428, + "grad_norm": 0.1869687863912142, + "learning_rate": 0.000425874909887072, + "loss": 3.0213675498962402, + "step": 9862, + "token_acc": 0.296800129070424 + }, + { + "epoch": 5.781588976839636, + "grad_norm": 0.21147826203105444, + "learning_rate": 0.0004258576887540561, + "loss": 3.0386123657226562, + "step": 9863, + "token_acc": 0.2940331677885734 + }, + { + "epoch": 5.782175315156845, + "grad_norm": 0.156330527476051, + "learning_rate": 0.0004258404659690941, + "loss": 3.0121331214904785, + "step": 9864, + "token_acc": 0.29718483860792766 + }, + { + "epoch": 5.782761653474054, + "grad_norm": 0.20342599706877254, + "learning_rate": 0.000425823241532348, + "loss": 3.004225730895996, + "step": 9865, + "token_acc": 0.2984643504501129 + }, + { + "epoch": 5.783347991791263, + "grad_norm": 0.21684627553868843, + "learning_rate": 0.00042580601544397936, + "loss": 3.011324405670166, + "step": 9866, + "token_acc": 0.2966895934104269 + }, + { + "epoch": 5.7839343301084725, + "grad_norm": 0.16770357802135302, + "learning_rate": 0.0004257887877041501, + "loss": 2.9937868118286133, + "step": 9867, + "token_acc": 0.2988383701110826 + }, + { + "epoch": 5.784520668425682, + "grad_norm": 0.20331196359580697, + "learning_rate": 0.00042577155831302203, + "loss": 3.02339506149292, + "step": 9868, + "token_acc": 0.2957570388387685 + }, + { + "epoch": 5.785107006742891, + "grad_norm": 0.23845863094953398, + "learning_rate": 0.00042575432727075713, + "loss": 2.983982563018799, + "step": 9869, + "token_acc": 0.30085166863760776 + }, + { + "epoch": 5.7856933450601, + "grad_norm": 0.17406971275578026, + "learning_rate": 0.0004257370945775171, + "loss": 2.949066162109375, + "step": 9870, + "token_acc": 0.30657564929870407 + }, + { + "epoch": 5.786279683377309, + "grad_norm": 0.16104200731844928, + "learning_rate": 0.00042571986023346386, + "loss": 3.007042169570923, + "step": 9871, + "token_acc": 0.29838674720606645 + }, + { + "epoch": 5.786866021694518, + "grad_norm": 0.15193710981436453, + "learning_rate": 0.0004257026242387594, + "loss": 2.980504035949707, + "step": 9872, + "token_acc": 0.30315834466279407 + }, + { + "epoch": 5.787452360011727, + "grad_norm": 0.1704912964935057, + "learning_rate": 0.00042568538659356546, + "loss": 3.005622386932373, + "step": 9873, + "token_acc": 0.2975203313673693 + }, + { + "epoch": 5.788038698328936, + "grad_norm": 0.18606616275557095, + "learning_rate": 0.0004256681472980441, + "loss": 3.0079498291015625, + "step": 9874, + "token_acc": 0.29871990903592094 + }, + { + "epoch": 5.7886250366461445, + "grad_norm": 0.17978125562898373, + "learning_rate": 0.00042565090635235717, + "loss": 3.004889726638794, + "step": 9875, + "token_acc": 0.29727987291632285 + }, + { + "epoch": 5.789211374963354, + "grad_norm": 0.18121020103396734, + "learning_rate": 0.0004256336637566667, + "loss": 3.0541625022888184, + "step": 9876, + "token_acc": 0.29197883959751714 + }, + { + "epoch": 5.789797713280563, + "grad_norm": 0.17341035990252748, + "learning_rate": 0.0004256164195111346, + "loss": 2.9958696365356445, + "step": 9877, + "token_acc": 0.29954415390092376 + }, + { + "epoch": 5.790384051597772, + "grad_norm": 0.17478342312582604, + "learning_rate": 0.00042559917361592294, + "loss": 2.9630792140960693, + "step": 9878, + "token_acc": 0.3039548740091649 + }, + { + "epoch": 5.790970389914981, + "grad_norm": 0.17855634615418253, + "learning_rate": 0.0004255819260711936, + "loss": 3.0145084857940674, + "step": 9879, + "token_acc": 0.29548514990263464 + }, + { + "epoch": 5.79155672823219, + "grad_norm": 0.18151171473310634, + "learning_rate": 0.0004255646768771087, + "loss": 3.022407293319702, + "step": 9880, + "token_acc": 0.29410911238985515 + }, + { + "epoch": 5.792143066549399, + "grad_norm": 0.15461045173057483, + "learning_rate": 0.00042554742603383025, + "loss": 3.0339534282684326, + "step": 9881, + "token_acc": 0.2931750695987865 + }, + { + "epoch": 5.792729404866608, + "grad_norm": 0.16895235214048931, + "learning_rate": 0.0004255301735415202, + "loss": 3.0054728984832764, + "step": 9882, + "token_acc": 0.2979912946818558 + }, + { + "epoch": 5.793315743183817, + "grad_norm": 0.16020655949294255, + "learning_rate": 0.0004255129194003408, + "loss": 2.9826042652130127, + "step": 9883, + "token_acc": 0.3008270047625177 + }, + { + "epoch": 5.7939020815010265, + "grad_norm": 0.1572664882809678, + "learning_rate": 0.000425495663610454, + "loss": 3.0225157737731934, + "step": 9884, + "token_acc": 0.2957892947000855 + }, + { + "epoch": 5.794488419818235, + "grad_norm": 0.16709593831118263, + "learning_rate": 0.00042547840617202193, + "loss": 3.0286941528320312, + "step": 9885, + "token_acc": 0.2948386281747078 + }, + { + "epoch": 5.795074758135444, + "grad_norm": 0.21423684240921123, + "learning_rate": 0.00042546114708520667, + "loss": 3.038414716720581, + "step": 9886, + "token_acc": 0.29237983464217543 + }, + { + "epoch": 5.795661096452653, + "grad_norm": 0.2594387257095789, + "learning_rate": 0.0004254438863501704, + "loss": 2.9988722801208496, + "step": 9887, + "token_acc": 0.2974965015689933 + }, + { + "epoch": 5.796247434769862, + "grad_norm": 0.27718640358647034, + "learning_rate": 0.00042542662396707524, + "loss": 3.008835792541504, + "step": 9888, + "token_acc": 0.29856044020911987 + }, + { + "epoch": 5.796833773087071, + "grad_norm": 0.1706571923942546, + "learning_rate": 0.00042540935993608324, + "loss": 3.0124828815460205, + "step": 9889, + "token_acc": 0.2974697486654576 + }, + { + "epoch": 5.79742011140428, + "grad_norm": 0.18361752711194348, + "learning_rate": 0.0004253920942573568, + "loss": 3.0069122314453125, + "step": 9890, + "token_acc": 0.29760818107276016 + }, + { + "epoch": 5.798006449721489, + "grad_norm": 0.19120750205207077, + "learning_rate": 0.0004253748269310579, + "loss": 2.9694414138793945, + "step": 9891, + "token_acc": 0.3039701041197858 + }, + { + "epoch": 5.7985927880386985, + "grad_norm": 0.17759839898621618, + "learning_rate": 0.0004253575579573488, + "loss": 3.014781951904297, + "step": 9892, + "token_acc": 0.29786470575999 + }, + { + "epoch": 5.799179126355908, + "grad_norm": 0.17854784113921143, + "learning_rate": 0.0004253402873363918, + "loss": 3.0146846771240234, + "step": 9893, + "token_acc": 0.29626095054855084 + }, + { + "epoch": 5.799765464673117, + "grad_norm": 0.16822589885417866, + "learning_rate": 0.00042532301506834904, + "loss": 3.0046703815460205, + "step": 9894, + "token_acc": 0.2981885355209971 + }, + { + "epoch": 5.800351802990326, + "grad_norm": 0.17326018957401854, + "learning_rate": 0.0004253057411533828, + "loss": 3.0333333015441895, + "step": 9895, + "token_acc": 0.29528862895434327 + }, + { + "epoch": 5.800938141307535, + "grad_norm": 0.17263074484874288, + "learning_rate": 0.0004252884655916554, + "loss": 3.0302329063415527, + "step": 9896, + "token_acc": 0.29509516206165737 + }, + { + "epoch": 5.801524479624743, + "grad_norm": 0.1776082236260518, + "learning_rate": 0.0004252711883833291, + "loss": 3.0505847930908203, + "step": 9897, + "token_acc": 0.2930789133247089 + }, + { + "epoch": 5.802110817941952, + "grad_norm": 0.1811347585820215, + "learning_rate": 0.000425253909528566, + "loss": 3.0150270462036133, + "step": 9898, + "token_acc": 0.29435353961323174 + }, + { + "epoch": 5.802697156259161, + "grad_norm": 0.16261723627282812, + "learning_rate": 0.00042523662902752875, + "loss": 2.995666980743408, + "step": 9899, + "token_acc": 0.2994882559208546 + }, + { + "epoch": 5.8032834945763705, + "grad_norm": 0.18289604008314098, + "learning_rate": 0.00042521934688037946, + "loss": 2.9709839820861816, + "step": 9900, + "token_acc": 0.30213232906273896 + }, + { + "epoch": 5.80386983289358, + "grad_norm": 0.1956725728162049, + "learning_rate": 0.0004252020630872804, + "loss": 3.048267364501953, + "step": 9901, + "token_acc": 0.2928976764605454 + }, + { + "epoch": 5.804456171210789, + "grad_norm": 0.17546838421050795, + "learning_rate": 0.0004251847776483942, + "loss": 2.974883794784546, + "step": 9902, + "token_acc": 0.30272165273701934 + }, + { + "epoch": 5.805042509527998, + "grad_norm": 0.1666667587613834, + "learning_rate": 0.00042516749056388303, + "loss": 3.0285680294036865, + "step": 9903, + "token_acc": 0.2936522053015124 + }, + { + "epoch": 5.805628847845207, + "grad_norm": 0.21010598647358766, + "learning_rate": 0.0004251502018339093, + "loss": 2.959278106689453, + "step": 9904, + "token_acc": 0.3052134344304492 + }, + { + "epoch": 5.806215186162416, + "grad_norm": 0.1918767663017535, + "learning_rate": 0.0004251329114586354, + "loss": 2.9866018295288086, + "step": 9905, + "token_acc": 0.3002852077391505 + }, + { + "epoch": 5.806801524479624, + "grad_norm": 0.19178689390597403, + "learning_rate": 0.00042511561943822386, + "loss": 3.036616325378418, + "step": 9906, + "token_acc": 0.2932062976874452 + }, + { + "epoch": 5.807387862796833, + "grad_norm": 0.20518040770597976, + "learning_rate": 0.0004250983257728371, + "loss": 3.040219306945801, + "step": 9907, + "token_acc": 0.2967445942711242 + }, + { + "epoch": 5.8079742011140425, + "grad_norm": 0.19064656090011026, + "learning_rate": 0.00042508103046263746, + "loss": 3.007669448852539, + "step": 9908, + "token_acc": 0.2970445976816316 + }, + { + "epoch": 5.808560539431252, + "grad_norm": 0.18329839468225553, + "learning_rate": 0.00042506373350778743, + "loss": 3.007746934890747, + "step": 9909, + "token_acc": 0.2966362329460028 + }, + { + "epoch": 5.809146877748461, + "grad_norm": 0.1816163410490305, + "learning_rate": 0.00042504643490844953, + "loss": 2.9976625442504883, + "step": 9910, + "token_acc": 0.2976666109651055 + }, + { + "epoch": 5.80973321606567, + "grad_norm": 0.2097161392658969, + "learning_rate": 0.0004250291346647863, + "loss": 3.026937246322632, + "step": 9911, + "token_acc": 0.29505573035689736 + }, + { + "epoch": 5.810319554382879, + "grad_norm": 0.19706998113774662, + "learning_rate": 0.0004250118327769602, + "loss": 2.9947919845581055, + "step": 9912, + "token_acc": 0.29995824164000195 + }, + { + "epoch": 5.810905892700088, + "grad_norm": 0.15714298494345658, + "learning_rate": 0.0004249945292451337, + "loss": 3.0226359367370605, + "step": 9913, + "token_acc": 0.29586274729516054 + }, + { + "epoch": 5.811492231017297, + "grad_norm": 0.1687407601029449, + "learning_rate": 0.00042497722406946946, + "loss": 3.010319232940674, + "step": 9914, + "token_acc": 0.29809959808732395 + }, + { + "epoch": 5.812078569334506, + "grad_norm": 0.17948614148038625, + "learning_rate": 0.00042495991725012994, + "loss": 3.0053184032440186, + "step": 9915, + "token_acc": 0.2994510581148214 + }, + { + "epoch": 5.812664907651715, + "grad_norm": 0.2086592467517009, + "learning_rate": 0.0004249426087872778, + "loss": 3.0264275074005127, + "step": 9916, + "token_acc": 0.295315836829382 + }, + { + "epoch": 5.8132512459689245, + "grad_norm": 0.23335396366070113, + "learning_rate": 0.0004249252986810756, + "loss": 3.0102686882019043, + "step": 9917, + "token_acc": 0.2970574787467355 + }, + { + "epoch": 5.813837584286133, + "grad_norm": 0.21996269038311886, + "learning_rate": 0.00042490798693168593, + "loss": 3.043732166290283, + "step": 9918, + "token_acc": 0.2933223435675527 + }, + { + "epoch": 5.814423922603342, + "grad_norm": 0.18530469826867668, + "learning_rate": 0.0004248906735392714, + "loss": 3.0129292011260986, + "step": 9919, + "token_acc": 0.2971605174716196 + }, + { + "epoch": 5.815010260920551, + "grad_norm": 0.2377449686025687, + "learning_rate": 0.0004248733585039946, + "loss": 2.9897301197052, + "step": 9920, + "token_acc": 0.30024687517248644 + }, + { + "epoch": 5.81559659923776, + "grad_norm": 0.27267475489364945, + "learning_rate": 0.00042485604182601833, + "loss": 2.972597122192383, + "step": 9921, + "token_acc": 0.30305584490970827 + }, + { + "epoch": 5.816182937554969, + "grad_norm": 0.1757501901623698, + "learning_rate": 0.0004248387235055051, + "loss": 3.0212197303771973, + "step": 9922, + "token_acc": 0.29612465738232174 + }, + { + "epoch": 5.816769275872178, + "grad_norm": 0.1895733874698817, + "learning_rate": 0.00042482140354261776, + "loss": 3.0097126960754395, + "step": 9923, + "token_acc": 0.29815154327853677 + }, + { + "epoch": 5.817355614189387, + "grad_norm": 0.20299232679116552, + "learning_rate": 0.00042480408193751887, + "loss": 3.0479159355163574, + "step": 9924, + "token_acc": 0.2908756202560031 + }, + { + "epoch": 5.8179419525065965, + "grad_norm": 0.17620247341020376, + "learning_rate": 0.0004247867586903711, + "loss": 3.021477460861206, + "step": 9925, + "token_acc": 0.2961721902898373 + }, + { + "epoch": 5.818528290823806, + "grad_norm": 0.2049642520910694, + "learning_rate": 0.0004247694338013373, + "loss": 3.039742946624756, + "step": 9926, + "token_acc": 0.29330553941837817 + }, + { + "epoch": 5.819114629141015, + "grad_norm": 0.19089721742243326, + "learning_rate": 0.0004247521072705802, + "loss": 3.023261070251465, + "step": 9927, + "token_acc": 0.2946545400475813 + }, + { + "epoch": 5.819700967458223, + "grad_norm": 0.1731140297627653, + "learning_rate": 0.00042473477909826254, + "loss": 2.9618992805480957, + "step": 9928, + "token_acc": 0.3030949143607262 + }, + { + "epoch": 5.820287305775432, + "grad_norm": 0.19273139677956433, + "learning_rate": 0.0004247174492845471, + "loss": 3.0052127838134766, + "step": 9929, + "token_acc": 0.2991691816613384 + }, + { + "epoch": 5.820873644092641, + "grad_norm": 0.1621394562792739, + "learning_rate": 0.00042470011782959663, + "loss": 3.032313346862793, + "step": 9930, + "token_acc": 0.29331483010714104 + }, + { + "epoch": 5.82145998240985, + "grad_norm": 0.1772828852103662, + "learning_rate": 0.0004246827847335739, + "loss": 3.045546531677246, + "step": 9931, + "token_acc": 0.2927859401389224 + }, + { + "epoch": 5.822046320727059, + "grad_norm": 0.2382340101664907, + "learning_rate": 0.0004246654499966419, + "loss": 2.982382297515869, + "step": 9932, + "token_acc": 0.3027504480922232 + }, + { + "epoch": 5.8226326590442685, + "grad_norm": 0.20607170315934709, + "learning_rate": 0.0004246481136189633, + "loss": 3.0268819332122803, + "step": 9933, + "token_acc": 0.2941412074811982 + }, + { + "epoch": 5.823218997361478, + "grad_norm": 0.16556030420120882, + "learning_rate": 0.0004246307756007011, + "loss": 2.988980770111084, + "step": 9934, + "token_acc": 0.30174394952164646 + }, + { + "epoch": 5.823805335678687, + "grad_norm": 0.17229705862178424, + "learning_rate": 0.00042461343594201795, + "loss": 3.0542287826538086, + "step": 9935, + "token_acc": 0.2906349922111995 + }, + { + "epoch": 5.824391673995896, + "grad_norm": 0.15428436541313784, + "learning_rate": 0.00042459609464307697, + "loss": 3.036468505859375, + "step": 9936, + "token_acc": 0.2939639439246826 + }, + { + "epoch": 5.824978012313105, + "grad_norm": 0.1871352827475086, + "learning_rate": 0.0004245787517040409, + "loss": 3.027794599533081, + "step": 9937, + "token_acc": 0.2960834359007579 + }, + { + "epoch": 5.825564350630314, + "grad_norm": 0.17925905677672996, + "learning_rate": 0.00042456140712507275, + "loss": 3.0783824920654297, + "step": 9938, + "token_acc": 0.2878753922008068 + }, + { + "epoch": 5.826150688947523, + "grad_norm": 0.19512006381030642, + "learning_rate": 0.0004245440609063354, + "loss": 2.99771785736084, + "step": 9939, + "token_acc": 0.2985421734271251 + }, + { + "epoch": 5.826737027264731, + "grad_norm": 0.1740642098808907, + "learning_rate": 0.00042452671304799174, + "loss": 3.0366663932800293, + "step": 9940, + "token_acc": 0.2942588325652842 + }, + { + "epoch": 5.8273233655819405, + "grad_norm": 0.1895174215358304, + "learning_rate": 0.00042450936355020486, + "loss": 3.0096302032470703, + "step": 9941, + "token_acc": 0.29793306888428006 + }, + { + "epoch": 5.82790970389915, + "grad_norm": 0.16667071548354603, + "learning_rate": 0.00042449201241313763, + "loss": 3.041743278503418, + "step": 9942, + "token_acc": 0.2920382440371146 + }, + { + "epoch": 5.828496042216359, + "grad_norm": 0.1926521572238255, + "learning_rate": 0.000424474659636953, + "loss": 3.0008723735809326, + "step": 9943, + "token_acc": 0.300528268870288 + }, + { + "epoch": 5.829082380533568, + "grad_norm": 0.27653223054714676, + "learning_rate": 0.0004244573052218141, + "loss": 2.988206386566162, + "step": 9944, + "token_acc": 0.29988626853501 + }, + { + "epoch": 5.829668718850777, + "grad_norm": 0.31921464609985134, + "learning_rate": 0.00042443994916788397, + "loss": 3.0417234897613525, + "step": 9945, + "token_acc": 0.29328729207535004 + }, + { + "epoch": 5.830255057167986, + "grad_norm": 0.18686841873703186, + "learning_rate": 0.0004244225914753255, + "loss": 3.0199027061462402, + "step": 9946, + "token_acc": 0.2958733534843021 + }, + { + "epoch": 5.830841395485195, + "grad_norm": 0.2055391020820378, + "learning_rate": 0.00042440523214430187, + "loss": 3.0208704471588135, + "step": 9947, + "token_acc": 0.2954877548365143 + }, + { + "epoch": 5.831427733802404, + "grad_norm": 0.20446241064715343, + "learning_rate": 0.0004243878711749761, + "loss": 2.98987078666687, + "step": 9948, + "token_acc": 0.30055845005062437 + }, + { + "epoch": 5.8320140721196125, + "grad_norm": 0.16865969175337753, + "learning_rate": 0.00042437050856751127, + "loss": 3.0320911407470703, + "step": 9949, + "token_acc": 0.2944001907818166 + }, + { + "epoch": 5.832600410436822, + "grad_norm": 0.20136093713224978, + "learning_rate": 0.0004243531443220704, + "loss": 2.9527812004089355, + "step": 9950, + "token_acc": 0.3060652945535865 + }, + { + "epoch": 5.833186748754031, + "grad_norm": 0.17683303919843515, + "learning_rate": 0.0004243357784388168, + "loss": 3.008302688598633, + "step": 9951, + "token_acc": 0.29778024242296813 + }, + { + "epoch": 5.83377308707124, + "grad_norm": 0.16422772292648222, + "learning_rate": 0.0004243184109179134, + "loss": 2.995554208755493, + "step": 9952, + "token_acc": 0.30057305201843354 + }, + { + "epoch": 5.834359425388449, + "grad_norm": 0.17638838422798378, + "learning_rate": 0.00042430104175952344, + "loss": 2.981992244720459, + "step": 9953, + "token_acc": 0.3014567261853168 + }, + { + "epoch": 5.834945763705658, + "grad_norm": 0.16829007990277933, + "learning_rate": 0.00042428367096381003, + "loss": 3.0044472217559814, + "step": 9954, + "token_acc": 0.2994327312684568 + }, + { + "epoch": 5.835532102022867, + "grad_norm": 0.16626726644792333, + "learning_rate": 0.0004242662985309365, + "loss": 2.9926421642303467, + "step": 9955, + "token_acc": 0.29958532796667803 + }, + { + "epoch": 5.836118440340076, + "grad_norm": 0.15885383165849284, + "learning_rate": 0.0004242489244610658, + "loss": 2.9899938106536865, + "step": 9956, + "token_acc": 0.29993209574499957 + }, + { + "epoch": 5.836704778657285, + "grad_norm": 0.15598748013134373, + "learning_rate": 0.0004242315487543613, + "loss": 2.981785774230957, + "step": 9957, + "token_acc": 0.3004575840709582 + }, + { + "epoch": 5.8372911169744945, + "grad_norm": 0.17810021420414673, + "learning_rate": 0.0004242141714109862, + "loss": 3.001772880554199, + "step": 9958, + "token_acc": 0.2987717218593155 + }, + { + "epoch": 5.837877455291704, + "grad_norm": 0.15093821765840398, + "learning_rate": 0.00042419679243110376, + "loss": 2.9479687213897705, + "step": 9959, + "token_acc": 0.3046957073465015 + }, + { + "epoch": 5.838463793608913, + "grad_norm": 0.17913905170088082, + "learning_rate": 0.00042417941181487707, + "loss": 2.9735822677612305, + "step": 9960, + "token_acc": 0.30178581266675586 + }, + { + "epoch": 5.839050131926121, + "grad_norm": 0.16544547317507688, + "learning_rate": 0.0004241620295624696, + "loss": 3.018216848373413, + "step": 9961, + "token_acc": 0.2960263276763245 + }, + { + "epoch": 5.83963647024333, + "grad_norm": 0.15202959355547385, + "learning_rate": 0.00042414464567404455, + "loss": 3.01279878616333, + "step": 9962, + "token_acc": 0.2957606236657356 + }, + { + "epoch": 5.840222808560539, + "grad_norm": 0.17288497200996936, + "learning_rate": 0.00042412726014976523, + "loss": 2.9706430435180664, + "step": 9963, + "token_acc": 0.3040054166721984 + }, + { + "epoch": 5.840809146877748, + "grad_norm": 0.1573875942851172, + "learning_rate": 0.00042410987298979486, + "loss": 3.0085582733154297, + "step": 9964, + "token_acc": 0.29829367461588674 + }, + { + "epoch": 5.841395485194957, + "grad_norm": 0.17070485609223432, + "learning_rate": 0.0004240924841942969, + "loss": 3.004000186920166, + "step": 9965, + "token_acc": 0.2989459227812 + }, + { + "epoch": 5.8419818235121665, + "grad_norm": 0.1565068737039995, + "learning_rate": 0.0004240750937634347, + "loss": 2.9882073402404785, + "step": 9966, + "token_acc": 0.3003839753297722 + }, + { + "epoch": 5.842568161829376, + "grad_norm": 0.185847502278089, + "learning_rate": 0.00042405770169737147, + "loss": 3.0152997970581055, + "step": 9967, + "token_acc": 0.29861415397156976 + }, + { + "epoch": 5.843154500146585, + "grad_norm": 0.23822248929003415, + "learning_rate": 0.00042404030799627067, + "loss": 3.0349934101104736, + "step": 9968, + "token_acc": 0.2941174888157771 + }, + { + "epoch": 5.843740838463794, + "grad_norm": 0.288853400246377, + "learning_rate": 0.00042402291266029567, + "loss": 3.035841464996338, + "step": 9969, + "token_acc": 0.29273378560606456 + }, + { + "epoch": 5.844327176781003, + "grad_norm": 0.19553709852321036, + "learning_rate": 0.00042400551568960997, + "loss": 3.033818244934082, + "step": 9970, + "token_acc": 0.2934002830245379 + }, + { + "epoch": 5.844913515098211, + "grad_norm": 0.18562729822762683, + "learning_rate": 0.000423988117084377, + "loss": 2.963871955871582, + "step": 9971, + "token_acc": 0.3052341055231444 + }, + { + "epoch": 5.84549985341542, + "grad_norm": 0.22645461006625403, + "learning_rate": 0.00042397071684476006, + "loss": 2.992323398590088, + "step": 9972, + "token_acc": 0.29948532694310886 + }, + { + "epoch": 5.8460861917326294, + "grad_norm": 0.2034308076779647, + "learning_rate": 0.00042395331497092263, + "loss": 2.9774386882781982, + "step": 9973, + "token_acc": 0.3013217704707845 + }, + { + "epoch": 5.846672530049839, + "grad_norm": 0.17630827789734335, + "learning_rate": 0.0004239359114630282, + "loss": 3.010213851928711, + "step": 9974, + "token_acc": 0.29740926004025175 + }, + { + "epoch": 5.847258868367048, + "grad_norm": 0.15202877286755886, + "learning_rate": 0.00042391850632124027, + "loss": 3.0469069480895996, + "step": 9975, + "token_acc": 0.2928263250253988 + }, + { + "epoch": 5.847845206684257, + "grad_norm": 0.16524180912842562, + "learning_rate": 0.00042390109954572243, + "loss": 2.9999923706054688, + "step": 9976, + "token_acc": 0.2977966614772531 + }, + { + "epoch": 5.848431545001466, + "grad_norm": 0.19842541139418873, + "learning_rate": 0.00042388369113663805, + "loss": 3.0255346298217773, + "step": 9977, + "token_acc": 0.29530012078258605 + }, + { + "epoch": 5.849017883318675, + "grad_norm": 0.23061938638648846, + "learning_rate": 0.0004238662810941506, + "loss": 2.981649875640869, + "step": 9978, + "token_acc": 0.3009559139756576 + }, + { + "epoch": 5.849604221635884, + "grad_norm": 0.17567699642177417, + "learning_rate": 0.0004238488694184238, + "loss": 2.9582037925720215, + "step": 9979, + "token_acc": 0.30470684935966125 + }, + { + "epoch": 5.850190559953093, + "grad_norm": 0.1849544207014276, + "learning_rate": 0.00042383145610962116, + "loss": 3.008603096008301, + "step": 9980, + "token_acc": 0.29719393427313556 + }, + { + "epoch": 5.850776898270302, + "grad_norm": 0.1571779266102812, + "learning_rate": 0.0004238140411679062, + "loss": 3.0345168113708496, + "step": 9981, + "token_acc": 0.29378139240026224 + }, + { + "epoch": 5.8513632365875115, + "grad_norm": 0.16655801763866693, + "learning_rate": 0.0004237966245934426, + "loss": 2.976304531097412, + "step": 9982, + "token_acc": 0.30085239420607507 + }, + { + "epoch": 5.85194957490472, + "grad_norm": 0.16608412857119426, + "learning_rate": 0.0004237792063863938, + "loss": 3.003843307495117, + "step": 9983, + "token_acc": 0.2980717791268346 + }, + { + "epoch": 5.852535913221929, + "grad_norm": 0.17708581616201324, + "learning_rate": 0.0004237617865469236, + "loss": 3.0005133152008057, + "step": 9984, + "token_acc": 0.2995318916745642 + }, + { + "epoch": 5.853122251539138, + "grad_norm": 0.1778489834230273, + "learning_rate": 0.00042374436507519554, + "loss": 3.0067050457000732, + "step": 9985, + "token_acc": 0.2987595986593696 + }, + { + "epoch": 5.853708589856347, + "grad_norm": 0.21598815941824, + "learning_rate": 0.0004237269419713733, + "loss": 3.0277063846588135, + "step": 9986, + "token_acc": 0.29448556050098335 + }, + { + "epoch": 5.854294928173556, + "grad_norm": 0.25153577806676974, + "learning_rate": 0.0004237095172356206, + "loss": 3.039735794067383, + "step": 9987, + "token_acc": 0.2948286796419214 + }, + { + "epoch": 5.854881266490765, + "grad_norm": 0.22169680231067176, + "learning_rate": 0.0004236920908681009, + "loss": 2.97993803024292, + "step": 9988, + "token_acc": 0.30163205244833635 + }, + { + "epoch": 5.855467604807974, + "grad_norm": 0.16037444612800486, + "learning_rate": 0.00042367466286897816, + "loss": 3.0701088905334473, + "step": 9989, + "token_acc": 0.28724421301004843 + }, + { + "epoch": 5.8560539431251835, + "grad_norm": 0.28998339505038306, + "learning_rate": 0.000423657233238416, + "loss": 3.080392360687256, + "step": 9990, + "token_acc": 0.28919700320487685 + }, + { + "epoch": 5.856640281442393, + "grad_norm": 0.31270408535737, + "learning_rate": 0.00042363980197657815, + "loss": 3.026762008666992, + "step": 9991, + "token_acc": 0.29613566786615825 + }, + { + "epoch": 5.857226619759601, + "grad_norm": 0.15025587048822409, + "learning_rate": 0.00042362236908362837, + "loss": 3.029123306274414, + "step": 9992, + "token_acc": 0.2941401161099376 + }, + { + "epoch": 5.85781295807681, + "grad_norm": 0.23840128305711616, + "learning_rate": 0.00042360493455973034, + "loss": 2.972878932952881, + "step": 9993, + "token_acc": 0.30066985595598467 + }, + { + "epoch": 5.858399296394019, + "grad_norm": 0.14763499637312424, + "learning_rate": 0.0004235874984050478, + "loss": 2.985220193862915, + "step": 9994, + "token_acc": 0.3005376444303605 + }, + { + "epoch": 5.858985634711228, + "grad_norm": 0.2336876774261463, + "learning_rate": 0.0004235700606197448, + "loss": 3.011638879776001, + "step": 9995, + "token_acc": 0.29578717383051534 + }, + { + "epoch": 5.859571973028437, + "grad_norm": 0.1623394203731384, + "learning_rate": 0.00042355262120398484, + "loss": 3.051365375518799, + "step": 9996, + "token_acc": 0.290441474598927 + }, + { + "epoch": 5.860158311345646, + "grad_norm": 0.2210879482021252, + "learning_rate": 0.0004235351801579319, + "loss": 3.061344623565674, + "step": 9997, + "token_acc": 0.289923532149744 + }, + { + "epoch": 5.8607446496628555, + "grad_norm": 0.16200233129833125, + "learning_rate": 0.0004235177374817498, + "loss": 3.0351545810699463, + "step": 9998, + "token_acc": 0.292527715173705 + }, + { + "epoch": 5.861330987980065, + "grad_norm": 0.22726253865968601, + "learning_rate": 0.0004235002931756024, + "loss": 3.027992010116577, + "step": 9999, + "token_acc": 0.29617437698990595 + }, + { + "epoch": 5.861917326297274, + "grad_norm": 0.1593521513185362, + "learning_rate": 0.0004234828472396535, + "loss": 2.9697537422180176, + "step": 10000, + "token_acc": 0.30195530430548057 + }, + { + "epoch": 5.862503664614483, + "grad_norm": 0.17330643505791402, + "learning_rate": 0.00042346539967406705, + "loss": 3.0172080993652344, + "step": 10001, + "token_acc": 0.2964358741112228 + }, + { + "epoch": 5.863090002931692, + "grad_norm": 0.15705309844577311, + "learning_rate": 0.00042344795047900694, + "loss": 2.997549295425415, + "step": 10002, + "token_acc": 0.2988051706313638 + }, + { + "epoch": 5.863676341248901, + "grad_norm": 0.18221318803017367, + "learning_rate": 0.000423430499654637, + "loss": 3.0182929039001465, + "step": 10003, + "token_acc": 0.29566994933890267 + }, + { + "epoch": 5.86426267956611, + "grad_norm": 0.15657318098032233, + "learning_rate": 0.0004234130472011212, + "loss": 3.0519533157348633, + "step": 10004, + "token_acc": 0.2934261741389128 + }, + { + "epoch": 5.864849017883318, + "grad_norm": 0.18449563535452232, + "learning_rate": 0.00042339559311862357, + "loss": 3.04988694190979, + "step": 10005, + "token_acc": 0.2916774146950483 + }, + { + "epoch": 5.8654353562005275, + "grad_norm": 0.15247236271147513, + "learning_rate": 0.00042337813740730794, + "loss": 2.9692835807800293, + "step": 10006, + "token_acc": 0.30247149945732116 + }, + { + "epoch": 5.866021694517737, + "grad_norm": 0.17167170801019946, + "learning_rate": 0.0004233606800673384, + "loss": 3.03755521774292, + "step": 10007, + "token_acc": 0.2942136385086061 + }, + { + "epoch": 5.866608032834946, + "grad_norm": 0.16569413257247906, + "learning_rate": 0.0004233432210988788, + "loss": 3.003567934036255, + "step": 10008, + "token_acc": 0.29582393977050736 + }, + { + "epoch": 5.867194371152155, + "grad_norm": 0.16403796620204583, + "learning_rate": 0.00042332576050209327, + "loss": 2.9730613231658936, + "step": 10009, + "token_acc": 0.3018683416721425 + }, + { + "epoch": 5.867780709469364, + "grad_norm": 0.15527960156900567, + "learning_rate": 0.0004233082982771458, + "loss": 3.0426597595214844, + "step": 10010, + "token_acc": 0.29251465208669314 + }, + { + "epoch": 5.868367047786573, + "grad_norm": 0.17287883552585795, + "learning_rate": 0.00042329083442420036, + "loss": 2.973691463470459, + "step": 10011, + "token_acc": 0.30258306370291305 + }, + { + "epoch": 5.868953386103782, + "grad_norm": 0.15627604500139708, + "learning_rate": 0.00042327336894342106, + "loss": 3.0062315464019775, + "step": 10012, + "token_acc": 0.29665731638138754 + }, + { + "epoch": 5.869539724420991, + "grad_norm": 0.16479267427658223, + "learning_rate": 0.00042325590183497187, + "loss": 3.0075550079345703, + "step": 10013, + "token_acc": 0.298632909246471 + }, + { + "epoch": 5.8701260627381995, + "grad_norm": 0.16848292514437746, + "learning_rate": 0.00042323843309901703, + "loss": 3.049063205718994, + "step": 10014, + "token_acc": 0.2927030130466635 + }, + { + "epoch": 5.870712401055409, + "grad_norm": 0.16466197200629742, + "learning_rate": 0.0004232209627357206, + "loss": 3.0021820068359375, + "step": 10015, + "token_acc": 0.2998244813166587 + }, + { + "epoch": 5.871298739372618, + "grad_norm": 0.250844522405624, + "learning_rate": 0.00042320349074524656, + "loss": 2.9807190895080566, + "step": 10016, + "token_acc": 0.2988374791103417 + }, + { + "epoch": 5.871885077689827, + "grad_norm": 0.30475767325698316, + "learning_rate": 0.00042318601712775916, + "loss": 3.0048766136169434, + "step": 10017, + "token_acc": 0.3003643767219022 + }, + { + "epoch": 5.872471416007036, + "grad_norm": 0.26139063510196614, + "learning_rate": 0.0004231685418834225, + "loss": 3.018401861190796, + "step": 10018, + "token_acc": 0.2953928612756288 + }, + { + "epoch": 5.873057754324245, + "grad_norm": 0.1577548289212022, + "learning_rate": 0.00042315106501240066, + "loss": 3.000736951828003, + "step": 10019, + "token_acc": 0.299025417793533 + }, + { + "epoch": 5.873644092641454, + "grad_norm": 0.22560844017948412, + "learning_rate": 0.000423133586514858, + "loss": 3.0134692192077637, + "step": 10020, + "token_acc": 0.2980477246219962 + }, + { + "epoch": 5.874230430958663, + "grad_norm": 0.16825441308293498, + "learning_rate": 0.00042311610639095856, + "loss": 3.022550582885742, + "step": 10021, + "token_acc": 0.29709109379400006 + }, + { + "epoch": 5.874816769275872, + "grad_norm": 0.2319930068238738, + "learning_rate": 0.0004230986246408665, + "loss": 3.000293731689453, + "step": 10022, + "token_acc": 0.29891590445482663 + }, + { + "epoch": 5.8754031075930815, + "grad_norm": 0.18342840000589894, + "learning_rate": 0.00042308114126474617, + "loss": 3.0302324295043945, + "step": 10023, + "token_acc": 0.2940948278062201 + }, + { + "epoch": 5.875989445910291, + "grad_norm": 0.18755149717289787, + "learning_rate": 0.0004230636562627618, + "loss": 3.000781536102295, + "step": 10024, + "token_acc": 0.3002076162789238 + }, + { + "epoch": 5.8765757842275, + "grad_norm": 0.19518953629190688, + "learning_rate": 0.0004230461696350775, + "loss": 3.042587995529175, + "step": 10025, + "token_acc": 0.292569172367593 + }, + { + "epoch": 5.877162122544708, + "grad_norm": 0.16479337513603584, + "learning_rate": 0.00042302868138185766, + "loss": 2.980506420135498, + "step": 10026, + "token_acc": 0.3002999565544279 + }, + { + "epoch": 5.877748460861917, + "grad_norm": 0.19573420565503852, + "learning_rate": 0.0004230111915032665, + "loss": 3.01444673538208, + "step": 10027, + "token_acc": 0.2961651856955868 + }, + { + "epoch": 5.878334799179126, + "grad_norm": 0.17286499789409712, + "learning_rate": 0.00042299369999946836, + "loss": 3.0103507041931152, + "step": 10028, + "token_acc": 0.29722196024186026 + }, + { + "epoch": 5.878921137496335, + "grad_norm": 0.22643062319185542, + "learning_rate": 0.00042297620687062755, + "loss": 3.048881769180298, + "step": 10029, + "token_acc": 0.29269626094369006 + }, + { + "epoch": 5.879507475813544, + "grad_norm": 0.1652456923383218, + "learning_rate": 0.00042295871211690827, + "loss": 3.025496006011963, + "step": 10030, + "token_acc": 0.29473120742412584 + }, + { + "epoch": 5.8800938141307535, + "grad_norm": 0.30059949164679156, + "learning_rate": 0.0004229412157384751, + "loss": 3.0132880210876465, + "step": 10031, + "token_acc": 0.2980761840651113 + }, + { + "epoch": 5.880680152447963, + "grad_norm": 0.15363081105578819, + "learning_rate": 0.0004229237177354921, + "loss": 2.975437879562378, + "step": 10032, + "token_acc": 0.30338283245866576 + }, + { + "epoch": 5.881266490765172, + "grad_norm": 0.31549443305675395, + "learning_rate": 0.0004229062181081238, + "loss": 3.012751817703247, + "step": 10033, + "token_acc": 0.2965051487843297 + }, + { + "epoch": 5.881852829082381, + "grad_norm": 0.1620672092408641, + "learning_rate": 0.00042288871685653464, + "loss": 2.9819223880767822, + "step": 10034, + "token_acc": 0.3025931312813659 + }, + { + "epoch": 5.88243916739959, + "grad_norm": 0.26012782705430415, + "learning_rate": 0.00042287121398088895, + "loss": 3.044726610183716, + "step": 10035, + "token_acc": 0.29393716122915853 + }, + { + "epoch": 5.883025505716798, + "grad_norm": 0.14391103455766485, + "learning_rate": 0.00042285370948135116, + "loss": 2.966080904006958, + "step": 10036, + "token_acc": 0.3045379974527441 + }, + { + "epoch": 5.883611844034007, + "grad_norm": 0.23902293422252838, + "learning_rate": 0.00042283620335808564, + "loss": 2.988171100616455, + "step": 10037, + "token_acc": 0.30130849935164444 + }, + { + "epoch": 5.884198182351216, + "grad_norm": 0.1585695493076413, + "learning_rate": 0.0004228186956112569, + "loss": 3.031554937362671, + "step": 10038, + "token_acc": 0.29391588904785293 + }, + { + "epoch": 5.8847845206684255, + "grad_norm": 0.2225791666270798, + "learning_rate": 0.00042280118624102943, + "loss": 3.0251078605651855, + "step": 10039, + "token_acc": 0.2941930616479032 + }, + { + "epoch": 5.885370858985635, + "grad_norm": 0.1670203481737096, + "learning_rate": 0.0004227836752475677, + "loss": 3.010450839996338, + "step": 10040, + "token_acc": 0.2960606648184929 + }, + { + "epoch": 5.885957197302844, + "grad_norm": 0.17366190492008451, + "learning_rate": 0.00042276616263103606, + "loss": 3.0037269592285156, + "step": 10041, + "token_acc": 0.29986869692284884 + }, + { + "epoch": 5.886543535620053, + "grad_norm": 0.2175738807681619, + "learning_rate": 0.0004227486483915992, + "loss": 2.995307445526123, + "step": 10042, + "token_acc": 0.30061494130803623 + }, + { + "epoch": 5.887129873937262, + "grad_norm": 0.16041855497488258, + "learning_rate": 0.00042273113252942155, + "loss": 3.0140137672424316, + "step": 10043, + "token_acc": 0.29779458955370497 + }, + { + "epoch": 5.887716212254471, + "grad_norm": 0.22573114688853205, + "learning_rate": 0.00042271361504466766, + "loss": 3.006498098373413, + "step": 10044, + "token_acc": 0.29725368885417497 + }, + { + "epoch": 5.88830255057168, + "grad_norm": 0.16405933175243043, + "learning_rate": 0.00042269609593750216, + "loss": 2.991373062133789, + "step": 10045, + "token_acc": 0.3015227771325332 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.22142054993720608, + "learning_rate": 0.0004226785752080895, + "loss": 3.01389741897583, + "step": 10046, + "token_acc": 0.296225701900419 + }, + { + "epoch": 5.889475227206098, + "grad_norm": 0.17432813970734776, + "learning_rate": 0.0004226610528565943, + "loss": 2.981689929962158, + "step": 10047, + "token_acc": 0.3014212662961011 + }, + { + "epoch": 5.890061565523307, + "grad_norm": 0.20480444213756044, + "learning_rate": 0.0004226435288831811, + "loss": 3.0245089530944824, + "step": 10048, + "token_acc": 0.29495018753230007 + }, + { + "epoch": 5.890647903840516, + "grad_norm": 0.20257929713420675, + "learning_rate": 0.0004226260032880147, + "loss": 2.9766602516174316, + "step": 10049, + "token_acc": 0.3019105475175835 + }, + { + "epoch": 5.891234242157725, + "grad_norm": 0.15983918650224122, + "learning_rate": 0.0004226084760712596, + "loss": 2.996727466583252, + "step": 10050, + "token_acc": 0.30193033319832013 + }, + { + "epoch": 5.891820580474934, + "grad_norm": 0.19353037821900418, + "learning_rate": 0.00042259094723308047, + "loss": 3.0193777084350586, + "step": 10051, + "token_acc": 0.2955161937828894 + }, + { + "epoch": 5.892406918792143, + "grad_norm": 0.1612425404916145, + "learning_rate": 0.0004225734167736419, + "loss": 3.0128660202026367, + "step": 10052, + "token_acc": 0.2970894412577013 + }, + { + "epoch": 5.892993257109352, + "grad_norm": 0.21342947143734461, + "learning_rate": 0.0004225558846931086, + "loss": 3.012537717819214, + "step": 10053, + "token_acc": 0.29938055982488004 + }, + { + "epoch": 5.893579595426561, + "grad_norm": 0.1490338844860703, + "learning_rate": 0.0004225383509916454, + "loss": 2.994819402694702, + "step": 10054, + "token_acc": 0.29935380322468974 + }, + { + "epoch": 5.89416593374377, + "grad_norm": 0.1837830558151019, + "learning_rate": 0.0004225208156694168, + "loss": 2.9809417724609375, + "step": 10055, + "token_acc": 0.3014735496759157 + }, + { + "epoch": 5.8947522720609795, + "grad_norm": 0.14608987894109293, + "learning_rate": 0.00042250327872658767, + "loss": 3.0008561611175537, + "step": 10056, + "token_acc": 0.2980989520495876 + }, + { + "epoch": 5.895338610378188, + "grad_norm": 0.17734471261151086, + "learning_rate": 0.0004224857401633226, + "loss": 3.030193328857422, + "step": 10057, + "token_acc": 0.29490195069226416 + }, + { + "epoch": 5.895924948695397, + "grad_norm": 0.18912485448980312, + "learning_rate": 0.00042246819997978654, + "loss": 3.008021354675293, + "step": 10058, + "token_acc": 0.29973014447970603 + }, + { + "epoch": 5.896511287012606, + "grad_norm": 0.144317069993294, + "learning_rate": 0.0004224506581761441, + "loss": 2.986654281616211, + "step": 10059, + "token_acc": 0.3009143450943002 + }, + { + "epoch": 5.897097625329815, + "grad_norm": 0.166776700108989, + "learning_rate": 0.0004224331147525601, + "loss": 3.02883243560791, + "step": 10060, + "token_acc": 0.29305233086749016 + }, + { + "epoch": 5.897683963647024, + "grad_norm": 0.15621421055863774, + "learning_rate": 0.00042241556970919934, + "loss": 2.978274345397949, + "step": 10061, + "token_acc": 0.3023130335684859 + }, + { + "epoch": 5.898270301964233, + "grad_norm": 0.17391352353632042, + "learning_rate": 0.00042239802304622665, + "loss": 3.0054380893707275, + "step": 10062, + "token_acc": 0.29787828838926617 + }, + { + "epoch": 5.898856640281442, + "grad_norm": 0.16676243594658033, + "learning_rate": 0.0004223804747638068, + "loss": 3.0046634674072266, + "step": 10063, + "token_acc": 0.29894731329213364 + }, + { + "epoch": 5.8994429785986515, + "grad_norm": 0.21260519971642114, + "learning_rate": 0.0004223629248621048, + "loss": 3.021902561187744, + "step": 10064, + "token_acc": 0.29586290435295765 + }, + { + "epoch": 5.900029316915861, + "grad_norm": 0.18957657084814813, + "learning_rate": 0.00042234537334128526, + "loss": 2.99540638923645, + "step": 10065, + "token_acc": 0.3002756596879295 + }, + { + "epoch": 5.90061565523307, + "grad_norm": 0.17104956859457543, + "learning_rate": 0.00042232782020151316, + "loss": 3.010921001434326, + "step": 10066, + "token_acc": 0.29734299516908214 + }, + { + "epoch": 5.901201993550279, + "grad_norm": 0.18837925304786562, + "learning_rate": 0.0004223102654429535, + "loss": 2.971719264984131, + "step": 10067, + "token_acc": 0.30193314300838475 + }, + { + "epoch": 5.901788331867488, + "grad_norm": 0.18217170296890248, + "learning_rate": 0.000422292709065771, + "loss": 3.038811206817627, + "step": 10068, + "token_acc": 0.29390959617552176 + }, + { + "epoch": 5.902374670184696, + "grad_norm": 0.15696668377741466, + "learning_rate": 0.0004222751510701307, + "loss": 3.0397796630859375, + "step": 10069, + "token_acc": 0.29142475743640833 + }, + { + "epoch": 5.902961008501905, + "grad_norm": 0.20711992650186103, + "learning_rate": 0.00042225759145619754, + "loss": 3.0648930072784424, + "step": 10070, + "token_acc": 0.28867804472541797 + }, + { + "epoch": 5.903547346819114, + "grad_norm": 0.1896654278087999, + "learning_rate": 0.0004222400302241364, + "loss": 3.005308151245117, + "step": 10071, + "token_acc": 0.29679277235681417 + }, + { + "epoch": 5.9041336851363235, + "grad_norm": 0.16042788725347884, + "learning_rate": 0.00042222246737411216, + "loss": 2.9597105979919434, + "step": 10072, + "token_acc": 0.30532019781323244 + }, + { + "epoch": 5.904720023453533, + "grad_norm": 0.15987663656730164, + "learning_rate": 0.0004222049029062901, + "loss": 3.018336296081543, + "step": 10073, + "token_acc": 0.2959441717614564 + }, + { + "epoch": 5.905306361770742, + "grad_norm": 0.15060876743971716, + "learning_rate": 0.0004221873368208349, + "loss": 3.0405611991882324, + "step": 10074, + "token_acc": 0.2934697735585419 + }, + { + "epoch": 5.905892700087951, + "grad_norm": 0.1567229842625141, + "learning_rate": 0.0004221697691179118, + "loss": 3.0078377723693848, + "step": 10075, + "token_acc": 0.29613654224218394 + }, + { + "epoch": 5.90647903840516, + "grad_norm": 0.1796836867231402, + "learning_rate": 0.0004221521997976856, + "loss": 3.005740165710449, + "step": 10076, + "token_acc": 0.29786077197436983 + }, + { + "epoch": 5.907065376722369, + "grad_norm": 0.19579248019580392, + "learning_rate": 0.0004221346288603215, + "loss": 2.9656715393066406, + "step": 10077, + "token_acc": 0.3035575736431283 + }, + { + "epoch": 5.907651715039578, + "grad_norm": 0.17121102330562168, + "learning_rate": 0.00042211705630598457, + "loss": 3.025899648666382, + "step": 10078, + "token_acc": 0.29430141006463123 + }, + { + "epoch": 5.908238053356786, + "grad_norm": 0.15890179811204816, + "learning_rate": 0.00042209948213483986, + "loss": 2.960113525390625, + "step": 10079, + "token_acc": 0.30467723001520125 + }, + { + "epoch": 5.9088243916739955, + "grad_norm": 0.17909616112135726, + "learning_rate": 0.00042208190634705235, + "loss": 2.9747910499572754, + "step": 10080, + "token_acc": 0.30368928228380204 + }, + { + "epoch": 5.909410729991205, + "grad_norm": 0.1555424350502692, + "learning_rate": 0.0004220643289427871, + "loss": 2.9822983741760254, + "step": 10081, + "token_acc": 0.30000025906534405 + }, + { + "epoch": 5.909997068308414, + "grad_norm": 0.16369627275082183, + "learning_rate": 0.0004220467499222095, + "loss": 3.0656371116638184, + "step": 10082, + "token_acc": 0.2908480436155713 + }, + { + "epoch": 5.910583406625623, + "grad_norm": 0.2349843770150894, + "learning_rate": 0.00042202916928548454, + "loss": 3.0482265949249268, + "step": 10083, + "token_acc": 0.2924696736831585 + }, + { + "epoch": 5.911169744942832, + "grad_norm": 0.21026545184142623, + "learning_rate": 0.00042201158703277723, + "loss": 3.0417137145996094, + "step": 10084, + "token_acc": 0.29291762272565686 + }, + { + "epoch": 5.911756083260041, + "grad_norm": 0.1606142225559163, + "learning_rate": 0.00042199400316425296, + "loss": 3.072801351547241, + "step": 10085, + "token_acc": 0.28848822806659735 + }, + { + "epoch": 5.91234242157725, + "grad_norm": 0.19337590010188072, + "learning_rate": 0.0004219764176800767, + "loss": 3.010916233062744, + "step": 10086, + "token_acc": 0.297163927495916 + }, + { + "epoch": 5.912928759894459, + "grad_norm": 0.1988391131517098, + "learning_rate": 0.00042195883058041375, + "loss": 3.0219666957855225, + "step": 10087, + "token_acc": 0.29657923308105044 + }, + { + "epoch": 5.913515098211668, + "grad_norm": 0.16708615931787546, + "learning_rate": 0.0004219412418654294, + "loss": 2.984231472015381, + "step": 10088, + "token_acc": 0.30143759327057634 + }, + { + "epoch": 5.9141014365288775, + "grad_norm": 0.18235467398098493, + "learning_rate": 0.00042192365153528866, + "loss": 3.020191192626953, + "step": 10089, + "token_acc": 0.2960278027379883 + }, + { + "epoch": 5.914687774846087, + "grad_norm": 0.1638703920961271, + "learning_rate": 0.0004219060595901569, + "loss": 3.034750461578369, + "step": 10090, + "token_acc": 0.2942687184792448 + }, + { + "epoch": 5.915274113163295, + "grad_norm": 0.16346942959519864, + "learning_rate": 0.0004218884660301994, + "loss": 3.013485908508301, + "step": 10091, + "token_acc": 0.29581557843474576 + }, + { + "epoch": 5.915860451480504, + "grad_norm": 0.17220907253259696, + "learning_rate": 0.0004218708708555814, + "loss": 2.9683408737182617, + "step": 10092, + "token_acc": 0.30378970891766205 + }, + { + "epoch": 5.916446789797713, + "grad_norm": 0.2572340807126346, + "learning_rate": 0.0004218532740664681, + "loss": 3.024446487426758, + "step": 10093, + "token_acc": 0.29551667059489656 + }, + { + "epoch": 5.917033128114922, + "grad_norm": 0.3472846970067426, + "learning_rate": 0.00042183567566302495, + "loss": 3.021104097366333, + "step": 10094, + "token_acc": 0.29650833693036577 + }, + { + "epoch": 5.917619466432131, + "grad_norm": 0.14610202725191773, + "learning_rate": 0.0004218180756454171, + "loss": 2.9883360862731934, + "step": 10095, + "token_acc": 0.2999964331937183 + }, + { + "epoch": 5.91820580474934, + "grad_norm": 0.30741693626857425, + "learning_rate": 0.0004218004740138099, + "loss": 2.999293327331543, + "step": 10096, + "token_acc": 0.29815387438143187 + }, + { + "epoch": 5.9187921430665495, + "grad_norm": 0.18159162380598898, + "learning_rate": 0.0004217828707683689, + "loss": 3.000601291656494, + "step": 10097, + "token_acc": 0.2980950021514349 + }, + { + "epoch": 5.919378481383759, + "grad_norm": 0.24605915103767623, + "learning_rate": 0.00042176526590925924, + "loss": 3.0329198837280273, + "step": 10098, + "token_acc": 0.2942846134086822 + }, + { + "epoch": 5.919964819700968, + "grad_norm": 0.14876797430958535, + "learning_rate": 0.0004217476594366464, + "loss": 2.993206739425659, + "step": 10099, + "token_acc": 0.3004081568864428 + }, + { + "epoch": 5.920551158018176, + "grad_norm": 0.232993958363065, + "learning_rate": 0.0004217300513506957, + "loss": 3.0578975677490234, + "step": 10100, + "token_acc": 0.2902110806314792 + }, + { + "epoch": 5.921137496335385, + "grad_norm": 0.14631291090468782, + "learning_rate": 0.00042171244165157255, + "loss": 3.0388808250427246, + "step": 10101, + "token_acc": 0.29397384056775044 + }, + { + "epoch": 5.921723834652594, + "grad_norm": 0.20128754777790575, + "learning_rate": 0.00042169483033944246, + "loss": 2.991434097290039, + "step": 10102, + "token_acc": 0.2989116559411478 + }, + { + "epoch": 5.922310172969803, + "grad_norm": 0.17051339520599743, + "learning_rate": 0.0004216772174144707, + "loss": 3.0356264114379883, + "step": 10103, + "token_acc": 0.2945274381922584 + }, + { + "epoch": 5.9228965112870124, + "grad_norm": 0.21949858594867216, + "learning_rate": 0.0004216596028768229, + "loss": 3.0281591415405273, + "step": 10104, + "token_acc": 0.2961920224145608 + }, + { + "epoch": 5.923482849604222, + "grad_norm": 0.15615513171713064, + "learning_rate": 0.00042164198672666446, + "loss": 3.053191661834717, + "step": 10105, + "token_acc": 0.2919570777356787 + }, + { + "epoch": 5.924069187921431, + "grad_norm": 0.17428113614385846, + "learning_rate": 0.0004216243689641608, + "loss": 3.0184764862060547, + "step": 10106, + "token_acc": 0.296030211867119 + }, + { + "epoch": 5.92465552623864, + "grad_norm": 0.15108839836139726, + "learning_rate": 0.00042160674958947755, + "loss": 2.9939656257629395, + "step": 10107, + "token_acc": 0.2990989709674276 + }, + { + "epoch": 5.925241864555849, + "grad_norm": 0.17859686426085583, + "learning_rate": 0.00042158912860278, + "loss": 3.020618438720703, + "step": 10108, + "token_acc": 0.2948943824248984 + }, + { + "epoch": 5.925828202873058, + "grad_norm": 0.16219310528037026, + "learning_rate": 0.0004215715060042339, + "loss": 3.0204007625579834, + "step": 10109, + "token_acc": 0.29779476544816275 + }, + { + "epoch": 5.926414541190267, + "grad_norm": 0.17026218455976855, + "learning_rate": 0.00042155388179400464, + "loss": 3.0147199630737305, + "step": 10110, + "token_acc": 0.2972232189682299 + }, + { + "epoch": 5.927000879507476, + "grad_norm": 0.1664879749656846, + "learning_rate": 0.00042153625597225785, + "loss": 2.9834794998168945, + "step": 10111, + "token_acc": 0.3025141344260688 + }, + { + "epoch": 5.927587217824685, + "grad_norm": 0.1610867208063306, + "learning_rate": 0.0004215186285391591, + "loss": 3.011470317840576, + "step": 10112, + "token_acc": 0.29861466189441394 + }, + { + "epoch": 5.928173556141894, + "grad_norm": 0.16211481305449554, + "learning_rate": 0.00042150099949487396, + "loss": 2.9802756309509277, + "step": 10113, + "token_acc": 0.30140612635875225 + }, + { + "epoch": 5.928759894459103, + "grad_norm": 0.18800682230113608, + "learning_rate": 0.00042148336883956805, + "loss": 3.047673225402832, + "step": 10114, + "token_acc": 0.29372559107598045 + }, + { + "epoch": 5.929346232776312, + "grad_norm": 0.21650815408753366, + "learning_rate": 0.0004214657365734069, + "loss": 3.0441136360168457, + "step": 10115, + "token_acc": 0.29390858341860193 + }, + { + "epoch": 5.929932571093521, + "grad_norm": 0.16771390255794055, + "learning_rate": 0.0004214481026965562, + "loss": 2.9964442253112793, + "step": 10116, + "token_acc": 0.2987840838577004 + }, + { + "epoch": 5.93051890941073, + "grad_norm": 0.24865785494638254, + "learning_rate": 0.00042143046720918164, + "loss": 3.0219011306762695, + "step": 10117, + "token_acc": 0.2962811218858909 + }, + { + "epoch": 5.931105247727939, + "grad_norm": 0.3250852109243497, + "learning_rate": 0.00042141283011144895, + "loss": 3.0204553604125977, + "step": 10118, + "token_acc": 0.29671923240849507 + }, + { + "epoch": 5.931691586045148, + "grad_norm": 0.21793301030392387, + "learning_rate": 0.00042139519140352354, + "loss": 3.014747381210327, + "step": 10119, + "token_acc": 0.2974399838101743 + }, + { + "epoch": 5.932277924362357, + "grad_norm": 0.23285777578008057, + "learning_rate": 0.0004213775510855713, + "loss": 3.0208816528320312, + "step": 10120, + "token_acc": 0.29494999845832864 + }, + { + "epoch": 5.9328642626795665, + "grad_norm": 0.23156039871883682, + "learning_rate": 0.000421359909157758, + "loss": 3.0300447940826416, + "step": 10121, + "token_acc": 0.29296029166161774 + }, + { + "epoch": 5.933450600996775, + "grad_norm": 0.16926181555819733, + "learning_rate": 0.0004213422656202491, + "loss": 2.975583553314209, + "step": 10122, + "token_acc": 0.3022562912133464 + }, + { + "epoch": 5.934036939313984, + "grad_norm": 0.21167627465243666, + "learning_rate": 0.0004213246204732106, + "loss": 2.9940714836120605, + "step": 10123, + "token_acc": 0.2998531495555842 + }, + { + "epoch": 5.934623277631193, + "grad_norm": 0.19142962243362605, + "learning_rate": 0.00042130697371680823, + "loss": 3.028719425201416, + "step": 10124, + "token_acc": 0.2949912870685683 + }, + { + "epoch": 5.935209615948402, + "grad_norm": 0.19928050904484318, + "learning_rate": 0.00042128932535120755, + "loss": 2.958505153656006, + "step": 10125, + "token_acc": 0.30484854250606164 + }, + { + "epoch": 5.935795954265611, + "grad_norm": 0.17769264041902896, + "learning_rate": 0.0004212716753765745, + "loss": 2.967970848083496, + "step": 10126, + "token_acc": 0.30144919274807674 + }, + { + "epoch": 5.93638229258282, + "grad_norm": 0.22636247680469265, + "learning_rate": 0.00042125402379307485, + "loss": 2.9788336753845215, + "step": 10127, + "token_acc": 0.3019274073508645 + }, + { + "epoch": 5.936968630900029, + "grad_norm": 0.15558066191604647, + "learning_rate": 0.0004212363706008745, + "loss": 3.0154614448547363, + "step": 10128, + "token_acc": 0.29628383528805663 + }, + { + "epoch": 5.9375549692172385, + "grad_norm": 0.21101759767864942, + "learning_rate": 0.00042121871580013916, + "loss": 3.0050694942474365, + "step": 10129, + "token_acc": 0.2971880539292045 + }, + { + "epoch": 5.938141307534448, + "grad_norm": 0.15956575178319748, + "learning_rate": 0.00042120105939103463, + "loss": 2.990670680999756, + "step": 10130, + "token_acc": 0.29903663866083974 + }, + { + "epoch": 5.938727645851657, + "grad_norm": 0.18672151806483672, + "learning_rate": 0.0004211834013737269, + "loss": 3.01796293258667, + "step": 10131, + "token_acc": 0.2968043773372932 + }, + { + "epoch": 5.939313984168866, + "grad_norm": 0.18281724163599583, + "learning_rate": 0.0004211657417483817, + "loss": 3.025787115097046, + "step": 10132, + "token_acc": 0.29653590344437436 + }, + { + "epoch": 5.939900322486075, + "grad_norm": 0.17068006411827066, + "learning_rate": 0.00042114808051516516, + "loss": 3.026735782623291, + "step": 10133, + "token_acc": 0.2971053215301164 + }, + { + "epoch": 5.940486660803283, + "grad_norm": 0.16124717346598438, + "learning_rate": 0.000421130417674243, + "loss": 2.952767848968506, + "step": 10134, + "token_acc": 0.30526819825264 + }, + { + "epoch": 5.941072999120492, + "grad_norm": 0.18565891505849802, + "learning_rate": 0.0004211127532257811, + "loss": 3.005120277404785, + "step": 10135, + "token_acc": 0.2994960478991464 + }, + { + "epoch": 5.941659337437701, + "grad_norm": 0.18690578146756098, + "learning_rate": 0.00042109508716994544, + "loss": 2.98258113861084, + "step": 10136, + "token_acc": 0.3015057843208205 + }, + { + "epoch": 5.9422456757549105, + "grad_norm": 0.18339414119882347, + "learning_rate": 0.00042107741950690204, + "loss": 3.0450644493103027, + "step": 10137, + "token_acc": 0.291850049189665 + }, + { + "epoch": 5.94283201407212, + "grad_norm": 0.17149843543263027, + "learning_rate": 0.0004210597502368168, + "loss": 3.0039820671081543, + "step": 10138, + "token_acc": 0.2983694493687451 + }, + { + "epoch": 5.943418352389329, + "grad_norm": 0.16643873121497746, + "learning_rate": 0.0004210420793598557, + "loss": 2.9848217964172363, + "step": 10139, + "token_acc": 0.3014339947680725 + }, + { + "epoch": 5.944004690706538, + "grad_norm": 0.1838409624411319, + "learning_rate": 0.00042102440687618475, + "loss": 2.9607200622558594, + "step": 10140, + "token_acc": 0.30480847595762023 + }, + { + "epoch": 5.944591029023747, + "grad_norm": 0.16136157026772766, + "learning_rate": 0.00042100673278596995, + "loss": 3.0131280422210693, + "step": 10141, + "token_acc": 0.29757970524573785 + }, + { + "epoch": 5.945177367340956, + "grad_norm": 0.17821963070933206, + "learning_rate": 0.0004209890570893774, + "loss": 3.0361881256103516, + "step": 10142, + "token_acc": 0.2955748820048396 + }, + { + "epoch": 5.945763705658165, + "grad_norm": 0.1790502665619974, + "learning_rate": 0.000420971379786573, + "loss": 2.9963536262512207, + "step": 10143, + "token_acc": 0.2983562900969223 + }, + { + "epoch": 5.946350043975373, + "grad_norm": 0.17485340806531377, + "learning_rate": 0.0004209537008777229, + "loss": 2.980264663696289, + "step": 10144, + "token_acc": 0.30024540960924473 + }, + { + "epoch": 5.9469363822925825, + "grad_norm": 0.19346446317447036, + "learning_rate": 0.0004209360203629931, + "loss": 2.991318702697754, + "step": 10145, + "token_acc": 0.2998691888687903 + }, + { + "epoch": 5.947522720609792, + "grad_norm": 0.19431975203143642, + "learning_rate": 0.00042091833824254974, + "loss": 3.015496253967285, + "step": 10146, + "token_acc": 0.29626794456355043 + }, + { + "epoch": 5.948109058927001, + "grad_norm": 0.1780084962740806, + "learning_rate": 0.00042090065451655894, + "loss": 3.0227599143981934, + "step": 10147, + "token_acc": 0.29663602921800725 + }, + { + "epoch": 5.94869539724421, + "grad_norm": 0.1606896798211275, + "learning_rate": 0.0004208829691851868, + "loss": 3.0569238662719727, + "step": 10148, + "token_acc": 0.29111936419127854 + }, + { + "epoch": 5.949281735561419, + "grad_norm": 0.16162398875864414, + "learning_rate": 0.0004208652822485994, + "loss": 3.025092601776123, + "step": 10149, + "token_acc": 0.29523710779040685 + }, + { + "epoch": 5.949868073878628, + "grad_norm": 0.14706922334470088, + "learning_rate": 0.0004208475937069629, + "loss": 2.971788167953491, + "step": 10150, + "token_acc": 0.3023514430169077 + }, + { + "epoch": 5.950454412195837, + "grad_norm": 0.16040543950593195, + "learning_rate": 0.0004208299035604435, + "loss": 2.999964714050293, + "step": 10151, + "token_acc": 0.299005364907074 + }, + { + "epoch": 5.951040750513046, + "grad_norm": 0.19364090382557161, + "learning_rate": 0.0004208122118092074, + "loss": 3.0230162143707275, + "step": 10152, + "token_acc": 0.29543316356832383 + }, + { + "epoch": 5.951627088830255, + "grad_norm": 0.19574007473981236, + "learning_rate": 0.00042079451845342065, + "loss": 3.0151703357696533, + "step": 10153, + "token_acc": 0.298155662504063 + }, + { + "epoch": 5.9522134271474645, + "grad_norm": 0.17282084313920243, + "learning_rate": 0.0004207768234932496, + "loss": 3.0512194633483887, + "step": 10154, + "token_acc": 0.29242574409171257 + }, + { + "epoch": 5.952799765464674, + "grad_norm": 0.1728576488477399, + "learning_rate": 0.00042075912692886037, + "loss": 3.0385985374450684, + "step": 10155, + "token_acc": 0.2927045497805811 + }, + { + "epoch": 5.953386103781882, + "grad_norm": 0.20075972573551354, + "learning_rate": 0.0004207414287604193, + "loss": 2.997769832611084, + "step": 10156, + "token_acc": 0.2983158675742021 + }, + { + "epoch": 5.953972442099091, + "grad_norm": 0.19920310925867202, + "learning_rate": 0.0004207237289880925, + "loss": 3.0041701793670654, + "step": 10157, + "token_acc": 0.29838202745746417 + }, + { + "epoch": 5.9545587804163, + "grad_norm": 0.1703959870445464, + "learning_rate": 0.00042070602761204646, + "loss": 3.024268627166748, + "step": 10158, + "token_acc": 0.29591052898824327 + }, + { + "epoch": 5.955145118733509, + "grad_norm": 0.1689078528622826, + "learning_rate": 0.00042068832463244723, + "loss": 3.0222389698028564, + "step": 10159, + "token_acc": 0.29559484162802036 + }, + { + "epoch": 5.955731457050718, + "grad_norm": 0.28107386772482545, + "learning_rate": 0.0004206706200494612, + "loss": 3.0285372734069824, + "step": 10160, + "token_acc": 0.2936767472145854 + }, + { + "epoch": 5.956317795367927, + "grad_norm": 0.32293390818038675, + "learning_rate": 0.00042065291386325464, + "loss": 3.0013487339019775, + "step": 10161, + "token_acc": 0.29854484795392194 + }, + { + "epoch": 5.9569041336851365, + "grad_norm": 0.17980249612167645, + "learning_rate": 0.00042063520607399396, + "loss": 3.004537343978882, + "step": 10162, + "token_acc": 0.29593235528852146 + }, + { + "epoch": 5.957490472002346, + "grad_norm": 0.2835992235177475, + "learning_rate": 0.0004206174966818455, + "loss": 2.979393720626831, + "step": 10163, + "token_acc": 0.3017877687146938 + }, + { + "epoch": 5.958076810319555, + "grad_norm": 0.31382247076774583, + "learning_rate": 0.00042059978568697546, + "loss": 3.059453010559082, + "step": 10164, + "token_acc": 0.290568353575525 + }, + { + "epoch": 5.958663148636763, + "grad_norm": 0.18438570128047704, + "learning_rate": 0.00042058207308955044, + "loss": 3.0478997230529785, + "step": 10165, + "token_acc": 0.2905588197996365 + }, + { + "epoch": 5.959249486953972, + "grad_norm": 0.21976389456502318, + "learning_rate": 0.0004205643588897366, + "loss": 3.006145477294922, + "step": 10166, + "token_acc": 0.29787883912952384 + }, + { + "epoch": 5.959835825271181, + "grad_norm": 0.18165579240596308, + "learning_rate": 0.00042054664308770055, + "loss": 3.003462314605713, + "step": 10167, + "token_acc": 0.29640935719124856 + }, + { + "epoch": 5.96042216358839, + "grad_norm": 0.17607163701784415, + "learning_rate": 0.0004205289256836085, + "loss": 2.9869227409362793, + "step": 10168, + "token_acc": 0.29992468753198154 + }, + { + "epoch": 5.961008501905599, + "grad_norm": 0.16877812078672985, + "learning_rate": 0.000420511206677627, + "loss": 2.9978489875793457, + "step": 10169, + "token_acc": 0.29977983518794493 + }, + { + "epoch": 5.9615948402228085, + "grad_norm": 0.1640217566138485, + "learning_rate": 0.00042049348606992257, + "loss": 3.0411765575408936, + "step": 10170, + "token_acc": 0.29139203900812144 + }, + { + "epoch": 5.962181178540018, + "grad_norm": 0.16581981663744633, + "learning_rate": 0.0004204757638606614, + "loss": 3.007805824279785, + "step": 10171, + "token_acc": 0.29770460603447824 + }, + { + "epoch": 5.962767516857227, + "grad_norm": 0.15590179006372804, + "learning_rate": 0.0004204580400500103, + "loss": 3.007822036743164, + "step": 10172, + "token_acc": 0.29901279519987695 + }, + { + "epoch": 5.963353855174436, + "grad_norm": 0.15782226567640084, + "learning_rate": 0.0004204403146381356, + "loss": 3.0108377933502197, + "step": 10173, + "token_acc": 0.2988230263453107 + }, + { + "epoch": 5.963940193491645, + "grad_norm": 0.15488764902337943, + "learning_rate": 0.0004204225876252038, + "loss": 3.0299644470214844, + "step": 10174, + "token_acc": 0.2947123917035231 + }, + { + "epoch": 5.964526531808854, + "grad_norm": 0.16047125951682964, + "learning_rate": 0.0004204048590113814, + "loss": 2.986888885498047, + "step": 10175, + "token_acc": 0.29897792778625026 + }, + { + "epoch": 5.965112870126063, + "grad_norm": 0.148887769167955, + "learning_rate": 0.000420387128796835, + "loss": 3.0119035243988037, + "step": 10176, + "token_acc": 0.29797941348650275 + }, + { + "epoch": 5.965699208443271, + "grad_norm": 0.17127556745838854, + "learning_rate": 0.00042036939698173115, + "loss": 3.0405898094177246, + "step": 10177, + "token_acc": 0.2927879059776772 + }, + { + "epoch": 5.9662855467604805, + "grad_norm": 0.15769680984664589, + "learning_rate": 0.0004203516635662363, + "loss": 3.0122013092041016, + "step": 10178, + "token_acc": 0.2955767260590826 + }, + { + "epoch": 5.96687188507769, + "grad_norm": 0.14976668413094577, + "learning_rate": 0.00042033392855051724, + "loss": 2.9921820163726807, + "step": 10179, + "token_acc": 0.30129953683241495 + }, + { + "epoch": 5.967458223394899, + "grad_norm": 0.16479827443943812, + "learning_rate": 0.00042031619193474035, + "loss": 2.9297056198120117, + "step": 10180, + "token_acc": 0.3078925066437883 + }, + { + "epoch": 5.968044561712108, + "grad_norm": 0.16215011606989627, + "learning_rate": 0.0004202984537190724, + "loss": 2.997084140777588, + "step": 10181, + "token_acc": 0.2976976694716117 + }, + { + "epoch": 5.968630900029317, + "grad_norm": 0.16104988354849414, + "learning_rate": 0.00042028071390367997, + "loss": 3.022481918334961, + "step": 10182, + "token_acc": 0.29570990134221997 + }, + { + "epoch": 5.969217238346526, + "grad_norm": 0.16811743541530252, + "learning_rate": 0.0004202629724887297, + "loss": 3.023376703262329, + "step": 10183, + "token_acc": 0.29402472582359473 + }, + { + "epoch": 5.969803576663735, + "grad_norm": 0.16818534641799762, + "learning_rate": 0.00042024522947438814, + "loss": 2.993215560913086, + "step": 10184, + "token_acc": 0.3017783467722414 + }, + { + "epoch": 5.970389914980944, + "grad_norm": 0.1618383286085388, + "learning_rate": 0.0004202274848608221, + "loss": 3.0567498207092285, + "step": 10185, + "token_acc": 0.2902952910878935 + }, + { + "epoch": 5.970976253298153, + "grad_norm": 0.1693569419786793, + "learning_rate": 0.0004202097386481982, + "loss": 3.079111099243164, + "step": 10186, + "token_acc": 0.28904556467037873 + }, + { + "epoch": 5.971562591615362, + "grad_norm": 0.20173180485046935, + "learning_rate": 0.00042019199083668325, + "loss": 3.0212135314941406, + "step": 10187, + "token_acc": 0.29545552346405907 + }, + { + "epoch": 5.972148929932571, + "grad_norm": 0.19181097673967357, + "learning_rate": 0.0004201742414264439, + "loss": 2.9974780082702637, + "step": 10188, + "token_acc": 0.29880486415585367 + }, + { + "epoch": 5.97273526824978, + "grad_norm": 0.15255212681466096, + "learning_rate": 0.00042015649041764674, + "loss": 2.981656312942505, + "step": 10189, + "token_acc": 0.30045094467678723 + }, + { + "epoch": 5.973321606566989, + "grad_norm": 0.16750701396309994, + "learning_rate": 0.0004201387378104587, + "loss": 2.9705629348754883, + "step": 10190, + "token_acc": 0.3016007472713081 + }, + { + "epoch": 5.973907944884198, + "grad_norm": 0.1988683883181944, + "learning_rate": 0.0004201209836050465, + "loss": 3.028747320175171, + "step": 10191, + "token_acc": 0.2945310645033637 + }, + { + "epoch": 5.974494283201407, + "grad_norm": 0.23337242869419822, + "learning_rate": 0.0004201032278015769, + "loss": 3.009829044342041, + "step": 10192, + "token_acc": 0.2967162890159148 + }, + { + "epoch": 5.975080621518616, + "grad_norm": 0.19976368089196495, + "learning_rate": 0.00042008547040021666, + "loss": 3.0155391693115234, + "step": 10193, + "token_acc": 0.2975462600661399 + }, + { + "epoch": 5.975666959835825, + "grad_norm": 0.17116752011754288, + "learning_rate": 0.00042006771140113265, + "loss": 3.0229172706604004, + "step": 10194, + "token_acc": 0.29530639677794174 + }, + { + "epoch": 5.9762532981530345, + "grad_norm": 0.15143046214575706, + "learning_rate": 0.0004200499508044916, + "loss": 3.013610363006592, + "step": 10195, + "token_acc": 0.29653473215704973 + }, + { + "epoch": 5.976839636470244, + "grad_norm": 0.17577597328590638, + "learning_rate": 0.00042003218861046045, + "loss": 3.0347273349761963, + "step": 10196, + "token_acc": 0.2939757967069533 + }, + { + "epoch": 5.977425974787453, + "grad_norm": 0.18523430910513905, + "learning_rate": 0.00042001442481920604, + "loss": 3.039219856262207, + "step": 10197, + "token_acc": 0.294815289470921 + }, + { + "epoch": 5.978012313104662, + "grad_norm": 0.20451882411876415, + "learning_rate": 0.0004199966594308952, + "loss": 3.0615601539611816, + "step": 10198, + "token_acc": 0.29018676938236215 + }, + { + "epoch": 5.97859865142187, + "grad_norm": 0.15991013191907558, + "learning_rate": 0.00041997889244569476, + "loss": 3.0142476558685303, + "step": 10199, + "token_acc": 0.2987742080361775 + }, + { + "epoch": 5.979184989739079, + "grad_norm": 0.21747726175218823, + "learning_rate": 0.0004199611238637717, + "loss": 3.0526537895202637, + "step": 10200, + "token_acc": 0.29054927429949196 + }, + { + "epoch": 5.979771328056288, + "grad_norm": 0.27418772009522324, + "learning_rate": 0.00041994335368529295, + "loss": 2.94643235206604, + "step": 10201, + "token_acc": 0.3055137355364387 + }, + { + "epoch": 5.980357666373497, + "grad_norm": 0.1833981945104493, + "learning_rate": 0.0004199255819104254, + "loss": 3.0539803504943848, + "step": 10202, + "token_acc": 0.2897983020547229 + }, + { + "epoch": 5.9809440046907065, + "grad_norm": 0.22275993132558394, + "learning_rate": 0.00041990780853933587, + "loss": 3.0202012062072754, + "step": 10203, + "token_acc": 0.29544636685367226 + }, + { + "epoch": 5.981530343007916, + "grad_norm": 0.2870112044490932, + "learning_rate": 0.0004198900335721916, + "loss": 3.0273287296295166, + "step": 10204, + "token_acc": 0.2956271121187492 + }, + { + "epoch": 5.982116681325125, + "grad_norm": 0.17647883945140513, + "learning_rate": 0.00041987225700915924, + "loss": 2.98618745803833, + "step": 10205, + "token_acc": 0.301177848636689 + }, + { + "epoch": 5.982703019642334, + "grad_norm": 0.2654677882195503, + "learning_rate": 0.000419854478850406, + "loss": 3.0113375186920166, + "step": 10206, + "token_acc": 0.2971034440570135 + }, + { + "epoch": 5.983289357959543, + "grad_norm": 0.17716159808297402, + "learning_rate": 0.00041983669909609886, + "loss": 3.0044643878936768, + "step": 10207, + "token_acc": 0.298307702375499 + }, + { + "epoch": 5.983875696276751, + "grad_norm": 0.2056691154845506, + "learning_rate": 0.00041981891774640467, + "loss": 3.036376953125, + "step": 10208, + "token_acc": 0.29454017585161674 + }, + { + "epoch": 5.98446203459396, + "grad_norm": 0.16510792824010562, + "learning_rate": 0.0004198011348014907, + "loss": 3.0273358821868896, + "step": 10209, + "token_acc": 0.29516528067347964 + }, + { + "epoch": 5.985048372911169, + "grad_norm": 0.21722174258949137, + "learning_rate": 0.0004197833502615238, + "loss": 3.0159785747528076, + "step": 10210, + "token_acc": 0.2960712583193087 + }, + { + "epoch": 5.9856347112283785, + "grad_norm": 0.19078106039372017, + "learning_rate": 0.00041976556412667116, + "loss": 3.020984411239624, + "step": 10211, + "token_acc": 0.29661512656216193 + }, + { + "epoch": 5.986221049545588, + "grad_norm": 0.2225403146966675, + "learning_rate": 0.0004197477763970998, + "loss": 3.043126106262207, + "step": 10212, + "token_acc": 0.2935525175575876 + }, + { + "epoch": 5.986807387862797, + "grad_norm": 0.19659209912739298, + "learning_rate": 0.0004197299870729768, + "loss": 2.9881772994995117, + "step": 10213, + "token_acc": 0.3005287119611131 + }, + { + "epoch": 5.987393726180006, + "grad_norm": 0.17777550847436593, + "learning_rate": 0.0004197121961544693, + "loss": 3.027704954147339, + "step": 10214, + "token_acc": 0.29740364995270124 + }, + { + "epoch": 5.987980064497215, + "grad_norm": 0.1668465078725287, + "learning_rate": 0.0004196944036417444, + "loss": 3.013617753982544, + "step": 10215, + "token_acc": 0.2955202540029648 + }, + { + "epoch": 5.988566402814424, + "grad_norm": 0.19592716842153865, + "learning_rate": 0.0004196766095349692, + "loss": 3.01521372795105, + "step": 10216, + "token_acc": 0.29671640869898397 + }, + { + "epoch": 5.989152741131633, + "grad_norm": 0.1666962793766802, + "learning_rate": 0.000419658813834311, + "loss": 3.039407730102539, + "step": 10217, + "token_acc": 0.2947220447953425 + }, + { + "epoch": 5.989739079448842, + "grad_norm": 0.19626625633076203, + "learning_rate": 0.0004196410165399367, + "loss": 2.996209144592285, + "step": 10218, + "token_acc": 0.2984555349208961 + }, + { + "epoch": 5.990325417766051, + "grad_norm": 0.16703075487887545, + "learning_rate": 0.00041962321765201375, + "loss": 3.010937213897705, + "step": 10219, + "token_acc": 0.2978081515384177 + }, + { + "epoch": 5.99091175608326, + "grad_norm": 0.16751913684880004, + "learning_rate": 0.00041960541717070925, + "loss": 2.983689069747925, + "step": 10220, + "token_acc": 0.301797365184583 + }, + { + "epoch": 5.991498094400469, + "grad_norm": 0.18181719918290598, + "learning_rate": 0.00041958761509619036, + "loss": 2.9922404289245605, + "step": 10221, + "token_acc": 0.29966833616167454 + }, + { + "epoch": 5.992084432717678, + "grad_norm": 0.16532789154897104, + "learning_rate": 0.00041956981142862444, + "loss": 3.0589213371276855, + "step": 10222, + "token_acc": 0.29111120215920555 + }, + { + "epoch": 5.992670771034887, + "grad_norm": 0.18308493192581085, + "learning_rate": 0.00041955200616817855, + "loss": 2.977748394012451, + "step": 10223, + "token_acc": 0.30077948412418437 + }, + { + "epoch": 5.993257109352096, + "grad_norm": 0.18598531445637626, + "learning_rate": 0.00041953419931502005, + "loss": 2.9840216636657715, + "step": 10224, + "token_acc": 0.3001503278928953 + }, + { + "epoch": 5.993843447669305, + "grad_norm": 0.17486518973025356, + "learning_rate": 0.00041951639086931623, + "loss": 2.999925374984741, + "step": 10225, + "token_acc": 0.29937232384885126 + }, + { + "epoch": 5.994429785986514, + "grad_norm": 0.18042436118828006, + "learning_rate": 0.0004194985808312343, + "loss": 3.027662515640259, + "step": 10226, + "token_acc": 0.2963155760754411 + }, + { + "epoch": 5.995016124303723, + "grad_norm": 0.1998489285459203, + "learning_rate": 0.00041948076920094167, + "loss": 3.0158166885375977, + "step": 10227, + "token_acc": 0.29682251475261007 + }, + { + "epoch": 5.9956024626209325, + "grad_norm": 0.15512991124024128, + "learning_rate": 0.0004194629559786055, + "loss": 2.97990083694458, + "step": 10228, + "token_acc": 0.3021337693097738 + }, + { + "epoch": 5.996188800938142, + "grad_norm": 0.21406460170670397, + "learning_rate": 0.0004194451411643933, + "loss": 3.0143604278564453, + "step": 10229, + "token_acc": 0.29750987980842875 + }, + { + "epoch": 5.99677513925535, + "grad_norm": 0.18499744631889353, + "learning_rate": 0.0004194273247584722, + "loss": 3.0046916007995605, + "step": 10230, + "token_acc": 0.29774000594061434 + }, + { + "epoch": 5.997361477572559, + "grad_norm": 0.15606292161293817, + "learning_rate": 0.0004194095067610099, + "loss": 3.0212793350219727, + "step": 10231, + "token_acc": 0.2963970001104033 + }, + { + "epoch": 5.997947815889768, + "grad_norm": 0.16688105034440717, + "learning_rate": 0.0004193916871721734, + "loss": 3.0289816856384277, + "step": 10232, + "token_acc": 0.2940353026094184 + }, + { + "epoch": 5.998534154206977, + "grad_norm": 0.18476601808057938, + "learning_rate": 0.0004193738659921303, + "loss": 2.9925098419189453, + "step": 10233, + "token_acc": 0.3007105849472005 + }, + { + "epoch": 5.999120492524186, + "grad_norm": 0.17962035332060558, + "learning_rate": 0.0004193560432210479, + "loss": 3.032536745071411, + "step": 10234, + "token_acc": 0.29506226324381724 + }, + { + "epoch": 5.999706830841395, + "grad_norm": 0.16718451680398844, + "learning_rate": 0.00041933821885909383, + "loss": 3.0617456436157227, + "step": 10235, + "token_acc": 0.29062680935172797 + }, + { + "epoch": 6.0, + "grad_norm": 0.21063145652853174, + "learning_rate": 0.0004193203929064353, + "loss": 3.056640625, + "step": 10236, + "token_acc": 0.2909936897151622 + }, + { + "epoch": 6.0, + "eval_loss": 3.0649304389953613, + "eval_runtime": 6.4806, + "eval_samples_per_second": 39.503, + "eval_steps_per_second": 4.938, + "eval_token_acc": 0.29070042328995666, + "step": 10236 + }, + { + "epoch": 6.000586338317209, + "grad_norm": 0.1732140922757181, + "learning_rate": 0.00041930256536323987, + "loss": 2.9602222442626953, + "step": 10237, + "token_acc": 0.3034973715878143 + }, + { + "epoch": 6.001172676634418, + "grad_norm": 0.23073474418582104, + "learning_rate": 0.0004192847362296749, + "loss": 2.8903746604919434, + "step": 10238, + "token_acc": 0.3131769465357366 + }, + { + "epoch": 6.001759014951627, + "grad_norm": 0.3609871729948302, + "learning_rate": 0.00041926690550590795, + "loss": 2.9143409729003906, + "step": 10239, + "token_acc": 0.3083252396049039 + }, + { + "epoch": 6.0023453532688364, + "grad_norm": 0.3052491497679607, + "learning_rate": 0.0004192490731921066, + "loss": 2.9281511306762695, + "step": 10240, + "token_acc": 0.30790565018847943 + }, + { + "epoch": 6.002931691586046, + "grad_norm": 0.1671651248234393, + "learning_rate": 0.0004192312392884382, + "loss": 2.920990228652954, + "step": 10241, + "token_acc": 0.30904523285274377 + }, + { + "epoch": 6.003518029903254, + "grad_norm": 0.25736012696735594, + "learning_rate": 0.00041921340379507045, + "loss": 2.9117870330810547, + "step": 10242, + "token_acc": 0.3114368728339256 + }, + { + "epoch": 6.004104368220463, + "grad_norm": 0.20279930532446444, + "learning_rate": 0.0004191955667121707, + "loss": 2.939326047897339, + "step": 10243, + "token_acc": 0.3078686079619528 + }, + { + "epoch": 6.004690706537672, + "grad_norm": 0.22137151923084947, + "learning_rate": 0.0004191777280399066, + "loss": 2.911097526550293, + "step": 10244, + "token_acc": 0.3088605357291407 + }, + { + "epoch": 6.005277044854881, + "grad_norm": 0.1829015509180036, + "learning_rate": 0.0004191598877784457, + "loss": 2.9374403953552246, + "step": 10245, + "token_acc": 0.30779441422095577 + }, + { + "epoch": 6.00586338317209, + "grad_norm": 0.20160660293676944, + "learning_rate": 0.00041914204592795567, + "loss": 2.9247970581054688, + "step": 10246, + "token_acc": 0.3088559747792386 + }, + { + "epoch": 6.006449721489299, + "grad_norm": 0.20762467217449807, + "learning_rate": 0.00041912420248860395, + "loss": 2.9592928886413574, + "step": 10247, + "token_acc": 0.30244030319147597 + }, + { + "epoch": 6.0070360598065085, + "grad_norm": 0.19888383328183237, + "learning_rate": 0.00041910635746055837, + "loss": 2.8911755084991455, + "step": 10248, + "token_acc": 0.31474624841752874 + }, + { + "epoch": 6.007622398123718, + "grad_norm": 0.18415535820961962, + "learning_rate": 0.00041908851084398633, + "loss": 2.9320225715637207, + "step": 10249, + "token_acc": 0.3070092871485944 + }, + { + "epoch": 6.008208736440927, + "grad_norm": 0.20120319718821617, + "learning_rate": 0.00041907066263905556, + "loss": 2.929243564605713, + "step": 10250, + "token_acc": 0.30765369736030607 + }, + { + "epoch": 6.008795074758136, + "grad_norm": 0.1818432437470063, + "learning_rate": 0.0004190528128459339, + "loss": 2.9245409965515137, + "step": 10251, + "token_acc": 0.3074367499610369 + }, + { + "epoch": 6.009381413075345, + "grad_norm": 0.20991402729349298, + "learning_rate": 0.00041903496146478863, + "loss": 2.9130775928497314, + "step": 10252, + "token_acc": 0.30931196994038246 + }, + { + "epoch": 6.009967751392553, + "grad_norm": 0.18934900162091142, + "learning_rate": 0.0004190171084957878, + "loss": 2.9686741828918457, + "step": 10253, + "token_acc": 0.30122390583338754 + }, + { + "epoch": 6.010554089709762, + "grad_norm": 0.20253345313546836, + "learning_rate": 0.00041899925393909906, + "loss": 2.9493823051452637, + "step": 10254, + "token_acc": 0.3041237941691618 + }, + { + "epoch": 6.011140428026971, + "grad_norm": 0.1700873310626552, + "learning_rate": 0.00041898139779489, + "loss": 2.949160575866699, + "step": 10255, + "token_acc": 0.3051789116863703 + }, + { + "epoch": 6.0117267663441805, + "grad_norm": 0.18633794896729666, + "learning_rate": 0.0004189635400633284, + "loss": 2.900892496109009, + "step": 10256, + "token_acc": 0.31239673819057373 + }, + { + "epoch": 6.01231310466139, + "grad_norm": 0.18970582546479434, + "learning_rate": 0.00041894568074458196, + "loss": 2.8754091262817383, + "step": 10257, + "token_acc": 0.3153083741832259 + }, + { + "epoch": 6.012899442978599, + "grad_norm": 0.18164607035165228, + "learning_rate": 0.00041892781983881856, + "loss": 2.9724740982055664, + "step": 10258, + "token_acc": 0.300861901301041 + }, + { + "epoch": 6.013485781295808, + "grad_norm": 0.1899247818119006, + "learning_rate": 0.00041890995734620597, + "loss": 2.908720016479492, + "step": 10259, + "token_acc": 0.31182330349692394 + }, + { + "epoch": 6.014072119613017, + "grad_norm": 0.16423428634503887, + "learning_rate": 0.0004188920932669119, + "loss": 2.943769931793213, + "step": 10260, + "token_acc": 0.30470131059553685 + }, + { + "epoch": 6.014658457930226, + "grad_norm": 0.15987445710260936, + "learning_rate": 0.0004188742276011042, + "loss": 2.970900774002075, + "step": 10261, + "token_acc": 0.3021199469882295 + }, + { + "epoch": 6.015244796247435, + "grad_norm": 0.180316375322308, + "learning_rate": 0.0004188563603489507, + "loss": 2.9571022987365723, + "step": 10262, + "token_acc": 0.30539840561949644 + }, + { + "epoch": 6.015831134564644, + "grad_norm": 0.18304974203301286, + "learning_rate": 0.00041883849151061925, + "loss": 2.9161009788513184, + "step": 10263, + "token_acc": 0.31026708882378445 + }, + { + "epoch": 6.0164174728818525, + "grad_norm": 0.16850110187330536, + "learning_rate": 0.0004188206210862776, + "loss": 2.971259117126465, + "step": 10264, + "token_acc": 0.3013107725368899 + }, + { + "epoch": 6.017003811199062, + "grad_norm": 0.20797054182622418, + "learning_rate": 0.0004188027490760938, + "loss": 2.919766902923584, + "step": 10265, + "token_acc": 0.30772082018927444 + }, + { + "epoch": 6.017590149516271, + "grad_norm": 0.17961505033452962, + "learning_rate": 0.00041878487548023557, + "loss": 2.9463272094726562, + "step": 10266, + "token_acc": 0.30418695861796236 + }, + { + "epoch": 6.01817648783348, + "grad_norm": 0.18104506752635316, + "learning_rate": 0.0004187670002988709, + "loss": 2.926412582397461, + "step": 10267, + "token_acc": 0.30650189840204967 + }, + { + "epoch": 6.018762826150689, + "grad_norm": 0.1771857256890836, + "learning_rate": 0.0004187491235321678, + "loss": 2.954050302505493, + "step": 10268, + "token_acc": 0.30280952002644734 + }, + { + "epoch": 6.019349164467898, + "grad_norm": 0.16411790394919737, + "learning_rate": 0.0004187312451802939, + "loss": 2.9356212615966797, + "step": 10269, + "token_acc": 0.30725775640603387 + }, + { + "epoch": 6.019935502785107, + "grad_norm": 0.15502051172818598, + "learning_rate": 0.0004187133652434174, + "loss": 2.9147231578826904, + "step": 10270, + "token_acc": 0.30916826583830875 + }, + { + "epoch": 6.020521841102316, + "grad_norm": 0.16839230176558526, + "learning_rate": 0.0004186954837217062, + "loss": 2.9511795043945312, + "step": 10271, + "token_acc": 0.30318921901351265 + }, + { + "epoch": 6.021108179419525, + "grad_norm": 0.17475458469242688, + "learning_rate": 0.0004186776006153281, + "loss": 2.899600028991699, + "step": 10272, + "token_acc": 0.3110405490186036 + }, + { + "epoch": 6.0216945177367345, + "grad_norm": 0.22619432671172965, + "learning_rate": 0.0004186597159244514, + "loss": 2.8895201683044434, + "step": 10273, + "token_acc": 0.3129109439648905 + }, + { + "epoch": 6.022280856053943, + "grad_norm": 0.19186210027787526, + "learning_rate": 0.0004186418296492439, + "loss": 2.899771213531494, + "step": 10274, + "token_acc": 0.31199427490776815 + }, + { + "epoch": 6.022867194371152, + "grad_norm": 0.16170970079912325, + "learning_rate": 0.0004186239417898736, + "loss": 2.9532153606414795, + "step": 10275, + "token_acc": 0.30277884424439 + }, + { + "epoch": 6.023453532688361, + "grad_norm": 0.16659249526852862, + "learning_rate": 0.0004186060523465087, + "loss": 2.9661145210266113, + "step": 10276, + "token_acc": 0.30138699195281704 + }, + { + "epoch": 6.02403987100557, + "grad_norm": 0.1550768826486709, + "learning_rate": 0.00041858816131931697, + "loss": 2.931694507598877, + "step": 10277, + "token_acc": 0.30713176000591846 + }, + { + "epoch": 6.024626209322779, + "grad_norm": 0.2023897108379906, + "learning_rate": 0.0004185702687084668, + "loss": 2.952353000640869, + "step": 10278, + "token_acc": 0.3042075776010866 + }, + { + "epoch": 6.025212547639988, + "grad_norm": 0.2737210806263657, + "learning_rate": 0.0004185523745141261, + "loss": 2.9173951148986816, + "step": 10279, + "token_acc": 0.3083536401011936 + }, + { + "epoch": 6.025798885957197, + "grad_norm": 0.24692656329680757, + "learning_rate": 0.0004185344787364629, + "loss": 2.926464080810547, + "step": 10280, + "token_acc": 0.30737464055580044 + }, + { + "epoch": 6.0263852242744065, + "grad_norm": 0.1766469610697834, + "learning_rate": 0.0004185165813756454, + "loss": 2.9276132583618164, + "step": 10281, + "token_acc": 0.30764688815478036 + }, + { + "epoch": 6.026971562591616, + "grad_norm": 0.20095621938661906, + "learning_rate": 0.0004184986824318416, + "loss": 2.943746328353882, + "step": 10282, + "token_acc": 0.30490956730965296 + }, + { + "epoch": 6.027557900908825, + "grad_norm": 0.20193416637082162, + "learning_rate": 0.00041848078190521987, + "loss": 2.946881055831909, + "step": 10283, + "token_acc": 0.30474138720848964 + }, + { + "epoch": 6.028144239226034, + "grad_norm": 0.17148757224654243, + "learning_rate": 0.0004184628797959482, + "loss": 2.9447832107543945, + "step": 10284, + "token_acc": 0.3054198110436394 + }, + { + "epoch": 6.028730577543242, + "grad_norm": 0.22323579976070798, + "learning_rate": 0.0004184449761041947, + "loss": 2.942300796508789, + "step": 10285, + "token_acc": 0.3051546712345452 + }, + { + "epoch": 6.029316915860451, + "grad_norm": 0.15810423850894004, + "learning_rate": 0.00041842707083012776, + "loss": 2.872720718383789, + "step": 10286, + "token_acc": 0.3152277343563658 + }, + { + "epoch": 6.02990325417766, + "grad_norm": 0.25053218335349375, + "learning_rate": 0.00041840916397391535, + "loss": 2.9371237754821777, + "step": 10287, + "token_acc": 0.30627491885652564 + }, + { + "epoch": 6.030489592494869, + "grad_norm": 0.2464954453440709, + "learning_rate": 0.0004183912555357259, + "loss": 2.939788818359375, + "step": 10288, + "token_acc": 0.30507402905417785 + }, + { + "epoch": 6.0310759308120785, + "grad_norm": 0.14457853428184936, + "learning_rate": 0.00041837334551572734, + "loss": 2.9254703521728516, + "step": 10289, + "token_acc": 0.3081608453305416 + }, + { + "epoch": 6.031662269129288, + "grad_norm": 0.1888141336440917, + "learning_rate": 0.0004183554339140882, + "loss": 2.918151378631592, + "step": 10290, + "token_acc": 0.3088473701709576 + }, + { + "epoch": 6.032248607446497, + "grad_norm": 0.15168523100534667, + "learning_rate": 0.0004183375207309766, + "loss": 2.971987724304199, + "step": 10291, + "token_acc": 0.30259228974709307 + }, + { + "epoch": 6.032834945763706, + "grad_norm": 0.1994691072720343, + "learning_rate": 0.00041831960596656084, + "loss": 2.924440860748291, + "step": 10292, + "token_acc": 0.30785049546177956 + }, + { + "epoch": 6.033421284080915, + "grad_norm": 0.16615673005090645, + "learning_rate": 0.0004183016896210091, + "loss": 2.9166624546051025, + "step": 10293, + "token_acc": 0.3078486012521779 + }, + { + "epoch": 6.034007622398124, + "grad_norm": 0.157124867727336, + "learning_rate": 0.0004182837716944899, + "loss": 2.941511631011963, + "step": 10294, + "token_acc": 0.3049785646619318 + }, + { + "epoch": 6.034593960715333, + "grad_norm": 0.1539725454567096, + "learning_rate": 0.0004182658521871714, + "loss": 2.9362523555755615, + "step": 10295, + "token_acc": 0.3079132987746913 + }, + { + "epoch": 6.035180299032541, + "grad_norm": 0.1719964590210007, + "learning_rate": 0.0004182479310992219, + "loss": 2.915773391723633, + "step": 10296, + "token_acc": 0.3092828439606234 + }, + { + "epoch": 6.0357666373497505, + "grad_norm": 0.15213747989747783, + "learning_rate": 0.0004182300084308099, + "loss": 2.931304454803467, + "step": 10297, + "token_acc": 0.307035988370769 + }, + { + "epoch": 6.03635297566696, + "grad_norm": 0.1650825013505614, + "learning_rate": 0.00041821208418210356, + "loss": 2.979233980178833, + "step": 10298, + "token_acc": 0.30156367523059263 + }, + { + "epoch": 6.036939313984169, + "grad_norm": 0.1728601046774603, + "learning_rate": 0.00041819415835327146, + "loss": 2.950711488723755, + "step": 10299, + "token_acc": 0.30277904863654603 + }, + { + "epoch": 6.037525652301378, + "grad_norm": 0.16217665300653067, + "learning_rate": 0.00041817623094448183, + "loss": 2.903207778930664, + "step": 10300, + "token_acc": 0.3144722782231533 + }, + { + "epoch": 6.038111990618587, + "grad_norm": 0.1684995085779338, + "learning_rate": 0.0004181583019559031, + "loss": 2.93789005279541, + "step": 10301, + "token_acc": 0.30763651164732403 + }, + { + "epoch": 6.038698328935796, + "grad_norm": 0.15511565110013817, + "learning_rate": 0.0004181403713877038, + "loss": 2.90281343460083, + "step": 10302, + "token_acc": 0.31053696589342794 + }, + { + "epoch": 6.039284667253005, + "grad_norm": 0.1619090760156598, + "learning_rate": 0.0004181224392400522, + "loss": 2.907012462615967, + "step": 10303, + "token_acc": 0.3120054145446404 + }, + { + "epoch": 6.039871005570214, + "grad_norm": 0.15782127629841022, + "learning_rate": 0.00041810450551311686, + "loss": 2.9243760108947754, + "step": 10304, + "token_acc": 0.3067929490908171 + }, + { + "epoch": 6.040457343887423, + "grad_norm": 0.15300743518901036, + "learning_rate": 0.0004180865702070662, + "loss": 2.9212276935577393, + "step": 10305, + "token_acc": 0.30850843719963017 + }, + { + "epoch": 6.0410436822046325, + "grad_norm": 0.1648611663087758, + "learning_rate": 0.00041806863332206873, + "loss": 2.9567737579345703, + "step": 10306, + "token_acc": 0.30360432281049854 + }, + { + "epoch": 6.041630020521841, + "grad_norm": 0.18277209953764542, + "learning_rate": 0.00041805069485829297, + "loss": 2.9483766555786133, + "step": 10307, + "token_acc": 0.30584636001693316 + }, + { + "epoch": 6.04221635883905, + "grad_norm": 0.1774457935808168, + "learning_rate": 0.0004180327548159073, + "loss": 2.8721115589141846, + "step": 10308, + "token_acc": 0.3162860942198818 + }, + { + "epoch": 6.042802697156259, + "grad_norm": 0.17867244998495968, + "learning_rate": 0.00041801481319508036, + "loss": 2.9132578372955322, + "step": 10309, + "token_acc": 0.3100189566446484 + }, + { + "epoch": 6.043389035473468, + "grad_norm": 0.1950571612596303, + "learning_rate": 0.00041799686999598076, + "loss": 2.916032314300537, + "step": 10310, + "token_acc": 0.30868813833762354 + }, + { + "epoch": 6.043975373790677, + "grad_norm": 0.16812822607809844, + "learning_rate": 0.00041797892521877676, + "loss": 2.9665660858154297, + "step": 10311, + "token_acc": 0.3012493996859704 + }, + { + "epoch": 6.044561712107886, + "grad_norm": 0.1854890423740394, + "learning_rate": 0.00041796097886363727, + "loss": 2.933084011077881, + "step": 10312, + "token_acc": 0.3065175802237407 + }, + { + "epoch": 6.045148050425095, + "grad_norm": 0.17313592687949653, + "learning_rate": 0.0004179430309307306, + "loss": 2.929915428161621, + "step": 10313, + "token_acc": 0.3061842188521651 + }, + { + "epoch": 6.0457343887423045, + "grad_norm": 0.16677884034130305, + "learning_rate": 0.0004179250814202255, + "loss": 2.9196548461914062, + "step": 10314, + "token_acc": 0.3095116587890463 + }, + { + "epoch": 6.046320727059514, + "grad_norm": 0.18826194550535516, + "learning_rate": 0.00041790713033229056, + "loss": 2.9299111366271973, + "step": 10315, + "token_acc": 0.3077982785001814 + }, + { + "epoch": 6.046907065376723, + "grad_norm": 0.20063663884843969, + "learning_rate": 0.0004178891776670943, + "loss": 2.9062259197235107, + "step": 10316, + "token_acc": 0.31244908691572404 + }, + { + "epoch": 6.047493403693931, + "grad_norm": 0.1699002414670016, + "learning_rate": 0.00041787122342480556, + "loss": 2.9566714763641357, + "step": 10317, + "token_acc": 0.3038845317340012 + }, + { + "epoch": 6.04807974201114, + "grad_norm": 0.16554430243884574, + "learning_rate": 0.00041785326760559284, + "loss": 2.92295241355896, + "step": 10318, + "token_acc": 0.3102327570730059 + }, + { + "epoch": 6.048666080328349, + "grad_norm": 0.16112018530797267, + "learning_rate": 0.00041783531020962483, + "loss": 2.944362163543701, + "step": 10319, + "token_acc": 0.30557263190117856 + }, + { + "epoch": 6.049252418645558, + "grad_norm": 0.1651824161066362, + "learning_rate": 0.0004178173512370703, + "loss": 2.947007656097412, + "step": 10320, + "token_acc": 0.3061213710561273 + }, + { + "epoch": 6.049838756962767, + "grad_norm": 0.1795722350359318, + "learning_rate": 0.00041779939068809787, + "loss": 2.9627013206481934, + "step": 10321, + "token_acc": 0.30487512487512486 + }, + { + "epoch": 6.0504250952799765, + "grad_norm": 0.16799045230146586, + "learning_rate": 0.00041778142856287626, + "loss": 2.9076757431030273, + "step": 10322, + "token_acc": 0.31302323825141326 + }, + { + "epoch": 6.051011433597186, + "grad_norm": 0.1806317288147836, + "learning_rate": 0.00041776346486157423, + "loss": 2.9344029426574707, + "step": 10323, + "token_acc": 0.3085921505200229 + }, + { + "epoch": 6.051597771914395, + "grad_norm": 0.23655106092417819, + "learning_rate": 0.00041774549958436055, + "loss": 2.932159423828125, + "step": 10324, + "token_acc": 0.306933621110085 + }, + { + "epoch": 6.052184110231604, + "grad_norm": 0.4308092768756055, + "learning_rate": 0.000417727532731404, + "loss": 2.9882700443267822, + "step": 10325, + "token_acc": 0.30047418307904256 + }, + { + "epoch": 6.052770448548813, + "grad_norm": 0.3698731130131215, + "learning_rate": 0.0004177095643028731, + "loss": 2.929323196411133, + "step": 10326, + "token_acc": 0.3084223367440934 + }, + { + "epoch": 6.053356786866022, + "grad_norm": 0.25251449455407415, + "learning_rate": 0.0004176915942989369, + "loss": 2.947962760925293, + "step": 10327, + "token_acc": 0.3062000812237715 + }, + { + "epoch": 6.05394312518323, + "grad_norm": 0.3089726113668326, + "learning_rate": 0.0004176736227197642, + "loss": 2.9194841384887695, + "step": 10328, + "token_acc": 0.31022529384672826 + }, + { + "epoch": 6.054529463500439, + "grad_norm": 0.24805624562202344, + "learning_rate": 0.0004176556495655237, + "loss": 2.907470703125, + "step": 10329, + "token_acc": 0.3106066336124942 + }, + { + "epoch": 6.0551158018176485, + "grad_norm": 0.23731230430167546, + "learning_rate": 0.00041763767483638436, + "loss": 2.908590793609619, + "step": 10330, + "token_acc": 0.3115572770701053 + }, + { + "epoch": 6.055702140134858, + "grad_norm": 0.26702752980083244, + "learning_rate": 0.0004176196985325149, + "loss": 2.92924427986145, + "step": 10331, + "token_acc": 0.30931783750796293 + }, + { + "epoch": 6.056288478452067, + "grad_norm": 0.16294038506958036, + "learning_rate": 0.0004176017206540843, + "loss": 2.961214780807495, + "step": 10332, + "token_acc": 0.3022245555521242 + }, + { + "epoch": 6.056874816769276, + "grad_norm": 0.22398987850303756, + "learning_rate": 0.0004175837412012613, + "loss": 2.931196689605713, + "step": 10333, + "token_acc": 0.30635985657908643 + }, + { + "epoch": 6.057461155086485, + "grad_norm": 0.2242662464076655, + "learning_rate": 0.0004175657601742149, + "loss": 2.9264841079711914, + "step": 10334, + "token_acc": 0.30820708072235004 + }, + { + "epoch": 6.058047493403694, + "grad_norm": 0.16316157263405973, + "learning_rate": 0.000417547777573114, + "loss": 2.9220733642578125, + "step": 10335, + "token_acc": 0.30918539949331436 + }, + { + "epoch": 6.058633831720903, + "grad_norm": 0.21982210638704597, + "learning_rate": 0.0004175297933981275, + "loss": 2.9503307342529297, + "step": 10336, + "token_acc": 0.30366494941675787 + }, + { + "epoch": 6.059220170038112, + "grad_norm": 0.21920984189241072, + "learning_rate": 0.00041751180764942436, + "loss": 2.9321038722991943, + "step": 10337, + "token_acc": 0.30667895003045037 + }, + { + "epoch": 6.059806508355321, + "grad_norm": 0.20396125926540085, + "learning_rate": 0.00041749382032717355, + "loss": 2.965158700942993, + "step": 10338, + "token_acc": 0.302707999106472 + }, + { + "epoch": 6.06039284667253, + "grad_norm": 0.25614332075158935, + "learning_rate": 0.0004174758314315439, + "loss": 2.96305513381958, + "step": 10339, + "token_acc": 0.30326771216863657 + }, + { + "epoch": 6.060979184989739, + "grad_norm": 0.15565924371182874, + "learning_rate": 0.0004174578409627045, + "loss": 2.932305335998535, + "step": 10340, + "token_acc": 0.30747541279436974 + }, + { + "epoch": 6.061565523306948, + "grad_norm": 0.1746454932040459, + "learning_rate": 0.0004174398489208244, + "loss": 2.8862733840942383, + "step": 10341, + "token_acc": 0.3145143114717151 + }, + { + "epoch": 6.062151861624157, + "grad_norm": 0.16985274150991805, + "learning_rate": 0.0004174218553060725, + "loss": 2.9634270668029785, + "step": 10342, + "token_acc": 0.30349763832839866 + }, + { + "epoch": 6.062738199941366, + "grad_norm": 0.15919040365585663, + "learning_rate": 0.00041740386011861793, + "loss": 2.939575672149658, + "step": 10343, + "token_acc": 0.3073319695217765 + }, + { + "epoch": 6.063324538258575, + "grad_norm": 0.16570205638267985, + "learning_rate": 0.0004173858633586296, + "loss": 2.9637322425842285, + "step": 10344, + "token_acc": 0.3025522231457888 + }, + { + "epoch": 6.063910876575784, + "grad_norm": 0.20100833698627005, + "learning_rate": 0.00041736786502627666, + "loss": 2.9333088397979736, + "step": 10345, + "token_acc": 0.30603469721767596 + }, + { + "epoch": 6.064497214892993, + "grad_norm": 0.16385173446042253, + "learning_rate": 0.0004173498651217282, + "loss": 2.9404385089874268, + "step": 10346, + "token_acc": 0.30659420157379136 + }, + { + "epoch": 6.0650835532102025, + "grad_norm": 0.17535650844877715, + "learning_rate": 0.00041733186364515323, + "loss": 2.934835433959961, + "step": 10347, + "token_acc": 0.3061625921362644 + }, + { + "epoch": 6.065669891527412, + "grad_norm": 0.15514254421292142, + "learning_rate": 0.0004173138605967209, + "loss": 2.907177209854126, + "step": 10348, + "token_acc": 0.3100863605638361 + }, + { + "epoch": 6.066256229844621, + "grad_norm": 0.17387340040836205, + "learning_rate": 0.00041729585597660024, + "loss": 2.894984483718872, + "step": 10349, + "token_acc": 0.31431665397935005 + }, + { + "epoch": 6.066842568161829, + "grad_norm": 0.17624445673799993, + "learning_rate": 0.0004172778497849605, + "loss": 2.9011363983154297, + "step": 10350, + "token_acc": 0.3123098260848731 + }, + { + "epoch": 6.067428906479038, + "grad_norm": 0.1726879628423072, + "learning_rate": 0.0004172598420219708, + "loss": 2.950467109680176, + "step": 10351, + "token_acc": 0.3032352006777507 + }, + { + "epoch": 6.068015244796247, + "grad_norm": 0.16052206973811742, + "learning_rate": 0.0004172418326878003, + "loss": 2.9330697059631348, + "step": 10352, + "token_acc": 0.3067943193240513 + }, + { + "epoch": 6.068601583113456, + "grad_norm": 0.2174646690809696, + "learning_rate": 0.00041722382178261807, + "loss": 2.950131416320801, + "step": 10353, + "token_acc": 0.304539651201053 + }, + { + "epoch": 6.069187921430665, + "grad_norm": 0.15493196486403835, + "learning_rate": 0.00041720580930659334, + "loss": 2.9184255599975586, + "step": 10354, + "token_acc": 0.3103981455803172 + }, + { + "epoch": 6.0697742597478745, + "grad_norm": 0.19493495426098145, + "learning_rate": 0.00041718779525989544, + "loss": 2.9730288982391357, + "step": 10355, + "token_acc": 0.30046266465263183 + }, + { + "epoch": 6.070360598065084, + "grad_norm": 0.1537075069181297, + "learning_rate": 0.0004171697796426934, + "loss": 2.9841341972351074, + "step": 10356, + "token_acc": 0.3002169006273792 + }, + { + "epoch": 6.070946936382293, + "grad_norm": 0.2060060782229896, + "learning_rate": 0.0004171517624551566, + "loss": 2.953420639038086, + "step": 10357, + "token_acc": 0.30366265047648167 + }, + { + "epoch": 6.071533274699502, + "grad_norm": 0.19142239605354902, + "learning_rate": 0.0004171337436974543, + "loss": 2.934462785720825, + "step": 10358, + "token_acc": 0.3072923409963075 + }, + { + "epoch": 6.072119613016711, + "grad_norm": 0.18570545252263554, + "learning_rate": 0.00041711572336975555, + "loss": 2.9109275341033936, + "step": 10359, + "token_acc": 0.31031673550717115 + }, + { + "epoch": 6.07270595133392, + "grad_norm": 0.20572328910303564, + "learning_rate": 0.0004170977014722298, + "loss": 2.9132065773010254, + "step": 10360, + "token_acc": 0.3096357130351046 + }, + { + "epoch": 6.073292289651128, + "grad_norm": 0.15045101696421187, + "learning_rate": 0.0004170796780050464, + "loss": 2.898057460784912, + "step": 10361, + "token_acc": 0.31256209265915225 + }, + { + "epoch": 6.073878627968337, + "grad_norm": 0.23070075014343516, + "learning_rate": 0.0004170616529683745, + "loss": 2.9632296562194824, + "step": 10362, + "token_acc": 0.30218249604658226 + }, + { + "epoch": 6.0744649662855466, + "grad_norm": 0.15220079606623535, + "learning_rate": 0.0004170436263623835, + "loss": 2.9829108715057373, + "step": 10363, + "token_acc": 0.2989781144208137 + }, + { + "epoch": 6.075051304602756, + "grad_norm": 0.21366120440288014, + "learning_rate": 0.00041702559818724275, + "loss": 2.951282024383545, + "step": 10364, + "token_acc": 0.30401181340001726 + }, + { + "epoch": 6.075637642919965, + "grad_norm": 0.1655978200246757, + "learning_rate": 0.0004170075684431215, + "loss": 2.9437506198883057, + "step": 10365, + "token_acc": 0.30710179209426564 + }, + { + "epoch": 6.076223981237174, + "grad_norm": 0.18383866862705062, + "learning_rate": 0.0004169895371301893, + "loss": 2.9523637294769287, + "step": 10366, + "token_acc": 0.303704137015428 + }, + { + "epoch": 6.076810319554383, + "grad_norm": 0.1698849773799001, + "learning_rate": 0.0004169715042486153, + "loss": 2.9248464107513428, + "step": 10367, + "token_acc": 0.3083811245901197 + }, + { + "epoch": 6.077396657871592, + "grad_norm": 0.20629911334996334, + "learning_rate": 0.0004169534697985691, + "loss": 2.964049816131592, + "step": 10368, + "token_acc": 0.3036072736618904 + }, + { + "epoch": 6.077982996188801, + "grad_norm": 0.16738523168024455, + "learning_rate": 0.00041693543378022003, + "loss": 2.9208626747131348, + "step": 10369, + "token_acc": 0.30869069113229797 + }, + { + "epoch": 6.07856933450601, + "grad_norm": 0.1963817078039234, + "learning_rate": 0.0004169173961937375, + "loss": 2.990365505218506, + "step": 10370, + "token_acc": 0.2982556032920959 + }, + { + "epoch": 6.0791556728232194, + "grad_norm": 0.18358411341470202, + "learning_rate": 0.00041689935703929096, + "loss": 2.9318721294403076, + "step": 10371, + "token_acc": 0.30747021872800434 + }, + { + "epoch": 6.079742011140428, + "grad_norm": 0.20610675953206628, + "learning_rate": 0.0004168813163170498, + "loss": 2.9484333992004395, + "step": 10372, + "token_acc": 0.30545186844179006 + }, + { + "epoch": 6.080328349457637, + "grad_norm": 0.22195504045360578, + "learning_rate": 0.00041686327402718355, + "loss": 2.940018653869629, + "step": 10373, + "token_acc": 0.3061286020164649 + }, + { + "epoch": 6.080914687774846, + "grad_norm": 0.16776124936257816, + "learning_rate": 0.00041684523016986174, + "loss": 2.930710554122925, + "step": 10374, + "token_acc": 0.3078344163298432 + }, + { + "epoch": 6.081501026092055, + "grad_norm": 0.21464859329026204, + "learning_rate": 0.00041682718474525375, + "loss": 2.886188507080078, + "step": 10375, + "token_acc": 0.3129126386034356 + }, + { + "epoch": 6.082087364409264, + "grad_norm": 0.1554091487789094, + "learning_rate": 0.00041680913775352926, + "loss": 2.9119367599487305, + "step": 10376, + "token_acc": 0.30948510774386745 + }, + { + "epoch": 6.082673702726473, + "grad_norm": 0.17676620794112669, + "learning_rate": 0.0004167910891948576, + "loss": 2.939983606338501, + "step": 10377, + "token_acc": 0.30554569211944094 + }, + { + "epoch": 6.083260041043682, + "grad_norm": 0.1913897392675056, + "learning_rate": 0.00041677303906940857, + "loss": 2.921018600463867, + "step": 10378, + "token_acc": 0.3108177373032812 + }, + { + "epoch": 6.0838463793608915, + "grad_norm": 0.1551338293577213, + "learning_rate": 0.0004167549873773514, + "loss": 2.90962553024292, + "step": 10379, + "token_acc": 0.31186792676034125 + }, + { + "epoch": 6.084432717678101, + "grad_norm": 0.17885400981708033, + "learning_rate": 0.0004167369341188559, + "loss": 2.9341928958892822, + "step": 10380, + "token_acc": 0.3062773951158422 + }, + { + "epoch": 6.08501905599531, + "grad_norm": 0.16633794368815003, + "learning_rate": 0.0004167188792940916, + "loss": 2.9201507568359375, + "step": 10381, + "token_acc": 0.3093780642462118 + }, + { + "epoch": 6.085605394312518, + "grad_norm": 0.1621370149843927, + "learning_rate": 0.0004167008229032281, + "loss": 2.9466941356658936, + "step": 10382, + "token_acc": 0.30603684210526316 + }, + { + "epoch": 6.086191732629727, + "grad_norm": 0.15176935183733528, + "learning_rate": 0.0004166827649464349, + "loss": 2.954894542694092, + "step": 10383, + "token_acc": 0.3041822516445738 + }, + { + "epoch": 6.086778070946936, + "grad_norm": 0.18001307935781927, + "learning_rate": 0.0004166647054238818, + "loss": 2.9423749446868896, + "step": 10384, + "token_acc": 0.3037615480318712 + }, + { + "epoch": 6.087364409264145, + "grad_norm": 0.1546156823247517, + "learning_rate": 0.0004166466443357384, + "loss": 2.9306800365448, + "step": 10385, + "token_acc": 0.30816204543863523 + }, + { + "epoch": 6.087950747581354, + "grad_norm": 0.16072157698909545, + "learning_rate": 0.0004166285816821743, + "loss": 2.947432518005371, + "step": 10386, + "token_acc": 0.3060800033913214 + }, + { + "epoch": 6.0885370858985635, + "grad_norm": 0.16588454146279294, + "learning_rate": 0.0004166105174633592, + "loss": 2.9169092178344727, + "step": 10387, + "token_acc": 0.3093053525940477 + }, + { + "epoch": 6.089123424215773, + "grad_norm": 0.17697315716621784, + "learning_rate": 0.0004165924516794628, + "loss": 2.971461534500122, + "step": 10388, + "token_acc": 0.3019210127905279 + }, + { + "epoch": 6.089709762532982, + "grad_norm": 0.17075973054790755, + "learning_rate": 0.0004165743843306548, + "loss": 2.9590089321136475, + "step": 10389, + "token_acc": 0.30229917455204347 + }, + { + "epoch": 6.090296100850191, + "grad_norm": 0.17848769234985856, + "learning_rate": 0.0004165563154171049, + "loss": 2.9149913787841797, + "step": 10390, + "token_acc": 0.3095460271543881 + }, + { + "epoch": 6.0908824391674, + "grad_norm": 0.16891302500068286, + "learning_rate": 0.000416538244938983, + "loss": 2.939793109893799, + "step": 10391, + "token_acc": 0.30691706590654405 + }, + { + "epoch": 6.091468777484609, + "grad_norm": 0.16493776937037627, + "learning_rate": 0.00041652017289645863, + "loss": 2.9628772735595703, + "step": 10392, + "token_acc": 0.3013392620262971 + }, + { + "epoch": 6.092055115801817, + "grad_norm": 0.17433678963184748, + "learning_rate": 0.00041650209928970153, + "loss": 2.9630610942840576, + "step": 10393, + "token_acc": 0.3026333887849352 + }, + { + "epoch": 6.092641454119026, + "grad_norm": 0.14871093322053644, + "learning_rate": 0.00041648402411888165, + "loss": 2.914952516555786, + "step": 10394, + "token_acc": 0.30964090867988225 + }, + { + "epoch": 6.0932277924362355, + "grad_norm": 0.1495034650058218, + "learning_rate": 0.00041646594738416865, + "loss": 2.9515929222106934, + "step": 10395, + "token_acc": 0.30394859311233624 + }, + { + "epoch": 6.093814130753445, + "grad_norm": 0.1610707294777987, + "learning_rate": 0.0004164478690857324, + "loss": 3.0026893615722656, + "step": 10396, + "token_acc": 0.2969383530073514 + }, + { + "epoch": 6.094400469070654, + "grad_norm": 0.1574988176923822, + "learning_rate": 0.00041642978922374274, + "loss": 2.929046869277954, + "step": 10397, + "token_acc": 0.3087426751522527 + }, + { + "epoch": 6.094986807387863, + "grad_norm": 0.16448617471376137, + "learning_rate": 0.0004164117077983695, + "loss": 2.924130916595459, + "step": 10398, + "token_acc": 0.3075816856321824 + }, + { + "epoch": 6.095573145705072, + "grad_norm": 0.16510430189383757, + "learning_rate": 0.0004163936248097825, + "loss": 2.952380418777466, + "step": 10399, + "token_acc": 0.3034254320567603 + }, + { + "epoch": 6.096159484022281, + "grad_norm": 0.181306744805994, + "learning_rate": 0.00041637554025815155, + "loss": 2.958345890045166, + "step": 10400, + "token_acc": 0.3029982194771097 + }, + { + "epoch": 6.09674582233949, + "grad_norm": 0.23725446881122828, + "learning_rate": 0.0004163574541436467, + "loss": 2.929875373840332, + "step": 10401, + "token_acc": 0.30899364431502 + }, + { + "epoch": 6.097332160656699, + "grad_norm": 0.3648114968090372, + "learning_rate": 0.00041633936646643765, + "loss": 2.9426755905151367, + "step": 10402, + "token_acc": 0.30700762813576526 + }, + { + "epoch": 6.097918498973908, + "grad_norm": 0.4005656859160203, + "learning_rate": 0.00041632127722669444, + "loss": 2.942589044570923, + "step": 10403, + "token_acc": 0.30582858653408673 + }, + { + "epoch": 6.098504837291117, + "grad_norm": 0.17847403506218268, + "learning_rate": 0.00041630318642458684, + "loss": 2.963380813598633, + "step": 10404, + "token_acc": 0.3020968904352968 + }, + { + "epoch": 6.099091175608326, + "grad_norm": 0.3123065790070527, + "learning_rate": 0.000416285094060285, + "loss": 2.9427084922790527, + "step": 10405, + "token_acc": 0.30541245892704605 + }, + { + "epoch": 6.099677513925535, + "grad_norm": 0.17612652484812036, + "learning_rate": 0.0004162670001339588, + "loss": 2.8996973037719727, + "step": 10406, + "token_acc": 0.31091194414808215 + }, + { + "epoch": 6.100263852242744, + "grad_norm": 0.23617927244769796, + "learning_rate": 0.00041624890464577813, + "loss": 2.955127716064453, + "step": 10407, + "token_acc": 0.30449148514032987 + }, + { + "epoch": 6.100850190559953, + "grad_norm": 0.14860805524360848, + "learning_rate": 0.00041623080759591307, + "loss": 2.9312400817871094, + "step": 10408, + "token_acc": 0.3069817165830432 + }, + { + "epoch": 6.101436528877162, + "grad_norm": 0.18863507958330297, + "learning_rate": 0.0004162127089845336, + "loss": 2.9177653789520264, + "step": 10409, + "token_acc": 0.3097544634059827 + }, + { + "epoch": 6.102022867194371, + "grad_norm": 0.16664255285207497, + "learning_rate": 0.0004161946088118096, + "loss": 2.922409772872925, + "step": 10410, + "token_acc": 0.3086878244299888 + }, + { + "epoch": 6.10260920551158, + "grad_norm": 0.19018691709468127, + "learning_rate": 0.0004161765070779112, + "loss": 2.981858253479004, + "step": 10411, + "token_acc": 0.3005881660351508 + }, + { + "epoch": 6.1031955438287895, + "grad_norm": 0.17707820506394414, + "learning_rate": 0.0004161584037830085, + "loss": 2.9327139854431152, + "step": 10412, + "token_acc": 0.3072470969766712 + }, + { + "epoch": 6.103781882145999, + "grad_norm": 0.16876120101255232, + "learning_rate": 0.00041614029892727156, + "loss": 2.935960531234741, + "step": 10413, + "token_acc": 0.3085030505482777 + }, + { + "epoch": 6.104368220463208, + "grad_norm": 0.16341499004660603, + "learning_rate": 0.00041612219251087025, + "loss": 2.972388744354248, + "step": 10414, + "token_acc": 0.3020122672442619 + }, + { + "epoch": 6.104954558780416, + "grad_norm": 0.1605948534220989, + "learning_rate": 0.00041610408453397486, + "loss": 2.972555637359619, + "step": 10415, + "token_acc": 0.299198486179631 + }, + { + "epoch": 6.105540897097625, + "grad_norm": 0.15588455452603286, + "learning_rate": 0.0004160859749967555, + "loss": 2.9388070106506348, + "step": 10416, + "token_acc": 0.30679257034577767 + }, + { + "epoch": 6.106127235414834, + "grad_norm": 0.17067469574548513, + "learning_rate": 0.0004160678638993821, + "loss": 2.928025245666504, + "step": 10417, + "token_acc": 0.3081767046590163 + }, + { + "epoch": 6.106713573732043, + "grad_norm": 0.16536639626237284, + "learning_rate": 0.00041604975124202495, + "loss": 2.9623336791992188, + "step": 10418, + "token_acc": 0.3037119750781387 + }, + { + "epoch": 6.107299912049252, + "grad_norm": 0.16178444705512532, + "learning_rate": 0.000416031637024854, + "loss": 2.943077802658081, + "step": 10419, + "token_acc": 0.3056953026033453 + }, + { + "epoch": 6.1078862503664615, + "grad_norm": 0.15830505530652575, + "learning_rate": 0.00041601352124803976, + "loss": 2.9410009384155273, + "step": 10420, + "token_acc": 0.30542754328055155 + }, + { + "epoch": 6.108472588683671, + "grad_norm": 0.16561414011616415, + "learning_rate": 0.00041599540391175214, + "loss": 2.8999428749084473, + "step": 10421, + "token_acc": 0.3130207093549155 + }, + { + "epoch": 6.10905892700088, + "grad_norm": 0.163444004215403, + "learning_rate": 0.00041597728501616127, + "loss": 2.975067615509033, + "step": 10422, + "token_acc": 0.30048044973245447 + }, + { + "epoch": 6.109645265318089, + "grad_norm": 0.16056654286590044, + "learning_rate": 0.0004159591645614376, + "loss": 2.928494453430176, + "step": 10423, + "token_acc": 0.30758260546859467 + }, + { + "epoch": 6.110231603635298, + "grad_norm": 0.18785863947086578, + "learning_rate": 0.0004159410425477511, + "loss": 2.9437694549560547, + "step": 10424, + "token_acc": 0.30490544939622427 + }, + { + "epoch": 6.110817941952506, + "grad_norm": 0.1505626054166065, + "learning_rate": 0.00041592291897527223, + "loss": 2.9447240829467773, + "step": 10425, + "token_acc": 0.30535251120018864 + }, + { + "epoch": 6.111404280269715, + "grad_norm": 0.18227095075713756, + "learning_rate": 0.000415904793844171, + "loss": 2.9433984756469727, + "step": 10426, + "token_acc": 0.3053795151474672 + }, + { + "epoch": 6.111990618586924, + "grad_norm": 0.16627628629075097, + "learning_rate": 0.00041588666715461787, + "loss": 2.896047353744507, + "step": 10427, + "token_acc": 0.31125370464281255 + }, + { + "epoch": 6.1125769569041335, + "grad_norm": 0.2025082858542502, + "learning_rate": 0.000415868538906783, + "loss": 2.935548782348633, + "step": 10428, + "token_acc": 0.30552157981609324 + }, + { + "epoch": 6.113163295221343, + "grad_norm": 0.27137076450341086, + "learning_rate": 0.0004158504091008367, + "loss": 2.9595813751220703, + "step": 10429, + "token_acc": 0.3024967119624162 + }, + { + "epoch": 6.113749633538552, + "grad_norm": 0.21093432711283527, + "learning_rate": 0.00041583227773694936, + "loss": 2.9636895656585693, + "step": 10430, + "token_acc": 0.3020448981876848 + }, + { + "epoch": 6.114335971855761, + "grad_norm": 0.16684935143641036, + "learning_rate": 0.00041581414481529124, + "loss": 2.8368446826934814, + "step": 10431, + "token_acc": 0.32148063265333315 + }, + { + "epoch": 6.11492231017297, + "grad_norm": 0.1962838716864451, + "learning_rate": 0.0004157960103360326, + "loss": 2.94777774810791, + "step": 10432, + "token_acc": 0.3046489913794975 + }, + { + "epoch": 6.115508648490179, + "grad_norm": 0.16862376808320972, + "learning_rate": 0.0004157778742993439, + "loss": 2.923959970474243, + "step": 10433, + "token_acc": 0.3085891864105799 + }, + { + "epoch": 6.116094986807388, + "grad_norm": 0.19066300799687697, + "learning_rate": 0.0004157597367053954, + "loss": 2.9415836334228516, + "step": 10434, + "token_acc": 0.30624595867022214 + }, + { + "epoch": 6.116681325124597, + "grad_norm": 0.21738615505634146, + "learning_rate": 0.0004157415975543576, + "loss": 2.9260506629943848, + "step": 10435, + "token_acc": 0.30755254056930037 + }, + { + "epoch": 6.1172676634418055, + "grad_norm": 0.1580646757013759, + "learning_rate": 0.0004157234568464008, + "loss": 2.895148754119873, + "step": 10436, + "token_acc": 0.31291539243032523 + }, + { + "epoch": 6.117854001759015, + "grad_norm": 0.227311014506415, + "learning_rate": 0.0004157053145816955, + "loss": 2.929617166519165, + "step": 10437, + "token_acc": 0.3068810105366432 + }, + { + "epoch": 6.118440340076224, + "grad_norm": 0.16883044052799437, + "learning_rate": 0.000415687170760412, + "loss": 2.9710538387298584, + "step": 10438, + "token_acc": 0.3027455672045928 + }, + { + "epoch": 6.119026678393433, + "grad_norm": 0.20366480542445012, + "learning_rate": 0.0004156690253827208, + "loss": 2.8886756896972656, + "step": 10439, + "token_acc": 0.3138385969981374 + }, + { + "epoch": 6.119613016710642, + "grad_norm": 0.19595579540082303, + "learning_rate": 0.0004156508784487923, + "loss": 2.952850103378296, + "step": 10440, + "token_acc": 0.30421357814650063 + }, + { + "epoch": 6.120199355027851, + "grad_norm": 0.1811051002729143, + "learning_rate": 0.0004156327299587971, + "loss": 2.922309398651123, + "step": 10441, + "token_acc": 0.30829966334296066 + }, + { + "epoch": 6.12078569334506, + "grad_norm": 0.20561917908929286, + "learning_rate": 0.00041561457991290553, + "loss": 2.9297471046447754, + "step": 10442, + "token_acc": 0.30786297136164403 + }, + { + "epoch": 6.121372031662269, + "grad_norm": 0.15780783618226757, + "learning_rate": 0.0004155964283112882, + "loss": 2.9822909832000732, + "step": 10443, + "token_acc": 0.30000659221854525 + }, + { + "epoch": 6.121958369979478, + "grad_norm": 0.19662208076033094, + "learning_rate": 0.00041557827515411553, + "loss": 2.9561009407043457, + "step": 10444, + "token_acc": 0.3045006728311329 + }, + { + "epoch": 6.1225447082966875, + "grad_norm": 0.16968830696216852, + "learning_rate": 0.00041556012044155813, + "loss": 2.9536328315734863, + "step": 10445, + "token_acc": 0.30562220026866965 + }, + { + "epoch": 6.123131046613897, + "grad_norm": 0.14749029166195343, + "learning_rate": 0.0004155419641737865, + "loss": 2.9082694053649902, + "step": 10446, + "token_acc": 0.3103008273705883 + }, + { + "epoch": 6.123717384931105, + "grad_norm": 0.16137758711028985, + "learning_rate": 0.0004155238063509711, + "loss": 2.9429492950439453, + "step": 10447, + "token_acc": 0.30485365648576823 + }, + { + "epoch": 6.124303723248314, + "grad_norm": 0.15868052175954908, + "learning_rate": 0.00041550564697328266, + "loss": 2.941316604614258, + "step": 10448, + "token_acc": 0.3039662722107625 + }, + { + "epoch": 6.124890061565523, + "grad_norm": 0.1756635081284907, + "learning_rate": 0.00041548748604089164, + "loss": 2.9978253841400146, + "step": 10449, + "token_acc": 0.2979858400325896 + }, + { + "epoch": 6.125476399882732, + "grad_norm": 0.1557158814790115, + "learning_rate": 0.0004154693235539688, + "loss": 2.9624266624450684, + "step": 10450, + "token_acc": 0.3028324842312598 + }, + { + "epoch": 6.126062738199941, + "grad_norm": 0.16639843939532703, + "learning_rate": 0.00041545115951268453, + "loss": 2.912062406539917, + "step": 10451, + "token_acc": 0.3103679441736234 + }, + { + "epoch": 6.12664907651715, + "grad_norm": 0.16494038756588472, + "learning_rate": 0.0004154329939172096, + "loss": 2.9039361476898193, + "step": 10452, + "token_acc": 0.3128679222926574 + }, + { + "epoch": 6.1272354148343595, + "grad_norm": 0.15824868073976797, + "learning_rate": 0.0004154148267677146, + "loss": 2.9260544776916504, + "step": 10453, + "token_acc": 0.30908763867319616 + }, + { + "epoch": 6.127821753151569, + "grad_norm": 0.1666834313703923, + "learning_rate": 0.00041539665806437015, + "loss": 2.9439620971679688, + "step": 10454, + "token_acc": 0.30692399979692664 + }, + { + "epoch": 6.128408091468778, + "grad_norm": 0.1594622401797017, + "learning_rate": 0.000415378487807347, + "loss": 2.9661808013916016, + "step": 10455, + "token_acc": 0.3019629455098254 + }, + { + "epoch": 6.128994429785987, + "grad_norm": 0.15216042026501475, + "learning_rate": 0.0004153603159968159, + "loss": 2.919219493865967, + "step": 10456, + "token_acc": 0.30898766249474796 + }, + { + "epoch": 6.129580768103196, + "grad_norm": 0.15332796688813455, + "learning_rate": 0.0004153421426329474, + "loss": 2.966148853302002, + "step": 10457, + "token_acc": 0.30279541276559363 + }, + { + "epoch": 6.130167106420404, + "grad_norm": 0.20204300457221028, + "learning_rate": 0.0004153239677159122, + "loss": 2.9437081813812256, + "step": 10458, + "token_acc": 0.3043364575188619 + }, + { + "epoch": 6.130753444737613, + "grad_norm": 0.3072558255041798, + "learning_rate": 0.0004153057912458812, + "loss": 2.9648566246032715, + "step": 10459, + "token_acc": 0.30356991319534743 + }, + { + "epoch": 6.131339783054822, + "grad_norm": 0.23521640693762688, + "learning_rate": 0.000415287613223025, + "loss": 2.90767240524292, + "step": 10460, + "token_acc": 0.30989104471031276 + }, + { + "epoch": 6.1319261213720315, + "grad_norm": 0.15682580678508576, + "learning_rate": 0.00041526943364751445, + "loss": 2.9724202156066895, + "step": 10461, + "token_acc": 0.3031968720032636 + }, + { + "epoch": 6.132512459689241, + "grad_norm": 0.23129569301531008, + "learning_rate": 0.0004152512525195202, + "loss": 2.927483558654785, + "step": 10462, + "token_acc": 0.30769992294190673 + }, + { + "epoch": 6.13309879800645, + "grad_norm": 0.17371185396455913, + "learning_rate": 0.0004152330698392132, + "loss": 2.91770601272583, + "step": 10463, + "token_acc": 0.30873325446813954 + }, + { + "epoch": 6.133685136323659, + "grad_norm": 0.2386006605877323, + "learning_rate": 0.0004152148856067641, + "loss": 2.950911521911621, + "step": 10464, + "token_acc": 0.3046612943842861 + }, + { + "epoch": 6.134271474640868, + "grad_norm": 0.3034959128654779, + "learning_rate": 0.00041519669982234374, + "loss": 2.9836935997009277, + "step": 10465, + "token_acc": 0.2996892957717207 + }, + { + "epoch": 6.134857812958077, + "grad_norm": 0.16925752192189283, + "learning_rate": 0.000415178512486123, + "loss": 2.9244637489318848, + "step": 10466, + "token_acc": 0.30740564289873484 + }, + { + "epoch": 6.135444151275286, + "grad_norm": 0.20227191017184934, + "learning_rate": 0.00041516032359827283, + "loss": 2.9298830032348633, + "step": 10467, + "token_acc": 0.3075545952619801 + }, + { + "epoch": 6.136030489592494, + "grad_norm": 0.16884120129399502, + "learning_rate": 0.00041514213315896385, + "loss": 2.947310209274292, + "step": 10468, + "token_acc": 0.3046397108416165 + }, + { + "epoch": 6.1366168279097035, + "grad_norm": 0.16223727692594062, + "learning_rate": 0.00041512394116836713, + "loss": 2.944086790084839, + "step": 10469, + "token_acc": 0.30537660506753356 + }, + { + "epoch": 6.137203166226913, + "grad_norm": 0.1662959516848505, + "learning_rate": 0.00041510574762665354, + "loss": 2.9615859985351562, + "step": 10470, + "token_acc": 0.3018576526026203 + }, + { + "epoch": 6.137789504544122, + "grad_norm": 0.15726566868467232, + "learning_rate": 0.00041508755253399384, + "loss": 2.907604694366455, + "step": 10471, + "token_acc": 0.30987845881858855 + }, + { + "epoch": 6.138375842861331, + "grad_norm": 0.1940583384552843, + "learning_rate": 0.00041506935589055905, + "loss": 2.957211494445801, + "step": 10472, + "token_acc": 0.30282178637239743 + }, + { + "epoch": 6.13896218117854, + "grad_norm": 0.16130884560368403, + "learning_rate": 0.0004150511576965201, + "loss": 2.940397024154663, + "step": 10473, + "token_acc": 0.30500466182354996 + }, + { + "epoch": 6.139548519495749, + "grad_norm": 0.17157180505395575, + "learning_rate": 0.00041503295795204795, + "loss": 2.913651943206787, + "step": 10474, + "token_acc": 0.3099711038040271 + }, + { + "epoch": 6.140134857812958, + "grad_norm": 0.16033968612033972, + "learning_rate": 0.0004150147566573136, + "loss": 2.9319586753845215, + "step": 10475, + "token_acc": 0.30749353338783425 + }, + { + "epoch": 6.140721196130167, + "grad_norm": 0.15857423117067557, + "learning_rate": 0.00041499655381248784, + "loss": 2.942533016204834, + "step": 10476, + "token_acc": 0.3065783033108068 + }, + { + "epoch": 6.141307534447376, + "grad_norm": 0.166747210589665, + "learning_rate": 0.00041497834941774185, + "loss": 2.9234941005706787, + "step": 10477, + "token_acc": 0.3093667923547003 + }, + { + "epoch": 6.1418938727645855, + "grad_norm": 0.17077343685105867, + "learning_rate": 0.0004149601434732466, + "loss": 2.9386777877807617, + "step": 10478, + "token_acc": 0.3074772027133957 + }, + { + "epoch": 6.142480211081795, + "grad_norm": 0.15781378144997013, + "learning_rate": 0.00041494193597917307, + "loss": 2.921415090560913, + "step": 10479, + "token_acc": 0.30959037762472524 + }, + { + "epoch": 6.143066549399003, + "grad_norm": 0.1599309841549574, + "learning_rate": 0.00041492372693569234, + "loss": 2.9854774475097656, + "step": 10480, + "token_acc": 0.29958764686549993 + }, + { + "epoch": 6.143652887716212, + "grad_norm": 0.17385564690899588, + "learning_rate": 0.00041490551634297546, + "loss": 2.9373931884765625, + "step": 10481, + "token_acc": 0.3066968108084338 + }, + { + "epoch": 6.144239226033421, + "grad_norm": 0.16403678017085097, + "learning_rate": 0.0004148873042011934, + "loss": 2.947756290435791, + "step": 10482, + "token_acc": 0.30470552955135144 + }, + { + "epoch": 6.14482556435063, + "grad_norm": 0.163869681078955, + "learning_rate": 0.0004148690905105174, + "loss": 2.926081895828247, + "step": 10483, + "token_acc": 0.3080807262117037 + }, + { + "epoch": 6.145411902667839, + "grad_norm": 0.17349128005486922, + "learning_rate": 0.0004148508752711183, + "loss": 2.9466233253479004, + "step": 10484, + "token_acc": 0.30614646246478633 + }, + { + "epoch": 6.145998240985048, + "grad_norm": 0.18661846379164745, + "learning_rate": 0.0004148326584831675, + "loss": 2.896618366241455, + "step": 10485, + "token_acc": 0.3123967068034516 + }, + { + "epoch": 6.1465845793022575, + "grad_norm": 0.14698230726421643, + "learning_rate": 0.00041481444014683587, + "loss": 2.9340977668762207, + "step": 10486, + "token_acc": 0.3076336968427068 + }, + { + "epoch": 6.147170917619467, + "grad_norm": 0.17944177546704151, + "learning_rate": 0.0004147962202622948, + "loss": 2.939802408218384, + "step": 10487, + "token_acc": 0.30672235161834394 + }, + { + "epoch": 6.147757255936676, + "grad_norm": 0.1567711650362066, + "learning_rate": 0.00041477799882971524, + "loss": 2.9326446056365967, + "step": 10488, + "token_acc": 0.3060327679120402 + }, + { + "epoch": 6.148343594253885, + "grad_norm": 0.2073171422380734, + "learning_rate": 0.00041475977584926846, + "loss": 2.957396984100342, + "step": 10489, + "token_acc": 0.30252747281114556 + }, + { + "epoch": 6.148929932571093, + "grad_norm": 0.24004030897757322, + "learning_rate": 0.00041474155132112557, + "loss": 2.9332456588745117, + "step": 10490, + "token_acc": 0.3063213460394012 + }, + { + "epoch": 6.149516270888302, + "grad_norm": 0.2554601458258769, + "learning_rate": 0.00041472332524545777, + "loss": 2.933067560195923, + "step": 10491, + "token_acc": 0.30784441186630146 + }, + { + "epoch": 6.150102609205511, + "grad_norm": 0.16863235777922414, + "learning_rate": 0.0004147050976224363, + "loss": 2.9686851501464844, + "step": 10492, + "token_acc": 0.30329630445772354 + }, + { + "epoch": 6.15068894752272, + "grad_norm": 0.2094739609225327, + "learning_rate": 0.00041468686845223243, + "loss": 2.936225652694702, + "step": 10493, + "token_acc": 0.3056676728962453 + }, + { + "epoch": 6.1512752858399296, + "grad_norm": 0.308046893252181, + "learning_rate": 0.0004146686377350173, + "loss": 2.958745002746582, + "step": 10494, + "token_acc": 0.3023653874437688 + }, + { + "epoch": 6.151861624157139, + "grad_norm": 0.244956779167131, + "learning_rate": 0.0004146504054709622, + "loss": 2.8869080543518066, + "step": 10495, + "token_acc": 0.31447863392366543 + }, + { + "epoch": 6.152447962474348, + "grad_norm": 0.17940980179641128, + "learning_rate": 0.0004146321716602385, + "loss": 2.9498534202575684, + "step": 10496, + "token_acc": 0.3067870646413622 + }, + { + "epoch": 6.153034300791557, + "grad_norm": 0.2329151701338742, + "learning_rate": 0.00041461393630301736, + "loss": 2.958310127258301, + "step": 10497, + "token_acc": 0.3040309385690912 + }, + { + "epoch": 6.153620639108766, + "grad_norm": 0.15179560881939755, + "learning_rate": 0.00041459569939947006, + "loss": 2.938229560852051, + "step": 10498, + "token_acc": 0.30802733816688754 + }, + { + "epoch": 6.154206977425975, + "grad_norm": 0.21381106397551344, + "learning_rate": 0.000414577460949768, + "loss": 2.9201273918151855, + "step": 10499, + "token_acc": 0.30766649978900024 + }, + { + "epoch": 6.154793315743184, + "grad_norm": 0.154464085089828, + "learning_rate": 0.00041455922095408245, + "loss": 2.9637928009033203, + "step": 10500, + "token_acc": 0.30151563153088634 + }, + { + "epoch": 6.1553796540603924, + "grad_norm": 0.21430677562982106, + "learning_rate": 0.00041454097941258475, + "loss": 2.930187225341797, + "step": 10501, + "token_acc": 0.3069289449496025 + }, + { + "epoch": 6.155965992377602, + "grad_norm": 0.15350299040524493, + "learning_rate": 0.0004145227363254462, + "loss": 2.920713424682617, + "step": 10502, + "token_acc": 0.3081558745550163 + }, + { + "epoch": 6.156552330694811, + "grad_norm": 0.1984802493298596, + "learning_rate": 0.00041450449169283846, + "loss": 2.915069103240967, + "step": 10503, + "token_acc": 0.3094154735321161 + }, + { + "epoch": 6.15713866901202, + "grad_norm": 0.16795478819239112, + "learning_rate": 0.00041448624551493254, + "loss": 2.9451258182525635, + "step": 10504, + "token_acc": 0.30570178947649146 + }, + { + "epoch": 6.157725007329229, + "grad_norm": 0.2139972699911268, + "learning_rate": 0.00041446799779189995, + "loss": 2.972809314727783, + "step": 10505, + "token_acc": 0.2998525210921411 + }, + { + "epoch": 6.158311345646438, + "grad_norm": 0.17299874943001023, + "learning_rate": 0.0004144497485239123, + "loss": 2.920882225036621, + "step": 10506, + "token_acc": 0.30833366075989155 + }, + { + "epoch": 6.158897683963647, + "grad_norm": 0.1965185103617262, + "learning_rate": 0.0004144314977111407, + "loss": 2.9171862602233887, + "step": 10507, + "token_acc": 0.307525509528676 + }, + { + "epoch": 6.159484022280856, + "grad_norm": 0.16348901491062998, + "learning_rate": 0.0004144132453537568, + "loss": 2.967710494995117, + "step": 10508, + "token_acc": 0.3029505257152172 + }, + { + "epoch": 6.160070360598065, + "grad_norm": 0.2332032975357556, + "learning_rate": 0.00041439499145193206, + "loss": 2.9047937393188477, + "step": 10509, + "token_acc": 0.3110390114009462 + }, + { + "epoch": 6.1606566989152745, + "grad_norm": 0.1705785540066222, + "learning_rate": 0.00041437673600583786, + "loss": 2.9172589778900146, + "step": 10510, + "token_acc": 0.31025348579352247 + }, + { + "epoch": 6.161243037232484, + "grad_norm": 0.16366281468649038, + "learning_rate": 0.0004143584790156457, + "loss": 2.9317102432250977, + "step": 10511, + "token_acc": 0.3067437460805961 + }, + { + "epoch": 6.161829375549692, + "grad_norm": 0.1601560742302561, + "learning_rate": 0.0004143402204815272, + "loss": 2.933523654937744, + "step": 10512, + "token_acc": 0.3063418092618516 + }, + { + "epoch": 6.162415713866901, + "grad_norm": 0.17373931166236636, + "learning_rate": 0.0004143219604036537, + "loss": 2.9422945976257324, + "step": 10513, + "token_acc": 0.30532538504942425 + }, + { + "epoch": 6.16300205218411, + "grad_norm": 0.18616943477253542, + "learning_rate": 0.00041430369878219685, + "loss": 2.95859956741333, + "step": 10514, + "token_acc": 0.30294241288961393 + }, + { + "epoch": 6.163588390501319, + "grad_norm": 0.1566410030301233, + "learning_rate": 0.00041428543561732815, + "loss": 2.941256523132324, + "step": 10515, + "token_acc": 0.30742792721085416 + }, + { + "epoch": 6.164174728818528, + "grad_norm": 0.21408578938545805, + "learning_rate": 0.0004142671709092191, + "loss": 2.9786064624786377, + "step": 10516, + "token_acc": 0.3022998965762136 + }, + { + "epoch": 6.164761067135737, + "grad_norm": 0.1868261068765775, + "learning_rate": 0.00041424890465804133, + "loss": 2.9580087661743164, + "step": 10517, + "token_acc": 0.30444521162742527 + }, + { + "epoch": 6.1653474054529465, + "grad_norm": 0.21312871740695563, + "learning_rate": 0.0004142306368639665, + "loss": 2.9835939407348633, + "step": 10518, + "token_acc": 0.3015483143205429 + }, + { + "epoch": 6.165933743770156, + "grad_norm": 0.2112837939078349, + "learning_rate": 0.0004142123675271661, + "loss": 2.9283437728881836, + "step": 10519, + "token_acc": 0.3074960853946843 + }, + { + "epoch": 6.166520082087365, + "grad_norm": 0.16353946420247256, + "learning_rate": 0.0004141940966478118, + "loss": 2.9730257987976074, + "step": 10520, + "token_acc": 0.30188471062186467 + }, + { + "epoch": 6.167106420404574, + "grad_norm": 0.17402030661932746, + "learning_rate": 0.0004141758242260753, + "loss": 2.939260721206665, + "step": 10521, + "token_acc": 0.3064985550241682 + }, + { + "epoch": 6.167692758721783, + "grad_norm": 0.1685335056651273, + "learning_rate": 0.000414157550262128, + "loss": 2.963682174682617, + "step": 10522, + "token_acc": 0.30210048982909454 + }, + { + "epoch": 6.168279097038991, + "grad_norm": 0.18375339375630045, + "learning_rate": 0.0004141392747561418, + "loss": 2.926851749420166, + "step": 10523, + "token_acc": 0.3093289078670122 + }, + { + "epoch": 6.1688654353562, + "grad_norm": 0.18060563101544722, + "learning_rate": 0.00041412099770828825, + "loss": 2.9130144119262695, + "step": 10524, + "token_acc": 0.3111232264670292 + }, + { + "epoch": 6.169451773673409, + "grad_norm": 0.1770623020561256, + "learning_rate": 0.0004141027191187392, + "loss": 2.9247725009918213, + "step": 10525, + "token_acc": 0.30822450766088366 + }, + { + "epoch": 6.1700381119906185, + "grad_norm": 0.18831891316628133, + "learning_rate": 0.0004140844389876661, + "loss": 2.9035959243774414, + "step": 10526, + "token_acc": 0.31152345572999307 + }, + { + "epoch": 6.170624450307828, + "grad_norm": 0.1958413440426652, + "learning_rate": 0.0004140661573152409, + "loss": 2.9531126022338867, + "step": 10527, + "token_acc": 0.30398385490381086 + }, + { + "epoch": 6.171210788625037, + "grad_norm": 0.1602918303491583, + "learning_rate": 0.0004140478741016351, + "loss": 2.9519577026367188, + "step": 10528, + "token_acc": 0.30397546605559556 + }, + { + "epoch": 6.171797126942246, + "grad_norm": 0.1682345823071401, + "learning_rate": 0.00041402958934702073, + "loss": 2.9233193397521973, + "step": 10529, + "token_acc": 0.3095600904587551 + }, + { + "epoch": 6.172383465259455, + "grad_norm": 0.18582547280023443, + "learning_rate": 0.00041401130305156934, + "loss": 2.950817108154297, + "step": 10530, + "token_acc": 0.30554172443818 + }, + { + "epoch": 6.172969803576664, + "grad_norm": 0.17912075198689428, + "learning_rate": 0.00041399301521545274, + "loss": 2.9918973445892334, + "step": 10531, + "token_acc": 0.2991281791914751 + }, + { + "epoch": 6.173556141893873, + "grad_norm": 0.16451893217338048, + "learning_rate": 0.00041397472583884276, + "loss": 2.9460551738739014, + "step": 10532, + "token_acc": 0.30513219035838157 + }, + { + "epoch": 6.174142480211081, + "grad_norm": 0.17925949619925596, + "learning_rate": 0.00041395643492191125, + "loss": 2.9483227729797363, + "step": 10533, + "token_acc": 0.3051188218984975 + }, + { + "epoch": 6.1747288185282905, + "grad_norm": 0.22341550213614858, + "learning_rate": 0.00041393814246482995, + "loss": 2.9327199459075928, + "step": 10534, + "token_acc": 0.3048220704249459 + }, + { + "epoch": 6.1753151568455, + "grad_norm": 0.1973700569113499, + "learning_rate": 0.00041391984846777066, + "loss": 2.9637317657470703, + "step": 10535, + "token_acc": 0.3035690698227353 + }, + { + "epoch": 6.175901495162709, + "grad_norm": 0.1477637010128084, + "learning_rate": 0.0004139015529309053, + "loss": 2.9454855918884277, + "step": 10536, + "token_acc": 0.3064243155511895 + }, + { + "epoch": 6.176487833479918, + "grad_norm": 0.20260139194917687, + "learning_rate": 0.00041388325585440573, + "loss": 2.9584174156188965, + "step": 10537, + "token_acc": 0.3043248903413332 + }, + { + "epoch": 6.177074171797127, + "grad_norm": 0.23760680729405217, + "learning_rate": 0.00041386495723844374, + "loss": 2.9367425441741943, + "step": 10538, + "token_acc": 0.3080889637794784 + }, + { + "epoch": 6.177660510114336, + "grad_norm": 0.17734900985459684, + "learning_rate": 0.00041384665708319135, + "loss": 2.9404971599578857, + "step": 10539, + "token_acc": 0.3058731413824277 + }, + { + "epoch": 6.178246848431545, + "grad_norm": 0.16519324863706866, + "learning_rate": 0.00041382835538882046, + "loss": 2.978525161743164, + "step": 10540, + "token_acc": 0.3001695669738085 + }, + { + "epoch": 6.178833186748754, + "grad_norm": 0.2582589652827132, + "learning_rate": 0.0004138100521555028, + "loss": 2.920891761779785, + "step": 10541, + "token_acc": 0.31047757483615146 + }, + { + "epoch": 6.179419525065963, + "grad_norm": 0.1746934332268986, + "learning_rate": 0.0004137917473834105, + "loss": 2.9548802375793457, + "step": 10542, + "token_acc": 0.3042654589586478 + }, + { + "epoch": 6.1800058633831725, + "grad_norm": 0.1754999240315258, + "learning_rate": 0.00041377344107271544, + "loss": 2.9370555877685547, + "step": 10543, + "token_acc": 0.30774912339353605 + }, + { + "epoch": 6.180592201700381, + "grad_norm": 0.21744993996331352, + "learning_rate": 0.0004137551332235896, + "loss": 2.9531164169311523, + "step": 10544, + "token_acc": 0.30436384401422073 + }, + { + "epoch": 6.18117854001759, + "grad_norm": 0.15339051426044142, + "learning_rate": 0.000413736823836205, + "loss": 3.021946907043457, + "step": 10545, + "token_acc": 0.2954817197858734 + }, + { + "epoch": 6.181764878334799, + "grad_norm": 0.19900483077986467, + "learning_rate": 0.00041371851291073346, + "loss": 2.948702096939087, + "step": 10546, + "token_acc": 0.3047502672979991 + }, + { + "epoch": 6.182351216652008, + "grad_norm": 0.17602034189120178, + "learning_rate": 0.0004137002004473471, + "loss": 2.926104784011841, + "step": 10547, + "token_acc": 0.3096696221181726 + }, + { + "epoch": 6.182937554969217, + "grad_norm": 0.17336696643789493, + "learning_rate": 0.0004136818864462181, + "loss": 2.9257566928863525, + "step": 10548, + "token_acc": 0.3082824773927578 + }, + { + "epoch": 6.183523893286426, + "grad_norm": 0.19256714152817334, + "learning_rate": 0.0004136635709075183, + "loss": 2.921236276626587, + "step": 10549, + "token_acc": 0.30967753994702213 + }, + { + "epoch": 6.184110231603635, + "grad_norm": 0.1544146454721351, + "learning_rate": 0.00041364525383141967, + "loss": 2.9522242546081543, + "step": 10550, + "token_acc": 0.3029366706672508 + }, + { + "epoch": 6.1846965699208445, + "grad_norm": 0.21067754109916811, + "learning_rate": 0.0004136269352180945, + "loss": 2.9091482162475586, + "step": 10551, + "token_acc": 0.3100873637993352 + }, + { + "epoch": 6.185282908238054, + "grad_norm": 0.1795930068297206, + "learning_rate": 0.0004136086150677147, + "loss": 2.9366633892059326, + "step": 10552, + "token_acc": 0.3061824401063588 + }, + { + "epoch": 6.185869246555263, + "grad_norm": 0.1670100388762526, + "learning_rate": 0.00041359029338045245, + "loss": 2.993654727935791, + "step": 10553, + "token_acc": 0.2977459915829697 + }, + { + "epoch": 6.186455584872472, + "grad_norm": 0.1858505244011271, + "learning_rate": 0.00041357197015647984, + "loss": 2.9274837970733643, + "step": 10554, + "token_acc": 0.30759988758859486 + }, + { + "epoch": 6.18704192318968, + "grad_norm": 0.17439971124781023, + "learning_rate": 0.00041355364539596897, + "loss": 2.893908739089966, + "step": 10555, + "token_acc": 0.31203162064119017 + }, + { + "epoch": 6.187628261506889, + "grad_norm": 0.1650632333940499, + "learning_rate": 0.000413535319099092, + "loss": 2.9777870178222656, + "step": 10556, + "token_acc": 0.29979753386740343 + }, + { + "epoch": 6.188214599824098, + "grad_norm": 0.1831232880811513, + "learning_rate": 0.0004135169912660211, + "loss": 2.92812180519104, + "step": 10557, + "token_acc": 0.3076560578976051 + }, + { + "epoch": 6.188800938141307, + "grad_norm": 0.16066796612015022, + "learning_rate": 0.0004134986618969284, + "loss": 2.896663188934326, + "step": 10558, + "token_acc": 0.31246574033563157 + }, + { + "epoch": 6.1893872764585165, + "grad_norm": 0.17697673798360594, + "learning_rate": 0.00041348033099198613, + "loss": 2.9662580490112305, + "step": 10559, + "token_acc": 0.3008626017789482 + }, + { + "epoch": 6.189973614775726, + "grad_norm": 0.18479492554503982, + "learning_rate": 0.0004134619985513663, + "loss": 2.9578285217285156, + "step": 10560, + "token_acc": 0.30268922817764965 + }, + { + "epoch": 6.190559953092935, + "grad_norm": 0.16264772367238586, + "learning_rate": 0.0004134436645752414, + "loss": 2.9407780170440674, + "step": 10561, + "token_acc": 0.3058838613117868 + }, + { + "epoch": 6.191146291410144, + "grad_norm": 0.1652355056553404, + "learning_rate": 0.0004134253290637834, + "loss": 2.9497134685516357, + "step": 10562, + "token_acc": 0.30572287996054426 + }, + { + "epoch": 6.191732629727353, + "grad_norm": 0.21199181328029393, + "learning_rate": 0.0004134069920171648, + "loss": 2.969010829925537, + "step": 10563, + "token_acc": 0.3008147145171735 + }, + { + "epoch": 6.192318968044562, + "grad_norm": 0.1907498254757952, + "learning_rate": 0.00041338865343555766, + "loss": 2.9849185943603516, + "step": 10564, + "token_acc": 0.2993372721873144 + }, + { + "epoch": 6.192905306361771, + "grad_norm": 0.1561922181528007, + "learning_rate": 0.00041337031331913424, + "loss": 2.944082021713257, + "step": 10565, + "token_acc": 0.30492502131501076 + }, + { + "epoch": 6.193491644678979, + "grad_norm": 0.17282277258781387, + "learning_rate": 0.0004133519716680668, + "loss": 2.9318013191223145, + "step": 10566, + "token_acc": 0.30631725372943414 + }, + { + "epoch": 6.1940779829961885, + "grad_norm": 0.19454831251353058, + "learning_rate": 0.0004133336284825278, + "loss": 2.950530767440796, + "step": 10567, + "token_acc": 0.30722735291065684 + }, + { + "epoch": 6.194664321313398, + "grad_norm": 0.16003830194876, + "learning_rate": 0.00041331528376268943, + "loss": 2.8945841789245605, + "step": 10568, + "token_acc": 0.3139257667019766 + }, + { + "epoch": 6.195250659630607, + "grad_norm": 0.2213236382389851, + "learning_rate": 0.0004132969375087241, + "loss": 2.9743778705596924, + "step": 10569, + "token_acc": 0.30164500662418775 + }, + { + "epoch": 6.195836997947816, + "grad_norm": 0.25683636217242495, + "learning_rate": 0.00041327858972080403, + "loss": 2.9589996337890625, + "step": 10570, + "token_acc": 0.30354978077553657 + }, + { + "epoch": 6.196423336265025, + "grad_norm": 0.15495366191015236, + "learning_rate": 0.00041326024039910165, + "loss": 2.9925193786621094, + "step": 10571, + "token_acc": 0.29781304726250774 + }, + { + "epoch": 6.197009674582234, + "grad_norm": 0.20917463335259337, + "learning_rate": 0.0004132418895437893, + "loss": 2.926042079925537, + "step": 10572, + "token_acc": 0.307583912527547 + }, + { + "epoch": 6.197596012899443, + "grad_norm": 0.20123856536472745, + "learning_rate": 0.00041322353715503936, + "loss": 2.9659156799316406, + "step": 10573, + "token_acc": 0.3021300096318941 + }, + { + "epoch": 6.198182351216652, + "grad_norm": 0.17292716563372482, + "learning_rate": 0.0004132051832330242, + "loss": 2.9990503787994385, + "step": 10574, + "token_acc": 0.2991860229031586 + }, + { + "epoch": 6.198768689533861, + "grad_norm": 0.2244204564596068, + "learning_rate": 0.0004131868277779164, + "loss": 2.947277545928955, + "step": 10575, + "token_acc": 0.30520025242221416 + }, + { + "epoch": 6.19935502785107, + "grad_norm": 0.14752679539506808, + "learning_rate": 0.00041316847078988814, + "loss": 2.9605541229248047, + "step": 10576, + "token_acc": 0.30307335322252216 + }, + { + "epoch": 6.199941366168279, + "grad_norm": 0.2431009581495613, + "learning_rate": 0.0004131501122691119, + "loss": 2.9222159385681152, + "step": 10577, + "token_acc": 0.3088159066441606 + }, + { + "epoch": 6.200527704485488, + "grad_norm": 0.2851418853037055, + "learning_rate": 0.0004131317522157604, + "loss": 2.9670753479003906, + "step": 10578, + "token_acc": 0.3025158314359568 + }, + { + "epoch": 6.201114042802697, + "grad_norm": 0.16928258551544348, + "learning_rate": 0.00041311339063000577, + "loss": 2.9220614433288574, + "step": 10579, + "token_acc": 0.30965163005073143 + }, + { + "epoch": 6.201700381119906, + "grad_norm": 0.24645415675124518, + "learning_rate": 0.00041309502751202064, + "loss": 2.925625801086426, + "step": 10580, + "token_acc": 0.30825324029469736 + }, + { + "epoch": 6.202286719437115, + "grad_norm": 0.194864823867627, + "learning_rate": 0.00041307666286197756, + "loss": 2.945146083831787, + "step": 10581, + "token_acc": 0.3050396420441165 + }, + { + "epoch": 6.202873057754324, + "grad_norm": 0.18122752535097372, + "learning_rate": 0.0004130582966800489, + "loss": 2.9606857299804688, + "step": 10582, + "token_acc": 0.3025630579803757 + }, + { + "epoch": 6.203459396071533, + "grad_norm": 0.1665079446212871, + "learning_rate": 0.00041303992896640735, + "loss": 3.0012216567993164, + "step": 10583, + "token_acc": 0.2971267719930008 + }, + { + "epoch": 6.2040457343887425, + "grad_norm": 0.17074151084204162, + "learning_rate": 0.00041302155972122533, + "loss": 2.9974000453948975, + "step": 10584, + "token_acc": 0.29681415701746494 + }, + { + "epoch": 6.204632072705952, + "grad_norm": 0.15207429357068372, + "learning_rate": 0.0004130031889446754, + "loss": 2.977205276489258, + "step": 10585, + "token_acc": 0.30183684191733445 + }, + { + "epoch": 6.205218411023161, + "grad_norm": 0.1935889226728444, + "learning_rate": 0.0004129848166369302, + "loss": 2.9002203941345215, + "step": 10586, + "token_acc": 0.3133992650896384 + }, + { + "epoch": 6.205804749340369, + "grad_norm": 0.23314623253247724, + "learning_rate": 0.00041296644279816225, + "loss": 2.931007146835327, + "step": 10587, + "token_acc": 0.30650158792403764 + }, + { + "epoch": 6.206391087657578, + "grad_norm": 0.16494521807394452, + "learning_rate": 0.0004129480674285442, + "loss": 2.9425692558288574, + "step": 10588, + "token_acc": 0.30667345369927906 + }, + { + "epoch": 6.206977425974787, + "grad_norm": 0.20479280777771355, + "learning_rate": 0.0004129296905282486, + "loss": 2.9431004524230957, + "step": 10589, + "token_acc": 0.30562295327915073 + }, + { + "epoch": 6.207563764291996, + "grad_norm": 0.20022152686696465, + "learning_rate": 0.00041291131209744816, + "loss": 2.919126510620117, + "step": 10590, + "token_acc": 0.3093269473810386 + }, + { + "epoch": 6.208150102609205, + "grad_norm": 0.20575951902433143, + "learning_rate": 0.0004128929321363154, + "loss": 2.9182581901550293, + "step": 10591, + "token_acc": 0.30933843129586364 + }, + { + "epoch": 6.2087364409264145, + "grad_norm": 0.24894864085792323, + "learning_rate": 0.00041287455064502305, + "loss": 2.976844310760498, + "step": 10592, + "token_acc": 0.30072805610592196 + }, + { + "epoch": 6.209322779243624, + "grad_norm": 0.19535920025598794, + "learning_rate": 0.0004128561676237438, + "loss": 2.9612884521484375, + "step": 10593, + "token_acc": 0.3036801475684263 + }, + { + "epoch": 6.209909117560833, + "grad_norm": 0.2622808499768753, + "learning_rate": 0.0004128377830726503, + "loss": 2.941671371459961, + "step": 10594, + "token_acc": 0.30617319031043544 + }, + { + "epoch": 6.210495455878042, + "grad_norm": 0.18015495440228485, + "learning_rate": 0.0004128193969919153, + "loss": 2.906928300857544, + "step": 10595, + "token_acc": 0.31053157034773327 + }, + { + "epoch": 6.211081794195251, + "grad_norm": 0.2505207807565127, + "learning_rate": 0.0004128010093817114, + "loss": 2.9427521228790283, + "step": 10596, + "token_acc": 0.3061699265975271 + }, + { + "epoch": 6.21166813251246, + "grad_norm": 0.18876674250551867, + "learning_rate": 0.0004127826202422114, + "loss": 2.934694290161133, + "step": 10597, + "token_acc": 0.30615599615234035 + }, + { + "epoch": 6.212254470829668, + "grad_norm": 0.21086203283000965, + "learning_rate": 0.00041276422957358804, + "loss": 2.927766799926758, + "step": 10598, + "token_acc": 0.3086313902646248 + }, + { + "epoch": 6.212840809146877, + "grad_norm": 0.16388671288980408, + "learning_rate": 0.00041274583737601413, + "loss": 2.9911603927612305, + "step": 10599, + "token_acc": 0.2996580600502761 + }, + { + "epoch": 6.2134271474640865, + "grad_norm": 0.21516643860499476, + "learning_rate": 0.0004127274436496623, + "loss": 2.9499707221984863, + "step": 10600, + "token_acc": 0.3044990610364821 + }, + { + "epoch": 6.214013485781296, + "grad_norm": 0.19291350989032158, + "learning_rate": 0.0004127090483947055, + "loss": 2.9630448818206787, + "step": 10601, + "token_acc": 0.3031496271924006 + }, + { + "epoch": 6.214599824098505, + "grad_norm": 0.19917281929130404, + "learning_rate": 0.00041269065161131634, + "loss": 2.949312210083008, + "step": 10602, + "token_acc": 0.3035276077552496 + }, + { + "epoch": 6.215186162415714, + "grad_norm": 0.18042766201609928, + "learning_rate": 0.00041267225329966786, + "loss": 2.9870777130126953, + "step": 10603, + "token_acc": 0.300208843265676 + }, + { + "epoch": 6.215772500732923, + "grad_norm": 0.20520671314327915, + "learning_rate": 0.00041265385345993263, + "loss": 2.910634994506836, + "step": 10604, + "token_acc": 0.3110755708468289 + }, + { + "epoch": 6.216358839050132, + "grad_norm": 0.14879723562835254, + "learning_rate": 0.0004126354520922837, + "loss": 2.9380698204040527, + "step": 10605, + "token_acc": 0.30740860889820143 + }, + { + "epoch": 6.216945177367341, + "grad_norm": 0.21555875902024238, + "learning_rate": 0.0004126170491968938, + "loss": 2.920599937438965, + "step": 10606, + "token_acc": 0.3099641573424179 + }, + { + "epoch": 6.21753151568455, + "grad_norm": 0.17596552856219855, + "learning_rate": 0.0004125986447739359, + "loss": 2.9261152744293213, + "step": 10607, + "token_acc": 0.3083567676268006 + }, + { + "epoch": 6.218117854001759, + "grad_norm": 0.16433946264449972, + "learning_rate": 0.0004125802388235829, + "loss": 2.9523723125457764, + "step": 10608, + "token_acc": 0.3043990141854734 + }, + { + "epoch": 6.218704192318968, + "grad_norm": 0.1678183359377147, + "learning_rate": 0.00041256183134600753, + "loss": 2.951498508453369, + "step": 10609, + "token_acc": 0.30636950727718854 + }, + { + "epoch": 6.219290530636177, + "grad_norm": 0.15843412406420335, + "learning_rate": 0.00041254342234138287, + "loss": 2.981130599975586, + "step": 10610, + "token_acc": 0.3007553274146913 + }, + { + "epoch": 6.219876868953386, + "grad_norm": 0.19014444058062469, + "learning_rate": 0.0004125250118098817, + "loss": 2.962453842163086, + "step": 10611, + "token_acc": 0.3039856029672954 + }, + { + "epoch": 6.220463207270595, + "grad_norm": 0.1506582300448767, + "learning_rate": 0.0004125065997516771, + "loss": 2.93302583694458, + "step": 10612, + "token_acc": 0.3062402545242633 + }, + { + "epoch": 6.221049545587804, + "grad_norm": 0.18270628448042128, + "learning_rate": 0.0004124881861669421, + "loss": 2.9435229301452637, + "step": 10613, + "token_acc": 0.3059334028509107 + }, + { + "epoch": 6.221635883905013, + "grad_norm": 0.17059894715104043, + "learning_rate": 0.0004124697710558494, + "loss": 2.951679229736328, + "step": 10614, + "token_acc": 0.3046258878113629 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.16600993091200414, + "learning_rate": 0.0004124513544185721, + "loss": 2.9631874561309814, + "step": 10615, + "token_acc": 0.30221542218204017 + }, + { + "epoch": 6.222808560539431, + "grad_norm": 0.1605004662713218, + "learning_rate": 0.00041243293625528333, + "loss": 2.976900577545166, + "step": 10616, + "token_acc": 0.3017416221163647 + }, + { + "epoch": 6.2233948988566405, + "grad_norm": 0.15122863828851124, + "learning_rate": 0.00041241451656615595, + "loss": 2.92988920211792, + "step": 10617, + "token_acc": 0.3089455894884698 + }, + { + "epoch": 6.22398123717385, + "grad_norm": 0.16539822499096626, + "learning_rate": 0.0004123960953513631, + "loss": 2.9358394145965576, + "step": 10618, + "token_acc": 0.30534026218206506 + }, + { + "epoch": 6.224567575491059, + "grad_norm": 0.18911295606837114, + "learning_rate": 0.00041237767261107774, + "loss": 2.9532246589660645, + "step": 10619, + "token_acc": 0.3032199429377201 + }, + { + "epoch": 6.225153913808267, + "grad_norm": 0.1907595861406025, + "learning_rate": 0.0004123592483454729, + "loss": 2.9623312950134277, + "step": 10620, + "token_acc": 0.3025152458311192 + }, + { + "epoch": 6.225740252125476, + "grad_norm": 0.16165371094321157, + "learning_rate": 0.00041234082255472174, + "loss": 2.8954315185546875, + "step": 10621, + "token_acc": 0.3123182535726155 + }, + { + "epoch": 6.226326590442685, + "grad_norm": 0.21737210165567597, + "learning_rate": 0.00041232239523899736, + "loss": 2.916842460632324, + "step": 10622, + "token_acc": 0.3094878119943768 + }, + { + "epoch": 6.226912928759894, + "grad_norm": 0.19976287297061474, + "learning_rate": 0.00041230396639847273, + "loss": 2.973588466644287, + "step": 10623, + "token_acc": 0.3016687033510994 + }, + { + "epoch": 6.227499267077103, + "grad_norm": 0.15725086569039393, + "learning_rate": 0.0004122855360333211, + "loss": 2.949235677719116, + "step": 10624, + "token_acc": 0.3049606790662839 + }, + { + "epoch": 6.2280856053943126, + "grad_norm": 0.20081187673899056, + "learning_rate": 0.0004122671041437155, + "loss": 2.9417974948883057, + "step": 10625, + "token_acc": 0.3069205860234089 + }, + { + "epoch": 6.228671943711522, + "grad_norm": 0.16556509350637266, + "learning_rate": 0.0004122486707298291, + "loss": 2.9606781005859375, + "step": 10626, + "token_acc": 0.30288652978791514 + }, + { + "epoch": 6.229258282028731, + "grad_norm": 0.18386168839143108, + "learning_rate": 0.00041223023579183514, + "loss": 2.932673454284668, + "step": 10627, + "token_acc": 0.30660960090147366 + }, + { + "epoch": 6.22984462034594, + "grad_norm": 0.18310642398555058, + "learning_rate": 0.00041221179932990674, + "loss": 2.942288398742676, + "step": 10628, + "token_acc": 0.3054085415157147 + }, + { + "epoch": 6.230430958663149, + "grad_norm": 0.15822318892666487, + "learning_rate": 0.0004121933613442169, + "loss": 2.953800678253174, + "step": 10629, + "token_acc": 0.30380770478982344 + }, + { + "epoch": 6.231017296980358, + "grad_norm": 0.1878142654466012, + "learning_rate": 0.0004121749218349392, + "loss": 2.9623899459838867, + "step": 10630, + "token_acc": 0.3034656016490595 + }, + { + "epoch": 6.231603635297566, + "grad_norm": 0.15155605011227305, + "learning_rate": 0.0004121564808022465, + "loss": 2.967517852783203, + "step": 10631, + "token_acc": 0.30249601037303014 + }, + { + "epoch": 6.2321899736147754, + "grad_norm": 0.20014022833391606, + "learning_rate": 0.00041213803824631225, + "loss": 2.956653118133545, + "step": 10632, + "token_acc": 0.3047242866245238 + }, + { + "epoch": 6.232776311931985, + "grad_norm": 0.18032777017946183, + "learning_rate": 0.0004121195941673096, + "loss": 2.9396469593048096, + "step": 10633, + "token_acc": 0.3061231566116595 + }, + { + "epoch": 6.233362650249194, + "grad_norm": 0.15933292269745034, + "learning_rate": 0.00041210114856541177, + "loss": 2.9592173099517822, + "step": 10634, + "token_acc": 0.3035840988995512 + }, + { + "epoch": 6.233948988566403, + "grad_norm": 0.2050048742268512, + "learning_rate": 0.0004120827014407921, + "loss": 2.911478042602539, + "step": 10635, + "token_acc": 0.3101621514834859 + }, + { + "epoch": 6.234535326883612, + "grad_norm": 0.17723124510921007, + "learning_rate": 0.0004120642527936239, + "loss": 2.9554505348205566, + "step": 10636, + "token_acc": 0.30478401881148076 + }, + { + "epoch": 6.235121665200821, + "grad_norm": 0.15100640746872449, + "learning_rate": 0.0004120458026240804, + "loss": 2.9596121311187744, + "step": 10637, + "token_acc": 0.3047194994676862 + }, + { + "epoch": 6.23570800351803, + "grad_norm": 0.16579914506750182, + "learning_rate": 0.00041202735093233487, + "loss": 2.9411568641662598, + "step": 10638, + "token_acc": 0.3059126640542553 + }, + { + "epoch": 6.236294341835239, + "grad_norm": 0.14967973179232463, + "learning_rate": 0.0004120088977185609, + "loss": 2.9596283435821533, + "step": 10639, + "token_acc": 0.3033594265877202 + }, + { + "epoch": 6.236880680152448, + "grad_norm": 0.2377987444772846, + "learning_rate": 0.00041199044298293147, + "loss": 2.903804302215576, + "step": 10640, + "token_acc": 0.31147248179389964 + }, + { + "epoch": 6.237467018469657, + "grad_norm": 0.2167525194078, + "learning_rate": 0.0004119719867256202, + "loss": 2.950406312942505, + "step": 10641, + "token_acc": 0.304936425247544 + }, + { + "epoch": 6.238053356786866, + "grad_norm": 0.14901019535474866, + "learning_rate": 0.00041195352894680037, + "loss": 2.9378597736358643, + "step": 10642, + "token_acc": 0.30805022156573114 + }, + { + "epoch": 6.238639695104075, + "grad_norm": 0.2271325239192477, + "learning_rate": 0.00041193506964664527, + "loss": 2.9547629356384277, + "step": 10643, + "token_acc": 0.30463851782238033 + }, + { + "epoch": 6.239226033421284, + "grad_norm": 0.19032938534746327, + "learning_rate": 0.00041191660882532855, + "loss": 2.9310765266418457, + "step": 10644, + "token_acc": 0.3073967238243222 + }, + { + "epoch": 6.239812371738493, + "grad_norm": 0.15363592005239932, + "learning_rate": 0.00041189814648302336, + "loss": 2.9882822036743164, + "step": 10645, + "token_acc": 0.2987873127127606 + }, + { + "epoch": 6.240398710055702, + "grad_norm": 0.15949628038650054, + "learning_rate": 0.0004118796826199034, + "loss": 2.9734439849853516, + "step": 10646, + "token_acc": 0.3001459846179443 + }, + { + "epoch": 6.240985048372911, + "grad_norm": 0.15152112932088116, + "learning_rate": 0.0004118612172361418, + "loss": 2.9163129329681396, + "step": 10647, + "token_acc": 0.31020423412204234 + }, + { + "epoch": 6.24157138669012, + "grad_norm": 0.15445323110352266, + "learning_rate": 0.0004118427503319122, + "loss": 2.8911545276641846, + "step": 10648, + "token_acc": 0.3149703208933219 + }, + { + "epoch": 6.2421577250073295, + "grad_norm": 0.15214329116770345, + "learning_rate": 0.0004118242819073881, + "loss": 2.919991970062256, + "step": 10649, + "token_acc": 0.3093935395277483 + }, + { + "epoch": 6.242744063324539, + "grad_norm": 0.15624040839653897, + "learning_rate": 0.0004118058119627429, + "loss": 2.9900355339050293, + "step": 10650, + "token_acc": 0.29949567778311853 + }, + { + "epoch": 6.243330401641748, + "grad_norm": 0.149668742436633, + "learning_rate": 0.0004117873404981501, + "loss": 2.9700136184692383, + "step": 10651, + "token_acc": 0.3019914859790601 + }, + { + "epoch": 6.243916739958956, + "grad_norm": 0.15648229810442482, + "learning_rate": 0.0004117688675137834, + "loss": 2.958305597305298, + "step": 10652, + "token_acc": 0.30462669618737004 + }, + { + "epoch": 6.244503078276165, + "grad_norm": 0.15091441042687145, + "learning_rate": 0.000411750393009816, + "loss": 2.927462577819824, + "step": 10653, + "token_acc": 0.30887810883893435 + }, + { + "epoch": 6.245089416593374, + "grad_norm": 0.15837125224753454, + "learning_rate": 0.00041173191698642167, + "loss": 2.957767963409424, + "step": 10654, + "token_acc": 0.3044582767721982 + }, + { + "epoch": 6.245675754910583, + "grad_norm": 0.16857482643157404, + "learning_rate": 0.0004117134394437739, + "loss": 2.948213577270508, + "step": 10655, + "token_acc": 0.30410502498818964 + }, + { + "epoch": 6.246262093227792, + "grad_norm": 0.16149726185990995, + "learning_rate": 0.00041169496038204634, + "loss": 2.927992105484009, + "step": 10656, + "token_acc": 0.3076125056979833 + }, + { + "epoch": 6.2468484315450015, + "grad_norm": 0.20331302161744527, + "learning_rate": 0.00041167647980141256, + "loss": 2.9670023918151855, + "step": 10657, + "token_acc": 0.3004794871794872 + }, + { + "epoch": 6.247434769862211, + "grad_norm": 0.2307344995926057, + "learning_rate": 0.00041165799770204593, + "loss": 2.954352378845215, + "step": 10658, + "token_acc": 0.303768237091992 + }, + { + "epoch": 6.24802110817942, + "grad_norm": 0.1846547138070567, + "learning_rate": 0.0004116395140841204, + "loss": 2.945906162261963, + "step": 10659, + "token_acc": 0.3035366356109036 + }, + { + "epoch": 6.248607446496629, + "grad_norm": 0.16546477917520447, + "learning_rate": 0.0004116210289478094, + "loss": 2.9402894973754883, + "step": 10660, + "token_acc": 0.30738529512870805 + }, + { + "epoch": 6.249193784813838, + "grad_norm": 0.21702884761918026, + "learning_rate": 0.0004116025422932866, + "loss": 3.002927780151367, + "step": 10661, + "token_acc": 0.29701584284231025 + }, + { + "epoch": 6.249780123131047, + "grad_norm": 0.2038536774505332, + "learning_rate": 0.00041158405412072575, + "loss": 2.921492338180542, + "step": 10662, + "token_acc": 0.3101951371131409 + }, + { + "epoch": 6.250366461448255, + "grad_norm": 0.15536977333799726, + "learning_rate": 0.00041156556443030037, + "loss": 2.936476230621338, + "step": 10663, + "token_acc": 0.3072034285386167 + }, + { + "epoch": 6.250952799765464, + "grad_norm": 0.18730158787331286, + "learning_rate": 0.0004115470732221842, + "loss": 2.9516639709472656, + "step": 10664, + "token_acc": 0.3054843397564818 + }, + { + "epoch": 6.2515391380826735, + "grad_norm": 0.1457630167677713, + "learning_rate": 0.0004115285804965511, + "loss": 2.9442296028137207, + "step": 10665, + "token_acc": 0.3063626443917673 + }, + { + "epoch": 6.252125476399883, + "grad_norm": 0.20472423990636346, + "learning_rate": 0.00041151008625357456, + "loss": 2.912977933883667, + "step": 10666, + "token_acc": 0.3107814258247451 + }, + { + "epoch": 6.252711814717092, + "grad_norm": 0.15686404190526013, + "learning_rate": 0.0004114915904934284, + "loss": 2.972034454345703, + "step": 10667, + "token_acc": 0.3011597360158903 + }, + { + "epoch": 6.253298153034301, + "grad_norm": 0.19565923117806938, + "learning_rate": 0.00041147309321628634, + "loss": 2.9297335147857666, + "step": 10668, + "token_acc": 0.30848329048843187 + }, + { + "epoch": 6.25388449135151, + "grad_norm": 0.2373732982714425, + "learning_rate": 0.00041145459442232214, + "loss": 2.9422566890716553, + "step": 10669, + "token_acc": 0.30595433342563205 + }, + { + "epoch": 6.254470829668719, + "grad_norm": 0.16385294194391983, + "learning_rate": 0.0004114360941117097, + "loss": 2.9481852054595947, + "step": 10670, + "token_acc": 0.30433493711665294 + }, + { + "epoch": 6.255057167985928, + "grad_norm": 0.18714660397597585, + "learning_rate": 0.00041141759228462263, + "loss": 2.9513778686523438, + "step": 10671, + "token_acc": 0.30406270533634255 + }, + { + "epoch": 6.255643506303137, + "grad_norm": 0.21346380241059978, + "learning_rate": 0.00041139908894123476, + "loss": 2.947744607925415, + "step": 10672, + "token_acc": 0.30515592504463224 + }, + { + "epoch": 6.256229844620346, + "grad_norm": 0.1500678347499334, + "learning_rate": 0.00041138058408171993, + "loss": 2.9296913146972656, + "step": 10673, + "token_acc": 0.3067953499015513 + }, + { + "epoch": 6.256816182937555, + "grad_norm": 0.23487004917199197, + "learning_rate": 0.000411362077706252, + "loss": 2.913830280303955, + "step": 10674, + "token_acc": 0.31146577579885587 + }, + { + "epoch": 6.257402521254764, + "grad_norm": 0.15953763185650172, + "learning_rate": 0.00041134356981500475, + "loss": 2.967125654220581, + "step": 10675, + "token_acc": 0.3017391736440657 + }, + { + "epoch": 6.257988859571973, + "grad_norm": 0.2107512876926024, + "learning_rate": 0.0004113250604081522, + "loss": 2.9537878036499023, + "step": 10676, + "token_acc": 0.3058582905667146 + }, + { + "epoch": 6.258575197889182, + "grad_norm": 0.16089644847551893, + "learning_rate": 0.000411306549485868, + "loss": 2.953906297683716, + "step": 10677, + "token_acc": 0.30430261626654026 + }, + { + "epoch": 6.259161536206391, + "grad_norm": 0.2097340429890355, + "learning_rate": 0.0004112880370483261, + "loss": 2.949415922164917, + "step": 10678, + "token_acc": 0.3052664324877834 + }, + { + "epoch": 6.2597478745236, + "grad_norm": 0.18834356685945014, + "learning_rate": 0.0004112695230957005, + "loss": 2.979276180267334, + "step": 10679, + "token_acc": 0.2990369016043056 + }, + { + "epoch": 6.260334212840809, + "grad_norm": 0.18317155903659374, + "learning_rate": 0.00041125100762816504, + "loss": 2.935314655303955, + "step": 10680, + "token_acc": 0.3066216549050684 + }, + { + "epoch": 6.260920551158018, + "grad_norm": 0.16897686751445973, + "learning_rate": 0.00041123249064589363, + "loss": 2.944854259490967, + "step": 10681, + "token_acc": 0.3066739259803239 + }, + { + "epoch": 6.2615068894752275, + "grad_norm": 0.17778404515846574, + "learning_rate": 0.0004112139721490602, + "loss": 2.9273746013641357, + "step": 10682, + "token_acc": 0.30693585655827643 + }, + { + "epoch": 6.262093227792437, + "grad_norm": 0.1755344305416732, + "learning_rate": 0.00041119545213783883, + "loss": 2.9078211784362793, + "step": 10683, + "token_acc": 0.3113049211772463 + }, + { + "epoch": 6.262679566109645, + "grad_norm": 0.16409451095068597, + "learning_rate": 0.00041117693061240335, + "loss": 2.943479061126709, + "step": 10684, + "token_acc": 0.30657008104666605 + }, + { + "epoch": 6.263265904426854, + "grad_norm": 0.15835245487741179, + "learning_rate": 0.0004111584075729278, + "loss": 2.9448561668395996, + "step": 10685, + "token_acc": 0.30608135742525455 + }, + { + "epoch": 6.263852242744063, + "grad_norm": 0.1818783039714962, + "learning_rate": 0.00041113988301958614, + "loss": 2.989250659942627, + "step": 10686, + "token_acc": 0.29814601686021824 + }, + { + "epoch": 6.264438581061272, + "grad_norm": 0.16556651322131954, + "learning_rate": 0.0004111213569525524, + "loss": 2.9533286094665527, + "step": 10687, + "token_acc": 0.3042867452736906 + }, + { + "epoch": 6.265024919378481, + "grad_norm": 0.15965665324936243, + "learning_rate": 0.0004111028293720007, + "loss": 2.949902057647705, + "step": 10688, + "token_acc": 0.305798246243946 + }, + { + "epoch": 6.26561125769569, + "grad_norm": 0.17584770587275117, + "learning_rate": 0.00041108430027810494, + "loss": 2.9338154792785645, + "step": 10689, + "token_acc": 0.3059811133402233 + }, + { + "epoch": 6.2661975960128995, + "grad_norm": 0.16018187994820884, + "learning_rate": 0.0004110657696710393, + "loss": 2.9734272956848145, + "step": 10690, + "token_acc": 0.30139277387735225 + }, + { + "epoch": 6.266783934330109, + "grad_norm": 0.18175213791119027, + "learning_rate": 0.00041104723755097785, + "loss": 3.0040438175201416, + "step": 10691, + "token_acc": 0.29737426935792494 + }, + { + "epoch": 6.267370272647318, + "grad_norm": 0.16634845346889324, + "learning_rate": 0.00041102870391809446, + "loss": 2.945624351501465, + "step": 10692, + "token_acc": 0.30524247122808595 + }, + { + "epoch": 6.267956610964527, + "grad_norm": 0.1659187729207687, + "learning_rate": 0.0004110101687725635, + "loss": 2.9529051780700684, + "step": 10693, + "token_acc": 0.3056869242565604 + }, + { + "epoch": 6.268542949281736, + "grad_norm": 0.1493977641897921, + "learning_rate": 0.0004109916321145589, + "loss": 2.944582223892212, + "step": 10694, + "token_acc": 0.3062965597777211 + }, + { + "epoch": 6.269129287598945, + "grad_norm": 0.1640755280303279, + "learning_rate": 0.0004109730939442549, + "loss": 2.9829139709472656, + "step": 10695, + "token_acc": 0.29930684898607823 + }, + { + "epoch": 6.269715625916153, + "grad_norm": 0.16639141992061351, + "learning_rate": 0.0004109545542618256, + "loss": 2.9654757976531982, + "step": 10696, + "token_acc": 0.30261252961305607 + }, + { + "epoch": 6.270301964233362, + "grad_norm": 0.17800215813250558, + "learning_rate": 0.0004109360130674451, + "loss": 2.956394672393799, + "step": 10697, + "token_acc": 0.303410135894554 + }, + { + "epoch": 6.2708883025505715, + "grad_norm": 0.17035056023812367, + "learning_rate": 0.0004109174703612877, + "loss": 2.9148378372192383, + "step": 10698, + "token_acc": 0.3102014983208473 + }, + { + "epoch": 6.271474640867781, + "grad_norm": 0.15090712339991302, + "learning_rate": 0.0004108989261435274, + "loss": 2.9530391693115234, + "step": 10699, + "token_acc": 0.3048058362740207 + }, + { + "epoch": 6.27206097918499, + "grad_norm": 0.17085833154876076, + "learning_rate": 0.0004108803804143386, + "loss": 2.9764904975891113, + "step": 10700, + "token_acc": 0.30132786648633914 + }, + { + "epoch": 6.272647317502199, + "grad_norm": 0.15338950763181258, + "learning_rate": 0.00041086183317389543, + "loss": 2.974369525909424, + "step": 10701, + "token_acc": 0.3027741707000354 + }, + { + "epoch": 6.273233655819408, + "grad_norm": 0.15494470286132764, + "learning_rate": 0.000410843284422372, + "loss": 2.964383602142334, + "step": 10702, + "token_acc": 0.3028221314519124 + }, + { + "epoch": 6.273819994136617, + "grad_norm": 0.17371430507974794, + "learning_rate": 0.0004108247341599427, + "loss": 2.9476237297058105, + "step": 10703, + "token_acc": 0.3035151559640724 + }, + { + "epoch": 6.274406332453826, + "grad_norm": 0.1867314652725611, + "learning_rate": 0.0004108061823867818, + "loss": 2.9776952266693115, + "step": 10704, + "token_acc": 0.3020551805327689 + }, + { + "epoch": 6.274992670771035, + "grad_norm": 0.25109485237791007, + "learning_rate": 0.00041078762910306345, + "loss": 2.941995143890381, + "step": 10705, + "token_acc": 0.30628548895899055 + }, + { + "epoch": 6.2755790090882435, + "grad_norm": 0.22445036432298882, + "learning_rate": 0.00041076907430896193, + "loss": 2.9733471870422363, + "step": 10706, + "token_acc": 0.3021375593126641 + }, + { + "epoch": 6.276165347405453, + "grad_norm": 0.16614532392500536, + "learning_rate": 0.00041075051800465163, + "loss": 2.9882400035858154, + "step": 10707, + "token_acc": 0.3001606803084854 + }, + { + "epoch": 6.276751685722662, + "grad_norm": 0.16250963895791565, + "learning_rate": 0.0004107319601903069, + "loss": 2.941617488861084, + "step": 10708, + "token_acc": 0.3065080348830877 + }, + { + "epoch": 6.277338024039871, + "grad_norm": 0.18435238114858168, + "learning_rate": 0.0004107134008661019, + "loss": 2.944436550140381, + "step": 10709, + "token_acc": 0.3057469335088736 + }, + { + "epoch": 6.27792436235708, + "grad_norm": 0.19967445659517602, + "learning_rate": 0.00041069484003221113, + "loss": 2.905338764190674, + "step": 10710, + "token_acc": 0.3113947012084515 + }, + { + "epoch": 6.278510700674289, + "grad_norm": 0.16198568081202638, + "learning_rate": 0.00041067627768880886, + "loss": 2.9634203910827637, + "step": 10711, + "token_acc": 0.30309914765605417 + }, + { + "epoch": 6.279097038991498, + "grad_norm": 0.1792011208820571, + "learning_rate": 0.00041065771383606945, + "loss": 2.968311309814453, + "step": 10712, + "token_acc": 0.3022901860535549 + }, + { + "epoch": 6.279683377308707, + "grad_norm": 0.17346103028731594, + "learning_rate": 0.00041063914847416734, + "loss": 2.9342784881591797, + "step": 10713, + "token_acc": 0.3077182751168892 + }, + { + "epoch": 6.280269715625916, + "grad_norm": 0.22581804042184764, + "learning_rate": 0.0004106205816032769, + "loss": 2.973550796508789, + "step": 10714, + "token_acc": 0.30108425021815766 + }, + { + "epoch": 6.2808560539431255, + "grad_norm": 0.21867694455752734, + "learning_rate": 0.0004106020132235725, + "loss": 2.9506702423095703, + "step": 10715, + "token_acc": 0.30548125299553114 + }, + { + "epoch": 6.281442392260335, + "grad_norm": 0.18094367586678523, + "learning_rate": 0.00041058344333522856, + "loss": 2.921837091445923, + "step": 10716, + "token_acc": 0.30944779165942615 + }, + { + "epoch": 6.282028730577543, + "grad_norm": 0.18854522143815203, + "learning_rate": 0.0004105648719384196, + "loss": 2.9769506454467773, + "step": 10717, + "token_acc": 0.30264637819808404 + }, + { + "epoch": 6.282615068894752, + "grad_norm": 0.18390104987635805, + "learning_rate": 0.0004105462990333201, + "loss": 2.9837560653686523, + "step": 10718, + "token_acc": 0.3003359462486002 + }, + { + "epoch": 6.283201407211961, + "grad_norm": 0.21556799366533075, + "learning_rate": 0.00041052772462010437, + "loss": 2.9653615951538086, + "step": 10719, + "token_acc": 0.3020806044284232 + }, + { + "epoch": 6.28378774552917, + "grad_norm": 0.1676256073899917, + "learning_rate": 0.00041050914869894707, + "loss": 2.9772818088531494, + "step": 10720, + "token_acc": 0.30162542869532083 + }, + { + "epoch": 6.284374083846379, + "grad_norm": 0.17275367495038868, + "learning_rate": 0.00041049057127002256, + "loss": 2.991433620452881, + "step": 10721, + "token_acc": 0.29839881285930725 + }, + { + "epoch": 6.284960422163588, + "grad_norm": 0.18609523295302807, + "learning_rate": 0.0004104719923335053, + "loss": 2.952453136444092, + "step": 10722, + "token_acc": 0.30565136362443235 + }, + { + "epoch": 6.2855467604807975, + "grad_norm": 0.16171305880854078, + "learning_rate": 0.00041045341188957005, + "loss": 2.9374561309814453, + "step": 10723, + "token_acc": 0.30677385220398945 + }, + { + "epoch": 6.286133098798007, + "grad_norm": 0.17842961758578577, + "learning_rate": 0.0004104348299383911, + "loss": 2.9456706047058105, + "step": 10724, + "token_acc": 0.30553190394731766 + }, + { + "epoch": 6.286719437115216, + "grad_norm": 0.2428224842586848, + "learning_rate": 0.0004104162464801432, + "loss": 2.9306259155273438, + "step": 10725, + "token_acc": 0.30736897805755303 + }, + { + "epoch": 6.287305775432425, + "grad_norm": 0.2378052573823697, + "learning_rate": 0.00041039766151500077, + "loss": 2.939516067504883, + "step": 10726, + "token_acc": 0.30643698309731404 + }, + { + "epoch": 6.287892113749633, + "grad_norm": 0.23035767689268827, + "learning_rate": 0.00041037907504313853, + "loss": 3.0056021213531494, + "step": 10727, + "token_acc": 0.29878050315890725 + }, + { + "epoch": 6.288478452066842, + "grad_norm": 0.15496644939193877, + "learning_rate": 0.00041036048706473096, + "loss": 2.8970255851745605, + "step": 10728, + "token_acc": 0.31402825969365544 + }, + { + "epoch": 6.289064790384051, + "grad_norm": 0.20608017110062515, + "learning_rate": 0.0004103418975799527, + "loss": 2.9943814277648926, + "step": 10729, + "token_acc": 0.2984552578933856 + }, + { + "epoch": 6.28965112870126, + "grad_norm": 0.26899912828276584, + "learning_rate": 0.0004103233065889783, + "loss": 2.965994358062744, + "step": 10730, + "token_acc": 0.30234542376236034 + }, + { + "epoch": 6.2902374670184695, + "grad_norm": 0.17017776867976034, + "learning_rate": 0.0004103047140919825, + "loss": 2.9503173828125, + "step": 10731, + "token_acc": 0.303994812810691 + }, + { + "epoch": 6.290823805335679, + "grad_norm": 0.18525437790137422, + "learning_rate": 0.00041028612008914, + "loss": 2.987330198287964, + "step": 10732, + "token_acc": 0.29802851438739636 + }, + { + "epoch": 6.291410143652888, + "grad_norm": 0.21629953656711082, + "learning_rate": 0.0004102675245806253, + "loss": 2.9833006858825684, + "step": 10733, + "token_acc": 0.3003798509313527 + }, + { + "epoch": 6.291996481970097, + "grad_norm": 0.15273965632863779, + "learning_rate": 0.00041024892756661325, + "loss": 2.9671432971954346, + "step": 10734, + "token_acc": 0.3008568336622318 + }, + { + "epoch": 6.292582820287306, + "grad_norm": 0.20384874288048652, + "learning_rate": 0.0004102303290472784, + "loss": 2.973121166229248, + "step": 10735, + "token_acc": 0.30207967149138226 + }, + { + "epoch": 6.293169158604515, + "grad_norm": 0.16719179953947944, + "learning_rate": 0.00041021172902279553, + "loss": 2.938772678375244, + "step": 10736, + "token_acc": 0.3071587031384009 + }, + { + "epoch": 6.293755496921724, + "grad_norm": 0.2056027587014875, + "learning_rate": 0.0004101931274933394, + "loss": 2.9587855339050293, + "step": 10737, + "token_acc": 0.30437942797166095 + }, + { + "epoch": 6.294341835238933, + "grad_norm": 0.19024847996486047, + "learning_rate": 0.00041017452445908463, + "loss": 2.9432830810546875, + "step": 10738, + "token_acc": 0.3062739482972946 + }, + { + "epoch": 6.2949281735561415, + "grad_norm": 0.18658931745565918, + "learning_rate": 0.00041015591992020614, + "loss": 2.933835029602051, + "step": 10739, + "token_acc": 0.3055280905892883 + }, + { + "epoch": 6.295514511873351, + "grad_norm": 0.21594335753156196, + "learning_rate": 0.00041013731387687857, + "loss": 2.98113751411438, + "step": 10740, + "token_acc": 0.3006611882246285 + }, + { + "epoch": 6.29610085019056, + "grad_norm": 0.1716717836494016, + "learning_rate": 0.0004101187063292766, + "loss": 2.9529056549072266, + "step": 10741, + "token_acc": 0.30346752739282407 + }, + { + "epoch": 6.296687188507769, + "grad_norm": 0.2432535300491896, + "learning_rate": 0.00041010009727757526, + "loss": 2.987607955932617, + "step": 10742, + "token_acc": 0.3003444897278479 + }, + { + "epoch": 6.297273526824978, + "grad_norm": 0.16032287542900392, + "learning_rate": 0.0004100814867219492, + "loss": 2.9829154014587402, + "step": 10743, + "token_acc": 0.3025430167358644 + }, + { + "epoch": 6.297859865142187, + "grad_norm": 0.294085684663191, + "learning_rate": 0.00041006287466257337, + "loss": 2.992262601852417, + "step": 10744, + "token_acc": 0.29858235432450947 + }, + { + "epoch": 6.298446203459396, + "grad_norm": 0.1593908889941105, + "learning_rate": 0.0004100442610996224, + "loss": 2.9359092712402344, + "step": 10745, + "token_acc": 0.3088082039767249 + }, + { + "epoch": 6.299032541776605, + "grad_norm": 0.2248581368980056, + "learning_rate": 0.0004100256460332713, + "loss": 2.978719711303711, + "step": 10746, + "token_acc": 0.3001815203897472 + }, + { + "epoch": 6.299618880093814, + "grad_norm": 0.1814648435840269, + "learning_rate": 0.00041000702946369493, + "loss": 2.9123282432556152, + "step": 10747, + "token_acc": 0.3106841949447507 + }, + { + "epoch": 6.3002052184110235, + "grad_norm": 0.22385824081681255, + "learning_rate": 0.00040998841139106815, + "loss": 2.9500255584716797, + "step": 10748, + "token_acc": 0.3046547691013746 + }, + { + "epoch": 6.300791556728232, + "grad_norm": 0.1639904790435948, + "learning_rate": 0.0004099697918155658, + "loss": 2.966064453125, + "step": 10749, + "token_acc": 0.30261989688634133 + }, + { + "epoch": 6.301377895045441, + "grad_norm": 0.1943748910450793, + "learning_rate": 0.0004099511707373628, + "loss": 2.9536399841308594, + "step": 10750, + "token_acc": 0.3056278753174933 + }, + { + "epoch": 6.30196423336265, + "grad_norm": 0.1649119386355125, + "learning_rate": 0.0004099325481566341, + "loss": 2.944915771484375, + "step": 10751, + "token_acc": 0.30660730261540153 + }, + { + "epoch": 6.302550571679859, + "grad_norm": 0.2000970836482914, + "learning_rate": 0.0004099139240735546, + "loss": 2.96683406829834, + "step": 10752, + "token_acc": 0.3023114666775932 + }, + { + "epoch": 6.303136909997068, + "grad_norm": 0.1568036186972172, + "learning_rate": 0.0004098952984882993, + "loss": 2.9767005443573, + "step": 10753, + "token_acc": 0.30112432190068533 + }, + { + "epoch": 6.303723248314277, + "grad_norm": 0.1849768311965063, + "learning_rate": 0.0004098766714010431, + "loss": 2.910928249359131, + "step": 10754, + "token_acc": 0.31102051305388323 + }, + { + "epoch": 6.304309586631486, + "grad_norm": 0.160824079004354, + "learning_rate": 0.00040985804281196104, + "loss": 2.996994972229004, + "step": 10755, + "token_acc": 0.29731959309805167 + }, + { + "epoch": 6.3048959249486956, + "grad_norm": 0.16811320855697715, + "learning_rate": 0.000409839412721228, + "loss": 2.9906435012817383, + "step": 10756, + "token_acc": 0.30185384569139373 + }, + { + "epoch": 6.305482263265905, + "grad_norm": 0.15207529638282197, + "learning_rate": 0.0004098207811290192, + "loss": 2.9579620361328125, + "step": 10757, + "token_acc": 0.3035878659739322 + }, + { + "epoch": 6.306068601583114, + "grad_norm": 0.19288553176804735, + "learning_rate": 0.0004098021480355095, + "loss": 2.99324893951416, + "step": 10758, + "token_acc": 0.2994201955947556 + }, + { + "epoch": 6.306654939900323, + "grad_norm": 0.20553617397124965, + "learning_rate": 0.0004097835134408739, + "loss": 2.9699487686157227, + "step": 10759, + "token_acc": 0.30162164960461774 + }, + { + "epoch": 6.307241278217531, + "grad_norm": 0.15684226834572326, + "learning_rate": 0.0004097648773452876, + "loss": 2.9380526542663574, + "step": 10760, + "token_acc": 0.30702303521597507 + }, + { + "epoch": 6.30782761653474, + "grad_norm": 0.15898458431356147, + "learning_rate": 0.0004097462397489253, + "loss": 3.0039005279541016, + "step": 10761, + "token_acc": 0.29803386889360584 + }, + { + "epoch": 6.308413954851949, + "grad_norm": 0.17580300484744313, + "learning_rate": 0.00040972760065196255, + "loss": 2.9903974533081055, + "step": 10762, + "token_acc": 0.2973094996536004 + }, + { + "epoch": 6.3090002931691584, + "grad_norm": 0.25844011839613906, + "learning_rate": 0.00040970896005457425, + "loss": 2.925718307495117, + "step": 10763, + "token_acc": 0.3072864578018165 + }, + { + "epoch": 6.309586631486368, + "grad_norm": 0.2181514226615147, + "learning_rate": 0.0004096903179569354, + "loss": 2.9795279502868652, + "step": 10764, + "token_acc": 0.2998364532948829 + }, + { + "epoch": 6.310172969803577, + "grad_norm": 0.16606603364458433, + "learning_rate": 0.00040967167435922126, + "loss": 2.9574387073516846, + "step": 10765, + "token_acc": 0.30466992445521823 + }, + { + "epoch": 6.310759308120786, + "grad_norm": 0.17189873451296944, + "learning_rate": 0.0004096530292616068, + "loss": 2.946988105773926, + "step": 10766, + "token_acc": 0.30562167909265414 + }, + { + "epoch": 6.311345646437995, + "grad_norm": 0.1541542443566039, + "learning_rate": 0.0004096343826642673, + "loss": 2.9323983192443848, + "step": 10767, + "token_acc": 0.3067842168624684 + }, + { + "epoch": 6.311931984755204, + "grad_norm": 0.16921138473562003, + "learning_rate": 0.000409615734567378, + "loss": 2.953695774078369, + "step": 10768, + "token_acc": 0.30343248952813 + }, + { + "epoch": 6.312518323072413, + "grad_norm": 0.16116794866834921, + "learning_rate": 0.0004095970849711138, + "loss": 2.958869457244873, + "step": 10769, + "token_acc": 0.30621989926436816 + }, + { + "epoch": 6.313104661389621, + "grad_norm": 0.16758321234128207, + "learning_rate": 0.00040957843387565016, + "loss": 2.9617886543273926, + "step": 10770, + "token_acc": 0.30238406253140054 + }, + { + "epoch": 6.3136909997068305, + "grad_norm": 0.1783437112168568, + "learning_rate": 0.0004095597812811621, + "loss": 2.952045202255249, + "step": 10771, + "token_acc": 0.30430610216865195 + }, + { + "epoch": 6.31427733802404, + "grad_norm": 0.15884957519110054, + "learning_rate": 0.0004095411271878249, + "loss": 2.9516546726226807, + "step": 10772, + "token_acc": 0.305449233951361 + }, + { + "epoch": 6.314863676341249, + "grad_norm": 0.15640376890595595, + "learning_rate": 0.00040952247159581383, + "loss": 2.9838786125183105, + "step": 10773, + "token_acc": 0.2995808277929284 + }, + { + "epoch": 6.315450014658458, + "grad_norm": 0.1877203457120048, + "learning_rate": 0.00040950381450530416, + "loss": 2.944627285003662, + "step": 10774, + "token_acc": 0.30475273216431986 + }, + { + "epoch": 6.316036352975667, + "grad_norm": 0.1810460021781974, + "learning_rate": 0.00040948515591647094, + "loss": 3.0127995014190674, + "step": 10775, + "token_acc": 0.2967367517250767 + }, + { + "epoch": 6.316622691292876, + "grad_norm": 0.17636068801339932, + "learning_rate": 0.0004094664958294897, + "loss": 2.928328514099121, + "step": 10776, + "token_acc": 0.30906427956368254 + }, + { + "epoch": 6.317209029610085, + "grad_norm": 0.17743338094801225, + "learning_rate": 0.00040944783424453555, + "loss": 2.993743896484375, + "step": 10777, + "token_acc": 0.29896734306298123 + }, + { + "epoch": 6.317795367927294, + "grad_norm": 0.1611740609120004, + "learning_rate": 0.0004094291711617839, + "loss": 2.9498472213745117, + "step": 10778, + "token_acc": 0.30496136786378597 + }, + { + "epoch": 6.318381706244503, + "grad_norm": 0.25534089475162786, + "learning_rate": 0.00040941050658141004, + "loss": 2.963923692703247, + "step": 10779, + "token_acc": 0.3022430900817393 + }, + { + "epoch": 6.3189680445617125, + "grad_norm": 0.21271098988981793, + "learning_rate": 0.00040939184050358925, + "loss": 2.9459118843078613, + "step": 10780, + "token_acc": 0.3060134207182 + }, + { + "epoch": 6.319554382878922, + "grad_norm": 0.21128522894272903, + "learning_rate": 0.00040937317292849687, + "loss": 2.965941905975342, + "step": 10781, + "token_acc": 0.3023571986021842 + }, + { + "epoch": 6.32014072119613, + "grad_norm": 0.21093245312582917, + "learning_rate": 0.00040935450385630836, + "loss": 2.956766128540039, + "step": 10782, + "token_acc": 0.30266745190818767 + }, + { + "epoch": 6.320727059513339, + "grad_norm": 0.2275620704213413, + "learning_rate": 0.000409335833287199, + "loss": 2.9423980712890625, + "step": 10783, + "token_acc": 0.306085447433278 + }, + { + "epoch": 6.321313397830548, + "grad_norm": 0.2927460585484521, + "learning_rate": 0.0004093171612213441, + "loss": 2.919191360473633, + "step": 10784, + "token_acc": 0.3102671628824848 + }, + { + "epoch": 6.321899736147757, + "grad_norm": 0.1822316931657284, + "learning_rate": 0.00040929848765891926, + "loss": 3.0211868286132812, + "step": 10785, + "token_acc": 0.2941877710641926 + }, + { + "epoch": 6.322486074464966, + "grad_norm": 0.27725910245655755, + "learning_rate": 0.0004092798126000997, + "loss": 2.9302666187286377, + "step": 10786, + "token_acc": 0.3082778714427775 + }, + { + "epoch": 6.323072412782175, + "grad_norm": 0.18790672437481956, + "learning_rate": 0.000409261136045061, + "loss": 2.978792905807495, + "step": 10787, + "token_acc": 0.30181798695122114 + }, + { + "epoch": 6.3236587510993845, + "grad_norm": 0.2329379330505155, + "learning_rate": 0.0004092424579939785, + "loss": 2.9792137145996094, + "step": 10788, + "token_acc": 0.3018024509489955 + }, + { + "epoch": 6.324245089416594, + "grad_norm": 0.1906342970821725, + "learning_rate": 0.0004092237784470277, + "loss": 2.944913864135742, + "step": 10789, + "token_acc": 0.307067015681557 + }, + { + "epoch": 6.324831427733803, + "grad_norm": 0.24182390121189637, + "learning_rate": 0.0004092050974043841, + "loss": 2.964467763900757, + "step": 10790, + "token_acc": 0.3020452356201916 + }, + { + "epoch": 6.325417766051012, + "grad_norm": 0.18254085837019518, + "learning_rate": 0.0004091864148662231, + "loss": 2.9857306480407715, + "step": 10791, + "token_acc": 0.29925223483313756 + }, + { + "epoch": 6.32600410436822, + "grad_norm": 0.21948705277912348, + "learning_rate": 0.0004091677308327203, + "loss": 2.978860378265381, + "step": 10792, + "token_acc": 0.30232393868757507 + }, + { + "epoch": 6.326590442685429, + "grad_norm": 0.1741871495638311, + "learning_rate": 0.0004091490453040511, + "loss": 2.9424777030944824, + "step": 10793, + "token_acc": 0.3054735118897828 + }, + { + "epoch": 6.327176781002638, + "grad_norm": 0.20933469060022383, + "learning_rate": 0.00040913035828039104, + "loss": 2.9874415397644043, + "step": 10794, + "token_acc": 0.2989740478953476 + }, + { + "epoch": 6.327763119319847, + "grad_norm": 0.1822761139502352, + "learning_rate": 0.0004091116697619157, + "loss": 2.967315196990967, + "step": 10795, + "token_acc": 0.30227283604630756 + }, + { + "epoch": 6.3283494576370565, + "grad_norm": 0.17049090060171704, + "learning_rate": 0.00040909297974880065, + "loss": 2.94614839553833, + "step": 10796, + "token_acc": 0.3047003139790993 + }, + { + "epoch": 6.328935795954266, + "grad_norm": 0.17771488769610239, + "learning_rate": 0.0004090742882412215, + "loss": 2.950713634490967, + "step": 10797, + "token_acc": 0.3043460049269208 + }, + { + "epoch": 6.329522134271475, + "grad_norm": 0.17986301762646423, + "learning_rate": 0.00040905559523935365, + "loss": 2.962371826171875, + "step": 10798, + "token_acc": 0.3024503940871592 + }, + { + "epoch": 6.330108472588684, + "grad_norm": 0.19717726009794678, + "learning_rate": 0.0004090369007433729, + "loss": 2.9636647701263428, + "step": 10799, + "token_acc": 0.30354625597003937 + }, + { + "epoch": 6.330694810905893, + "grad_norm": 0.18564921448974336, + "learning_rate": 0.00040901820475345473, + "loss": 2.923710346221924, + "step": 10800, + "token_acc": 0.3088022216855832 + }, + { + "epoch": 6.331281149223102, + "grad_norm": 0.17776704863023265, + "learning_rate": 0.00040899950726977486, + "loss": 2.9480600357055664, + "step": 10801, + "token_acc": 0.30531636190425787 + }, + { + "epoch": 6.331867487540311, + "grad_norm": 0.1734457013896376, + "learning_rate": 0.00040898080829250886, + "loss": 2.974599838256836, + "step": 10802, + "token_acc": 0.30180354435673584 + }, + { + "epoch": 6.33245382585752, + "grad_norm": 0.15816398986472657, + "learning_rate": 0.0004089621078218323, + "loss": 2.9576454162597656, + "step": 10803, + "token_acc": 0.303144851639038 + }, + { + "epoch": 6.3330401641747285, + "grad_norm": 0.15317147080163063, + "learning_rate": 0.000408943405857921, + "loss": 2.9852488040924072, + "step": 10804, + "token_acc": 0.2987735754478596 + }, + { + "epoch": 6.333626502491938, + "grad_norm": 0.15577659438689084, + "learning_rate": 0.00040892470240095057, + "loss": 2.9559624195098877, + "step": 10805, + "token_acc": 0.30403510183111787 + }, + { + "epoch": 6.334212840809147, + "grad_norm": 0.15648671109347695, + "learning_rate": 0.0004089059974510968, + "loss": 2.9770126342773438, + "step": 10806, + "token_acc": 0.30069942778810504 + }, + { + "epoch": 6.334799179126356, + "grad_norm": 0.18132125651048198, + "learning_rate": 0.0004088872910085352, + "loss": 2.956864595413208, + "step": 10807, + "token_acc": 0.30368590977451326 + }, + { + "epoch": 6.335385517443565, + "grad_norm": 0.19737683119097826, + "learning_rate": 0.00040886858307344164, + "loss": 2.9990720748901367, + "step": 10808, + "token_acc": 0.29925698180886495 + }, + { + "epoch": 6.335971855760774, + "grad_norm": 0.1546178639370173, + "learning_rate": 0.0004088498736459918, + "loss": 2.935983180999756, + "step": 10809, + "token_acc": 0.3057188953886586 + }, + { + "epoch": 6.336558194077983, + "grad_norm": 0.20306841301271356, + "learning_rate": 0.0004088311627263615, + "loss": 2.9671216011047363, + "step": 10810, + "token_acc": 0.3014625050085788 + }, + { + "epoch": 6.337144532395192, + "grad_norm": 0.14823643827571703, + "learning_rate": 0.00040881245031472647, + "loss": 2.935427665710449, + "step": 10811, + "token_acc": 0.3076261662702872 + }, + { + "epoch": 6.337730870712401, + "grad_norm": 0.18450799358697834, + "learning_rate": 0.00040879373641126244, + "loss": 2.957122325897217, + "step": 10812, + "token_acc": 0.30556591827405566 + }, + { + "epoch": 6.3383172090296105, + "grad_norm": 0.19014421271930487, + "learning_rate": 0.0004087750210161452, + "loss": 2.9455313682556152, + "step": 10813, + "token_acc": 0.3043840184798358 + }, + { + "epoch": 6.338903547346819, + "grad_norm": 0.1923612830543601, + "learning_rate": 0.0004087563041295506, + "loss": 2.9569859504699707, + "step": 10814, + "token_acc": 0.30666536869231786 + }, + { + "epoch": 6.339489885664028, + "grad_norm": 0.28448137683699515, + "learning_rate": 0.0004087375857516545, + "loss": 2.96341609954834, + "step": 10815, + "token_acc": 0.3042254049391051 + }, + { + "epoch": 6.340076223981237, + "grad_norm": 0.18397853657627436, + "learning_rate": 0.0004087188658826326, + "loss": 2.9583353996276855, + "step": 10816, + "token_acc": 0.30391012862691713 + }, + { + "epoch": 6.340662562298446, + "grad_norm": 0.2942493319439405, + "learning_rate": 0.0004087001445226609, + "loss": 2.974437952041626, + "step": 10817, + "token_acc": 0.3010475423045931 + }, + { + "epoch": 6.341248900615655, + "grad_norm": 0.17362676824394588, + "learning_rate": 0.00040868142167191525, + "loss": 2.936406373977661, + "step": 10818, + "token_acc": 0.3073456145232333 + }, + { + "epoch": 6.341835238932864, + "grad_norm": 0.2838777418632287, + "learning_rate": 0.0004086626973305714, + "loss": 2.9297938346862793, + "step": 10819, + "token_acc": 0.30684104358262054 + }, + { + "epoch": 6.342421577250073, + "grad_norm": 0.190440661248757, + "learning_rate": 0.00040864397149880527, + "loss": 3.0005035400390625, + "step": 10820, + "token_acc": 0.29801646126111464 + }, + { + "epoch": 6.3430079155672825, + "grad_norm": 0.2651197462610228, + "learning_rate": 0.0004086252441767928, + "loss": 2.974175214767456, + "step": 10821, + "token_acc": 0.30242416790897586 + }, + { + "epoch": 6.343594253884492, + "grad_norm": 0.2036625697360712, + "learning_rate": 0.00040860651536471, + "loss": 2.9158129692077637, + "step": 10822, + "token_acc": 0.3097171734847822 + }, + { + "epoch": 6.344180592201701, + "grad_norm": 0.21847400739862124, + "learning_rate": 0.0004085877850627326, + "loss": 2.9157142639160156, + "step": 10823, + "token_acc": 0.3093898545219346 + }, + { + "epoch": 6.34476693051891, + "grad_norm": 0.16971870620343465, + "learning_rate": 0.0004085690532710368, + "loss": 2.9479660987854004, + "step": 10824, + "token_acc": 0.30397138445920224 + }, + { + "epoch": 6.345353268836118, + "grad_norm": 0.23650637259409174, + "learning_rate": 0.0004085503199897983, + "loss": 2.945699691772461, + "step": 10825, + "token_acc": 0.3046584130669763 + }, + { + "epoch": 6.345939607153327, + "grad_norm": 0.15974963529213068, + "learning_rate": 0.00040853158521919326, + "loss": 2.938174247741699, + "step": 10826, + "token_acc": 0.3070989762851862 + }, + { + "epoch": 6.346525945470536, + "grad_norm": 0.23098417142956673, + "learning_rate": 0.00040851284895939755, + "loss": 2.9526498317718506, + "step": 10827, + "token_acc": 0.30498470783924453 + }, + { + "epoch": 6.347112283787745, + "grad_norm": 0.1847132406253998, + "learning_rate": 0.00040849411121058726, + "loss": 2.9610795974731445, + "step": 10828, + "token_acc": 0.303367884830593 + }, + { + "epoch": 6.3476986221049545, + "grad_norm": 0.20438673346664957, + "learning_rate": 0.00040847537197293834, + "loss": 2.970602035522461, + "step": 10829, + "token_acc": 0.30247527423818443 + }, + { + "epoch": 6.348284960422164, + "grad_norm": 0.1735034025965014, + "learning_rate": 0.0004084566312466269, + "loss": 2.9392409324645996, + "step": 10830, + "token_acc": 0.30602609500111655 + }, + { + "epoch": 6.348871298739373, + "grad_norm": 0.169675159898126, + "learning_rate": 0.0004084378890318289, + "loss": 2.9660706520080566, + "step": 10831, + "token_acc": 0.30317148614480255 + }, + { + "epoch": 6.349457637056582, + "grad_norm": 0.17645184364705807, + "learning_rate": 0.0004084191453287204, + "loss": 2.962535858154297, + "step": 10832, + "token_acc": 0.30366989433908015 + }, + { + "epoch": 6.350043975373791, + "grad_norm": 0.1638831989949693, + "learning_rate": 0.0004084004001374775, + "loss": 2.9805829524993896, + "step": 10833, + "token_acc": 0.30172266802374537 + }, + { + "epoch": 6.350630313691, + "grad_norm": 0.1684463481529548, + "learning_rate": 0.00040838165345827635, + "loss": 2.986117362976074, + "step": 10834, + "token_acc": 0.2994642451875142 + }, + { + "epoch": 6.351216652008208, + "grad_norm": 0.1660768017385439, + "learning_rate": 0.00040836290529129294, + "loss": 2.9212846755981445, + "step": 10835, + "token_acc": 0.3096131433128182 + }, + { + "epoch": 6.351802990325417, + "grad_norm": 0.17403147724622514, + "learning_rate": 0.00040834415563670344, + "loss": 2.953073024749756, + "step": 10836, + "token_acc": 0.30453788885591526 + }, + { + "epoch": 6.3523893286426265, + "grad_norm": 0.16854161116773753, + "learning_rate": 0.000408325404494684, + "loss": 2.9603171348571777, + "step": 10837, + "token_acc": 0.30508944847669206 + }, + { + "epoch": 6.352975666959836, + "grad_norm": 0.16632996679032383, + "learning_rate": 0.0004083066518654107, + "loss": 2.959322690963745, + "step": 10838, + "token_acc": 0.30260860376959753 + }, + { + "epoch": 6.353562005277045, + "grad_norm": 0.15416111578109223, + "learning_rate": 0.00040828789774905973, + "loss": 2.9420764446258545, + "step": 10839, + "token_acc": 0.30680719345819935 + }, + { + "epoch": 6.354148343594254, + "grad_norm": 0.18643884886268097, + "learning_rate": 0.00040826914214580723, + "loss": 2.9215569496154785, + "step": 10840, + "token_acc": 0.30880543400169813 + }, + { + "epoch": 6.354734681911463, + "grad_norm": 0.15989325044506605, + "learning_rate": 0.00040825038505582943, + "loss": 2.9394874572753906, + "step": 10841, + "token_acc": 0.3070223503965393 + }, + { + "epoch": 6.355321020228672, + "grad_norm": 0.18561601282956042, + "learning_rate": 0.00040823162647930257, + "loss": 2.9898955821990967, + "step": 10842, + "token_acc": 0.2988686151775287 + }, + { + "epoch": 6.355907358545881, + "grad_norm": 0.15419911008065623, + "learning_rate": 0.00040821286641640273, + "loss": 2.936950445175171, + "step": 10843, + "token_acc": 0.30715273995982106 + }, + { + "epoch": 6.35649369686309, + "grad_norm": 0.15529703422324756, + "learning_rate": 0.00040819410486730626, + "loss": 2.9549407958984375, + "step": 10844, + "token_acc": 0.30446500412727096 + }, + { + "epoch": 6.357080035180299, + "grad_norm": 0.15018961002272643, + "learning_rate": 0.0004081753418321893, + "loss": 2.9974050521850586, + "step": 10845, + "token_acc": 0.2985105271501974 + }, + { + "epoch": 6.3576663734975085, + "grad_norm": 0.16635531336148643, + "learning_rate": 0.00040815657731122813, + "loss": 2.942469358444214, + "step": 10846, + "token_acc": 0.3047651272779611 + }, + { + "epoch": 6.358252711814717, + "grad_norm": 0.188006068034605, + "learning_rate": 0.00040813781130459906, + "loss": 2.989856004714966, + "step": 10847, + "token_acc": 0.2989411634028461 + }, + { + "epoch": 6.358839050131926, + "grad_norm": 0.14892789592477348, + "learning_rate": 0.0004081190438124783, + "loss": 2.961075782775879, + "step": 10848, + "token_acc": 0.30411342417198556 + }, + { + "epoch": 6.359425388449135, + "grad_norm": 0.18338071795135955, + "learning_rate": 0.00040810027483504226, + "loss": 2.936551570892334, + "step": 10849, + "token_acc": 0.3042010289289315 + }, + { + "epoch": 6.360011726766344, + "grad_norm": 0.17708582958768235, + "learning_rate": 0.00040808150437246716, + "loss": 2.945286273956299, + "step": 10850, + "token_acc": 0.30593272234466684 + }, + { + "epoch": 6.360598065083553, + "grad_norm": 0.16148340883805717, + "learning_rate": 0.00040806273242492935, + "loss": 2.996377468109131, + "step": 10851, + "token_acc": 0.30021760875117054 + }, + { + "epoch": 6.361184403400762, + "grad_norm": 0.19331688261037655, + "learning_rate": 0.00040804395899260513, + "loss": 2.9838149547576904, + "step": 10852, + "token_acc": 0.3001470534932972 + }, + { + "epoch": 6.361770741717971, + "grad_norm": 0.17344990666641738, + "learning_rate": 0.0004080251840756709, + "loss": 2.957798957824707, + "step": 10853, + "token_acc": 0.3030363135805016 + }, + { + "epoch": 6.3623570800351805, + "grad_norm": 0.1570562756866253, + "learning_rate": 0.000408006407674303, + "loss": 2.986405849456787, + "step": 10854, + "token_acc": 0.29971788671502897 + }, + { + "epoch": 6.36294341835239, + "grad_norm": 0.17383835423386193, + "learning_rate": 0.00040798762978867787, + "loss": 2.9317831993103027, + "step": 10855, + "token_acc": 0.30786995146273544 + }, + { + "epoch": 6.363529756669599, + "grad_norm": 0.15056905926299197, + "learning_rate": 0.0004079688504189718, + "loss": 2.944356918334961, + "step": 10856, + "token_acc": 0.3055736354483418 + }, + { + "epoch": 6.364116094986807, + "grad_norm": 0.18679124920399348, + "learning_rate": 0.00040795006956536125, + "loss": 2.9912161827087402, + "step": 10857, + "token_acc": 0.29985641122564965 + }, + { + "epoch": 6.364702433304016, + "grad_norm": 0.21837390433273712, + "learning_rate": 0.00040793128722802267, + "loss": 2.990884304046631, + "step": 10858, + "token_acc": 0.2994574378423936 + }, + { + "epoch": 6.365288771621225, + "grad_norm": 0.16203994532035224, + "learning_rate": 0.00040791250340713235, + "loss": 2.9752655029296875, + "step": 10859, + "token_acc": 0.30146775015310506 + }, + { + "epoch": 6.365875109938434, + "grad_norm": 0.2068809153540409, + "learning_rate": 0.000407893718102867, + "loss": 2.933103322982788, + "step": 10860, + "token_acc": 0.30650429881073576 + }, + { + "epoch": 6.366461448255643, + "grad_norm": 0.32050532498582485, + "learning_rate": 0.0004078749313154028, + "loss": 2.973238945007324, + "step": 10861, + "token_acc": 0.3005563483028972 + }, + { + "epoch": 6.3670477865728525, + "grad_norm": 0.19336847793540174, + "learning_rate": 0.00040785614304491647, + "loss": 2.95566725730896, + "step": 10862, + "token_acc": 0.3036985276545339 + }, + { + "epoch": 6.367634124890062, + "grad_norm": 0.2630405707313785, + "learning_rate": 0.00040783735329158433, + "loss": 2.953232765197754, + "step": 10863, + "token_acc": 0.3042019102513968 + }, + { + "epoch": 6.368220463207271, + "grad_norm": 0.2316992143845289, + "learning_rate": 0.0004078185620555829, + "loss": 2.9680933952331543, + "step": 10864, + "token_acc": 0.30257090709883083 + }, + { + "epoch": 6.36880680152448, + "grad_norm": 0.1831685546451254, + "learning_rate": 0.00040779976933708886, + "loss": 2.942295551300049, + "step": 10865, + "token_acc": 0.3059243500330582 + }, + { + "epoch": 6.369393139841689, + "grad_norm": 0.1840300298439578, + "learning_rate": 0.00040778097513627853, + "loss": 2.9506733417510986, + "step": 10866, + "token_acc": 0.3060844126736971 + }, + { + "epoch": 6.369979478158898, + "grad_norm": 0.19895509226032182, + "learning_rate": 0.0004077621794533286, + "loss": 2.9362006187438965, + "step": 10867, + "token_acc": 0.30948858946736557 + }, + { + "epoch": 6.370565816476106, + "grad_norm": 0.17036374163993093, + "learning_rate": 0.00040774338228841557, + "loss": 2.987150192260742, + "step": 10868, + "token_acc": 0.29975110974264235 + }, + { + "epoch": 6.371152154793315, + "grad_norm": 0.19105024764681075, + "learning_rate": 0.000407724583641716, + "loss": 3.0011301040649414, + "step": 10869, + "token_acc": 0.29667671881640995 + }, + { + "epoch": 6.3717384931105245, + "grad_norm": 0.16941502393887453, + "learning_rate": 0.0004077057835134065, + "loss": 2.9660325050354004, + "step": 10870, + "token_acc": 0.3042255296685287 + }, + { + "epoch": 6.372324831427734, + "grad_norm": 0.20297351491449264, + "learning_rate": 0.0004076869819036637, + "loss": 2.9667439460754395, + "step": 10871, + "token_acc": 0.3017885362209658 + }, + { + "epoch": 6.372911169744943, + "grad_norm": 0.15621510987112286, + "learning_rate": 0.00040766817881266425, + "loss": 2.9813055992126465, + "step": 10872, + "token_acc": 0.3009944215377153 + }, + { + "epoch": 6.373497508062152, + "grad_norm": 0.1879396645837518, + "learning_rate": 0.00040764937424058465, + "loss": 2.9365572929382324, + "step": 10873, + "token_acc": 0.3072442572294036 + }, + { + "epoch": 6.374083846379361, + "grad_norm": 0.15944379709442696, + "learning_rate": 0.0004076305681876016, + "loss": 2.928072214126587, + "step": 10874, + "token_acc": 0.3082971599295034 + }, + { + "epoch": 6.37467018469657, + "grad_norm": 0.16772170958770188, + "learning_rate": 0.0004076117606538918, + "loss": 2.955548048019409, + "step": 10875, + "token_acc": 0.30468142036308593 + }, + { + "epoch": 6.375256523013779, + "grad_norm": 0.17314208271890746, + "learning_rate": 0.000407592951639632, + "loss": 2.9402618408203125, + "step": 10876, + "token_acc": 0.3062215143540105 + }, + { + "epoch": 6.375842861330988, + "grad_norm": 0.1467150302752035, + "learning_rate": 0.0004075741411449986, + "loss": 2.9789655208587646, + "step": 10877, + "token_acc": 0.2999671293838024 + }, + { + "epoch": 6.3764291996481965, + "grad_norm": 0.1695742513699787, + "learning_rate": 0.0004075553291701685, + "loss": 2.918882131576538, + "step": 10878, + "token_acc": 0.31006785437969175 + }, + { + "epoch": 6.377015537965406, + "grad_norm": 0.15877419773961907, + "learning_rate": 0.00040753651571531844, + "loss": 2.98256254196167, + "step": 10879, + "token_acc": 0.30085511849499147 + }, + { + "epoch": 6.377601876282615, + "grad_norm": 0.15984131095840176, + "learning_rate": 0.00040751770078062513, + "loss": 2.9471163749694824, + "step": 10880, + "token_acc": 0.304554722982506 + }, + { + "epoch": 6.378188214599824, + "grad_norm": 0.16875222079120633, + "learning_rate": 0.00040749888436626523, + "loss": 2.935910224914551, + "step": 10881, + "token_acc": 0.30772079292530397 + }, + { + "epoch": 6.378774552917033, + "grad_norm": 0.16437580296060417, + "learning_rate": 0.0004074800664724155, + "loss": 2.967566728591919, + "step": 10882, + "token_acc": 0.3023087170028455 + }, + { + "epoch": 6.379360891234242, + "grad_norm": 0.1684293100389546, + "learning_rate": 0.00040746124709925286, + "loss": 2.9030261039733887, + "step": 10883, + "token_acc": 0.31215960790906916 + }, + { + "epoch": 6.379947229551451, + "grad_norm": 0.16529405148219053, + "learning_rate": 0.00040744242624695396, + "loss": 2.946683883666992, + "step": 10884, + "token_acc": 0.3055111758736437 + }, + { + "epoch": 6.38053356786866, + "grad_norm": 0.1709058387925002, + "learning_rate": 0.0004074236039156956, + "loss": 2.970705509185791, + "step": 10885, + "token_acc": 0.3017043384600763 + }, + { + "epoch": 6.381119906185869, + "grad_norm": 0.17452504406676456, + "learning_rate": 0.00040740478010565465, + "loss": 2.9911231994628906, + "step": 10886, + "token_acc": 0.2963657300666984 + }, + { + "epoch": 6.3817062445030786, + "grad_norm": 0.2104857927652964, + "learning_rate": 0.0004073859548170078, + "loss": 2.989306926727295, + "step": 10887, + "token_acc": 0.30027754469760537 + }, + { + "epoch": 6.382292582820288, + "grad_norm": 0.1814747431436101, + "learning_rate": 0.00040736712804993204, + "loss": 2.985593795776367, + "step": 10888, + "token_acc": 0.299658376153076 + }, + { + "epoch": 6.382878921137497, + "grad_norm": 0.14911102478475433, + "learning_rate": 0.00040734829980460413, + "loss": 2.991454601287842, + "step": 10889, + "token_acc": 0.2997452912567272 + }, + { + "epoch": 6.383465259454705, + "grad_norm": 0.20297154006380874, + "learning_rate": 0.000407329470081201, + "loss": 2.9836232662200928, + "step": 10890, + "token_acc": 0.29891252453450745 + }, + { + "epoch": 6.384051597771914, + "grad_norm": 0.21397312152753128, + "learning_rate": 0.00040731063887989953, + "loss": 2.9542083740234375, + "step": 10891, + "token_acc": 0.30463627606351573 + }, + { + "epoch": 6.384637936089123, + "grad_norm": 0.16555912180259255, + "learning_rate": 0.00040729180620087657, + "loss": 2.9222214221954346, + "step": 10892, + "token_acc": 0.3094107805857626 + }, + { + "epoch": 6.385224274406332, + "grad_norm": 0.16306872109829346, + "learning_rate": 0.000407272972044309, + "loss": 2.9782533645629883, + "step": 10893, + "token_acc": 0.3021652426164514 + }, + { + "epoch": 6.3858106127235414, + "grad_norm": 0.15796602484119113, + "learning_rate": 0.0004072541364103739, + "loss": 2.9529407024383545, + "step": 10894, + "token_acc": 0.30396469929966086 + }, + { + "epoch": 6.386396951040751, + "grad_norm": 0.154041998245844, + "learning_rate": 0.000407235299299248, + "loss": 2.9794280529022217, + "step": 10895, + "token_acc": 0.3015317582744716 + }, + { + "epoch": 6.38698328935796, + "grad_norm": 0.17211627880276303, + "learning_rate": 0.0004072164607111084, + "loss": 2.926893711090088, + "step": 10896, + "token_acc": 0.30857590855061834 + }, + { + "epoch": 6.387569627675169, + "grad_norm": 0.22738903255385115, + "learning_rate": 0.0004071976206461319, + "loss": 2.9317312240600586, + "step": 10897, + "token_acc": 0.307181005018549 + }, + { + "epoch": 6.388155965992378, + "grad_norm": 0.23456960120684114, + "learning_rate": 0.0004071787791044956, + "loss": 2.9224226474761963, + "step": 10898, + "token_acc": 0.30805666888141114 + }, + { + "epoch": 6.388742304309587, + "grad_norm": 0.17534350646128327, + "learning_rate": 0.0004071599360863766, + "loss": 3.009730339050293, + "step": 10899, + "token_acc": 0.29538303588758696 + }, + { + "epoch": 6.389328642626795, + "grad_norm": 0.18890614746666187, + "learning_rate": 0.00040714109159195166, + "loss": 2.9382848739624023, + "step": 10900, + "token_acc": 0.30688043808013626 + }, + { + "epoch": 6.389914980944004, + "grad_norm": 0.19766894672509425, + "learning_rate": 0.000407122245621398, + "loss": 2.97365665435791, + "step": 10901, + "token_acc": 0.30228476812770083 + }, + { + "epoch": 6.3905013192612135, + "grad_norm": 0.16379061696352967, + "learning_rate": 0.0004071033981748925, + "loss": 2.9871082305908203, + "step": 10902, + "token_acc": 0.30101765436433287 + }, + { + "epoch": 6.391087657578423, + "grad_norm": 0.22988086638369795, + "learning_rate": 0.0004070845492526123, + "loss": 2.967031717300415, + "step": 10903, + "token_acc": 0.30156521649009077 + }, + { + "epoch": 6.391673995895632, + "grad_norm": 0.17542942578296394, + "learning_rate": 0.0004070656988547344, + "loss": 2.9390015602111816, + "step": 10904, + "token_acc": 0.30510544447077265 + }, + { + "epoch": 6.392260334212841, + "grad_norm": 0.20382663067529233, + "learning_rate": 0.0004070468469814359, + "loss": 2.949906826019287, + "step": 10905, + "token_acc": 0.30340397330257945 + }, + { + "epoch": 6.39284667253005, + "grad_norm": 0.2383607438058896, + "learning_rate": 0.000407027993632894, + "loss": 2.9648690223693848, + "step": 10906, + "token_acc": 0.30386580226367604 + }, + { + "epoch": 6.393433010847259, + "grad_norm": 0.16733988996217825, + "learning_rate": 0.0004070091388092856, + "loss": 2.972419023513794, + "step": 10907, + "token_acc": 0.30139712408051705 + }, + { + "epoch": 6.394019349164468, + "grad_norm": 0.18227419031116351, + "learning_rate": 0.0004069902825107879, + "loss": 2.986868381500244, + "step": 10908, + "token_acc": 0.29885926462111767 + }, + { + "epoch": 6.394605687481677, + "grad_norm": 0.15174184916484817, + "learning_rate": 0.00040697142473757807, + "loss": 2.949087619781494, + "step": 10909, + "token_acc": 0.3040002074661895 + }, + { + "epoch": 6.395192025798886, + "grad_norm": 0.1778677908854828, + "learning_rate": 0.00040695256548983327, + "loss": 2.983928918838501, + "step": 10910, + "token_acc": 0.3002261029559676 + }, + { + "epoch": 6.395778364116095, + "grad_norm": 0.1623496587079988, + "learning_rate": 0.0004069337047677306, + "loss": 2.9465432167053223, + "step": 10911, + "token_acc": 0.3056307152530186 + }, + { + "epoch": 6.396364702433304, + "grad_norm": 0.17169051533787727, + "learning_rate": 0.00040691484257144716, + "loss": 2.9196062088012695, + "step": 10912, + "token_acc": 0.30980072039160855 + }, + { + "epoch": 6.396951040750513, + "grad_norm": 0.1568976140199779, + "learning_rate": 0.0004068959789011603, + "loss": 2.939314603805542, + "step": 10913, + "token_acc": 0.3070295960481577 + }, + { + "epoch": 6.397537379067722, + "grad_norm": 0.1821329309509476, + "learning_rate": 0.00040687711375704717, + "loss": 2.9934403896331787, + "step": 10914, + "token_acc": 0.2984685861494953 + }, + { + "epoch": 6.398123717384931, + "grad_norm": 0.14839242106965686, + "learning_rate": 0.00040685824713928485, + "loss": 2.9265429973602295, + "step": 10915, + "token_acc": 0.30866645958280564 + }, + { + "epoch": 6.39871005570214, + "grad_norm": 0.229685389060179, + "learning_rate": 0.00040683937904805074, + "loss": 2.941117763519287, + "step": 10916, + "token_acc": 0.30591567124551455 + }, + { + "epoch": 6.399296394019349, + "grad_norm": 0.16653251154956047, + "learning_rate": 0.000406820509483522, + "loss": 3.009489059448242, + "step": 10917, + "token_acc": 0.2982285655912683 + }, + { + "epoch": 6.399882732336558, + "grad_norm": 0.19949653974624404, + "learning_rate": 0.00040680163844587585, + "loss": 2.9745571613311768, + "step": 10918, + "token_acc": 0.3027926449373837 + }, + { + "epoch": 6.4004690706537675, + "grad_norm": 0.16132393619535174, + "learning_rate": 0.00040678276593528965, + "loss": 2.9241137504577637, + "step": 10919, + "token_acc": 0.3064816244494401 + }, + { + "epoch": 6.401055408970977, + "grad_norm": 0.1741910853088498, + "learning_rate": 0.0004067638919519406, + "loss": 3.0008418560028076, + "step": 10920, + "token_acc": 0.2981756309670101 + }, + { + "epoch": 6.401641747288186, + "grad_norm": 0.18905464535413485, + "learning_rate": 0.00040674501649600603, + "loss": 2.9460060596466064, + "step": 10921, + "token_acc": 0.3045412395679306 + }, + { + "epoch": 6.402228085605394, + "grad_norm": 0.1454600385789069, + "learning_rate": 0.00040672613956766325, + "loss": 2.9477379322052, + "step": 10922, + "token_acc": 0.308213197617134 + }, + { + "epoch": 6.402814423922603, + "grad_norm": 0.19005003823953123, + "learning_rate": 0.00040670726116708955, + "loss": 3.013434410095215, + "step": 10923, + "token_acc": 0.296986624989039 + }, + { + "epoch": 6.403400762239812, + "grad_norm": 0.15423088570831267, + "learning_rate": 0.0004066883812944624, + "loss": 2.9453978538513184, + "step": 10924, + "token_acc": 0.30603970572325 + }, + { + "epoch": 6.403987100557021, + "grad_norm": 0.21662421102239449, + "learning_rate": 0.00040666949994995895, + "loss": 2.9981839656829834, + "step": 10925, + "token_acc": 0.29735401889090424 + }, + { + "epoch": 6.40457343887423, + "grad_norm": 0.16439498280380213, + "learning_rate": 0.00040665061713375674, + "loss": 2.9333629608154297, + "step": 10926, + "token_acc": 0.3065291311408851 + }, + { + "epoch": 6.4051597771914395, + "grad_norm": 0.21215230691443124, + "learning_rate": 0.00040663173284603295, + "loss": 2.915843963623047, + "step": 10927, + "token_acc": 0.30945634850363024 + }, + { + "epoch": 6.405746115508649, + "grad_norm": 0.21730398422606226, + "learning_rate": 0.00040661284708696523, + "loss": 2.9760735034942627, + "step": 10928, + "token_acc": 0.30106388609537926 + }, + { + "epoch": 6.406332453825858, + "grad_norm": 0.15927186760527645, + "learning_rate": 0.00040659395985673073, + "loss": 2.921783208847046, + "step": 10929, + "token_acc": 0.30797585846372505 + }, + { + "epoch": 6.406918792143067, + "grad_norm": 0.1981744615992735, + "learning_rate": 0.00040657507115550705, + "loss": 2.961003541946411, + "step": 10930, + "token_acc": 0.30425831871851733 + }, + { + "epoch": 6.407505130460276, + "grad_norm": 0.16960941166030244, + "learning_rate": 0.00040655618098347157, + "loss": 3.0157952308654785, + "step": 10931, + "token_acc": 0.2977501045637128 + }, + { + "epoch": 6.408091468777485, + "grad_norm": 0.18251406087814726, + "learning_rate": 0.0004065372893408017, + "loss": 2.9727673530578613, + "step": 10932, + "token_acc": 0.30135214700581325 + }, + { + "epoch": 6.408677807094693, + "grad_norm": 0.16345760781767105, + "learning_rate": 0.00040651839622767494, + "loss": 2.9810328483581543, + "step": 10933, + "token_acc": 0.3022723214632291 + }, + { + "epoch": 6.409264145411902, + "grad_norm": 0.17920946174833166, + "learning_rate": 0.0004064995016442689, + "loss": 2.9735050201416016, + "step": 10934, + "token_acc": 0.30085968424959836 + }, + { + "epoch": 6.4098504837291115, + "grad_norm": 0.16417655011168159, + "learning_rate": 0.0004064806055907607, + "loss": 2.9547266960144043, + "step": 10935, + "token_acc": 0.30357299180034325 + }, + { + "epoch": 6.410436822046321, + "grad_norm": 0.17369072768422933, + "learning_rate": 0.0004064617080673282, + "loss": 2.9730939865112305, + "step": 10936, + "token_acc": 0.30244193357746374 + }, + { + "epoch": 6.41102316036353, + "grad_norm": 0.18921432437639116, + "learning_rate": 0.0004064428090741488, + "loss": 2.968780040740967, + "step": 10937, + "token_acc": 0.3027877837897582 + }, + { + "epoch": 6.411609498680739, + "grad_norm": 0.15011849277658057, + "learning_rate": 0.00040642390861139987, + "loss": 2.980011463165283, + "step": 10938, + "token_acc": 0.30059114654935387 + }, + { + "epoch": 6.412195836997948, + "grad_norm": 0.2621792390565829, + "learning_rate": 0.0004064050066792593, + "loss": 2.9537606239318848, + "step": 10939, + "token_acc": 0.3025850034580706 + }, + { + "epoch": 6.412782175315157, + "grad_norm": 0.2647163741710445, + "learning_rate": 0.00040638610327790436, + "loss": 2.95698881149292, + "step": 10940, + "token_acc": 0.3047461680387908 + }, + { + "epoch": 6.413368513632366, + "grad_norm": 0.1707620018382443, + "learning_rate": 0.0004063671984075127, + "loss": 2.993807077407837, + "step": 10941, + "token_acc": 0.2982854820728209 + }, + { + "epoch": 6.413954851949575, + "grad_norm": 0.27226692079271986, + "learning_rate": 0.0004063482920682619, + "loss": 2.974234104156494, + "step": 10942, + "token_acc": 0.3001864763043879 + }, + { + "epoch": 6.4145411902667835, + "grad_norm": 0.1744053772779599, + "learning_rate": 0.0004063293842603296, + "loss": 2.9970741271972656, + "step": 10943, + "token_acc": 0.29814119610689666 + }, + { + "epoch": 6.415127528583993, + "grad_norm": 0.21833086762629847, + "learning_rate": 0.0004063104749838935, + "loss": 2.954303741455078, + "step": 10944, + "token_acc": 0.30532259613683355 + }, + { + "epoch": 6.415713866901202, + "grad_norm": 0.17125161360615437, + "learning_rate": 0.0004062915642391309, + "loss": 2.9773969650268555, + "step": 10945, + "token_acc": 0.29978872935799594 + }, + { + "epoch": 6.416300205218411, + "grad_norm": 0.27661298536844026, + "learning_rate": 0.0004062726520262198, + "loss": 2.935421943664551, + "step": 10946, + "token_acc": 0.30808216791523463 + }, + { + "epoch": 6.41688654353562, + "grad_norm": 0.2658161484675333, + "learning_rate": 0.00040625373834533775, + "loss": 2.986954927444458, + "step": 10947, + "token_acc": 0.2984972081949322 + }, + { + "epoch": 6.417472881852829, + "grad_norm": 0.22428112038665238, + "learning_rate": 0.00040623482319666226, + "loss": 2.9350357055664062, + "step": 10948, + "token_acc": 0.30672836561873346 + }, + { + "epoch": 6.418059220170038, + "grad_norm": 0.1966022432563209, + "learning_rate": 0.00040621590658037124, + "loss": 2.8928277492523193, + "step": 10949, + "token_acc": 0.31273036204417726 + }, + { + "epoch": 6.418645558487247, + "grad_norm": 0.1971832014305048, + "learning_rate": 0.0004061969884966423, + "loss": 2.9627113342285156, + "step": 10950, + "token_acc": 0.30377342739074176 + }, + { + "epoch": 6.419231896804456, + "grad_norm": 0.16534826267074784, + "learning_rate": 0.000406178068945653, + "loss": 2.9686803817749023, + "step": 10951, + "token_acc": 0.3034993518742952 + }, + { + "epoch": 6.4198182351216655, + "grad_norm": 0.20937830104149904, + "learning_rate": 0.0004061591479275813, + "loss": 3.0157692432403564, + "step": 10952, + "token_acc": 0.29358285395787104 + }, + { + "epoch": 6.420404573438875, + "grad_norm": 0.14558590870342775, + "learning_rate": 0.0004061402254426048, + "loss": 2.9498801231384277, + "step": 10953, + "token_acc": 0.3057905756159119 + }, + { + "epoch": 6.420990911756084, + "grad_norm": 0.1906831222027915, + "learning_rate": 0.0004061213014909013, + "loss": 2.956395149230957, + "step": 10954, + "token_acc": 0.3042340152376142 + }, + { + "epoch": 6.421577250073292, + "grad_norm": 0.1587880618742563, + "learning_rate": 0.00040610237607264854, + "loss": 2.9648256301879883, + "step": 10955, + "token_acc": 0.3036884102740565 + }, + { + "epoch": 6.422163588390501, + "grad_norm": 0.17660925807762717, + "learning_rate": 0.00040608344918802424, + "loss": 2.917018175125122, + "step": 10956, + "token_acc": 0.3128428321729955 + }, + { + "epoch": 6.42274992670771, + "grad_norm": 0.1810069180786094, + "learning_rate": 0.00040606452083720635, + "loss": 2.98777437210083, + "step": 10957, + "token_acc": 0.2980536396859169 + }, + { + "epoch": 6.423336265024919, + "grad_norm": 0.20177902944406664, + "learning_rate": 0.0004060455910203725, + "loss": 2.946408271789551, + "step": 10958, + "token_acc": 0.30679166245820333 + }, + { + "epoch": 6.423922603342128, + "grad_norm": 0.16319900841920762, + "learning_rate": 0.0004060266597377007, + "loss": 2.963646411895752, + "step": 10959, + "token_acc": 0.30293928564921696 + }, + { + "epoch": 6.4245089416593375, + "grad_norm": 0.1788792863041922, + "learning_rate": 0.00040600772698936867, + "loss": 2.8948006629943848, + "step": 10960, + "token_acc": 0.31280953126772126 + }, + { + "epoch": 6.425095279976547, + "grad_norm": 0.1682483790250407, + "learning_rate": 0.0004059887927755542, + "loss": 2.96028995513916, + "step": 10961, + "token_acc": 0.30412131313492813 + }, + { + "epoch": 6.425681618293756, + "grad_norm": 0.19236229201662874, + "learning_rate": 0.0004059698570964352, + "loss": 2.9720747470855713, + "step": 10962, + "token_acc": 0.30084902747761655 + }, + { + "epoch": 6.426267956610965, + "grad_norm": 0.16062061909247952, + "learning_rate": 0.0004059509199521897, + "loss": 2.956996202468872, + "step": 10963, + "token_acc": 0.30439489817414794 + }, + { + "epoch": 6.426854294928174, + "grad_norm": 0.21864820679158495, + "learning_rate": 0.00040593198134299536, + "loss": 2.997164487838745, + "step": 10964, + "token_acc": 0.2979842758561754 + }, + { + "epoch": 6.427440633245382, + "grad_norm": 0.1560265522791331, + "learning_rate": 0.0004059130412690302, + "loss": 2.9648303985595703, + "step": 10965, + "token_acc": 0.30293301821927443 + }, + { + "epoch": 6.428026971562591, + "grad_norm": 0.21607294998910473, + "learning_rate": 0.0004058940997304721, + "loss": 3.006887435913086, + "step": 10966, + "token_acc": 0.2975354331899703 + }, + { + "epoch": 6.4286133098798, + "grad_norm": 0.17064828886511826, + "learning_rate": 0.00040587515672749897, + "loss": 2.965193271636963, + "step": 10967, + "token_acc": 0.3021501448216476 + }, + { + "epoch": 6.4291996481970095, + "grad_norm": 0.18106276263927537, + "learning_rate": 0.0004058562122602888, + "loss": 2.9416446685791016, + "step": 10968, + "token_acc": 0.3056566878079531 + }, + { + "epoch": 6.429785986514219, + "grad_norm": 0.16088293066732756, + "learning_rate": 0.00040583726632901964, + "loss": 2.978781223297119, + "step": 10969, + "token_acc": 0.30182337932969105 + }, + { + "epoch": 6.430372324831428, + "grad_norm": 0.2023570348559319, + "learning_rate": 0.00040581831893386923, + "loss": 2.9697704315185547, + "step": 10970, + "token_acc": 0.30196639181165114 + }, + { + "epoch": 6.430958663148637, + "grad_norm": 0.15160292167210002, + "learning_rate": 0.00040579937007501564, + "loss": 2.932023048400879, + "step": 10971, + "token_acc": 0.307907106622609 + }, + { + "epoch": 6.431545001465846, + "grad_norm": 0.18202856830457265, + "learning_rate": 0.000405780419752637, + "loss": 2.970111846923828, + "step": 10972, + "token_acc": 0.30331300304295705 + }, + { + "epoch": 6.432131339783055, + "grad_norm": 0.18304136976814175, + "learning_rate": 0.0004057614679669113, + "loss": 2.9986703395843506, + "step": 10973, + "token_acc": 0.29703468887775425 + }, + { + "epoch": 6.432717678100264, + "grad_norm": 0.16373862883180998, + "learning_rate": 0.00040574251471801637, + "loss": 2.960819721221924, + "step": 10974, + "token_acc": 0.3024405980718369 + }, + { + "epoch": 6.433304016417473, + "grad_norm": 0.17067745589540356, + "learning_rate": 0.0004057235600061305, + "loss": 2.9610447883605957, + "step": 10975, + "token_acc": 0.3004389404743231 + }, + { + "epoch": 6.4338903547346815, + "grad_norm": 0.16462512542468266, + "learning_rate": 0.0004057046038314315, + "loss": 2.9647059440612793, + "step": 10976, + "token_acc": 0.30300859789592527 + }, + { + "epoch": 6.434476693051891, + "grad_norm": 0.18076375674709722, + "learning_rate": 0.00040568564619409766, + "loss": 2.960850477218628, + "step": 10977, + "token_acc": 0.3028802663483618 + }, + { + "epoch": 6.4350630313691, + "grad_norm": 0.18544645809887664, + "learning_rate": 0.00040566668709430685, + "loss": 2.9548404216766357, + "step": 10978, + "token_acc": 0.3043620806844988 + }, + { + "epoch": 6.435649369686309, + "grad_norm": 0.155322214427238, + "learning_rate": 0.0004056477265322374, + "loss": 2.966245174407959, + "step": 10979, + "token_acc": 0.30023630675648477 + }, + { + "epoch": 6.436235708003518, + "grad_norm": 0.15957014881905904, + "learning_rate": 0.0004056287645080673, + "loss": 2.976929187774658, + "step": 10980, + "token_acc": 0.3001660496373539 + }, + { + "epoch": 6.436822046320727, + "grad_norm": 0.16763193404178797, + "learning_rate": 0.0004056098010219745, + "loss": 2.9870004653930664, + "step": 10981, + "token_acc": 0.29938123801097916 + }, + { + "epoch": 6.437408384637936, + "grad_norm": 0.14546282127208945, + "learning_rate": 0.0004055908360741375, + "loss": 2.9692420959472656, + "step": 10982, + "token_acc": 0.3008335111084022 + }, + { + "epoch": 6.437994722955145, + "grad_norm": 0.16861749167645715, + "learning_rate": 0.0004055718696647342, + "loss": 2.990567684173584, + "step": 10983, + "token_acc": 0.2990827229899327 + }, + { + "epoch": 6.438581061272354, + "grad_norm": 0.16006648482588084, + "learning_rate": 0.0004055529017939428, + "loss": 2.9518535137176514, + "step": 10984, + "token_acc": 0.3055904189385372 + }, + { + "epoch": 6.4391673995895635, + "grad_norm": 0.1805971278296849, + "learning_rate": 0.0004055339324619415, + "loss": 2.9713170528411865, + "step": 10985, + "token_acc": 0.30313592489027785 + }, + { + "epoch": 6.439753737906772, + "grad_norm": 0.16145350516167511, + "learning_rate": 0.00040551496166890845, + "loss": 2.9153270721435547, + "step": 10986, + "token_acc": 0.3091028676664036 + }, + { + "epoch": 6.440340076223981, + "grad_norm": 0.1779904315534651, + "learning_rate": 0.00040549598941502194, + "loss": 2.968998432159424, + "step": 10987, + "token_acc": 0.30381659171495284 + }, + { + "epoch": 6.44092641454119, + "grad_norm": 0.17565821801267537, + "learning_rate": 0.00040547701570046015, + "loss": 2.9445443153381348, + "step": 10988, + "token_acc": 0.30463646102083614 + }, + { + "epoch": 6.441512752858399, + "grad_norm": 0.16365530604727446, + "learning_rate": 0.00040545804052540127, + "loss": 2.9817442893981934, + "step": 10989, + "token_acc": 0.30108656919906834 + }, + { + "epoch": 6.442099091175608, + "grad_norm": 0.16998328359580273, + "learning_rate": 0.0004054390638900236, + "loss": 2.9653778076171875, + "step": 10990, + "token_acc": 0.3028326524448194 + }, + { + "epoch": 6.442685429492817, + "grad_norm": 0.2044879965170746, + "learning_rate": 0.0004054200857945054, + "loss": 2.9568662643432617, + "step": 10991, + "token_acc": 0.3029704795630699 + }, + { + "epoch": 6.443271767810026, + "grad_norm": 0.19857680427712052, + "learning_rate": 0.00040540110623902485, + "loss": 2.945683002471924, + "step": 10992, + "token_acc": 0.30467796823630966 + }, + { + "epoch": 6.4438581061272355, + "grad_norm": 0.15382924554057, + "learning_rate": 0.0004053821252237604, + "loss": 2.9848220348358154, + "step": 10993, + "token_acc": 0.3026869235380345 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.17035612422243762, + "learning_rate": 0.0004053631427488903, + "loss": 2.9737548828125, + "step": 10994, + "token_acc": 0.3026927213865237 + }, + { + "epoch": 6.445030782761654, + "grad_norm": 0.20585445726308768, + "learning_rate": 0.0004053441588145927, + "loss": 2.9946248531341553, + "step": 10995, + "token_acc": 0.2990310370529521 + }, + { + "epoch": 6.445617121078863, + "grad_norm": 0.18421816371910196, + "learning_rate": 0.00040532517342104613, + "loss": 2.9479610919952393, + "step": 10996, + "token_acc": 0.30516617053903944 + }, + { + "epoch": 6.446203459396072, + "grad_norm": 0.15079240314429534, + "learning_rate": 0.00040530618656842886, + "loss": 2.939673662185669, + "step": 10997, + "token_acc": 0.30784405015326405 + }, + { + "epoch": 6.44678979771328, + "grad_norm": 0.1569719461174747, + "learning_rate": 0.00040528719825691923, + "loss": 2.968632698059082, + "step": 10998, + "token_acc": 0.3027323560126373 + }, + { + "epoch": 6.447376136030489, + "grad_norm": 0.14987361625202347, + "learning_rate": 0.00040526820848669565, + "loss": 2.9488604068756104, + "step": 10999, + "token_acc": 0.30570862586939984 + }, + { + "epoch": 6.447962474347698, + "grad_norm": 0.16078618993224303, + "learning_rate": 0.0004052492172579364, + "loss": 3.022296190261841, + "step": 11000, + "token_acc": 0.2928704720814198 + }, + { + "epoch": 6.4485488126649075, + "grad_norm": 0.18774993923038064, + "learning_rate": 0.00040523022457082, + "loss": 2.967209577560425, + "step": 11001, + "token_acc": 0.30199318393060004 + }, + { + "epoch": 6.449135150982117, + "grad_norm": 0.1611828550020077, + "learning_rate": 0.0004052112304255249, + "loss": 2.9375970363616943, + "step": 11002, + "token_acc": 0.30710354544063406 + }, + { + "epoch": 6.449721489299326, + "grad_norm": 0.15888655781049046, + "learning_rate": 0.00040519223482222934, + "loss": 2.936610221862793, + "step": 11003, + "token_acc": 0.307128273744476 + }, + { + "epoch": 6.450307827616535, + "grad_norm": 0.15563442168753397, + "learning_rate": 0.0004051732377611119, + "loss": 2.955447196960449, + "step": 11004, + "token_acc": 0.30553366855654346 + }, + { + "epoch": 6.450894165933744, + "grad_norm": 0.16638699384737135, + "learning_rate": 0.00040515423924235094, + "loss": 2.969627857208252, + "step": 11005, + "token_acc": 0.3021407590302463 + }, + { + "epoch": 6.451480504250953, + "grad_norm": 0.16536871974523756, + "learning_rate": 0.000405135239266125, + "loss": 2.985635280609131, + "step": 11006, + "token_acc": 0.3007485271619776 + }, + { + "epoch": 6.452066842568162, + "grad_norm": 0.18204464321068242, + "learning_rate": 0.0004051162378326125, + "loss": 2.9771459102630615, + "step": 11007, + "token_acc": 0.29979941606065535 + }, + { + "epoch": 6.45265318088537, + "grad_norm": 0.18119099182986687, + "learning_rate": 0.00040509723494199206, + "loss": 2.979257345199585, + "step": 11008, + "token_acc": 0.3011885495160243 + }, + { + "epoch": 6.4532395192025795, + "grad_norm": 0.17647198192377894, + "learning_rate": 0.00040507823059444205, + "loss": 2.97053861618042, + "step": 11009, + "token_acc": 0.3006866181265058 + }, + { + "epoch": 6.453825857519789, + "grad_norm": 0.17572981843950644, + "learning_rate": 0.000405059224790141, + "loss": 2.945981502532959, + "step": 11010, + "token_acc": 0.30668648457886505 + }, + { + "epoch": 6.454412195836998, + "grad_norm": 0.1810737399166512, + "learning_rate": 0.00040504021752926756, + "loss": 2.9665284156799316, + "step": 11011, + "token_acc": 0.30283879814821707 + }, + { + "epoch": 6.454998534154207, + "grad_norm": 0.23303855946178137, + "learning_rate": 0.0004050212088120001, + "loss": 2.979137659072876, + "step": 11012, + "token_acc": 0.30002830382205464 + }, + { + "epoch": 6.455584872471416, + "grad_norm": 0.18550357882650165, + "learning_rate": 0.0004050021986385173, + "loss": 2.982293128967285, + "step": 11013, + "token_acc": 0.30091033667683814 + }, + { + "epoch": 6.456171210788625, + "grad_norm": 0.16276016417952907, + "learning_rate": 0.0004049831870089978, + "loss": 2.917393922805786, + "step": 11014, + "token_acc": 0.310715242010427 + }, + { + "epoch": 6.456757549105834, + "grad_norm": 0.18105753600890304, + "learning_rate": 0.00040496417392362, + "loss": 2.9471259117126465, + "step": 11015, + "token_acc": 0.3056361316154255 + }, + { + "epoch": 6.457343887423043, + "grad_norm": 0.19732029779609106, + "learning_rate": 0.0004049451593825626, + "loss": 2.9281907081604004, + "step": 11016, + "token_acc": 0.3084952582347039 + }, + { + "epoch": 6.457930225740252, + "grad_norm": 0.16987720435874348, + "learning_rate": 0.0004049261433860043, + "loss": 2.9240832328796387, + "step": 11017, + "token_acc": 0.30802106985396355 + }, + { + "epoch": 6.4585165640574616, + "grad_norm": 0.16232685604741973, + "learning_rate": 0.0004049071259341236, + "loss": 2.978379964828491, + "step": 11018, + "token_acc": 0.30260693100218544 + }, + { + "epoch": 6.45910290237467, + "grad_norm": 0.15835180754354355, + "learning_rate": 0.0004048881070270992, + "loss": 2.9198546409606934, + "step": 11019, + "token_acc": 0.3112500995936579 + }, + { + "epoch": 6.459689240691879, + "grad_norm": 0.16073894935547114, + "learning_rate": 0.0004048690866651097, + "loss": 2.972555160522461, + "step": 11020, + "token_acc": 0.30151994527341913 + }, + { + "epoch": 6.460275579009088, + "grad_norm": 0.1549009792349556, + "learning_rate": 0.00040485006484833384, + "loss": 2.9436280727386475, + "step": 11021, + "token_acc": 0.30581908336562913 + }, + { + "epoch": 6.460861917326297, + "grad_norm": 0.16447859903800646, + "learning_rate": 0.00040483104157695035, + "loss": 2.9682822227478027, + "step": 11022, + "token_acc": 0.30397973737010187 + }, + { + "epoch": 6.461448255643506, + "grad_norm": 0.20129253412972184, + "learning_rate": 0.00040481201685113783, + "loss": 2.9514973163604736, + "step": 11023, + "token_acc": 0.30446153317817864 + }, + { + "epoch": 6.462034593960715, + "grad_norm": 0.3312656766050174, + "learning_rate": 0.000404792990671075, + "loss": 2.9329237937927246, + "step": 11024, + "token_acc": 0.30678949454536747 + }, + { + "epoch": 6.4626209322779244, + "grad_norm": 0.359687891140474, + "learning_rate": 0.0004047739630369406, + "loss": 2.923241138458252, + "step": 11025, + "token_acc": 0.3091828273371673 + }, + { + "epoch": 6.463207270595134, + "grad_norm": 0.18999998095476667, + "learning_rate": 0.0004047549339489134, + "loss": 2.974426507949829, + "step": 11026, + "token_acc": 0.3018828333696503 + }, + { + "epoch": 6.463793608912343, + "grad_norm": 0.23845697061423365, + "learning_rate": 0.00040473590340717213, + "loss": 2.965245246887207, + "step": 11027, + "token_acc": 0.30380509641873277 + }, + { + "epoch": 6.464379947229552, + "grad_norm": 0.24183730893845434, + "learning_rate": 0.0004047168714118956, + "loss": 2.9721157550811768, + "step": 11028, + "token_acc": 0.3009478489531856 + }, + { + "epoch": 6.464966285546761, + "grad_norm": 0.20764023463261477, + "learning_rate": 0.00040469783796326245, + "loss": 2.911668300628662, + "step": 11029, + "token_acc": 0.3096965405093503 + }, + { + "epoch": 6.465552623863969, + "grad_norm": 0.24801309810389363, + "learning_rate": 0.00040467880306145165, + "loss": 3.0214481353759766, + "step": 11030, + "token_acc": 0.29407991858132193 + }, + { + "epoch": 6.466138962181178, + "grad_norm": 0.16091745918354255, + "learning_rate": 0.0004046597667066419, + "loss": 2.964920997619629, + "step": 11031, + "token_acc": 0.30342505296408107 + }, + { + "epoch": 6.466725300498387, + "grad_norm": 0.19904757272612675, + "learning_rate": 0.00040464072889901206, + "loss": 2.9421639442443848, + "step": 11032, + "token_acc": 0.3075618889392601 + }, + { + "epoch": 6.4673116388155965, + "grad_norm": 0.18344316535051605, + "learning_rate": 0.0004046216896387409, + "loss": 2.983275890350342, + "step": 11033, + "token_acc": 0.3008353803122763 + }, + { + "epoch": 6.467897977132806, + "grad_norm": 0.19049978161785916, + "learning_rate": 0.0004046026489260074, + "loss": 3.003890037536621, + "step": 11034, + "token_acc": 0.29769240318103424 + }, + { + "epoch": 6.468484315450015, + "grad_norm": 0.18661858050436544, + "learning_rate": 0.0004045836067609903, + "loss": 2.9422879219055176, + "step": 11035, + "token_acc": 0.3057319286897656 + }, + { + "epoch": 6.469070653767224, + "grad_norm": 0.17148065335789506, + "learning_rate": 0.00040456456314386845, + "loss": 2.9301772117614746, + "step": 11036, + "token_acc": 0.30922511458488083 + }, + { + "epoch": 6.469656992084433, + "grad_norm": 0.21583330851988694, + "learning_rate": 0.00040454551807482097, + "loss": 2.9676766395568848, + "step": 11037, + "token_acc": 0.30144693486839536 + }, + { + "epoch": 6.470243330401642, + "grad_norm": 0.1652773508973196, + "learning_rate": 0.0004045264715540265, + "loss": 2.946106433868408, + "step": 11038, + "token_acc": 0.3063622298532083 + }, + { + "epoch": 6.470829668718851, + "grad_norm": 0.2009158894428785, + "learning_rate": 0.000404507423581664, + "loss": 2.9670166969299316, + "step": 11039, + "token_acc": 0.3017612692517222 + }, + { + "epoch": 6.47141600703606, + "grad_norm": 0.14933398826241234, + "learning_rate": 0.00040448837415791255, + "loss": 2.9930896759033203, + "step": 11040, + "token_acc": 0.30068521999979453 + }, + { + "epoch": 6.4720023453532685, + "grad_norm": 0.18025734834700444, + "learning_rate": 0.000404469323282951, + "loss": 2.9450619220733643, + "step": 11041, + "token_acc": 0.30822418927691847 + }, + { + "epoch": 6.472588683670478, + "grad_norm": 0.1433719502956986, + "learning_rate": 0.0004044502709569583, + "loss": 2.9645299911499023, + "step": 11042, + "token_acc": 0.3029112742501072 + }, + { + "epoch": 6.473175021987687, + "grad_norm": 0.17519320857277454, + "learning_rate": 0.00040443121718011343, + "loss": 2.9609498977661133, + "step": 11043, + "token_acc": 0.30290424548174016 + }, + { + "epoch": 6.473761360304896, + "grad_norm": 0.15932307920817604, + "learning_rate": 0.0004044121619525953, + "loss": 2.983809232711792, + "step": 11044, + "token_acc": 0.3008128020119765 + }, + { + "epoch": 6.474347698622105, + "grad_norm": 0.17436110107963299, + "learning_rate": 0.00040439310527458307, + "loss": 2.964989423751831, + "step": 11045, + "token_acc": 0.30319696479515307 + }, + { + "epoch": 6.474934036939314, + "grad_norm": 0.1666664248681814, + "learning_rate": 0.0004043740471462556, + "loss": 2.9466147422790527, + "step": 11046, + "token_acc": 0.3050879237146263 + }, + { + "epoch": 6.475520375256523, + "grad_norm": 0.19103484175670707, + "learning_rate": 0.00040435498756779203, + "loss": 2.9719066619873047, + "step": 11047, + "token_acc": 0.3012914676632004 + }, + { + "epoch": 6.476106713573732, + "grad_norm": 0.1568177668594751, + "learning_rate": 0.00040433592653937135, + "loss": 2.9644248485565186, + "step": 11048, + "token_acc": 0.3021300948750663 + }, + { + "epoch": 6.476693051890941, + "grad_norm": 0.24384840681966846, + "learning_rate": 0.00040431686406117254, + "loss": 2.947810411453247, + "step": 11049, + "token_acc": 0.3058180307005013 + }, + { + "epoch": 6.4772793902081505, + "grad_norm": 0.14228161784566254, + "learning_rate": 0.0004042978001333748, + "loss": 2.9597244262695312, + "step": 11050, + "token_acc": 0.3038345588526708 + }, + { + "epoch": 6.477865728525359, + "grad_norm": 0.24128007140971666, + "learning_rate": 0.0004042787347561571, + "loss": 2.9976391792297363, + "step": 11051, + "token_acc": 0.2981018850694671 + }, + { + "epoch": 6.478452066842568, + "grad_norm": 0.17771078577716284, + "learning_rate": 0.00040425966792969866, + "loss": 2.946054697036743, + "step": 11052, + "token_acc": 0.3062355849286874 + }, + { + "epoch": 6.479038405159777, + "grad_norm": 0.1972764012027858, + "learning_rate": 0.00040424059965417846, + "loss": 2.958761692047119, + "step": 11053, + "token_acc": 0.30292308836999277 + }, + { + "epoch": 6.479624743476986, + "grad_norm": 0.180484643260157, + "learning_rate": 0.0004042215299297757, + "loss": 2.937087059020996, + "step": 11054, + "token_acc": 0.3086102978721221 + }, + { + "epoch": 6.480211081794195, + "grad_norm": 0.25769498179007067, + "learning_rate": 0.0004042024587566694, + "loss": 2.9915108680725098, + "step": 11055, + "token_acc": 0.3007145410953142 + }, + { + "epoch": 6.480797420111404, + "grad_norm": 0.2316684140044936, + "learning_rate": 0.0004041833861350388, + "loss": 2.939375400543213, + "step": 11056, + "token_acc": 0.30582905354863094 + }, + { + "epoch": 6.481383758428613, + "grad_norm": 0.2614347270163304, + "learning_rate": 0.0004041643120650631, + "loss": 2.9615275859832764, + "step": 11057, + "token_acc": 0.30285711270132776 + }, + { + "epoch": 6.4819700967458225, + "grad_norm": 0.27686865347087153, + "learning_rate": 0.0004041452365469215, + "loss": 2.9852428436279297, + "step": 11058, + "token_acc": 0.29849912866874606 + }, + { + "epoch": 6.482556435063032, + "grad_norm": 0.21266965391364331, + "learning_rate": 0.00040412615958079296, + "loss": 3.0051534175872803, + "step": 11059, + "token_acc": 0.2960327157699744 + }, + { + "epoch": 6.483142773380241, + "grad_norm": 0.22115245201070177, + "learning_rate": 0.0004041070811668569, + "loss": 2.9762964248657227, + "step": 11060, + "token_acc": 0.30322849949778896 + }, + { + "epoch": 6.48372911169745, + "grad_norm": 0.19203711554660205, + "learning_rate": 0.0004040880013052925, + "loss": 2.949918031692505, + "step": 11061, + "token_acc": 0.30508062110435996 + }, + { + "epoch": 6.484315450014659, + "grad_norm": 0.1957515101659472, + "learning_rate": 0.00040406891999627897, + "loss": 2.9779117107391357, + "step": 11062, + "token_acc": 0.3024635345776165 + }, + { + "epoch": 6.484901788331867, + "grad_norm": 0.17192739121866624, + "learning_rate": 0.00040404983723999556, + "loss": 2.953482151031494, + "step": 11063, + "token_acc": 0.3037689831722254 + }, + { + "epoch": 6.485488126649076, + "grad_norm": 0.20204318522950107, + "learning_rate": 0.0004040307530366214, + "loss": 2.9332275390625, + "step": 11064, + "token_acc": 0.30831269201601613 + }, + { + "epoch": 6.486074464966285, + "grad_norm": 0.21347918986004039, + "learning_rate": 0.000404011667386336, + "loss": 2.9197020530700684, + "step": 11065, + "token_acc": 0.3088927434737735 + }, + { + "epoch": 6.4866608032834945, + "grad_norm": 0.18967424204918165, + "learning_rate": 0.00040399258028931843, + "loss": 2.990913152694702, + "step": 11066, + "token_acc": 0.29995835770729434 + }, + { + "epoch": 6.487247141600704, + "grad_norm": 0.20324701700526862, + "learning_rate": 0.00040397349174574814, + "loss": 2.9768433570861816, + "step": 11067, + "token_acc": 0.3015643864471217 + }, + { + "epoch": 6.487833479917913, + "grad_norm": 0.18338190804273638, + "learning_rate": 0.0004039544017558044, + "loss": 2.980958938598633, + "step": 11068, + "token_acc": 0.30056974143701454 + }, + { + "epoch": 6.488419818235122, + "grad_norm": 0.18919699248370125, + "learning_rate": 0.00040393531031966646, + "loss": 3.013619899749756, + "step": 11069, + "token_acc": 0.2962669839211221 + }, + { + "epoch": 6.489006156552331, + "grad_norm": 0.20775545955321417, + "learning_rate": 0.00040391621743751373, + "loss": 2.978884220123291, + "step": 11070, + "token_acc": 0.3002861055602186 + }, + { + "epoch": 6.48959249486954, + "grad_norm": 0.1791088632399652, + "learning_rate": 0.00040389712310952546, + "loss": 2.9460225105285645, + "step": 11071, + "token_acc": 0.3041709135133553 + }, + { + "epoch": 6.490178833186749, + "grad_norm": 0.18329203057404797, + "learning_rate": 0.0004038780273358812, + "loss": 2.9357171058654785, + "step": 11072, + "token_acc": 0.3067763306791762 + }, + { + "epoch": 6.490765171503957, + "grad_norm": 0.18800826651635513, + "learning_rate": 0.0004038589301167602, + "loss": 2.9659342765808105, + "step": 11073, + "token_acc": 0.3034585446240747 + }, + { + "epoch": 6.4913515098211665, + "grad_norm": 0.1902953162359805, + "learning_rate": 0.0004038398314523419, + "loss": 2.9764933586120605, + "step": 11074, + "token_acc": 0.30184485838224034 + }, + { + "epoch": 6.491937848138376, + "grad_norm": 0.16447794515883632, + "learning_rate": 0.0004038207313428056, + "loss": 2.9616081714630127, + "step": 11075, + "token_acc": 0.3050008027625918 + }, + { + "epoch": 6.492524186455585, + "grad_norm": 0.20159833388841156, + "learning_rate": 0.0004038016297883309, + "loss": 2.9155707359313965, + "step": 11076, + "token_acc": 0.3102328841481731 + }, + { + "epoch": 6.493110524772794, + "grad_norm": 0.1464278051951114, + "learning_rate": 0.0004037825267890971, + "loss": 2.96445894241333, + "step": 11077, + "token_acc": 0.30274721490215484 + }, + { + "epoch": 6.493696863090003, + "grad_norm": 0.18730342961676183, + "learning_rate": 0.0004037634223452836, + "loss": 2.9765865802764893, + "step": 11078, + "token_acc": 0.30059302304022967 + }, + { + "epoch": 6.494283201407212, + "grad_norm": 0.1547025149518504, + "learning_rate": 0.0004037443164570701, + "loss": 2.893035888671875, + "step": 11079, + "token_acc": 0.3129223637155889 + }, + { + "epoch": 6.494869539724421, + "grad_norm": 0.21560707900866685, + "learning_rate": 0.00040372520912463586, + "loss": 2.949376106262207, + "step": 11080, + "token_acc": 0.3053082738649569 + }, + { + "epoch": 6.49545587804163, + "grad_norm": 0.17054647591230307, + "learning_rate": 0.00040370610034816043, + "loss": 2.960934638977051, + "step": 11081, + "token_acc": 0.30258094391543405 + }, + { + "epoch": 6.496042216358839, + "grad_norm": 0.18547531815637075, + "learning_rate": 0.00040368699012782326, + "loss": 3.0074872970581055, + "step": 11082, + "token_acc": 0.29707040604544727 + }, + { + "epoch": 6.4966285546760485, + "grad_norm": 0.18844877787442277, + "learning_rate": 0.00040366787846380395, + "loss": 2.990184783935547, + "step": 11083, + "token_acc": 0.30053078944481304 + }, + { + "epoch": 6.497214892993257, + "grad_norm": 0.2385125878390623, + "learning_rate": 0.00040364876535628204, + "loss": 2.9769949913024902, + "step": 11084, + "token_acc": 0.29980267507763075 + }, + { + "epoch": 6.497801231310466, + "grad_norm": 0.20130320205267963, + "learning_rate": 0.00040362965080543696, + "loss": 2.95400071144104, + "step": 11085, + "token_acc": 0.3049591405514116 + }, + { + "epoch": 6.498387569627675, + "grad_norm": 0.21267316651769153, + "learning_rate": 0.0004036105348114484, + "loss": 3.0000522136688232, + "step": 11086, + "token_acc": 0.29770213115216615 + }, + { + "epoch": 6.498973907944884, + "grad_norm": 0.29276819219364786, + "learning_rate": 0.00040359141737449577, + "loss": 2.972066879272461, + "step": 11087, + "token_acc": 0.3011774202311376 + }, + { + "epoch": 6.499560246262093, + "grad_norm": 0.1594625919880992, + "learning_rate": 0.00040357229849475874, + "loss": 2.9509942531585693, + "step": 11088, + "token_acc": 0.305596837944664 + }, + { + "epoch": 6.500146584579302, + "grad_norm": 0.2549587862401406, + "learning_rate": 0.000403553178172417, + "loss": 3.016068458557129, + "step": 11089, + "token_acc": 0.2955493811568578 + }, + { + "epoch": 6.500732922896511, + "grad_norm": 0.1848205064782445, + "learning_rate": 0.00040353405640764997, + "loss": 2.9823317527770996, + "step": 11090, + "token_acc": 0.3005174552635927 + }, + { + "epoch": 6.5013192612137205, + "grad_norm": 0.2287311698612774, + "learning_rate": 0.0004035149332006374, + "loss": 2.937300443649292, + "step": 11091, + "token_acc": 0.3081988716953364 + }, + { + "epoch": 6.50190559953093, + "grad_norm": 0.16495878775379488, + "learning_rate": 0.000403495808551559, + "loss": 2.98425030708313, + "step": 11092, + "token_acc": 0.3021631571039131 + }, + { + "epoch": 6.502491937848139, + "grad_norm": 0.23513495163756104, + "learning_rate": 0.00040347668246059416, + "loss": 3.0295958518981934, + "step": 11093, + "token_acc": 0.2943973720572516 + }, + { + "epoch": 6.503078276165347, + "grad_norm": 0.17645608386483533, + "learning_rate": 0.00040345755492792276, + "loss": 2.9699559211730957, + "step": 11094, + "token_acc": 0.30352967667103914 + }, + { + "epoch": 6.503664614482556, + "grad_norm": 0.19432317053656015, + "learning_rate": 0.0004034384259537244, + "loss": 2.969893455505371, + "step": 11095, + "token_acc": 0.30074205672045734 + }, + { + "epoch": 6.504250952799765, + "grad_norm": 0.15641497195410242, + "learning_rate": 0.0004034192955381788, + "loss": 2.962783098220825, + "step": 11096, + "token_acc": 0.3045185988642889 + }, + { + "epoch": 6.504837291116974, + "grad_norm": 0.1784147853953123, + "learning_rate": 0.00040340016368146573, + "loss": 2.965872049331665, + "step": 11097, + "token_acc": 0.30374863200643454 + }, + { + "epoch": 6.505423629434183, + "grad_norm": 0.14982213055844684, + "learning_rate": 0.00040338103038376475, + "loss": 2.988189458847046, + "step": 11098, + "token_acc": 0.30027467289189846 + }, + { + "epoch": 6.5060099677513925, + "grad_norm": 0.15696720245700319, + "learning_rate": 0.00040336189564525564, + "loss": 2.945434331893921, + "step": 11099, + "token_acc": 0.3055884467945911 + }, + { + "epoch": 6.506596306068602, + "grad_norm": 0.15062387034080923, + "learning_rate": 0.00040334275946611825, + "loss": 2.991771697998047, + "step": 11100, + "token_acc": 0.2976693658323072 + }, + { + "epoch": 6.507182644385811, + "grad_norm": 0.15413636192794616, + "learning_rate": 0.00040332362184653225, + "loss": 2.9476187229156494, + "step": 11101, + "token_acc": 0.3051726013152345 + }, + { + "epoch": 6.50776898270302, + "grad_norm": 0.15590603656144794, + "learning_rate": 0.00040330448278667744, + "loss": 2.9957499504089355, + "step": 11102, + "token_acc": 0.2963924994616266 + }, + { + "epoch": 6.508355321020229, + "grad_norm": 0.18207075011384252, + "learning_rate": 0.00040328534228673354, + "loss": 3.0018863677978516, + "step": 11103, + "token_acc": 0.2972667028276301 + }, + { + "epoch": 6.508941659337438, + "grad_norm": 0.15906994016353035, + "learning_rate": 0.0004032662003468804, + "loss": 3.008528232574463, + "step": 11104, + "token_acc": 0.2964832091278475 + }, + { + "epoch": 6.509527997654647, + "grad_norm": 0.17118606757593374, + "learning_rate": 0.00040324705696729793, + "loss": 2.9678163528442383, + "step": 11105, + "token_acc": 0.30353108369466497 + }, + { + "epoch": 6.510114335971855, + "grad_norm": 0.15688641913000426, + "learning_rate": 0.00040322791214816584, + "loss": 2.979048252105713, + "step": 11106, + "token_acc": 0.30115884828954415 + }, + { + "epoch": 6.5107006742890645, + "grad_norm": 0.1617357399531289, + "learning_rate": 0.0004032087658896639, + "loss": 2.9192066192626953, + "step": 11107, + "token_acc": 0.3098767839450218 + }, + { + "epoch": 6.511287012606274, + "grad_norm": 0.17487596786405155, + "learning_rate": 0.0004031896181919722, + "loss": 2.9957218170166016, + "step": 11108, + "token_acc": 0.29726094272596965 + }, + { + "epoch": 6.511873350923483, + "grad_norm": 0.14805540790508762, + "learning_rate": 0.0004031704690552703, + "loss": 2.9292633533477783, + "step": 11109, + "token_acc": 0.30912352694733 + }, + { + "epoch": 6.512459689240692, + "grad_norm": 0.15503514520808528, + "learning_rate": 0.0004031513184797383, + "loss": 2.986847400665283, + "step": 11110, + "token_acc": 0.29821361980802924 + }, + { + "epoch": 6.513046027557901, + "grad_norm": 0.16031802276915977, + "learning_rate": 0.0004031321664655562, + "loss": 2.9577951431274414, + "step": 11111, + "token_acc": 0.30411198005805945 + }, + { + "epoch": 6.51363236587511, + "grad_norm": 0.16964532366968735, + "learning_rate": 0.00040311301301290355, + "loss": 2.956791877746582, + "step": 11112, + "token_acc": 0.30412319816397426 + }, + { + "epoch": 6.514218704192319, + "grad_norm": 0.15050471024297965, + "learning_rate": 0.0004030938581219605, + "loss": 2.98944354057312, + "step": 11113, + "token_acc": 0.29986565432450585 + }, + { + "epoch": 6.514805042509528, + "grad_norm": 0.16780579458213124, + "learning_rate": 0.00040307470179290695, + "loss": 2.968371868133545, + "step": 11114, + "token_acc": 0.30215735144507766 + }, + { + "epoch": 6.515391380826737, + "grad_norm": 0.1806889114251306, + "learning_rate": 0.0004030555440259229, + "loss": 2.9723801612854004, + "step": 11115, + "token_acc": 0.302467138890111 + }, + { + "epoch": 6.515977719143946, + "grad_norm": 0.1698671949536607, + "learning_rate": 0.0004030363848211882, + "loss": 2.960174322128296, + "step": 11116, + "token_acc": 0.30352090862157977 + }, + { + "epoch": 6.516564057461155, + "grad_norm": 0.1709178425125163, + "learning_rate": 0.00040301722417888285, + "loss": 2.957637071609497, + "step": 11117, + "token_acc": 0.3043306915408282 + }, + { + "epoch": 6.517150395778364, + "grad_norm": 0.15693031581161077, + "learning_rate": 0.00040299806209918696, + "loss": 2.961458683013916, + "step": 11118, + "token_acc": 0.30221822201680626 + }, + { + "epoch": 6.517736734095573, + "grad_norm": 0.18467919253195114, + "learning_rate": 0.00040297889858228043, + "loss": 2.9668545722961426, + "step": 11119, + "token_acc": 0.3034947118464014 + }, + { + "epoch": 6.518323072412782, + "grad_norm": 0.17617648984024362, + "learning_rate": 0.00040295973362834334, + "loss": 2.9362878799438477, + "step": 11120, + "token_acc": 0.3076757131062402 + }, + { + "epoch": 6.518909410729991, + "grad_norm": 0.167724274651725, + "learning_rate": 0.00040294056723755555, + "loss": 2.9585776329040527, + "step": 11121, + "token_acc": 0.30354692325525 + }, + { + "epoch": 6.5194957490472, + "grad_norm": 0.16246740202872795, + "learning_rate": 0.0004029213994100973, + "loss": 2.979480266571045, + "step": 11122, + "token_acc": 0.30170337723409657 + }, + { + "epoch": 6.520082087364409, + "grad_norm": 0.1698085538343002, + "learning_rate": 0.00040290223014614857, + "loss": 2.945061445236206, + "step": 11123, + "token_acc": 0.30597005267312893 + }, + { + "epoch": 6.5206684256816185, + "grad_norm": 0.15524081041669321, + "learning_rate": 0.0004028830594458894, + "loss": 2.9161622524261475, + "step": 11124, + "token_acc": 0.31004983929105884 + }, + { + "epoch": 6.521254763998828, + "grad_norm": 0.18531615583148847, + "learning_rate": 0.00040286388730949985, + "loss": 2.995180130004883, + "step": 11125, + "token_acc": 0.29803020980029193 + }, + { + "epoch": 6.521841102316037, + "grad_norm": 0.22921665883701475, + "learning_rate": 0.00040284471373716016, + "loss": 3.0060696601867676, + "step": 11126, + "token_acc": 0.2971086414107277 + }, + { + "epoch": 6.522427440633246, + "grad_norm": 0.21149266143443865, + "learning_rate": 0.00040282553872905024, + "loss": 2.975681781768799, + "step": 11127, + "token_acc": 0.30133797088440717 + }, + { + "epoch": 6.523013778950454, + "grad_norm": 0.1631919708140131, + "learning_rate": 0.0004028063622853504, + "loss": 2.976017713546753, + "step": 11128, + "token_acc": 0.3011949106153456 + }, + { + "epoch": 6.523600117267663, + "grad_norm": 0.16375836806997898, + "learning_rate": 0.0004027871844062407, + "loss": 2.9656481742858887, + "step": 11129, + "token_acc": 0.30218003005053956 + }, + { + "epoch": 6.524186455584872, + "grad_norm": 0.18274123697187822, + "learning_rate": 0.00040276800509190126, + "loss": 2.9700756072998047, + "step": 11130, + "token_acc": 0.300978792822186 + }, + { + "epoch": 6.524772793902081, + "grad_norm": 0.18383092967811648, + "learning_rate": 0.00040274882434251225, + "loss": 2.9753477573394775, + "step": 11131, + "token_acc": 0.3030169560366814 + }, + { + "epoch": 6.5253591322192905, + "grad_norm": 0.1435593970516185, + "learning_rate": 0.00040272964215825387, + "loss": 2.9538607597351074, + "step": 11132, + "token_acc": 0.30444851800072686 + }, + { + "epoch": 6.5259454705365, + "grad_norm": 0.15660165711008048, + "learning_rate": 0.0004027104585393063, + "loss": 2.939438819885254, + "step": 11133, + "token_acc": 0.3070572056632669 + }, + { + "epoch": 6.526531808853709, + "grad_norm": 0.16240888988111302, + "learning_rate": 0.0004026912734858498, + "loss": 2.9781806468963623, + "step": 11134, + "token_acc": 0.3022858218397415 + }, + { + "epoch": 6.527118147170918, + "grad_norm": 0.16107338658447198, + "learning_rate": 0.00040267208699806454, + "loss": 3.0356955528259277, + "step": 11135, + "token_acc": 0.2936122791940695 + }, + { + "epoch": 6.527704485488127, + "grad_norm": 0.15150660891741344, + "learning_rate": 0.00040265289907613074, + "loss": 2.932180404663086, + "step": 11136, + "token_acc": 0.3081678392650425 + }, + { + "epoch": 6.528290823805335, + "grad_norm": 0.16098823009396673, + "learning_rate": 0.0004026337097202286, + "loss": 2.9713478088378906, + "step": 11137, + "token_acc": 0.301908216049899 + }, + { + "epoch": 6.528877162122544, + "grad_norm": 0.1676462419563933, + "learning_rate": 0.0004026145189305385, + "loss": 2.9314002990722656, + "step": 11138, + "token_acc": 0.30772203380669916 + }, + { + "epoch": 6.529463500439753, + "grad_norm": 0.1649185496930345, + "learning_rate": 0.0004025953267072406, + "loss": 3.0186400413513184, + "step": 11139, + "token_acc": 0.2955524776854417 + }, + { + "epoch": 6.5300498387569625, + "grad_norm": 0.16457025830336075, + "learning_rate": 0.0004025761330505152, + "loss": 2.987926483154297, + "step": 11140, + "token_acc": 0.2992171105950669 + }, + { + "epoch": 6.530636177074172, + "grad_norm": 0.17330383111609074, + "learning_rate": 0.0004025569379605427, + "loss": 2.951719284057617, + "step": 11141, + "token_acc": 0.3037747881022273 + }, + { + "epoch": 6.531222515391381, + "grad_norm": 0.26333654755591723, + "learning_rate": 0.0004025377414375033, + "loss": 2.9190478324890137, + "step": 11142, + "token_acc": 0.30985625850268395 + }, + { + "epoch": 6.53180885370859, + "grad_norm": 0.37302851216171645, + "learning_rate": 0.00040251854348157743, + "loss": 2.9722611904144287, + "step": 11143, + "token_acc": 0.30288942198508434 + }, + { + "epoch": 6.532395192025799, + "grad_norm": 0.26622405130076077, + "learning_rate": 0.0004024993440929453, + "loss": 2.9543251991271973, + "step": 11144, + "token_acc": 0.30283647927780055 + }, + { + "epoch": 6.532981530343008, + "grad_norm": 0.21202817297220808, + "learning_rate": 0.0004024801432717874, + "loss": 2.938084125518799, + "step": 11145, + "token_acc": 0.308535713339866 + }, + { + "epoch": 6.533567868660217, + "grad_norm": 0.2555084660870878, + "learning_rate": 0.00040246094101828396, + "loss": 3.028245210647583, + "step": 11146, + "token_acc": 0.29508889335145017 + }, + { + "epoch": 6.534154206977426, + "grad_norm": 0.1936756239757135, + "learning_rate": 0.00040244173733261534, + "loss": 2.9544386863708496, + "step": 11147, + "token_acc": 0.30477834969326706 + }, + { + "epoch": 6.534740545294635, + "grad_norm": 0.2127091818913759, + "learning_rate": 0.00040242253221496214, + "loss": 2.992856740951538, + "step": 11148, + "token_acc": 0.2992971631370184 + }, + { + "epoch": 6.535326883611844, + "grad_norm": 0.19251183226628163, + "learning_rate": 0.0004024033256655046, + "loss": 2.9649558067321777, + "step": 11149, + "token_acc": 0.3030177644675327 + }, + { + "epoch": 6.535913221929053, + "grad_norm": 0.20750561467201029, + "learning_rate": 0.0004023841176844233, + "loss": 2.9915642738342285, + "step": 11150, + "token_acc": 0.2974764231040518 + }, + { + "epoch": 6.536499560246262, + "grad_norm": 0.16179455324145559, + "learning_rate": 0.00040236490827189845, + "loss": 2.9879589080810547, + "step": 11151, + "token_acc": 0.30011260674180734 + }, + { + "epoch": 6.537085898563471, + "grad_norm": 0.1743826101093737, + "learning_rate": 0.00040234569742811057, + "loss": 2.9683661460876465, + "step": 11152, + "token_acc": 0.30300225541512404 + }, + { + "epoch": 6.53767223688068, + "grad_norm": 0.16419115269212634, + "learning_rate": 0.00040232648515324017, + "loss": 2.975533962249756, + "step": 11153, + "token_acc": 0.3014663367625993 + }, + { + "epoch": 6.538258575197889, + "grad_norm": 0.18478801943418596, + "learning_rate": 0.0004023072714474677, + "loss": 2.964743137359619, + "step": 11154, + "token_acc": 0.3022246587695208 + }, + { + "epoch": 6.538844913515098, + "grad_norm": 0.15050397061360785, + "learning_rate": 0.0004022880563109737, + "loss": 2.949847936630249, + "step": 11155, + "token_acc": 0.30535603452050136 + }, + { + "epoch": 6.5394312518323074, + "grad_norm": 0.15521764653476097, + "learning_rate": 0.00040226883974393856, + "loss": 2.9790592193603516, + "step": 11156, + "token_acc": 0.2991982927673319 + }, + { + "epoch": 6.540017590149517, + "grad_norm": 0.14434526780177975, + "learning_rate": 0.0004022496217465429, + "loss": 2.945652484893799, + "step": 11157, + "token_acc": 0.30517695371290454 + }, + { + "epoch": 6.540603928466726, + "grad_norm": 0.17019691836625972, + "learning_rate": 0.00040223040231896715, + "loss": 2.984821319580078, + "step": 11158, + "token_acc": 0.3020581161225625 + }, + { + "epoch": 6.541190266783934, + "grad_norm": 0.15991862652942382, + "learning_rate": 0.00040221118146139195, + "loss": 2.947481870651245, + "step": 11159, + "token_acc": 0.30513531752990375 + }, + { + "epoch": 6.541776605101143, + "grad_norm": 0.1406013242174145, + "learning_rate": 0.0004021919591739978, + "loss": 2.970341682434082, + "step": 11160, + "token_acc": 0.3007312566549053 + }, + { + "epoch": 6.542362943418352, + "grad_norm": 0.15241765134568322, + "learning_rate": 0.00040217273545696525, + "loss": 2.9497389793395996, + "step": 11161, + "token_acc": 0.3057951863080997 + }, + { + "epoch": 6.542949281735561, + "grad_norm": 0.1651751665964146, + "learning_rate": 0.00040215351031047496, + "loss": 2.9672632217407227, + "step": 11162, + "token_acc": 0.3012107793589929 + }, + { + "epoch": 6.54353562005277, + "grad_norm": 0.16702809655806958, + "learning_rate": 0.0004021342837347074, + "loss": 2.959089756011963, + "step": 11163, + "token_acc": 0.3047524793091778 + }, + { + "epoch": 6.5441219583699795, + "grad_norm": 0.16651638106318, + "learning_rate": 0.0004021150557298433, + "loss": 2.9721546173095703, + "step": 11164, + "token_acc": 0.30175225768158426 + }, + { + "epoch": 6.544708296687189, + "grad_norm": 0.16771850223291276, + "learning_rate": 0.00040209582629606325, + "loss": 2.964848518371582, + "step": 11165, + "token_acc": 0.3009714428831018 + }, + { + "epoch": 6.545294635004398, + "grad_norm": 0.16463912302476785, + "learning_rate": 0.0004020765954335478, + "loss": 2.9655258655548096, + "step": 11166, + "token_acc": 0.3031997450893729 + }, + { + "epoch": 6.545880973321607, + "grad_norm": 0.15505401813929393, + "learning_rate": 0.00040205736314247767, + "loss": 2.9873099327087402, + "step": 11167, + "token_acc": 0.29897726219027787 + }, + { + "epoch": 6.546467311638816, + "grad_norm": 0.18476019278044695, + "learning_rate": 0.0004020381294230335, + "loss": 2.9754834175109863, + "step": 11168, + "token_acc": 0.30082285869134723 + }, + { + "epoch": 6.547053649956025, + "grad_norm": 0.20358891319795588, + "learning_rate": 0.00040201889427539606, + "loss": 2.9633750915527344, + "step": 11169, + "token_acc": 0.3029113560728886 + }, + { + "epoch": 6.547639988273234, + "grad_norm": 0.1476443707931811, + "learning_rate": 0.0004019996576997459, + "loss": 2.9446303844451904, + "step": 11170, + "token_acc": 0.30619191265690243 + }, + { + "epoch": 6.548226326590442, + "grad_norm": 0.22145729808791412, + "learning_rate": 0.00040198041969626377, + "loss": 2.9365415573120117, + "step": 11171, + "token_acc": 0.30588020170698915 + }, + { + "epoch": 6.5488126649076515, + "grad_norm": 0.2661125124865491, + "learning_rate": 0.0004019611802651304, + "loss": 2.987351655960083, + "step": 11172, + "token_acc": 0.3000746718821832 + }, + { + "epoch": 6.549399003224861, + "grad_norm": 0.18983413726452605, + "learning_rate": 0.0004019419394065266, + "loss": 2.979475498199463, + "step": 11173, + "token_acc": 0.30192675241489747 + }, + { + "epoch": 6.54998534154207, + "grad_norm": 0.1787259396518735, + "learning_rate": 0.0004019226971206329, + "loss": 2.989509105682373, + "step": 11174, + "token_acc": 0.2994121969140338 + }, + { + "epoch": 6.550571679859279, + "grad_norm": 0.1899232531587389, + "learning_rate": 0.00040190345340763024, + "loss": 2.9316983222961426, + "step": 11175, + "token_acc": 0.30696294881386327 + }, + { + "epoch": 6.551158018176488, + "grad_norm": 0.15998517076276333, + "learning_rate": 0.00040188420826769923, + "loss": 3.002680778503418, + "step": 11176, + "token_acc": 0.2970772610442392 + }, + { + "epoch": 6.551744356493697, + "grad_norm": 0.31964437712731397, + "learning_rate": 0.0004018649617010209, + "loss": 2.9744925498962402, + "step": 11177, + "token_acc": 0.29989171089111955 + }, + { + "epoch": 6.552330694810906, + "grad_norm": 0.27486265657509795, + "learning_rate": 0.0004018457137077758, + "loss": 2.9604973793029785, + "step": 11178, + "token_acc": 0.30386945792525655 + }, + { + "epoch": 6.552917033128115, + "grad_norm": 0.167832092391946, + "learning_rate": 0.00040182646428814486, + "loss": 2.976696491241455, + "step": 11179, + "token_acc": 0.3009972299168975 + }, + { + "epoch": 6.5535033714453235, + "grad_norm": 0.26145871289075223, + "learning_rate": 0.0004018072134423089, + "loss": 2.937617301940918, + "step": 11180, + "token_acc": 0.3069281785787036 + }, + { + "epoch": 6.554089709762533, + "grad_norm": 0.15296203546293735, + "learning_rate": 0.0004017879611704487, + "loss": 3.014613151550293, + "step": 11181, + "token_acc": 0.2958672185915268 + }, + { + "epoch": 6.554676048079742, + "grad_norm": 0.21084253731101335, + "learning_rate": 0.0004017687074727452, + "loss": 2.965714931488037, + "step": 11182, + "token_acc": 0.30383736771493247 + }, + { + "epoch": 6.555262386396951, + "grad_norm": 0.16989467738679165, + "learning_rate": 0.0004017494523493791, + "loss": 2.9914798736572266, + "step": 11183, + "token_acc": 0.2998882461895055 + }, + { + "epoch": 6.55584872471416, + "grad_norm": 0.22025874710734558, + "learning_rate": 0.00040173019580053143, + "loss": 2.9733872413635254, + "step": 11184, + "token_acc": 0.3017755655824702 + }, + { + "epoch": 6.556435063031369, + "grad_norm": 0.14963016294788, + "learning_rate": 0.00040171093782638307, + "loss": 2.986146926879883, + "step": 11185, + "token_acc": 0.29900933961224074 + }, + { + "epoch": 6.557021401348578, + "grad_norm": 0.18231632845402326, + "learning_rate": 0.0004016916784271148, + "loss": 2.9921412467956543, + "step": 11186, + "token_acc": 0.29869956215096616 + }, + { + "epoch": 6.557607739665787, + "grad_norm": 0.1561226368181726, + "learning_rate": 0.00040167241760290766, + "loss": 2.969301462173462, + "step": 11187, + "token_acc": 0.30307405757349665 + }, + { + "epoch": 6.558194077982996, + "grad_norm": 0.17722776522688904, + "learning_rate": 0.0004016531553539425, + "loss": 2.974203586578369, + "step": 11188, + "token_acc": 0.3020356508158759 + }, + { + "epoch": 6.5587804163002055, + "grad_norm": 0.15868407061233872, + "learning_rate": 0.00040163389168040045, + "loss": 2.9477462768554688, + "step": 11189, + "token_acc": 0.3060852729032874 + }, + { + "epoch": 6.559366754617415, + "grad_norm": 0.15926496747014127, + "learning_rate": 0.0004016146265824622, + "loss": 2.9946155548095703, + "step": 11190, + "token_acc": 0.2993076167425593 + }, + { + "epoch": 6.559953092934624, + "grad_norm": 0.1674568520768532, + "learning_rate": 0.0004015953600603088, + "loss": 2.961996555328369, + "step": 11191, + "token_acc": 0.30339379906134484 + }, + { + "epoch": 6.560539431251832, + "grad_norm": 0.17170312257271134, + "learning_rate": 0.00040157609211412135, + "loss": 2.9521708488464355, + "step": 11192, + "token_acc": 0.3043437301282001 + }, + { + "epoch": 6.561125769569041, + "grad_norm": 0.1530956888546429, + "learning_rate": 0.00040155682274408067, + "loss": 2.972043514251709, + "step": 11193, + "token_acc": 0.30137425095900683 + }, + { + "epoch": 6.56171210788625, + "grad_norm": 0.1557623495844131, + "learning_rate": 0.000401537551950368, + "loss": 2.961657762527466, + "step": 11194, + "token_acc": 0.30311488334714964 + }, + { + "epoch": 6.562298446203459, + "grad_norm": 0.16003021059210945, + "learning_rate": 0.0004015182797331641, + "loss": 2.968696117401123, + "step": 11195, + "token_acc": 0.301645058470508 + }, + { + "epoch": 6.562884784520668, + "grad_norm": 0.1786294270629262, + "learning_rate": 0.00040149900609265024, + "loss": 2.9579410552978516, + "step": 11196, + "token_acc": 0.30328853182864607 + }, + { + "epoch": 6.5634711228378775, + "grad_norm": 0.17763957090967267, + "learning_rate": 0.00040147973102900724, + "loss": 2.9871959686279297, + "step": 11197, + "token_acc": 0.2991581162175162 + }, + { + "epoch": 6.564057461155087, + "grad_norm": 0.18231735563094909, + "learning_rate": 0.0004014604545424164, + "loss": 2.9686074256896973, + "step": 11198, + "token_acc": 0.3019304284295718 + }, + { + "epoch": 6.564643799472296, + "grad_norm": 0.1737406443680257, + "learning_rate": 0.0004014411766330587, + "loss": 2.976677179336548, + "step": 11199, + "token_acc": 0.30153282140735743 + }, + { + "epoch": 6.565230137789505, + "grad_norm": 0.16011320529324602, + "learning_rate": 0.0004014218973011151, + "loss": 2.9515035152435303, + "step": 11200, + "token_acc": 0.3047002806683775 + }, + { + "epoch": 6.565816476106714, + "grad_norm": 0.1619442764634733, + "learning_rate": 0.0004014026165467669, + "loss": 2.975390911102295, + "step": 11201, + "token_acc": 0.3001782503076001 + }, + { + "epoch": 6.566402814423922, + "grad_norm": 0.16733250096343202, + "learning_rate": 0.00040138333437019516, + "loss": 2.935001850128174, + "step": 11202, + "token_acc": 0.3078750233060642 + }, + { + "epoch": 6.566989152741131, + "grad_norm": 0.1891730378653655, + "learning_rate": 0.00040136405077158087, + "loss": 2.9878616333007812, + "step": 11203, + "token_acc": 0.30076576964151047 + }, + { + "epoch": 6.56757549105834, + "grad_norm": 0.1659767218358766, + "learning_rate": 0.0004013447657511054, + "loss": 2.9527807235717773, + "step": 11204, + "token_acc": 0.3044030602647454 + }, + { + "epoch": 6.5681618293755495, + "grad_norm": 0.1532420207129692, + "learning_rate": 0.00040132547930894975, + "loss": 2.9237451553344727, + "step": 11205, + "token_acc": 0.30833082277577817 + }, + { + "epoch": 6.568748167692759, + "grad_norm": 0.1825513720403444, + "learning_rate": 0.00040130619144529515, + "loss": 2.9488725662231445, + "step": 11206, + "token_acc": 0.3040480561003648 + }, + { + "epoch": 6.569334506009968, + "grad_norm": 0.17243852168682142, + "learning_rate": 0.00040128690216032284, + "loss": 2.9678449630737305, + "step": 11207, + "token_acc": 0.3041058498130173 + }, + { + "epoch": 6.569920844327177, + "grad_norm": 0.17437718427260404, + "learning_rate": 0.00040126761145421383, + "loss": 2.988457679748535, + "step": 11208, + "token_acc": 0.3011408582849544 + }, + { + "epoch": 6.570507182644386, + "grad_norm": 0.18657848849176717, + "learning_rate": 0.00040124831932714946, + "loss": 2.971238136291504, + "step": 11209, + "token_acc": 0.3030426770532311 + }, + { + "epoch": 6.571093520961595, + "grad_norm": 0.19553339550398852, + "learning_rate": 0.000401229025779311, + "loss": 2.934677839279175, + "step": 11210, + "token_acc": 0.30813865152174025 + }, + { + "epoch": 6.571679859278804, + "grad_norm": 0.16242794869941907, + "learning_rate": 0.0004012097308108795, + "loss": 2.9933226108551025, + "step": 11211, + "token_acc": 0.29941702913203644 + }, + { + "epoch": 6.572266197596013, + "grad_norm": 0.18507054383700744, + "learning_rate": 0.0004011904344220365, + "loss": 2.94520902633667, + "step": 11212, + "token_acc": 0.3052172650779111 + }, + { + "epoch": 6.572852535913222, + "grad_norm": 0.20344307617890334, + "learning_rate": 0.000401171136612963, + "loss": 2.994338274002075, + "step": 11213, + "token_acc": 0.29907069999840874 + }, + { + "epoch": 6.573438874230431, + "grad_norm": 0.16494569677580434, + "learning_rate": 0.0004011518373838404, + "loss": 2.985337257385254, + "step": 11214, + "token_acc": 0.29951017292380117 + }, + { + "epoch": 6.57402521254764, + "grad_norm": 0.15881999450891515, + "learning_rate": 0.00040113253673484993, + "loss": 2.93871808052063, + "step": 11215, + "token_acc": 0.30591864333905777 + }, + { + "epoch": 6.574611550864849, + "grad_norm": 0.15664374797376182, + "learning_rate": 0.000401113234666173, + "loss": 2.9544553756713867, + "step": 11216, + "token_acc": 0.3051158600253619 + }, + { + "epoch": 6.575197889182058, + "grad_norm": 0.16009462209079725, + "learning_rate": 0.0004010939311779909, + "loss": 2.920405149459839, + "step": 11217, + "token_acc": 0.3092036139208881 + }, + { + "epoch": 6.575784227499267, + "grad_norm": 0.16481042129008847, + "learning_rate": 0.00040107462627048487, + "loss": 2.9739761352539062, + "step": 11218, + "token_acc": 0.30430224566151676 + }, + { + "epoch": 6.576370565816476, + "grad_norm": 0.15946977336207707, + "learning_rate": 0.0004010553199438363, + "loss": 2.931671380996704, + "step": 11219, + "token_acc": 0.30687955702915726 + }, + { + "epoch": 6.576956904133685, + "grad_norm": 0.16953680646510555, + "learning_rate": 0.00040103601219822644, + "loss": 3.0140914916992188, + "step": 11220, + "token_acc": 0.2957710557217473 + }, + { + "epoch": 6.577543242450894, + "grad_norm": 0.1701523026092201, + "learning_rate": 0.00040101670303383685, + "loss": 2.9883460998535156, + "step": 11221, + "token_acc": 0.2982902474290959 + }, + { + "epoch": 6.5781295807681035, + "grad_norm": 0.21875221446136267, + "learning_rate": 0.0004009973924508489, + "loss": 2.964977741241455, + "step": 11222, + "token_acc": 0.3023220927285236 + }, + { + "epoch": 6.578715919085313, + "grad_norm": 0.21434783057040763, + "learning_rate": 0.00040097808044944384, + "loss": 2.962202787399292, + "step": 11223, + "token_acc": 0.3050518474937752 + }, + { + "epoch": 6.579302257402521, + "grad_norm": 0.17424674084696992, + "learning_rate": 0.00040095876702980315, + "loss": 3.005999803543091, + "step": 11224, + "token_acc": 0.2980569820903087 + }, + { + "epoch": 6.57988859571973, + "grad_norm": 0.20087966181329328, + "learning_rate": 0.00040093945219210825, + "loss": 2.9673056602478027, + "step": 11225, + "token_acc": 0.303547943071513 + }, + { + "epoch": 6.580474934036939, + "grad_norm": 0.20435472173356972, + "learning_rate": 0.0004009201359365407, + "loss": 2.982630491256714, + "step": 11226, + "token_acc": 0.3000470207152373 + }, + { + "epoch": 6.581061272354148, + "grad_norm": 0.1805361586376264, + "learning_rate": 0.0004009008182632817, + "loss": 2.9506354331970215, + "step": 11227, + "token_acc": 0.30605738575983 + }, + { + "epoch": 6.581647610671357, + "grad_norm": 0.18641077785242474, + "learning_rate": 0.00040088149917251296, + "loss": 2.984947681427002, + "step": 11228, + "token_acc": 0.297964899658247 + }, + { + "epoch": 6.582233948988566, + "grad_norm": 0.18877877111446134, + "learning_rate": 0.0004008621786644159, + "loss": 2.963156223297119, + "step": 11229, + "token_acc": 0.30306955801812013 + }, + { + "epoch": 6.5828202873057755, + "grad_norm": 0.17334043146748826, + "learning_rate": 0.0004008428567391718, + "loss": 2.99684476852417, + "step": 11230, + "token_acc": 0.2968765642206427 + }, + { + "epoch": 6.583406625622985, + "grad_norm": 0.19024658869410907, + "learning_rate": 0.0004008235333969624, + "loss": 2.965780735015869, + "step": 11231, + "token_acc": 0.302976737244852 + }, + { + "epoch": 6.583992963940194, + "grad_norm": 0.19598266352144297, + "learning_rate": 0.0004008042086379692, + "loss": 2.9838173389434814, + "step": 11232, + "token_acc": 0.2983281730566809 + }, + { + "epoch": 6.584579302257403, + "grad_norm": 0.1542214787173342, + "learning_rate": 0.0004007848824623736, + "loss": 2.982515335083008, + "step": 11233, + "token_acc": 0.29947915993537966 + }, + { + "epoch": 6.585165640574612, + "grad_norm": 0.18870182275863476, + "learning_rate": 0.00040076555487035726, + "loss": 2.936087131500244, + "step": 11234, + "token_acc": 0.3077537258212572 + }, + { + "epoch": 6.585751978891821, + "grad_norm": 0.21990111832902462, + "learning_rate": 0.00040074622586210165, + "loss": 2.966492176055908, + "step": 11235, + "token_acc": 0.3015228320395091 + }, + { + "epoch": 6.586338317209029, + "grad_norm": 0.1730571691483246, + "learning_rate": 0.0004007268954377884, + "loss": 3.032069206237793, + "step": 11236, + "token_acc": 0.2932425221249204 + }, + { + "epoch": 6.586924655526238, + "grad_norm": 0.18292674111527507, + "learning_rate": 0.0004007075635975991, + "loss": 3.0040929317474365, + "step": 11237, + "token_acc": 0.2960183689947517 + }, + { + "epoch": 6.5875109938434475, + "grad_norm": 0.20896854429305198, + "learning_rate": 0.0004006882303417152, + "loss": 3.0218987464904785, + "step": 11238, + "token_acc": 0.29448428303390134 + }, + { + "epoch": 6.588097332160657, + "grad_norm": 0.1665642206983382, + "learning_rate": 0.0004006688956703186, + "loss": 2.9605095386505127, + "step": 11239, + "token_acc": 0.30383915256306154 + }, + { + "epoch": 6.588683670477866, + "grad_norm": 0.1900831292834334, + "learning_rate": 0.0004006495595835906, + "loss": 2.982273817062378, + "step": 11240, + "token_acc": 0.30070123355684086 + }, + { + "epoch": 6.589270008795075, + "grad_norm": 0.18985697433528803, + "learning_rate": 0.00040063022208171306, + "loss": 2.949173927307129, + "step": 11241, + "token_acc": 0.3054469130477095 + }, + { + "epoch": 6.589856347112284, + "grad_norm": 0.1748365561690017, + "learning_rate": 0.0004006108831648676, + "loss": 2.953469753265381, + "step": 11242, + "token_acc": 0.30341288545852646 + }, + { + "epoch": 6.590442685429493, + "grad_norm": 0.24564981606984573, + "learning_rate": 0.0004005915428332358, + "loss": 2.976010322570801, + "step": 11243, + "token_acc": 0.3004418755736133 + }, + { + "epoch": 6.591029023746702, + "grad_norm": 0.1952095042570715, + "learning_rate": 0.00040057220108699935, + "loss": 2.9934163093566895, + "step": 11244, + "token_acc": 0.29856175353019376 + }, + { + "epoch": 6.59161536206391, + "grad_norm": 0.1886815152948818, + "learning_rate": 0.00040055285792634, + "loss": 2.9731338024139404, + "step": 11245, + "token_acc": 0.3014417716490023 + }, + { + "epoch": 6.5922017003811195, + "grad_norm": 0.18511738942371814, + "learning_rate": 0.0004005335133514395, + "loss": 2.949902057647705, + "step": 11246, + "token_acc": 0.3046860694461157 + }, + { + "epoch": 6.592788038698329, + "grad_norm": 0.16832657512394414, + "learning_rate": 0.0004005141673624794, + "loss": 2.980968475341797, + "step": 11247, + "token_acc": 0.29957453908400766 + }, + { + "epoch": 6.593374377015538, + "grad_norm": 0.1962482673027451, + "learning_rate": 0.0004004948199596415, + "loss": 3.009364128112793, + "step": 11248, + "token_acc": 0.2956463709646841 + }, + { + "epoch": 6.593960715332747, + "grad_norm": 0.20120199114745718, + "learning_rate": 0.00040047547114310756, + "loss": 3.010697364807129, + "step": 11249, + "token_acc": 0.2961451307263162 + }, + { + "epoch": 6.594547053649956, + "grad_norm": 0.18916567753920846, + "learning_rate": 0.0004004561209130594, + "loss": 2.983368396759033, + "step": 11250, + "token_acc": 0.2992909686686876 + }, + { + "epoch": 6.595133391967165, + "grad_norm": 0.18504645649833162, + "learning_rate": 0.0004004367692696788, + "loss": 2.9660043716430664, + "step": 11251, + "token_acc": 0.3024168770846135 + }, + { + "epoch": 6.595719730284374, + "grad_norm": 0.1676705082216995, + "learning_rate": 0.0004004174162131473, + "loss": 2.9570086002349854, + "step": 11252, + "token_acc": 0.30335053828405356 + }, + { + "epoch": 6.596306068601583, + "grad_norm": 0.17593995390247955, + "learning_rate": 0.0004003980617436469, + "loss": 2.9862563610076904, + "step": 11253, + "token_acc": 0.29958512511560587 + }, + { + "epoch": 6.596892406918792, + "grad_norm": 0.1684482167915221, + "learning_rate": 0.0004003787058613594, + "loss": 2.964564800262451, + "step": 11254, + "token_acc": 0.30418020698745807 + }, + { + "epoch": 6.5974787452360015, + "grad_norm": 0.17990097223482265, + "learning_rate": 0.00040035934856646663, + "loss": 2.956902027130127, + "step": 11255, + "token_acc": 0.3033822786591511 + }, + { + "epoch": 6.598065083553211, + "grad_norm": 0.1869272391398067, + "learning_rate": 0.00040033998985915037, + "loss": 2.9767754077911377, + "step": 11256, + "token_acc": 0.3023617540521849 + }, + { + "epoch": 6.598651421870419, + "grad_norm": 0.16109429483886992, + "learning_rate": 0.00040032062973959247, + "loss": 2.9803686141967773, + "step": 11257, + "token_acc": 0.3001485845231431 + }, + { + "epoch": 6.599237760187628, + "grad_norm": 0.19020374212928276, + "learning_rate": 0.00040030126820797486, + "loss": 2.965989589691162, + "step": 11258, + "token_acc": 0.3026991210702201 + }, + { + "epoch": 6.599824098504837, + "grad_norm": 0.1687357345172246, + "learning_rate": 0.00040028190526447926, + "loss": 2.9698033332824707, + "step": 11259, + "token_acc": 0.3024019880008409 + }, + { + "epoch": 6.600410436822046, + "grad_norm": 0.17998202483812448, + "learning_rate": 0.0004002625409092878, + "loss": 2.955725908279419, + "step": 11260, + "token_acc": 0.3041121475263785 + }, + { + "epoch": 6.600996775139255, + "grad_norm": 0.15161831970961853, + "learning_rate": 0.00040024317514258224, + "loss": 3.0178322792053223, + "step": 11261, + "token_acc": 0.2966456159147805 + }, + { + "epoch": 6.601583113456464, + "grad_norm": 0.1652386523746586, + "learning_rate": 0.0004002238079645444, + "loss": 2.939131259918213, + "step": 11262, + "token_acc": 0.3057705107946202 + }, + { + "epoch": 6.6021694517736735, + "grad_norm": 0.1585342635997751, + "learning_rate": 0.0004002044393753564, + "loss": 2.970935583114624, + "step": 11263, + "token_acc": 0.30150134715904425 + }, + { + "epoch": 6.602755790090883, + "grad_norm": 0.17874758350722286, + "learning_rate": 0.00040018506937520003, + "loss": 2.94380521774292, + "step": 11264, + "token_acc": 0.3054671773689896 + }, + { + "epoch": 6.603342128408092, + "grad_norm": 0.21557875427627826, + "learning_rate": 0.00040016569796425737, + "loss": 2.965689182281494, + "step": 11265, + "token_acc": 0.3013336039893269 + }, + { + "epoch": 6.603928466725301, + "grad_norm": 0.3044733490108263, + "learning_rate": 0.0004001463251427103, + "loss": 2.961544990539551, + "step": 11266, + "token_acc": 0.30331916043084933 + }, + { + "epoch": 6.604514805042509, + "grad_norm": 0.27772725864653786, + "learning_rate": 0.0004001269509107409, + "loss": 2.975137710571289, + "step": 11267, + "token_acc": 0.302118156631256 + }, + { + "epoch": 6.605101143359718, + "grad_norm": 0.16136409679450592, + "learning_rate": 0.00040010757526853097, + "loss": 2.9820151329040527, + "step": 11268, + "token_acc": 0.2998230937761174 + }, + { + "epoch": 6.605687481676927, + "grad_norm": 0.19867519207947842, + "learning_rate": 0.0004000881982162627, + "loss": 2.951850175857544, + "step": 11269, + "token_acc": 0.30432672574687064 + }, + { + "epoch": 6.606273819994136, + "grad_norm": 0.1682162728476659, + "learning_rate": 0.000400068819754118, + "loss": 2.9683351516723633, + "step": 11270, + "token_acc": 0.3041029900793706 + }, + { + "epoch": 6.6068601583113455, + "grad_norm": 0.18361017807144261, + "learning_rate": 0.00040004943988227907, + "loss": 2.9812755584716797, + "step": 11271, + "token_acc": 0.30288669028134035 + }, + { + "epoch": 6.607446496628555, + "grad_norm": 0.1755034064134464, + "learning_rate": 0.00040003005860092777, + "loss": 2.9711644649505615, + "step": 11272, + "token_acc": 0.302183261231885 + }, + { + "epoch": 6.608032834945764, + "grad_norm": 0.15978620826140932, + "learning_rate": 0.0004000106759102463, + "loss": 2.944441080093384, + "step": 11273, + "token_acc": 0.3057343778620787 + }, + { + "epoch": 6.608619173262973, + "grad_norm": 0.17103776973949536, + "learning_rate": 0.0003999912918104166, + "loss": 2.9782581329345703, + "step": 11274, + "token_acc": 0.3027661953850904 + }, + { + "epoch": 6.609205511580182, + "grad_norm": 0.16473655633248444, + "learning_rate": 0.0003999719063016209, + "loss": 3.0013179779052734, + "step": 11275, + "token_acc": 0.2978080440398768 + }, + { + "epoch": 6.609791849897391, + "grad_norm": 0.16068811863320862, + "learning_rate": 0.0003999525193840412, + "loss": 2.9495091438293457, + "step": 11276, + "token_acc": 0.3057050760061086 + }, + { + "epoch": 6.6103781882146, + "grad_norm": 0.15847140481231592, + "learning_rate": 0.0003999331310578596, + "loss": 2.9653549194335938, + "step": 11277, + "token_acc": 0.3023673991106682 + }, + { + "epoch": 6.610964526531809, + "grad_norm": 0.15978898150342716, + "learning_rate": 0.0003999137413232583, + "loss": 3.0211563110351562, + "step": 11278, + "token_acc": 0.29484481441643673 + }, + { + "epoch": 6.6115508648490176, + "grad_norm": 0.1550866787487519, + "learning_rate": 0.0003998943501804194, + "loss": 2.962747812271118, + "step": 11279, + "token_acc": 0.3023202284563765 + }, + { + "epoch": 6.612137203166227, + "grad_norm": 0.16061791384692747, + "learning_rate": 0.00039987495762952514, + "loss": 2.9647843837738037, + "step": 11280, + "token_acc": 0.30258597922808683 + }, + { + "epoch": 6.612723541483436, + "grad_norm": 0.1576971320760154, + "learning_rate": 0.00039985556367075754, + "loss": 2.97267484664917, + "step": 11281, + "token_acc": 0.3016032958093889 + }, + { + "epoch": 6.613309879800645, + "grad_norm": 0.153657028608998, + "learning_rate": 0.00039983616830429887, + "loss": 2.990851402282715, + "step": 11282, + "token_acc": 0.29982219144488326 + }, + { + "epoch": 6.613896218117854, + "grad_norm": 0.15423272946294572, + "learning_rate": 0.0003998167715303313, + "loss": 2.9909074306488037, + "step": 11283, + "token_acc": 0.2995448837837695 + }, + { + "epoch": 6.614482556435063, + "grad_norm": 0.14871675841811052, + "learning_rate": 0.00039979737334903704, + "loss": 2.9922878742218018, + "step": 11284, + "token_acc": 0.30002985902275686 + }, + { + "epoch": 6.615068894752272, + "grad_norm": 0.15798849544513727, + "learning_rate": 0.00039977797376059834, + "loss": 2.9922990798950195, + "step": 11285, + "token_acc": 0.2968589975790332 + }, + { + "epoch": 6.615655233069481, + "grad_norm": 0.14606783816881663, + "learning_rate": 0.00039975857276519736, + "loss": 3.00759220123291, + "step": 11286, + "token_acc": 0.2978142003875588 + }, + { + "epoch": 6.6162415713866904, + "grad_norm": 0.17191826425833984, + "learning_rate": 0.0003997391703630164, + "loss": 3.011841297149658, + "step": 11287, + "token_acc": 0.29837340221888314 + }, + { + "epoch": 6.616827909703899, + "grad_norm": 0.16073184456246847, + "learning_rate": 0.0003997197665542377, + "loss": 2.995091438293457, + "step": 11288, + "token_acc": 0.29715104668869113 + }, + { + "epoch": 6.617414248021108, + "grad_norm": 0.1605085025934662, + "learning_rate": 0.0003997003613390436, + "loss": 2.9804296493530273, + "step": 11289, + "token_acc": 0.30143200354049443 + }, + { + "epoch": 6.618000586338317, + "grad_norm": 0.17013224521671716, + "learning_rate": 0.00039968095471761635, + "loss": 2.9749085903167725, + "step": 11290, + "token_acc": 0.3008406498200409 + }, + { + "epoch": 6.618586924655526, + "grad_norm": 0.15732755057500827, + "learning_rate": 0.0003996615466901381, + "loss": 2.9856176376342773, + "step": 11291, + "token_acc": 0.3006596010795866 + }, + { + "epoch": 6.619173262972735, + "grad_norm": 0.17827725954953713, + "learning_rate": 0.00039964213725679146, + "loss": 3.006697416305542, + "step": 11292, + "token_acc": 0.29841368442223337 + }, + { + "epoch": 6.619759601289944, + "grad_norm": 0.17991959864122428, + "learning_rate": 0.00039962272641775844, + "loss": 2.9707937240600586, + "step": 11293, + "token_acc": 0.3033506285655283 + }, + { + "epoch": 6.620345939607153, + "grad_norm": 0.26187656411690885, + "learning_rate": 0.00039960331417322157, + "loss": 2.956144332885742, + "step": 11294, + "token_acc": 0.3027453274024821 + }, + { + "epoch": 6.6209322779243625, + "grad_norm": 0.34432967288291105, + "learning_rate": 0.00039958390052336314, + "loss": 3.0120749473571777, + "step": 11295, + "token_acc": 0.2960285941223193 + }, + { + "epoch": 6.621518616241572, + "grad_norm": 0.27889487123375595, + "learning_rate": 0.0003995644854683655, + "loss": 2.992389678955078, + "step": 11296, + "token_acc": 0.2992779372002196 + }, + { + "epoch": 6.622104954558781, + "grad_norm": 0.1681474203128766, + "learning_rate": 0.00039954506900841114, + "loss": 2.958573818206787, + "step": 11297, + "token_acc": 0.3053966607790075 + }, + { + "epoch": 6.62269129287599, + "grad_norm": 0.21352704113694201, + "learning_rate": 0.00039952565114368234, + "loss": 2.9816384315490723, + "step": 11298, + "token_acc": 0.3016189448625607 + }, + { + "epoch": 6.623277631193199, + "grad_norm": 0.2529299917132346, + "learning_rate": 0.0003995062318743615, + "loss": 2.9975266456604004, + "step": 11299, + "token_acc": 0.30087965963496477 + }, + { + "epoch": 6.623863969510407, + "grad_norm": 0.16246424444462493, + "learning_rate": 0.0003994868112006312, + "loss": 2.994394302368164, + "step": 11300, + "token_acc": 0.2990252859144457 + }, + { + "epoch": 6.624450307827616, + "grad_norm": 0.2585178836100943, + "learning_rate": 0.0003994673891226737, + "loss": 2.9406607151031494, + "step": 11301, + "token_acc": 0.30620253450950125 + }, + { + "epoch": 6.625036646144825, + "grad_norm": 0.1674858029010497, + "learning_rate": 0.0003994479656406714, + "loss": 2.989301919937134, + "step": 11302, + "token_acc": 0.2991464284770487 + }, + { + "epoch": 6.6256229844620345, + "grad_norm": 0.23907903713222117, + "learning_rate": 0.00039942854075480683, + "loss": 2.9460976123809814, + "step": 11303, + "token_acc": 0.3050778546945721 + }, + { + "epoch": 6.626209322779244, + "grad_norm": 0.1733634729341798, + "learning_rate": 0.00039940911446526256, + "loss": 2.92620587348938, + "step": 11304, + "token_acc": 0.30871569048196595 + }, + { + "epoch": 6.626795661096453, + "grad_norm": 0.22544127734143735, + "learning_rate": 0.000399389686772221, + "loss": 2.966820240020752, + "step": 11305, + "token_acc": 0.30217138981989855 + }, + { + "epoch": 6.627381999413662, + "grad_norm": 0.1933680911373531, + "learning_rate": 0.0003993702576758646, + "loss": 2.931497097015381, + "step": 11306, + "token_acc": 0.30754804309498396 + }, + { + "epoch": 6.627968337730871, + "grad_norm": 0.18020681447448758, + "learning_rate": 0.00039935082717637593, + "loss": 2.980104446411133, + "step": 11307, + "token_acc": 0.30057539123762084 + }, + { + "epoch": 6.62855467604808, + "grad_norm": 0.1731775260199237, + "learning_rate": 0.00039933139527393744, + "loss": 2.969069719314575, + "step": 11308, + "token_acc": 0.30227127110111535 + }, + { + "epoch": 6.629141014365289, + "grad_norm": 0.22069848635505473, + "learning_rate": 0.0003993119619687318, + "loss": 2.9962916374206543, + "step": 11309, + "token_acc": 0.2969698691411143 + }, + { + "epoch": 6.629727352682497, + "grad_norm": 0.16130669577377058, + "learning_rate": 0.00039929252726094144, + "loss": 2.911189079284668, + "step": 11310, + "token_acc": 0.3115277452140392 + }, + { + "epoch": 6.6303136909997065, + "grad_norm": 0.2199124118643532, + "learning_rate": 0.00039927309115074896, + "loss": 2.9559974670410156, + "step": 11311, + "token_acc": 0.30275293082557053 + }, + { + "epoch": 6.630900029316916, + "grad_norm": 0.15738936091451544, + "learning_rate": 0.0003992536536383369, + "loss": 2.9630136489868164, + "step": 11312, + "token_acc": 0.30459864709167883 + }, + { + "epoch": 6.631486367634125, + "grad_norm": 0.2545698903081588, + "learning_rate": 0.00039923421472388786, + "loss": 2.9613256454467773, + "step": 11313, + "token_acc": 0.30383339391080416 + }, + { + "epoch": 6.632072705951334, + "grad_norm": 0.1536925509587512, + "learning_rate": 0.00039921477440758456, + "loss": 2.9535117149353027, + "step": 11314, + "token_acc": 0.30526684098035173 + }, + { + "epoch": 6.632659044268543, + "grad_norm": 0.2261107277589054, + "learning_rate": 0.0003991953326896095, + "loss": 2.9762346744537354, + "step": 11315, + "token_acc": 0.30294161193777297 + }, + { + "epoch": 6.633245382585752, + "grad_norm": 0.16426183098494554, + "learning_rate": 0.00039917588957014534, + "loss": 2.9472107887268066, + "step": 11316, + "token_acc": 0.3050770758300525 + }, + { + "epoch": 6.633831720902961, + "grad_norm": 0.2145499042313142, + "learning_rate": 0.0003991564450493747, + "loss": 3.020759344100952, + "step": 11317, + "token_acc": 0.2951953845672771 + }, + { + "epoch": 6.63441805922017, + "grad_norm": 0.17814728411850064, + "learning_rate": 0.00039913699912748026, + "loss": 2.9859585762023926, + "step": 11318, + "token_acc": 0.29976791794776475 + }, + { + "epoch": 6.635004397537379, + "grad_norm": 0.1879720329454861, + "learning_rate": 0.0003991175518046446, + "loss": 2.9675140380859375, + "step": 11319, + "token_acc": 0.30041296412374174 + }, + { + "epoch": 6.6355907358545885, + "grad_norm": 0.18202018110081905, + "learning_rate": 0.0003990981030810506, + "loss": 2.9443275928497314, + "step": 11320, + "token_acc": 0.30576355617603196 + }, + { + "epoch": 6.636177074171798, + "grad_norm": 0.19244164916234927, + "learning_rate": 0.0003990786529568807, + "loss": 2.973818302154541, + "step": 11321, + "token_acc": 0.3025326994870142 + }, + { + "epoch": 6.636763412489006, + "grad_norm": 0.16071303484187896, + "learning_rate": 0.0003990592014323179, + "loss": 2.925814628601074, + "step": 11322, + "token_acc": 0.30808422550354236 + }, + { + "epoch": 6.637349750806215, + "grad_norm": 0.1599602395163467, + "learning_rate": 0.00039903974850754464, + "loss": 2.975641965866089, + "step": 11323, + "token_acc": 0.3022494388342611 + }, + { + "epoch": 6.637936089123424, + "grad_norm": 0.18456231069785628, + "learning_rate": 0.00039902029418274385, + "loss": 2.9288320541381836, + "step": 11324, + "token_acc": 0.30712462275682284 + }, + { + "epoch": 6.638522427440633, + "grad_norm": 0.15320914130243188, + "learning_rate": 0.0003990008384580982, + "loss": 2.9629385471343994, + "step": 11325, + "token_acc": 0.3028232192031953 + }, + { + "epoch": 6.639108765757842, + "grad_norm": 0.166298481540386, + "learning_rate": 0.00039898138133379036, + "loss": 2.9879112243652344, + "step": 11326, + "token_acc": 0.3004662579261225 + }, + { + "epoch": 6.639695104075051, + "grad_norm": 0.15404862208584968, + "learning_rate": 0.0003989619228100033, + "loss": 2.9541945457458496, + "step": 11327, + "token_acc": 0.3045424972067499 + }, + { + "epoch": 6.6402814423922605, + "grad_norm": 0.18990610287037202, + "learning_rate": 0.0003989424628869196, + "loss": 2.95762038230896, + "step": 11328, + "token_acc": 0.30331468126305516 + }, + { + "epoch": 6.64086778070947, + "grad_norm": 0.17273292187125022, + "learning_rate": 0.0003989230015647223, + "loss": 2.966157913208008, + "step": 11329, + "token_acc": 0.30294152878250497 + }, + { + "epoch": 6.641454119026679, + "grad_norm": 0.1789580370560568, + "learning_rate": 0.00039890353884359397, + "loss": 2.9753592014312744, + "step": 11330, + "token_acc": 0.3034994968026816 + }, + { + "epoch": 6.642040457343887, + "grad_norm": 0.18401936389355647, + "learning_rate": 0.00039888407472371757, + "loss": 2.939253807067871, + "step": 11331, + "token_acc": 0.3071318396904656 + }, + { + "epoch": 6.642626795661096, + "grad_norm": 0.18840871853493116, + "learning_rate": 0.0003988646092052759, + "loss": 2.992238998413086, + "step": 11332, + "token_acc": 0.2986090598362731 + }, + { + "epoch": 6.643213133978305, + "grad_norm": 0.16661194799982856, + "learning_rate": 0.00039884514228845185, + "loss": 2.9246954917907715, + "step": 11333, + "token_acc": 0.3085244200928477 + }, + { + "epoch": 6.643799472295514, + "grad_norm": 0.1909322920754988, + "learning_rate": 0.0003988256739734283, + "loss": 2.978854179382324, + "step": 11334, + "token_acc": 0.30011719232981576 + }, + { + "epoch": 6.644385810612723, + "grad_norm": 0.17486995596637397, + "learning_rate": 0.00039880620426038804, + "loss": 3.017129898071289, + "step": 11335, + "token_acc": 0.29652975464280873 + }, + { + "epoch": 6.6449721489299325, + "grad_norm": 0.22381391879663706, + "learning_rate": 0.00039878673314951396, + "loss": 2.954921245574951, + "step": 11336, + "token_acc": 0.30441744892324685 + }, + { + "epoch": 6.645558487247142, + "grad_norm": 0.2687697638986463, + "learning_rate": 0.0003987672606409891, + "loss": 3.00222110748291, + "step": 11337, + "token_acc": 0.2976112735893403 + }, + { + "epoch": 6.646144825564351, + "grad_norm": 0.18437348364461953, + "learning_rate": 0.00039874778673499624, + "loss": 2.9416041374206543, + "step": 11338, + "token_acc": 0.3054880932768599 + }, + { + "epoch": 6.64673116388156, + "grad_norm": 0.2450197236217183, + "learning_rate": 0.0003987283114317184, + "loss": 2.9084627628326416, + "step": 11339, + "token_acc": 0.3123429881246578 + }, + { + "epoch": 6.647317502198769, + "grad_norm": 0.2848056883882818, + "learning_rate": 0.00039870883473133847, + "loss": 2.9422760009765625, + "step": 11340, + "token_acc": 0.3083463187110111 + }, + { + "epoch": 6.647903840515978, + "grad_norm": 0.15853114272354413, + "learning_rate": 0.00039868935663403944, + "loss": 2.988128662109375, + "step": 11341, + "token_acc": 0.2999840155049602 + }, + { + "epoch": 6.648490178833187, + "grad_norm": 0.2322339602601575, + "learning_rate": 0.0003986698771400042, + "loss": 2.994713306427002, + "step": 11342, + "token_acc": 0.29932864525260017 + }, + { + "epoch": 6.649076517150396, + "grad_norm": 0.15229090812477258, + "learning_rate": 0.0003986503962494159, + "loss": 2.975273847579956, + "step": 11343, + "token_acc": 0.30006314800129236 + }, + { + "epoch": 6.6496628554676045, + "grad_norm": 0.21826083613405795, + "learning_rate": 0.00039863091396245744, + "loss": 2.985178232192993, + "step": 11344, + "token_acc": 0.3009278958092726 + }, + { + "epoch": 6.650249193784814, + "grad_norm": 0.17413434387675664, + "learning_rate": 0.00039861143027931174, + "loss": 2.9989888668060303, + "step": 11345, + "token_acc": 0.2982934276241239 + }, + { + "epoch": 6.650835532102023, + "grad_norm": 0.19602224656811673, + "learning_rate": 0.00039859194520016196, + "loss": 3.0043396949768066, + "step": 11346, + "token_acc": 0.29739406679702446 + }, + { + "epoch": 6.651421870419232, + "grad_norm": 0.16917231433944352, + "learning_rate": 0.000398572458725191, + "loss": 2.951287269592285, + "step": 11347, + "token_acc": 0.3036410231111365 + }, + { + "epoch": 6.652008208736441, + "grad_norm": 0.22558505873386944, + "learning_rate": 0.0003985529708545821, + "loss": 2.990481376647949, + "step": 11348, + "token_acc": 0.299524301536374 + }, + { + "epoch": 6.65259454705365, + "grad_norm": 0.17242011803185772, + "learning_rate": 0.0003985334815885182, + "loss": 2.975827693939209, + "step": 11349, + "token_acc": 0.30318547107962357 + }, + { + "epoch": 6.653180885370859, + "grad_norm": 0.22068679908703276, + "learning_rate": 0.00039851399092718235, + "loss": 3.0046346187591553, + "step": 11350, + "token_acc": 0.29883451805955763 + }, + { + "epoch": 6.653767223688068, + "grad_norm": 0.18977190717218675, + "learning_rate": 0.00039849449887075774, + "loss": 2.9638400077819824, + "step": 11351, + "token_acc": 0.30285654847811105 + }, + { + "epoch": 6.654353562005277, + "grad_norm": 0.18960398653368563, + "learning_rate": 0.00039847500541942725, + "loss": 2.977375030517578, + "step": 11352, + "token_acc": 0.30059698698381165 + }, + { + "epoch": 6.654939900322486, + "grad_norm": 0.17752790292338771, + "learning_rate": 0.00039845551057337436, + "loss": 3.014984607696533, + "step": 11353, + "token_acc": 0.29502940448675086 + }, + { + "epoch": 6.655526238639695, + "grad_norm": 0.20052522966708472, + "learning_rate": 0.0003984360143327819, + "loss": 2.9672741889953613, + "step": 11354, + "token_acc": 0.3029964262331566 + }, + { + "epoch": 6.656112576956904, + "grad_norm": 0.16946088297302775, + "learning_rate": 0.00039841651669783314, + "loss": 3.0039467811584473, + "step": 11355, + "token_acc": 0.29748688765885106 + }, + { + "epoch": 6.656698915274113, + "grad_norm": 0.25410664578718156, + "learning_rate": 0.00039839701766871115, + "loss": 2.969050168991089, + "step": 11356, + "token_acc": 0.3025026349720288 + }, + { + "epoch": 6.657285253591322, + "grad_norm": 0.1697919890376909, + "learning_rate": 0.00039837751724559916, + "loss": 2.998368501663208, + "step": 11357, + "token_acc": 0.2967860742708972 + }, + { + "epoch": 6.657871591908531, + "grad_norm": 0.20422285579647964, + "learning_rate": 0.00039835801542868033, + "loss": 2.9559884071350098, + "step": 11358, + "token_acc": 0.30283790567876445 + }, + { + "epoch": 6.65845793022574, + "grad_norm": 0.20112100628267332, + "learning_rate": 0.00039833851221813795, + "loss": 2.990658760070801, + "step": 11359, + "token_acc": 0.3000247181592494 + }, + { + "epoch": 6.659044268542949, + "grad_norm": 0.17427786204975684, + "learning_rate": 0.0003983190076141551, + "loss": 2.9911274909973145, + "step": 11360, + "token_acc": 0.2997285789762662 + }, + { + "epoch": 6.6596306068601585, + "grad_norm": 0.17571834224115132, + "learning_rate": 0.00039829950161691496, + "loss": 2.9850687980651855, + "step": 11361, + "token_acc": 0.30093128226413135 + }, + { + "epoch": 6.660216945177368, + "grad_norm": 0.16404291267475324, + "learning_rate": 0.00039827999422660087, + "loss": 2.948847532272339, + "step": 11362, + "token_acc": 0.30450124049935023 + }, + { + "epoch": 6.660803283494577, + "grad_norm": 0.15501037350033997, + "learning_rate": 0.00039826048544339604, + "loss": 2.962644100189209, + "step": 11363, + "token_acc": 0.30116452484607986 + }, + { + "epoch": 6.661389621811786, + "grad_norm": 0.16210604216854, + "learning_rate": 0.00039824097526748375, + "loss": 3.0023744106292725, + "step": 11364, + "token_acc": 0.29782760279563036 + }, + { + "epoch": 6.661975960128994, + "grad_norm": 0.16028527562722358, + "learning_rate": 0.0003982214636990473, + "loss": 2.9896163940429688, + "step": 11365, + "token_acc": 0.3004510015715946 + }, + { + "epoch": 6.662562298446203, + "grad_norm": 0.16284612716906804, + "learning_rate": 0.0003982019507382698, + "loss": 2.963650703430176, + "step": 11366, + "token_acc": 0.3016701254575262 + }, + { + "epoch": 6.663148636763412, + "grad_norm": 0.1600422740184805, + "learning_rate": 0.0003981824363853348, + "loss": 2.9712648391723633, + "step": 11367, + "token_acc": 0.30307501104585516 + }, + { + "epoch": 6.663734975080621, + "grad_norm": 0.17231644150423228, + "learning_rate": 0.00039816292064042547, + "loss": 2.9800655841827393, + "step": 11368, + "token_acc": 0.3003442501178118 + }, + { + "epoch": 6.6643213133978305, + "grad_norm": 0.1558288489373433, + "learning_rate": 0.0003981434035037251, + "loss": 2.972364902496338, + "step": 11369, + "token_acc": 0.30436563779385123 + }, + { + "epoch": 6.66490765171504, + "grad_norm": 0.1554496406919713, + "learning_rate": 0.0003981238849754171, + "loss": 2.961440324783325, + "step": 11370, + "token_acc": 0.30170340214878 + }, + { + "epoch": 6.665493990032249, + "grad_norm": 0.1633196544310385, + "learning_rate": 0.00039810436505568483, + "loss": 2.980769634246826, + "step": 11371, + "token_acc": 0.30040120489649424 + }, + { + "epoch": 6.666080328349458, + "grad_norm": 0.1472713446337543, + "learning_rate": 0.0003980848437447116, + "loss": 2.9376120567321777, + "step": 11372, + "token_acc": 0.30590338516151205 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.15996808334395465, + "learning_rate": 0.0003980653210426808, + "loss": 2.995889663696289, + "step": 11373, + "token_acc": 0.30017272119456206 + }, + { + "epoch": 6.667253004983876, + "grad_norm": 0.16260672518631145, + "learning_rate": 0.0003980457969497759, + "loss": 2.931584358215332, + "step": 11374, + "token_acc": 0.3080101840462079 + }, + { + "epoch": 6.667839343301084, + "grad_norm": 0.14881729636078642, + "learning_rate": 0.00039802627146618016, + "loss": 2.9780993461608887, + "step": 11375, + "token_acc": 0.30363366045376367 + }, + { + "epoch": 6.668425681618293, + "grad_norm": 0.15577388953062649, + "learning_rate": 0.0003980067445920771, + "loss": 3.023399591445923, + "step": 11376, + "token_acc": 0.29595491513200944 + }, + { + "epoch": 6.6690120199355025, + "grad_norm": 0.17539517943152408, + "learning_rate": 0.00039798721632765006, + "loss": 2.9995086193084717, + "step": 11377, + "token_acc": 0.2980259470466337 + }, + { + "epoch": 6.669598358252712, + "grad_norm": 0.19779285818013395, + "learning_rate": 0.0003979676866730826, + "loss": 3.0004284381866455, + "step": 11378, + "token_acc": 0.29822697133976744 + }, + { + "epoch": 6.670184696569921, + "grad_norm": 0.22744218841690456, + "learning_rate": 0.0003979481556285581, + "loss": 2.9864578247070312, + "step": 11379, + "token_acc": 0.29997855614889357 + }, + { + "epoch": 6.67077103488713, + "grad_norm": 0.21881702236564107, + "learning_rate": 0.00039792862319426006, + "loss": 2.97074294090271, + "step": 11380, + "token_acc": 0.3022918260140806 + }, + { + "epoch": 6.671357373204339, + "grad_norm": 0.20744695640484034, + "learning_rate": 0.00039790908937037184, + "loss": 2.9663262367248535, + "step": 11381, + "token_acc": 0.3029587826313757 + }, + { + "epoch": 6.671943711521548, + "grad_norm": 0.16986284935226043, + "learning_rate": 0.0003978895541570771, + "loss": 2.9617903232574463, + "step": 11382, + "token_acc": 0.303431661524406 + }, + { + "epoch": 6.672530049838757, + "grad_norm": 0.1561473548659989, + "learning_rate": 0.0003978700175545593, + "loss": 2.980672836303711, + "step": 11383, + "token_acc": 0.2999207846790727 + }, + { + "epoch": 6.673116388155966, + "grad_norm": 0.2211981636040716, + "learning_rate": 0.00039785047956300193, + "loss": 2.962477207183838, + "step": 11384, + "token_acc": 0.3033675769972353 + }, + { + "epoch": 6.673702726473175, + "grad_norm": 0.17076739641469774, + "learning_rate": 0.0003978309401825886, + "loss": 2.986905574798584, + "step": 11385, + "token_acc": 0.2995022589375646 + }, + { + "epoch": 6.6742890647903845, + "grad_norm": 0.16765755074504907, + "learning_rate": 0.00039781139941350263, + "loss": 2.9670209884643555, + "step": 11386, + "token_acc": 0.30220433478415837 + }, + { + "epoch": 6.674875403107593, + "grad_norm": 0.24438881204459117, + "learning_rate": 0.00039779185725592785, + "loss": 2.9719505310058594, + "step": 11387, + "token_acc": 0.30192105661966984 + }, + { + "epoch": 6.675461741424802, + "grad_norm": 0.1833358863087755, + "learning_rate": 0.00039777231371004775, + "loss": 2.988298177719116, + "step": 11388, + "token_acc": 0.2985937907644143 + }, + { + "epoch": 6.676048079742011, + "grad_norm": 0.16750504086245546, + "learning_rate": 0.00039775276877604583, + "loss": 2.994284152984619, + "step": 11389, + "token_acc": 0.2981171602493125 + }, + { + "epoch": 6.67663441805922, + "grad_norm": 0.24000613632202555, + "learning_rate": 0.0003977332224541058, + "loss": 2.9887194633483887, + "step": 11390, + "token_acc": 0.3011829783156135 + }, + { + "epoch": 6.677220756376429, + "grad_norm": 0.1452197601517279, + "learning_rate": 0.0003977136747444112, + "loss": 2.9103033542633057, + "step": 11391, + "token_acc": 0.3108374664775257 + }, + { + "epoch": 6.677807094693638, + "grad_norm": 0.2366979988530175, + "learning_rate": 0.0003976941256471456, + "loss": 2.986128091812134, + "step": 11392, + "token_acc": 0.2986735725789623 + }, + { + "epoch": 6.678393433010847, + "grad_norm": 0.17994698541319323, + "learning_rate": 0.00039767457516249276, + "loss": 2.9973793029785156, + "step": 11393, + "token_acc": 0.2990064299287857 + }, + { + "epoch": 6.6789797713280565, + "grad_norm": 0.16438750332071767, + "learning_rate": 0.00039765502329063636, + "loss": 2.9811553955078125, + "step": 11394, + "token_acc": 0.30108103278834986 + }, + { + "epoch": 6.679566109645266, + "grad_norm": 0.1549691251026536, + "learning_rate": 0.0003976354700317599, + "loss": 2.9553847312927246, + "step": 11395, + "token_acc": 0.3041669602545271 + }, + { + "epoch": 6.680152447962474, + "grad_norm": 0.17816800106733896, + "learning_rate": 0.0003976159153860471, + "loss": 2.9815449714660645, + "step": 11396, + "token_acc": 0.30144271720613286 + }, + { + "epoch": 6.680738786279683, + "grad_norm": 0.2013883068736934, + "learning_rate": 0.00039759635935368175, + "loss": 2.9550509452819824, + "step": 11397, + "token_acc": 0.30500478985034735 + }, + { + "epoch": 6.681325124596892, + "grad_norm": 0.1561848370858565, + "learning_rate": 0.0003975768019348475, + "loss": 3.0009632110595703, + "step": 11398, + "token_acc": 0.2975030642069204 + }, + { + "epoch": 6.681911462914101, + "grad_norm": 0.2138414501531232, + "learning_rate": 0.0003975572431297281, + "loss": 3.0197954177856445, + "step": 11399, + "token_acc": 0.29623594893372707 + }, + { + "epoch": 6.68249780123131, + "grad_norm": 0.19750749170737644, + "learning_rate": 0.0003975376829385071, + "loss": 2.9644269943237305, + "step": 11400, + "token_acc": 0.30316190879720356 + }, + { + "epoch": 6.683084139548519, + "grad_norm": 0.1834038092913836, + "learning_rate": 0.00039751812136136846, + "loss": 2.9684314727783203, + "step": 11401, + "token_acc": 0.30197478591694943 + }, + { + "epoch": 6.6836704778657285, + "grad_norm": 0.225212574135076, + "learning_rate": 0.00039749855839849593, + "loss": 2.990743637084961, + "step": 11402, + "token_acc": 0.30024662055665785 + }, + { + "epoch": 6.684256816182938, + "grad_norm": 0.15930740786059144, + "learning_rate": 0.0003974789940500731, + "loss": 2.979135036468506, + "step": 11403, + "token_acc": 0.302282593547946 + }, + { + "epoch": 6.684843154500147, + "grad_norm": 0.2571862811236799, + "learning_rate": 0.0003974594283162839, + "loss": 2.9956653118133545, + "step": 11404, + "token_acc": 0.2995337103409694 + }, + { + "epoch": 6.685429492817356, + "grad_norm": 0.2809083945566357, + "learning_rate": 0.000397439861197312, + "loss": 2.9527673721313477, + "step": 11405, + "token_acc": 0.30408394811007955 + }, + { + "epoch": 6.686015831134565, + "grad_norm": 0.1785346315351098, + "learning_rate": 0.00039742029269334135, + "loss": 3.017627716064453, + "step": 11406, + "token_acc": 0.29444740785897533 + }, + { + "epoch": 6.686602169451774, + "grad_norm": 0.2896679081758272, + "learning_rate": 0.00039740072280455564, + "loss": 2.9676661491394043, + "step": 11407, + "token_acc": 0.3033405242180884 + }, + { + "epoch": 6.687188507768982, + "grad_norm": 0.15761621473427762, + "learning_rate": 0.00039738115153113886, + "loss": 2.9629921913146973, + "step": 11408, + "token_acc": 0.30046340868538274 + }, + { + "epoch": 6.687774846086191, + "grad_norm": 0.18653703652886608, + "learning_rate": 0.00039736157887327467, + "loss": 2.962462902069092, + "step": 11409, + "token_acc": 0.301782415447265 + }, + { + "epoch": 6.6883611844034006, + "grad_norm": 0.15088859435176058, + "learning_rate": 0.00039734200483114706, + "loss": 2.993987560272217, + "step": 11410, + "token_acc": 0.29966419296397084 + }, + { + "epoch": 6.68894752272061, + "grad_norm": 0.1656135642445275, + "learning_rate": 0.00039732242940493986, + "loss": 2.950441837310791, + "step": 11411, + "token_acc": 0.3054687846016366 + }, + { + "epoch": 6.689533861037819, + "grad_norm": 0.1504809502385022, + "learning_rate": 0.0003973028525948369, + "loss": 2.975975751876831, + "step": 11412, + "token_acc": 0.30017452921661014 + }, + { + "epoch": 6.690120199355028, + "grad_norm": 0.15898046842313324, + "learning_rate": 0.00039728327440102226, + "loss": 2.977590560913086, + "step": 11413, + "token_acc": 0.29912872859779027 + }, + { + "epoch": 6.690706537672237, + "grad_norm": 0.14026560960836, + "learning_rate": 0.0003972636948236796, + "loss": 2.970689058303833, + "step": 11414, + "token_acc": 0.30247910090177166 + }, + { + "epoch": 6.691292875989446, + "grad_norm": 0.1626919012453347, + "learning_rate": 0.00039724411386299303, + "loss": 2.990302085876465, + "step": 11415, + "token_acc": 0.2994936007594644 + }, + { + "epoch": 6.691879214306655, + "grad_norm": 0.15417430579077235, + "learning_rate": 0.0003972245315191464, + "loss": 2.999255418777466, + "step": 11416, + "token_acc": 0.2988714513846722 + }, + { + "epoch": 6.692465552623864, + "grad_norm": 0.15001454997459318, + "learning_rate": 0.0003972049477923237, + "loss": 2.952442169189453, + "step": 11417, + "token_acc": 0.305041956430718 + }, + { + "epoch": 6.693051890941073, + "grad_norm": 0.1615351487815618, + "learning_rate": 0.0003971853626827089, + "loss": 3.0144004821777344, + "step": 11418, + "token_acc": 0.2963099640323229 + }, + { + "epoch": 6.693638229258282, + "grad_norm": 0.16516916460729322, + "learning_rate": 0.0003971657761904859, + "loss": 3.0056591033935547, + "step": 11419, + "token_acc": 0.29687299241937554 + }, + { + "epoch": 6.694224567575491, + "grad_norm": 0.14095426621359747, + "learning_rate": 0.0003971461883158387, + "loss": 2.9287383556365967, + "step": 11420, + "token_acc": 0.3088137437154083 + }, + { + "epoch": 6.6948109058927, + "grad_norm": 0.17412852894989478, + "learning_rate": 0.00039712659905895146, + "loss": 2.952803611755371, + "step": 11421, + "token_acc": 0.3043351546356564 + }, + { + "epoch": 6.695397244209909, + "grad_norm": 0.1768829543387441, + "learning_rate": 0.000397107008420008, + "loss": 3.008906602859497, + "step": 11422, + "token_acc": 0.2972392989219714 + }, + { + "epoch": 6.695983582527118, + "grad_norm": 0.1868379971798176, + "learning_rate": 0.00039708741639919243, + "loss": 2.9618592262268066, + "step": 11423, + "token_acc": 0.3016537881880913 + }, + { + "epoch": 6.696569920844327, + "grad_norm": 0.20008529600607577, + "learning_rate": 0.0003970678229966889, + "loss": 2.927588701248169, + "step": 11424, + "token_acc": 0.30641786307913266 + }, + { + "epoch": 6.697156259161536, + "grad_norm": 0.17699882457841284, + "learning_rate": 0.00039704822821268117, + "loss": 2.9790148735046387, + "step": 11425, + "token_acc": 0.29897127426445186 + }, + { + "epoch": 6.6977425974787455, + "grad_norm": 0.16613557546879118, + "learning_rate": 0.0003970286320473536, + "loss": 2.9229955673217773, + "step": 11426, + "token_acc": 0.3096192537109207 + }, + { + "epoch": 6.698328935795955, + "grad_norm": 0.20456499536036857, + "learning_rate": 0.0003970090345008901, + "loss": 3.0020337104797363, + "step": 11427, + "token_acc": 0.2974807914511389 + }, + { + "epoch": 6.698915274113164, + "grad_norm": 0.16381672051835902, + "learning_rate": 0.0003969894355734749, + "loss": 2.98104190826416, + "step": 11428, + "token_acc": 0.29981165754315425 + }, + { + "epoch": 6.699501612430373, + "grad_norm": 0.17656741880838678, + "learning_rate": 0.00039696983526529194, + "loss": 2.944035053253174, + "step": 11429, + "token_acc": 0.3049011902669052 + }, + { + "epoch": 6.700087950747581, + "grad_norm": 0.1954456924887954, + "learning_rate": 0.0003969502335765254, + "loss": 2.976634979248047, + "step": 11430, + "token_acc": 0.30155167975119035 + }, + { + "epoch": 6.70067428906479, + "grad_norm": 0.17754278379781377, + "learning_rate": 0.0003969306305073595, + "loss": 2.956234931945801, + "step": 11431, + "token_acc": 0.3039499824391604 + }, + { + "epoch": 6.701260627381999, + "grad_norm": 0.16416145261026252, + "learning_rate": 0.0003969110260579783, + "loss": 2.9606378078460693, + "step": 11432, + "token_acc": 0.3047588484629682 + }, + { + "epoch": 6.701846965699208, + "grad_norm": 0.19104054260369127, + "learning_rate": 0.00039689142022856594, + "loss": 2.939675807952881, + "step": 11433, + "token_acc": 0.3061887261140012 + }, + { + "epoch": 6.7024333040164175, + "grad_norm": 0.21902285019659587, + "learning_rate": 0.0003968718130193067, + "loss": 2.9641435146331787, + "step": 11434, + "token_acc": 0.3026163738934081 + }, + { + "epoch": 6.703019642333627, + "grad_norm": 0.1963317144635166, + "learning_rate": 0.0003968522044303846, + "loss": 3.012782096862793, + "step": 11435, + "token_acc": 0.2969321231233732 + }, + { + "epoch": 6.703605980650836, + "grad_norm": 0.1579879993421375, + "learning_rate": 0.000396832594461984, + "loss": 2.9762790203094482, + "step": 11436, + "token_acc": 0.30051967680657565 + }, + { + "epoch": 6.704192318968045, + "grad_norm": 0.15545930764846078, + "learning_rate": 0.00039681298311428905, + "loss": 2.9659769535064697, + "step": 11437, + "token_acc": 0.30327344823748353 + }, + { + "epoch": 6.704778657285254, + "grad_norm": 0.15206612055060373, + "learning_rate": 0.00039679337038748386, + "loss": 2.955127000808716, + "step": 11438, + "token_acc": 0.3046055147215023 + }, + { + "epoch": 6.705364995602462, + "grad_norm": 0.16162466106772894, + "learning_rate": 0.00039677375628175283, + "loss": 2.95009446144104, + "step": 11439, + "token_acc": 0.30425929773530025 + }, + { + "epoch": 6.705951333919671, + "grad_norm": 0.167058006112606, + "learning_rate": 0.0003967541407972801, + "loss": 2.9866719245910645, + "step": 11440, + "token_acc": 0.2992782761957234 + }, + { + "epoch": 6.70653767223688, + "grad_norm": 0.15972261804088006, + "learning_rate": 0.00039673452393425003, + "loss": 2.982419013977051, + "step": 11441, + "token_acc": 0.3008372280169612 + }, + { + "epoch": 6.7071240105540895, + "grad_norm": 0.20851651944220445, + "learning_rate": 0.0003967149056928468, + "loss": 2.9721455574035645, + "step": 11442, + "token_acc": 0.29992864935293234 + }, + { + "epoch": 6.707710348871299, + "grad_norm": 0.18897336229577827, + "learning_rate": 0.0003966952860732548, + "loss": 2.9999399185180664, + "step": 11443, + "token_acc": 0.29903194231626634 + }, + { + "epoch": 6.708296687188508, + "grad_norm": 0.16063842874759024, + "learning_rate": 0.00039667566507565815, + "loss": 2.998539447784424, + "step": 11444, + "token_acc": 0.29909668785547006 + }, + { + "epoch": 6.708883025505717, + "grad_norm": 0.1648507483099146, + "learning_rate": 0.0003966560427002413, + "loss": 2.9928231239318848, + "step": 11445, + "token_acc": 0.2979166453487777 + }, + { + "epoch": 6.709469363822926, + "grad_norm": 0.15133694221869654, + "learning_rate": 0.0003966364189471886, + "loss": 3.037770986557007, + "step": 11446, + "token_acc": 0.29101974974478334 + }, + { + "epoch": 6.710055702140135, + "grad_norm": 0.1604097305553905, + "learning_rate": 0.00039661679381668423, + "loss": 2.9558980464935303, + "step": 11447, + "token_acc": 0.3039214432752426 + }, + { + "epoch": 6.710642040457344, + "grad_norm": 0.15208083009011727, + "learning_rate": 0.0003965971673089128, + "loss": 2.977978229522705, + "step": 11448, + "token_acc": 0.3009759593458857 + }, + { + "epoch": 6.711228378774553, + "grad_norm": 0.16365484244495107, + "learning_rate": 0.0003965775394240585, + "loss": 2.959310531616211, + "step": 11449, + "token_acc": 0.3042475737738576 + }, + { + "epoch": 6.711814717091762, + "grad_norm": 0.16650873266645058, + "learning_rate": 0.0003965579101623056, + "loss": 2.9658875465393066, + "step": 11450, + "token_acc": 0.30290446477833366 + }, + { + "epoch": 6.7124010554089715, + "grad_norm": 0.16173322458223713, + "learning_rate": 0.00039653827952383874, + "loss": 2.989811897277832, + "step": 11451, + "token_acc": 0.30018939636635766 + }, + { + "epoch": 6.71298739372618, + "grad_norm": 0.16087964543478805, + "learning_rate": 0.00039651864750884217, + "loss": 2.979311466217041, + "step": 11452, + "token_acc": 0.2996406480977204 + }, + { + "epoch": 6.713573732043389, + "grad_norm": 0.2318406177786617, + "learning_rate": 0.0003964990141175004, + "loss": 2.996365785598755, + "step": 11453, + "token_acc": 0.29877736079296663 + }, + { + "epoch": 6.714160070360598, + "grad_norm": 0.2574949755184213, + "learning_rate": 0.00039647937934999774, + "loss": 2.964022636413574, + "step": 11454, + "token_acc": 0.3014292349545485 + }, + { + "epoch": 6.714746408677807, + "grad_norm": 0.266753159723662, + "learning_rate": 0.0003964597432065187, + "loss": 2.981719493865967, + "step": 11455, + "token_acc": 0.3006012913534669 + }, + { + "epoch": 6.715332746995016, + "grad_norm": 0.16797089260512005, + "learning_rate": 0.00039644010568724776, + "loss": 2.9960997104644775, + "step": 11456, + "token_acc": 0.2992775921149497 + }, + { + "epoch": 6.715919085312225, + "grad_norm": 0.25923051664864993, + "learning_rate": 0.00039642046679236933, + "loss": 2.977847099304199, + "step": 11457, + "token_acc": 0.30059645500265775 + }, + { + "epoch": 6.716505423629434, + "grad_norm": 0.2558621824881665, + "learning_rate": 0.0003964008265220679, + "loss": 2.955122470855713, + "step": 11458, + "token_acc": 0.3035966593830851 + }, + { + "epoch": 6.7170917619466435, + "grad_norm": 0.15510102955938807, + "learning_rate": 0.000396381184876528, + "loss": 2.915142059326172, + "step": 11459, + "token_acc": 0.309123970478126 + }, + { + "epoch": 6.717678100263853, + "grad_norm": 0.19013991032479824, + "learning_rate": 0.0003963615418559341, + "loss": 2.975236415863037, + "step": 11460, + "token_acc": 0.3014166253617511 + }, + { + "epoch": 6.718264438581061, + "grad_norm": 0.15361337894880858, + "learning_rate": 0.00039634189746047077, + "loss": 2.9246745109558105, + "step": 11461, + "token_acc": 0.30814509271631607 + }, + { + "epoch": 6.71885077689827, + "grad_norm": 0.1686916331906177, + "learning_rate": 0.00039632225169032256, + "loss": 2.9644534587860107, + "step": 11462, + "token_acc": 0.30389152218952553 + }, + { + "epoch": 6.719437115215479, + "grad_norm": 0.1792400260994154, + "learning_rate": 0.0003963026045456739, + "loss": 3.0062899589538574, + "step": 11463, + "token_acc": 0.29789639279982927 + }, + { + "epoch": 6.720023453532688, + "grad_norm": 0.17703563605682096, + "learning_rate": 0.0003962829560267094, + "loss": 2.9580507278442383, + "step": 11464, + "token_acc": 0.30488392704429895 + }, + { + "epoch": 6.720609791849897, + "grad_norm": 0.1700551582338296, + "learning_rate": 0.0003962633061336137, + "loss": 2.997164487838745, + "step": 11465, + "token_acc": 0.2995014188765867 + }, + { + "epoch": 6.721196130167106, + "grad_norm": 0.16480213493276874, + "learning_rate": 0.0003962436548665713, + "loss": 2.9949779510498047, + "step": 11466, + "token_acc": 0.2983558091418955 + }, + { + "epoch": 6.7217824684843155, + "grad_norm": 0.18174588841154093, + "learning_rate": 0.0003962240022257668, + "loss": 2.96295166015625, + "step": 11467, + "token_acc": 0.3032173910733404 + }, + { + "epoch": 6.722368806801525, + "grad_norm": 0.16523648525800896, + "learning_rate": 0.0003962043482113849, + "loss": 2.9636545181274414, + "step": 11468, + "token_acc": 0.30239857535340375 + }, + { + "epoch": 6.722955145118734, + "grad_norm": 0.1743742593603934, + "learning_rate": 0.0003961846928236101, + "loss": 2.9745452404022217, + "step": 11469, + "token_acc": 0.30285229301726 + }, + { + "epoch": 6.723541483435943, + "grad_norm": 0.21039738554492693, + "learning_rate": 0.00039616503606262714, + "loss": 2.9557456970214844, + "step": 11470, + "token_acc": 0.304314615123251 + }, + { + "epoch": 6.724127821753152, + "grad_norm": 0.17541357991042905, + "learning_rate": 0.0003961453779286206, + "loss": 2.9444963932037354, + "step": 11471, + "token_acc": 0.30677319648376605 + }, + { + "epoch": 6.724714160070361, + "grad_norm": 0.17228360511669832, + "learning_rate": 0.00039612571842177524, + "loss": 2.9841527938842773, + "step": 11472, + "token_acc": 0.3007851984619072 + }, + { + "epoch": 6.725300498387569, + "grad_norm": 0.158510852369442, + "learning_rate": 0.0003961060575422756, + "loss": 2.9433116912841797, + "step": 11473, + "token_acc": 0.30456452566665726 + }, + { + "epoch": 6.725886836704778, + "grad_norm": 0.16876547034241407, + "learning_rate": 0.00039608639529030643, + "loss": 2.928025960922241, + "step": 11474, + "token_acc": 0.3088698306751006 + }, + { + "epoch": 6.7264731750219875, + "grad_norm": 0.15773256534309038, + "learning_rate": 0.00039606673166605244, + "loss": 2.9357941150665283, + "step": 11475, + "token_acc": 0.30644267155149557 + }, + { + "epoch": 6.727059513339197, + "grad_norm": 0.16713640074407096, + "learning_rate": 0.00039604706666969837, + "loss": 2.9290642738342285, + "step": 11476, + "token_acc": 0.3068775673430087 + }, + { + "epoch": 6.727645851656406, + "grad_norm": 0.19725760102992, + "learning_rate": 0.00039602740030142885, + "loss": 3.0053625106811523, + "step": 11477, + "token_acc": 0.2973092613258336 + }, + { + "epoch": 6.728232189973615, + "grad_norm": 0.20321797163338054, + "learning_rate": 0.0003960077325614287, + "loss": 2.948953628540039, + "step": 11478, + "token_acc": 0.3040985270797834 + }, + { + "epoch": 6.728818528290824, + "grad_norm": 0.17178196383917113, + "learning_rate": 0.00039598806344988267, + "loss": 2.996030807495117, + "step": 11479, + "token_acc": 0.2981013322144131 + }, + { + "epoch": 6.729404866608033, + "grad_norm": 0.14375899509215215, + "learning_rate": 0.00039596839296697543, + "loss": 2.9461045265197754, + "step": 11480, + "token_acc": 0.30591016548463357 + }, + { + "epoch": 6.729991204925242, + "grad_norm": 0.15202292356686473, + "learning_rate": 0.0003959487211128919, + "loss": 2.9922120571136475, + "step": 11481, + "token_acc": 0.2987676769831282 + }, + { + "epoch": 6.730577543242451, + "grad_norm": 0.16693567369714019, + "learning_rate": 0.0003959290478878168, + "loss": 3.006425142288208, + "step": 11482, + "token_acc": 0.29705168626201256 + }, + { + "epoch": 6.7311638815596595, + "grad_norm": 0.18321554020803385, + "learning_rate": 0.0003959093732919349, + "loss": 2.9566164016723633, + "step": 11483, + "token_acc": 0.304208504118283 + }, + { + "epoch": 6.731750219876869, + "grad_norm": 0.1661013570498967, + "learning_rate": 0.00039588969732543114, + "loss": 3.0147783756256104, + "step": 11484, + "token_acc": 0.29578203471000825 + }, + { + "epoch": 6.732336558194078, + "grad_norm": 0.15822077486735442, + "learning_rate": 0.00039587001998849013, + "loss": 2.9752917289733887, + "step": 11485, + "token_acc": 0.30153773287844127 + }, + { + "epoch": 6.732922896511287, + "grad_norm": 0.1458516837995497, + "learning_rate": 0.00039585034128129695, + "loss": 2.9652607440948486, + "step": 11486, + "token_acc": 0.3023669550483917 + }, + { + "epoch": 6.733509234828496, + "grad_norm": 0.14388629138859232, + "learning_rate": 0.0003958306612040363, + "loss": 2.9546666145324707, + "step": 11487, + "token_acc": 0.3046769260281564 + }, + { + "epoch": 6.734095573145705, + "grad_norm": 0.15247590995264726, + "learning_rate": 0.00039581097975689315, + "loss": 2.983140230178833, + "step": 11488, + "token_acc": 0.30046715427580645 + }, + { + "epoch": 6.734681911462914, + "grad_norm": 0.15429800170821226, + "learning_rate": 0.0003957912969400523, + "loss": 2.998532772064209, + "step": 11489, + "token_acc": 0.29811450245706683 + }, + { + "epoch": 6.735268249780123, + "grad_norm": 0.16346473061707514, + "learning_rate": 0.00039577161275369864, + "loss": 3.0059261322021484, + "step": 11490, + "token_acc": 0.2980360893495025 + }, + { + "epoch": 6.735854588097332, + "grad_norm": 0.17205259323937172, + "learning_rate": 0.0003957519271980171, + "loss": 2.9583284854888916, + "step": 11491, + "token_acc": 0.3044184664156925 + }, + { + "epoch": 6.7364409264145415, + "grad_norm": 0.1858788997640012, + "learning_rate": 0.0003957322402731927, + "loss": 2.9780406951904297, + "step": 11492, + "token_acc": 0.303267770019188 + }, + { + "epoch": 6.737027264731751, + "grad_norm": 0.17703923960991277, + "learning_rate": 0.00039571255197941025, + "loss": 3.0128934383392334, + "step": 11493, + "token_acc": 0.2966084083764054 + }, + { + "epoch": 6.73761360304896, + "grad_norm": 0.16588750906991814, + "learning_rate": 0.00039569286231685465, + "loss": 3.007120370864868, + "step": 11494, + "token_acc": 0.2973906142017968 + }, + { + "epoch": 6.738199941366168, + "grad_norm": 0.16717041694970475, + "learning_rate": 0.000395673171285711, + "loss": 2.9930107593536377, + "step": 11495, + "token_acc": 0.2987991391495919 + }, + { + "epoch": 6.738786279683377, + "grad_norm": 0.19841039212529712, + "learning_rate": 0.00039565347888616416, + "loss": 2.955406904220581, + "step": 11496, + "token_acc": 0.30562690384962377 + }, + { + "epoch": 6.739372618000586, + "grad_norm": 0.2350716724381974, + "learning_rate": 0.0003956337851183992, + "loss": 2.97166109085083, + "step": 11497, + "token_acc": 0.30379756885723586 + }, + { + "epoch": 6.739958956317795, + "grad_norm": 0.2070905162978886, + "learning_rate": 0.0003956140899826011, + "loss": 2.985978841781616, + "step": 11498, + "token_acc": 0.3004621817154381 + }, + { + "epoch": 6.740545294635004, + "grad_norm": 0.18754222510520704, + "learning_rate": 0.0003955943934789547, + "loss": 3.021526575088501, + "step": 11499, + "token_acc": 0.2949423911811455 + }, + { + "epoch": 6.7411316329522135, + "grad_norm": 0.23048321471820848, + "learning_rate": 0.00039557469560764526, + "loss": 2.950100898742676, + "step": 11500, + "token_acc": 0.30496787057342795 + }, + { + "epoch": 6.741717971269423, + "grad_norm": 0.2648172322573145, + "learning_rate": 0.0003955549963688577, + "loss": 2.952327013015747, + "step": 11501, + "token_acc": 0.30422020640447156 + }, + { + "epoch": 6.742304309586632, + "grad_norm": 0.18847791818133786, + "learning_rate": 0.00039553529576277714, + "loss": 2.961784839630127, + "step": 11502, + "token_acc": 0.3025832109771942 + }, + { + "epoch": 6.742890647903841, + "grad_norm": 0.21313476982150964, + "learning_rate": 0.00039551559378958855, + "loss": 2.955805778503418, + "step": 11503, + "token_acc": 0.3048662421737442 + }, + { + "epoch": 6.743476986221049, + "grad_norm": 0.20869158747076305, + "learning_rate": 0.000395495890449477, + "loss": 2.98655366897583, + "step": 11504, + "token_acc": 0.30046692646803114 + }, + { + "epoch": 6.744063324538258, + "grad_norm": 0.17893776640727163, + "learning_rate": 0.0003954761857426277, + "loss": 2.99147891998291, + "step": 11505, + "token_acc": 0.3008376215218413 + }, + { + "epoch": 6.744649662855467, + "grad_norm": 0.20607404808495444, + "learning_rate": 0.0003954564796692256, + "loss": 3.026346206665039, + "step": 11506, + "token_acc": 0.29349765680152756 + }, + { + "epoch": 6.745236001172676, + "grad_norm": 0.18787202441424802, + "learning_rate": 0.0003954367722294559, + "loss": 2.984320640563965, + "step": 11507, + "token_acc": 0.30087749726986573 + }, + { + "epoch": 6.7458223394898855, + "grad_norm": 0.23308987392576105, + "learning_rate": 0.0003954170634235037, + "loss": 3.021008014678955, + "step": 11508, + "token_acc": 0.29556874419310186 + }, + { + "epoch": 6.746408677807095, + "grad_norm": 0.1732800854431111, + "learning_rate": 0.0003953973532515541, + "loss": 3.00510311126709, + "step": 11509, + "token_acc": 0.29536773313301146 + }, + { + "epoch": 6.746995016124304, + "grad_norm": 0.19131957897508126, + "learning_rate": 0.0003953776417137924, + "loss": 2.994755268096924, + "step": 11510, + "token_acc": 0.29816661061904387 + }, + { + "epoch": 6.747581354441513, + "grad_norm": 0.15674542219896306, + "learning_rate": 0.0003953579288104036, + "loss": 2.981248378753662, + "step": 11511, + "token_acc": 0.30045408015958425 + }, + { + "epoch": 6.748167692758722, + "grad_norm": 0.16626129748061105, + "learning_rate": 0.00039533821454157294, + "loss": 2.973879337310791, + "step": 11512, + "token_acc": 0.30147481627155315 + }, + { + "epoch": 6.748754031075931, + "grad_norm": 0.148490137085822, + "learning_rate": 0.0003953184989074855, + "loss": 2.9606704711914062, + "step": 11513, + "token_acc": 0.30443997394739053 + }, + { + "epoch": 6.74934036939314, + "grad_norm": 0.15796892771559037, + "learning_rate": 0.0003952987819083267, + "loss": 2.9425177574157715, + "step": 11514, + "token_acc": 0.3068812953026452 + }, + { + "epoch": 6.749926707710349, + "grad_norm": 0.16356087009431436, + "learning_rate": 0.00039527906354428155, + "loss": 3.005080223083496, + "step": 11515, + "token_acc": 0.2985207526970271 + }, + { + "epoch": 6.7505130460275575, + "grad_norm": 0.1810867579875682, + "learning_rate": 0.00039525934381553547, + "loss": 2.9707722663879395, + "step": 11516, + "token_acc": 0.30112775034650036 + }, + { + "epoch": 6.751099384344767, + "grad_norm": 0.18581876382224982, + "learning_rate": 0.0003952396227222735, + "loss": 2.969979763031006, + "step": 11517, + "token_acc": 0.3012135738067609 + }, + { + "epoch": 6.751685722661976, + "grad_norm": 0.19218717805482774, + "learning_rate": 0.00039521990026468104, + "loss": 2.924926280975342, + "step": 11518, + "token_acc": 0.30899103816848655 + }, + { + "epoch": 6.752272060979185, + "grad_norm": 0.21361703522535758, + "learning_rate": 0.00039520017644294333, + "loss": 2.9774489402770996, + "step": 11519, + "token_acc": 0.3032800386963513 + }, + { + "epoch": 6.752858399296394, + "grad_norm": 0.16581166326799177, + "learning_rate": 0.00039518045125724557, + "loss": 2.996628522872925, + "step": 11520, + "token_acc": 0.2984814936128398 + }, + { + "epoch": 6.753444737613603, + "grad_norm": 0.17277259045512303, + "learning_rate": 0.0003951607247077731, + "loss": 3.0042214393615723, + "step": 11521, + "token_acc": 0.2994068323062769 + }, + { + "epoch": 6.754031075930812, + "grad_norm": 0.1700588335449712, + "learning_rate": 0.00039514099679471127, + "loss": 2.945157527923584, + "step": 11522, + "token_acc": 0.30562858896134026 + }, + { + "epoch": 6.754617414248021, + "grad_norm": 0.16114751493813664, + "learning_rate": 0.0003951212675182453, + "loss": 2.983963966369629, + "step": 11523, + "token_acc": 0.29970211490459986 + }, + { + "epoch": 6.75520375256523, + "grad_norm": 0.16061151200255297, + "learning_rate": 0.0003951015368785607, + "loss": 2.9542856216430664, + "step": 11524, + "token_acc": 0.3044669873842855 + }, + { + "epoch": 6.7557900908824395, + "grad_norm": 0.16263681074630146, + "learning_rate": 0.0003950818048758425, + "loss": 2.9794771671295166, + "step": 11525, + "token_acc": 0.3008689469116309 + }, + { + "epoch": 6.756376429199648, + "grad_norm": 0.15136003354234842, + "learning_rate": 0.0003950620715102764, + "loss": 2.953697681427002, + "step": 11526, + "token_acc": 0.3025988219504534 + }, + { + "epoch": 6.756962767516857, + "grad_norm": 0.15278856869202703, + "learning_rate": 0.00039504233678204747, + "loss": 2.9756898880004883, + "step": 11527, + "token_acc": 0.30330868162238106 + }, + { + "epoch": 6.757549105834066, + "grad_norm": 0.15290740658849838, + "learning_rate": 0.0003950226006913413, + "loss": 2.949049711227417, + "step": 11528, + "token_acc": 0.30562880311317336 + }, + { + "epoch": 6.758135444151275, + "grad_norm": 0.14819716119841855, + "learning_rate": 0.0003950028632383432, + "loss": 2.9888970851898193, + "step": 11529, + "token_acc": 0.30025993899287573 + }, + { + "epoch": 6.758721782468484, + "grad_norm": 0.17918413291684476, + "learning_rate": 0.0003949831244232387, + "loss": 2.99914813041687, + "step": 11530, + "token_acc": 0.2990253164222696 + }, + { + "epoch": 6.759308120785693, + "grad_norm": 0.20739440949144017, + "learning_rate": 0.00039496338424621305, + "loss": 2.957144021987915, + "step": 11531, + "token_acc": 0.30406981117881593 + }, + { + "epoch": 6.759894459102902, + "grad_norm": 0.20594981256943412, + "learning_rate": 0.0003949436427074517, + "loss": 2.9398584365844727, + "step": 11532, + "token_acc": 0.30769919627465464 + }, + { + "epoch": 6.7604807974201115, + "grad_norm": 0.17405599879909062, + "learning_rate": 0.0003949238998071402, + "loss": 2.9859185218811035, + "step": 11533, + "token_acc": 0.3007505533366961 + }, + { + "epoch": 6.761067135737321, + "grad_norm": 0.14852573463486676, + "learning_rate": 0.0003949041555454639, + "loss": 3.0099306106567383, + "step": 11534, + "token_acc": 0.2969432544930015 + }, + { + "epoch": 6.76165347405453, + "grad_norm": 0.184693955571649, + "learning_rate": 0.0003948844099226083, + "loss": 2.971446990966797, + "step": 11535, + "token_acc": 0.30301257149927435 + }, + { + "epoch": 6.762239812371739, + "grad_norm": 0.2049823120662438, + "learning_rate": 0.000394864662938759, + "loss": 2.9630303382873535, + "step": 11536, + "token_acc": 0.30290180972797665 + }, + { + "epoch": 6.762826150688948, + "grad_norm": 0.19537275938541532, + "learning_rate": 0.00039484491459410134, + "loss": 2.9980673789978027, + "step": 11537, + "token_acc": 0.2970245327668203 + }, + { + "epoch": 6.763412489006156, + "grad_norm": 0.15916917749864046, + "learning_rate": 0.0003948251648888208, + "loss": 2.944810628890991, + "step": 11538, + "token_acc": 0.30536347438411177 + }, + { + "epoch": 6.763998827323365, + "grad_norm": 0.24929323312007814, + "learning_rate": 0.0003948054138231031, + "loss": 2.981698513031006, + "step": 11539, + "token_acc": 0.3004834753940174 + }, + { + "epoch": 6.764585165640574, + "grad_norm": 0.3133734272944794, + "learning_rate": 0.00039478566139713366, + "loss": 2.981476306915283, + "step": 11540, + "token_acc": 0.3009886555434209 + }, + { + "epoch": 6.7651715039577835, + "grad_norm": 0.17613963765919682, + "learning_rate": 0.00039476590761109803, + "loss": 2.9939332008361816, + "step": 11541, + "token_acc": 0.2970473123728509 + }, + { + "epoch": 6.765757842274993, + "grad_norm": 0.23323969381611873, + "learning_rate": 0.0003947461524651818, + "loss": 2.9460811614990234, + "step": 11542, + "token_acc": 0.3057751831887476 + }, + { + "epoch": 6.766344180592202, + "grad_norm": 0.23846156732643098, + "learning_rate": 0.00039472639595957044, + "loss": 3.023005962371826, + "step": 11543, + "token_acc": 0.2926903748397474 + }, + { + "epoch": 6.766930518909411, + "grad_norm": 0.1715534130041322, + "learning_rate": 0.0003947066380944496, + "loss": 3.0024967193603516, + "step": 11544, + "token_acc": 0.2988230434572696 + }, + { + "epoch": 6.76751685722662, + "grad_norm": 0.22083473630112432, + "learning_rate": 0.000394686878870005, + "loss": 2.989177942276001, + "step": 11545, + "token_acc": 0.2996123033364435 + }, + { + "epoch": 6.768103195543829, + "grad_norm": 0.16734941858278113, + "learning_rate": 0.0003946671182864221, + "loss": 2.9711222648620605, + "step": 11546, + "token_acc": 0.3028971384418488 + }, + { + "epoch": 6.768689533861037, + "grad_norm": 0.2169538768258249, + "learning_rate": 0.0003946473563438865, + "loss": 3.0118746757507324, + "step": 11547, + "token_acc": 0.2985385244943295 + }, + { + "epoch": 6.7692758721782464, + "grad_norm": 0.1511884384981121, + "learning_rate": 0.00039462759304258394, + "loss": 3.0192019939422607, + "step": 11548, + "token_acc": 0.2948051570555422 + }, + { + "epoch": 6.769862210495456, + "grad_norm": 0.2008901533765202, + "learning_rate": 0.00039460782838270005, + "loss": 2.9511473178863525, + "step": 11549, + "token_acc": 0.305786471911625 + }, + { + "epoch": 6.770448548812665, + "grad_norm": 0.1688867903467657, + "learning_rate": 0.00039458806236442046, + "loss": 2.9981157779693604, + "step": 11550, + "token_acc": 0.2984890303657787 + }, + { + "epoch": 6.771034887129874, + "grad_norm": 0.1668637948419691, + "learning_rate": 0.00039456829498793087, + "loss": 2.9873576164245605, + "step": 11551, + "token_acc": 0.3003670837416553 + }, + { + "epoch": 6.771621225447083, + "grad_norm": 0.16332442491547605, + "learning_rate": 0.00039454852625341687, + "loss": 2.972841739654541, + "step": 11552, + "token_acc": 0.3015342483749296 + }, + { + "epoch": 6.772207563764292, + "grad_norm": 0.18464896950364837, + "learning_rate": 0.0003945287561610644, + "loss": 2.9945921897888184, + "step": 11553, + "token_acc": 0.29862314616928953 + }, + { + "epoch": 6.772793902081501, + "grad_norm": 0.18614791348959603, + "learning_rate": 0.0003945089847110589, + "loss": 2.9459640979766846, + "step": 11554, + "token_acc": 0.30536367918446183 + }, + { + "epoch": 6.77338024039871, + "grad_norm": 0.21189460505970756, + "learning_rate": 0.0003944892119035863, + "loss": 3.0124449729919434, + "step": 11555, + "token_acc": 0.2962415038122924 + }, + { + "epoch": 6.773966578715919, + "grad_norm": 0.1598058020935131, + "learning_rate": 0.0003944694377388322, + "loss": 2.9895286560058594, + "step": 11556, + "token_acc": 0.29911994993038793 + }, + { + "epoch": 6.7745529170331285, + "grad_norm": 0.1549407673556061, + "learning_rate": 0.0003944496622169824, + "loss": 2.9838523864746094, + "step": 11557, + "token_acc": 0.2993662864385298 + }, + { + "epoch": 6.775139255350338, + "grad_norm": 0.1721596812951165, + "learning_rate": 0.00039442988533822267, + "loss": 2.998366355895996, + "step": 11558, + "token_acc": 0.2973919302800879 + }, + { + "epoch": 6.775725593667546, + "grad_norm": 0.1928050292269371, + "learning_rate": 0.00039441010710273883, + "loss": 2.977863311767578, + "step": 11559, + "token_acc": 0.2993067308602004 + }, + { + "epoch": 6.776311931984755, + "grad_norm": 0.1489748180115067, + "learning_rate": 0.0003943903275107166, + "loss": 2.998818874359131, + "step": 11560, + "token_acc": 0.2981613847743713 + }, + { + "epoch": 6.776898270301964, + "grad_norm": 0.2372633185013389, + "learning_rate": 0.00039437054656234183, + "loss": 2.9870150089263916, + "step": 11561, + "token_acc": 0.2994970640796528 + }, + { + "epoch": 6.777484608619173, + "grad_norm": 0.15708906238072473, + "learning_rate": 0.0003943507642578003, + "loss": 2.973066568374634, + "step": 11562, + "token_acc": 0.30277421195264587 + }, + { + "epoch": 6.778070946936382, + "grad_norm": 0.18366629821923414, + "learning_rate": 0.0003943309805972779, + "loss": 2.95536470413208, + "step": 11563, + "token_acc": 0.3041023791394488 + }, + { + "epoch": 6.778657285253591, + "grad_norm": 0.15467451082334238, + "learning_rate": 0.0003943111955809604, + "loss": 3.0104329586029053, + "step": 11564, + "token_acc": 0.29741946024163873 + }, + { + "epoch": 6.7792436235708005, + "grad_norm": 0.178612960506566, + "learning_rate": 0.0003942914092090337, + "loss": 2.9964513778686523, + "step": 11565, + "token_acc": 0.2990949934781558 + }, + { + "epoch": 6.77982996188801, + "grad_norm": 0.15897730936721075, + "learning_rate": 0.00039427162148168363, + "loss": 2.970902919769287, + "step": 11566, + "token_acc": 0.3020746623122073 + }, + { + "epoch": 6.780416300205219, + "grad_norm": 0.15509038060151067, + "learning_rate": 0.000394251832399096, + "loss": 2.949418544769287, + "step": 11567, + "token_acc": 0.3065515133384493 + }, + { + "epoch": 6.781002638522428, + "grad_norm": 0.1672390698424791, + "learning_rate": 0.0003942320419614569, + "loss": 2.953291416168213, + "step": 11568, + "token_acc": 0.3030630649497517 + }, + { + "epoch": 6.781588976839636, + "grad_norm": 0.1735530227168004, + "learning_rate": 0.00039421225016895203, + "loss": 2.9651939868927, + "step": 11569, + "token_acc": 0.30290431679306157 + }, + { + "epoch": 6.782175315156845, + "grad_norm": 0.16867436415106482, + "learning_rate": 0.0003941924570217674, + "loss": 2.996748447418213, + "step": 11570, + "token_acc": 0.29924092866612106 + }, + { + "epoch": 6.782761653474054, + "grad_norm": 0.163582482296536, + "learning_rate": 0.0003941726625200891, + "loss": 2.962548017501831, + "step": 11571, + "token_acc": 0.30176613237749816 + }, + { + "epoch": 6.783347991791263, + "grad_norm": 0.2063742112221261, + "learning_rate": 0.0003941528666641027, + "loss": 2.9715094566345215, + "step": 11572, + "token_acc": 0.3028142320568519 + }, + { + "epoch": 6.7839343301084725, + "grad_norm": 0.15689777973354346, + "learning_rate": 0.00039413306945399455, + "loss": 2.9831771850585938, + "step": 11573, + "token_acc": 0.30011492846235294 + }, + { + "epoch": 6.784520668425682, + "grad_norm": 0.2305764150714964, + "learning_rate": 0.0003941132708899503, + "loss": 2.9912753105163574, + "step": 11574, + "token_acc": 0.2997486553713519 + }, + { + "epoch": 6.785107006742891, + "grad_norm": 0.20693199595276215, + "learning_rate": 0.0003940934709721561, + "loss": 2.9474563598632812, + "step": 11575, + "token_acc": 0.3058722328715479 + }, + { + "epoch": 6.7856933450601, + "grad_norm": 0.1570801021600605, + "learning_rate": 0.00039407366970079796, + "loss": 2.9580841064453125, + "step": 11576, + "token_acc": 0.304716959543128 + }, + { + "epoch": 6.786279683377309, + "grad_norm": 0.22222845867201804, + "learning_rate": 0.0003940538670760617, + "loss": 2.9611153602600098, + "step": 11577, + "token_acc": 0.3021958109181384 + }, + { + "epoch": 6.786866021694518, + "grad_norm": 0.18275101611013528, + "learning_rate": 0.0003940340630981336, + "loss": 2.9507999420166016, + "step": 11578, + "token_acc": 0.3050683556841433 + }, + { + "epoch": 6.787452360011727, + "grad_norm": 0.17426836863417955, + "learning_rate": 0.00039401425776719955, + "loss": 2.979895830154419, + "step": 11579, + "token_acc": 0.3004485879337678 + }, + { + "epoch": 6.788038698328936, + "grad_norm": 0.17255843395367024, + "learning_rate": 0.0003939944510834456, + "loss": 2.972538471221924, + "step": 11580, + "token_acc": 0.3022905218110198 + }, + { + "epoch": 6.7886250366461445, + "grad_norm": 0.15373701630187564, + "learning_rate": 0.0003939746430470579, + "loss": 2.957298755645752, + "step": 11581, + "token_acc": 0.3041560981651447 + }, + { + "epoch": 6.789211374963354, + "grad_norm": 0.18548568269878304, + "learning_rate": 0.0003939548336582223, + "loss": 2.9947452545166016, + "step": 11582, + "token_acc": 0.29820577452457186 + }, + { + "epoch": 6.789797713280563, + "grad_norm": 0.1630054545418031, + "learning_rate": 0.000393935022917125, + "loss": 2.9864251613616943, + "step": 11583, + "token_acc": 0.2995224129828053 + }, + { + "epoch": 6.790384051597772, + "grad_norm": 0.15718918838460197, + "learning_rate": 0.0003939152108239522, + "loss": 2.9676175117492676, + "step": 11584, + "token_acc": 0.3029986798920955 + }, + { + "epoch": 6.790970389914981, + "grad_norm": 0.15545140765453122, + "learning_rate": 0.0003938953973788899, + "loss": 2.987039566040039, + "step": 11585, + "token_acc": 0.30003087144549606 + }, + { + "epoch": 6.79155672823219, + "grad_norm": 0.15652426434457511, + "learning_rate": 0.0003938755825821243, + "loss": 2.987412452697754, + "step": 11586, + "token_acc": 0.2999010832329342 + }, + { + "epoch": 6.792143066549399, + "grad_norm": 0.1827417417541671, + "learning_rate": 0.00039385576643384134, + "loss": 2.9207797050476074, + "step": 11587, + "token_acc": 0.31141747614246773 + }, + { + "epoch": 6.792729404866608, + "grad_norm": 0.1576406312738965, + "learning_rate": 0.0003938359489342274, + "loss": 3.006629705429077, + "step": 11588, + "token_acc": 0.29616861547160034 + }, + { + "epoch": 6.793315743183817, + "grad_norm": 0.20273197780092267, + "learning_rate": 0.0003938161300834686, + "loss": 2.9420042037963867, + "step": 11589, + "token_acc": 0.30544274445500175 + }, + { + "epoch": 6.7939020815010265, + "grad_norm": 0.26601222085945747, + "learning_rate": 0.000393796309881751, + "loss": 3.0006086826324463, + "step": 11590, + "token_acc": 0.2963497114242747 + }, + { + "epoch": 6.794488419818235, + "grad_norm": 0.15903031760504424, + "learning_rate": 0.00039377648832926074, + "loss": 2.978531837463379, + "step": 11591, + "token_acc": 0.30120723740328553 + }, + { + "epoch": 6.795074758135444, + "grad_norm": 0.26405906080537384, + "learning_rate": 0.0003937566654261842, + "loss": 2.9976654052734375, + "step": 11592, + "token_acc": 0.2997663941264809 + }, + { + "epoch": 6.795661096452653, + "grad_norm": 0.20355855519441834, + "learning_rate": 0.0003937368411727075, + "loss": 2.943694591522217, + "step": 11593, + "token_acc": 0.3068827620865344 + }, + { + "epoch": 6.796247434769862, + "grad_norm": 0.213111676156233, + "learning_rate": 0.00039371701556901686, + "loss": 2.972809076309204, + "step": 11594, + "token_acc": 0.3019167726418599 + }, + { + "epoch": 6.796833773087071, + "grad_norm": 0.2102821517409978, + "learning_rate": 0.0003936971886152985, + "loss": 2.9829165935516357, + "step": 11595, + "token_acc": 0.3000268995302545 + }, + { + "epoch": 6.79742011140428, + "grad_norm": 0.17707894001326263, + "learning_rate": 0.00039367736031173866, + "loss": 3.01369047164917, + "step": 11596, + "token_acc": 0.29639866170694706 + }, + { + "epoch": 6.798006449721489, + "grad_norm": 0.20333774919045303, + "learning_rate": 0.00039365753065852374, + "loss": 3.006033420562744, + "step": 11597, + "token_acc": 0.2964986910181219 + }, + { + "epoch": 6.7985927880386985, + "grad_norm": 0.19431307756761462, + "learning_rate": 0.00039363769965583984, + "loss": 2.9322497844696045, + "step": 11598, + "token_acc": 0.3084968047268326 + }, + { + "epoch": 6.799179126355908, + "grad_norm": 0.21546096785925278, + "learning_rate": 0.0003936178673038733, + "loss": 2.9905667304992676, + "step": 11599, + "token_acc": 0.29876764104898734 + }, + { + "epoch": 6.799765464673117, + "grad_norm": 0.18047640108193683, + "learning_rate": 0.0003935980336028104, + "loss": 2.9647064208984375, + "step": 11600, + "token_acc": 0.30452075181647714 + }, + { + "epoch": 6.800351802990326, + "grad_norm": 0.1829915390708398, + "learning_rate": 0.0003935781985528375, + "loss": 2.9450063705444336, + "step": 11601, + "token_acc": 0.305358867489916 + }, + { + "epoch": 6.800938141307535, + "grad_norm": 0.1701278952655037, + "learning_rate": 0.0003935583621541409, + "loss": 2.9871788024902344, + "step": 11602, + "token_acc": 0.3011451456130194 + }, + { + "epoch": 6.801524479624743, + "grad_norm": 0.18104097654567916, + "learning_rate": 0.0003935385244069069, + "loss": 2.987971782684326, + "step": 11603, + "token_acc": 0.3008784509655764 + }, + { + "epoch": 6.802110817941952, + "grad_norm": 0.15007697645227325, + "learning_rate": 0.00039351868531132194, + "loss": 2.936387538909912, + "step": 11604, + "token_acc": 0.3080193527759501 + }, + { + "epoch": 6.802697156259161, + "grad_norm": 0.1746934503512794, + "learning_rate": 0.0003934988448675723, + "loss": 2.9657599925994873, + "step": 11605, + "token_acc": 0.30339610451507676 + }, + { + "epoch": 6.8032834945763705, + "grad_norm": 0.15267090806808237, + "learning_rate": 0.0003934790030758444, + "loss": 2.9769229888916016, + "step": 11606, + "token_acc": 0.30137786989197435 + }, + { + "epoch": 6.80386983289358, + "grad_norm": 0.17854870618415986, + "learning_rate": 0.0003934591599363245, + "loss": 2.994415760040283, + "step": 11607, + "token_acc": 0.29903814866395073 + }, + { + "epoch": 6.804456171210789, + "grad_norm": 0.15644539588439593, + "learning_rate": 0.00039343931544919924, + "loss": 2.9980106353759766, + "step": 11608, + "token_acc": 0.29752241329506574 + }, + { + "epoch": 6.805042509527998, + "grad_norm": 0.16487058386167547, + "learning_rate": 0.00039341946961465483, + "loss": 2.978438377380371, + "step": 11609, + "token_acc": 0.30263254844583853 + }, + { + "epoch": 6.805628847845207, + "grad_norm": 0.1691050588007732, + "learning_rate": 0.00039339962243287787, + "loss": 2.995551347732544, + "step": 11610, + "token_acc": 0.2995381743646083 + }, + { + "epoch": 6.806215186162416, + "grad_norm": 0.164714585870821, + "learning_rate": 0.00039337977390405457, + "loss": 2.9820971488952637, + "step": 11611, + "token_acc": 0.30025294797932056 + }, + { + "epoch": 6.806801524479624, + "grad_norm": 0.16491873167965881, + "learning_rate": 0.00039335992402837153, + "loss": 2.9999372959136963, + "step": 11612, + "token_acc": 0.2979064020267645 + }, + { + "epoch": 6.807387862796833, + "grad_norm": 0.1451506557215027, + "learning_rate": 0.0003933400728060151, + "loss": 3.0204498767852783, + "step": 11613, + "token_acc": 0.2948417704090865 + }, + { + "epoch": 6.8079742011140425, + "grad_norm": 0.16908115927182538, + "learning_rate": 0.000393320220237172, + "loss": 2.965651035308838, + "step": 11614, + "token_acc": 0.30411801056090554 + }, + { + "epoch": 6.808560539431252, + "grad_norm": 0.1463599063841434, + "learning_rate": 0.0003933003663220285, + "loss": 2.9881362915039062, + "step": 11615, + "token_acc": 0.29876793552195896 + }, + { + "epoch": 6.809146877748461, + "grad_norm": 0.1787531783987317, + "learning_rate": 0.0003932805110607711, + "loss": 3.0090205669403076, + "step": 11616, + "token_acc": 0.2994712960821684 + }, + { + "epoch": 6.80973321606567, + "grad_norm": 0.22418310616684273, + "learning_rate": 0.0003932606544535864, + "loss": 2.993567705154419, + "step": 11617, + "token_acc": 0.29839883551673946 + }, + { + "epoch": 6.810319554382879, + "grad_norm": 0.16362262409831566, + "learning_rate": 0.0003932407965006609, + "loss": 2.953751802444458, + "step": 11618, + "token_acc": 0.30359957299936163 + }, + { + "epoch": 6.810905892700088, + "grad_norm": 0.16952164689901786, + "learning_rate": 0.0003932209372021812, + "loss": 2.981529951095581, + "step": 11619, + "token_acc": 0.2996828066678088 + }, + { + "epoch": 6.811492231017297, + "grad_norm": 0.15680471113975752, + "learning_rate": 0.00039320107655833376, + "loss": 2.973137855529785, + "step": 11620, + "token_acc": 0.3029456269013186 + }, + { + "epoch": 6.812078569334506, + "grad_norm": 0.15571765119605566, + "learning_rate": 0.00039318121456930513, + "loss": 2.977614402770996, + "step": 11621, + "token_acc": 0.30084120997656405 + }, + { + "epoch": 6.812664907651715, + "grad_norm": 0.1694194613573455, + "learning_rate": 0.00039316135123528196, + "loss": 3.0071518421173096, + "step": 11622, + "token_acc": 0.2964382146849824 + }, + { + "epoch": 6.8132512459689245, + "grad_norm": 0.15332634871058373, + "learning_rate": 0.0003931414865564508, + "loss": 2.9971706867218018, + "step": 11623, + "token_acc": 0.2989519673367488 + }, + { + "epoch": 6.813837584286133, + "grad_norm": 0.16815999304841078, + "learning_rate": 0.0003931216205329982, + "loss": 2.980884313583374, + "step": 11624, + "token_acc": 0.30035730797837057 + }, + { + "epoch": 6.814423922603342, + "grad_norm": 0.1590140537525872, + "learning_rate": 0.00039310175316511093, + "loss": 2.968369483947754, + "step": 11625, + "token_acc": 0.30294270907596704 + }, + { + "epoch": 6.815010260920551, + "grad_norm": 0.14861616002803776, + "learning_rate": 0.0003930818844529754, + "loss": 2.9992194175720215, + "step": 11626, + "token_acc": 0.2971770227505987 + }, + { + "epoch": 6.81559659923776, + "grad_norm": 0.1414313774014646, + "learning_rate": 0.00039306201439677854, + "loss": 2.9639291763305664, + "step": 11627, + "token_acc": 0.3040394751255251 + }, + { + "epoch": 6.816182937554969, + "grad_norm": 0.14794481386540587, + "learning_rate": 0.00039304214299670673, + "loss": 2.9717040061950684, + "step": 11628, + "token_acc": 0.30168710478482375 + }, + { + "epoch": 6.816769275872178, + "grad_norm": 0.17694370353133437, + "learning_rate": 0.00039302227025294674, + "loss": 3.0060343742370605, + "step": 11629, + "token_acc": 0.29703140775293296 + }, + { + "epoch": 6.817355614189387, + "grad_norm": 0.16708814615084477, + "learning_rate": 0.00039300239616568526, + "loss": 3.002920627593994, + "step": 11630, + "token_acc": 0.29581983447226556 + }, + { + "epoch": 6.8179419525065965, + "grad_norm": 0.14645786586725693, + "learning_rate": 0.00039298252073510895, + "loss": 2.940195322036743, + "step": 11631, + "token_acc": 0.3084593331605539 + }, + { + "epoch": 6.818528290823806, + "grad_norm": 0.22639182838683544, + "learning_rate": 0.00039296264396140456, + "loss": 3.0081708431243896, + "step": 11632, + "token_acc": 0.2965828876322743 + }, + { + "epoch": 6.819114629141015, + "grad_norm": 0.24251021036787143, + "learning_rate": 0.0003929427658447587, + "loss": 2.983792304992676, + "step": 11633, + "token_acc": 0.3000128832775058 + }, + { + "epoch": 6.819700967458223, + "grad_norm": 0.152684834729, + "learning_rate": 0.0003929228863853582, + "loss": 2.983001708984375, + "step": 11634, + "token_acc": 0.3005014466306374 + }, + { + "epoch": 6.820287305775432, + "grad_norm": 0.250334910049205, + "learning_rate": 0.00039290300558338987, + "loss": 3.0014262199401855, + "step": 11635, + "token_acc": 0.2994212987341239 + }, + { + "epoch": 6.820873644092641, + "grad_norm": 0.24514144788877448, + "learning_rate": 0.00039288312343904025, + "loss": 2.9712653160095215, + "step": 11636, + "token_acc": 0.3035899638498157 + }, + { + "epoch": 6.82145998240985, + "grad_norm": 0.1553312306408857, + "learning_rate": 0.00039286323995249626, + "loss": 3.006707191467285, + "step": 11637, + "token_acc": 0.29694669891794007 + }, + { + "epoch": 6.822046320727059, + "grad_norm": 0.22987611284167314, + "learning_rate": 0.0003928433551239447, + "loss": 2.9817044734954834, + "step": 11638, + "token_acc": 0.30079572676902433 + }, + { + "epoch": 6.8226326590442685, + "grad_norm": 0.16399639045180195, + "learning_rate": 0.00039282346895357225, + "loss": 2.9868717193603516, + "step": 11639, + "token_acc": 0.2988611359624731 + }, + { + "epoch": 6.823218997361478, + "grad_norm": 0.2151690480190841, + "learning_rate": 0.00039280358144156574, + "loss": 2.9520158767700195, + "step": 11640, + "token_acc": 0.305098896307109 + }, + { + "epoch": 6.823805335678687, + "grad_norm": 0.1633442410442377, + "learning_rate": 0.0003927836925881121, + "loss": 2.93984317779541, + "step": 11641, + "token_acc": 0.3083804896900102 + }, + { + "epoch": 6.824391673995896, + "grad_norm": 0.1850081073851398, + "learning_rate": 0.00039276380239339806, + "loss": 2.9744629859924316, + "step": 11642, + "token_acc": 0.300420784812604 + }, + { + "epoch": 6.824978012313105, + "grad_norm": 0.16988939794489624, + "learning_rate": 0.0003927439108576104, + "loss": 2.9958372116088867, + "step": 11643, + "token_acc": 0.29902630026868215 + }, + { + "epoch": 6.825564350630314, + "grad_norm": 0.17497167374147898, + "learning_rate": 0.000392724017980936, + "loss": 3.006831645965576, + "step": 11644, + "token_acc": 0.2974191605557229 + }, + { + "epoch": 6.826150688947523, + "grad_norm": 0.16516715101063453, + "learning_rate": 0.0003927041237635619, + "loss": 2.9722230434417725, + "step": 11645, + "token_acc": 0.30148307895619963 + }, + { + "epoch": 6.826737027264731, + "grad_norm": 0.1450250620258408, + "learning_rate": 0.0003926842282056748, + "loss": 2.99434757232666, + "step": 11646, + "token_acc": 0.29992667531254324 + }, + { + "epoch": 6.8273233655819405, + "grad_norm": 0.16686499526491372, + "learning_rate": 0.00039266433130746164, + "loss": 3.0002710819244385, + "step": 11647, + "token_acc": 0.3002069166329536 + }, + { + "epoch": 6.82790970389915, + "grad_norm": 0.13723372289626537, + "learning_rate": 0.00039264443306910937, + "loss": 2.98030424118042, + "step": 11648, + "token_acc": 0.30049203036500327 + }, + { + "epoch": 6.828496042216359, + "grad_norm": 0.14456840221184772, + "learning_rate": 0.00039262453349080486, + "loss": 2.974066734313965, + "step": 11649, + "token_acc": 0.2996495549119326 + }, + { + "epoch": 6.829082380533568, + "grad_norm": 0.14794416966908008, + "learning_rate": 0.0003926046325727349, + "loss": 3.024742603302002, + "step": 11650, + "token_acc": 0.2949483393514712 + }, + { + "epoch": 6.829668718850777, + "grad_norm": 0.15061430041844995, + "learning_rate": 0.0003925847303150868, + "loss": 2.97066593170166, + "step": 11651, + "token_acc": 0.30160277319416195 + }, + { + "epoch": 6.830255057167986, + "grad_norm": 0.1487413476506025, + "learning_rate": 0.0003925648267180472, + "loss": 2.99392032623291, + "step": 11652, + "token_acc": 0.29900313152400837 + }, + { + "epoch": 6.830841395485195, + "grad_norm": 0.15882127138231913, + "learning_rate": 0.0003925449217818031, + "loss": 3.002487897872925, + "step": 11653, + "token_acc": 0.29622637025457677 + }, + { + "epoch": 6.831427733802404, + "grad_norm": 0.1381305591582993, + "learning_rate": 0.00039252501550654165, + "loss": 2.9671549797058105, + "step": 11654, + "token_acc": 0.30347537656435125 + }, + { + "epoch": 6.8320140721196125, + "grad_norm": 0.15426221949025817, + "learning_rate": 0.00039250510789244966, + "loss": 3.001903533935547, + "step": 11655, + "token_acc": 0.29808751822314816 + }, + { + "epoch": 6.832600410436822, + "grad_norm": 0.13949762108855956, + "learning_rate": 0.00039248519893971424, + "loss": 3.030393600463867, + "step": 11656, + "token_acc": 0.29422590952758865 + }, + { + "epoch": 6.833186748754031, + "grad_norm": 0.1599644371140646, + "learning_rate": 0.0003924652886485224, + "loss": 2.9425408840179443, + "step": 11657, + "token_acc": 0.306530968839569 + }, + { + "epoch": 6.83377308707124, + "grad_norm": 0.17499084019343752, + "learning_rate": 0.0003924453770190611, + "loss": 2.990678310394287, + "step": 11658, + "token_acc": 0.2988651102464332 + }, + { + "epoch": 6.834359425388449, + "grad_norm": 0.21834952292230386, + "learning_rate": 0.0003924254640515175, + "loss": 2.982257843017578, + "step": 11659, + "token_acc": 0.30182547219584255 + }, + { + "epoch": 6.834945763705658, + "grad_norm": 0.16897419958111382, + "learning_rate": 0.00039240554974607843, + "loss": 2.935482978820801, + "step": 11660, + "token_acc": 0.3087362083667089 + }, + { + "epoch": 6.835532102022867, + "grad_norm": 0.15146092993815297, + "learning_rate": 0.0003923856341029313, + "loss": 2.973356246948242, + "step": 11661, + "token_acc": 0.3046509035645056 + }, + { + "epoch": 6.836118440340076, + "grad_norm": 0.17124725104781033, + "learning_rate": 0.0003923657171222629, + "loss": 2.9884839057922363, + "step": 11662, + "token_acc": 0.3007037332411994 + }, + { + "epoch": 6.836704778657285, + "grad_norm": 0.17892336475888285, + "learning_rate": 0.0003923457988042604, + "loss": 2.996511936187744, + "step": 11663, + "token_acc": 0.2994192392168488 + }, + { + "epoch": 6.8372911169744945, + "grad_norm": 0.1712077393409036, + "learning_rate": 0.000392325879149111, + "loss": 3.0041728019714355, + "step": 11664, + "token_acc": 0.29817567725802546 + }, + { + "epoch": 6.837877455291704, + "grad_norm": 0.14618176368120178, + "learning_rate": 0.00039230595815700173, + "loss": 2.9733285903930664, + "step": 11665, + "token_acc": 0.3004451000447156 + }, + { + "epoch": 6.838463793608913, + "grad_norm": 0.15788124891897257, + "learning_rate": 0.0003922860358281197, + "loss": 2.9759531021118164, + "step": 11666, + "token_acc": 0.3019369441863121 + }, + { + "epoch": 6.839050131926121, + "grad_norm": 0.17261117122412808, + "learning_rate": 0.00039226611216265216, + "loss": 2.9577317237854004, + "step": 11667, + "token_acc": 0.3033464229949215 + }, + { + "epoch": 6.83963647024333, + "grad_norm": 0.14839595457497345, + "learning_rate": 0.0003922461871607862, + "loss": 2.968186855316162, + "step": 11668, + "token_acc": 0.30248703660606185 + }, + { + "epoch": 6.840222808560539, + "grad_norm": 0.15007170248950966, + "learning_rate": 0.00039222626082270905, + "loss": 2.9764349460601807, + "step": 11669, + "token_acc": 0.3015973536188233 + }, + { + "epoch": 6.840809146877748, + "grad_norm": 0.2069049867203262, + "learning_rate": 0.00039220633314860764, + "loss": 2.960733413696289, + "step": 11670, + "token_acc": 0.3045761361730672 + }, + { + "epoch": 6.841395485194957, + "grad_norm": 0.2838425705322793, + "learning_rate": 0.00039218640413866957, + "loss": 3.0168042182922363, + "step": 11671, + "token_acc": 0.2967158545854848 + }, + { + "epoch": 6.8419818235121665, + "grad_norm": 0.26457691160377805, + "learning_rate": 0.0003921664737930817, + "loss": 2.957150459289551, + "step": 11672, + "token_acc": 0.3048661573332089 + }, + { + "epoch": 6.842568161829376, + "grad_norm": 0.17786987374541322, + "learning_rate": 0.00039214654211203144, + "loss": 2.965214729309082, + "step": 11673, + "token_acc": 0.3026891539556384 + }, + { + "epoch": 6.843154500146585, + "grad_norm": 0.17696503641613867, + "learning_rate": 0.00039212660909570597, + "loss": 2.989487648010254, + "step": 11674, + "token_acc": 0.3021244275786088 + }, + { + "epoch": 6.843740838463794, + "grad_norm": 0.1757289689977439, + "learning_rate": 0.0003921066747442924, + "loss": 2.9921646118164062, + "step": 11675, + "token_acc": 0.2973377695053598 + }, + { + "epoch": 6.844327176781003, + "grad_norm": 0.20370940470296375, + "learning_rate": 0.00039208673905797827, + "loss": 2.964958429336548, + "step": 11676, + "token_acc": 0.30315098501831494 + }, + { + "epoch": 6.844913515098211, + "grad_norm": 0.15624900788059104, + "learning_rate": 0.00039206680203695063, + "loss": 2.973414421081543, + "step": 11677, + "token_acc": 0.3020743485714894 + }, + { + "epoch": 6.84549985341542, + "grad_norm": 0.16537313144614482, + "learning_rate": 0.0003920468636813968, + "loss": 2.9612526893615723, + "step": 11678, + "token_acc": 0.3041126514460447 + }, + { + "epoch": 6.8460861917326294, + "grad_norm": 0.1733044801398502, + "learning_rate": 0.0003920269239915041, + "loss": 2.96962571144104, + "step": 11679, + "token_acc": 0.3017782537696435 + }, + { + "epoch": 6.846672530049839, + "grad_norm": 0.15696062819547466, + "learning_rate": 0.00039200698296745984, + "loss": 2.944326639175415, + "step": 11680, + "token_acc": 0.30493475719400887 + }, + { + "epoch": 6.847258868367048, + "grad_norm": 0.15421707654434716, + "learning_rate": 0.00039198704060945135, + "loss": 2.975033760070801, + "step": 11681, + "token_acc": 0.3012039746482457 + }, + { + "epoch": 6.847845206684257, + "grad_norm": 0.17021314161472845, + "learning_rate": 0.00039196709691766596, + "loss": 2.9589273929595947, + "step": 11682, + "token_acc": 0.30565837756240916 + }, + { + "epoch": 6.848431545001466, + "grad_norm": 0.1585261835976282, + "learning_rate": 0.000391947151892291, + "loss": 2.952051877975464, + "step": 11683, + "token_acc": 0.3061527423220701 + }, + { + "epoch": 6.849017883318675, + "grad_norm": 0.17643368243495614, + "learning_rate": 0.00039192720553351374, + "loss": 3.0112178325653076, + "step": 11684, + "token_acc": 0.29810780189870945 + }, + { + "epoch": 6.849604221635884, + "grad_norm": 0.15575972972113078, + "learning_rate": 0.0003919072578415217, + "loss": 2.9291515350341797, + "step": 11685, + "token_acc": 0.3082979751866373 + }, + { + "epoch": 6.850190559953093, + "grad_norm": 0.16026352827047924, + "learning_rate": 0.0003918873088165022, + "loss": 2.9707016944885254, + "step": 11686, + "token_acc": 0.3014722548342981 + }, + { + "epoch": 6.850776898270302, + "grad_norm": 0.1539503210581957, + "learning_rate": 0.0003918673584586426, + "loss": 2.970628261566162, + "step": 11687, + "token_acc": 0.3030747518915821 + }, + { + "epoch": 6.8513632365875115, + "grad_norm": 0.14926249781289877, + "learning_rate": 0.00039184740676813037, + "loss": 2.937852621078491, + "step": 11688, + "token_acc": 0.3066189800421633 + }, + { + "epoch": 6.85194957490472, + "grad_norm": 0.15692241646490154, + "learning_rate": 0.0003918274537451528, + "loss": 2.964616060256958, + "step": 11689, + "token_acc": 0.30331537471638964 + }, + { + "epoch": 6.852535913221929, + "grad_norm": 0.1841719246833981, + "learning_rate": 0.0003918074993898976, + "loss": 2.94278621673584, + "step": 11690, + "token_acc": 0.3058318514153068 + }, + { + "epoch": 6.853122251539138, + "grad_norm": 0.26153122748683105, + "learning_rate": 0.0003917875437025519, + "loss": 2.9410276412963867, + "step": 11691, + "token_acc": 0.30496920377170583 + }, + { + "epoch": 6.853708589856347, + "grad_norm": 0.32784484200899217, + "learning_rate": 0.00039176758668330323, + "loss": 2.9647693634033203, + "step": 11692, + "token_acc": 0.30202165432442324 + }, + { + "epoch": 6.854294928173556, + "grad_norm": 0.2370146148456396, + "learning_rate": 0.00039174762833233924, + "loss": 2.9802162647247314, + "step": 11693, + "token_acc": 0.29994179440972163 + }, + { + "epoch": 6.854881266490765, + "grad_norm": 0.1600174224162427, + "learning_rate": 0.0003917276686498472, + "loss": 3.005119800567627, + "step": 11694, + "token_acc": 0.2977689218187392 + }, + { + "epoch": 6.855467604807974, + "grad_norm": 0.1579581882845999, + "learning_rate": 0.00039170770763601476, + "loss": 2.9815316200256348, + "step": 11695, + "token_acc": 0.29908986725731934 + }, + { + "epoch": 6.8560539431251835, + "grad_norm": 0.1461303305820812, + "learning_rate": 0.00039168774529102935, + "loss": 2.9841880798339844, + "step": 11696, + "token_acc": 0.30087171207816577 + }, + { + "epoch": 6.856640281442393, + "grad_norm": 0.14577401238915538, + "learning_rate": 0.00039166778161507854, + "loss": 3.0143861770629883, + "step": 11697, + "token_acc": 0.29623175833625437 + }, + { + "epoch": 6.857226619759601, + "grad_norm": 0.14808830314611282, + "learning_rate": 0.00039164781660834975, + "loss": 2.980742931365967, + "step": 11698, + "token_acc": 0.30044139809211984 + }, + { + "epoch": 6.85781295807681, + "grad_norm": 0.15336615967868994, + "learning_rate": 0.0003916278502710306, + "loss": 2.9505152702331543, + "step": 11699, + "token_acc": 0.30482012934519 + }, + { + "epoch": 6.858399296394019, + "grad_norm": 0.1545638792458985, + "learning_rate": 0.00039160788260330867, + "loss": 3.0311241149902344, + "step": 11700, + "token_acc": 0.2933646376241518 + }, + { + "epoch": 6.858985634711228, + "grad_norm": 0.1713852240745086, + "learning_rate": 0.00039158791360537147, + "loss": 2.9566478729248047, + "step": 11701, + "token_acc": 0.3045838752472822 + }, + { + "epoch": 6.859571973028437, + "grad_norm": 0.1490428308836565, + "learning_rate": 0.00039156794327740665, + "loss": 2.9988174438476562, + "step": 11702, + "token_acc": 0.2989718750412438 + }, + { + "epoch": 6.860158311345646, + "grad_norm": 0.16564574294842133, + "learning_rate": 0.0003915479716196018, + "loss": 3.0127007961273193, + "step": 11703, + "token_acc": 0.29778584260846225 + }, + { + "epoch": 6.8607446496628555, + "grad_norm": 0.152678601227875, + "learning_rate": 0.0003915279986321444, + "loss": 2.9870998859405518, + "step": 11704, + "token_acc": 0.2992416872264639 + }, + { + "epoch": 6.861330987980065, + "grad_norm": 0.14624487176497464, + "learning_rate": 0.00039150802431522225, + "loss": 2.994795799255371, + "step": 11705, + "token_acc": 0.2987401496862747 + }, + { + "epoch": 6.861917326297274, + "grad_norm": 0.16399337462002495, + "learning_rate": 0.00039148804866902286, + "loss": 2.948556423187256, + "step": 11706, + "token_acc": 0.3070787953893695 + }, + { + "epoch": 6.862503664614483, + "grad_norm": 0.1654014639942079, + "learning_rate": 0.00039146807169373386, + "loss": 2.9480366706848145, + "step": 11707, + "token_acc": 0.3071945084196669 + }, + { + "epoch": 6.863090002931692, + "grad_norm": 0.15436898928029913, + "learning_rate": 0.00039144809338954303, + "loss": 2.992832660675049, + "step": 11708, + "token_acc": 0.2993295505100567 + }, + { + "epoch": 6.863676341248901, + "grad_norm": 0.18050228149850386, + "learning_rate": 0.0003914281137566379, + "loss": 2.9588282108306885, + "step": 11709, + "token_acc": 0.303640057029663 + }, + { + "epoch": 6.86426267956611, + "grad_norm": 0.20579533794262228, + "learning_rate": 0.0003914081327952063, + "loss": 2.9685373306274414, + "step": 11710, + "token_acc": 0.3024497725789396 + }, + { + "epoch": 6.864849017883318, + "grad_norm": 0.16300578609003785, + "learning_rate": 0.0003913881505054358, + "loss": 3.00211763381958, + "step": 11711, + "token_acc": 0.2973758566239915 + }, + { + "epoch": 6.8654353562005275, + "grad_norm": 0.1808041439587513, + "learning_rate": 0.0003913681668875142, + "loss": 2.9760894775390625, + "step": 11712, + "token_acc": 0.3020449979377054 + }, + { + "epoch": 6.866021694517737, + "grad_norm": 0.25385692960627604, + "learning_rate": 0.0003913481819416291, + "loss": 2.9609689712524414, + "step": 11713, + "token_acc": 0.30338395692021475 + }, + { + "epoch": 6.866608032834946, + "grad_norm": 0.21470357794171666, + "learning_rate": 0.0003913281956679683, + "loss": 2.9859812259674072, + "step": 11714, + "token_acc": 0.30034866429510987 + }, + { + "epoch": 6.867194371152155, + "grad_norm": 0.17908612457263334, + "learning_rate": 0.00039130820806671955, + "loss": 2.979170799255371, + "step": 11715, + "token_acc": 0.29979498259019866 + }, + { + "epoch": 6.867780709469364, + "grad_norm": 0.25915820696029424, + "learning_rate": 0.00039128821913807064, + "loss": 2.9745588302612305, + "step": 11716, + "token_acc": 0.30318829531154545 + }, + { + "epoch": 6.868367047786573, + "grad_norm": 0.16407772962468534, + "learning_rate": 0.00039126822888220926, + "loss": 2.9711427688598633, + "step": 11717, + "token_acc": 0.30314034634272424 + }, + { + "epoch": 6.868953386103782, + "grad_norm": 0.18613934189704082, + "learning_rate": 0.0003912482372993232, + "loss": 2.966268539428711, + "step": 11718, + "token_acc": 0.30313986776808716 + }, + { + "epoch": 6.869539724420991, + "grad_norm": 0.18661103907496468, + "learning_rate": 0.0003912282443896004, + "loss": 3.0209670066833496, + "step": 11719, + "token_acc": 0.29706640754489716 + }, + { + "epoch": 6.8701260627381995, + "grad_norm": 0.15228308761010492, + "learning_rate": 0.00039120825015322847, + "loss": 2.9917116165161133, + "step": 11720, + "token_acc": 0.29894704024780255 + }, + { + "epoch": 6.870712401055409, + "grad_norm": 0.17673202343163127, + "learning_rate": 0.0003911882545903953, + "loss": 3.027149200439453, + "step": 11721, + "token_acc": 0.2935131112596647 + }, + { + "epoch": 6.871298739372618, + "grad_norm": 0.15882406000929414, + "learning_rate": 0.0003911682577012887, + "loss": 2.962437868118286, + "step": 11722, + "token_acc": 0.30275033540675694 + }, + { + "epoch": 6.871885077689827, + "grad_norm": 0.1506358594734898, + "learning_rate": 0.0003911482594860967, + "loss": 2.9518747329711914, + "step": 11723, + "token_acc": 0.3045365013720768 + }, + { + "epoch": 6.872471416007036, + "grad_norm": 0.14470991729019358, + "learning_rate": 0.00039112825994500684, + "loss": 2.9915945529937744, + "step": 11724, + "token_acc": 0.2985555351647796 + }, + { + "epoch": 6.873057754324245, + "grad_norm": 0.1589336414582065, + "learning_rate": 0.0003911082590782072, + "loss": 2.9508283138275146, + "step": 11725, + "token_acc": 0.30538875131538307 + }, + { + "epoch": 6.873644092641454, + "grad_norm": 0.1595454779902532, + "learning_rate": 0.00039108825688588566, + "loss": 2.9584591388702393, + "step": 11726, + "token_acc": 0.30420618315827486 + }, + { + "epoch": 6.874230430958663, + "grad_norm": 0.1570664415365281, + "learning_rate": 0.00039106825336823005, + "loss": 2.955723762512207, + "step": 11727, + "token_acc": 0.30404311964919606 + }, + { + "epoch": 6.874816769275872, + "grad_norm": 0.1577236032236176, + "learning_rate": 0.00039104824852542817, + "loss": 2.9925882816314697, + "step": 11728, + "token_acc": 0.3006066179436297 + }, + { + "epoch": 6.8754031075930815, + "grad_norm": 0.16216711564026365, + "learning_rate": 0.0003910282423576682, + "loss": 2.961024761199951, + "step": 11729, + "token_acc": 0.3038065750417002 + }, + { + "epoch": 6.875989445910291, + "grad_norm": 0.1883112035361387, + "learning_rate": 0.0003910082348651379, + "loss": 2.9814677238464355, + "step": 11730, + "token_acc": 0.29976942164726783 + }, + { + "epoch": 6.8765757842275, + "grad_norm": 0.1784186145661516, + "learning_rate": 0.00039098822604802523, + "loss": 2.948641777038574, + "step": 11731, + "token_acc": 0.30531565015107354 + }, + { + "epoch": 6.877162122544708, + "grad_norm": 0.15330187208941493, + "learning_rate": 0.0003909682159065182, + "loss": 2.9843952655792236, + "step": 11732, + "token_acc": 0.300951756652842 + }, + { + "epoch": 6.877748460861917, + "grad_norm": 0.23109139219187153, + "learning_rate": 0.0003909482044408047, + "loss": 2.993312358856201, + "step": 11733, + "token_acc": 0.2993722005925004 + }, + { + "epoch": 6.878334799179126, + "grad_norm": 0.16389196650480709, + "learning_rate": 0.00039092819165107275, + "loss": 2.974762439727783, + "step": 11734, + "token_acc": 0.3006497184907808 + }, + { + "epoch": 6.878921137496335, + "grad_norm": 0.19532395711457398, + "learning_rate": 0.0003909081775375103, + "loss": 2.982950210571289, + "step": 11735, + "token_acc": 0.29940507085160156 + }, + { + "epoch": 6.879507475813544, + "grad_norm": 0.24599810812646922, + "learning_rate": 0.0003908881621003055, + "loss": 2.9341750144958496, + "step": 11736, + "token_acc": 0.3088608332318316 + }, + { + "epoch": 6.8800938141307535, + "grad_norm": 0.15580670393558282, + "learning_rate": 0.0003908681453396462, + "loss": 2.971388339996338, + "step": 11737, + "token_acc": 0.30290704598081375 + }, + { + "epoch": 6.880680152447963, + "grad_norm": 0.2707148270950727, + "learning_rate": 0.0003908481272557205, + "loss": 3.0030364990234375, + "step": 11738, + "token_acc": 0.2972993471983931 + }, + { + "epoch": 6.881266490765172, + "grad_norm": 0.23571741169451438, + "learning_rate": 0.0003908281078487164, + "loss": 2.9933886528015137, + "step": 11739, + "token_acc": 0.29898876462112006 + }, + { + "epoch": 6.881852829082381, + "grad_norm": 0.1659700802409054, + "learning_rate": 0.0003908080871188221, + "loss": 2.9729669094085693, + "step": 11740, + "token_acc": 0.3010195657557996 + }, + { + "epoch": 6.88243916739959, + "grad_norm": 0.20912230547905833, + "learning_rate": 0.00039078806506622545, + "loss": 2.9782423973083496, + "step": 11741, + "token_acc": 0.29809322812488515 + }, + { + "epoch": 6.883025505716798, + "grad_norm": 0.1705220638385243, + "learning_rate": 0.00039076804169111475, + "loss": 2.9534168243408203, + "step": 11742, + "token_acc": 0.30499774749408715 + }, + { + "epoch": 6.883611844034007, + "grad_norm": 0.2057129839450563, + "learning_rate": 0.0003907480169936779, + "loss": 3.02030086517334, + "step": 11743, + "token_acc": 0.29664586351948236 + }, + { + "epoch": 6.884198182351216, + "grad_norm": 0.15143426095090767, + "learning_rate": 0.0003907279909741031, + "loss": 3.0042381286621094, + "step": 11744, + "token_acc": 0.29747828406173027 + }, + { + "epoch": 6.8847845206684255, + "grad_norm": 0.1898938179486049, + "learning_rate": 0.00039070796363257853, + "loss": 2.948796272277832, + "step": 11745, + "token_acc": 0.30429735844802097 + }, + { + "epoch": 6.885370858985635, + "grad_norm": 0.14881218518566464, + "learning_rate": 0.00039068793496929217, + "loss": 2.9900426864624023, + "step": 11746, + "token_acc": 0.29959716947273124 + }, + { + "epoch": 6.885957197302844, + "grad_norm": 0.17715126974194192, + "learning_rate": 0.00039066790498443226, + "loss": 2.9903740882873535, + "step": 11747, + "token_acc": 0.30070205781003173 + }, + { + "epoch": 6.886543535620053, + "grad_norm": 0.16798929924065284, + "learning_rate": 0.0003906478736781869, + "loss": 3.016535520553589, + "step": 11748, + "token_acc": 0.2945578007054092 + }, + { + "epoch": 6.887129873937262, + "grad_norm": 0.181732147387609, + "learning_rate": 0.0003906278410507444, + "loss": 2.9581291675567627, + "step": 11749, + "token_acc": 0.30472753655360607 + }, + { + "epoch": 6.887716212254471, + "grad_norm": 0.17044056474085387, + "learning_rate": 0.0003906078071022928, + "loss": 2.9962620735168457, + "step": 11750, + "token_acc": 0.29831138846233435 + }, + { + "epoch": 6.88830255057168, + "grad_norm": 0.1769806650356267, + "learning_rate": 0.0003905877718330203, + "loss": 2.995988130569458, + "step": 11751, + "token_acc": 0.29943910152132325 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.14368481623889762, + "learning_rate": 0.00039056773524311506, + "loss": 2.954618215560913, + "step": 11752, + "token_acc": 0.30515657383252737 + }, + { + "epoch": 6.889475227206098, + "grad_norm": 0.15627866286193465, + "learning_rate": 0.0003905476973327654, + "loss": 2.939596176147461, + "step": 11753, + "token_acc": 0.3071210287808582 + }, + { + "epoch": 6.890061565523307, + "grad_norm": 0.13886588960171475, + "learning_rate": 0.00039052765810215957, + "loss": 2.9758479595184326, + "step": 11754, + "token_acc": 0.30085066914005565 + }, + { + "epoch": 6.890647903840516, + "grad_norm": 0.20349497196066435, + "learning_rate": 0.00039050761755148576, + "loss": 3.018279552459717, + "step": 11755, + "token_acc": 0.2938936271498363 + }, + { + "epoch": 6.891234242157725, + "grad_norm": 0.15249341274041836, + "learning_rate": 0.0003904875756809322, + "loss": 2.9879746437072754, + "step": 11756, + "token_acc": 0.3018520099456829 + }, + { + "epoch": 6.891820580474934, + "grad_norm": 0.1701403492778671, + "learning_rate": 0.0003904675324906871, + "loss": 3.0039114952087402, + "step": 11757, + "token_acc": 0.29902844208546564 + }, + { + "epoch": 6.892406918792143, + "grad_norm": 0.1806604825690394, + "learning_rate": 0.0003904474879809389, + "loss": 3.0207743644714355, + "step": 11758, + "token_acc": 0.2943410864822365 + }, + { + "epoch": 6.892993257109352, + "grad_norm": 0.1562616265794891, + "learning_rate": 0.00039042744215187576, + "loss": 2.995577335357666, + "step": 11759, + "token_acc": 0.299441886239791 + }, + { + "epoch": 6.893579595426561, + "grad_norm": 0.17478319840533038, + "learning_rate": 0.00039040739500368607, + "loss": 2.981396436691284, + "step": 11760, + "token_acc": 0.2984123638737602 + }, + { + "epoch": 6.89416593374377, + "grad_norm": 0.154868206747603, + "learning_rate": 0.00039038734653655804, + "loss": 2.9527242183685303, + "step": 11761, + "token_acc": 0.3042757573408121 + }, + { + "epoch": 6.8947522720609795, + "grad_norm": 0.21350116358602947, + "learning_rate": 0.00039036729675068015, + "loss": 2.959106922149658, + "step": 11762, + "token_acc": 0.30456083760774155 + }, + { + "epoch": 6.895338610378188, + "grad_norm": 0.24649431183095927, + "learning_rate": 0.0003903472456462406, + "loss": 2.998145341873169, + "step": 11763, + "token_acc": 0.29782207197476995 + }, + { + "epoch": 6.895924948695397, + "grad_norm": 0.16037500933215432, + "learning_rate": 0.0003903271932234278, + "loss": 2.943204879760742, + "step": 11764, + "token_acc": 0.307348533360598 + }, + { + "epoch": 6.896511287012606, + "grad_norm": 0.20840701415940696, + "learning_rate": 0.0003903071394824301, + "loss": 3.0207266807556152, + "step": 11765, + "token_acc": 0.29615161998800105 + }, + { + "epoch": 6.897097625329815, + "grad_norm": 0.18977743695736873, + "learning_rate": 0.000390287084423436, + "loss": 2.9632294178009033, + "step": 11766, + "token_acc": 0.30349585062240664 + }, + { + "epoch": 6.897683963647024, + "grad_norm": 0.18312595222550798, + "learning_rate": 0.0003902670280466336, + "loss": 2.9983348846435547, + "step": 11767, + "token_acc": 0.29928454625170176 + }, + { + "epoch": 6.898270301964233, + "grad_norm": 0.22138317524657736, + "learning_rate": 0.0003902469703522116, + "loss": 2.9409523010253906, + "step": 11768, + "token_acc": 0.30584507629450514 + }, + { + "epoch": 6.898856640281442, + "grad_norm": 0.1432590545847616, + "learning_rate": 0.0003902269113403583, + "loss": 2.936412811279297, + "step": 11769, + "token_acc": 0.3075153213830032 + }, + { + "epoch": 6.8994429785986515, + "grad_norm": 0.1936241012505152, + "learning_rate": 0.0003902068510112621, + "loss": 2.9736828804016113, + "step": 11770, + "token_acc": 0.30176787207768885 + }, + { + "epoch": 6.900029316915861, + "grad_norm": 0.14958651155860428, + "learning_rate": 0.00039018678936511146, + "loss": 2.9696903228759766, + "step": 11771, + "token_acc": 0.30097810074593845 + }, + { + "epoch": 6.90061565523307, + "grad_norm": 0.1902903252190865, + "learning_rate": 0.00039016672640209484, + "loss": 2.9661245346069336, + "step": 11772, + "token_acc": 0.3043516857658335 + }, + { + "epoch": 6.901201993550279, + "grad_norm": 0.14254590129059455, + "learning_rate": 0.0003901466621224007, + "loss": 2.979823112487793, + "step": 11773, + "token_acc": 0.3006787711556487 + }, + { + "epoch": 6.901788331867488, + "grad_norm": 0.1818002735556006, + "learning_rate": 0.00039012659652621755, + "loss": 2.939370632171631, + "step": 11774, + "token_acc": 0.307278811249287 + }, + { + "epoch": 6.902374670184696, + "grad_norm": 0.1522364342946901, + "learning_rate": 0.0003901065296137338, + "loss": 2.963469982147217, + "step": 11775, + "token_acc": 0.30309135456412134 + }, + { + "epoch": 6.902961008501905, + "grad_norm": 0.1816070720398656, + "learning_rate": 0.00039008646138513804, + "loss": 2.9807066917419434, + "step": 11776, + "token_acc": 0.3013243040273391 + }, + { + "epoch": 6.903547346819114, + "grad_norm": 0.2151382651975947, + "learning_rate": 0.00039006639184061876, + "loss": 2.9377634525299072, + "step": 11777, + "token_acc": 0.306210438010016 + }, + { + "epoch": 6.9041336851363235, + "grad_norm": 0.14332249560420796, + "learning_rate": 0.0003900463209803644, + "loss": 2.9598278999328613, + "step": 11778, + "token_acc": 0.30331980000950326 + }, + { + "epoch": 6.904720023453533, + "grad_norm": 0.21283316950963813, + "learning_rate": 0.00039002624880456367, + "loss": 2.972876787185669, + "step": 11779, + "token_acc": 0.3020908771131205 + }, + { + "epoch": 6.905306361770742, + "grad_norm": 0.19044640736516466, + "learning_rate": 0.000390006175313405, + "loss": 3.000403881072998, + "step": 11780, + "token_acc": 0.29721695629278566 + }, + { + "epoch": 6.905892700087951, + "grad_norm": 0.1547311689420805, + "learning_rate": 0.0003899861005070769, + "loss": 2.9923577308654785, + "step": 11781, + "token_acc": 0.3005274009883071 + }, + { + "epoch": 6.90647903840516, + "grad_norm": 0.20741202967063185, + "learning_rate": 0.000389966024385768, + "loss": 2.973814010620117, + "step": 11782, + "token_acc": 0.30181658151440816 + }, + { + "epoch": 6.907065376722369, + "grad_norm": 0.1643510913003733, + "learning_rate": 0.000389945946949667, + "loss": 2.9644293785095215, + "step": 11783, + "token_acc": 0.3035869576887152 + }, + { + "epoch": 6.907651715039578, + "grad_norm": 0.1657535197609546, + "learning_rate": 0.0003899258681989624, + "loss": 3.026782274246216, + "step": 11784, + "token_acc": 0.2943737218745928 + }, + { + "epoch": 6.908238053356786, + "grad_norm": 0.17360099787285244, + "learning_rate": 0.0003899057881338428, + "loss": 2.961697578430176, + "step": 11785, + "token_acc": 0.3034607208703509 + }, + { + "epoch": 6.9088243916739955, + "grad_norm": 0.16874729074076256, + "learning_rate": 0.0003898857067544968, + "loss": 2.9831490516662598, + "step": 11786, + "token_acc": 0.29998105681901893 + }, + { + "epoch": 6.909410729991205, + "grad_norm": 0.15540030912941716, + "learning_rate": 0.00038986562406111315, + "loss": 2.978808879852295, + "step": 11787, + "token_acc": 0.30035871607596704 + }, + { + "epoch": 6.909997068308414, + "grad_norm": 0.1415674456189763, + "learning_rate": 0.00038984554005388035, + "loss": 2.9639575481414795, + "step": 11788, + "token_acc": 0.3019653784382687 + }, + { + "epoch": 6.910583406625623, + "grad_norm": 0.16501963139354275, + "learning_rate": 0.00038982545473298727, + "loss": 3.049868583679199, + "step": 11789, + "token_acc": 0.29249697948157877 + }, + { + "epoch": 6.911169744942832, + "grad_norm": 0.13963501611247967, + "learning_rate": 0.00038980536809862234, + "loss": 2.9788479804992676, + "step": 11790, + "token_acc": 0.30249433454571495 + }, + { + "epoch": 6.911756083260041, + "grad_norm": 0.1668836305008115, + "learning_rate": 0.00038978528015097444, + "loss": 2.970059394836426, + "step": 11791, + "token_acc": 0.303032473924545 + }, + { + "epoch": 6.91234242157725, + "grad_norm": 0.18172092299289974, + "learning_rate": 0.0003897651908902321, + "loss": 3.0090959072113037, + "step": 11792, + "token_acc": 0.2979577800116396 + }, + { + "epoch": 6.912928759894459, + "grad_norm": 0.15297418467609925, + "learning_rate": 0.00038974510031658424, + "loss": 3.0174202919006348, + "step": 11793, + "token_acc": 0.29566676203604964 + }, + { + "epoch": 6.913515098211668, + "grad_norm": 0.17239415431112426, + "learning_rate": 0.0003897250084302194, + "loss": 2.974062919616699, + "step": 11794, + "token_acc": 0.30140016057356467 + }, + { + "epoch": 6.9141014365288775, + "grad_norm": 0.1621281319705989, + "learning_rate": 0.00038970491523132643, + "loss": 2.971731185913086, + "step": 11795, + "token_acc": 0.3022499623287943 + }, + { + "epoch": 6.914687774846087, + "grad_norm": 0.1584799185553344, + "learning_rate": 0.00038968482072009405, + "loss": 2.9675002098083496, + "step": 11796, + "token_acc": 0.301887034363565 + }, + { + "epoch": 6.915274113163295, + "grad_norm": 0.19386930251655318, + "learning_rate": 0.000389664724896711, + "loss": 2.9651296138763428, + "step": 11797, + "token_acc": 0.30283500027848725 + }, + { + "epoch": 6.915860451480504, + "grad_norm": 0.15417297267448893, + "learning_rate": 0.000389644627761366, + "loss": 2.9632906913757324, + "step": 11798, + "token_acc": 0.30375573083023705 + }, + { + "epoch": 6.916446789797713, + "grad_norm": 0.18563914007626597, + "learning_rate": 0.00038962452931424796, + "loss": 2.9614453315734863, + "step": 11799, + "token_acc": 0.3035214877415894 + }, + { + "epoch": 6.917033128114922, + "grad_norm": 0.18203730709462101, + "learning_rate": 0.0003896044295555456, + "loss": 2.955831527709961, + "step": 11800, + "token_acc": 0.30259708111187117 + }, + { + "epoch": 6.917619466432131, + "grad_norm": 0.16560121129589625, + "learning_rate": 0.0003895843284854477, + "loss": 2.9896678924560547, + "step": 11801, + "token_acc": 0.29955052572437596 + }, + { + "epoch": 6.91820580474934, + "grad_norm": 0.17349842530727605, + "learning_rate": 0.0003895642261041432, + "loss": 2.972221851348877, + "step": 11802, + "token_acc": 0.3017772215269086 + }, + { + "epoch": 6.9187921430665495, + "grad_norm": 0.21009843498659433, + "learning_rate": 0.0003895441224118208, + "loss": 2.9719138145446777, + "step": 11803, + "token_acc": 0.30187244631501337 + }, + { + "epoch": 6.919378481383759, + "grad_norm": 0.16073780333660168, + "learning_rate": 0.00038952401740866943, + "loss": 2.9791951179504395, + "step": 11804, + "token_acc": 0.30076098781263305 + }, + { + "epoch": 6.919964819700968, + "grad_norm": 0.1637529366041675, + "learning_rate": 0.0003895039110948779, + "loss": 2.976726531982422, + "step": 11805, + "token_acc": 0.30365360758733845 + }, + { + "epoch": 6.920551158018176, + "grad_norm": 0.17385442172446414, + "learning_rate": 0.00038948380347063517, + "loss": 2.9564895629882812, + "step": 11806, + "token_acc": 0.30524144417605437 + }, + { + "epoch": 6.921137496335385, + "grad_norm": 0.16634107477862997, + "learning_rate": 0.00038946369453613, + "loss": 2.969212055206299, + "step": 11807, + "token_acc": 0.30362493681374514 + }, + { + "epoch": 6.921723834652594, + "grad_norm": 0.18734790859647837, + "learning_rate": 0.00038944358429155134, + "loss": 2.984787702560425, + "step": 11808, + "token_acc": 0.29869633099141296 + }, + { + "epoch": 6.922310172969803, + "grad_norm": 0.22720318634854345, + "learning_rate": 0.0003894234727370882, + "loss": 3.011652708053589, + "step": 11809, + "token_acc": 0.2965390516472808 + }, + { + "epoch": 6.9228965112870124, + "grad_norm": 0.32767597266372034, + "learning_rate": 0.0003894033598729294, + "loss": 2.993985652923584, + "step": 11810, + "token_acc": 0.2996117297928881 + }, + { + "epoch": 6.923482849604222, + "grad_norm": 0.3061113766748395, + "learning_rate": 0.00038938324569926376, + "loss": 2.9869818687438965, + "step": 11811, + "token_acc": 0.29926455408753094 + }, + { + "epoch": 6.924069187921431, + "grad_norm": 0.19548215468170163, + "learning_rate": 0.00038936313021628046, + "loss": 2.971559524536133, + "step": 11812, + "token_acc": 0.302699767382967 + }, + { + "epoch": 6.92465552623864, + "grad_norm": 0.18066848859739787, + "learning_rate": 0.00038934301342416835, + "loss": 2.9372119903564453, + "step": 11813, + "token_acc": 0.30729884274329894 + }, + { + "epoch": 6.925241864555849, + "grad_norm": 0.21374463500694874, + "learning_rate": 0.00038932289532311637, + "loss": 2.9711790084838867, + "step": 11814, + "token_acc": 0.30122626230266786 + }, + { + "epoch": 6.925828202873058, + "grad_norm": 0.15431696311838963, + "learning_rate": 0.0003893027759133135, + "loss": 3.0014514923095703, + "step": 11815, + "token_acc": 0.298686161757332 + }, + { + "epoch": 6.926414541190267, + "grad_norm": 0.18420493112845854, + "learning_rate": 0.00038928265519494877, + "loss": 3.0034584999084473, + "step": 11816, + "token_acc": 0.2974038116567552 + }, + { + "epoch": 6.927000879507476, + "grad_norm": 0.1498923812111919, + "learning_rate": 0.00038926253316821116, + "loss": 2.9295225143432617, + "step": 11817, + "token_acc": 0.3073597601366663 + }, + { + "epoch": 6.927587217824685, + "grad_norm": 0.16146598003577137, + "learning_rate": 0.00038924240983328974, + "loss": 2.9788880348205566, + "step": 11818, + "token_acc": 0.30195872473359386 + }, + { + "epoch": 6.928173556141894, + "grad_norm": 0.1505482905149616, + "learning_rate": 0.0003892222851903735, + "loss": 2.976224422454834, + "step": 11819, + "token_acc": 0.30206864811822054 + }, + { + "epoch": 6.928759894459103, + "grad_norm": 0.17457911643333568, + "learning_rate": 0.0003892021592396515, + "loss": 2.926795482635498, + "step": 11820, + "token_acc": 0.3086684863459374 + }, + { + "epoch": 6.929346232776312, + "grad_norm": 0.16876744131809343, + "learning_rate": 0.00038918203198131285, + "loss": 2.980958938598633, + "step": 11821, + "token_acc": 0.3005658529214147 + }, + { + "epoch": 6.929932571093521, + "grad_norm": 0.15581391023882385, + "learning_rate": 0.0003891619034155465, + "loss": 2.998814105987549, + "step": 11822, + "token_acc": 0.29680609942824426 + }, + { + "epoch": 6.93051890941073, + "grad_norm": 0.14882227932705785, + "learning_rate": 0.00038914177354254156, + "loss": 2.9813952445983887, + "step": 11823, + "token_acc": 0.30254263713381213 + }, + { + "epoch": 6.931105247727939, + "grad_norm": 0.16102643415224085, + "learning_rate": 0.00038912164236248723, + "loss": 2.925581932067871, + "step": 11824, + "token_acc": 0.309238916148252 + }, + { + "epoch": 6.931691586045148, + "grad_norm": 0.15815514611320536, + "learning_rate": 0.00038910150987557247, + "loss": 3.010441303253174, + "step": 11825, + "token_acc": 0.2966284230032959 + }, + { + "epoch": 6.932277924362357, + "grad_norm": 0.1804558746195359, + "learning_rate": 0.00038908137608198646, + "loss": 2.988403797149658, + "step": 11826, + "token_acc": 0.30030603423128127 + }, + { + "epoch": 6.9328642626795665, + "grad_norm": 0.19672337922227914, + "learning_rate": 0.0003890612409819184, + "loss": 2.942270278930664, + "step": 11827, + "token_acc": 0.3059263636651436 + }, + { + "epoch": 6.933450600996775, + "grad_norm": 0.18218655435466163, + "learning_rate": 0.0003890411045755574, + "loss": 2.98616623878479, + "step": 11828, + "token_acc": 0.2995707269476347 + }, + { + "epoch": 6.934036939313984, + "grad_norm": 0.19786330011083625, + "learning_rate": 0.00038902096686309253, + "loss": 3.0078067779541016, + "step": 11829, + "token_acc": 0.29622496652595776 + }, + { + "epoch": 6.934623277631193, + "grad_norm": 0.2320478307161566, + "learning_rate": 0.00038900082784471294, + "loss": 2.9770588874816895, + "step": 11830, + "token_acc": 0.30183826640375194 + }, + { + "epoch": 6.935209615948402, + "grad_norm": 0.19534389549011252, + "learning_rate": 0.00038898068752060797, + "loss": 2.9735546112060547, + "step": 11831, + "token_acc": 0.3006743616428564 + }, + { + "epoch": 6.935795954265611, + "grad_norm": 0.1631540080647636, + "learning_rate": 0.00038896054589096664, + "loss": 3.0009546279907227, + "step": 11832, + "token_acc": 0.29754109857013145 + }, + { + "epoch": 6.93638229258282, + "grad_norm": 0.1750290044103385, + "learning_rate": 0.0003889404029559783, + "loss": 2.9819021224975586, + "step": 11833, + "token_acc": 0.30050715566969066 + }, + { + "epoch": 6.936968630900029, + "grad_norm": 0.1814293168820925, + "learning_rate": 0.00038892025871583213, + "loss": 2.943075180053711, + "step": 11834, + "token_acc": 0.30664539857890505 + }, + { + "epoch": 6.9375549692172385, + "grad_norm": 0.1667460063296604, + "learning_rate": 0.0003889001131707173, + "loss": 2.970065116882324, + "step": 11835, + "token_acc": 0.30195312985640577 + }, + { + "epoch": 6.938141307534448, + "grad_norm": 0.16504499316882293, + "learning_rate": 0.000388879966320823, + "loss": 2.948068141937256, + "step": 11836, + "token_acc": 0.3052755540853457 + }, + { + "epoch": 6.938727645851657, + "grad_norm": 0.1933100473902244, + "learning_rate": 0.00038885981816633863, + "loss": 2.986741542816162, + "step": 11837, + "token_acc": 0.29929654262657374 + }, + { + "epoch": 6.939313984168866, + "grad_norm": 0.15341977427726425, + "learning_rate": 0.0003888396687074534, + "loss": 2.990516185760498, + "step": 11838, + "token_acc": 0.29859228271509436 + }, + { + "epoch": 6.939900322486075, + "grad_norm": 0.17790928095873598, + "learning_rate": 0.0003888195179443565, + "loss": 2.980855941772461, + "step": 11839, + "token_acc": 0.3007755823462685 + }, + { + "epoch": 6.940486660803283, + "grad_norm": 0.17430760140775906, + "learning_rate": 0.0003887993658772373, + "loss": 2.9866514205932617, + "step": 11840, + "token_acc": 0.3010280879269602 + }, + { + "epoch": 6.941072999120492, + "grad_norm": 0.1664801969658795, + "learning_rate": 0.0003887792125062851, + "loss": 2.9780468940734863, + "step": 11841, + "token_acc": 0.300259372540319 + }, + { + "epoch": 6.941659337437701, + "grad_norm": 0.15428460033087163, + "learning_rate": 0.0003887590578316893, + "loss": 2.9663310050964355, + "step": 11842, + "token_acc": 0.30254358176327834 + }, + { + "epoch": 6.9422456757549105, + "grad_norm": 0.18367440256970818, + "learning_rate": 0.00038873890185363904, + "loss": 2.9993748664855957, + "step": 11843, + "token_acc": 0.2966317540683435 + }, + { + "epoch": 6.94283201407212, + "grad_norm": 0.2014406090181704, + "learning_rate": 0.0003887187445723238, + "loss": 2.961099147796631, + "step": 11844, + "token_acc": 0.3062983384327724 + }, + { + "epoch": 6.943418352389329, + "grad_norm": 0.14806715951974583, + "learning_rate": 0.00038869858598793286, + "loss": 2.9646453857421875, + "step": 11845, + "token_acc": 0.3021858519924503 + }, + { + "epoch": 6.944004690706538, + "grad_norm": 0.2308601416081772, + "learning_rate": 0.0003886784261006555, + "loss": 2.9629299640655518, + "step": 11846, + "token_acc": 0.30336975367118535 + }, + { + "epoch": 6.944591029023747, + "grad_norm": 0.3275526592804235, + "learning_rate": 0.00038865826491068134, + "loss": 2.963777780532837, + "step": 11847, + "token_acc": 0.30300747670210765 + }, + { + "epoch": 6.945177367340956, + "grad_norm": 0.21542420383451788, + "learning_rate": 0.00038863810241819964, + "loss": 3.040379524230957, + "step": 11848, + "token_acc": 0.29388647347552677 + }, + { + "epoch": 6.945763705658165, + "grad_norm": 0.2597587869362111, + "learning_rate": 0.00038861793862339966, + "loss": 2.9740726947784424, + "step": 11849, + "token_acc": 0.30098373205741624 + }, + { + "epoch": 6.946350043975373, + "grad_norm": 0.26398500369768646, + "learning_rate": 0.00038859777352647103, + "loss": 2.9820775985717773, + "step": 11850, + "token_acc": 0.3007370968673383 + }, + { + "epoch": 6.9469363822925825, + "grad_norm": 0.18177602710244337, + "learning_rate": 0.00038857760712760305, + "loss": 2.952928304672241, + "step": 11851, + "token_acc": 0.306773169300415 + }, + { + "epoch": 6.947522720609792, + "grad_norm": 0.21464013595721174, + "learning_rate": 0.0003885574394269852, + "loss": 2.9824817180633545, + "step": 11852, + "token_acc": 0.3016224868302931 + }, + { + "epoch": 6.948109058927001, + "grad_norm": 0.1890069637281629, + "learning_rate": 0.0003885372704248069, + "loss": 3.0263400077819824, + "step": 11853, + "token_acc": 0.29405302880625916 + }, + { + "epoch": 6.94869539724421, + "grad_norm": 0.19724463683013002, + "learning_rate": 0.00038851710012125765, + "loss": 2.954951286315918, + "step": 11854, + "token_acc": 0.30400521497367644 + }, + { + "epoch": 6.949281735561419, + "grad_norm": 0.1773223362984139, + "learning_rate": 0.00038849692851652685, + "loss": 2.9575095176696777, + "step": 11855, + "token_acc": 0.3042853420094588 + }, + { + "epoch": 6.949868073878628, + "grad_norm": 0.20850960361905455, + "learning_rate": 0.00038847675561080403, + "loss": 2.981243371963501, + "step": 11856, + "token_acc": 0.3015122070501023 + }, + { + "epoch": 6.950454412195837, + "grad_norm": 0.1769840979521871, + "learning_rate": 0.00038845658140427875, + "loss": 2.9723339080810547, + "step": 11857, + "token_acc": 0.3019284971418788 + }, + { + "epoch": 6.951040750513046, + "grad_norm": 0.21776011631044515, + "learning_rate": 0.0003884364058971404, + "loss": 2.9789717197418213, + "step": 11858, + "token_acc": 0.30142594872008843 + }, + { + "epoch": 6.951627088830255, + "grad_norm": 0.13654777909421822, + "learning_rate": 0.0003884162290895786, + "loss": 3.0028533935546875, + "step": 11859, + "token_acc": 0.29892559415937453 + }, + { + "epoch": 6.9522134271474645, + "grad_norm": 0.16933441713032454, + "learning_rate": 0.0003883960509817828, + "loss": 2.9555234909057617, + "step": 11860, + "token_acc": 0.30419066614083434 + }, + { + "epoch": 6.952799765464674, + "grad_norm": 0.17535406179964494, + "learning_rate": 0.0003883758715739427, + "loss": 2.9621896743774414, + "step": 11861, + "token_acc": 0.30250836867944475 + }, + { + "epoch": 6.953386103781882, + "grad_norm": 0.1515013427590912, + "learning_rate": 0.0003883556908662477, + "loss": 2.9844155311584473, + "step": 11862, + "token_acc": 0.3001840699202364 + }, + { + "epoch": 6.953972442099091, + "grad_norm": 0.16336225612330396, + "learning_rate": 0.00038833550885888733, + "loss": 2.9939842224121094, + "step": 11863, + "token_acc": 0.29771214451768624 + }, + { + "epoch": 6.9545587804163, + "grad_norm": 0.14918923848348314, + "learning_rate": 0.0003883153255520514, + "loss": 2.9977543354034424, + "step": 11864, + "token_acc": 0.2988264598633866 + }, + { + "epoch": 6.955145118733509, + "grad_norm": 0.20029159849200973, + "learning_rate": 0.0003882951409459293, + "loss": 2.9561972618103027, + "step": 11865, + "token_acc": 0.3027169300408196 + }, + { + "epoch": 6.955731457050718, + "grad_norm": 0.17376184548657353, + "learning_rate": 0.00038827495504071066, + "loss": 2.9627346992492676, + "step": 11866, + "token_acc": 0.302417413585817 + }, + { + "epoch": 6.956317795367927, + "grad_norm": 0.19640677986219787, + "learning_rate": 0.0003882547678365852, + "loss": 2.952059268951416, + "step": 11867, + "token_acc": 0.30445750566239016 + }, + { + "epoch": 6.9569041336851365, + "grad_norm": 0.17755505443936187, + "learning_rate": 0.0003882345793337425, + "loss": 2.954103946685791, + "step": 11868, + "token_acc": 0.3041139427663366 + }, + { + "epoch": 6.957490472002346, + "grad_norm": 0.18999952978638338, + "learning_rate": 0.0003882143895323722, + "loss": 3.0186946392059326, + "step": 11869, + "token_acc": 0.29581427886670725 + }, + { + "epoch": 6.958076810319555, + "grad_norm": 0.22788369107590167, + "learning_rate": 0.0003881941984326639, + "loss": 3.0099925994873047, + "step": 11870, + "token_acc": 0.29535835560547957 + }, + { + "epoch": 6.958663148636763, + "grad_norm": 0.16476671037416862, + "learning_rate": 0.0003881740060348074, + "loss": 2.960766077041626, + "step": 11871, + "token_acc": 0.30270131446849247 + }, + { + "epoch": 6.959249486953972, + "grad_norm": 0.29480484249150984, + "learning_rate": 0.00038815381233899227, + "loss": 2.9811296463012695, + "step": 11872, + "token_acc": 0.29947509217671986 + }, + { + "epoch": 6.959835825271181, + "grad_norm": 0.17722320324264834, + "learning_rate": 0.00038813361734540833, + "loss": 2.967946767807007, + "step": 11873, + "token_acc": 0.30201780683329926 + }, + { + "epoch": 6.96042216358839, + "grad_norm": 0.19070509247068793, + "learning_rate": 0.00038811342105424506, + "loss": 2.984808921813965, + "step": 11874, + "token_acc": 0.3005461592670895 + }, + { + "epoch": 6.961008501905599, + "grad_norm": 0.17290972718052475, + "learning_rate": 0.0003880932234656923, + "loss": 2.9692840576171875, + "step": 11875, + "token_acc": 0.3023128014647659 + }, + { + "epoch": 6.9615948402228085, + "grad_norm": 0.2509431827073499, + "learning_rate": 0.00038807302457993987, + "loss": 3.038240909576416, + "step": 11876, + "token_acc": 0.2924673117870425 + }, + { + "epoch": 6.962181178540018, + "grad_norm": 0.17789443476834227, + "learning_rate": 0.0003880528243971774, + "loss": 2.9883527755737305, + "step": 11877, + "token_acc": 0.30113352969280766 + }, + { + "epoch": 6.962767516857227, + "grad_norm": 0.20661350170485288, + "learning_rate": 0.0003880326229175948, + "loss": 3.0049376487731934, + "step": 11878, + "token_acc": 0.29725913283757976 + }, + { + "epoch": 6.963353855174436, + "grad_norm": 0.17826356169374144, + "learning_rate": 0.0003880124201413815, + "loss": 2.965385675430298, + "step": 11879, + "token_acc": 0.30393816200669266 + }, + { + "epoch": 6.963940193491645, + "grad_norm": 0.18557449076924937, + "learning_rate": 0.0003879922160687276, + "loss": 2.952457904815674, + "step": 11880, + "token_acc": 0.3029825900714634 + }, + { + "epoch": 6.964526531808854, + "grad_norm": 0.1542285801594966, + "learning_rate": 0.00038797201069982275, + "loss": 3.0019419193267822, + "step": 11881, + "token_acc": 0.29606803204082777 + }, + { + "epoch": 6.965112870126063, + "grad_norm": 0.19478810826296444, + "learning_rate": 0.00038795180403485675, + "loss": 2.9999148845672607, + "step": 11882, + "token_acc": 0.29880894109969 + }, + { + "epoch": 6.965699208443271, + "grad_norm": 0.15454891632472795, + "learning_rate": 0.0003879315960740195, + "loss": 2.9689674377441406, + "step": 11883, + "token_acc": 0.30258649067233645 + }, + { + "epoch": 6.9662855467604805, + "grad_norm": 0.17877819661328376, + "learning_rate": 0.0003879113868175008, + "loss": 2.981320381164551, + "step": 11884, + "token_acc": 0.30073210095454495 + }, + { + "epoch": 6.96687188507769, + "grad_norm": 0.1634616702727411, + "learning_rate": 0.00038789117626549034, + "loss": 2.960458278656006, + "step": 11885, + "token_acc": 0.30306764517352797 + }, + { + "epoch": 6.967458223394899, + "grad_norm": 0.15943051240147912, + "learning_rate": 0.00038787096441817814, + "loss": 2.943133592605591, + "step": 11886, + "token_acc": 0.3070735967080749 + }, + { + "epoch": 6.968044561712108, + "grad_norm": 0.16665852136587966, + "learning_rate": 0.0003878507512757541, + "loss": 3.0206692218780518, + "step": 11887, + "token_acc": 0.2949171347340684 + }, + { + "epoch": 6.968630900029317, + "grad_norm": 0.18896653961704424, + "learning_rate": 0.0003878305368384079, + "loss": 2.995310068130493, + "step": 11888, + "token_acc": 0.2982020584357589 + }, + { + "epoch": 6.969217238346526, + "grad_norm": 0.18449659799469423, + "learning_rate": 0.00038781032110632956, + "loss": 2.951279401779175, + "step": 11889, + "token_acc": 0.3048406164072964 + }, + { + "epoch": 6.969803576663735, + "grad_norm": 0.1684822466390478, + "learning_rate": 0.00038779010407970893, + "loss": 2.993988513946533, + "step": 11890, + "token_acc": 0.29925059508512897 + }, + { + "epoch": 6.970389914980944, + "grad_norm": 0.21365373519726502, + "learning_rate": 0.000387769885758736, + "loss": 2.98005747795105, + "step": 11891, + "token_acc": 0.30032944228274966 + }, + { + "epoch": 6.970976253298153, + "grad_norm": 0.15706201433295597, + "learning_rate": 0.0003877496661436006, + "loss": 3.001300096511841, + "step": 11892, + "token_acc": 0.2984726165203491 + }, + { + "epoch": 6.971562591615362, + "grad_norm": 0.18745137940522882, + "learning_rate": 0.00038772944523449284, + "loss": 2.9794063568115234, + "step": 11893, + "token_acc": 0.30212146477680285 + }, + { + "epoch": 6.972148929932571, + "grad_norm": 0.17064544968015227, + "learning_rate": 0.0003877092230316024, + "loss": 2.981074810028076, + "step": 11894, + "token_acc": 0.3007713307507566 + }, + { + "epoch": 6.97273526824978, + "grad_norm": 0.1793925047344118, + "learning_rate": 0.0003876889995351194, + "loss": 3.0228986740112305, + "step": 11895, + "token_acc": 0.29352387025114673 + }, + { + "epoch": 6.973321606566989, + "grad_norm": 0.20281970034469948, + "learning_rate": 0.00038766877474523376, + "loss": 3.0035018920898438, + "step": 11896, + "token_acc": 0.29833465995430913 + }, + { + "epoch": 6.973907944884198, + "grad_norm": 0.1522896412984321, + "learning_rate": 0.0003876485486621355, + "loss": 2.9274332523345947, + "step": 11897, + "token_acc": 0.30709156552670497 + }, + { + "epoch": 6.974494283201407, + "grad_norm": 0.18194504783118648, + "learning_rate": 0.00038762832128601466, + "loss": 2.989865303039551, + "step": 11898, + "token_acc": 0.3004782098239969 + }, + { + "epoch": 6.975080621518616, + "grad_norm": 0.16864427036411161, + "learning_rate": 0.00038760809261706116, + "loss": 2.9992339611053467, + "step": 11899, + "token_acc": 0.2993785222640501 + }, + { + "epoch": 6.975666959835825, + "grad_norm": 0.15783380818870318, + "learning_rate": 0.000387587862655465, + "loss": 2.9351344108581543, + "step": 11900, + "token_acc": 0.3083638603675362 + }, + { + "epoch": 6.9762532981530345, + "grad_norm": 0.1965535902373825, + "learning_rate": 0.00038756763140141635, + "loss": 2.972707986831665, + "step": 11901, + "token_acc": 0.30240944588075375 + }, + { + "epoch": 6.976839636470244, + "grad_norm": 0.1816684257278128, + "learning_rate": 0.0003875473988551052, + "loss": 3.009921073913574, + "step": 11902, + "token_acc": 0.2965945574681738 + }, + { + "epoch": 6.977425974787453, + "grad_norm": 0.16362313052834068, + "learning_rate": 0.00038752716501672156, + "loss": 2.986109733581543, + "step": 11903, + "token_acc": 0.3000424877221039 + }, + { + "epoch": 6.978012313104662, + "grad_norm": 0.17285328332209363, + "learning_rate": 0.0003875069298864555, + "loss": 2.9464831352233887, + "step": 11904, + "token_acc": 0.3053131051463893 + }, + { + "epoch": 6.97859865142187, + "grad_norm": 0.20766953746005248, + "learning_rate": 0.0003874866934644972, + "loss": 3.0101985931396484, + "step": 11905, + "token_acc": 0.29603603129033107 + }, + { + "epoch": 6.979184989739079, + "grad_norm": 0.18758376937072027, + "learning_rate": 0.00038746645575103657, + "loss": 2.9682278633117676, + "step": 11906, + "token_acc": 0.30467352774237766 + }, + { + "epoch": 6.979771328056288, + "grad_norm": 0.1686147319226834, + "learning_rate": 0.0003874462167462639, + "loss": 3.0142905712127686, + "step": 11907, + "token_acc": 0.2958079886455799 + }, + { + "epoch": 6.980357666373497, + "grad_norm": 0.20476151240962562, + "learning_rate": 0.0003874259764503692, + "loss": 2.950381278991699, + "step": 11908, + "token_acc": 0.3054970148427951 + }, + { + "epoch": 6.9809440046907065, + "grad_norm": 0.16670379881738911, + "learning_rate": 0.00038740573486354264, + "loss": 2.9987053871154785, + "step": 11909, + "token_acc": 0.2965705992979261 + }, + { + "epoch": 6.981530343007916, + "grad_norm": 0.18445184539402162, + "learning_rate": 0.0003873854919859744, + "loss": 3.0164108276367188, + "step": 11910, + "token_acc": 0.29492443434961124 + }, + { + "epoch": 6.982116681325125, + "grad_norm": 0.2223049892643541, + "learning_rate": 0.00038736524781785454, + "loss": 2.9460744857788086, + "step": 11911, + "token_acc": 0.3061510955374204 + }, + { + "epoch": 6.982703019642334, + "grad_norm": 0.15233253597623456, + "learning_rate": 0.0003873450023593733, + "loss": 2.989528179168701, + "step": 11912, + "token_acc": 0.2992863288960875 + }, + { + "epoch": 6.983289357959543, + "grad_norm": 0.20323017103114738, + "learning_rate": 0.00038732475561072084, + "loss": 2.9950332641601562, + "step": 11913, + "token_acc": 0.29852691545661414 + }, + { + "epoch": 6.983875696276751, + "grad_norm": 0.17552907008160606, + "learning_rate": 0.0003873045075720873, + "loss": 2.9589099884033203, + "step": 11914, + "token_acc": 0.30289009001581846 + }, + { + "epoch": 6.98446203459396, + "grad_norm": 0.19893439210842653, + "learning_rate": 0.000387284258243663, + "loss": 3.0079078674316406, + "step": 11915, + "token_acc": 0.29869623787723165 + }, + { + "epoch": 6.985048372911169, + "grad_norm": 0.1828970980134003, + "learning_rate": 0.0003872640076256381, + "loss": 2.97236967086792, + "step": 11916, + "token_acc": 0.3018000518000518 + }, + { + "epoch": 6.9856347112283785, + "grad_norm": 0.16028155127030871, + "learning_rate": 0.00038724375571820276, + "loss": 2.968897819519043, + "step": 11917, + "token_acc": 0.30180905632655997 + }, + { + "epoch": 6.986221049545588, + "grad_norm": 0.15967885963589842, + "learning_rate": 0.00038722350252154723, + "loss": 2.955930709838867, + "step": 11918, + "token_acc": 0.30374130276323635 + }, + { + "epoch": 6.986807387862797, + "grad_norm": 0.14767918044053518, + "learning_rate": 0.00038720324803586195, + "loss": 2.9799039363861084, + "step": 11919, + "token_acc": 0.30011162411802766 + }, + { + "epoch": 6.987393726180006, + "grad_norm": 0.15775098525841819, + "learning_rate": 0.0003871829922613369, + "loss": 2.999072551727295, + "step": 11920, + "token_acc": 0.2972999097002302 + }, + { + "epoch": 6.987980064497215, + "grad_norm": 0.15716450229005435, + "learning_rate": 0.0003871627351981625, + "loss": 2.962615966796875, + "step": 11921, + "token_acc": 0.3036182916892644 + }, + { + "epoch": 6.988566402814424, + "grad_norm": 0.15921029241824672, + "learning_rate": 0.00038714247684652916, + "loss": 2.9847588539123535, + "step": 11922, + "token_acc": 0.29856864871516803 + }, + { + "epoch": 6.989152741131633, + "grad_norm": 0.1533614395787824, + "learning_rate": 0.000387122217206627, + "loss": 3.0492870807647705, + "step": 11923, + "token_acc": 0.29162859663869933 + }, + { + "epoch": 6.989739079448842, + "grad_norm": 0.17076873290623834, + "learning_rate": 0.0003871019562786463, + "loss": 3.0162100791931152, + "step": 11924, + "token_acc": 0.29394284104132207 + }, + { + "epoch": 6.990325417766051, + "grad_norm": 0.15662061832022398, + "learning_rate": 0.00038708169406277747, + "loss": 3.0127248764038086, + "step": 11925, + "token_acc": 0.2971713852698073 + }, + { + "epoch": 6.99091175608326, + "grad_norm": 0.15796574300856658, + "learning_rate": 0.00038706143055921096, + "loss": 2.9685285091400146, + "step": 11926, + "token_acc": 0.3020518715235113 + }, + { + "epoch": 6.991498094400469, + "grad_norm": 0.18022603623534778, + "learning_rate": 0.000387041165768137, + "loss": 2.9645304679870605, + "step": 11927, + "token_acc": 0.30317055132762366 + }, + { + "epoch": 6.992084432717678, + "grad_norm": 0.15038844098152293, + "learning_rate": 0.00038702089968974584, + "loss": 2.9493703842163086, + "step": 11928, + "token_acc": 0.30412106386245824 + }, + { + "epoch": 6.992670771034887, + "grad_norm": 0.16580604505540705, + "learning_rate": 0.000387000632324228, + "loss": 3.0014567375183105, + "step": 11929, + "token_acc": 0.2982506467360826 + }, + { + "epoch": 6.993257109352096, + "grad_norm": 0.15455303867112485, + "learning_rate": 0.00038698036367177383, + "loss": 2.9551515579223633, + "step": 11930, + "token_acc": 0.30312639816639064 + }, + { + "epoch": 6.993843447669305, + "grad_norm": 0.15476251903111912, + "learning_rate": 0.0003869600937325737, + "loss": 2.9638864994049072, + "step": 11931, + "token_acc": 0.3029672111977141 + }, + { + "epoch": 6.994429785986514, + "grad_norm": 0.16329109158929905, + "learning_rate": 0.0003869398225068181, + "loss": 2.987675428390503, + "step": 11932, + "token_acc": 0.2980661975777248 + }, + { + "epoch": 6.995016124303723, + "grad_norm": 0.2111594294779838, + "learning_rate": 0.00038691954999469746, + "loss": 2.9428184032440186, + "step": 11933, + "token_acc": 0.30638792639981066 + }, + { + "epoch": 6.9956024626209325, + "grad_norm": 0.2688818403120885, + "learning_rate": 0.000386899276196402, + "loss": 2.9906678199768066, + "step": 11934, + "token_acc": 0.2982298359770297 + }, + { + "epoch": 6.996188800938142, + "grad_norm": 0.23164907186883807, + "learning_rate": 0.00038687900111212235, + "loss": 2.998396873474121, + "step": 11935, + "token_acc": 0.29772230793123544 + }, + { + "epoch": 6.99677513925535, + "grad_norm": 0.1452251209236702, + "learning_rate": 0.00038685872474204905, + "loss": 2.9725193977355957, + "step": 11936, + "token_acc": 0.30138719523837404 + }, + { + "epoch": 6.997361477572559, + "grad_norm": 0.19152440710098131, + "learning_rate": 0.0003868384470863724, + "loss": 2.9761931896209717, + "step": 11937, + "token_acc": 0.3008268406801465 + }, + { + "epoch": 6.997947815889768, + "grad_norm": 0.17738056247886136, + "learning_rate": 0.0003868181681452829, + "loss": 2.9518702030181885, + "step": 11938, + "token_acc": 0.3045169561020169 + }, + { + "epoch": 6.998534154206977, + "grad_norm": 0.14419577525768673, + "learning_rate": 0.000386797887918971, + "loss": 2.980222225189209, + "step": 11939, + "token_acc": 0.3002604066448592 + }, + { + "epoch": 6.999120492524186, + "grad_norm": 0.21397547315177062, + "learning_rate": 0.0003867776064076274, + "loss": 2.9301810264587402, + "step": 11940, + "token_acc": 0.3088196568013629 + }, + { + "epoch": 6.999706830841395, + "grad_norm": 0.17274637573927223, + "learning_rate": 0.00038675732361144244, + "loss": 2.983229160308838, + "step": 11941, + "token_acc": 0.29939774837093763 + }, + { + "epoch": 7.0, + "grad_norm": 0.19584859849622846, + "learning_rate": 0.00038673703953060677, + "loss": 3.0100951194763184, + "step": 11942, + "token_acc": 0.2968003427027383 + }, + { + "epoch": 7.0, + "eval_loss": 3.0641727447509766, + "eval_runtime": 6.4411, + "eval_samples_per_second": 39.744, + "eval_steps_per_second": 4.968, + "eval_token_acc": 0.29074987146860415, + "step": 11942 + } + ], + "logging_steps": 1, + "max_steps": 34120, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": -34120, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2857968356818944.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}