| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 1070, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.09345794392523364, |
| "grad_norm": 2.701171636581421, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 0.7223, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.18691588785046728, |
| "grad_norm": 1.4309473037719727, |
| "learning_rate": 3.518518518518519e-05, |
| "loss": 0.313, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2803738317757009, |
| "grad_norm": 1.99528968334198, |
| "learning_rate": 5.370370370370371e-05, |
| "loss": 0.2547, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.37383177570093457, |
| "grad_norm": 1.5343451499938965, |
| "learning_rate": 7.222222222222222e-05, |
| "loss": 0.2301, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.4672897196261682, |
| "grad_norm": 1.0875062942504883, |
| "learning_rate": 9.074074074074075e-05, |
| "loss": 0.1949, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.5607476635514018, |
| "grad_norm": 1.1911485195159912, |
| "learning_rate": 9.999402437003975e-05, |
| "loss": 0.1649, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.6542056074766355, |
| "grad_norm": 0.9195408821105957, |
| "learning_rate": 9.99462278999732e-05, |
| "loss": 0.1468, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.7476635514018691, |
| "grad_norm": 1.2133328914642334, |
| "learning_rate": 9.985068065535225e-05, |
| "loss": 0.1285, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.8411214953271028, |
| "grad_norm": 0.8248424530029297, |
| "learning_rate": 9.970747398351445e-05, |
| "loss": 0.109, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.9345794392523364, |
| "grad_norm": 0.8372947573661804, |
| "learning_rate": 9.951674479629056e-05, |
| "loss": 0.1036, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.02803738317757, |
| "grad_norm": 1.1082693338394165, |
| "learning_rate": 9.927867543911091e-05, |
| "loss": 0.0895, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.1214953271028036, |
| "grad_norm": 0.9761236310005188, |
| "learning_rate": 9.899349351667522e-05, |
| "loss": 0.0844, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.2149532710280373, |
| "grad_norm": 0.6534666419029236, |
| "learning_rate": 9.866147167535254e-05, |
| "loss": 0.0749, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.308411214953271, |
| "grad_norm": 0.946822464466095, |
| "learning_rate": 9.828292734251944e-05, |
| "loss": 0.0713, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.4018691588785046, |
| "grad_norm": 0.8314803838729858, |
| "learning_rate": 9.785822242308562e-05, |
| "loss": 0.0674, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.4953271028037383, |
| "grad_norm": 0.6225787997245789, |
| "learning_rate": 9.738776295349687e-05, |
| "loss": 0.0654, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.588785046728972, |
| "grad_norm": 0.6679856777191162, |
| "learning_rate": 9.687199871354669e-05, |
| "loss": 0.0666, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.6822429906542056, |
| "grad_norm": 0.4407249391078949, |
| "learning_rate": 9.631142279636706e-05, |
| "loss": 0.056, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.7757009345794392, |
| "grad_norm": 0.5206877589225769, |
| "learning_rate": 9.570657113700985e-05, |
| "loss": 0.0595, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.8691588785046729, |
| "grad_norm": 0.7172874212265015, |
| "learning_rate": 9.50580220000696e-05, |
| "loss": 0.0611, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.9626168224299065, |
| "grad_norm": 0.5936960577964783, |
| "learning_rate": 9.436639542683727e-05, |
| "loss": 0.0538, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.05607476635514, |
| "grad_norm": 0.4505106806755066, |
| "learning_rate": 9.363235264251369e-05, |
| "loss": 0.0586, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.149532710280374, |
| "grad_norm": 0.45083609223365784, |
| "learning_rate": 9.285659542404941e-05, |
| "loss": 0.0571, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.2429906542056073, |
| "grad_norm": 0.5857895612716675, |
| "learning_rate": 9.203986542921532e-05, |
| "loss": 0.0496, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.336448598130841, |
| "grad_norm": 0.600140392780304, |
| "learning_rate": 9.11829434875454e-05, |
| "loss": 0.056, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.4299065420560746, |
| "grad_norm": 0.46349039673805237, |
| "learning_rate": 9.02866488538296e-05, |
| "loss": 0.0482, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.5233644859813085, |
| "grad_norm": 0.597632884979248, |
| "learning_rate": 8.93518384248705e-05, |
| "loss": 0.0498, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.616822429906542, |
| "grad_norm": 0.6381852030754089, |
| "learning_rate": 8.837940592025257e-05, |
| "loss": 0.0521, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.710280373831776, |
| "grad_norm": 0.5228458642959595, |
| "learning_rate": 8.737028102790723e-05, |
| "loss": 0.0523, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.803738317757009, |
| "grad_norm": 0.6592262983322144, |
| "learning_rate": 8.632542851529051e-05, |
| "loss": 0.0481, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.897196261682243, |
| "grad_norm": 0.4157472848892212, |
| "learning_rate": 8.524584730702339e-05, |
| "loss": 0.0459, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.9906542056074765, |
| "grad_norm": 0.2829182744026184, |
| "learning_rate": 8.413256952987611e-05, |
| "loss": 0.0479, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.0841121495327104, |
| "grad_norm": 0.6042788624763489, |
| "learning_rate": 8.298665952600999e-05, |
| "loss": 0.0401, |
| "step": 330 |
| }, |
| { |
| "epoch": 3.177570093457944, |
| "grad_norm": 0.31885308027267456, |
| "learning_rate": 8.180921283541986e-05, |
| "loss": 0.0441, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.2710280373831777, |
| "grad_norm": 0.3092094659805298, |
| "learning_rate": 8.060135514854994e-05, |
| "loss": 0.041, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.364485981308411, |
| "grad_norm": 0.39221957325935364, |
| "learning_rate": 7.936424123008464e-05, |
| "loss": 0.0378, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.457943925233645, |
| "grad_norm": 0.5164565443992615, |
| "learning_rate": 7.809905381494316e-05, |
| "loss": 0.0436, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.5514018691588785, |
| "grad_norm": 0.4761320650577545, |
| "learning_rate": 7.68070024775332e-05, |
| "loss": 0.0393, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.6448598130841123, |
| "grad_norm": 0.24083180725574493, |
| "learning_rate": 7.548932247534506e-05, |
| "loss": 0.0432, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.7383177570093458, |
| "grad_norm": 0.32576093077659607, |
| "learning_rate": 7.414727356799154e-05, |
| "loss": 0.04, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.831775700934579, |
| "grad_norm": 0.4475277066230774, |
| "learning_rate": 7.27821388128227e-05, |
| "loss": 0.0423, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.925233644859813, |
| "grad_norm": 0.47092771530151367, |
| "learning_rate": 7.139522333826707e-05, |
| "loss": 0.0408, |
| "step": 420 |
| }, |
| { |
| "epoch": 4.018691588785047, |
| "grad_norm": 0.37364593148231506, |
| "learning_rate": 6.99878530960719e-05, |
| "loss": 0.0361, |
| "step": 430 |
| }, |
| { |
| "epoch": 4.11214953271028, |
| "grad_norm": 0.3749428391456604, |
| "learning_rate": 6.856137359363533e-05, |
| "loss": 0.0347, |
| "step": 440 |
| }, |
| { |
| "epoch": 4.205607476635514, |
| "grad_norm": 0.28443631529808044, |
| "learning_rate": 6.711714860764266e-05, |
| "loss": 0.0378, |
| "step": 450 |
| }, |
| { |
| "epoch": 4.299065420560748, |
| "grad_norm": 0.31837236881256104, |
| "learning_rate": 6.565655888023618e-05, |
| "loss": 0.0363, |
| "step": 460 |
| }, |
| { |
| "epoch": 4.392523364485982, |
| "grad_norm": 0.3948926329612732, |
| "learning_rate": 6.418100079896556e-05, |
| "loss": 0.0388, |
| "step": 470 |
| }, |
| { |
| "epoch": 4.485981308411215, |
| "grad_norm": 0.5089420080184937, |
| "learning_rate": 6.269188506178019e-05, |
| "loss": 0.0377, |
| "step": 480 |
| }, |
| { |
| "epoch": 4.579439252336448, |
| "grad_norm": 0.266495943069458, |
| "learning_rate": 6.11906353283405e-05, |
| "loss": 0.0367, |
| "step": 490 |
| }, |
| { |
| "epoch": 4.672897196261682, |
| "grad_norm": 0.33666715025901794, |
| "learning_rate": 5.967868685893715e-05, |
| "loss": 0.0372, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.766355140186916, |
| "grad_norm": 0.37420037388801575, |
| "learning_rate": 5.815748514231944e-05, |
| "loss": 0.032, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.859813084112149, |
| "grad_norm": 0.33954936265945435, |
| "learning_rate": 5.6628484513745e-05, |
| "loss": 0.0299, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.953271028037383, |
| "grad_norm": 0.2828126847743988, |
| "learning_rate": 5.5093146764571866e-05, |
| "loss": 0.0343, |
| "step": 530 |
| }, |
| { |
| "epoch": 5.046728971962617, |
| "grad_norm": 0.48174425959587097, |
| "learning_rate": 5.355293974472197e-05, |
| "loss": 0.0326, |
| "step": 540 |
| }, |
| { |
| "epoch": 5.140186915887851, |
| "grad_norm": 0.42725300788879395, |
| "learning_rate": 5.2009335959352666e-05, |
| "loss": 0.0338, |
| "step": 550 |
| }, |
| { |
| "epoch": 5.233644859813084, |
| "grad_norm": 0.34893345832824707, |
| "learning_rate": 5.046381116107742e-05, |
| "loss": 0.0322, |
| "step": 560 |
| }, |
| { |
| "epoch": 5.327102803738318, |
| "grad_norm": 0.3411373198032379, |
| "learning_rate": 4.891784293908192e-05, |
| "loss": 0.0312, |
| "step": 570 |
| }, |
| { |
| "epoch": 5.420560747663552, |
| "grad_norm": 0.35624727606773376, |
| "learning_rate": 4.7372909306484276e-05, |
| "loss": 0.0353, |
| "step": 580 |
| }, |
| { |
| "epoch": 5.5140186915887845, |
| "grad_norm": 0.3242332935333252, |
| "learning_rate": 4.5830487287289966e-05, |
| "loss": 0.0288, |
| "step": 590 |
| }, |
| { |
| "epoch": 5.607476635514018, |
| "grad_norm": 0.4070276618003845, |
| "learning_rate": 4.429205150429241e-05, |
| "loss": 0.0322, |
| "step": 600 |
| }, |
| { |
| "epoch": 5.700934579439252, |
| "grad_norm": 0.2987990975379944, |
| "learning_rate": 4.275907276926918e-05, |
| "loss": 0.0293, |
| "step": 610 |
| }, |
| { |
| "epoch": 5.794392523364486, |
| "grad_norm": 0.36411428451538086, |
| "learning_rate": 4.123301667682171e-05, |
| "loss": 0.0296, |
| "step": 620 |
| }, |
| { |
| "epoch": 5.88785046728972, |
| "grad_norm": 0.30948546528816223, |
| "learning_rate": 3.971534220320291e-05, |
| "loss": 0.0274, |
| "step": 630 |
| }, |
| { |
| "epoch": 5.981308411214953, |
| "grad_norm": 0.4233947694301605, |
| "learning_rate": 3.820750031147211e-05, |
| "loss": 0.0316, |
| "step": 640 |
| }, |
| { |
| "epoch": 6.074766355140187, |
| "grad_norm": 0.47963348031044006, |
| "learning_rate": 3.67109325643111e-05, |
| "loss": 0.029, |
| "step": 650 |
| }, |
| { |
| "epoch": 6.168224299065421, |
| "grad_norm": 0.33454999327659607, |
| "learning_rate": 3.522706974582717e-05, |
| "loss": 0.0301, |
| "step": 660 |
| }, |
| { |
| "epoch": 6.261682242990654, |
| "grad_norm": 0.22388812899589539, |
| "learning_rate": 3.375733049366115e-05, |
| "loss": 0.0273, |
| "step": 670 |
| }, |
| { |
| "epoch": 6.355140186915888, |
| "grad_norm": 0.3955516219139099, |
| "learning_rate": 3.2303119942707796e-05, |
| "loss": 0.0259, |
| "step": 680 |
| }, |
| { |
| "epoch": 6.4485981308411215, |
| "grad_norm": 0.33848804235458374, |
| "learning_rate": 3.086582838174551e-05, |
| "loss": 0.029, |
| "step": 690 |
| }, |
| { |
| "epoch": 6.542056074766355, |
| "grad_norm": 0.6161491274833679, |
| "learning_rate": 2.944682992425959e-05, |
| "loss": 0.0273, |
| "step": 700 |
| }, |
| { |
| "epoch": 6.635514018691588, |
| "grad_norm": 0.3765656352043152, |
| "learning_rate": 2.804748119472969e-05, |
| "loss": 0.027, |
| "step": 710 |
| }, |
| { |
| "epoch": 6.728971962616822, |
| "grad_norm": 0.22724078595638275, |
| "learning_rate": 2.6669120031637663e-05, |
| "loss": 0.0259, |
| "step": 720 |
| }, |
| { |
| "epoch": 6.822429906542056, |
| "grad_norm": 0.39581674337387085, |
| "learning_rate": 2.5313064208435423e-05, |
| "loss": 0.0282, |
| "step": 730 |
| }, |
| { |
| "epoch": 6.91588785046729, |
| "grad_norm": 0.4597732126712799, |
| "learning_rate": 2.3980610173696255e-05, |
| "loss": 0.0269, |
| "step": 740 |
| }, |
| { |
| "epoch": 7.009345794392523, |
| "grad_norm": 0.39131224155426025, |
| "learning_rate": 2.2673031811653034e-05, |
| "loss": 0.0309, |
| "step": 750 |
| }, |
| { |
| "epoch": 7.102803738317757, |
| "grad_norm": 0.33914846181869507, |
| "learning_rate": 2.139157922430956e-05, |
| "loss": 0.0249, |
| "step": 760 |
| }, |
| { |
| "epoch": 7.196261682242991, |
| "grad_norm": 0.40823277831077576, |
| "learning_rate": 2.01374775362883e-05, |
| "loss": 0.026, |
| "step": 770 |
| }, |
| { |
| "epoch": 7.289719626168225, |
| "grad_norm": 0.20548699796199799, |
| "learning_rate": 1.8911925723557806e-05, |
| "loss": 0.0214, |
| "step": 780 |
| }, |
| { |
| "epoch": 7.383177570093458, |
| "grad_norm": 0.28237590193748474, |
| "learning_rate": 1.7716095467159393e-05, |
| "loss": 0.0262, |
| "step": 790 |
| }, |
| { |
| "epoch": 7.4766355140186915, |
| "grad_norm": 0.29693859815597534, |
| "learning_rate": 1.6551130033028827e-05, |
| "loss": 0.0233, |
| "step": 800 |
| }, |
| { |
| "epoch": 7.570093457943925, |
| "grad_norm": 0.3963673710823059, |
| "learning_rate": 1.541814317898425e-05, |
| "loss": 0.0247, |
| "step": 810 |
| }, |
| { |
| "epoch": 7.663551401869158, |
| "grad_norm": 0.22003985941410065, |
| "learning_rate": 1.4318218089924962e-05, |
| "loss": 0.0249, |
| "step": 820 |
| }, |
| { |
| "epoch": 7.757009345794392, |
| "grad_norm": 0.2355249971151352, |
| "learning_rate": 1.3252406342259527e-05, |
| "loss": 0.0245, |
| "step": 830 |
| }, |
| { |
| "epoch": 7.850467289719626, |
| "grad_norm": 0.45212939381599426, |
| "learning_rate": 1.2221726898552665e-05, |
| "loss": 0.0228, |
| "step": 840 |
| }, |
| { |
| "epoch": 7.94392523364486, |
| "grad_norm": 0.4631796181201935, |
| "learning_rate": 1.122716513335262e-05, |
| "loss": 0.0215, |
| "step": 850 |
| }, |
| { |
| "epoch": 8.037383177570094, |
| "grad_norm": 0.24740639328956604, |
| "learning_rate": 1.0269671891130123e-05, |
| "loss": 0.0267, |
| "step": 860 |
| }, |
| { |
| "epoch": 8.130841121495328, |
| "grad_norm": 0.21961306035518646, |
| "learning_rate": 9.350162577229432e-06, |
| "loss": 0.0222, |
| "step": 870 |
| }, |
| { |
| "epoch": 8.22429906542056, |
| "grad_norm": 0.2656007409095764, |
| "learning_rate": 8.46951628270098e-06, |
| "loss": 0.0234, |
| "step": 880 |
| }, |
| { |
| "epoch": 8.317757009345794, |
| "grad_norm": 0.1633329689502716, |
| "learning_rate": 7.628574943851852e-06, |
| "loss": 0.0189, |
| "step": 890 |
| }, |
| { |
| "epoch": 8.411214953271028, |
| "grad_norm": 0.15491314232349396, |
| "learning_rate": 6.82814253731801e-06, |
| "loss": 0.0199, |
| "step": 900 |
| }, |
| { |
| "epoch": 8.504672897196262, |
| "grad_norm": 0.464375376701355, |
| "learning_rate": 6.06898431142745e-06, |
| "loss": 0.0216, |
| "step": 910 |
| }, |
| { |
| "epoch": 8.598130841121495, |
| "grad_norm": 0.21681295335292816, |
| "learning_rate": 5.351826054589393e-06, |
| "loss": 0.0203, |
| "step": 920 |
| }, |
| { |
| "epoch": 8.69158878504673, |
| "grad_norm": 0.18899723887443542, |
| "learning_rate": 4.677353401408974e-06, |
| "loss": 0.0184, |
| "step": 930 |
| }, |
| { |
| "epoch": 8.785046728971963, |
| "grad_norm": 0.28756183385849, |
| "learning_rate": 4.04621117719049e-06, |
| "loss": 0.0257, |
| "step": 940 |
| }, |
| { |
| "epoch": 8.878504672897197, |
| "grad_norm": 0.27610504627227783, |
| "learning_rate": 3.459002781456344e-06, |
| "loss": 0.0208, |
| "step": 950 |
| }, |
| { |
| "epoch": 8.97196261682243, |
| "grad_norm": 0.21105413138866425, |
| "learning_rate": 2.9162896110707163e-06, |
| "loss": 0.0219, |
| "step": 960 |
| }, |
| { |
| "epoch": 9.065420560747663, |
| "grad_norm": 0.23299843072891235, |
| "learning_rate": 2.418590523519687e-06, |
| "loss": 0.0217, |
| "step": 970 |
| }, |
| { |
| "epoch": 9.158878504672897, |
| "grad_norm": 0.3126954436302185, |
| "learning_rate": 1.9663813408607845e-06, |
| "loss": 0.0188, |
| "step": 980 |
| }, |
| { |
| "epoch": 9.25233644859813, |
| "grad_norm": 0.32796964049339294, |
| "learning_rate": 1.5600943948163527e-06, |
| "loss": 0.0227, |
| "step": 990 |
| }, |
| { |
| "epoch": 9.345794392523365, |
| "grad_norm": 0.2622697949409485, |
| "learning_rate": 1.2001181134455475e-06, |
| "loss": 0.0205, |
| "step": 1000 |
| }, |
| { |
| "epoch": 9.439252336448599, |
| "grad_norm": 0.23692955076694489, |
| "learning_rate": 8.867966497901282e-07, |
| "loss": 0.0216, |
| "step": 1010 |
| }, |
| { |
| "epoch": 9.532710280373832, |
| "grad_norm": 0.23177599906921387, |
| "learning_rate": 6.204295528491555e-07, |
| "loss": 0.0197, |
| "step": 1020 |
| }, |
| { |
| "epoch": 9.626168224299064, |
| "grad_norm": 0.18032234907150269, |
| "learning_rate": 4.012714811970464e-07, |
| "loss": 0.0223, |
| "step": 1030 |
| }, |
| { |
| "epoch": 9.719626168224298, |
| "grad_norm": 0.4851064682006836, |
| "learning_rate": 2.295319595188805e-07, |
| "loss": 0.0209, |
| "step": 1040 |
| }, |
| { |
| "epoch": 9.813084112149532, |
| "grad_norm": 0.2691369950771332, |
| "learning_rate": 1.0537517829562472e-07, |
| "loss": 0.0195, |
| "step": 1050 |
| }, |
| { |
| "epoch": 9.906542056074766, |
| "grad_norm": 0.19837787747383118, |
| "learning_rate": 2.8919836830887392e-08, |
| "loss": 0.0186, |
| "step": 1060 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.2632281482219696, |
| "learning_rate": 2.3902976920009423e-10, |
| "loss": 0.0186, |
| "step": 1070 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 1070, |
| "total_flos": 0.0, |
| "train_loss": 0.0546069160939377, |
| "train_runtime": 1371.0334, |
| "train_samples_per_second": 77.708, |
| "train_steps_per_second": 0.78 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1070, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 100, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|