{ "best_global_step": 210, "best_metric": 0.9432340960217255, "best_model_checkpoint": "/content/runs/modernbert-seeks_guidance/checkpoint-210", "epoch": 8.0, "eval_steps": 10, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03347280334728033, "grad_norm": 153.9114990234375, "learning_rate": 7.983333333333334e-05, "loss": 4.027193069458008, "step": 2 }, { "epoch": 0.06694560669456066, "grad_norm": 18.542333602905273, "learning_rate": 7.950000000000001e-05, "loss": 1.8763885498046875, "step": 4 }, { "epoch": 0.100418410041841, "grad_norm": 22.590063095092773, "learning_rate": 7.916666666666668e-05, "loss": 1.6400413513183594, "step": 6 }, { "epoch": 0.13389121338912133, "grad_norm": 25.844432830810547, "learning_rate": 7.883333333333334e-05, "loss": 1.8122658729553223, "step": 8 }, { "epoch": 0.16736401673640167, "grad_norm": 11.009317398071289, "learning_rate": 7.85e-05, "loss": 1.7578697204589844, "step": 10 }, { "epoch": 0.16736401673640167, "eval_accuracy": 0.8490566037735849, "eval_f1": 0.8464171564078771, "eval_loss": 0.33386504650115967, "eval_precision": 0.8669063968979485, "eval_recall": 0.8490566037735849, "eval_runtime": 3.7101, "eval_samples_per_second": 28.571, "eval_steps_per_second": 1.887, "step": 10 }, { "epoch": 0.200836820083682, "grad_norm": 17.16607093811035, "learning_rate": 7.816666666666666e-05, "loss": 1.0902986526489258, "step": 12 }, { "epoch": 0.23430962343096234, "grad_norm": 22.900856018066406, "learning_rate": 7.783333333333333e-05, "loss": 1.3127965927124023, "step": 14 }, { "epoch": 0.26778242677824265, "grad_norm": 63.955284118652344, "learning_rate": 7.75e-05, "loss": 2.1956119537353516, "step": 16 }, { "epoch": 0.301255230125523, "grad_norm": 17.79568099975586, "learning_rate": 7.716666666666667e-05, "loss": 1.2277936935424805, "step": 18 }, { "epoch": 0.33472803347280333, "grad_norm": 20.161556243896484, "learning_rate": 7.683333333333335e-05, "loss": 0.7839030027389526, "step": 20 }, { "epoch": 0.33472803347280333, "eval_accuracy": 0.8679245283018868, "eval_f1": 0.866249658189773, "eval_loss": 0.3612724840641022, "eval_precision": 0.8807391412085384, "eval_recall": 0.8679245283018868, "eval_runtime": 3.6991, "eval_samples_per_second": 28.656, "eval_steps_per_second": 1.892, "step": 20 }, { "epoch": 0.3682008368200837, "grad_norm": 28.197111129760742, "learning_rate": 7.650000000000002e-05, "loss": 1.6678202152252197, "step": 22 }, { "epoch": 0.401673640167364, "grad_norm": 31.67881965637207, "learning_rate": 7.616666666666667e-05, "loss": 1.7228145599365234, "step": 24 }, { "epoch": 0.4351464435146444, "grad_norm": 19.37973403930664, "learning_rate": 7.583333333333334e-05, "loss": 1.5142822265625, "step": 26 }, { "epoch": 0.4686192468619247, "grad_norm": 10.514019966125488, "learning_rate": 7.55e-05, "loss": 1.3633956909179688, "step": 28 }, { "epoch": 0.502092050209205, "grad_norm": 15.489032745361328, "learning_rate": 7.516666666666667e-05, "loss": 0.9567184448242188, "step": 30 }, { "epoch": 0.502092050209205, "eval_accuracy": 0.839622641509434, "eval_f1": 0.8389789004397634, "eval_loss": 0.3083064556121826, "eval_precision": 0.8506357670221494, "eval_recall": 0.839622641509434, "eval_runtime": 3.7037, "eval_samples_per_second": 28.62, "eval_steps_per_second": 1.89, "step": 30 }, { "epoch": 0.5355648535564853, "grad_norm": 27.374549865722656, "learning_rate": 7.483333333333334e-05, "loss": 1.3344135284423828, "step": 32 }, { "epoch": 0.5690376569037657, "grad_norm": 8.501946449279785, "learning_rate": 7.450000000000001e-05, "loss": 0.9554920196533203, "step": 34 }, { "epoch": 0.602510460251046, "grad_norm": 14.161578178405762, "learning_rate": 7.416666666666668e-05, "loss": 1.7361631393432617, "step": 36 }, { "epoch": 0.6359832635983264, "grad_norm": 8.940046310424805, "learning_rate": 7.383333333333334e-05, "loss": 1.5206122398376465, "step": 38 }, { "epoch": 0.6694560669456067, "grad_norm": 13.137919425964355, "learning_rate": 7.35e-05, "loss": 1.2580509185791016, "step": 40 }, { "epoch": 0.6694560669456067, "eval_accuracy": 0.8773584905660378, "eval_f1": 0.8765088822685645, "eval_loss": 0.3498491644859314, "eval_precision": 0.8835071653848282, "eval_recall": 0.8773584905660378, "eval_runtime": 3.7168, "eval_samples_per_second": 28.519, "eval_steps_per_second": 1.883, "step": 40 }, { "epoch": 0.702928870292887, "grad_norm": 22.15394401550293, "learning_rate": 7.316666666666667e-05, "loss": 2.0802297592163086, "step": 42 }, { "epoch": 0.7364016736401674, "grad_norm": 45.23511505126953, "learning_rate": 7.283333333333333e-05, "loss": 1.8975982666015625, "step": 44 }, { "epoch": 0.7698744769874477, "grad_norm": 28.652149200439453, "learning_rate": 7.25e-05, "loss": 2.2509384155273438, "step": 46 }, { "epoch": 0.803347280334728, "grad_norm": 13.515350341796875, "learning_rate": 7.216666666666667e-05, "loss": 1.2774429321289062, "step": 48 }, { "epoch": 0.8368200836820083, "grad_norm": 7.222679138183594, "learning_rate": 7.183333333333334e-05, "loss": 0.6584987640380859, "step": 50 }, { "epoch": 0.8368200836820083, "eval_accuracy": 0.9150943396226415, "eval_f1": 0.9149349565111795, "eval_loss": 0.29712238907814026, "eval_precision": 0.9162532530904359, "eval_recall": 0.9150943396226415, "eval_runtime": 3.6988, "eval_samples_per_second": 28.658, "eval_steps_per_second": 1.893, "step": 50 }, { "epoch": 0.8702928870292888, "grad_norm": 12.111334800720215, "learning_rate": 7.15e-05, "loss": 0.7637321949005127, "step": 52 }, { "epoch": 0.9037656903765691, "grad_norm": 18.626867294311523, "learning_rate": 7.116666666666667e-05, "loss": 1.493333101272583, "step": 54 }, { "epoch": 0.9372384937238494, "grad_norm": 11.713019371032715, "learning_rate": 7.083333333333334e-05, "loss": 1.2619588375091553, "step": 56 }, { "epoch": 0.9707112970711297, "grad_norm": 8.967315673828125, "learning_rate": 7.05e-05, "loss": 0.9338784217834473, "step": 58 }, { "epoch": 1.0, "grad_norm": 7.203049659729004, "learning_rate": 7.016666666666667e-05, "loss": 0.5685266256332397, "step": 60 }, { "epoch": 1.0, "eval_accuracy": 0.8773584905660378, "eval_f1": 0.8771282705161484, "eval_loss": 0.3064689040184021, "eval_precision": 0.8783140858815874, "eval_recall": 0.8773584905660378, "eval_runtime": 3.6996, "eval_samples_per_second": 28.651, "eval_steps_per_second": 1.892, "step": 60 }, { "epoch": 1.0334728033472804, "grad_norm": 5.976049423217773, "learning_rate": 6.983333333333334e-05, "loss": 0.6323506832122803, "step": 62 }, { "epoch": 1.0669456066945606, "grad_norm": 25.86642074584961, "learning_rate": 6.950000000000001e-05, "loss": 1.2498211860656738, "step": 64 }, { "epoch": 1.100418410041841, "grad_norm": 12.371410369873047, "learning_rate": 6.916666666666668e-05, "loss": 1.057340145111084, "step": 66 }, { "epoch": 1.1338912133891212, "grad_norm": 26.944753646850586, "learning_rate": 6.883333333333334e-05, "loss": 0.8938901424407959, "step": 68 }, { "epoch": 1.1673640167364017, "grad_norm": 19.103439331054688, "learning_rate": 6.85e-05, "loss": 1.2235288619995117, "step": 70 }, { "epoch": 1.1673640167364017, "eval_accuracy": 0.8584905660377359, "eval_f1": 0.8585283522684334, "eval_loss": 0.37047046422958374, "eval_precision": 0.8599419448476053, "eval_recall": 0.8584905660377359, "eval_runtime": 3.7064, "eval_samples_per_second": 28.599, "eval_steps_per_second": 1.889, "step": 70 }, { "epoch": 1.200836820083682, "grad_norm": 22.506622314453125, "learning_rate": 6.816666666666667e-05, "loss": 1.2068042755126953, "step": 72 }, { "epoch": 1.2343096234309623, "grad_norm": 26.553388595581055, "learning_rate": 6.783333333333333e-05, "loss": 1.0496406555175781, "step": 74 }, { "epoch": 1.2677824267782427, "grad_norm": 7.650374412536621, "learning_rate": 6.75e-05, "loss": 0.7221107482910156, "step": 76 }, { "epoch": 1.301255230125523, "grad_norm": 5.790910720825195, "learning_rate": 6.716666666666667e-05, "loss": 1.0422630310058594, "step": 78 }, { "epoch": 1.3347280334728033, "grad_norm": 7.1742167472839355, "learning_rate": 6.683333333333334e-05, "loss": 0.48769330978393555, "step": 80 }, { "epoch": 1.3347280334728033, "eval_accuracy": 0.8962264150943396, "eval_f1": 0.896180133056651, "eval_loss": 0.2511148750782013, "eval_precision": 0.8962938005390837, "eval_recall": 0.8962264150943396, "eval_runtime": 3.7059, "eval_samples_per_second": 28.603, "eval_steps_per_second": 1.889, "step": 80 }, { "epoch": 1.3682008368200838, "grad_norm": 20.29821014404297, "learning_rate": 6.65e-05, "loss": 1.1386289596557617, "step": 82 }, { "epoch": 1.401673640167364, "grad_norm": 12.427876472473145, "learning_rate": 6.616666666666667e-05, "loss": 0.8347885608673096, "step": 84 }, { "epoch": 1.4351464435146444, "grad_norm": 22.335369110107422, "learning_rate": 6.583333333333334e-05, "loss": 1.108169436454773, "step": 86 }, { "epoch": 1.4686192468619246, "grad_norm": 26.64649200439453, "learning_rate": 6.55e-05, "loss": 1.1676079034805298, "step": 88 }, { "epoch": 1.502092050209205, "grad_norm": 8.125500679016113, "learning_rate": 6.516666666666667e-05, "loss": 0.8083582520484924, "step": 90 }, { "epoch": 1.502092050209205, "eval_accuracy": 0.9150943396226415, "eval_f1": 0.9149349565111795, "eval_loss": 0.27510833740234375, "eval_precision": 0.9162532530904359, "eval_recall": 0.9150943396226415, "eval_runtime": 3.6936, "eval_samples_per_second": 28.698, "eval_steps_per_second": 1.895, "step": 90 }, { "epoch": 1.5355648535564854, "grad_norm": 20.720991134643555, "learning_rate": 6.483333333333334e-05, "loss": 0.7836205363273621, "step": 92 }, { "epoch": 1.5690376569037658, "grad_norm": 15.862018585205078, "learning_rate": 6.450000000000001e-05, "loss": 0.7883846163749695, "step": 94 }, { "epoch": 1.602510460251046, "grad_norm": 6.251119136810303, "learning_rate": 6.416666666666668e-05, "loss": 0.6632102727890015, "step": 96 }, { "epoch": 1.6359832635983262, "grad_norm": 29.456884384155273, "learning_rate": 6.383333333333334e-05, "loss": 0.9870191812515259, "step": 98 }, { "epoch": 1.6694560669456067, "grad_norm": 16.34809684753418, "learning_rate": 6.35e-05, "loss": 1.072102427482605, "step": 100 }, { "epoch": 1.6694560669456067, "eval_accuracy": 0.8962264150943396, "eval_f1": 0.8962541447420387, "eval_loss": 0.2538890242576599, "eval_precision": 0.8964414341772833, "eval_recall": 0.8962264150943396, "eval_runtime": 3.7081, "eval_samples_per_second": 28.586, "eval_steps_per_second": 1.888, "step": 100 }, { "epoch": 1.702928870292887, "grad_norm": 11.714218139648438, "learning_rate": 6.316666666666667e-05, "loss": 0.8041658401489258, "step": 102 }, { "epoch": 1.7364016736401675, "grad_norm": 15.745891571044922, "learning_rate": 6.283333333333333e-05, "loss": 0.5025429725646973, "step": 104 }, { "epoch": 1.7698744769874477, "grad_norm": 4.477856159210205, "learning_rate": 6.25e-05, "loss": 0.7340664863586426, "step": 106 }, { "epoch": 1.803347280334728, "grad_norm": 5.773235321044922, "learning_rate": 6.216666666666667e-05, "loss": 0.5497207641601562, "step": 108 }, { "epoch": 1.8368200836820083, "grad_norm": 15.65743350982666, "learning_rate": 6.183333333333334e-05, "loss": 0.775230884552002, "step": 110 }, { "epoch": 1.8368200836820083, "eval_accuracy": 0.8207547169811321, "eval_f1": 0.8195209736368527, "eval_loss": 0.3917406499385834, "eval_precision": 0.8364272671941569, "eval_recall": 0.8207547169811321, "eval_runtime": 3.6963, "eval_samples_per_second": 28.677, "eval_steps_per_second": 1.894, "step": 110 }, { "epoch": 1.8702928870292888, "grad_norm": 21.60720443725586, "learning_rate": 6.15e-05, "loss": 0.49250876903533936, "step": 112 }, { "epoch": 1.9037656903765692, "grad_norm": 16.193058013916016, "learning_rate": 6.116666666666667e-05, "loss": 0.8217854499816895, "step": 114 }, { "epoch": 1.9372384937238494, "grad_norm": 13.398934364318848, "learning_rate": 6.083333333333333e-05, "loss": 0.4902282953262329, "step": 116 }, { "epoch": 1.9707112970711296, "grad_norm": 13.481511116027832, "learning_rate": 6.05e-05, "loss": 0.870509147644043, "step": 118 }, { "epoch": 2.0, "grad_norm": 9.881525993347168, "learning_rate": 6.016666666666667e-05, "loss": 0.13166169822216034, "step": 120 }, { "epoch": 2.0, "eval_accuracy": 0.9245283018867925, "eval_f1": 0.9245283018867925, "eval_loss": 0.25263166427612305, "eval_precision": 0.9245283018867925, "eval_recall": 0.9245283018867925, "eval_runtime": 3.7038, "eval_samples_per_second": 28.619, "eval_steps_per_second": 1.89, "step": 120 }, { "epoch": 2.0334728033472804, "grad_norm": 24.188133239746094, "learning_rate": 5.983333333333334e-05, "loss": 0.3535723388195038, "step": 122 }, { "epoch": 2.066945606694561, "grad_norm": 14.360116004943848, "learning_rate": 5.950000000000001e-05, "loss": 0.47157666087150574, "step": 124 }, { "epoch": 2.100418410041841, "grad_norm": 11.354023933410645, "learning_rate": 5.916666666666668e-05, "loss": 0.8223622441291809, "step": 126 }, { "epoch": 2.1338912133891212, "grad_norm": 2.063201665878296, "learning_rate": 5.8833333333333345e-05, "loss": 0.16897237300872803, "step": 128 }, { "epoch": 2.1673640167364017, "grad_norm": 57.08894348144531, "learning_rate": 5.85e-05, "loss": 0.715202808380127, "step": 130 }, { "epoch": 2.1673640167364017, "eval_accuracy": 0.8679245283018868, "eval_f1": 0.867783422028705, "eval_loss": 0.4883545935153961, "eval_precision": 0.8732342986847171, "eval_recall": 0.8679245283018868, "eval_runtime": 3.7389, "eval_samples_per_second": 28.35, "eval_steps_per_second": 1.872, "step": 130 }, { "epoch": 2.200836820083682, "grad_norm": 24.498315811157227, "learning_rate": 5.8166666666666667e-05, "loss": 0.7688882946968079, "step": 132 }, { "epoch": 2.2343096234309625, "grad_norm": 37.9853401184082, "learning_rate": 5.7833333333333334e-05, "loss": 0.41721656918525696, "step": 134 }, { "epoch": 2.2677824267782425, "grad_norm": 11.750936508178711, "learning_rate": 5.75e-05, "loss": 0.7180262804031372, "step": 136 }, { "epoch": 2.301255230125523, "grad_norm": 17.77412986755371, "learning_rate": 5.716666666666667e-05, "loss": 1.0747008323669434, "step": 138 }, { "epoch": 2.3347280334728033, "grad_norm": 19.369474411010742, "learning_rate": 5.6833333333333344e-05, "loss": 0.34032344818115234, "step": 140 }, { "epoch": 2.3347280334728033, "eval_accuracy": 0.9056603773584906, "eval_f1": 0.9055592991913747, "eval_loss": 0.3055481016635895, "eval_precision": 0.906106235940255, "eval_recall": 0.9056603773584906, "eval_runtime": 3.7023, "eval_samples_per_second": 28.631, "eval_steps_per_second": 1.891, "step": 140 }, { "epoch": 2.3682008368200838, "grad_norm": 13.315515518188477, "learning_rate": 5.650000000000001e-05, "loss": 0.303825318813324, "step": 142 }, { "epoch": 2.401673640167364, "grad_norm": 8.536312103271484, "learning_rate": 5.6166666666666665e-05, "loss": 0.44345003366470337, "step": 144 }, { "epoch": 2.435146443514644, "grad_norm": 15.174897193908691, "learning_rate": 5.583333333333333e-05, "loss": 0.40959596633911133, "step": 146 }, { "epoch": 2.4686192468619246, "grad_norm": 23.133258819580078, "learning_rate": 5.55e-05, "loss": 0.9834498167037964, "step": 148 }, { "epoch": 2.502092050209205, "grad_norm": 10.005050659179688, "learning_rate": 5.516666666666667e-05, "loss": 0.3470939099788666, "step": 150 }, { "epoch": 2.502092050209205, "eval_accuracy": 0.8962264150943396, "eval_f1": 0.8955075157657084, "eval_loss": 0.32574841380119324, "eval_precision": 0.9028108227743042, "eval_recall": 0.8962264150943396, "eval_runtime": 3.7058, "eval_samples_per_second": 28.604, "eval_steps_per_second": 1.889, "step": 150 }, { "epoch": 2.5355648535564854, "grad_norm": 22.247119903564453, "learning_rate": 5.4833333333333336e-05, "loss": 0.6573967933654785, "step": 152 }, { "epoch": 2.569037656903766, "grad_norm": 15.729837417602539, "learning_rate": 5.45e-05, "loss": 0.6016145944595337, "step": 154 }, { "epoch": 2.602510460251046, "grad_norm": 23.62583351135254, "learning_rate": 5.416666666666667e-05, "loss": 0.5384809970855713, "step": 156 }, { "epoch": 2.6359832635983262, "grad_norm": 45.31764602661133, "learning_rate": 5.383333333333334e-05, "loss": 0.9770799875259399, "step": 158 }, { "epoch": 2.6694560669456067, "grad_norm": 8.508702278137207, "learning_rate": 5.35e-05, "loss": 0.4839167892932892, "step": 160 }, { "epoch": 2.6694560669456067, "eval_accuracy": 0.8962264150943396, "eval_f1": 0.8962541447420387, "eval_loss": 0.24312472343444824, "eval_precision": 0.8964414341772833, "eval_recall": 0.8962264150943396, "eval_runtime": 3.7071, "eval_samples_per_second": 28.594, "eval_steps_per_second": 1.888, "step": 160 }, { "epoch": 2.702928870292887, "grad_norm": 26.186450958251953, "learning_rate": 5.316666666666667e-05, "loss": 0.4771866500377655, "step": 162 }, { "epoch": 2.7364016736401675, "grad_norm": 22.855634689331055, "learning_rate": 5.2833333333333335e-05, "loss": 0.5913600325584412, "step": 164 }, { "epoch": 2.7698744769874475, "grad_norm": 16.469451904296875, "learning_rate": 5.25e-05, "loss": 0.4517851769924164, "step": 166 }, { "epoch": 2.803347280334728, "grad_norm": 19.56461524963379, "learning_rate": 5.216666666666667e-05, "loss": 0.4359516203403473, "step": 168 }, { "epoch": 2.8368200836820083, "grad_norm": 17.79911231994629, "learning_rate": 5.183333333333334e-05, "loss": 0.2632891833782196, "step": 170 }, { "epoch": 2.8368200836820083, "eval_accuracy": 0.8490566037735849, "eval_f1": 0.8482479784366577, "eval_loss": 0.4442301094532013, "eval_precision": 0.8628724610784616, "eval_recall": 0.8490566037735849, "eval_runtime": 3.7031, "eval_samples_per_second": 28.625, "eval_steps_per_second": 1.89, "step": 170 }, { "epoch": 2.8702928870292888, "grad_norm": 11.702485084533691, "learning_rate": 5.1500000000000005e-05, "loss": 0.5320336222648621, "step": 172 }, { "epoch": 2.903765690376569, "grad_norm": 3.1553103923797607, "learning_rate": 5.1166666666666666e-05, "loss": 0.04770774394273758, "step": 174 }, { "epoch": 2.937238493723849, "grad_norm": 7.18648099899292, "learning_rate": 5.0833333333333333e-05, "loss": 0.19427719712257385, "step": 176 }, { "epoch": 2.9707112970711296, "grad_norm": 4.937314510345459, "learning_rate": 5.05e-05, "loss": 0.03693216294050217, "step": 178 }, { "epoch": 3.0, "grad_norm": 8.795729637145996, "learning_rate": 5.016666666666667e-05, "loss": 0.15167710185050964, "step": 180 }, { "epoch": 3.0, "eval_accuracy": 0.9245283018867925, "eval_f1": 0.9241216655823031, "eval_loss": 0.35652634501457214, "eval_precision": 0.9297659552531188, "eval_recall": 0.9245283018867925, "eval_runtime": 3.7059, "eval_samples_per_second": 28.603, "eval_steps_per_second": 1.889, "step": 180 }, { "epoch": 3.0334728033472804, "grad_norm": 8.904112815856934, "learning_rate": 4.9833333333333336e-05, "loss": 0.41982442140579224, "step": 182 }, { "epoch": 3.066945606694561, "grad_norm": 0.2715027928352356, "learning_rate": 4.9500000000000004e-05, "loss": 0.022679880261421204, "step": 184 }, { "epoch": 3.100418410041841, "grad_norm": 15.07321548461914, "learning_rate": 4.916666666666667e-05, "loss": 0.11157393455505371, "step": 186 }, { "epoch": 3.1338912133891212, "grad_norm": 0.28451114892959595, "learning_rate": 4.883333333333334e-05, "loss": 0.14321056008338928, "step": 188 }, { "epoch": 3.1673640167364017, "grad_norm": 49.651493072509766, "learning_rate": 4.85e-05, "loss": 1.1760060787200928, "step": 190 }, { "epoch": 3.1673640167364017, "eval_accuracy": 0.839622641509434, "eval_f1": 0.8389789004397634, "eval_loss": 0.8716182112693787, "eval_precision": 0.8506357670221494, "eval_recall": 0.839622641509434, "eval_runtime": 3.7137, "eval_samples_per_second": 28.543, "eval_steps_per_second": 1.885, "step": 190 }, { "epoch": 3.200836820083682, "grad_norm": 6.322359561920166, "learning_rate": 4.816666666666667e-05, "loss": 0.9762416481971741, "step": 192 }, { "epoch": 3.2343096234309625, "grad_norm": 54.51497268676758, "learning_rate": 4.7833333333333335e-05, "loss": 0.3249124586582184, "step": 194 }, { "epoch": 3.2677824267782425, "grad_norm": 42.640785217285156, "learning_rate": 4.75e-05, "loss": 0.5758523344993591, "step": 196 }, { "epoch": 3.301255230125523, "grad_norm": 43.0511474609375, "learning_rate": 4.716666666666667e-05, "loss": 0.32314157485961914, "step": 198 }, { "epoch": 3.3347280334728033, "grad_norm": 32.850379943847656, "learning_rate": 4.683333333333334e-05, "loss": 0.36569491028785706, "step": 200 }, { "epoch": 3.3347280334728033, "eval_accuracy": 0.8773584905660378, "eval_f1": 0.8773039104550302, "eval_loss": 0.481644868850708, "eval_precision": 0.8811657681940702, "eval_recall": 0.8773584905660378, "eval_runtime": 3.7171, "eval_samples_per_second": 28.517, "eval_steps_per_second": 1.883, "step": 200 }, { "epoch": 3.3682008368200838, "grad_norm": 6.7750563621521, "learning_rate": 4.6500000000000005e-05, "loss": 0.20069155097007751, "step": 202 }, { "epoch": 3.401673640167364, "grad_norm": 17.214975357055664, "learning_rate": 4.6166666666666666e-05, "loss": 0.10171210020780563, "step": 204 }, { "epoch": 3.435146443514644, "grad_norm": 12.708967208862305, "learning_rate": 4.5833333333333334e-05, "loss": 0.4347713887691498, "step": 206 }, { "epoch": 3.4686192468619246, "grad_norm": 5.2025227546691895, "learning_rate": 4.55e-05, "loss": 0.4251987338066101, "step": 208 }, { "epoch": 3.502092050209205, "grad_norm": 32.6699104309082, "learning_rate": 4.516666666666667e-05, "loss": 0.5603814125061035, "step": 210 }, { "epoch": 3.502092050209205, "eval_accuracy": 0.9433962264150944, "eval_f1": 0.9432340960217255, "eval_loss": 0.30713358521461487, "eval_precision": 0.9457912893195164, "eval_recall": 0.9433962264150944, "eval_runtime": 3.7057, "eval_samples_per_second": 28.605, "eval_steps_per_second": 1.889, "step": 210 }, { "epoch": 3.5355648535564854, "grad_norm": 8.29829216003418, "learning_rate": 4.483333333333334e-05, "loss": 0.4240536391735077, "step": 212 }, { "epoch": 3.569037656903766, "grad_norm": 29.757633209228516, "learning_rate": 4.4500000000000004e-05, "loss": 0.18742801249027252, "step": 214 }, { "epoch": 3.602510460251046, "grad_norm": 1.6654223203659058, "learning_rate": 4.416666666666667e-05, "loss": 0.047725528478622437, "step": 216 }, { "epoch": 3.6359832635983262, "grad_norm": 59.568790435791016, "learning_rate": 4.383333333333334e-05, "loss": 0.4550500810146332, "step": 218 }, { "epoch": 3.6694560669456067, "grad_norm": 8.077507019042969, "learning_rate": 4.35e-05, "loss": 0.23772548139095306, "step": 220 }, { "epoch": 3.6694560669456067, "eval_accuracy": 0.8679245283018868, "eval_f1": 0.8679245283018868, "eval_loss": 0.44908207654953003, "eval_precision": 0.870399892375475, "eval_recall": 0.8679245283018868, "eval_runtime": 3.708, "eval_samples_per_second": 28.587, "eval_steps_per_second": 1.888, "step": 220 }, { "epoch": 3.702928870292887, "grad_norm": 0.12859027087688446, "learning_rate": 4.316666666666667e-05, "loss": 0.025083430111408234, "step": 222 }, { "epoch": 3.7364016736401675, "grad_norm": 4.374208450317383, "learning_rate": 4.2833333333333335e-05, "loss": 0.011821538209915161, "step": 224 }, { "epoch": 3.7698744769874475, "grad_norm": 18.42703628540039, "learning_rate": 4.25e-05, "loss": 0.19812774658203125, "step": 226 }, { "epoch": 3.803347280334728, "grad_norm": 0.09996317327022552, "learning_rate": 4.216666666666667e-05, "loss": 0.0015517398715019226, "step": 228 }, { "epoch": 3.8368200836820083, "grad_norm": 14.784162521362305, "learning_rate": 4.183333333333334e-05, "loss": 0.6335777640342712, "step": 230 }, { "epoch": 3.8368200836820083, "eval_accuracy": 0.8962264150943396, "eval_f1": 0.8962541447420387, "eval_loss": 0.4861188232898712, "eval_precision": 0.8964414341772833, "eval_recall": 0.8962264150943396, "eval_runtime": 3.7138, "eval_samples_per_second": 28.542, "eval_steps_per_second": 1.885, "step": 230 }, { "epoch": 3.8702928870292888, "grad_norm": 48.12224197387695, "learning_rate": 4.1500000000000006e-05, "loss": 0.07531946897506714, "step": 232 }, { "epoch": 3.903765690376569, "grad_norm": 47.012454986572266, "learning_rate": 4.116666666666667e-05, "loss": 0.2723342478275299, "step": 234 }, { "epoch": 3.937238493723849, "grad_norm": 1.611098289489746, "learning_rate": 4.0833333333333334e-05, "loss": 0.03626517951488495, "step": 236 }, { "epoch": 3.9707112970711296, "grad_norm": 0.5698888301849365, "learning_rate": 4.05e-05, "loss": 0.007945887744426727, "step": 238 }, { "epoch": 4.0, "grad_norm": 0.19228243827819824, "learning_rate": 4.016666666666667e-05, "loss": 0.46452969312667847, "step": 240 }, { "epoch": 4.0, "eval_accuracy": 0.9056603773584906, "eval_f1": 0.9053901600362091, "eval_loss": 0.6092321872711182, "eval_precision": 0.907728840775946, "eval_recall": 0.9056603773584906, "eval_runtime": 3.7164, "eval_samples_per_second": 28.522, "eval_steps_per_second": 1.884, "step": 240 }, { "epoch": 4.03347280334728, "grad_norm": 22.535974502563477, "learning_rate": 3.983333333333334e-05, "loss": 0.08055298775434494, "step": 242 }, { "epoch": 4.066945606694561, "grad_norm": 11.386103630065918, "learning_rate": 3.9500000000000005e-05, "loss": 0.035454049706459045, "step": 244 }, { "epoch": 4.100418410041841, "grad_norm": 10.153188705444336, "learning_rate": 3.9166666666666665e-05, "loss": 0.016098804771900177, "step": 246 }, { "epoch": 4.133891213389122, "grad_norm": 11.270447731018066, "learning_rate": 3.883333333333333e-05, "loss": 0.2176855206489563, "step": 248 }, { "epoch": 4.167364016736402, "grad_norm": 0.02757176198065281, "learning_rate": 3.85e-05, "loss": 0.0005865916609764099, "step": 250 }, { "epoch": 4.167364016736402, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9338382995086952, "eval_loss": 0.5473006963729858, "eval_precision": 0.9352228366948602, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7163, "eval_samples_per_second": 28.523, "eval_steps_per_second": 1.884, "step": 250 }, { "epoch": 4.200836820083682, "grad_norm": 7.626204490661621, "learning_rate": 3.8166666666666675e-05, "loss": 0.019215278327465057, "step": 252 }, { "epoch": 4.2343096234309625, "grad_norm": 35.58063888549805, "learning_rate": 3.7833333333333336e-05, "loss": 0.9835410118103027, "step": 254 }, { "epoch": 4.2677824267782425, "grad_norm": 14.793781280517578, "learning_rate": 3.7500000000000003e-05, "loss": 0.06062314659357071, "step": 256 }, { "epoch": 4.301255230125523, "grad_norm": 50.333740234375, "learning_rate": 3.716666666666667e-05, "loss": 0.18287469446659088, "step": 258 }, { "epoch": 4.334728033472803, "grad_norm": 0.20852503180503845, "learning_rate": 3.683333333333333e-05, "loss": 0.001136690378189087, "step": 260 }, { "epoch": 4.334728033472803, "eval_accuracy": 0.8867924528301887, "eval_f1": 0.8868327689082406, "eval_loss": 0.6996495127677917, "eval_precision": 0.8875044499822, "eval_recall": 0.8867924528301887, "eval_runtime": 3.7065, "eval_samples_per_second": 28.598, "eval_steps_per_second": 1.889, "step": 260 }, { "epoch": 4.368200836820083, "grad_norm": 0.04896273836493492, "learning_rate": 3.65e-05, "loss": 0.0016482695937156677, "step": 262 }, { "epoch": 4.401673640167364, "grad_norm": 0.6264411807060242, "learning_rate": 3.616666666666667e-05, "loss": 0.2418299913406372, "step": 264 }, { "epoch": 4.435146443514644, "grad_norm": 1.5253939628601074, "learning_rate": 3.5833333333333335e-05, "loss": 0.006439179182052612, "step": 266 }, { "epoch": 4.468619246861925, "grad_norm": 0.04963093623518944, "learning_rate": 3.55e-05, "loss": 0.0010056868195533752, "step": 268 }, { "epoch": 4.502092050209205, "grad_norm": 26.80730438232422, "learning_rate": 3.516666666666667e-05, "loss": 0.19413813948631287, "step": 270 }, { "epoch": 4.502092050209205, "eval_accuracy": 0.9245283018867925, "eval_f1": 0.9244474393530997, "eval_loss": 0.61537766456604, "eval_precision": 0.9250282039330132, "eval_recall": 0.9245283018867925, "eval_runtime": 3.7061, "eval_samples_per_second": 28.602, "eval_steps_per_second": 1.889, "step": 270 }, { "epoch": 4.535564853556485, "grad_norm": 4.934254169464111, "learning_rate": 3.483333333333334e-05, "loss": 0.015330374240875244, "step": 272 }, { "epoch": 4.569037656903766, "grad_norm": 0.6267944574356079, "learning_rate": 3.4500000000000005e-05, "loss": 0.005184158682823181, "step": 274 }, { "epoch": 4.602510460251046, "grad_norm": 1.4941405057907104, "learning_rate": 3.4166666666666666e-05, "loss": 0.0041460320353507996, "step": 276 }, { "epoch": 4.635983263598327, "grad_norm": 0.0014329368714243174, "learning_rate": 3.3833333333333334e-05, "loss": 0.00018259137868881226, "step": 278 }, { "epoch": 4.669456066945607, "grad_norm": 0.006859190296381712, "learning_rate": 3.35e-05, "loss": 3.684312105178833e-05, "step": 280 }, { "epoch": 4.669456066945607, "eval_accuracy": 0.8679245283018868, "eval_f1": 0.8679245283018868, "eval_loss": 0.9834848046302795, "eval_precision": 0.870399892375475, "eval_recall": 0.8679245283018868, "eval_runtime": 3.7342, "eval_samples_per_second": 28.386, "eval_steps_per_second": 1.875, "step": 280 }, { "epoch": 4.702928870292887, "grad_norm": 0.00331820803694427, "learning_rate": 3.316666666666667e-05, "loss": 0.002377226948738098, "step": 282 }, { "epoch": 4.7364016736401675, "grad_norm": 18.56043243408203, "learning_rate": 3.2833333333333336e-05, "loss": 0.042559750378131866, "step": 284 }, { "epoch": 4.7698744769874475, "grad_norm": 0.001107880030758679, "learning_rate": 3.2500000000000004e-05, "loss": 4.0978193283081055e-05, "step": 286 }, { "epoch": 4.803347280334728, "grad_norm": 0.04220420494675636, "learning_rate": 3.216666666666667e-05, "loss": 5.342811346054077e-05, "step": 288 }, { "epoch": 4.836820083682008, "grad_norm": 8.83952808380127, "learning_rate": 3.183333333333333e-05, "loss": 0.005525052547454834, "step": 290 }, { "epoch": 4.836820083682008, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9335047827599963, "eval_loss": 0.6870636343955994, "eval_precision": 0.9414181375532562, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7055, "eval_samples_per_second": 28.606, "eval_steps_per_second": 1.889, "step": 290 }, { "epoch": 4.870292887029288, "grad_norm": 0.010887798853218555, "learning_rate": 3.15e-05, "loss": 0.00016146153211593628, "step": 292 }, { "epoch": 4.903765690376569, "grad_norm": 0.0005961931310594082, "learning_rate": 3.116666666666667e-05, "loss": 5.476176738739014e-06, "step": 294 }, { "epoch": 4.937238493723849, "grad_norm": 0.08674273639917374, "learning_rate": 3.0833333333333335e-05, "loss": 0.00012561678886413574, "step": 296 }, { "epoch": 4.97071129707113, "grad_norm": 0.009289816953241825, "learning_rate": 3.05e-05, "loss": 1.0974705219268799e-05, "step": 298 }, { "epoch": 5.0, "grad_norm": 0.00011958323011640459, "learning_rate": 3.0166666666666667e-05, "loss": 0.07568688690662384, "step": 300 }, { "epoch": 5.0, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9338382995086952, "eval_loss": 0.6480500102043152, "eval_precision": 0.9352228366948602, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7049, "eval_samples_per_second": 28.611, "eval_steps_per_second": 1.889, "step": 300 }, { "epoch": 5.03347280334728, "grad_norm": 0.022767871618270874, "learning_rate": 2.9833333333333338e-05, "loss": 8.972734212875366e-05, "step": 302 }, { "epoch": 5.066945606694561, "grad_norm": 0.001911279046908021, "learning_rate": 2.9500000000000006e-05, "loss": 2.216547727584839e-05, "step": 304 }, { "epoch": 5.100418410041841, "grad_norm": 0.005798493977636099, "learning_rate": 2.9166666666666666e-05, "loss": 1.2740492820739746e-05, "step": 306 }, { "epoch": 5.133891213389122, "grad_norm": 0.01798299513757229, "learning_rate": 2.8833333333333334e-05, "loss": 4.5612454414367676e-05, "step": 308 }, { "epoch": 5.167364016736402, "grad_norm": 48.38139343261719, "learning_rate": 2.85e-05, "loss": 0.32229170203208923, "step": 310 }, { "epoch": 5.167364016736402, "eval_accuracy": 0.8584905660377359, "eval_f1": 0.8584275889865733, "eval_loss": 1.0875648260116577, "eval_precision": 0.8621967654986522, "eval_recall": 0.8584905660377359, "eval_runtime": 3.7291, "eval_samples_per_second": 28.425, "eval_steps_per_second": 1.877, "step": 310 }, { "epoch": 5.200836820083682, "grad_norm": 35.383583068847656, "learning_rate": 2.8166666666666673e-05, "loss": 0.18194064497947693, "step": 312 }, { "epoch": 5.2343096234309625, "grad_norm": 0.0029588930774480104, "learning_rate": 2.7833333333333333e-05, "loss": 1.0028481483459473e-05, "step": 314 }, { "epoch": 5.2677824267782425, "grad_norm": 0.00013960737851448357, "learning_rate": 2.75e-05, "loss": 2.16066837310791e-06, "step": 316 }, { "epoch": 5.301255230125523, "grad_norm": 0.017387770116329193, "learning_rate": 2.716666666666667e-05, "loss": 2.9422342777252197e-05, "step": 318 }, { "epoch": 5.334728033472803, "grad_norm": 0.4475659430027008, "learning_rate": 2.6833333333333333e-05, "loss": 0.00039640069007873535, "step": 320 }, { "epoch": 5.334728033472803, "eval_accuracy": 0.9056603773584906, "eval_f1": 0.9056603773584906, "eval_loss": 0.7041952610015869, "eval_precision": 0.9056603773584906, "eval_recall": 0.9056603773584906, "eval_runtime": 3.7075, "eval_samples_per_second": 28.591, "eval_steps_per_second": 1.888, "step": 320 }, { "epoch": 5.368200836820083, "grad_norm": 0.0003625499957706779, "learning_rate": 2.65e-05, "loss": 3.3020973205566406e-05, "step": 322 }, { "epoch": 5.401673640167364, "grad_norm": 0.00012655714817810804, "learning_rate": 2.6166666666666668e-05, "loss": 1.6242265701293945e-06, "step": 324 }, { "epoch": 5.435146443514644, "grad_norm": 0.019135398790240288, "learning_rate": 2.5833333333333336e-05, "loss": 2.977997064590454e-05, "step": 326 }, { "epoch": 5.468619246861925, "grad_norm": 0.010481717996299267, "learning_rate": 2.55e-05, "loss": 1.7717480659484863e-05, "step": 328 }, { "epoch": 5.502092050209205, "grad_norm": 0.00013351759116631, "learning_rate": 2.5166666666666667e-05, "loss": 2.6226043701171875e-06, "step": 330 }, { "epoch": 5.502092050209205, "eval_accuracy": 0.9245283018867925, "eval_f1": 0.9243121280289672, "eval_loss": 0.655044674873352, "eval_precision": 0.9267600650477312, "eval_recall": 0.9245283018867925, "eval_runtime": 3.7068, "eval_samples_per_second": 28.596, "eval_steps_per_second": 1.888, "step": 330 }, { "epoch": 5.535564853556485, "grad_norm": 0.0010905138915404677, "learning_rate": 2.4833333333333335e-05, "loss": 5.260109901428223e-06, "step": 332 }, { "epoch": 5.569037656903766, "grad_norm": 0.0005198650760576129, "learning_rate": 2.4500000000000003e-05, "loss": 3.255903720855713e-06, "step": 334 }, { "epoch": 5.602510460251046, "grad_norm": 4.2097148252651095e-05, "learning_rate": 2.4166666666666667e-05, "loss": 1.6391277313232422e-06, "step": 336 }, { "epoch": 5.635983263598327, "grad_norm": 5.612479435512796e-05, "learning_rate": 2.3833333333333334e-05, "loss": 9.42423939704895e-05, "step": 338 }, { "epoch": 5.669456066945607, "grad_norm": 0.002715431386604905, "learning_rate": 2.3500000000000002e-05, "loss": 5.62518835067749e-06, "step": 340 }, { "epoch": 5.669456066945607, "eval_accuracy": 0.9245283018867925, "eval_f1": 0.9241216655823031, "eval_loss": 0.6835985779762268, "eval_precision": 0.9297659552531188, "eval_recall": 0.9245283018867925, "eval_runtime": 3.71, "eval_samples_per_second": 28.572, "eval_steps_per_second": 1.887, "step": 340 }, { "epoch": 5.702928870292887, "grad_norm": 0.00018491320952307433, "learning_rate": 2.316666666666667e-05, "loss": 4.3429434299468994e-05, "step": 342 }, { "epoch": 5.7364016736401675, "grad_norm": 0.18149010837078094, "learning_rate": 2.2833333333333334e-05, "loss": 0.00017549097537994385, "step": 344 }, { "epoch": 5.7698744769874475, "grad_norm": 0.00043266554712317884, "learning_rate": 2.25e-05, "loss": 1.9669532775878906e-05, "step": 346 }, { "epoch": 5.803347280334728, "grad_norm": 0.3249743580818176, "learning_rate": 2.216666666666667e-05, "loss": 0.0005292296409606934, "step": 348 }, { "epoch": 5.836820083682008, "grad_norm": 35.03731918334961, "learning_rate": 2.1833333333333333e-05, "loss": 0.04761844128370285, "step": 350 }, { "epoch": 5.836820083682008, "eval_accuracy": 0.9245283018867925, "eval_f1": 0.9243121280289672, "eval_loss": 0.6467688679695129, "eval_precision": 0.9267600650477312, "eval_recall": 0.9245283018867925, "eval_runtime": 3.7122, "eval_samples_per_second": 28.555, "eval_steps_per_second": 1.886, "step": 350 }, { "epoch": 5.870292887029288, "grad_norm": 6.428731285268441e-05, "learning_rate": 2.15e-05, "loss": 9.47713851928711e-06, "step": 352 }, { "epoch": 5.903765690376569, "grad_norm": 0.0001438381295884028, "learning_rate": 2.116666666666667e-05, "loss": 1.3910233974456787e-05, "step": 354 }, { "epoch": 5.937238493723849, "grad_norm": 1.1610552072525024, "learning_rate": 2.0833333333333336e-05, "loss": 0.0029199421405792236, "step": 356 }, { "epoch": 5.97071129707113, "grad_norm": 203.78492736816406, "learning_rate": 2.05e-05, "loss": 0.312627911567688, "step": 358 }, { "epoch": 6.0, "grad_norm": 0.00013352386304177344, "learning_rate": 2.0166666666666668e-05, "loss": 1.8507241748011438e-06, "step": 360 }, { "epoch": 6.0, "eval_accuracy": 0.9245283018867925, "eval_f1": 0.9244474393530997, "eval_loss": 0.712137758731842, "eval_precision": 0.9250282039330132, "eval_recall": 0.9245283018867925, "eval_runtime": 3.7252, "eval_samples_per_second": 28.455, "eval_steps_per_second": 1.879, "step": 360 }, { "epoch": 6.03347280334728, "grad_norm": 0.0001777389697963372, "learning_rate": 1.9833333333333335e-05, "loss": 2.7939677238464355e-06, "step": 362 }, { "epoch": 6.066945606694561, "grad_norm": 0.0001761111052474007, "learning_rate": 1.95e-05, "loss": 2.1383166313171387e-06, "step": 364 }, { "epoch": 6.100418410041841, "grad_norm": 0.00021096244745422155, "learning_rate": 1.916666666666667e-05, "loss": 3.3155083656311035e-06, "step": 366 }, { "epoch": 6.133891213389122, "grad_norm": 0.0002524006413295865, "learning_rate": 1.8833333333333335e-05, "loss": 2.1010637283325195e-06, "step": 368 }, { "epoch": 6.167364016736402, "grad_norm": 0.00042212067637592554, "learning_rate": 1.8500000000000002e-05, "loss": 1.691281795501709e-06, "step": 370 }, { "epoch": 6.167364016736402, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9338382995086952, "eval_loss": 0.6748734712600708, "eval_precision": 0.9352228366948602, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7068, "eval_samples_per_second": 28.596, "eval_steps_per_second": 1.888, "step": 370 }, { "epoch": 6.200836820083682, "grad_norm": 0.00020465327543206513, "learning_rate": 1.8166666666666667e-05, "loss": 1.8700957298278809e-06, "step": 372 }, { "epoch": 6.2343096234309625, "grad_norm": 6.185871461639181e-05, "learning_rate": 1.7833333333333334e-05, "loss": 2.2649765014648438e-06, "step": 374 }, { "epoch": 6.2677824267782425, "grad_norm": 0.0367373451590538, "learning_rate": 1.7500000000000002e-05, "loss": 2.899765968322754e-05, "step": 376 }, { "epoch": 6.301255230125523, "grad_norm": 0.00019855774007737637, "learning_rate": 1.7166666666666666e-05, "loss": 4.954636096954346e-06, "step": 378 }, { "epoch": 6.334728033472803, "grad_norm": 0.0034095882438123226, "learning_rate": 1.6833333333333334e-05, "loss": 1.884251832962036e-05, "step": 380 }, { "epoch": 6.334728033472803, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.689272403717041, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7099, "eval_samples_per_second": 28.572, "eval_steps_per_second": 1.887, "step": 380 }, { "epoch": 6.368200836820083, "grad_norm": 7.818471203790978e-05, "learning_rate": 1.65e-05, "loss": 3.56137752532959e-06, "step": 382 }, { "epoch": 6.401673640167364, "grad_norm": 0.00011000553058693185, "learning_rate": 1.616666666666667e-05, "loss": 2.8014183044433594e-06, "step": 384 }, { "epoch": 6.435146443514644, "grad_norm": 0.000760717608500272, "learning_rate": 1.5833333333333333e-05, "loss": 2.2277235984802246e-06, "step": 386 }, { "epoch": 6.468619246861925, "grad_norm": 0.0005883196135982871, "learning_rate": 1.55e-05, "loss": 3.688037395477295e-06, "step": 388 }, { "epoch": 6.502092050209205, "grad_norm": 8.678250014781952e-05, "learning_rate": 1.5166666666666667e-05, "loss": 1.3530254364013672e-05, "step": 390 }, { "epoch": 6.502092050209205, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6938135623931885, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7182, "eval_samples_per_second": 28.508, "eval_steps_per_second": 1.883, "step": 390 }, { "epoch": 6.535564853556485, "grad_norm": 0.00010644060967024416, "learning_rate": 1.4833333333333336e-05, "loss": 1.4007091522216797e-06, "step": 392 }, { "epoch": 6.569037656903766, "grad_norm": 0.0003668045101221651, "learning_rate": 1.45e-05, "loss": 2.473592758178711e-06, "step": 394 }, { "epoch": 6.602510460251046, "grad_norm": 0.0006142224883660674, "learning_rate": 1.416666666666667e-05, "loss": 2.9802322387695312e-06, "step": 396 }, { "epoch": 6.635983263598327, "grad_norm": 0.0030253613367676735, "learning_rate": 1.3833333333333334e-05, "loss": 1.2032687664031982e-05, "step": 398 }, { "epoch": 6.669456066945607, "grad_norm": 0.00012326073192525655, "learning_rate": 1.3500000000000001e-05, "loss": 2.346932888031006e-06, "step": 400 }, { "epoch": 6.669456066945607, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6961540579795837, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7152, "eval_samples_per_second": 28.532, "eval_steps_per_second": 1.884, "step": 400 }, { "epoch": 6.702928870292887, "grad_norm": 3.313773049740121e-05, "learning_rate": 1.3166666666666667e-05, "loss": 1.087784767150879e-06, "step": 402 }, { "epoch": 6.7364016736401675, "grad_norm": 0.0002554058446548879, "learning_rate": 1.2833333333333335e-05, "loss": 1.9371509552001953e-06, "step": 404 }, { "epoch": 6.7698744769874475, "grad_norm": 0.00010436172306071967, "learning_rate": 1.25e-05, "loss": 1.080334186553955e-06, "step": 406 }, { "epoch": 6.803347280334728, "grad_norm": 0.0002193980908486992, "learning_rate": 1.2166666666666667e-05, "loss": 6.273388862609863e-06, "step": 408 }, { "epoch": 6.836820083682008, "grad_norm": 0.0015420763520523906, "learning_rate": 1.1833333333333334e-05, "loss": 7.3015689849853516e-06, "step": 410 }, { "epoch": 6.836820083682008, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6988417506217957, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7296, "eval_samples_per_second": 28.421, "eval_steps_per_second": 1.877, "step": 410 }, { "epoch": 6.870292887029288, "grad_norm": 3.193110751453787e-05, "learning_rate": 1.15e-05, "loss": 6.258487701416016e-07, "step": 412 }, { "epoch": 6.903765690376569, "grad_norm": 0.0001242496946360916, "learning_rate": 1.1166666666666668e-05, "loss": 6.854534149169922e-07, "step": 414 }, { "epoch": 6.937238493723849, "grad_norm": 0.0014401256339624524, "learning_rate": 1.0833333333333334e-05, "loss": 3.3080577850341797e-06, "step": 416 }, { "epoch": 6.97071129707113, "grad_norm": 0.001787560642696917, "learning_rate": 1.0500000000000001e-05, "loss": 0.00014481693506240845, "step": 418 }, { "epoch": 7.0, "grad_norm": 0.005672928411513567, "learning_rate": 1.0166666666666667e-05, "loss": 7.63237494538771e-06, "step": 420 }, { "epoch": 7.0, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6962770223617554, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7093, "eval_samples_per_second": 28.577, "eval_steps_per_second": 1.887, "step": 420 }, { "epoch": 7.03347280334728, "grad_norm": 0.015223933383822441, "learning_rate": 9.833333333333333e-06, "loss": 1.2353062629699707e-05, "step": 422 }, { "epoch": 7.066945606694561, "grad_norm": 5.270187102723867e-05, "learning_rate": 9.5e-06, "loss": 2.078711986541748e-06, "step": 424 }, { "epoch": 7.100418410041841, "grad_norm": 5.6049997510854155e-05, "learning_rate": 9.166666666666666e-06, "loss": 9.909272193908691e-07, "step": 426 }, { "epoch": 7.133891213389122, "grad_norm": 0.0028543949592858553, "learning_rate": 8.833333333333334e-06, "loss": 1.2464821338653564e-05, "step": 428 }, { "epoch": 7.167364016736402, "grad_norm": 0.00044102492392994463, "learning_rate": 8.5e-06, "loss": 2.391636371612549e-06, "step": 430 }, { "epoch": 7.167364016736402, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6910688281059265, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7168, "eval_samples_per_second": 28.519, "eval_steps_per_second": 1.883, "step": 430 }, { "epoch": 7.200836820083682, "grad_norm": 0.0004783471522387117, "learning_rate": 8.166666666666668e-06, "loss": 3.5315752029418945e-06, "step": 432 }, { "epoch": 7.2343096234309625, "grad_norm": 5.297744064591825e-05, "learning_rate": 7.833333333333333e-06, "loss": 8.419156074523926e-07, "step": 434 }, { "epoch": 7.2677824267782425, "grad_norm": 0.004397497046738863, "learning_rate": 7.500000000000001e-06, "loss": 1.1071562767028809e-05, "step": 436 }, { "epoch": 7.301255230125523, "grad_norm": 3.487208596197888e-05, "learning_rate": 7.166666666666667e-06, "loss": 2.2649765014648438e-06, "step": 438 }, { "epoch": 7.334728033472803, "grad_norm": 7.298957643797621e-05, "learning_rate": 6.833333333333334e-06, "loss": 1.914799213409424e-06, "step": 440 }, { "epoch": 7.334728033472803, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6907399892807007, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7092, "eval_samples_per_second": 28.578, "eval_steps_per_second": 1.887, "step": 440 }, { "epoch": 7.368200836820083, "grad_norm": 0.0004972554743289948, "learning_rate": 6.5000000000000004e-06, "loss": 1.9073486328125e-06, "step": 442 }, { "epoch": 7.401673640167364, "grad_norm": 0.0031505597289651632, "learning_rate": 6.166666666666667e-06, "loss": 6.586313247680664e-06, "step": 444 }, { "epoch": 7.435146443514644, "grad_norm": 0.00017355683667119592, "learning_rate": 5.833333333333334e-06, "loss": 1.2814998626708984e-06, "step": 446 }, { "epoch": 7.468619246861925, "grad_norm": 0.001673478283919394, "learning_rate": 5.500000000000001e-06, "loss": 4.023313522338867e-06, "step": 448 }, { "epoch": 7.502092050209205, "grad_norm": 0.0006961830076761544, "learning_rate": 5.1666666666666675e-06, "loss": 3.769993782043457e-06, "step": 450 }, { "epoch": 7.502092050209205, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6910889148712158, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7118, "eval_samples_per_second": 28.558, "eval_steps_per_second": 1.886, "step": 450 }, { "epoch": 7.535564853556485, "grad_norm": 5.422241520136595e-05, "learning_rate": 4.833333333333333e-06, "loss": 7.227063179016113e-07, "step": 452 }, { "epoch": 7.569037656903766, "grad_norm": 2.27185373660177e-05, "learning_rate": 4.5e-06, "loss": 6.109476089477539e-07, "step": 454 }, { "epoch": 7.602510460251046, "grad_norm": 1.3216957995609846e-05, "learning_rate": 4.166666666666667e-06, "loss": 1.1920928955078125e-06, "step": 456 }, { "epoch": 7.635983263598327, "grad_norm": 0.0005943789728917181, "learning_rate": 3.833333333333334e-06, "loss": 2.0712614059448242e-06, "step": 458 }, { "epoch": 7.669456066945607, "grad_norm": 0.0003266510902903974, "learning_rate": 3.5e-06, "loss": 1.0356307029724121e-06, "step": 460 }, { "epoch": 7.669456066945607, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6905573010444641, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7067, "eval_samples_per_second": 28.597, "eval_steps_per_second": 1.888, "step": 460 }, { "epoch": 7.702928870292887, "grad_norm": 4.351493771537207e-05, "learning_rate": 3.1666666666666667e-06, "loss": 1.058727502822876e-05, "step": 462 }, { "epoch": 7.7364016736401675, "grad_norm": 0.00025774663663469255, "learning_rate": 2.8333333333333335e-06, "loss": 2.4065375328063965e-06, "step": 464 }, { "epoch": 7.7698744769874475, "grad_norm": 0.0015569524839520454, "learning_rate": 2.5e-06, "loss": 5.640089511871338e-06, "step": 466 }, { "epoch": 7.803347280334728, "grad_norm": 4.590475873555988e-05, "learning_rate": 2.166666666666667e-06, "loss": 3.1068921089172363e-06, "step": 468 }, { "epoch": 7.836820083682008, "grad_norm": 2.451378713885788e-05, "learning_rate": 1.8333333333333333e-06, "loss": 3.501772880554199e-07, "step": 470 }, { "epoch": 7.836820083682008, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.6899929642677307, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.7107, "eval_samples_per_second": 28.566, "eval_steps_per_second": 1.886, "step": 470 }, { "epoch": 7.870292887029288, "grad_norm": 9.816468082135543e-05, "learning_rate": 1.5e-06, "loss": 2.205371856689453e-06, "step": 472 }, { "epoch": 7.903765690376569, "grad_norm": 0.0009466425399295986, "learning_rate": 1.1666666666666668e-06, "loss": 3.606081008911133e-06, "step": 474 }, { "epoch": 7.937238493723849, "grad_norm": 0.00021921261213719845, "learning_rate": 8.333333333333333e-07, "loss": 1.1101365089416504e-06, "step": 476 }, { "epoch": 7.97071129707113, "grad_norm": 6.945950735826045e-05, "learning_rate": 5.000000000000001e-07, "loss": 1.1011958122253418e-05, "step": 478 }, { "epoch": 8.0, "grad_norm": 0.0003524368512444198, "learning_rate": 1.6666666666666668e-07, "loss": 2.1785497210657923e-06, "step": 480 }, { "epoch": 8.0, "eval_accuracy": 0.9339622641509434, "eval_f1": 0.9336958636029195, "eval_loss": 0.687942624092102, "eval_precision": 0.9376538146021329, "eval_recall": 0.9339622641509434, "eval_runtime": 3.71, "eval_samples_per_second": 28.571, "eval_steps_per_second": 1.887, "step": 480 }, { "epoch": 8.0, "step": 480, "total_flos": 3.1041290060383428e+16, "train_loss": 0.40463688724984764, "train_runtime": 1466.2333, "train_samples_per_second": 10.416, "train_steps_per_second": 0.327 } ], "logging_steps": 2, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1041290060383428e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }