{ "best_metric": 0.78049123, "best_model_checkpoint": "/global/D1/homes/sushant/SoccerNetExperiments/Soccer-Video-ChatGPT/November_xvars/swift/output/qwen2-vl-7b-instruct/v7-20241118-100959/checkpoint-5800", "epoch": 5.0, "eval_steps": 100, "global_step": 7270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.51318568, "epoch": 0.000687757909215956, "grad_norm": 0.8106947541236877, "learning_rate": 0.0, "loss": 2.20640945, "memory(GiB)": 68.96, "step": 1, "train_speed(iter/s)": 0.018567 }, { "acc": 0.522241, "epoch": 0.0034387895460797797, "grad_norm": 0.7490503191947937, "learning_rate": 2.7291774109314122e-05, "loss": 2.21162605, "memory(GiB)": 68.96, "step": 5, "train_speed(iter/s)": 0.040393 }, { "acc": 0.54119682, "epoch": 0.0068775790921595595, "grad_norm": 0.7276351451873779, "learning_rate": 3.904570144643008e-05, "loss": 2.13246613, "memory(GiB)": 73.29, "step": 10, "train_speed(iter/s)": 0.052109 }, { "acc": 0.5478312, "epoch": 0.01031636863823934, "grad_norm": 0.763149082660675, "learning_rate": 4.5921308174844174e-05, "loss": 2.00201836, "memory(GiB)": 73.29, "step": 15, "train_speed(iter/s)": 0.056038 }, { "acc": 0.56882777, "epoch": 0.013755158184319119, "grad_norm": 0.7898057103157043, "learning_rate": 5.0799628783546016e-05, "loss": 1.84855347, "memory(GiB)": 73.29, "step": 20, "train_speed(iter/s)": 0.058389 }, { "acc": 0.59600019, "epoch": 0.0171939477303989, "grad_norm": 0.9877901673316956, "learning_rate": 5.4583548218628245e-05, "loss": 1.73883362, "memory(GiB)": 73.29, "step": 25, "train_speed(iter/s)": 0.060811 }, { "acc": 0.60646133, "epoch": 0.02063273727647868, "grad_norm": 0.9179441332817078, "learning_rate": 5.7675235511960126e-05, "loss": 1.64034233, "memory(GiB)": 73.29, "step": 30, "train_speed(iter/s)": 0.06211 }, { "acc": 0.61271744, "epoch": 0.024071526822558458, "grad_norm": 0.9307955503463745, "learning_rate": 6.028921987267401e-05, "loss": 1.5906249, "memory(GiB)": 73.29, "step": 35, "train_speed(iter/s)": 0.06288 }, { "acc": 0.63797045, "epoch": 0.027510316368638238, "grad_norm": 1.0717326402664185, "learning_rate": 6.255355612066197e-05, "loss": 1.44913902, "memory(GiB)": 73.29, "step": 40, "train_speed(iter/s)": 0.064 }, { "acc": 0.64407902, "epoch": 0.030949105914718018, "grad_norm": 1.0732834339141846, "learning_rate": 6.455084224037423e-05, "loss": 1.44504213, "memory(GiB)": 73.29, "step": 45, "train_speed(iter/s)": 0.064348 }, { "acc": 0.64924326, "epoch": 0.0343878954607978, "grad_norm": 1.1162458658218384, "learning_rate": 6.633747555574418e-05, "loss": 1.36141453, "memory(GiB)": 73.29, "step": 50, "train_speed(iter/s)": 0.064587 }, { "acc": 0.6600091, "epoch": 0.03782668500687758, "grad_norm": 1.0633102655410767, "learning_rate": 6.795368198249832e-05, "loss": 1.3460659, "memory(GiB)": 73.29, "step": 55, "train_speed(iter/s)": 0.064754 }, { "acc": 0.65980716, "epoch": 0.04126547455295736, "grad_norm": 1.0164440870285034, "learning_rate": 6.942916284907606e-05, "loss": 1.33123722, "memory(GiB)": 73.29, "step": 60, "train_speed(iter/s)": 0.065266 }, { "acc": 0.68235178, "epoch": 0.04470426409903714, "grad_norm": 1.185117483139038, "learning_rate": 7.078647367172232e-05, "loss": 1.24153843, "memory(GiB)": 73.29, "step": 65, "train_speed(iter/s)": 0.065105 }, { "acc": 0.67718811, "epoch": 0.048143053645116916, "grad_norm": 1.1237530708312988, "learning_rate": 7.204314720978996e-05, "loss": 1.28077126, "memory(GiB)": 73.29, "step": 70, "train_speed(iter/s)": 0.065778 }, { "acc": 0.68681493, "epoch": 0.0515818431911967, "grad_norm": 1.2692396640777588, "learning_rate": 7.321308228415829e-05, "loss": 1.25220881, "memory(GiB)": 73.29, "step": 75, "train_speed(iter/s)": 0.066254 }, { "acc": 0.67568145, "epoch": 0.055020632737276476, "grad_norm": 1.118291974067688, "learning_rate": 7.43074834577779e-05, "loss": 1.2524622, "memory(GiB)": 73.29, "step": 80, "train_speed(iter/s)": 0.066119 }, { "acc": 0.67670813, "epoch": 0.05845942228335626, "grad_norm": 1.1989713907241821, "learning_rate": 7.533551533853211e-05, "loss": 1.24576015, "memory(GiB)": 73.29, "step": 85, "train_speed(iter/s)": 0.066293 }, { "acc": 0.69235625, "epoch": 0.061898211829436035, "grad_norm": 1.2158828973770142, "learning_rate": 7.630476957749017e-05, "loss": 1.20176125, "memory(GiB)": 73.29, "step": 90, "train_speed(iter/s)": 0.066483 }, { "acc": 0.68914762, "epoch": 0.06533700137551582, "grad_norm": 1.1737231016159058, "learning_rate": 7.722160543566566e-05, "loss": 1.21274147, "memory(GiB)": 73.29, "step": 95, "train_speed(iter/s)": 0.066668 }, { "acc": 0.69834042, "epoch": 0.0687757909215956, "grad_norm": 1.351590871810913, "learning_rate": 7.809140289286016e-05, "loss": 1.1592926, "memory(GiB)": 73.29, "step": 100, "train_speed(iter/s)": 0.067074 }, { "epoch": 0.0687757909215956, "eval_acc": 0.6962927970486346, "eval_loss": 1.1645218133926392, "eval_runtime": 1212.4549, "eval_samples_per_second": 3.533, "eval_steps_per_second": 0.064, "step": 100 }, { "acc": 0.68415451, "epoch": 0.07221458046767538, "grad_norm": 1.3831959962844849, "learning_rate": 7.891875393820406e-05, "loss": 1.2015852, "memory(GiB)": 73.29, "step": 105, "train_speed(iter/s)": 0.037875 }, { "acc": 0.68617525, "epoch": 0.07565337001375516, "grad_norm": 1.1734046936035156, "learning_rate": 7.970760931961428e-05, "loss": 1.20146303, "memory(GiB)": 67.64, "step": 110, "train_speed(iter/s)": 0.038604 }, { "acc": 0.71534705, "epoch": 0.07909215955983494, "grad_norm": 1.1629948616027832, "learning_rate": 8.046139264575035e-05, "loss": 1.0837039, "memory(GiB)": 67.64, "step": 115, "train_speed(iter/s)": 0.039388 }, { "acc": 0.69408731, "epoch": 0.08253094910591471, "grad_norm": 1.3171385526657104, "learning_rate": 8.118309018619202e-05, "loss": 1.18307505, "memory(GiB)": 67.64, "step": 120, "train_speed(iter/s)": 0.040077 }, { "acc": 0.70501647, "epoch": 0.0859697386519945, "grad_norm": 1.3044822216033936, "learning_rate": 8.187532232794237e-05, "loss": 1.13398886, "memory(GiB)": 67.64, "step": 125, "train_speed(iter/s)": 0.040793 }, { "acc": 0.70435572, "epoch": 0.08940852819807428, "grad_norm": 1.329248070716858, "learning_rate": 8.254040100883828e-05, "loss": 1.12086363, "memory(GiB)": 67.64, "step": 130, "train_speed(iter/s)": 0.041495 }, { "acc": 0.69323554, "epoch": 0.09284731774415406, "grad_norm": 1.2392340898513794, "learning_rate": 8.318037630590428e-05, "loss": 1.17682867, "memory(GiB)": 67.64, "step": 135, "train_speed(iter/s)": 0.042188 }, { "acc": 0.68262854, "epoch": 0.09628610729023383, "grad_norm": 1.50913667678833, "learning_rate": 8.379707454690589e-05, "loss": 1.20728226, "memory(GiB)": 67.64, "step": 140, "train_speed(iter/s)": 0.042814 }, { "acc": 0.71855187, "epoch": 0.09972489683631362, "grad_norm": 1.3727279901504517, "learning_rate": 8.439212973113382e-05, "loss": 1.06742191, "memory(GiB)": 67.64, "step": 145, "train_speed(iter/s)": 0.043424 }, { "acc": 0.70871205, "epoch": 0.1031636863823934, "grad_norm": 1.3805909156799316, "learning_rate": 8.496700962127424e-05, "loss": 1.11401825, "memory(GiB)": 67.64, "step": 150, "train_speed(iter/s)": 0.044042 }, { "acc": 0.73342023, "epoch": 0.10660247592847318, "grad_norm": 1.2183469533920288, "learning_rate": 8.552303755540939e-05, "loss": 1.00700331, "memory(GiB)": 67.64, "step": 155, "train_speed(iter/s)": 0.044601 }, { "acc": 0.70797634, "epoch": 0.11004126547455295, "grad_norm": 1.4554626941680908, "learning_rate": 8.606141079489386e-05, "loss": 1.09950924, "memory(GiB)": 67.64, "step": 160, "train_speed(iter/s)": 0.045018 }, { "acc": 0.69420943, "epoch": 0.11348005502063274, "grad_norm": 1.3245141506195068, "learning_rate": 8.658321604802837e-05, "loss": 1.15192003, "memory(GiB)": 67.64, "step": 165, "train_speed(iter/s)": 0.045554 }, { "acc": 0.7062036, "epoch": 0.11691884456671252, "grad_norm": 1.271952748298645, "learning_rate": 8.708944267564807e-05, "loss": 1.08263731, "memory(GiB)": 67.64, "step": 170, "train_speed(iter/s)": 0.045943 }, { "acc": 0.69984941, "epoch": 0.1203576341127923, "grad_norm": 1.4456363916397095, "learning_rate": 8.758099398198813e-05, "loss": 1.12532272, "memory(GiB)": 67.64, "step": 175, "train_speed(iter/s)": 0.046356 }, { "acc": 0.71651649, "epoch": 0.12379642365887207, "grad_norm": 1.44161057472229, "learning_rate": 8.805869691460613e-05, "loss": 1.07470217, "memory(GiB)": 67.64, "step": 180, "train_speed(iter/s)": 0.04681 }, { "acc": 0.71227612, "epoch": 0.12723521320495185, "grad_norm": 1.3441652059555054, "learning_rate": 8.852331043501091e-05, "loss": 1.09072762, "memory(GiB)": 67.64, "step": 185, "train_speed(iter/s)": 0.047226 }, { "acc": 0.72131248, "epoch": 0.13067400275103164, "grad_norm": 1.6152911186218262, "learning_rate": 8.897553277278162e-05, "loss": 1.05218563, "memory(GiB)": 67.64, "step": 190, "train_speed(iter/s)": 0.047678 }, { "acc": 0.71243434, "epoch": 0.13411279229711143, "grad_norm": 1.45099937915802, "learning_rate": 8.94160077372524e-05, "loss": 1.08823862, "memory(GiB)": 67.64, "step": 195, "train_speed(iter/s)": 0.048064 }, { "acc": 0.71744361, "epoch": 0.1375515818431912, "grad_norm": 1.4195399284362793, "learning_rate": 8.984533022997609e-05, "loss": 1.05846539, "memory(GiB)": 67.64, "step": 200, "train_speed(iter/s)": 0.048352 }, { "epoch": 0.1375515818431912, "eval_acc": 0.7119831736176722, "eval_loss": 1.0829237699508667, "eval_runtime": 1085.6657, "eval_samples_per_second": 3.945, "eval_steps_per_second": 0.071, "step": 200 }, { "acc": 0.69938354, "epoch": 0.14099037138927098, "grad_norm": 1.3801318407058716, "learning_rate": 9.026405107641496e-05, "loss": 1.1244791, "memory(GiB)": 67.64, "step": 205, "train_speed(iter/s)": 0.038724 }, { "acc": 0.70936947, "epoch": 0.14442916093535077, "grad_norm": 1.3959752321243286, "learning_rate": 9.067268127532e-05, "loss": 1.09259109, "memory(GiB)": 67.64, "step": 210, "train_speed(iter/s)": 0.039167 }, { "acc": 0.70823245, "epoch": 0.14786795048143053, "grad_norm": 1.4662190675735474, "learning_rate": 9.107169574803587e-05, "loss": 1.11423931, "memory(GiB)": 67.64, "step": 215, "train_speed(iter/s)": 0.039589 }, { "acc": 0.71795692, "epoch": 0.15130674002751032, "grad_norm": 1.2458115816116333, "learning_rate": 9.146153665673023e-05, "loss": 1.08119087, "memory(GiB)": 67.64, "step": 220, "train_speed(iter/s)": 0.040035 }, { "acc": 0.70050411, "epoch": 0.15474552957359008, "grad_norm": 1.3731013536453247, "learning_rate": 9.184261634968835e-05, "loss": 1.12374535, "memory(GiB)": 67.64, "step": 225, "train_speed(iter/s)": 0.040423 }, { "acc": 0.7144835, "epoch": 0.15818431911966988, "grad_norm": 1.4678212404251099, "learning_rate": 9.221531998286629e-05, "loss": 1.05655756, "memory(GiB)": 67.64, "step": 230, "train_speed(iter/s)": 0.04076 }, { "acc": 0.70189781, "epoch": 0.16162310866574967, "grad_norm": 1.3910584449768066, "learning_rate": 9.258000785948739e-05, "loss": 1.12144871, "memory(GiB)": 67.64, "step": 235, "train_speed(iter/s)": 0.04114 }, { "acc": 0.73604274, "epoch": 0.16506189821182943, "grad_norm": 1.3721731901168823, "learning_rate": 9.293701752330797e-05, "loss": 0.98678083, "memory(GiB)": 67.64, "step": 240, "train_speed(iter/s)": 0.041584 }, { "acc": 0.71869593, "epoch": 0.16850068775790922, "grad_norm": 1.6655057668685913, "learning_rate": 9.32866656360339e-05, "loss": 1.07134695, "memory(GiB)": 67.64, "step": 245, "train_speed(iter/s)": 0.042014 }, { "acc": 0.71307015, "epoch": 0.171939477303989, "grad_norm": 1.3052812814712524, "learning_rate": 9.36292496650583e-05, "loss": 1.06437588, "memory(GiB)": 67.64, "step": 250, "train_speed(iter/s)": 0.042373 }, { "acc": 0.71391711, "epoch": 0.17537826685006877, "grad_norm": 1.5577940940856934, "learning_rate": 9.396504940406217e-05, "loss": 1.07451763, "memory(GiB)": 67.64, "step": 255, "train_speed(iter/s)": 0.042705 }, { "acc": 0.73166742, "epoch": 0.17881705639614856, "grad_norm": 1.33721923828125, "learning_rate": 9.429432834595424e-05, "loss": 0.99717045, "memory(GiB)": 67.64, "step": 260, "train_speed(iter/s)": 0.043055 }, { "acc": 0.71127567, "epoch": 0.18225584594222832, "grad_norm": 1.549870252609253, "learning_rate": 9.461733492503013e-05, "loss": 1.10144587, "memory(GiB)": 67.64, "step": 265, "train_speed(iter/s)": 0.043386 }, { "acc": 0.72763004, "epoch": 0.1856946354883081, "grad_norm": 1.4047890901565552, "learning_rate": 9.493430364302024e-05, "loss": 1.01531572, "memory(GiB)": 67.64, "step": 270, "train_speed(iter/s)": 0.043699 }, { "acc": 0.72718954, "epoch": 0.1891334250343879, "grad_norm": 1.3470264673233032, "learning_rate": 9.524545609181246e-05, "loss": 1.01689529, "memory(GiB)": 67.64, "step": 275, "train_speed(iter/s)": 0.043985 }, { "acc": 0.72714009, "epoch": 0.19257221458046767, "grad_norm": 1.3996589183807373, "learning_rate": 9.555100188402185e-05, "loss": 1.01372051, "memory(GiB)": 67.64, "step": 280, "train_speed(iter/s)": 0.044244 }, { "acc": 0.71828256, "epoch": 0.19601100412654746, "grad_norm": 1.5369681119918823, "learning_rate": 9.585113950119573e-05, "loss": 1.06217566, "memory(GiB)": 67.64, "step": 285, "train_speed(iter/s)": 0.044532 }, { "acc": 0.72461739, "epoch": 0.19944979367262725, "grad_norm": 1.3398535251617432, "learning_rate": 9.614605706824978e-05, "loss": 1.03975096, "memory(GiB)": 67.64, "step": 290, "train_speed(iter/s)": 0.044835 }, { "acc": 0.71793423, "epoch": 0.202888583218707, "grad_norm": 1.4092602729797363, "learning_rate": 9.64359330617034e-05, "loss": 1.05028229, "memory(GiB)": 67.64, "step": 295, "train_speed(iter/s)": 0.045175 }, { "acc": 0.72302713, "epoch": 0.2063273727647868, "grad_norm": 1.2952080965042114, "learning_rate": 9.67209369583902e-05, "loss": 1.01765738, "memory(GiB)": 67.64, "step": 300, "train_speed(iter/s)": 0.045401 }, { "epoch": 0.2063273727647868, "eval_acc": 0.7200083232105098, "eval_loss": 1.0458483695983887, "eval_runtime": 1091.4981, "eval_samples_per_second": 3.924, "eval_steps_per_second": 0.071, "step": 300 }, { "acc": 0.709624, "epoch": 0.2097661623108666, "grad_norm": 1.5634573698043823, "learning_rate": 9.700122983054879e-05, "loss": 1.07294426, "memory(GiB)": 67.64, "step": 305, "train_speed(iter/s)": 0.039263 }, { "acc": 0.70980182, "epoch": 0.21320495185694635, "grad_norm": 1.372841477394104, "learning_rate": 9.727696489252533e-05, "loss": 1.10122662, "memory(GiB)": 67.64, "step": 310, "train_speed(iter/s)": 0.039598 }, { "acc": 0.73576632, "epoch": 0.21664374140302614, "grad_norm": 1.539969801902771, "learning_rate": 9.754828800373411e-05, "loss": 0.98867779, "memory(GiB)": 67.64, "step": 315, "train_speed(iter/s)": 0.039872 }, { "acc": 0.7245533, "epoch": 0.2200825309491059, "grad_norm": 1.2448300123214722, "learning_rate": 9.781533813200982e-05, "loss": 1.01700201, "memory(GiB)": 67.64, "step": 320, "train_speed(iter/s)": 0.040166 }, { "acc": 0.72238054, "epoch": 0.2235213204951857, "grad_norm": 1.2697371244430542, "learning_rate": 9.807824778103646e-05, "loss": 1.0503273, "memory(GiB)": 67.64, "step": 325, "train_speed(iter/s)": 0.040431 }, { "acc": 0.7078352, "epoch": 0.22696011004126548, "grad_norm": 1.3228161334991455, "learning_rate": 9.833714338514432e-05, "loss": 1.10422878, "memory(GiB)": 67.64, "step": 330, "train_speed(iter/s)": 0.040712 }, { "acc": 0.71083031, "epoch": 0.23039889958734525, "grad_norm": 1.2554104328155518, "learning_rate": 9.859214567441929e-05, "loss": 1.06728878, "memory(GiB)": 67.64, "step": 335, "train_speed(iter/s)": 0.040973 }, { "acc": 0.73246231, "epoch": 0.23383768913342504, "grad_norm": 1.601881742477417, "learning_rate": 9.884337001276401e-05, "loss": 0.99594593, "memory(GiB)": 67.64, "step": 340, "train_speed(iter/s)": 0.041255 }, { "acc": 0.72822175, "epoch": 0.23727647867950483, "grad_norm": 1.374062418937683, "learning_rate": 9.90909267112804e-05, "loss": 0.9949461, "memory(GiB)": 67.64, "step": 345, "train_speed(iter/s)": 0.041529 }, { "acc": 0.72813654, "epoch": 0.2407152682255846, "grad_norm": 1.4039307832717896, "learning_rate": 9.933492131910406e-05, "loss": 1.00009727, "memory(GiB)": 67.64, "step": 350, "train_speed(iter/s)": 0.041803 }, { "acc": 0.71051707, "epoch": 0.24415405777166438, "grad_norm": 1.4029077291488647, "learning_rate": 9.957545489361027e-05, "loss": 1.05340385, "memory(GiB)": 67.64, "step": 355, "train_speed(iter/s)": 0.042061 }, { "acc": 0.72410893, "epoch": 0.24759284731774414, "grad_norm": 1.379601001739502, "learning_rate": 9.981262425172208e-05, "loss": 1.03275814, "memory(GiB)": 67.64, "step": 360, "train_speed(iter/s)": 0.042262 }, { "acc": 0.72450876, "epoch": 0.25103163686382396, "grad_norm": 1.2809425592422485, "learning_rate": 9.999999482699181e-05, "loss": 1.02212152, "memory(GiB)": 67.64, "step": 365, "train_speed(iter/s)": 0.042465 }, { "acc": 0.72661881, "epoch": 0.2544704264099037, "grad_norm": 1.5250205993652344, "learning_rate": 9.999981377181717e-05, "loss": 1.03086433, "memory(GiB)": 67.64, "step": 370, "train_speed(iter/s)": 0.042736 }, { "acc": 0.72812705, "epoch": 0.2579092159559835, "grad_norm": 1.2832344770431519, "learning_rate": 9.999937406730297e-05, "loss": 1.00952168, "memory(GiB)": 67.64, "step": 375, "train_speed(iter/s)": 0.042979 }, { "acc": 0.69843874, "epoch": 0.2613480055020633, "grad_norm": 1.3689916133880615, "learning_rate": 9.999867571572407e-05, "loss": 1.1430685, "memory(GiB)": 67.64, "step": 380, "train_speed(iter/s)": 0.043186 }, { "acc": 0.71121368, "epoch": 0.26478679504814306, "grad_norm": 1.5115655660629272, "learning_rate": 9.999771872069336e-05, "loss": 1.06673965, "memory(GiB)": 67.64, "step": 385, "train_speed(iter/s)": 0.043399 }, { "acc": 0.71244879, "epoch": 0.26822558459422285, "grad_norm": 1.2644624710083008, "learning_rate": 9.999650308716193e-05, "loss": 1.0759717, "memory(GiB)": 67.64, "step": 390, "train_speed(iter/s)": 0.043596 }, { "acc": 0.71011033, "epoch": 0.2716643741403026, "grad_norm": 1.425584077835083, "learning_rate": 9.999502882141882e-05, "loss": 1.08612566, "memory(GiB)": 67.64, "step": 395, "train_speed(iter/s)": 0.043813 }, { "acc": 0.71973572, "epoch": 0.2751031636863824, "grad_norm": 1.281044840812683, "learning_rate": 9.999329593109124e-05, "loss": 1.04273968, "memory(GiB)": 67.64, "step": 400, "train_speed(iter/s)": 0.04406 }, { "epoch": 0.2751031636863824, "eval_acc": 0.723596301795114, "eval_loss": 1.0237661600112915, "eval_runtime": 1145.744, "eval_samples_per_second": 3.738, "eval_steps_per_second": 0.067, "step": 400 }, { "acc": 0.72366686, "epoch": 0.27854195323246217, "grad_norm": 1.4894949197769165, "learning_rate": 9.999130442514431e-05, "loss": 1.02950411, "memory(GiB)": 67.64, "step": 405, "train_speed(iter/s)": 0.03932 }, { "acc": 0.74041648, "epoch": 0.28198074277854196, "grad_norm": 1.2302844524383545, "learning_rate": 9.998905431388113e-05, "loss": 0.95937977, "memory(GiB)": 67.64, "step": 410, "train_speed(iter/s)": 0.039555 }, { "acc": 0.72292333, "epoch": 0.28541953232462175, "grad_norm": 1.1821825504302979, "learning_rate": 9.998654560894271e-05, "loss": 1.02365704, "memory(GiB)": 67.64, "step": 415, "train_speed(iter/s)": 0.039766 }, { "acc": 0.70973835, "epoch": 0.28885832187070154, "grad_norm": 1.2947014570236206, "learning_rate": 9.998377832330788e-05, "loss": 1.07417269, "memory(GiB)": 67.64, "step": 420, "train_speed(iter/s)": 0.039992 }, { "acc": 0.73527951, "epoch": 0.2922971114167813, "grad_norm": 1.2616949081420898, "learning_rate": 9.99807524712933e-05, "loss": 0.98149738, "memory(GiB)": 67.64, "step": 425, "train_speed(iter/s)": 0.040206 }, { "acc": 0.71251645, "epoch": 0.29573590096286106, "grad_norm": 1.2349984645843506, "learning_rate": 9.997746806855323e-05, "loss": 1.07718506, "memory(GiB)": 67.64, "step": 430, "train_speed(iter/s)": 0.040404 }, { "acc": 0.72872591, "epoch": 0.29917469050894085, "grad_norm": 1.128265619277954, "learning_rate": 9.997392513207963e-05, "loss": 1.00703831, "memory(GiB)": 67.64, "step": 435, "train_speed(iter/s)": 0.040581 }, { "acc": 0.72117209, "epoch": 0.30261348005502064, "grad_norm": 1.249985933303833, "learning_rate": 9.997012368020198e-05, "loss": 1.01667709, "memory(GiB)": 67.64, "step": 440, "train_speed(iter/s)": 0.040799 }, { "acc": 0.72518797, "epoch": 0.30605226960110044, "grad_norm": 1.3999882936477661, "learning_rate": 9.996606373258716e-05, "loss": 1.04834728, "memory(GiB)": 67.64, "step": 445, "train_speed(iter/s)": 0.041 }, { "acc": 0.72560539, "epoch": 0.30949105914718017, "grad_norm": 1.3446978330612183, "learning_rate": 9.99617453102394e-05, "loss": 1.01653395, "memory(GiB)": 67.64, "step": 450, "train_speed(iter/s)": 0.041207 }, { "acc": 0.72350621, "epoch": 0.31292984869325996, "grad_norm": 1.2894266843795776, "learning_rate": 9.99571684355002e-05, "loss": 0.99579372, "memory(GiB)": 67.64, "step": 455, "train_speed(iter/s)": 0.041368 }, { "acc": 0.70948811, "epoch": 0.31636863823933975, "grad_norm": 1.4214539527893066, "learning_rate": 9.995233313204806e-05, "loss": 1.09332161, "memory(GiB)": 67.64, "step": 460, "train_speed(iter/s)": 0.041539 }, { "acc": 0.74661293, "epoch": 0.31980742778541954, "grad_norm": 1.2697914838790894, "learning_rate": 9.994723942489859e-05, "loss": 0.93414135, "memory(GiB)": 67.64, "step": 465, "train_speed(iter/s)": 0.041724 }, { "acc": 0.72276139, "epoch": 0.32324621733149933, "grad_norm": 1.2612886428833008, "learning_rate": 9.99418873404042e-05, "loss": 1.04514399, "memory(GiB)": 67.64, "step": 470, "train_speed(iter/s)": 0.041901 }, { "acc": 0.72859631, "epoch": 0.32668500687757906, "grad_norm": 1.2637856006622314, "learning_rate": 9.993627690625399e-05, "loss": 0.99566994, "memory(GiB)": 67.64, "step": 475, "train_speed(iter/s)": 0.042059 }, { "acc": 0.72311392, "epoch": 0.33012379642365886, "grad_norm": 1.2103707790374756, "learning_rate": 9.993040815147369e-05, "loss": 1.02551346, "memory(GiB)": 67.64, "step": 480, "train_speed(iter/s)": 0.042237 }, { "acc": 0.7304266, "epoch": 0.33356258596973865, "grad_norm": 1.4478263854980469, "learning_rate": 9.992428110642546e-05, "loss": 1.00502892, "memory(GiB)": 67.64, "step": 485, "train_speed(iter/s)": 0.042429 }, { "acc": 0.72812204, "epoch": 0.33700137551581844, "grad_norm": 1.28928542137146, "learning_rate": 9.991789580280768e-05, "loss": 0.99270744, "memory(GiB)": 67.64, "step": 490, "train_speed(iter/s)": 0.042611 }, { "acc": 0.73110504, "epoch": 0.3404401650618982, "grad_norm": 1.277113914489746, "learning_rate": 9.991125227365489e-05, "loss": 0.9932848, "memory(GiB)": 67.71, "step": 495, "train_speed(iter/s)": 0.042803 }, { "acc": 0.73536983, "epoch": 0.343878954607978, "grad_norm": 1.4031190872192383, "learning_rate": 9.990435055333755e-05, "loss": 1.00407228, "memory(GiB)": 67.71, "step": 500, "train_speed(iter/s)": 0.042997 }, { "epoch": 0.343878954607978, "eval_acc": 0.7273923606424618, "eval_loss": 1.006140112876892, "eval_runtime": 1123.2925, "eval_samples_per_second": 3.813, "eval_steps_per_second": 0.069, "step": 500 }, { "acc": 0.7310411, "epoch": 0.34731774415405775, "grad_norm": 1.1264581680297852, "learning_rate": 9.989719067756184e-05, "loss": 0.97913218, "memory(GiB)": 67.71, "step": 505, "train_speed(iter/s)": 0.039389 }, { "acc": 0.72247181, "epoch": 0.35075653370013754, "grad_norm": 1.2322190999984741, "learning_rate": 9.988977268336956e-05, "loss": 1.04118223, "memory(GiB)": 67.71, "step": 510, "train_speed(iter/s)": 0.039571 }, { "acc": 0.7294539, "epoch": 0.35419532324621733, "grad_norm": 1.1988883018493652, "learning_rate": 9.988209660913789e-05, "loss": 0.96120787, "memory(GiB)": 67.71, "step": 515, "train_speed(iter/s)": 0.039729 }, { "acc": 0.72807951, "epoch": 0.3576341127922971, "grad_norm": 1.4514073133468628, "learning_rate": 9.987416249457917e-05, "loss": 1.00832357, "memory(GiB)": 67.71, "step": 520, "train_speed(iter/s)": 0.039869 }, { "acc": 0.72818184, "epoch": 0.3610729023383769, "grad_norm": 1.2781667709350586, "learning_rate": 9.986597038074072e-05, "loss": 1.00557394, "memory(GiB)": 67.71, "step": 525, "train_speed(iter/s)": 0.040019 }, { "acc": 0.7372427, "epoch": 0.36451169188445665, "grad_norm": 1.196447491645813, "learning_rate": 9.985752031000465e-05, "loss": 0.97588711, "memory(GiB)": 67.71, "step": 530, "train_speed(iter/s)": 0.040179 }, { "acc": 0.73485746, "epoch": 0.36795048143053644, "grad_norm": 1.2713799476623535, "learning_rate": 9.984881232608758e-05, "loss": 0.99121141, "memory(GiB)": 67.71, "step": 535, "train_speed(iter/s)": 0.040356 }, { "acc": 0.7316514, "epoch": 0.3713892709766162, "grad_norm": 1.388735055923462, "learning_rate": 9.983984647404047e-05, "loss": 0.97529774, "memory(GiB)": 67.71, "step": 540, "train_speed(iter/s)": 0.040533 }, { "acc": 0.73824301, "epoch": 0.374828060522696, "grad_norm": 1.263832926750183, "learning_rate": 9.983062280024837e-05, "loss": 0.95761375, "memory(GiB)": 67.71, "step": 545, "train_speed(iter/s)": 0.040707 }, { "acc": 0.72791233, "epoch": 0.3782668500687758, "grad_norm": 1.3154568672180176, "learning_rate": 9.982114135243019e-05, "loss": 1.00505419, "memory(GiB)": 67.71, "step": 550, "train_speed(iter/s)": 0.040862 }, { "acc": 0.73077579, "epoch": 0.3817056396148556, "grad_norm": 1.2996647357940674, "learning_rate": 9.981140217963838e-05, "loss": 0.98154631, "memory(GiB)": 67.71, "step": 555, "train_speed(iter/s)": 0.041008 }, { "acc": 0.7352643, "epoch": 0.38514442916093533, "grad_norm": 1.3090369701385498, "learning_rate": 9.980140533225882e-05, "loss": 0.9830574, "memory(GiB)": 67.71, "step": 560, "train_speed(iter/s)": 0.041146 }, { "acc": 0.7195425, "epoch": 0.3885832187070151, "grad_norm": 1.655612587928772, "learning_rate": 9.979115086201042e-05, "loss": 1.05448446, "memory(GiB)": 67.71, "step": 565, "train_speed(iter/s)": 0.041304 }, { "acc": 0.73759327, "epoch": 0.3920220082530949, "grad_norm": 1.183268427848816, "learning_rate": 9.978063882194492e-05, "loss": 0.96683788, "memory(GiB)": 67.71, "step": 570, "train_speed(iter/s)": 0.041468 }, { "acc": 0.73216171, "epoch": 0.3954607977991747, "grad_norm": 1.2590916156768799, "learning_rate": 9.976986926644662e-05, "loss": 0.97658138, "memory(GiB)": 67.71, "step": 575, "train_speed(iter/s)": 0.04163 }, { "acc": 0.72127271, "epoch": 0.3988995873452545, "grad_norm": 1.1548501253128052, "learning_rate": 9.975884225123204e-05, "loss": 1.00985394, "memory(GiB)": 67.71, "step": 580, "train_speed(iter/s)": 0.041797 }, { "acc": 0.74563594, "epoch": 0.4023383768913342, "grad_norm": 1.0580244064331055, "learning_rate": 9.974755783334972e-05, "loss": 0.94991455, "memory(GiB)": 67.71, "step": 585, "train_speed(iter/s)": 0.041937 }, { "acc": 0.72397938, "epoch": 0.405777166437414, "grad_norm": 1.2799969911575317, "learning_rate": 9.973601607117985e-05, "loss": 1.04541121, "memory(GiB)": 67.71, "step": 590, "train_speed(iter/s)": 0.042103 }, { "acc": 0.75536423, "epoch": 0.4092159559834938, "grad_norm": 1.2122467756271362, "learning_rate": 9.972421702443402e-05, "loss": 0.91661882, "memory(GiB)": 67.71, "step": 595, "train_speed(iter/s)": 0.042263 }, { "acc": 0.72923999, "epoch": 0.4126547455295736, "grad_norm": 1.3098151683807373, "learning_rate": 9.971216075415486e-05, "loss": 0.99268637, "memory(GiB)": 67.71, "step": 600, "train_speed(iter/s)": 0.042394 }, { "epoch": 0.4126547455295736, "eval_acc": 0.7299005713771539, "eval_loss": 0.9898082613945007, "eval_runtime": 1136.3836, "eval_samples_per_second": 3.769, "eval_steps_per_second": 0.068, "step": 600 }, { "acc": 0.73311081, "epoch": 0.4160935350756534, "grad_norm": 1.149190902709961, "learning_rate": 9.969984732271578e-05, "loss": 0.98028679, "memory(GiB)": 67.71, "step": 605, "train_speed(iter/s)": 0.039392 }, { "acc": 0.7316927, "epoch": 0.4195323246217332, "grad_norm": 1.3081296682357788, "learning_rate": 9.96872767938206e-05, "loss": 0.98179483, "memory(GiB)": 67.71, "step": 610, "train_speed(iter/s)": 0.039559 }, { "acc": 0.73893361, "epoch": 0.4229711141678129, "grad_norm": 1.1731023788452148, "learning_rate": 9.967444923250323e-05, "loss": 0.94215651, "memory(GiB)": 67.71, "step": 615, "train_speed(iter/s)": 0.039695 }, { "acc": 0.72336564, "epoch": 0.4264099037138927, "grad_norm": 1.2004274129867554, "learning_rate": 9.966136470512739e-05, "loss": 1.01167727, "memory(GiB)": 67.71, "step": 620, "train_speed(iter/s)": 0.03985 }, { "acc": 0.73260341, "epoch": 0.4298486932599725, "grad_norm": 1.1863032579421997, "learning_rate": 9.964802327938616e-05, "loss": 0.98780212, "memory(GiB)": 67.71, "step": 625, "train_speed(iter/s)": 0.039998 }, { "acc": 0.72430835, "epoch": 0.4332874828060523, "grad_norm": 1.2297348976135254, "learning_rate": 9.963442502430173e-05, "loss": 1.02258396, "memory(GiB)": 67.71, "step": 630, "train_speed(iter/s)": 0.0401 }, { "acc": 0.73400669, "epoch": 0.43672627235213207, "grad_norm": 1.1201564073562622, "learning_rate": 9.962057001022499e-05, "loss": 0.95277481, "memory(GiB)": 67.71, "step": 635, "train_speed(iter/s)": 0.040238 }, { "acc": 0.72435627, "epoch": 0.4401650618982118, "grad_norm": 1.2594115734100342, "learning_rate": 9.96064583088352e-05, "loss": 1.01793871, "memory(GiB)": 67.71, "step": 640, "train_speed(iter/s)": 0.040389 }, { "acc": 0.74932237, "epoch": 0.4436038514442916, "grad_norm": 1.0871134996414185, "learning_rate": 9.959208999313953e-05, "loss": 0.92056198, "memory(GiB)": 67.71, "step": 645, "train_speed(iter/s)": 0.040522 }, { "acc": 0.74172649, "epoch": 0.4470426409903714, "grad_norm": 1.0481441020965576, "learning_rate": 9.957746513747285e-05, "loss": 0.94307327, "memory(GiB)": 67.71, "step": 650, "train_speed(iter/s)": 0.040673 }, { "acc": 0.73418083, "epoch": 0.4504814305364512, "grad_norm": 1.2039026021957397, "learning_rate": 9.956258381749717e-05, "loss": 0.96942959, "memory(GiB)": 67.71, "step": 655, "train_speed(iter/s)": 0.04079 }, { "acc": 0.73663011, "epoch": 0.45392022008253097, "grad_norm": 1.2746825218200684, "learning_rate": 9.954744611020134e-05, "loss": 0.96783085, "memory(GiB)": 67.71, "step": 660, "train_speed(iter/s)": 0.040931 }, { "acc": 0.75085382, "epoch": 0.4573590096286107, "grad_norm": 1.1864688396453857, "learning_rate": 9.953205209390065e-05, "loss": 0.93258324, "memory(GiB)": 67.71, "step": 665, "train_speed(iter/s)": 0.041065 }, { "acc": 0.74181981, "epoch": 0.4607977991746905, "grad_norm": 1.2284380197525024, "learning_rate": 9.95164018482364e-05, "loss": 0.94610729, "memory(GiB)": 67.71, "step": 670, "train_speed(iter/s)": 0.041186 }, { "acc": 0.73316283, "epoch": 0.4642365887207703, "grad_norm": 1.0974282026290894, "learning_rate": 9.950049545417551e-05, "loss": 0.97180891, "memory(GiB)": 67.71, "step": 675, "train_speed(iter/s)": 0.041284 }, { "acc": 0.73497968, "epoch": 0.4676753782668501, "grad_norm": 1.1195545196533203, "learning_rate": 9.948433299401008e-05, "loss": 0.96802521, "memory(GiB)": 67.71, "step": 680, "train_speed(iter/s)": 0.041406 }, { "acc": 0.71404638, "epoch": 0.47111416781292986, "grad_norm": 1.2557018995285034, "learning_rate": 9.946791455135697e-05, "loss": 1.04876156, "memory(GiB)": 67.71, "step": 685, "train_speed(iter/s)": 0.041511 }, { "acc": 0.73286834, "epoch": 0.47455295735900965, "grad_norm": 1.2220708131790161, "learning_rate": 9.945124021115738e-05, "loss": 0.96964302, "memory(GiB)": 67.71, "step": 690, "train_speed(iter/s)": 0.041631 }, { "acc": 0.73684483, "epoch": 0.4779917469050894, "grad_norm": 1.2621607780456543, "learning_rate": 9.94343100596764e-05, "loss": 0.95697803, "memory(GiB)": 67.71, "step": 695, "train_speed(iter/s)": 0.041775 }, { "acc": 0.73987064, "epoch": 0.4814305364511692, "grad_norm": 1.1854294538497925, "learning_rate": 9.941712418450258e-05, "loss": 0.94488659, "memory(GiB)": 67.71, "step": 700, "train_speed(iter/s)": 0.041901 }, { "epoch": 0.4814305364511692, "eval_acc": 0.7335672830341476, "eval_loss": 0.9757564663887024, "eval_runtime": 1129.274, "eval_samples_per_second": 3.793, "eval_steps_per_second": 0.068, "step": 700 }, { "acc": 0.73576145, "epoch": 0.48486932599724897, "grad_norm": 1.1550548076629639, "learning_rate": 9.939968267454743e-05, "loss": 0.95160465, "memory(GiB)": 67.71, "step": 705, "train_speed(iter/s)": 0.03937 }, { "acc": 0.71119275, "epoch": 0.48830811554332876, "grad_norm": 1.2182416915893555, "learning_rate": 9.938198562004501e-05, "loss": 1.04482851, "memory(GiB)": 67.71, "step": 710, "train_speed(iter/s)": 0.039477 }, { "acc": 0.74570274, "epoch": 0.49174690508940855, "grad_norm": 1.1353340148925781, "learning_rate": 9.936403311255144e-05, "loss": 0.92555218, "memory(GiB)": 67.71, "step": 715, "train_speed(iter/s)": 0.039603 }, { "acc": 0.74782338, "epoch": 0.4951856946354883, "grad_norm": 1.2046043872833252, "learning_rate": 9.934582524494446e-05, "loss": 0.92999516, "memory(GiB)": 67.71, "step": 720, "train_speed(iter/s)": 0.039731 }, { "acc": 0.73299646, "epoch": 0.4986244841815681, "grad_norm": 1.102347731590271, "learning_rate": 9.932736211142291e-05, "loss": 0.97149315, "memory(GiB)": 67.71, "step": 725, "train_speed(iter/s)": 0.03984 }, { "acc": 0.72648382, "epoch": 0.5020632737276479, "grad_norm": 1.0632636547088623, "learning_rate": 9.930864380750617e-05, "loss": 1.01790123, "memory(GiB)": 67.71, "step": 730, "train_speed(iter/s)": 0.039945 }, { "acc": 0.71636868, "epoch": 0.5055020632737276, "grad_norm": 1.1830312013626099, "learning_rate": 9.928967043003391e-05, "loss": 1.01803741, "memory(GiB)": 67.71, "step": 735, "train_speed(iter/s)": 0.040055 }, { "acc": 0.73447638, "epoch": 0.5089408528198074, "grad_norm": 1.1544054746627808, "learning_rate": 9.92704420771653e-05, "loss": 0.97713757, "memory(GiB)": 67.71, "step": 740, "train_speed(iter/s)": 0.040173 }, { "acc": 0.73799992, "epoch": 0.5123796423658872, "grad_norm": 1.0744158029556274, "learning_rate": 9.925095884837867e-05, "loss": 0.95858746, "memory(GiB)": 67.71, "step": 745, "train_speed(iter/s)": 0.040296 }, { "acc": 0.74002094, "epoch": 0.515818431911967, "grad_norm": 1.086005687713623, "learning_rate": 9.923122084447098e-05, "loss": 0.95759525, "memory(GiB)": 67.71, "step": 750, "train_speed(iter/s)": 0.040432 }, { "acc": 0.73197713, "epoch": 0.5192572214580468, "grad_norm": 1.177945852279663, "learning_rate": 9.921122816755725e-05, "loss": 0.98773813, "memory(GiB)": 67.71, "step": 755, "train_speed(iter/s)": 0.040536 }, { "acc": 0.71955528, "epoch": 0.5226960110041265, "grad_norm": 1.1270967721939087, "learning_rate": 9.919098092107003e-05, "loss": 1.0065423, "memory(GiB)": 67.71, "step": 760, "train_speed(iter/s)": 0.040641 }, { "acc": 0.72435188, "epoch": 0.5261348005502063, "grad_norm": 1.1566613912582397, "learning_rate": 9.917047920975897e-05, "loss": 1.00753899, "memory(GiB)": 67.71, "step": 765, "train_speed(iter/s)": 0.040761 }, { "acc": 0.72682076, "epoch": 0.5295735900962861, "grad_norm": 1.0998412370681763, "learning_rate": 9.914972313969015e-05, "loss": 0.99639912, "memory(GiB)": 67.71, "step": 770, "train_speed(iter/s)": 0.040857 }, { "acc": 0.73786283, "epoch": 0.5330123796423659, "grad_norm": 1.0717042684555054, "learning_rate": 9.912871281824555e-05, "loss": 0.95036526, "memory(GiB)": 67.71, "step": 775, "train_speed(iter/s)": 0.040955 }, { "acc": 0.72474022, "epoch": 0.5364511691884457, "grad_norm": 1.1307621002197266, "learning_rate": 9.910744835412258e-05, "loss": 1.00282173, "memory(GiB)": 67.71, "step": 780, "train_speed(iter/s)": 0.041067 }, { "acc": 0.73896732, "epoch": 0.5398899587345255, "grad_norm": 1.0760217905044556, "learning_rate": 9.908592985733346e-05, "loss": 0.95014591, "memory(GiB)": 67.71, "step": 785, "train_speed(iter/s)": 0.041189 }, { "acc": 0.73375082, "epoch": 0.5433287482806052, "grad_norm": 1.1228985786437988, "learning_rate": 9.90641574392046e-05, "loss": 0.97449379, "memory(GiB)": 67.71, "step": 790, "train_speed(iter/s)": 0.041296 }, { "acc": 0.73906136, "epoch": 0.546767537826685, "grad_norm": 1.0855998992919922, "learning_rate": 9.904213121237616e-05, "loss": 0.9437438, "memory(GiB)": 67.71, "step": 795, "train_speed(iter/s)": 0.041409 }, { "acc": 0.7277792, "epoch": 0.5502063273727648, "grad_norm": 1.24734365940094, "learning_rate": 9.90198512908013e-05, "loss": 1.01125345, "memory(GiB)": 67.71, "step": 800, "train_speed(iter/s)": 0.041532 }, { "epoch": 0.5502063273727648, "eval_acc": 0.736024879650875, "eval_loss": 0.9637655019760132, "eval_runtime": 1126.4376, "eval_samples_per_second": 3.802, "eval_steps_per_second": 0.068, "step": 800 }, { "acc": 0.75724821, "epoch": 0.5536451169188445, "grad_norm": 1.1258316040039062, "learning_rate": 9.899731778974572e-05, "loss": 0.87265921, "memory(GiB)": 67.71, "step": 805, "train_speed(iter/s)": 0.039349 }, { "acc": 0.74204683, "epoch": 0.5570839064649243, "grad_norm": 0.9689936637878418, "learning_rate": 9.897453082578703e-05, "loss": 0.91779423, "memory(GiB)": 67.71, "step": 810, "train_speed(iter/s)": 0.039466 }, { "acc": 0.73968034, "epoch": 0.5605226960110041, "grad_norm": 1.1123220920562744, "learning_rate": 9.895149051681413e-05, "loss": 0.97357388, "memory(GiB)": 67.71, "step": 815, "train_speed(iter/s)": 0.039574 }, { "acc": 0.73935227, "epoch": 0.5639614855570839, "grad_norm": 1.0451692342758179, "learning_rate": 9.892819698202658e-05, "loss": 0.93994102, "memory(GiB)": 67.71, "step": 820, "train_speed(iter/s)": 0.039675 }, { "acc": 0.73578658, "epoch": 0.5674002751031637, "grad_norm": 1.0823888778686523, "learning_rate": 9.890465034193403e-05, "loss": 0.92713509, "memory(GiB)": 67.71, "step": 825, "train_speed(iter/s)": 0.039784 }, { "acc": 0.7370616, "epoch": 0.5708390646492435, "grad_norm": 1.1076163053512573, "learning_rate": 9.888085071835557e-05, "loss": 0.96277084, "memory(GiB)": 67.71, "step": 830, "train_speed(iter/s)": 0.03987 }, { "acc": 0.74359312, "epoch": 0.5742778541953233, "grad_norm": 0.9995237588882446, "learning_rate": 9.885679823441913e-05, "loss": 0.92473927, "memory(GiB)": 67.71, "step": 835, "train_speed(iter/s)": 0.039986 }, { "acc": 0.73567324, "epoch": 0.5777166437414031, "grad_norm": 1.1980810165405273, "learning_rate": 9.883249301456078e-05, "loss": 0.97589169, "memory(GiB)": 67.71, "step": 840, "train_speed(iter/s)": 0.040091 }, { "acc": 0.72378907, "epoch": 0.5811554332874828, "grad_norm": 1.059746503829956, "learning_rate": 9.880793518452414e-05, "loss": 1.01202221, "memory(GiB)": 67.71, "step": 845, "train_speed(iter/s)": 0.040196 }, { "acc": 0.72781639, "epoch": 0.5845942228335625, "grad_norm": 1.1578445434570312, "learning_rate": 9.878312487135973e-05, "loss": 0.98674173, "memory(GiB)": 67.71, "step": 850, "train_speed(iter/s)": 0.040293 }, { "acc": 0.7325696, "epoch": 0.5880330123796423, "grad_norm": 1.1622587442398071, "learning_rate": 9.87580622034243e-05, "loss": 0.96467819, "memory(GiB)": 67.71, "step": 855, "train_speed(iter/s)": 0.040397 }, { "acc": 0.74238405, "epoch": 0.5914718019257221, "grad_norm": 1.221163034439087, "learning_rate": 9.873274731038013e-05, "loss": 0.94902515, "memory(GiB)": 67.71, "step": 860, "train_speed(iter/s)": 0.040497 }, { "acc": 0.73676643, "epoch": 0.5949105914718019, "grad_norm": 1.0908128023147583, "learning_rate": 9.87071803231944e-05, "loss": 0.94923353, "memory(GiB)": 67.71, "step": 865, "train_speed(iter/s)": 0.040613 }, { "acc": 0.7285512, "epoch": 0.5983493810178817, "grad_norm": 0.9778567552566528, "learning_rate": 9.868136137413854e-05, "loss": 0.99065866, "memory(GiB)": 67.71, "step": 870, "train_speed(iter/s)": 0.040705 }, { "acc": 0.75390539, "epoch": 0.6017881705639615, "grad_norm": 1.1204711198806763, "learning_rate": 9.865529059678749e-05, "loss": 0.89114456, "memory(GiB)": 67.71, "step": 875, "train_speed(iter/s)": 0.040815 }, { "acc": 0.72871351, "epoch": 0.6052269601100413, "grad_norm": 1.1295973062515259, "learning_rate": 9.8628968126019e-05, "loss": 0.97484636, "memory(GiB)": 67.71, "step": 880, "train_speed(iter/s)": 0.04091 }, { "acc": 0.75279789, "epoch": 0.6086657496561211, "grad_norm": 1.276840090751648, "learning_rate": 9.8602394098013e-05, "loss": 0.9101244, "memory(GiB)": 67.71, "step": 885, "train_speed(iter/s)": 0.041017 }, { "acc": 0.72960396, "epoch": 0.6121045392022009, "grad_norm": 1.1485203504562378, "learning_rate": 9.857556865025087e-05, "loss": 0.9954258, "memory(GiB)": 67.71, "step": 890, "train_speed(iter/s)": 0.041116 }, { "acc": 0.73271265, "epoch": 0.6155433287482807, "grad_norm": 1.2299952507019043, "learning_rate": 9.854849192151468e-05, "loss": 0.97523527, "memory(GiB)": 67.71, "step": 895, "train_speed(iter/s)": 0.041212 }, { "acc": 0.72924538, "epoch": 0.6189821182943603, "grad_norm": 1.1494402885437012, "learning_rate": 9.852116405188648e-05, "loss": 0.98907299, "memory(GiB)": 67.71, "step": 900, "train_speed(iter/s)": 0.041323 }, { "epoch": 0.6189821182943603, "eval_acc": 0.7370371620101678, "eval_loss": 0.9578044414520264, "eval_runtime": 1104.9055, "eval_samples_per_second": 3.876, "eval_steps_per_second": 0.07, "step": 900 }, { "acc": 0.72602391, "epoch": 0.6224209078404401, "grad_norm": 1.0728832483291626, "learning_rate": 9.849358518274771e-05, "loss": 1.01037588, "memory(GiB)": 67.71, "step": 905, "train_speed(iter/s)": 0.039428 }, { "acc": 0.7520565, "epoch": 0.6258596973865199, "grad_norm": 1.0786807537078857, "learning_rate": 9.846575545677823e-05, "loss": 0.92040062, "memory(GiB)": 67.71, "step": 910, "train_speed(iter/s)": 0.039534 }, { "acc": 0.74691858, "epoch": 0.6292984869325997, "grad_norm": 1.0437581539154053, "learning_rate": 9.843767501795583e-05, "loss": 0.9074029, "memory(GiB)": 67.71, "step": 915, "train_speed(iter/s)": 0.039631 }, { "acc": 0.73221941, "epoch": 0.6327372764786795, "grad_norm": 1.1795591115951538, "learning_rate": 9.840934401155528e-05, "loss": 0.988484, "memory(GiB)": 67.71, "step": 920, "train_speed(iter/s)": 0.039722 }, { "acc": 0.72777405, "epoch": 0.6361760660247593, "grad_norm": 1.1894828081130981, "learning_rate": 9.838076258414776e-05, "loss": 1.01051292, "memory(GiB)": 67.71, "step": 925, "train_speed(iter/s)": 0.039818 }, { "acc": 0.75026011, "epoch": 0.6396148555708391, "grad_norm": 0.9834104180335999, "learning_rate": 9.835193088359988e-05, "loss": 0.90967407, "memory(GiB)": 67.71, "step": 930, "train_speed(iter/s)": 0.0399 }, { "acc": 0.74026661, "epoch": 0.6430536451169189, "grad_norm": 1.2417614459991455, "learning_rate": 9.832284905907318e-05, "loss": 0.92580471, "memory(GiB)": 67.71, "step": 935, "train_speed(iter/s)": 0.03999 }, { "acc": 0.73846035, "epoch": 0.6464924346629987, "grad_norm": 1.1710271835327148, "learning_rate": 9.829351726102313e-05, "loss": 0.95107613, "memory(GiB)": 67.71, "step": 940, "train_speed(iter/s)": 0.040081 }, { "acc": 0.7366385, "epoch": 0.6499312242090785, "grad_norm": 1.0618470907211304, "learning_rate": 9.826393564119847e-05, "loss": 0.94500179, "memory(GiB)": 67.71, "step": 945, "train_speed(iter/s)": 0.040159 }, { "acc": 0.74606085, "epoch": 0.6533700137551581, "grad_norm": 1.0151257514953613, "learning_rate": 9.823410435264042e-05, "loss": 0.90975704, "memory(GiB)": 67.71, "step": 950, "train_speed(iter/s)": 0.040245 }, { "acc": 0.73123455, "epoch": 0.6568088033012379, "grad_norm": 1.1929761171340942, "learning_rate": 9.820402354968183e-05, "loss": 0.95826616, "memory(GiB)": 67.71, "step": 955, "train_speed(iter/s)": 0.040333 }, { "acc": 0.73816185, "epoch": 0.6602475928473177, "grad_norm": 1.240237832069397, "learning_rate": 9.817369338794646e-05, "loss": 0.94996367, "memory(GiB)": 67.71, "step": 960, "train_speed(iter/s)": 0.040436 }, { "acc": 0.74816332, "epoch": 0.6636863823933975, "grad_norm": 0.9286736845970154, "learning_rate": 9.81431140243481e-05, "loss": 0.90342827, "memory(GiB)": 67.71, "step": 965, "train_speed(iter/s)": 0.040521 }, { "acc": 0.74362345, "epoch": 0.6671251719394773, "grad_norm": 1.1020361185073853, "learning_rate": 9.811228561708979e-05, "loss": 0.92705402, "memory(GiB)": 67.71, "step": 970, "train_speed(iter/s)": 0.040625 }, { "acc": 0.73400373, "epoch": 0.6705639614855571, "grad_norm": 1.0580672025680542, "learning_rate": 9.808120832566306e-05, "loss": 0.98702965, "memory(GiB)": 67.71, "step": 975, "train_speed(iter/s)": 0.040726 }, { "acc": 0.71981792, "epoch": 0.6740027510316369, "grad_norm": 1.054178237915039, "learning_rate": 9.804988231084695e-05, "loss": 1.02396307, "memory(GiB)": 67.71, "step": 980, "train_speed(iter/s)": 0.040822 }, { "acc": 0.7412406, "epoch": 0.6774415405777167, "grad_norm": 1.064276933670044, "learning_rate": 9.801830773470738e-05, "loss": 0.92902575, "memory(GiB)": 67.71, "step": 985, "train_speed(iter/s)": 0.040901 }, { "acc": 0.73765955, "epoch": 0.6808803301237965, "grad_norm": 1.02224862575531, "learning_rate": 9.798648476059612e-05, "loss": 0.96069899, "memory(GiB)": 67.71, "step": 990, "train_speed(iter/s)": 0.040989 }, { "acc": 0.72184877, "epoch": 0.6843191196698762, "grad_norm": 1.10880446434021, "learning_rate": 9.795441355315009e-05, "loss": 1.00857792, "memory(GiB)": 67.71, "step": 995, "train_speed(iter/s)": 0.041069 }, { "acc": 0.75687084, "epoch": 0.687757909215956, "grad_norm": 1.0748587846755981, "learning_rate": 9.792209427829044e-05, "loss": 0.89921093, "memory(GiB)": 67.71, "step": 1000, "train_speed(iter/s)": 0.041172 }, { "epoch": 0.687757909215956, "eval_acc": 0.739815314707338, "eval_loss": 0.9472519159317017, "eval_runtime": 1138.6289, "eval_samples_per_second": 3.762, "eval_steps_per_second": 0.068, "step": 1000 }, { "acc": 0.74216719, "epoch": 0.6911966987620357, "grad_norm": 0.9975650906562805, "learning_rate": 9.788952710322168e-05, "loss": 0.92038422, "memory(GiB)": 67.71, "step": 1005, "train_speed(iter/s)": 0.039418 }, { "acc": 0.73918667, "epoch": 0.6946354883081155, "grad_norm": 1.167277455329895, "learning_rate": 9.785671219643086e-05, "loss": 0.95244484, "memory(GiB)": 67.71, "step": 1010, "train_speed(iter/s)": 0.039509 }, { "acc": 0.74078741, "epoch": 0.6980742778541953, "grad_norm": 1.1248480081558228, "learning_rate": 9.782364972768667e-05, "loss": 0.95239239, "memory(GiB)": 67.71, "step": 1015, "train_speed(iter/s)": 0.039611 }, { "acc": 0.73078346, "epoch": 0.7015130674002751, "grad_norm": 1.1177655458450317, "learning_rate": 9.779033986803856e-05, "loss": 0.97850962, "memory(GiB)": 67.71, "step": 1020, "train_speed(iter/s)": 0.039689 }, { "acc": 0.72922001, "epoch": 0.7049518569463549, "grad_norm": 1.025723934173584, "learning_rate": 9.775678278981587e-05, "loss": 0.97461071, "memory(GiB)": 67.71, "step": 1025, "train_speed(iter/s)": 0.039762 }, { "acc": 0.74167843, "epoch": 0.7083906464924347, "grad_norm": 1.0677716732025146, "learning_rate": 9.772297866662694e-05, "loss": 0.93668747, "memory(GiB)": 67.71, "step": 1030, "train_speed(iter/s)": 0.039851 }, { "acc": 0.73580718, "epoch": 0.7118294360385145, "grad_norm": 1.071346402168274, "learning_rate": 9.768892767335818e-05, "loss": 0.94455872, "memory(GiB)": 67.71, "step": 1035, "train_speed(iter/s)": 0.03994 }, { "acc": 0.73037386, "epoch": 0.7152682255845942, "grad_norm": 1.1164538860321045, "learning_rate": 9.76546299861732e-05, "loss": 0.96749563, "memory(GiB)": 67.71, "step": 1040, "train_speed(iter/s)": 0.040026 }, { "acc": 0.73348866, "epoch": 0.718707015130674, "grad_norm": 1.016825556755066, "learning_rate": 9.76200857825119e-05, "loss": 0.97527409, "memory(GiB)": 67.71, "step": 1045, "train_speed(iter/s)": 0.040097 }, { "acc": 0.74810896, "epoch": 0.7221458046767538, "grad_norm": 1.0394419431686401, "learning_rate": 9.758529524108952e-05, "loss": 0.91727133, "memory(GiB)": 67.71, "step": 1050, "train_speed(iter/s)": 0.040188 }, { "acc": 0.7377789, "epoch": 0.7255845942228336, "grad_norm": 1.2021335363388062, "learning_rate": 9.755025854189574e-05, "loss": 0.96904411, "memory(GiB)": 67.71, "step": 1055, "train_speed(iter/s)": 0.040268 }, { "acc": 0.7272027, "epoch": 0.7290233837689133, "grad_norm": 1.012821912765503, "learning_rate": 9.751497586619374e-05, "loss": 0.9858429, "memory(GiB)": 67.71, "step": 1060, "train_speed(iter/s)": 0.040336 }, { "acc": 0.73458595, "epoch": 0.7324621733149931, "grad_norm": 1.1023552417755127, "learning_rate": 9.747944739651928e-05, "loss": 0.95475712, "memory(GiB)": 67.71, "step": 1065, "train_speed(iter/s)": 0.04042 }, { "acc": 0.74384351, "epoch": 0.7359009628610729, "grad_norm": 1.1670334339141846, "learning_rate": 9.744367331667972e-05, "loss": 0.90986481, "memory(GiB)": 67.71, "step": 1070, "train_speed(iter/s)": 0.040504 }, { "acc": 0.74164953, "epoch": 0.7393397524071527, "grad_norm": 1.1003512144088745, "learning_rate": 9.740765381175308e-05, "loss": 0.95252619, "memory(GiB)": 67.71, "step": 1075, "train_speed(iter/s)": 0.040585 }, { "acc": 0.72832394, "epoch": 0.7427785419532325, "grad_norm": 1.141493320465088, "learning_rate": 9.737138906808716e-05, "loss": 0.9896246, "memory(GiB)": 67.71, "step": 1080, "train_speed(iter/s)": 0.040669 }, { "acc": 0.75672712, "epoch": 0.7462173314993122, "grad_norm": 1.0548261404037476, "learning_rate": 9.733487927329842e-05, "loss": 0.89013748, "memory(GiB)": 67.71, "step": 1085, "train_speed(iter/s)": 0.040758 }, { "acc": 0.74028645, "epoch": 0.749656121045392, "grad_norm": 1.058765172958374, "learning_rate": 9.729812461627116e-05, "loss": 0.9446207, "memory(GiB)": 67.71, "step": 1090, "train_speed(iter/s)": 0.040824 }, { "acc": 0.74290891, "epoch": 0.7530949105914718, "grad_norm": 1.0750882625579834, "learning_rate": 9.726112528715645e-05, "loss": 0.93429804, "memory(GiB)": 67.71, "step": 1095, "train_speed(iter/s)": 0.040904 }, { "acc": 0.74171824, "epoch": 0.7565337001375516, "grad_norm": 1.1232870817184448, "learning_rate": 9.722388147737117e-05, "loss": 0.9356823, "memory(GiB)": 67.71, "step": 1100, "train_speed(iter/s)": 0.040982 }, { "epoch": 0.7565337001375516, "eval_acc": 0.7418005128897287, "eval_loss": 0.9385226964950562, "eval_runtime": 1120.0428, "eval_samples_per_second": 3.824, "eval_steps_per_second": 0.069, "step": 1100 }, { "acc": 0.73683257, "epoch": 0.7599724896836314, "grad_norm": 1.0508232116699219, "learning_rate": 9.718639337959709e-05, "loss": 0.95805416, "memory(GiB)": 67.71, "step": 1105, "train_speed(iter/s)": 0.03942 }, { "acc": 0.74061327, "epoch": 0.7634112792297112, "grad_norm": 1.0770542621612549, "learning_rate": 9.714866118777971e-05, "loss": 0.92782459, "memory(GiB)": 67.71, "step": 1110, "train_speed(iter/s)": 0.039509 }, { "acc": 0.72901726, "epoch": 0.7668500687757909, "grad_norm": 1.104008674621582, "learning_rate": 9.711068509712744e-05, "loss": 0.99297533, "memory(GiB)": 67.71, "step": 1115, "train_speed(iter/s)": 0.039593 }, { "acc": 0.73747034, "epoch": 0.7702888583218707, "grad_norm": 1.1213022470474243, "learning_rate": 9.707246530411045e-05, "loss": 0.96422043, "memory(GiB)": 67.71, "step": 1120, "train_speed(iter/s)": 0.039669 }, { "acc": 0.75132704, "epoch": 0.7737276478679505, "grad_norm": 0.9887475967407227, "learning_rate": 9.703400200645976e-05, "loss": 0.90485935, "memory(GiB)": 67.71, "step": 1125, "train_speed(iter/s)": 0.039747 }, { "acc": 0.74963489, "epoch": 0.7771664374140302, "grad_norm": 1.105952501296997, "learning_rate": 9.69952954031661e-05, "loss": 0.89224911, "memory(GiB)": 67.71, "step": 1130, "train_speed(iter/s)": 0.039821 }, { "acc": 0.73749495, "epoch": 0.78060522696011, "grad_norm": 1.140572428703308, "learning_rate": 9.695634569447904e-05, "loss": 0.9487175, "memory(GiB)": 67.71, "step": 1135, "train_speed(iter/s)": 0.039888 }, { "acc": 0.7312088, "epoch": 0.7840440165061898, "grad_norm": 1.1275548934936523, "learning_rate": 9.691715308190576e-05, "loss": 0.96534138, "memory(GiB)": 67.71, "step": 1140, "train_speed(iter/s)": 0.039959 }, { "acc": 0.73048372, "epoch": 0.7874828060522696, "grad_norm": 1.0315409898757935, "learning_rate": 9.68777177682102e-05, "loss": 0.97743053, "memory(GiB)": 67.71, "step": 1145, "train_speed(iter/s)": 0.040036 }, { "acc": 0.72779579, "epoch": 0.7909215955983494, "grad_norm": 0.9368631839752197, "learning_rate": 9.683803995741186e-05, "loss": 0.98139448, "memory(GiB)": 67.71, "step": 1150, "train_speed(iter/s)": 0.040117 }, { "acc": 0.74332333, "epoch": 0.7943603851444292, "grad_norm": 1.1103096008300781, "learning_rate": 9.679811985478483e-05, "loss": 0.9456337, "memory(GiB)": 67.71, "step": 1155, "train_speed(iter/s)": 0.040199 }, { "acc": 0.73605175, "epoch": 0.797799174690509, "grad_norm": 0.9985005259513855, "learning_rate": 9.675795766685669e-05, "loss": 0.94118538, "memory(GiB)": 67.71, "step": 1160, "train_speed(iter/s)": 0.040265 }, { "acc": 0.7368608, "epoch": 0.8012379642365888, "grad_norm": 1.086758017539978, "learning_rate": 9.671755360140746e-05, "loss": 0.94844141, "memory(GiB)": 67.71, "step": 1165, "train_speed(iter/s)": 0.040343 }, { "acc": 0.74641371, "epoch": 0.8046767537826685, "grad_norm": 0.9669944643974304, "learning_rate": 9.667690786746852e-05, "loss": 0.91691303, "memory(GiB)": 67.71, "step": 1170, "train_speed(iter/s)": 0.040416 }, { "acc": 0.74651995, "epoch": 0.8081155433287482, "grad_norm": 1.0597587823867798, "learning_rate": 9.663602067532151e-05, "loss": 0.91813259, "memory(GiB)": 67.71, "step": 1175, "train_speed(iter/s)": 0.040502 }, { "acc": 0.73420897, "epoch": 0.811554332874828, "grad_norm": 1.1257351636886597, "learning_rate": 9.659489223649731e-05, "loss": 0.97081699, "memory(GiB)": 67.71, "step": 1180, "train_speed(iter/s)": 0.040572 }, { "acc": 0.76278071, "epoch": 0.8149931224209078, "grad_norm": 1.0399693250656128, "learning_rate": 9.655352276377484e-05, "loss": 0.85249825, "memory(GiB)": 67.71, "step": 1185, "train_speed(iter/s)": 0.040662 }, { "acc": 0.74080434, "epoch": 0.8184319119669876, "grad_norm": 1.104978322982788, "learning_rate": 9.651191247118003e-05, "loss": 0.93528318, "memory(GiB)": 67.71, "step": 1190, "train_speed(iter/s)": 0.040735 }, { "acc": 0.7570159, "epoch": 0.8218707015130674, "grad_norm": 1.1633975505828857, "learning_rate": 9.647006157398471e-05, "loss": 0.8937582, "memory(GiB)": 67.71, "step": 1195, "train_speed(iter/s)": 0.040819 }, { "acc": 0.75301266, "epoch": 0.8253094910591472, "grad_norm": 1.0719282627105713, "learning_rate": 9.642797028870549e-05, "loss": 0.87665348, "memory(GiB)": 67.71, "step": 1200, "train_speed(iter/s)": 0.040893 }, { "epoch": 0.8253094910591472, "eval_acc": 0.7415305709272506, "eval_loss": 0.9332711100578308, "eval_runtime": 1102.5228, "eval_samples_per_second": 3.885, "eval_steps_per_second": 0.07, "step": 1200 }, { "acc": 0.73832102, "epoch": 0.828748280605227, "grad_norm": 1.1222566366195679, "learning_rate": 9.63856388331026e-05, "loss": 0.92794905, "memory(GiB)": 67.71, "step": 1205, "train_speed(iter/s)": 0.039487 }, { "acc": 0.74549799, "epoch": 0.8321870701513068, "grad_norm": 1.0469160079956055, "learning_rate": 9.634306742617881e-05, "loss": 0.91989012, "memory(GiB)": 67.71, "step": 1210, "train_speed(iter/s)": 0.039561 }, { "acc": 0.73400946, "epoch": 0.8356258596973866, "grad_norm": 1.1092973947525024, "learning_rate": 9.630025628817833e-05, "loss": 0.96797295, "memory(GiB)": 67.71, "step": 1215, "train_speed(iter/s)": 0.039633 }, { "acc": 0.74465179, "epoch": 0.8390646492434664, "grad_norm": 1.0476914644241333, "learning_rate": 9.625720564058553e-05, "loss": 0.9328536, "memory(GiB)": 67.71, "step": 1220, "train_speed(iter/s)": 0.039708 }, { "acc": 0.7472661, "epoch": 0.842503438789546, "grad_norm": 1.002954363822937, "learning_rate": 9.6213915706124e-05, "loss": 0.90329132, "memory(GiB)": 67.71, "step": 1225, "train_speed(iter/s)": 0.039779 }, { "acc": 0.73477154, "epoch": 0.8459422283356258, "grad_norm": 1.1124727725982666, "learning_rate": 9.617038670875518e-05, "loss": 0.97662973, "memory(GiB)": 67.71, "step": 1230, "train_speed(iter/s)": 0.039851 }, { "acc": 0.7584034, "epoch": 0.8493810178817056, "grad_norm": 0.9963657855987549, "learning_rate": 9.612661887367738e-05, "loss": 0.87994137, "memory(GiB)": 67.71, "step": 1235, "train_speed(iter/s)": 0.03993 }, { "acc": 0.74326572, "epoch": 0.8528198074277854, "grad_norm": 1.0891412496566772, "learning_rate": 9.608261242732453e-05, "loss": 0.92397137, "memory(GiB)": 67.71, "step": 1240, "train_speed(iter/s)": 0.039998 }, { "acc": 0.74097652, "epoch": 0.8562585969738652, "grad_norm": 0.9601296186447144, "learning_rate": 9.603836759736501e-05, "loss": 0.92762499, "memory(GiB)": 67.71, "step": 1245, "train_speed(iter/s)": 0.040067 }, { "acc": 0.74741158, "epoch": 0.859697386519945, "grad_norm": 1.0737489461898804, "learning_rate": 9.599388461270046e-05, "loss": 0.89353437, "memory(GiB)": 67.71, "step": 1250, "train_speed(iter/s)": 0.040137 }, { "acc": 0.7489872, "epoch": 0.8631361760660248, "grad_norm": 0.8971010446548462, "learning_rate": 9.594916370346464e-05, "loss": 0.9029624, "memory(GiB)": 67.71, "step": 1255, "train_speed(iter/s)": 0.040204 }, { "acc": 0.76023664, "epoch": 0.8665749656121046, "grad_norm": 1.1241250038146973, "learning_rate": 9.590420510102226e-05, "loss": 0.87794628, "memory(GiB)": 67.71, "step": 1260, "train_speed(iter/s)": 0.040279 }, { "acc": 0.73114996, "epoch": 0.8700137551581844, "grad_norm": 1.2173177003860474, "learning_rate": 9.585900903796766e-05, "loss": 0.97089109, "memory(GiB)": 67.71, "step": 1265, "train_speed(iter/s)": 0.040357 }, { "acc": 0.74563522, "epoch": 0.8734525447042641, "grad_norm": 1.1517419815063477, "learning_rate": 9.581357574812375e-05, "loss": 0.89781647, "memory(GiB)": 67.71, "step": 1270, "train_speed(iter/s)": 0.040428 }, { "acc": 0.77028093, "epoch": 0.8768913342503438, "grad_norm": 1.0377655029296875, "learning_rate": 9.576790546654071e-05, "loss": 0.85278912, "memory(GiB)": 67.71, "step": 1275, "train_speed(iter/s)": 0.040497 }, { "acc": 0.73771534, "epoch": 0.8803301237964236, "grad_norm": 1.3595341444015503, "learning_rate": 9.572199842949484e-05, "loss": 0.94212608, "memory(GiB)": 67.71, "step": 1280, "train_speed(iter/s)": 0.04057 }, { "acc": 0.73818164, "epoch": 0.8837689133425034, "grad_norm": 1.0683890581130981, "learning_rate": 9.567585487448723e-05, "loss": 0.94818478, "memory(GiB)": 67.71, "step": 1285, "train_speed(iter/s)": 0.040642 }, { "acc": 0.73784003, "epoch": 0.8872077028885832, "grad_norm": 1.0474903583526611, "learning_rate": 9.562947504024267e-05, "loss": 0.93362265, "memory(GiB)": 67.71, "step": 1290, "train_speed(iter/s)": 0.040707 }, { "acc": 0.73629189, "epoch": 0.890646492434663, "grad_norm": 0.9906838536262512, "learning_rate": 9.558285916670833e-05, "loss": 0.96513948, "memory(GiB)": 67.71, "step": 1295, "train_speed(iter/s)": 0.04077 }, { "acc": 0.74767346, "epoch": 0.8940852819807428, "grad_norm": 0.937610924243927, "learning_rate": 9.553600749505249e-05, "loss": 0.91039581, "memory(GiB)": 67.71, "step": 1300, "train_speed(iter/s)": 0.040841 }, { "epoch": 0.8940852819807428, "eval_acc": 0.7442806046699959, "eval_loss": 0.9251495003700256, "eval_runtime": 1163.3237, "eval_samples_per_second": 3.682, "eval_steps_per_second": 0.066, "step": 1300 }, { "acc": 0.75481806, "epoch": 0.8975240715268226, "grad_norm": 1.093235969543457, "learning_rate": 9.548892026766336e-05, "loss": 0.89359856, "memory(GiB)": 67.71, "step": 1305, "train_speed(iter/s)": 0.039469 }, { "acc": 0.7461483, "epoch": 0.9009628610729024, "grad_norm": 1.1600829362869263, "learning_rate": 9.544159772814784e-05, "loss": 0.92318993, "memory(GiB)": 67.71, "step": 1310, "train_speed(iter/s)": 0.039531 }, { "acc": 0.74466972, "epoch": 0.9044016506189821, "grad_norm": 1.0172392129898071, "learning_rate": 9.539404012133022e-05, "loss": 0.92656469, "memory(GiB)": 67.71, "step": 1315, "train_speed(iter/s)": 0.039601 }, { "acc": 0.74887996, "epoch": 0.9078404401650619, "grad_norm": 1.0804096460342407, "learning_rate": 9.534624769325086e-05, "loss": 0.9098074, "memory(GiB)": 67.71, "step": 1320, "train_speed(iter/s)": 0.039666 }, { "acc": 0.74121346, "epoch": 0.9112792297111417, "grad_norm": 0.9664003849029541, "learning_rate": 9.529822069116499e-05, "loss": 0.9316514, "memory(GiB)": 67.71, "step": 1325, "train_speed(iter/s)": 0.039723 }, { "acc": 0.75105286, "epoch": 0.9147180192572214, "grad_norm": 0.9869258999824524, "learning_rate": 9.524995936354147e-05, "loss": 0.88554888, "memory(GiB)": 67.71, "step": 1330, "train_speed(iter/s)": 0.039787 }, { "acc": 0.73669834, "epoch": 0.9181568088033012, "grad_norm": 1.0221189260482788, "learning_rate": 9.520146396006138e-05, "loss": 0.96149244, "memory(GiB)": 67.71, "step": 1335, "train_speed(iter/s)": 0.03984 }, { "acc": 0.73676386, "epoch": 0.921595598349381, "grad_norm": 1.1528207063674927, "learning_rate": 9.515273473161683e-05, "loss": 0.96016941, "memory(GiB)": 67.71, "step": 1340, "train_speed(iter/s)": 0.039908 }, { "acc": 0.74311166, "epoch": 0.9250343878954608, "grad_norm": 1.161664366722107, "learning_rate": 9.510377193030963e-05, "loss": 0.93340931, "memory(GiB)": 67.71, "step": 1345, "train_speed(iter/s)": 0.039969 }, { "acc": 0.7412838, "epoch": 0.9284731774415406, "grad_norm": 1.1524734497070312, "learning_rate": 9.505457580944998e-05, "loss": 0.94830551, "memory(GiB)": 67.71, "step": 1350, "train_speed(iter/s)": 0.040033 }, { "acc": 0.7353076, "epoch": 0.9319119669876204, "grad_norm": 0.991431474685669, "learning_rate": 9.500514662355515e-05, "loss": 0.94869709, "memory(GiB)": 67.71, "step": 1355, "train_speed(iter/s)": 0.040099 }, { "acc": 0.74592419, "epoch": 0.9353507565337001, "grad_norm": 0.9828781485557556, "learning_rate": 9.495548462834822e-05, "loss": 0.91297379, "memory(GiB)": 67.71, "step": 1360, "train_speed(iter/s)": 0.040166 }, { "acc": 0.72816801, "epoch": 0.9387895460797799, "grad_norm": 0.9917466640472412, "learning_rate": 9.490559008075665e-05, "loss": 0.97318478, "memory(GiB)": 67.71, "step": 1365, "train_speed(iter/s)": 0.040224 }, { "acc": 0.75423832, "epoch": 0.9422283356258597, "grad_norm": 1.021081566810608, "learning_rate": 9.485546323891107e-05, "loss": 0.88315115, "memory(GiB)": 67.71, "step": 1370, "train_speed(iter/s)": 0.040286 }, { "acc": 0.74885693, "epoch": 0.9456671251719395, "grad_norm": 1.1856554746627808, "learning_rate": 9.480510436214387e-05, "loss": 0.91739559, "memory(GiB)": 67.71, "step": 1375, "train_speed(iter/s)": 0.040345 }, { "acc": 0.7300355, "epoch": 0.9491059147180193, "grad_norm": 0.9983332753181458, "learning_rate": 9.475451371098787e-05, "loss": 0.96374044, "memory(GiB)": 67.71, "step": 1380, "train_speed(iter/s)": 0.040398 }, { "acc": 0.76302462, "epoch": 0.952544704264099, "grad_norm": 1.0011341571807861, "learning_rate": 9.470369154717498e-05, "loss": 0.86735439, "memory(GiB)": 67.71, "step": 1385, "train_speed(iter/s)": 0.040463 }, { "acc": 0.74582882, "epoch": 0.9559834938101788, "grad_norm": 1.051133394241333, "learning_rate": 9.465263813363488e-05, "loss": 0.90945959, "memory(GiB)": 67.71, "step": 1390, "train_speed(iter/s)": 0.040529 }, { "acc": 0.74545488, "epoch": 0.9594222833562586, "grad_norm": 1.0635967254638672, "learning_rate": 9.460135373449359e-05, "loss": 0.92096958, "memory(GiB)": 67.71, "step": 1395, "train_speed(iter/s)": 0.0406 }, { "acc": 0.7286293, "epoch": 0.9628610729023384, "grad_norm": 1.146252155303955, "learning_rate": 9.454983861507213e-05, "loss": 0.98950424, "memory(GiB)": 67.71, "step": 1400, "train_speed(iter/s)": 0.040657 }, { "epoch": 0.9628610729023384, "eval_acc": 0.7448486075493769, "eval_loss": 0.9200888872146606, "eval_runtime": 1140.3609, "eval_samples_per_second": 3.756, "eval_steps_per_second": 0.068, "step": 1400 }, { "acc": 0.73903141, "epoch": 0.9662998624484181, "grad_norm": 1.1544698476791382, "learning_rate": 9.44980930418852e-05, "loss": 0.94653835, "memory(GiB)": 67.71, "step": 1405, "train_speed(iter/s)": 0.039418 }, { "acc": 0.73803802, "epoch": 0.9697386519944979, "grad_norm": 1.1627522706985474, "learning_rate": 9.444611728263972e-05, "loss": 0.92657709, "memory(GiB)": 67.71, "step": 1410, "train_speed(iter/s)": 0.039486 }, { "acc": 0.74716005, "epoch": 0.9731774415405777, "grad_norm": 1.0238428115844727, "learning_rate": 9.439391160623352e-05, "loss": 0.91622248, "memory(GiB)": 67.71, "step": 1415, "train_speed(iter/s)": 0.039543 }, { "acc": 0.72584734, "epoch": 0.9766162310866575, "grad_norm": 1.0079649686813354, "learning_rate": 9.434147628275387e-05, "loss": 0.99349623, "memory(GiB)": 67.71, "step": 1420, "train_speed(iter/s)": 0.039598 }, { "acc": 0.74595861, "epoch": 0.9800550206327373, "grad_norm": 0.93181973695755, "learning_rate": 9.428881158347614e-05, "loss": 0.90428505, "memory(GiB)": 67.71, "step": 1425, "train_speed(iter/s)": 0.039658 }, { "acc": 0.74024305, "epoch": 0.9834938101788171, "grad_norm": 1.0997734069824219, "learning_rate": 9.42359177808624e-05, "loss": 0.92796974, "memory(GiB)": 67.71, "step": 1430, "train_speed(iter/s)": 0.039723 }, { "acc": 0.75488276, "epoch": 0.9869325997248969, "grad_norm": 1.075714111328125, "learning_rate": 9.418279514855995e-05, "loss": 0.88083801, "memory(GiB)": 67.71, "step": 1435, "train_speed(iter/s)": 0.039792 }, { "acc": 0.75168705, "epoch": 0.9903713892709766, "grad_norm": 1.039860486984253, "learning_rate": 9.412944396139998e-05, "loss": 0.89997187, "memory(GiB)": 67.71, "step": 1440, "train_speed(iter/s)": 0.039851 }, { "acc": 0.74686685, "epoch": 0.9938101788170564, "grad_norm": 0.9597694277763367, "learning_rate": 9.407586449539616e-05, "loss": 0.90008278, "memory(GiB)": 67.71, "step": 1445, "train_speed(iter/s)": 0.039911 }, { "acc": 0.74714336, "epoch": 0.9972489683631361, "grad_norm": 1.0538160800933838, "learning_rate": 9.402205702774304e-05, "loss": 0.89391537, "memory(GiB)": 67.71, "step": 1450, "train_speed(iter/s)": 0.039977 }, { "acc": 0.74310694, "epoch": 1.000687757909216, "grad_norm": 1.170095443725586, "learning_rate": 9.396802183681483e-05, "loss": 0.9227499, "memory(GiB)": 67.71, "step": 1455, "train_speed(iter/s)": 0.039992 }, { "acc": 0.75078964, "epoch": 1.0041265474552958, "grad_norm": 0.9855571985244751, "learning_rate": 9.391375920216388e-05, "loss": 0.86350327, "memory(GiB)": 67.71, "step": 1460, "train_speed(iter/s)": 0.040045 }, { "acc": 0.76349792, "epoch": 1.0075653370013755, "grad_norm": 1.0560338497161865, "learning_rate": 9.38592694045192e-05, "loss": 0.8352499, "memory(GiB)": 67.71, "step": 1465, "train_speed(iter/s)": 0.040102 }, { "acc": 0.75685053, "epoch": 1.0110041265474552, "grad_norm": 1.2401360273361206, "learning_rate": 9.380455272578501e-05, "loss": 0.86346865, "memory(GiB)": 67.71, "step": 1470, "train_speed(iter/s)": 0.040151 }, { "acc": 0.74811668, "epoch": 1.014442916093535, "grad_norm": 1.0885376930236816, "learning_rate": 9.374960944903933e-05, "loss": 0.90040436, "memory(GiB)": 67.71, "step": 1475, "train_speed(iter/s)": 0.04021 }, { "acc": 0.76131544, "epoch": 1.0178817056396148, "grad_norm": 1.10344660282135, "learning_rate": 9.36944398585325e-05, "loss": 0.84312658, "memory(GiB)": 67.71, "step": 1480, "train_speed(iter/s)": 0.040266 }, { "acc": 0.76296844, "epoch": 1.0213204951856947, "grad_norm": 0.9839646816253662, "learning_rate": 9.36390442396857e-05, "loss": 0.83021584, "memory(GiB)": 67.71, "step": 1485, "train_speed(iter/s)": 0.040327 }, { "acc": 0.75224285, "epoch": 1.0247592847317744, "grad_norm": 1.2059285640716553, "learning_rate": 9.358342287908944e-05, "loss": 0.90571365, "memory(GiB)": 67.71, "step": 1490, "train_speed(iter/s)": 0.040386 }, { "acc": 0.75849729, "epoch": 1.0281980742778543, "grad_norm": 1.134822130203247, "learning_rate": 9.352757606450213e-05, "loss": 0.86388903, "memory(GiB)": 67.71, "step": 1495, "train_speed(iter/s)": 0.040446 }, { "acc": 0.757271, "epoch": 1.031636863823934, "grad_norm": 1.153064489364624, "learning_rate": 9.34715040848486e-05, "loss": 0.86167965, "memory(GiB)": 67.71, "step": 1500, "train_speed(iter/s)": 0.040505 }, { "epoch": 1.031636863823934, "eval_acc": 0.7464288927880506, "eval_loss": 0.9147906303405762, "eval_runtime": 1155.4137, "eval_samples_per_second": 3.707, "eval_steps_per_second": 0.067, "step": 1500 }, { "acc": 0.74669247, "epoch": 1.0350756533700138, "grad_norm": 1.0673768520355225, "learning_rate": 9.341520723021853e-05, "loss": 0.8943819, "memory(GiB)": 67.71, "step": 1505, "train_speed(iter/s)": 0.03933 }, { "acc": 0.76012006, "epoch": 1.0385144429160935, "grad_norm": 1.1268237829208374, "learning_rate": 9.3358685791865e-05, "loss": 0.82661228, "memory(GiB)": 67.71, "step": 1510, "train_speed(iter/s)": 0.039388 }, { "acc": 0.75681725, "epoch": 1.0419532324621734, "grad_norm": 0.9775263667106628, "learning_rate": 9.330194006220301e-05, "loss": 0.85321465, "memory(GiB)": 67.71, "step": 1515, "train_speed(iter/s)": 0.039438 }, { "acc": 0.75653033, "epoch": 1.045392022008253, "grad_norm": 1.0308629274368286, "learning_rate": 9.324497033480792e-05, "loss": 0.85134258, "memory(GiB)": 67.71, "step": 1520, "train_speed(iter/s)": 0.039499 }, { "acc": 0.75386848, "epoch": 1.0488308115543328, "grad_norm": 1.1134449243545532, "learning_rate": 9.318777690441397e-05, "loss": 0.87145538, "memory(GiB)": 67.71, "step": 1525, "train_speed(iter/s)": 0.039557 }, { "acc": 0.75666504, "epoch": 1.0522696011004127, "grad_norm": 1.0742757320404053, "learning_rate": 9.31303600669127e-05, "loss": 0.86683607, "memory(GiB)": 67.71, "step": 1530, "train_speed(iter/s)": 0.03961 }, { "acc": 0.75246024, "epoch": 1.0557083906464924, "grad_norm": 1.33464515209198, "learning_rate": 9.30727201193514e-05, "loss": 0.85711727, "memory(GiB)": 67.71, "step": 1535, "train_speed(iter/s)": 0.039666 }, { "acc": 0.7529563, "epoch": 1.0591471801925723, "grad_norm": 1.165124535560608, "learning_rate": 9.301485735993179e-05, "loss": 0.86484051, "memory(GiB)": 67.71, "step": 1540, "train_speed(iter/s)": 0.039727 }, { "acc": 0.76439376, "epoch": 1.062585969738652, "grad_norm": 0.9591624736785889, "learning_rate": 9.295677208800816e-05, "loss": 0.82861805, "memory(GiB)": 67.71, "step": 1545, "train_speed(iter/s)": 0.039774 }, { "acc": 0.75873203, "epoch": 1.0660247592847318, "grad_norm": 1.1774530410766602, "learning_rate": 9.289846460408602e-05, "loss": 0.85537472, "memory(GiB)": 67.71, "step": 1550, "train_speed(iter/s)": 0.039836 }, { "acc": 0.75072222, "epoch": 1.0694635488308115, "grad_norm": 1.0823148488998413, "learning_rate": 9.283993520982051e-05, "loss": 0.87792244, "memory(GiB)": 67.71, "step": 1555, "train_speed(iter/s)": 0.039888 }, { "acc": 0.755092, "epoch": 1.0729023383768914, "grad_norm": 1.084096908569336, "learning_rate": 9.278118420801481e-05, "loss": 0.8602149, "memory(GiB)": 67.71, "step": 1560, "train_speed(iter/s)": 0.03995 }, { "acc": 0.76771908, "epoch": 1.076341127922971, "grad_norm": 1.157706379890442, "learning_rate": 9.272221190261863e-05, "loss": 0.83946896, "memory(GiB)": 67.71, "step": 1565, "train_speed(iter/s)": 0.040014 }, { "acc": 0.75533552, "epoch": 1.0797799174690508, "grad_norm": 1.1175612211227417, "learning_rate": 9.266301859872657e-05, "loss": 0.86314983, "memory(GiB)": 67.71, "step": 1570, "train_speed(iter/s)": 0.040069 }, { "acc": 0.74409065, "epoch": 1.0832187070151307, "grad_norm": 1.0851186513900757, "learning_rate": 9.260360460257653e-05, "loss": 0.91000662, "memory(GiB)": 67.71, "step": 1575, "train_speed(iter/s)": 0.040117 }, { "acc": 0.76165962, "epoch": 1.0866574965612104, "grad_norm": 1.208783507347107, "learning_rate": 9.254397022154828e-05, "loss": 0.86310711, "memory(GiB)": 67.71, "step": 1580, "train_speed(iter/s)": 0.040175 }, { "acc": 0.75226078, "epoch": 1.0900962861072903, "grad_norm": 1.2301445007324219, "learning_rate": 9.248411576416162e-05, "loss": 0.87202549, "memory(GiB)": 67.71, "step": 1585, "train_speed(iter/s)": 0.04023 }, { "acc": 0.74834862, "epoch": 1.09353507565337, "grad_norm": 1.0916322469711304, "learning_rate": 9.242404154007502e-05, "loss": 0.89558239, "memory(GiB)": 67.71, "step": 1590, "train_speed(iter/s)": 0.040284 }, { "acc": 0.75830355, "epoch": 1.0969738651994498, "grad_norm": 1.077378749847412, "learning_rate": 9.236374786008389e-05, "loss": 0.85708294, "memory(GiB)": 67.71, "step": 1595, "train_speed(iter/s)": 0.04034 }, { "acc": 0.76014338, "epoch": 1.1004126547455295, "grad_norm": 1.2474371194839478, "learning_rate": 9.230323503611897e-05, "loss": 0.85164671, "memory(GiB)": 67.71, "step": 1600, "train_speed(iter/s)": 0.040404 }, { "epoch": 1.1004126547455295, "eval_acc": 0.7467494488684933, "eval_loss": 0.9101867079734802, "eval_runtime": 1124.3275, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.068, "step": 1600 }, { "acc": 0.75647812, "epoch": 1.1038514442916094, "grad_norm": 1.179998517036438, "learning_rate": 9.224250338124481e-05, "loss": 0.86214447, "memory(GiB)": 67.71, "step": 1605, "train_speed(iter/s)": 0.039344 }, { "acc": 0.76110773, "epoch": 1.107290233837689, "grad_norm": 1.1358368396759033, "learning_rate": 9.2181553209658e-05, "loss": 0.82860346, "memory(GiB)": 67.71, "step": 1610, "train_speed(iter/s)": 0.039399 }, { "acc": 0.7529737, "epoch": 1.110729023383769, "grad_norm": 1.1643849611282349, "learning_rate": 9.212038483668572e-05, "loss": 0.88976746, "memory(GiB)": 67.71, "step": 1615, "train_speed(iter/s)": 0.039457 }, { "acc": 0.75971909, "epoch": 1.1141678129298487, "grad_norm": 1.1216496229171753, "learning_rate": 9.205899857878396e-05, "loss": 0.85760059, "memory(GiB)": 67.71, "step": 1620, "train_speed(iter/s)": 0.039517 }, { "acc": 0.76570654, "epoch": 1.1176066024759286, "grad_norm": 1.2371224164962769, "learning_rate": 9.199739475353596e-05, "loss": 0.82434063, "memory(GiB)": 67.71, "step": 1625, "train_speed(iter/s)": 0.039574 }, { "acc": 0.75779676, "epoch": 1.1210453920220083, "grad_norm": 1.1810933351516724, "learning_rate": 9.193557367965056e-05, "loss": 0.85758648, "memory(GiB)": 67.71, "step": 1630, "train_speed(iter/s)": 0.03963 }, { "acc": 0.76221857, "epoch": 1.124484181568088, "grad_norm": 1.1857250928878784, "learning_rate": 9.187353567696055e-05, "loss": 0.84511681, "memory(GiB)": 67.71, "step": 1635, "train_speed(iter/s)": 0.039679 }, { "acc": 0.74748664, "epoch": 1.1279229711141678, "grad_norm": 1.026563286781311, "learning_rate": 9.181128106642096e-05, "loss": 0.9065136, "memory(GiB)": 67.71, "step": 1640, "train_speed(iter/s)": 0.039732 }, { "acc": 0.75156937, "epoch": 1.1313617606602475, "grad_norm": 1.0305781364440918, "learning_rate": 9.174881017010746e-05, "loss": 0.86748962, "memory(GiB)": 67.71, "step": 1645, "train_speed(iter/s)": 0.039783 }, { "acc": 0.75970831, "epoch": 1.1348005502063274, "grad_norm": 1.2061082124710083, "learning_rate": 9.168612331121477e-05, "loss": 0.84413948, "memory(GiB)": 67.71, "step": 1650, "train_speed(iter/s)": 0.039833 }, { "acc": 0.75250425, "epoch": 1.138239339752407, "grad_norm": 1.2730051279067993, "learning_rate": 9.162322081405473e-05, "loss": 0.86202583, "memory(GiB)": 67.71, "step": 1655, "train_speed(iter/s)": 0.039887 }, { "acc": 0.7535017, "epoch": 1.141678129298487, "grad_norm": 1.0208563804626465, "learning_rate": 9.156010300405495e-05, "loss": 0.86017208, "memory(GiB)": 67.71, "step": 1660, "train_speed(iter/s)": 0.03994 }, { "acc": 0.7593123, "epoch": 1.1451169188445667, "grad_norm": 1.2210179567337036, "learning_rate": 9.149677020775686e-05, "loss": 0.8386488, "memory(GiB)": 67.71, "step": 1665, "train_speed(iter/s)": 0.039998 }, { "acc": 0.76598496, "epoch": 1.1485557083906466, "grad_norm": 1.1266486644744873, "learning_rate": 9.143322275281419e-05, "loss": 0.84045124, "memory(GiB)": 67.71, "step": 1670, "train_speed(iter/s)": 0.040053 }, { "acc": 0.7449192, "epoch": 1.1519944979367263, "grad_norm": 1.2747905254364014, "learning_rate": 9.136946096799117e-05, "loss": 0.89558125, "memory(GiB)": 67.71, "step": 1675, "train_speed(iter/s)": 0.040103 }, { "acc": 0.77260947, "epoch": 1.155433287482806, "grad_norm": 1.1446512937545776, "learning_rate": 9.13054851831609e-05, "loss": 0.79779301, "memory(GiB)": 67.71, "step": 1680, "train_speed(iter/s)": 0.040158 }, { "acc": 0.73968267, "epoch": 1.1588720770288858, "grad_norm": 1.0520663261413574, "learning_rate": 9.124129572930356e-05, "loss": 0.91217728, "memory(GiB)": 67.71, "step": 1685, "train_speed(iter/s)": 0.040209 }, { "acc": 0.76177702, "epoch": 1.1623108665749655, "grad_norm": 1.0818169116973877, "learning_rate": 9.117689293850484e-05, "loss": 0.84482117, "memory(GiB)": 67.71, "step": 1690, "train_speed(iter/s)": 0.040254 }, { "acc": 0.75831223, "epoch": 1.1657496561210454, "grad_norm": 1.1914788484573364, "learning_rate": 9.111227714395406e-05, "loss": 0.85761623, "memory(GiB)": 67.71, "step": 1695, "train_speed(iter/s)": 0.040302 }, { "acc": 0.75756545, "epoch": 1.169188445667125, "grad_norm": 1.1335783004760742, "learning_rate": 9.104744867994258e-05, "loss": 0.85422668, "memory(GiB)": 67.71, "step": 1700, "train_speed(iter/s)": 0.040347 }, { "epoch": 1.169188445667125, "eval_acc": 0.7492295406487605, "eval_loss": 0.9045791625976562, "eval_runtime": 1125.5138, "eval_samples_per_second": 3.805, "eval_steps_per_second": 0.068, "step": 1700 }, { "acc": 0.75732212, "epoch": 1.172627235213205, "grad_norm": 0.9720064997673035, "learning_rate": 9.098240788186192e-05, "loss": 0.85368481, "memory(GiB)": 67.71, "step": 1705, "train_speed(iter/s)": 0.03935 }, { "acc": 0.76147232, "epoch": 1.1760660247592847, "grad_norm": 1.2705514430999756, "learning_rate": 9.091715508620222e-05, "loss": 0.85527439, "memory(GiB)": 67.71, "step": 1710, "train_speed(iter/s)": 0.039404 }, { "acc": 0.74866266, "epoch": 1.1795048143053646, "grad_norm": 1.1010618209838867, "learning_rate": 9.085169063055032e-05, "loss": 0.8962719, "memory(GiB)": 67.71, "step": 1715, "train_speed(iter/s)": 0.039457 }, { "acc": 0.76777854, "epoch": 1.1829436038514443, "grad_norm": 1.0222831964492798, "learning_rate": 9.078601485358813e-05, "loss": 0.81568956, "memory(GiB)": 67.71, "step": 1720, "train_speed(iter/s)": 0.03951 }, { "acc": 0.74900856, "epoch": 1.1863823933975242, "grad_norm": 1.1607588529586792, "learning_rate": 9.072012809509081e-05, "loss": 0.88696823, "memory(GiB)": 67.71, "step": 1725, "train_speed(iter/s)": 0.039555 }, { "acc": 0.75043535, "epoch": 1.1898211829436038, "grad_norm": 1.1782574653625488, "learning_rate": 9.065403069592505e-05, "loss": 0.86962795, "memory(GiB)": 67.71, "step": 1730, "train_speed(iter/s)": 0.039599 }, { "acc": 0.74629622, "epoch": 1.1932599724896837, "grad_norm": 1.1644479036331177, "learning_rate": 9.058772299804731e-05, "loss": 0.88353643, "memory(GiB)": 67.71, "step": 1735, "train_speed(iter/s)": 0.039653 }, { "acc": 0.76667023, "epoch": 1.1966987620357634, "grad_norm": 1.176121711730957, "learning_rate": 9.052120534450196e-05, "loss": 0.82560787, "memory(GiB)": 67.71, "step": 1740, "train_speed(iter/s)": 0.039705 }, { "acc": 0.7706706, "epoch": 1.200137551581843, "grad_norm": 1.2071737051010132, "learning_rate": 9.045447807941972e-05, "loss": 0.82129135, "memory(GiB)": 67.71, "step": 1745, "train_speed(iter/s)": 0.039758 }, { "acc": 0.76220055, "epoch": 1.203576341127923, "grad_norm": 1.161576509475708, "learning_rate": 9.038754154801559e-05, "loss": 0.84442816, "memory(GiB)": 67.71, "step": 1750, "train_speed(iter/s)": 0.039809 }, { "acc": 0.76516528, "epoch": 1.2070151306740027, "grad_norm": 1.0194506645202637, "learning_rate": 9.032039609658732e-05, "loss": 0.82462807, "memory(GiB)": 67.71, "step": 1755, "train_speed(iter/s)": 0.03985 }, { "acc": 0.76512585, "epoch": 1.2104539202200826, "grad_norm": 1.123105764389038, "learning_rate": 9.025304207251346e-05, "loss": 0.84622154, "memory(GiB)": 67.71, "step": 1760, "train_speed(iter/s)": 0.039905 }, { "acc": 0.75925913, "epoch": 1.2138927097661623, "grad_norm": 1.0418940782546997, "learning_rate": 9.018547982425164e-05, "loss": 0.84370403, "memory(GiB)": 67.71, "step": 1765, "train_speed(iter/s)": 0.03995 }, { "acc": 0.76256437, "epoch": 1.2173314993122422, "grad_norm": 1.133818507194519, "learning_rate": 9.011770970133671e-05, "loss": 0.84478779, "memory(GiB)": 67.71, "step": 1770, "train_speed(iter/s)": 0.040002 }, { "acc": 0.75265675, "epoch": 1.2207702888583218, "grad_norm": 1.3675616979599, "learning_rate": 9.0049732054379e-05, "loss": 0.86621552, "memory(GiB)": 67.71, "step": 1775, "train_speed(iter/s)": 0.040062 }, { "acc": 0.75733051, "epoch": 1.2242090784044017, "grad_norm": 1.2875425815582275, "learning_rate": 8.998154723506249e-05, "loss": 0.88228512, "memory(GiB)": 67.71, "step": 1780, "train_speed(iter/s)": 0.040108 }, { "acc": 0.74635658, "epoch": 1.2276478679504814, "grad_norm": 1.2586891651153564, "learning_rate": 8.991315559614288e-05, "loss": 0.90037432, "memory(GiB)": 67.71, "step": 1785, "train_speed(iter/s)": 0.040152 }, { "acc": 0.7586679, "epoch": 1.231086657496561, "grad_norm": 1.1891663074493408, "learning_rate": 8.984455749144597e-05, "loss": 0.84769564, "memory(GiB)": 67.71, "step": 1790, "train_speed(iter/s)": 0.0402 }, { "acc": 0.74606829, "epoch": 1.234525447042641, "grad_norm": 1.154038667678833, "learning_rate": 8.977575327586563e-05, "loss": 0.88660145, "memory(GiB)": 67.71, "step": 1795, "train_speed(iter/s)": 0.040243 }, { "acc": 0.77012577, "epoch": 1.2379642365887207, "grad_norm": 1.2006701231002808, "learning_rate": 8.97067433053621e-05, "loss": 0.8128231, "memory(GiB)": 67.71, "step": 1800, "train_speed(iter/s)": 0.040293 }, { "epoch": 1.2379642365887207, "eval_acc": 0.7502980609169029, "eval_loss": 0.9007091522216797, "eval_runtime": 1174.5762, "eval_samples_per_second": 3.646, "eval_steps_per_second": 0.066, "step": 1800 }, { "acc": 0.74588566, "epoch": 1.2414030261348006, "grad_norm": 1.2025572061538696, "learning_rate": 8.963752793696004e-05, "loss": 0.89730377, "memory(GiB)": 67.71, "step": 1805, "train_speed(iter/s)": 0.039311 }, { "acc": 0.76906261, "epoch": 1.2448418156808803, "grad_norm": 1.0686986446380615, "learning_rate": 8.956810752874682e-05, "loss": 0.81423302, "memory(GiB)": 67.71, "step": 1810, "train_speed(iter/s)": 0.039359 }, { "acc": 0.77615113, "epoch": 1.2482806052269602, "grad_norm": 1.2386928796768188, "learning_rate": 8.949848243987054e-05, "loss": 0.79887466, "memory(GiB)": 67.71, "step": 1815, "train_speed(iter/s)": 0.039407 }, { "acc": 0.75191274, "epoch": 1.2517193947730398, "grad_norm": 1.180568814277649, "learning_rate": 8.94286530305382e-05, "loss": 0.85600204, "memory(GiB)": 67.71, "step": 1820, "train_speed(iter/s)": 0.039452 }, { "acc": 0.76613312, "epoch": 1.2551581843191197, "grad_norm": 1.1538622379302979, "learning_rate": 8.935861966201393e-05, "loss": 0.82688131, "memory(GiB)": 67.71, "step": 1825, "train_speed(iter/s)": 0.039499 }, { "acc": 0.77081518, "epoch": 1.2585969738651994, "grad_norm": 1.0973575115203857, "learning_rate": 8.928838269661694e-05, "loss": 0.80709963, "memory(GiB)": 67.71, "step": 1830, "train_speed(iter/s)": 0.039543 }, { "acc": 0.74893703, "epoch": 1.262035763411279, "grad_norm": 1.1516822576522827, "learning_rate": 8.921794249771987e-05, "loss": 0.87887421, "memory(GiB)": 67.71, "step": 1835, "train_speed(iter/s)": 0.039584 }, { "acc": 0.74806905, "epoch": 1.265474552957359, "grad_norm": 1.1790939569473267, "learning_rate": 8.914729942974674e-05, "loss": 0.88099899, "memory(GiB)": 67.71, "step": 1840, "train_speed(iter/s)": 0.03963 }, { "acc": 0.77447009, "epoch": 1.268913342503439, "grad_norm": 1.009238600730896, "learning_rate": 8.907645385817104e-05, "loss": 0.7905911, "memory(GiB)": 67.71, "step": 1845, "train_speed(iter/s)": 0.039686 }, { "acc": 0.75110741, "epoch": 1.2723521320495186, "grad_norm": 1.2757585048675537, "learning_rate": 8.900540614951409e-05, "loss": 0.87034512, "memory(GiB)": 67.71, "step": 1850, "train_speed(iter/s)": 0.03973 }, { "acc": 0.74727058, "epoch": 1.2757909215955983, "grad_norm": 1.0743454694747925, "learning_rate": 8.893415667134281e-05, "loss": 0.88521938, "memory(GiB)": 67.71, "step": 1855, "train_speed(iter/s)": 0.039772 }, { "acc": 0.76257467, "epoch": 1.2792297111416782, "grad_norm": 1.0623903274536133, "learning_rate": 8.886270579226807e-05, "loss": 0.84139423, "memory(GiB)": 67.71, "step": 1860, "train_speed(iter/s)": 0.03982 }, { "acc": 0.76310492, "epoch": 1.2826685006877578, "grad_norm": 1.0730196237564087, "learning_rate": 8.879105388194267e-05, "loss": 0.84801579, "memory(GiB)": 67.71, "step": 1865, "train_speed(iter/s)": 0.039868 }, { "acc": 0.76296768, "epoch": 1.2861072902338377, "grad_norm": 1.0681921243667603, "learning_rate": 8.871920131105943e-05, "loss": 0.82966671, "memory(GiB)": 67.71, "step": 1870, "train_speed(iter/s)": 0.039919 }, { "acc": 0.7662539, "epoch": 1.2895460797799174, "grad_norm": 1.1676512956619263, "learning_rate": 8.864714845134931e-05, "loss": 0.82158031, "memory(GiB)": 67.71, "step": 1875, "train_speed(iter/s)": 0.039968 }, { "acc": 0.76386523, "epoch": 1.2929848693259973, "grad_norm": 1.2241677045822144, "learning_rate": 8.857489567557949e-05, "loss": 0.8327158, "memory(GiB)": 67.71, "step": 1880, "train_speed(iter/s)": 0.040021 }, { "acc": 0.77355728, "epoch": 1.296423658872077, "grad_norm": 1.0751720666885376, "learning_rate": 8.850244335755136e-05, "loss": 0.803335, "memory(GiB)": 67.71, "step": 1885, "train_speed(iter/s)": 0.040073 }, { "acc": 0.76320724, "epoch": 1.299862448418157, "grad_norm": 1.292360544204712, "learning_rate": 8.84297918720987e-05, "loss": 0.85545721, "memory(GiB)": 67.71, "step": 1890, "train_speed(iter/s)": 0.040122 }, { "acc": 0.76533775, "epoch": 1.3033012379642366, "grad_norm": 1.27505624294281, "learning_rate": 8.835694159508568e-05, "loss": 0.83456764, "memory(GiB)": 67.71, "step": 1895, "train_speed(iter/s)": 0.040172 }, { "acc": 0.77199011, "epoch": 1.3067400275103163, "grad_norm": 1.0612465143203735, "learning_rate": 8.82838929034049e-05, "loss": 0.81219292, "memory(GiB)": 67.71, "step": 1900, "train_speed(iter/s)": 0.040222 }, { "epoch": 1.3067400275103163, "eval_acc": 0.7521145453727449, "eval_loss": 0.8923233151435852, "eval_runtime": 1131.335, "eval_samples_per_second": 3.786, "eval_steps_per_second": 0.068, "step": 1900 }, { "acc": 0.75053563, "epoch": 1.3101788170563962, "grad_norm": 1.2158348560333252, "learning_rate": 8.821064617497549e-05, "loss": 0.87764034, "memory(GiB)": 67.71, "step": 1905, "train_speed(iter/s)": 0.039328 }, { "acc": 0.76767535, "epoch": 1.313617606602476, "grad_norm": 1.0964173078536987, "learning_rate": 8.81372017887411e-05, "loss": 0.83279819, "memory(GiB)": 67.71, "step": 1910, "train_speed(iter/s)": 0.039378 }, { "acc": 0.75541239, "epoch": 1.3170563961485557, "grad_norm": 1.2945960760116577, "learning_rate": 8.806356012466799e-05, "loss": 0.8567975, "memory(GiB)": 67.71, "step": 1915, "train_speed(iter/s)": 0.03942 }, { "acc": 0.75376849, "epoch": 1.3204951856946354, "grad_norm": 1.2059944868087769, "learning_rate": 8.798972156374303e-05, "loss": 0.86053438, "memory(GiB)": 67.71, "step": 1920, "train_speed(iter/s)": 0.039461 }, { "acc": 0.75244598, "epoch": 1.3239339752407153, "grad_norm": 1.2470142841339111, "learning_rate": 8.791568648797175e-05, "loss": 0.84860821, "memory(GiB)": 67.71, "step": 1925, "train_speed(iter/s)": 0.039509 }, { "acc": 0.76857953, "epoch": 1.327372764786795, "grad_norm": 1.074821949005127, "learning_rate": 8.784145528037633e-05, "loss": 0.81543255, "memory(GiB)": 67.71, "step": 1930, "train_speed(iter/s)": 0.039548 }, { "acc": 0.75690975, "epoch": 1.330811554332875, "grad_norm": 1.2594019174575806, "learning_rate": 8.776702832499369e-05, "loss": 0.85649605, "memory(GiB)": 67.71, "step": 1935, "train_speed(iter/s)": 0.039588 }, { "acc": 0.75640688, "epoch": 1.3342503438789546, "grad_norm": 1.2854877710342407, "learning_rate": 8.769240600687341e-05, "loss": 0.85886908, "memory(GiB)": 67.71, "step": 1940, "train_speed(iter/s)": 0.03963 }, { "acc": 0.76101456, "epoch": 1.3376891334250343, "grad_norm": 1.2323275804519653, "learning_rate": 8.761758871207578e-05, "loss": 0.85549269, "memory(GiB)": 67.71, "step": 1945, "train_speed(iter/s)": 0.039677 }, { "acc": 0.75675645, "epoch": 1.3411279229711142, "grad_norm": 1.3912837505340576, "learning_rate": 8.754257682766987e-05, "loss": 0.86173325, "memory(GiB)": 67.71, "step": 1950, "train_speed(iter/s)": 0.039723 }, { "acc": 0.75836124, "epoch": 1.344566712517194, "grad_norm": 1.325785517692566, "learning_rate": 8.746737074173139e-05, "loss": 0.85381556, "memory(GiB)": 67.71, "step": 1955, "train_speed(iter/s)": 0.039767 }, { "acc": 0.75378246, "epoch": 1.3480055020632737, "grad_norm": 1.3383103609085083, "learning_rate": 8.739197084334078e-05, "loss": 0.85643635, "memory(GiB)": 67.71, "step": 1960, "train_speed(iter/s)": 0.039815 }, { "acc": 0.74990363, "epoch": 1.3514442916093534, "grad_norm": 1.0907026529312134, "learning_rate": 8.731637752258122e-05, "loss": 0.8558506, "memory(GiB)": 67.71, "step": 1965, "train_speed(iter/s)": 0.039861 }, { "acc": 0.75551319, "epoch": 1.3548830811554333, "grad_norm": 1.1416265964508057, "learning_rate": 8.724059117053647e-05, "loss": 0.86469622, "memory(GiB)": 67.71, "step": 1970, "train_speed(iter/s)": 0.039902 }, { "acc": 0.7511488, "epoch": 1.358321870701513, "grad_norm": 1.1393564939498901, "learning_rate": 8.716461217928903e-05, "loss": 0.85416451, "memory(GiB)": 67.71, "step": 1975, "train_speed(iter/s)": 0.039948 }, { "acc": 0.76492167, "epoch": 1.361760660247593, "grad_norm": 1.0601388216018677, "learning_rate": 8.708844094191798e-05, "loss": 0.82022047, "memory(GiB)": 67.71, "step": 1980, "train_speed(iter/s)": 0.039988 }, { "acc": 0.75548849, "epoch": 1.3651994497936726, "grad_norm": 1.1647326946258545, "learning_rate": 8.701207785249703e-05, "loss": 0.8785594, "memory(GiB)": 67.71, "step": 1985, "train_speed(iter/s)": 0.040031 }, { "acc": 0.76257005, "epoch": 1.3686382393397525, "grad_norm": 1.208771824836731, "learning_rate": 8.693552330609235e-05, "loss": 0.82169209, "memory(GiB)": 67.71, "step": 1990, "train_speed(iter/s)": 0.040075 }, { "acc": 0.7662899, "epoch": 1.3720770288858322, "grad_norm": 1.0375357866287231, "learning_rate": 8.685877769876074e-05, "loss": 0.82175579, "memory(GiB)": 67.71, "step": 1995, "train_speed(iter/s)": 0.040117 }, { "acc": 0.75507236, "epoch": 1.375515818431912, "grad_norm": 1.070656180381775, "learning_rate": 8.678184142754736e-05, "loss": 0.84867239, "memory(GiB)": 67.71, "step": 2000, "train_speed(iter/s)": 0.040168 }, { "epoch": 1.375515818431912, "eval_acc": 0.7514059477212399, "eval_loss": 0.890434980392456, "eval_runtime": 1140.0174, "eval_samples_per_second": 3.757, "eval_steps_per_second": 0.068, "step": 2000 }, { "acc": 0.75042534, "epoch": 1.3789546079779917, "grad_norm": 1.181110143661499, "learning_rate": 8.670471489048382e-05, "loss": 0.90365086, "memory(GiB)": 67.71, "step": 2005, "train_speed(iter/s)": 0.039309 }, { "acc": 0.75193415, "epoch": 1.3823933975240714, "grad_norm": 1.0542738437652588, "learning_rate": 8.662739848658605e-05, "loss": 0.86276369, "memory(GiB)": 67.71, "step": 2010, "train_speed(iter/s)": 0.039351 }, { "acc": 0.76266041, "epoch": 1.3858321870701513, "grad_norm": 1.1464662551879883, "learning_rate": 8.654989261585231e-05, "loss": 0.83303232, "memory(GiB)": 67.71, "step": 2015, "train_speed(iter/s)": 0.039396 }, { "acc": 0.7555974, "epoch": 1.3892709766162312, "grad_norm": 1.137511134147644, "learning_rate": 8.6472197679261e-05, "loss": 0.87258329, "memory(GiB)": 67.71, "step": 2020, "train_speed(iter/s)": 0.039441 }, { "acc": 0.75746002, "epoch": 1.392709766162311, "grad_norm": 1.1067372560501099, "learning_rate": 8.639431407876873e-05, "loss": 0.8575942, "memory(GiB)": 67.71, "step": 2025, "train_speed(iter/s)": 0.039486 }, { "acc": 0.74920359, "epoch": 1.3961485557083906, "grad_norm": 1.1339222192764282, "learning_rate": 8.631624221730809e-05, "loss": 0.89333057, "memory(GiB)": 67.71, "step": 2030, "train_speed(iter/s)": 0.039533 }, { "acc": 0.75785513, "epoch": 1.3995873452544705, "grad_norm": 1.193408489227295, "learning_rate": 8.623798249878573e-05, "loss": 0.85004654, "memory(GiB)": 67.71, "step": 2035, "train_speed(iter/s)": 0.039579 }, { "acc": 0.76322355, "epoch": 1.4030261348005502, "grad_norm": 1.4497336149215698, "learning_rate": 8.615953532808008e-05, "loss": 0.85098343, "memory(GiB)": 67.71, "step": 2040, "train_speed(iter/s)": 0.039624 }, { "acc": 0.75989523, "epoch": 1.40646492434663, "grad_norm": 1.424786925315857, "learning_rate": 8.608090111103948e-05, "loss": 0.86450672, "memory(GiB)": 67.71, "step": 2045, "train_speed(iter/s)": 0.039665 }, { "acc": 0.75607204, "epoch": 1.4099037138927097, "grad_norm": 1.1818575859069824, "learning_rate": 8.600208025447983e-05, "loss": 0.859338, "memory(GiB)": 67.71, "step": 2050, "train_speed(iter/s)": 0.039707 }, { "acc": 0.74912252, "epoch": 1.4133425034387894, "grad_norm": 1.0915964841842651, "learning_rate": 8.592307316618272e-05, "loss": 0.88583393, "memory(GiB)": 67.71, "step": 2055, "train_speed(iter/s)": 0.039753 }, { "acc": 0.7610446, "epoch": 1.4167812929848693, "grad_norm": 1.3371332883834839, "learning_rate": 8.584388025489314e-05, "loss": 0.83794365, "memory(GiB)": 67.71, "step": 2060, "train_speed(iter/s)": 0.0398 }, { "acc": 0.76503677, "epoch": 1.4202200825309492, "grad_norm": 1.0140537023544312, "learning_rate": 8.57645019303175e-05, "loss": 0.81193466, "memory(GiB)": 67.71, "step": 2065, "train_speed(iter/s)": 0.039843 }, { "acc": 0.75801926, "epoch": 1.423658872077029, "grad_norm": 1.2762821912765503, "learning_rate": 8.568493860312142e-05, "loss": 0.85571671, "memory(GiB)": 67.71, "step": 2070, "train_speed(iter/s)": 0.039884 }, { "acc": 0.75672455, "epoch": 1.4270976616231086, "grad_norm": 1.1539915800094604, "learning_rate": 8.56051906849276e-05, "loss": 0.86462698, "memory(GiB)": 67.71, "step": 2075, "train_speed(iter/s)": 0.039921 }, { "acc": 0.75165954, "epoch": 1.4305364511691885, "grad_norm": 1.306449055671692, "learning_rate": 8.55252585883138e-05, "loss": 0.88216114, "memory(GiB)": 67.71, "step": 2080, "train_speed(iter/s)": 0.039965 }, { "acc": 0.76647811, "epoch": 1.4339752407152682, "grad_norm": 1.1315670013427734, "learning_rate": 8.544514272681056e-05, "loss": 0.83219862, "memory(GiB)": 67.71, "step": 2085, "train_speed(iter/s)": 0.040006 }, { "acc": 0.75151563, "epoch": 1.437414030261348, "grad_norm": 1.2553141117095947, "learning_rate": 8.536484351489918e-05, "loss": 0.88327541, "memory(GiB)": 67.71, "step": 2090, "train_speed(iter/s)": 0.040054 }, { "acc": 0.75124393, "epoch": 1.4408528198074277, "grad_norm": 1.0970312356948853, "learning_rate": 8.528436136800955e-05, "loss": 0.86639719, "memory(GiB)": 67.71, "step": 2095, "train_speed(iter/s)": 0.040094 }, { "acc": 0.75917168, "epoch": 1.4442916093535076, "grad_norm": 1.1937634944915771, "learning_rate": 8.520369670251787e-05, "loss": 0.85560112, "memory(GiB)": 67.71, "step": 2100, "train_speed(iter/s)": 0.040143 }, { "epoch": 1.4442916093535076, "eval_acc": 0.7529749853781437, "eval_loss": 0.8845105767250061, "eval_runtime": 1065.8538, "eval_samples_per_second": 4.018, "eval_steps_per_second": 0.072, "step": 2100 }, { "acc": 0.75502768, "epoch": 1.4477303988995873, "grad_norm": 1.2217875719070435, "learning_rate": 8.512284993574473e-05, "loss": 0.86460505, "memory(GiB)": 67.71, "step": 2105, "train_speed(iter/s)": 0.039384 }, { "acc": 0.7641192, "epoch": 1.4511691884456672, "grad_norm": 1.1658051013946533, "learning_rate": 8.504182148595275e-05, "loss": 0.82581739, "memory(GiB)": 67.71, "step": 2110, "train_speed(iter/s)": 0.039424 }, { "acc": 0.75860772, "epoch": 1.454607977991747, "grad_norm": 1.4592278003692627, "learning_rate": 8.496061177234452e-05, "loss": 0.84182692, "memory(GiB)": 67.71, "step": 2115, "train_speed(iter/s)": 0.039466 }, { "acc": 0.76232295, "epoch": 1.4580467675378266, "grad_norm": 1.2424806356430054, "learning_rate": 8.487922121506039e-05, "loss": 0.84641819, "memory(GiB)": 67.71, "step": 2120, "train_speed(iter/s)": 0.039509 }, { "acc": 0.75302744, "epoch": 1.4614855570839065, "grad_norm": 1.1985810995101929, "learning_rate": 8.479765023517631e-05, "loss": 0.87050896, "memory(GiB)": 67.71, "step": 2125, "train_speed(iter/s)": 0.039552 }, { "acc": 0.76178207, "epoch": 1.4649243466299862, "grad_norm": 1.108946442604065, "learning_rate": 8.471589925470166e-05, "loss": 0.82996387, "memory(GiB)": 67.71, "step": 2130, "train_speed(iter/s)": 0.039594 }, { "acc": 0.75262017, "epoch": 1.468363136176066, "grad_norm": 1.266554832458496, "learning_rate": 8.463396869657704e-05, "loss": 0.85832672, "memory(GiB)": 67.71, "step": 2135, "train_speed(iter/s)": 0.039634 }, { "acc": 0.76968784, "epoch": 1.4718019257221457, "grad_norm": 1.2651324272155762, "learning_rate": 8.455185898467213e-05, "loss": 0.80993366, "memory(GiB)": 67.71, "step": 2140, "train_speed(iter/s)": 0.039677 }, { "acc": 0.76287999, "epoch": 1.4752407152682256, "grad_norm": 1.4108299016952515, "learning_rate": 8.446957054378344e-05, "loss": 0.82752171, "memory(GiB)": 67.71, "step": 2145, "train_speed(iter/s)": 0.039717 }, { "acc": 0.77508984, "epoch": 1.4786795048143053, "grad_norm": 1.1667840480804443, "learning_rate": 8.438710379963214e-05, "loss": 0.78502192, "memory(GiB)": 67.71, "step": 2150, "train_speed(iter/s)": 0.039761 }, { "acc": 0.74883337, "epoch": 1.4821182943603852, "grad_norm": 1.1578980684280396, "learning_rate": 8.430445917886186e-05, "loss": 0.88730097, "memory(GiB)": 67.71, "step": 2155, "train_speed(iter/s)": 0.039804 }, { "acc": 0.77094564, "epoch": 1.485557083906465, "grad_norm": 1.039753794670105, "learning_rate": 8.422163710903649e-05, "loss": 0.80611877, "memory(GiB)": 67.71, "step": 2160, "train_speed(iter/s)": 0.039843 }, { "acc": 0.76888881, "epoch": 1.4889958734525446, "grad_norm": 1.5004595518112183, "learning_rate": 8.413863801863794e-05, "loss": 0.80163708, "memory(GiB)": 67.71, "step": 2165, "train_speed(iter/s)": 0.039887 }, { "acc": 0.76752806, "epoch": 1.4924346629986245, "grad_norm": 1.2288601398468018, "learning_rate": 8.405546233706395e-05, "loss": 0.82048512, "memory(GiB)": 67.71, "step": 2170, "train_speed(iter/s)": 0.039936 }, { "acc": 0.76018772, "epoch": 1.4958734525447044, "grad_norm": 1.1614660024642944, "learning_rate": 8.397211049462586e-05, "loss": 0.84854307, "memory(GiB)": 67.71, "step": 2175, "train_speed(iter/s)": 0.039975 }, { "acc": 0.77043438, "epoch": 1.499312242090784, "grad_norm": 1.3372976779937744, "learning_rate": 8.388858292254637e-05, "loss": 0.79604712, "memory(GiB)": 67.71, "step": 2180, "train_speed(iter/s)": 0.040022 }, { "acc": 0.76631165, "epoch": 1.5027510316368637, "grad_norm": 1.1987308263778687, "learning_rate": 8.380488005295732e-05, "loss": 0.83228321, "memory(GiB)": 67.71, "step": 2185, "train_speed(iter/s)": 0.040065 }, { "acc": 0.75986252, "epoch": 1.5061898211829436, "grad_norm": 1.0442498922348022, "learning_rate": 8.37210023188975e-05, "loss": 0.84417458, "memory(GiB)": 67.71, "step": 2190, "train_speed(iter/s)": 0.040102 }, { "acc": 0.75435362, "epoch": 1.5096286107290235, "grad_norm": 1.0899875164031982, "learning_rate": 8.363695015431028e-05, "loss": 0.8657095, "memory(GiB)": 67.71, "step": 2195, "train_speed(iter/s)": 0.04015 }, { "acc": 0.76132326, "epoch": 1.5130674002751032, "grad_norm": 1.076157569885254, "learning_rate": 8.355272399404156e-05, "loss": 0.83814745, "memory(GiB)": 67.71, "step": 2200, "train_speed(iter/s)": 0.040192 }, { "epoch": 1.5130674002751032, "eval_acc": 0.7542065955819499, "eval_loss": 0.877788245677948, "eval_runtime": 1157.3904, "eval_samples_per_second": 3.701, "eval_steps_per_second": 0.067, "step": 2200 }, { "acc": 0.75972261, "epoch": 1.516506189821183, "grad_norm": 1.387868046760559, "learning_rate": 8.346832427383732e-05, "loss": 0.84049091, "memory(GiB)": 67.71, "step": 2205, "train_speed(iter/s)": 0.039401 }, { "acc": 0.74931083, "epoch": 1.5199449793672626, "grad_norm": 1.2687524557113647, "learning_rate": 8.338375143034148e-05, "loss": 0.87477436, "memory(GiB)": 67.71, "step": 2210, "train_speed(iter/s)": 0.039447 }, { "acc": 0.76771116, "epoch": 1.5233837689133425, "grad_norm": 1.1818050146102905, "learning_rate": 8.329900590109365e-05, "loss": 0.81554508, "memory(GiB)": 67.71, "step": 2215, "train_speed(iter/s)": 0.039489 }, { "acc": 0.76883683, "epoch": 1.5268225584594224, "grad_norm": 1.1263651847839355, "learning_rate": 8.321408812452678e-05, "loss": 0.82251701, "memory(GiB)": 67.71, "step": 2220, "train_speed(iter/s)": 0.03953 }, { "acc": 0.76298013, "epoch": 1.530261348005502, "grad_norm": 1.1538478136062622, "learning_rate": 8.312899853996501e-05, "loss": 0.81565828, "memory(GiB)": 67.71, "step": 2225, "train_speed(iter/s)": 0.039564 }, { "acc": 0.75918069, "epoch": 1.5337001375515817, "grad_norm": 1.2974464893341064, "learning_rate": 8.304373758762128e-05, "loss": 0.8574604, "memory(GiB)": 67.71, "step": 2230, "train_speed(iter/s)": 0.039603 }, { "acc": 0.76306868, "epoch": 1.5371389270976616, "grad_norm": 1.2755868434906006, "learning_rate": 8.295830570859512e-05, "loss": 0.83660641, "memory(GiB)": 67.71, "step": 2235, "train_speed(iter/s)": 0.039636 }, { "acc": 0.75555844, "epoch": 1.5405777166437415, "grad_norm": 1.196268081665039, "learning_rate": 8.287270334487034e-05, "loss": 0.84184723, "memory(GiB)": 67.71, "step": 2240, "train_speed(iter/s)": 0.039674 }, { "acc": 0.77357531, "epoch": 1.5440165061898212, "grad_norm": 1.2208247184753418, "learning_rate": 8.278693093931282e-05, "loss": 0.79285612, "memory(GiB)": 67.71, "step": 2245, "train_speed(iter/s)": 0.039716 }, { "acc": 0.75058088, "epoch": 1.547455295735901, "grad_norm": 1.1691052913665771, "learning_rate": 8.270098893566807e-05, "loss": 0.864328, "memory(GiB)": 67.73, "step": 2250, "train_speed(iter/s)": 0.03975 }, { "acc": 0.74813089, "epoch": 1.5508940852819806, "grad_norm": 1.300010323524475, "learning_rate": 8.261487777855909e-05, "loss": 0.89021215, "memory(GiB)": 67.73, "step": 2255, "train_speed(iter/s)": 0.039788 }, { "acc": 0.75823145, "epoch": 1.5543328748280605, "grad_norm": 1.080557107925415, "learning_rate": 8.252859791348392e-05, "loss": 0.86599722, "memory(GiB)": 67.73, "step": 2260, "train_speed(iter/s)": 0.039824 }, { "acc": 0.75691137, "epoch": 1.5577716643741404, "grad_norm": 1.098506212234497, "learning_rate": 8.244214978681348e-05, "loss": 0.87960701, "memory(GiB)": 67.73, "step": 2265, "train_speed(iter/s)": 0.039862 }, { "acc": 0.76327119, "epoch": 1.56121045392022, "grad_norm": 1.3256527185440063, "learning_rate": 8.23555338457892e-05, "loss": 0.8320919, "memory(GiB)": 67.73, "step": 2270, "train_speed(iter/s)": 0.039901 }, { "acc": 0.75916958, "epoch": 1.5646492434662997, "grad_norm": 1.2192107439041138, "learning_rate": 8.226875053852066e-05, "loss": 0.84912138, "memory(GiB)": 67.73, "step": 2275, "train_speed(iter/s)": 0.039941 }, { "acc": 0.75922327, "epoch": 1.5680880330123796, "grad_norm": 1.3527653217315674, "learning_rate": 8.218180031398334e-05, "loss": 0.84136915, "memory(GiB)": 67.73, "step": 2280, "train_speed(iter/s)": 0.039986 }, { "acc": 0.76518865, "epoch": 1.5715268225584595, "grad_norm": 1.1937755346298218, "learning_rate": 8.209468362201627e-05, "loss": 0.82890606, "memory(GiB)": 67.73, "step": 2285, "train_speed(iter/s)": 0.040027 }, { "acc": 0.76164193, "epoch": 1.5749656121045392, "grad_norm": 1.1419281959533691, "learning_rate": 8.200740091331969e-05, "loss": 0.8369875, "memory(GiB)": 67.73, "step": 2290, "train_speed(iter/s)": 0.040071 }, { "acc": 0.7657156, "epoch": 1.578404401650619, "grad_norm": 1.2638212442398071, "learning_rate": 8.19199526394527e-05, "loss": 0.82229643, "memory(GiB)": 67.73, "step": 2295, "train_speed(iter/s)": 0.040112 }, { "acc": 0.76849699, "epoch": 1.5818431911966988, "grad_norm": 1.1830896139144897, "learning_rate": 8.183233925283104e-05, "loss": 0.79942322, "memory(GiB)": 67.73, "step": 2300, "train_speed(iter/s)": 0.040155 }, { "epoch": 1.5818431911966988, "eval_acc": 0.7557418904935439, "eval_loss": 0.8737921714782715, "eval_runtime": 1090.3389, "eval_samples_per_second": 3.928, "eval_steps_per_second": 0.071, "step": 2300 }, { "acc": 0.76840105, "epoch": 1.5852819807427787, "grad_norm": 1.1682363748550415, "learning_rate": 8.17445612067246e-05, "loss": 0.82419491, "memory(GiB)": 67.73, "step": 2305, "train_speed(iter/s)": 0.039444 }, { "acc": 0.76440401, "epoch": 1.5887207702888584, "grad_norm": 1.2088557481765747, "learning_rate": 8.165661895525515e-05, "loss": 0.8236021, "memory(GiB)": 67.73, "step": 2310, "train_speed(iter/s)": 0.039487 }, { "acc": 0.76112623, "epoch": 1.592159559834938, "grad_norm": 1.2835819721221924, "learning_rate": 8.156851295339401e-05, "loss": 0.84509296, "memory(GiB)": 67.73, "step": 2315, "train_speed(iter/s)": 0.039527 }, { "acc": 0.75933437, "epoch": 1.5955983493810177, "grad_norm": 1.1950072050094604, "learning_rate": 8.148024365695961e-05, "loss": 0.83572178, "memory(GiB)": 67.73, "step": 2320, "train_speed(iter/s)": 0.039562 }, { "acc": 0.74721594, "epoch": 1.5990371389270976, "grad_norm": 1.1385269165039062, "learning_rate": 8.139181152261524e-05, "loss": 0.87340145, "memory(GiB)": 67.73, "step": 2325, "train_speed(iter/s)": 0.039597 }, { "acc": 0.76489792, "epoch": 1.6024759284731775, "grad_norm": 1.3601405620574951, "learning_rate": 8.130321700786662e-05, "loss": 0.81867256, "memory(GiB)": 67.73, "step": 2330, "train_speed(iter/s)": 0.039642 }, { "acc": 0.7683671, "epoch": 1.6059147180192572, "grad_norm": 1.2533677816390991, "learning_rate": 8.121446057105955e-05, "loss": 0.81394958, "memory(GiB)": 67.73, "step": 2335, "train_speed(iter/s)": 0.039682 }, { "acc": 0.75637655, "epoch": 1.609353507565337, "grad_norm": 1.196452260017395, "learning_rate": 8.112554267137753e-05, "loss": 0.84699097, "memory(GiB)": 67.73, "step": 2340, "train_speed(iter/s)": 0.03972 }, { "acc": 0.77178955, "epoch": 1.6127922971114168, "grad_norm": 1.0918421745300293, "learning_rate": 8.103646376883937e-05, "loss": 0.79872456, "memory(GiB)": 67.73, "step": 2345, "train_speed(iter/s)": 0.039764 }, { "acc": 0.75204129, "epoch": 1.6162310866574967, "grad_norm": 1.2889692783355713, "learning_rate": 8.094722432429691e-05, "loss": 0.88343906, "memory(GiB)": 67.73, "step": 2350, "train_speed(iter/s)": 0.039803 }, { "acc": 0.75908298, "epoch": 1.6196698762035764, "grad_norm": 1.1028622388839722, "learning_rate": 8.085782479943245e-05, "loss": 0.8362504, "memory(GiB)": 67.73, "step": 2355, "train_speed(iter/s)": 0.039844 }, { "acc": 0.76125684, "epoch": 1.623108665749656, "grad_norm": 1.3756259679794312, "learning_rate": 8.076826565675657e-05, "loss": 0.8257452, "memory(GiB)": 67.73, "step": 2360, "train_speed(iter/s)": 0.039885 }, { "acc": 0.76714849, "epoch": 1.6265474552957357, "grad_norm": 1.5173252820968628, "learning_rate": 8.067854735960555e-05, "loss": 0.81308384, "memory(GiB)": 67.73, "step": 2365, "train_speed(iter/s)": 0.039928 }, { "acc": 0.7745882, "epoch": 1.6299862448418156, "grad_norm": 1.2738362550735474, "learning_rate": 8.058867037213916e-05, "loss": 0.79546738, "memory(GiB)": 67.73, "step": 2370, "train_speed(iter/s)": 0.039964 }, { "acc": 0.77209988, "epoch": 1.6334250343878955, "grad_norm": 1.1855344772338867, "learning_rate": 8.049863515933802e-05, "loss": 0.79778285, "memory(GiB)": 67.73, "step": 2375, "train_speed(iter/s)": 0.039997 }, { "acc": 0.76279697, "epoch": 1.6368638239339752, "grad_norm": 1.1562272310256958, "learning_rate": 8.040844218700147e-05, "loss": 0.82462883, "memory(GiB)": 67.73, "step": 2380, "train_speed(iter/s)": 0.040033 }, { "acc": 0.76684308, "epoch": 1.640302613480055, "grad_norm": 1.3373991250991821, "learning_rate": 8.031809192174495e-05, "loss": 0.81806488, "memory(GiB)": 67.73, "step": 2385, "train_speed(iter/s)": 0.040073 }, { "acc": 0.75144334, "epoch": 1.6437414030261348, "grad_norm": 1.3013478517532349, "learning_rate": 8.022758483099767e-05, "loss": 0.86880703, "memory(GiB)": 67.73, "step": 2390, "train_speed(iter/s)": 0.040113 }, { "acc": 0.7674602, "epoch": 1.6471801925722147, "grad_norm": 1.2739620208740234, "learning_rate": 8.013692138300018e-05, "loss": 0.82607212, "memory(GiB)": 67.73, "step": 2395, "train_speed(iter/s)": 0.040148 }, { "acc": 0.75973258, "epoch": 1.6506189821182944, "grad_norm": 1.313481330871582, "learning_rate": 8.004610204680196e-05, "loss": 0.83364353, "memory(GiB)": 67.73, "step": 2400, "train_speed(iter/s)": 0.040184 }, { "epoch": 1.6506189821182944, "eval_acc": 0.7556687812120394, "eval_loss": 0.8708279728889465, "eval_runtime": 1138.5113, "eval_samples_per_second": 3.762, "eval_steps_per_second": 0.068, "step": 2400 }, { "acc": 0.76298213, "epoch": 1.654057771664374, "grad_norm": 1.3461359739303589, "learning_rate": 7.995512729225894e-05, "loss": 0.82495756, "memory(GiB)": 67.73, "step": 2405, "train_speed(iter/s)": 0.03947 }, { "acc": 0.76738596, "epoch": 1.657496561210454, "grad_norm": 1.272608757019043, "learning_rate": 7.986399759003119e-05, "loss": 0.82517872, "memory(GiB)": 67.73, "step": 2410, "train_speed(iter/s)": 0.039512 }, { "acc": 0.75537925, "epoch": 1.6609353507565336, "grad_norm": 1.2757365703582764, "learning_rate": 7.977271341158035e-05, "loss": 0.8790472, "memory(GiB)": 67.73, "step": 2415, "train_speed(iter/s)": 0.039549 }, { "acc": 0.75297923, "epoch": 1.6643741403026135, "grad_norm": 1.1887763738632202, "learning_rate": 7.968127522916723e-05, "loss": 0.8699337, "memory(GiB)": 67.73, "step": 2420, "train_speed(iter/s)": 0.039586 }, { "acc": 0.76874723, "epoch": 1.6678129298486932, "grad_norm": 1.0565059185028076, "learning_rate": 7.95896835158494e-05, "loss": 0.81132565, "memory(GiB)": 67.73, "step": 2425, "train_speed(iter/s)": 0.03962 }, { "acc": 0.75187912, "epoch": 1.671251719394773, "grad_norm": 1.1199684143066406, "learning_rate": 7.949793874547877e-05, "loss": 0.89120388, "memory(GiB)": 67.73, "step": 2430, "train_speed(iter/s)": 0.039655 }, { "acc": 0.7603467, "epoch": 1.6746905089408528, "grad_norm": 1.225197196006775, "learning_rate": 7.940604139269903e-05, "loss": 0.83448153, "memory(GiB)": 67.73, "step": 2435, "train_speed(iter/s)": 0.039691 }, { "acc": 0.76669693, "epoch": 1.6781292984869327, "grad_norm": 1.2933470010757446, "learning_rate": 7.931399193294331e-05, "loss": 0.81721525, "memory(GiB)": 67.73, "step": 2440, "train_speed(iter/s)": 0.039727 }, { "acc": 0.76869669, "epoch": 1.6815680880330124, "grad_norm": 1.311872124671936, "learning_rate": 7.922179084243161e-05, "loss": 0.82446499, "memory(GiB)": 67.73, "step": 2445, "train_speed(iter/s)": 0.039768 }, { "acc": 0.74998088, "epoch": 1.685006877579092, "grad_norm": 1.109681248664856, "learning_rate": 7.912943859816847e-05, "loss": 0.88515491, "memory(GiB)": 67.73, "step": 2450, "train_speed(iter/s)": 0.039804 }, { "acc": 0.76143503, "epoch": 1.688445667125172, "grad_norm": 1.2891324758529663, "learning_rate": 7.903693567794035e-05, "loss": 0.84492321, "memory(GiB)": 67.73, "step": 2455, "train_speed(iter/s)": 0.039838 }, { "acc": 0.77193007, "epoch": 1.6918844566712519, "grad_norm": 1.0803942680358887, "learning_rate": 7.894428256031332e-05, "loss": 0.78853378, "memory(GiB)": 67.73, "step": 2460, "train_speed(iter/s)": 0.039872 }, { "acc": 0.78182478, "epoch": 1.6953232462173315, "grad_norm": 1.1716827154159546, "learning_rate": 7.88514797246304e-05, "loss": 0.76458054, "memory(GiB)": 67.73, "step": 2465, "train_speed(iter/s)": 0.039909 }, { "acc": 0.76684537, "epoch": 1.6987620357634112, "grad_norm": 1.2370884418487549, "learning_rate": 7.875852765100926e-05, "loss": 0.83122387, "memory(GiB)": 67.73, "step": 2470, "train_speed(iter/s)": 0.03994 }, { "acc": 0.7637423, "epoch": 1.702200825309491, "grad_norm": 1.4126001596450806, "learning_rate": 7.866542682033964e-05, "loss": 0.83239994, "memory(GiB)": 67.73, "step": 2475, "train_speed(iter/s)": 0.03998 }, { "acc": 0.7585422, "epoch": 1.7056396148555708, "grad_norm": 1.277557373046875, "learning_rate": 7.857217771428085e-05, "loss": 0.84281693, "memory(GiB)": 67.73, "step": 2480, "train_speed(iter/s)": 0.040019 }, { "acc": 0.76609259, "epoch": 1.7090784044016507, "grad_norm": 1.2501623630523682, "learning_rate": 7.847878081525932e-05, "loss": 0.8170804, "memory(GiB)": 67.73, "step": 2485, "train_speed(iter/s)": 0.040056 }, { "acc": 0.75793953, "epoch": 1.7125171939477304, "grad_norm": 1.5299009084701538, "learning_rate": 7.838523660646611e-05, "loss": 0.86527452, "memory(GiB)": 67.73, "step": 2490, "train_speed(iter/s)": 0.040087 }, { "acc": 0.75957718, "epoch": 1.71595598349381, "grad_norm": 1.2600144147872925, "learning_rate": 7.829154557185438e-05, "loss": 0.84371586, "memory(GiB)": 67.73, "step": 2495, "train_speed(iter/s)": 0.040119 }, { "acc": 0.77097268, "epoch": 1.71939477303989, "grad_norm": 1.2481366395950317, "learning_rate": 7.819770819613685e-05, "loss": 0.79474764, "memory(GiB)": 67.73, "step": 2500, "train_speed(iter/s)": 0.040159 }, { "epoch": 1.71939477303989, "eval_acc": 0.755859990102128, "eval_loss": 0.8665845990180969, "eval_runtime": 1172.0149, "eval_samples_per_second": 3.654, "eval_steps_per_second": 0.066, "step": 2500 }, { "acc": 0.76427364, "epoch": 1.7228335625859699, "grad_norm": 1.3622970581054688, "learning_rate": 7.810372496478342e-05, "loss": 0.83532944, "memory(GiB)": 67.73, "step": 2505, "train_speed(iter/s)": 0.039453 }, { "acc": 0.75734344, "epoch": 1.7262723521320495, "grad_norm": 1.2002394199371338, "learning_rate": 7.800959636401853e-05, "loss": 0.85398045, "memory(GiB)": 67.73, "step": 2510, "train_speed(iter/s)": 0.039487 }, { "acc": 0.76924725, "epoch": 1.7297111416781292, "grad_norm": 1.1298774480819702, "learning_rate": 7.791532288081868e-05, "loss": 0.81432209, "memory(GiB)": 67.73, "step": 2515, "train_speed(iter/s)": 0.03953 }, { "acc": 0.76489067, "epoch": 1.7331499312242091, "grad_norm": 1.3914809226989746, "learning_rate": 7.782090500290998e-05, "loss": 0.83234596, "memory(GiB)": 67.73, "step": 2520, "train_speed(iter/s)": 0.039566 }, { "acc": 0.75667624, "epoch": 1.7365887207702888, "grad_norm": 1.2779312133789062, "learning_rate": 7.77263432187655e-05, "loss": 0.84862852, "memory(GiB)": 67.73, "step": 2525, "train_speed(iter/s)": 0.039602 }, { "acc": 0.76841941, "epoch": 1.7400275103163687, "grad_norm": 1.1182903051376343, "learning_rate": 7.763163801760286e-05, "loss": 0.80550995, "memory(GiB)": 67.73, "step": 2530, "train_speed(iter/s)": 0.039639 }, { "acc": 0.75564499, "epoch": 1.7434662998624484, "grad_norm": 1.325380802154541, "learning_rate": 7.753678988938162e-05, "loss": 0.85131378, "memory(GiB)": 67.73, "step": 2535, "train_speed(iter/s)": 0.039668 }, { "acc": 0.77792916, "epoch": 1.746905089408528, "grad_norm": 1.2355977296829224, "learning_rate": 7.74417993248008e-05, "loss": 0.76762047, "memory(GiB)": 67.73, "step": 2540, "train_speed(iter/s)": 0.039703 }, { "acc": 0.7703722, "epoch": 1.750343878954608, "grad_norm": 1.1645699739456177, "learning_rate": 7.734666681529633e-05, "loss": 0.80783539, "memory(GiB)": 67.73, "step": 2545, "train_speed(iter/s)": 0.039737 }, { "acc": 0.76581202, "epoch": 1.7537826685006879, "grad_norm": 1.2667499780654907, "learning_rate": 7.725139285303843e-05, "loss": 0.81426716, "memory(GiB)": 67.73, "step": 2550, "train_speed(iter/s)": 0.039769 }, { "acc": 0.75635591, "epoch": 1.7572214580467675, "grad_norm": 1.325819492340088, "learning_rate": 7.71559779309292e-05, "loss": 0.85436335, "memory(GiB)": 67.73, "step": 2555, "train_speed(iter/s)": 0.039804 }, { "acc": 0.76963515, "epoch": 1.7606602475928472, "grad_norm": 1.1206269264221191, "learning_rate": 7.70604225426e-05, "loss": 0.79051266, "memory(GiB)": 67.73, "step": 2560, "train_speed(iter/s)": 0.039844 }, { "acc": 0.77096367, "epoch": 1.7640990371389271, "grad_norm": 1.2406977415084839, "learning_rate": 7.696472718240883e-05, "loss": 0.8147171, "memory(GiB)": 67.73, "step": 2565, "train_speed(iter/s)": 0.039879 }, { "acc": 0.76435289, "epoch": 1.767537826685007, "grad_norm": 1.2148582935333252, "learning_rate": 7.686889234543788e-05, "loss": 0.82190208, "memory(GiB)": 67.73, "step": 2570, "train_speed(iter/s)": 0.039906 }, { "acc": 0.76752815, "epoch": 1.7709766162310867, "grad_norm": 1.0485085248947144, "learning_rate": 7.677291852749093e-05, "loss": 0.81348267, "memory(GiB)": 67.73, "step": 2575, "train_speed(iter/s)": 0.039941 }, { "acc": 0.75209255, "epoch": 1.7744154057771664, "grad_norm": 1.2998508214950562, "learning_rate": 7.667680622509081e-05, "loss": 0.85738831, "memory(GiB)": 67.73, "step": 2580, "train_speed(iter/s)": 0.039973 }, { "acc": 0.77212009, "epoch": 1.777854195323246, "grad_norm": 1.1790145635604858, "learning_rate": 7.65805559354767e-05, "loss": 0.81079607, "memory(GiB)": 67.73, "step": 2585, "train_speed(iter/s)": 0.040007 }, { "acc": 0.76209583, "epoch": 1.781292984869326, "grad_norm": 1.147714614868164, "learning_rate": 7.648416815660177e-05, "loss": 0.82997284, "memory(GiB)": 67.73, "step": 2590, "train_speed(iter/s)": 0.040039 }, { "acc": 0.76417446, "epoch": 1.7847317744154059, "grad_norm": 1.3315579891204834, "learning_rate": 7.638764338713044e-05, "loss": 0.81722393, "memory(GiB)": 67.73, "step": 2595, "train_speed(iter/s)": 0.040074 }, { "acc": 0.76321087, "epoch": 1.7881705639614855, "grad_norm": 1.282499074935913, "learning_rate": 7.629098212643586e-05, "loss": 0.82541628, "memory(GiB)": 67.73, "step": 2600, "train_speed(iter/s)": 0.040108 }, { "epoch": 1.7881705639614855, "eval_acc": 0.7575696225311558, "eval_loss": 0.8597843050956726, "eval_runtime": 1165.0354, "eval_samples_per_second": 3.676, "eval_steps_per_second": 0.066, "step": 2600 }, { "acc": 0.76687446, "epoch": 1.7916093535075652, "grad_norm": 1.1122969388961792, "learning_rate": 7.619418487459733e-05, "loss": 0.80312977, "memory(GiB)": 67.73, "step": 2605, "train_speed(iter/s)": 0.039432 }, { "acc": 0.76852121, "epoch": 1.7950481430536451, "grad_norm": 1.3779712915420532, "learning_rate": 7.609725213239771e-05, "loss": 0.79960012, "memory(GiB)": 67.73, "step": 2610, "train_speed(iter/s)": 0.039468 }, { "acc": 0.76329231, "epoch": 1.798486932599725, "grad_norm": 1.4102786779403687, "learning_rate": 7.60001844013208e-05, "loss": 0.83775997, "memory(GiB)": 67.73, "step": 2615, "train_speed(iter/s)": 0.039502 }, { "acc": 0.76466594, "epoch": 1.8019257221458047, "grad_norm": 1.2906368970870972, "learning_rate": 7.590298218354877e-05, "loss": 0.82873688, "memory(GiB)": 67.73, "step": 2620, "train_speed(iter/s)": 0.039534 }, { "acc": 0.76969028, "epoch": 1.8053645116918844, "grad_norm": 1.2397427558898926, "learning_rate": 7.580564598195957e-05, "loss": 0.81822834, "memory(GiB)": 67.73, "step": 2625, "train_speed(iter/s)": 0.03957 }, { "acc": 0.7630662, "epoch": 1.8088033012379643, "grad_norm": 1.3433514833450317, "learning_rate": 7.570817630012435e-05, "loss": 0.82502728, "memory(GiB)": 67.73, "step": 2630, "train_speed(iter/s)": 0.039606 }, { "acc": 0.76269207, "epoch": 1.812242090784044, "grad_norm": 1.0851596593856812, "learning_rate": 7.561057364230475e-05, "loss": 0.81567841, "memory(GiB)": 67.73, "step": 2635, "train_speed(iter/s)": 0.03964 }, { "acc": 0.75853286, "epoch": 1.8156808803301239, "grad_norm": 1.2418478727340698, "learning_rate": 7.551283851345042e-05, "loss": 0.84409733, "memory(GiB)": 67.73, "step": 2640, "train_speed(iter/s)": 0.039671 }, { "acc": 0.76311216, "epoch": 1.8191196698762035, "grad_norm": 1.1131020784378052, "learning_rate": 7.541497141919636e-05, "loss": 0.82704649, "memory(GiB)": 67.73, "step": 2645, "train_speed(iter/s)": 0.039706 }, { "acc": 0.76137314, "epoch": 1.8225584594222832, "grad_norm": 1.378839373588562, "learning_rate": 7.531697286586024e-05, "loss": 0.833605, "memory(GiB)": 67.73, "step": 2650, "train_speed(iter/s)": 0.039741 }, { "acc": 0.76775074, "epoch": 1.8259972489683631, "grad_norm": 1.269956350326538, "learning_rate": 7.521884336043988e-05, "loss": 0.82008057, "memory(GiB)": 67.73, "step": 2655, "train_speed(iter/s)": 0.039776 }, { "acc": 0.77038703, "epoch": 1.829436038514443, "grad_norm": 1.1615530252456665, "learning_rate": 7.51205834106106e-05, "loss": 0.79894481, "memory(GiB)": 67.73, "step": 2660, "train_speed(iter/s)": 0.039811 }, { "acc": 0.76342058, "epoch": 1.8328748280605227, "grad_norm": 1.482840895652771, "learning_rate": 7.502219352472252e-05, "loss": 0.85134239, "memory(GiB)": 67.73, "step": 2665, "train_speed(iter/s)": 0.039842 }, { "acc": 0.77493591, "epoch": 1.8363136176066024, "grad_norm": 1.321035385131836, "learning_rate": 7.492367421179802e-05, "loss": 0.77966547, "memory(GiB)": 67.73, "step": 2670, "train_speed(iter/s)": 0.039874 }, { "acc": 0.7690084, "epoch": 1.8397524071526823, "grad_norm": 1.1279528141021729, "learning_rate": 7.482502598152908e-05, "loss": 0.80104809, "memory(GiB)": 67.73, "step": 2675, "train_speed(iter/s)": 0.039909 }, { "acc": 0.75845652, "epoch": 1.8431911966987622, "grad_norm": 1.340296983718872, "learning_rate": 7.472624934427461e-05, "loss": 0.8414402, "memory(GiB)": 67.73, "step": 2680, "train_speed(iter/s)": 0.039938 }, { "acc": 0.77168741, "epoch": 1.8466299862448419, "grad_norm": 1.1220262050628662, "learning_rate": 7.462734481105788e-05, "loss": 0.80376453, "memory(GiB)": 67.73, "step": 2685, "train_speed(iter/s)": 0.03997 }, { "acc": 0.77061701, "epoch": 1.8500687757909215, "grad_norm": 1.15684974193573, "learning_rate": 7.452831289356382e-05, "loss": 0.80241566, "memory(GiB)": 67.73, "step": 2690, "train_speed(iter/s)": 0.040001 }, { "acc": 0.77713642, "epoch": 1.8535075653370012, "grad_norm": 1.0986595153808594, "learning_rate": 7.442915410413635e-05, "loss": 0.76731901, "memory(GiB)": 67.73, "step": 2695, "train_speed(iter/s)": 0.040033 }, { "acc": 0.77222977, "epoch": 1.8569463548830811, "grad_norm": 1.258157730102539, "learning_rate": 7.43298689557758e-05, "loss": 0.78445282, "memory(GiB)": 67.73, "step": 2700, "train_speed(iter/s)": 0.040066 }, { "epoch": 1.8569463548830811, "eval_acc": 0.7587618661987673, "eval_loss": 0.8582912087440491, "eval_runtime": 1138.3587, "eval_samples_per_second": 3.762, "eval_steps_per_second": 0.068, "step": 2700 }, { "acc": 0.76180067, "epoch": 1.860385144429161, "grad_norm": 1.2461254596710205, "learning_rate": 7.423045796213618e-05, "loss": 0.84116306, "memory(GiB)": 67.73, "step": 2705, "train_speed(iter/s)": 0.039433 }, { "acc": 0.76939058, "epoch": 1.8638239339752407, "grad_norm": 1.1999588012695312, "learning_rate": 7.413092163752263e-05, "loss": 0.79865079, "memory(GiB)": 67.73, "step": 2710, "train_speed(iter/s)": 0.039464 }, { "acc": 0.76056762, "epoch": 1.8672627235213204, "grad_norm": 1.1177629232406616, "learning_rate": 7.403126049688864e-05, "loss": 0.83327274, "memory(GiB)": 67.73, "step": 2715, "train_speed(iter/s)": 0.0395 }, { "acc": 0.76080637, "epoch": 1.8707015130674003, "grad_norm": 1.1269170045852661, "learning_rate": 7.393147505583345e-05, "loss": 0.84126358, "memory(GiB)": 67.73, "step": 2720, "train_speed(iter/s)": 0.039532 }, { "acc": 0.76141424, "epoch": 1.8741403026134802, "grad_norm": 1.4532649517059326, "learning_rate": 7.383156583059941e-05, "loss": 0.83713207, "memory(GiB)": 67.73, "step": 2725, "train_speed(iter/s)": 0.039565 }, { "acc": 0.76509705, "epoch": 1.8775790921595599, "grad_norm": 1.7495522499084473, "learning_rate": 7.373153333806917e-05, "loss": 0.82034264, "memory(GiB)": 67.73, "step": 2730, "train_speed(iter/s)": 0.039601 }, { "acc": 0.7695118, "epoch": 1.8810178817056395, "grad_norm": 1.3915905952453613, "learning_rate": 7.363137809576322e-05, "loss": 0.81267509, "memory(GiB)": 67.73, "step": 2735, "train_speed(iter/s)": 0.039634 }, { "acc": 0.76744928, "epoch": 1.8844566712517192, "grad_norm": 1.1005185842514038, "learning_rate": 7.353110062183706e-05, "loss": 0.82437191, "memory(GiB)": 67.73, "step": 2740, "train_speed(iter/s)": 0.039664 }, { "acc": 0.75223131, "epoch": 1.8878954607977991, "grad_norm": 1.4645339250564575, "learning_rate": 7.343070143507853e-05, "loss": 0.84084778, "memory(GiB)": 67.73, "step": 2745, "train_speed(iter/s)": 0.039692 }, { "acc": 0.76176004, "epoch": 1.891334250343879, "grad_norm": 1.2676668167114258, "learning_rate": 7.333018105490515e-05, "loss": 0.83568439, "memory(GiB)": 67.73, "step": 2750, "train_speed(iter/s)": 0.039723 }, { "acc": 0.75615792, "epoch": 1.8947730398899587, "grad_norm": 1.2724095582962036, "learning_rate": 7.322954000136148e-05, "loss": 0.8729351, "memory(GiB)": 67.73, "step": 2755, "train_speed(iter/s)": 0.03975 }, { "acc": 0.78407092, "epoch": 1.8982118294360384, "grad_norm": 1.0895689725875854, "learning_rate": 7.312877879511639e-05, "loss": 0.76207438, "memory(GiB)": 67.73, "step": 2760, "train_speed(iter/s)": 0.039781 }, { "acc": 0.76615877, "epoch": 1.9016506189821183, "grad_norm": 1.2832506895065308, "learning_rate": 7.30278979574603e-05, "loss": 0.82421865, "memory(GiB)": 67.73, "step": 2765, "train_speed(iter/s)": 0.039817 }, { "acc": 0.75757556, "epoch": 1.9050894085281982, "grad_norm": 1.069419503211975, "learning_rate": 7.292689801030262e-05, "loss": 0.84605732, "memory(GiB)": 67.73, "step": 2770, "train_speed(iter/s)": 0.039848 }, { "acc": 0.7682868, "epoch": 1.9085281980742779, "grad_norm": 1.2053790092468262, "learning_rate": 7.282577947616894e-05, "loss": 0.81153774, "memory(GiB)": 67.73, "step": 2775, "train_speed(iter/s)": 0.039879 }, { "acc": 0.76934462, "epoch": 1.9119669876203575, "grad_norm": 1.1767531633377075, "learning_rate": 7.272454287819833e-05, "loss": 0.80814152, "memory(GiB)": 67.73, "step": 2780, "train_speed(iter/s)": 0.039913 }, { "acc": 0.7687346, "epoch": 1.9154057771664375, "grad_norm": 1.2420388460159302, "learning_rate": 7.262318874014073e-05, "loss": 0.79890871, "memory(GiB)": 67.73, "step": 2785, "train_speed(iter/s)": 0.039945 }, { "acc": 0.76412306, "epoch": 1.9188445667125174, "grad_norm": 1.1197503805160522, "learning_rate": 7.252171758635413e-05, "loss": 0.8147377, "memory(GiB)": 67.73, "step": 2790, "train_speed(iter/s)": 0.039979 }, { "acc": 0.76467457, "epoch": 1.922283356258597, "grad_norm": 1.3166663646697998, "learning_rate": 7.242012994180194e-05, "loss": 0.80287476, "memory(GiB)": 67.73, "step": 2795, "train_speed(iter/s)": 0.040006 }, { "acc": 0.75744901, "epoch": 1.9257221458046767, "grad_norm": 1.226488709449768, "learning_rate": 7.231842633205018e-05, "loss": 0.84611988, "memory(GiB)": 67.73, "step": 2800, "train_speed(iter/s)": 0.040037 }, { "epoch": 1.9257221458046767, "eval_acc": 0.7585087956089441, "eval_loss": 0.8532779812812805, "eval_runtime": 1120.0858, "eval_samples_per_second": 3.824, "eval_steps_per_second": 0.069, "step": 2800 }, { "acc": 0.75332041, "epoch": 1.9291609353507564, "grad_norm": 1.3092836141586304, "learning_rate": 7.221660728326488e-05, "loss": 0.86540194, "memory(GiB)": 67.73, "step": 2805, "train_speed(iter/s)": 0.039438 }, { "acc": 0.75658636, "epoch": 1.9325997248968363, "grad_norm": 1.507814645767212, "learning_rate": 7.211467332220929e-05, "loss": 0.84582443, "memory(GiB)": 67.73, "step": 2810, "train_speed(iter/s)": 0.039471 }, { "acc": 0.77044678, "epoch": 1.9360385144429162, "grad_norm": 1.1073263883590698, "learning_rate": 7.201262497624113e-05, "loss": 0.80068121, "memory(GiB)": 67.73, "step": 2815, "train_speed(iter/s)": 0.0395 }, { "acc": 0.76184196, "epoch": 1.9394773039889959, "grad_norm": 1.2935534715652466, "learning_rate": 7.191046277330989e-05, "loss": 0.80897961, "memory(GiB)": 67.73, "step": 2820, "train_speed(iter/s)": 0.039535 }, { "acc": 0.77721043, "epoch": 1.9429160935350756, "grad_norm": 1.291559100151062, "learning_rate": 7.180818724195413e-05, "loss": 0.78424644, "memory(GiB)": 67.73, "step": 2825, "train_speed(iter/s)": 0.039567 }, { "acc": 0.76044025, "epoch": 1.9463548830811555, "grad_norm": 1.3120973110198975, "learning_rate": 7.170579891129872e-05, "loss": 0.83824387, "memory(GiB)": 67.73, "step": 2830, "train_speed(iter/s)": 0.039597 }, { "acc": 0.768398, "epoch": 1.9497936726272354, "grad_norm": 1.4003841876983643, "learning_rate": 7.160329831105207e-05, "loss": 0.81456871, "memory(GiB)": 67.73, "step": 2835, "train_speed(iter/s)": 0.039628 }, { "acc": 0.76318188, "epoch": 1.953232462173315, "grad_norm": 1.2559746503829956, "learning_rate": 7.150068597150343e-05, "loss": 0.81296453, "memory(GiB)": 67.73, "step": 2840, "train_speed(iter/s)": 0.039657 }, { "acc": 0.76650887, "epoch": 1.9566712517193947, "grad_norm": 1.4540189504623413, "learning_rate": 7.139796242352016e-05, "loss": 0.80663786, "memory(GiB)": 67.73, "step": 2845, "train_speed(iter/s)": 0.039685 }, { "acc": 0.75642557, "epoch": 1.9601100412654744, "grad_norm": 1.2288488149642944, "learning_rate": 7.129512819854492e-05, "loss": 0.85932999, "memory(GiB)": 67.73, "step": 2850, "train_speed(iter/s)": 0.039713 }, { "acc": 0.76574574, "epoch": 1.9635488308115543, "grad_norm": 1.0715101957321167, "learning_rate": 7.1192183828593e-05, "loss": 0.81189203, "memory(GiB)": 67.73, "step": 2855, "train_speed(iter/s)": 0.039744 }, { "acc": 0.76550779, "epoch": 1.9669876203576342, "grad_norm": 1.1467492580413818, "learning_rate": 7.108912984624951e-05, "loss": 0.816045, "memory(GiB)": 67.73, "step": 2860, "train_speed(iter/s)": 0.039771 }, { "acc": 0.77190948, "epoch": 1.9704264099037139, "grad_norm": 1.3903864622116089, "learning_rate": 7.098596678466663e-05, "loss": 0.79169202, "memory(GiB)": 67.73, "step": 2865, "train_speed(iter/s)": 0.039802 }, { "acc": 0.77034965, "epoch": 1.9738651994497936, "grad_norm": 1.4153941869735718, "learning_rate": 7.088269517756085e-05, "loss": 0.8023941, "memory(GiB)": 67.73, "step": 2870, "train_speed(iter/s)": 0.039833 }, { "acc": 0.7611412, "epoch": 1.9773039889958735, "grad_norm": 1.420799732208252, "learning_rate": 7.07793155592103e-05, "loss": 0.82977829, "memory(GiB)": 67.73, "step": 2875, "train_speed(iter/s)": 0.039863 }, { "acc": 0.7672267, "epoch": 1.9807427785419534, "grad_norm": 1.3404732942581177, "learning_rate": 7.06758284644518e-05, "loss": 0.82698822, "memory(GiB)": 67.73, "step": 2880, "train_speed(iter/s)": 0.039895 }, { "acc": 0.7625721, "epoch": 1.984181568088033, "grad_norm": 1.215389370918274, "learning_rate": 7.057223442867828e-05, "loss": 0.81949692, "memory(GiB)": 67.73, "step": 2885, "train_speed(iter/s)": 0.039923 }, { "acc": 0.76380196, "epoch": 1.9876203576341127, "grad_norm": 1.1068233251571655, "learning_rate": 7.046853398783595e-05, "loss": 0.82260599, "memory(GiB)": 67.73, "step": 2890, "train_speed(iter/s)": 0.039951 }, { "acc": 0.77088547, "epoch": 1.9910591471801926, "grad_norm": 1.2111361026763916, "learning_rate": 7.03647276784214e-05, "loss": 0.79111667, "memory(GiB)": 67.73, "step": 2895, "train_speed(iter/s)": 0.039979 }, { "acc": 0.77545271, "epoch": 1.9944979367262725, "grad_norm": 1.5610926151275635, "learning_rate": 7.026081603747905e-05, "loss": 0.78897448, "memory(GiB)": 67.73, "step": 2900, "train_speed(iter/s)": 0.040011 }, { "epoch": 1.9944979367262725, "eval_acc": 0.7610057587618662, "eval_loss": 0.8448835015296936, "eval_runtime": 1173.7451, "eval_samples_per_second": 3.649, "eval_steps_per_second": 0.066, "step": 2900 }, { "acc": 0.76741076, "epoch": 1.9979367262723522, "grad_norm": 1.279598593711853, "learning_rate": 7.01567996025982e-05, "loss": 0.81564512, "memory(GiB)": 67.73, "step": 2905, "train_speed(iter/s)": 0.039405 }, { "acc": 0.76975098, "epoch": 2.001375515818432, "grad_norm": 1.2036709785461426, "learning_rate": 7.00526789119103e-05, "loss": 0.80328093, "memory(GiB)": 67.73, "step": 2910, "train_speed(iter/s)": 0.039417 }, { "acc": 0.77956657, "epoch": 2.0048143053645116, "grad_norm": 1.2451400756835938, "learning_rate": 6.994845450408618e-05, "loss": 0.77778225, "memory(GiB)": 67.73, "step": 2915, "train_speed(iter/s)": 0.039446 }, { "acc": 0.77100277, "epoch": 2.0082530949105917, "grad_norm": 1.501703143119812, "learning_rate": 6.984412691833326e-05, "loss": 0.78023448, "memory(GiB)": 67.73, "step": 2920, "train_speed(iter/s)": 0.039476 }, { "acc": 0.78834424, "epoch": 2.0116918844566714, "grad_norm": 1.3272697925567627, "learning_rate": 6.973969669439275e-05, "loss": 0.72417383, "memory(GiB)": 67.73, "step": 2925, "train_speed(iter/s)": 0.039503 }, { "acc": 0.78664284, "epoch": 2.015130674002751, "grad_norm": 1.2188291549682617, "learning_rate": 6.963516437253684e-05, "loss": 0.72837029, "memory(GiB)": 67.73, "step": 2930, "train_speed(iter/s)": 0.039531 }, { "acc": 0.78195653, "epoch": 2.0185694635488307, "grad_norm": 1.386407494544983, "learning_rate": 6.953053049356597e-05, "loss": 0.74600391, "memory(GiB)": 67.73, "step": 2935, "train_speed(iter/s)": 0.039557 }, { "acc": 0.77869625, "epoch": 2.0220082530949104, "grad_norm": 1.3403911590576172, "learning_rate": 6.942579559880596e-05, "loss": 0.75640421, "memory(GiB)": 67.73, "step": 2940, "train_speed(iter/s)": 0.039588 }, { "acc": 0.78800364, "epoch": 2.0254470426409905, "grad_norm": 1.2047083377838135, "learning_rate": 6.932096023010522e-05, "loss": 0.73669438, "memory(GiB)": 67.73, "step": 2945, "train_speed(iter/s)": 0.039621 }, { "acc": 0.79104028, "epoch": 2.02888583218707, "grad_norm": 1.208552360534668, "learning_rate": 6.921602492983198e-05, "loss": 0.71291051, "memory(GiB)": 67.73, "step": 2950, "train_speed(iter/s)": 0.039651 }, { "acc": 0.77535133, "epoch": 2.03232462173315, "grad_norm": 1.5250719785690308, "learning_rate": 6.911099024087153e-05, "loss": 0.76551533, "memory(GiB)": 67.73, "step": 2955, "train_speed(iter/s)": 0.039681 }, { "acc": 0.76844397, "epoch": 2.0357634112792296, "grad_norm": 1.2615329027175903, "learning_rate": 6.900585670662321e-05, "loss": 0.79221487, "memory(GiB)": 67.73, "step": 2960, "train_speed(iter/s)": 0.039706 }, { "acc": 0.7686276, "epoch": 2.0392022008253097, "grad_norm": 1.3869153261184692, "learning_rate": 6.890062487099788e-05, "loss": 0.79985504, "memory(GiB)": 67.73, "step": 2965, "train_speed(iter/s)": 0.039733 }, { "acc": 0.79315829, "epoch": 2.0426409903713894, "grad_norm": 1.3412541151046753, "learning_rate": 6.87952952784149e-05, "loss": 0.70044346, "memory(GiB)": 67.73, "step": 2970, "train_speed(iter/s)": 0.039764 }, { "acc": 0.76822557, "epoch": 2.046079779917469, "grad_norm": 1.4066526889801025, "learning_rate": 6.868986847379934e-05, "loss": 0.79726977, "memory(GiB)": 67.73, "step": 2975, "train_speed(iter/s)": 0.03979 }, { "acc": 0.77974381, "epoch": 2.0495185694635487, "grad_norm": 1.2747722864151, "learning_rate": 6.858434500257929e-05, "loss": 0.75174856, "memory(GiB)": 67.73, "step": 2980, "train_speed(iter/s)": 0.039819 }, { "acc": 0.7757237, "epoch": 2.0529573590096284, "grad_norm": 1.3321025371551514, "learning_rate": 6.847872541068289e-05, "loss": 0.76223741, "memory(GiB)": 67.73, "step": 2985, "train_speed(iter/s)": 0.039845 }, { "acc": 0.78108168, "epoch": 2.0563961485557085, "grad_norm": 1.3092007637023926, "learning_rate": 6.837301024453556e-05, "loss": 0.75712924, "memory(GiB)": 67.73, "step": 2990, "train_speed(iter/s)": 0.03987 }, { "acc": 0.78598289, "epoch": 2.059834938101788, "grad_norm": 1.305657982826233, "learning_rate": 6.826720005105718e-05, "loss": 0.73581972, "memory(GiB)": 67.73, "step": 2995, "train_speed(iter/s)": 0.039898 }, { "acc": 0.78907838, "epoch": 2.063273727647868, "grad_norm": 1.4367668628692627, "learning_rate": 6.816129537765928e-05, "loss": 0.72936554, "memory(GiB)": 67.73, "step": 3000, "train_speed(iter/s)": 0.039927 }, { "epoch": 2.063273727647868, "eval_acc": 0.7620799028208936, "eval_loss": 0.8483734726905823, "eval_runtime": 1164.2299, "eval_samples_per_second": 3.679, "eval_steps_per_second": 0.066, "step": 3000 }, { "acc": 0.78301954, "epoch": 2.0667125171939476, "grad_norm": 1.5559133291244507, "learning_rate": 6.805529677224216e-05, "loss": 0.7379096, "memory(GiB)": 67.73, "step": 3005, "train_speed(iter/s)": 0.039348 }, { "acc": 0.77635798, "epoch": 2.0701513067400277, "grad_norm": 1.266300916671753, "learning_rate": 6.79492047831921e-05, "loss": 0.74843826, "memory(GiB)": 67.73, "step": 3010, "train_speed(iter/s)": 0.039374 }, { "acc": 0.78403974, "epoch": 2.0735900962861074, "grad_norm": 1.3442883491516113, "learning_rate": 6.784301995937846e-05, "loss": 0.73924718, "memory(GiB)": 67.73, "step": 3015, "train_speed(iter/s)": 0.039405 }, { "acc": 0.7821476, "epoch": 2.077028885832187, "grad_norm": 1.3284059762954712, "learning_rate": 6.773674285015092e-05, "loss": 0.731111, "memory(GiB)": 67.73, "step": 3020, "train_speed(iter/s)": 0.039435 }, { "acc": 0.78100576, "epoch": 2.0804676753782667, "grad_norm": 1.2452079057693481, "learning_rate": 6.76303740053366e-05, "loss": 0.75636292, "memory(GiB)": 67.73, "step": 3025, "train_speed(iter/s)": 0.039465 }, { "acc": 0.77952466, "epoch": 2.083906464924347, "grad_norm": 1.5737926959991455, "learning_rate": 6.752391397523725e-05, "loss": 0.75301075, "memory(GiB)": 67.73, "step": 3030, "train_speed(iter/s)": 0.039494 }, { "acc": 0.78698683, "epoch": 2.0873452544704265, "grad_norm": 1.337703824043274, "learning_rate": 6.741736331062626e-05, "loss": 0.73099127, "memory(GiB)": 67.73, "step": 3035, "train_speed(iter/s)": 0.039519 }, { "acc": 0.79346113, "epoch": 2.090784044016506, "grad_norm": 1.203200340270996, "learning_rate": 6.731072256274604e-05, "loss": 0.70464416, "memory(GiB)": 67.73, "step": 3040, "train_speed(iter/s)": 0.039547 }, { "acc": 0.78062749, "epoch": 2.094222833562586, "grad_norm": 1.5236440896987915, "learning_rate": 6.720399228330494e-05, "loss": 0.75513244, "memory(GiB)": 67.73, "step": 3045, "train_speed(iter/s)": 0.039576 }, { "acc": 0.78337817, "epoch": 2.0976616231086656, "grad_norm": 1.533868670463562, "learning_rate": 6.709717302447462e-05, "loss": 0.73356051, "memory(GiB)": 67.73, "step": 3050, "train_speed(iter/s)": 0.039604 }, { "acc": 0.7775434, "epoch": 2.1011004126547457, "grad_norm": 1.5052993297576904, "learning_rate": 6.699026533888696e-05, "loss": 0.75711803, "memory(GiB)": 67.73, "step": 3055, "train_speed(iter/s)": 0.039632 }, { "acc": 0.76806664, "epoch": 2.1045392022008254, "grad_norm": 1.5001362562179565, "learning_rate": 6.688326977963142e-05, "loss": 0.78131714, "memory(GiB)": 67.73, "step": 3060, "train_speed(iter/s)": 0.039658 }, { "acc": 0.76824741, "epoch": 2.107977991746905, "grad_norm": 1.422959327697754, "learning_rate": 6.677618690025201e-05, "loss": 0.79324121, "memory(GiB)": 67.73, "step": 3065, "train_speed(iter/s)": 0.039682 }, { "acc": 0.79779997, "epoch": 2.1114167812929847, "grad_norm": 1.4451581239700317, "learning_rate": 6.666901725474453e-05, "loss": 0.69419732, "memory(GiB)": 67.73, "step": 3070, "train_speed(iter/s)": 0.039716 }, { "acc": 0.77174778, "epoch": 2.114855570839065, "grad_norm": 1.47465980052948, "learning_rate": 6.656176139755361e-05, "loss": 0.79069195, "memory(GiB)": 67.73, "step": 3075, "train_speed(iter/s)": 0.039741 }, { "acc": 0.77605443, "epoch": 2.1182943603851445, "grad_norm": 1.3267581462860107, "learning_rate": 6.645441988356998e-05, "loss": 0.74461274, "memory(GiB)": 67.73, "step": 3080, "train_speed(iter/s)": 0.03977 }, { "acc": 0.77349963, "epoch": 2.121733149931224, "grad_norm": 1.391455054283142, "learning_rate": 6.634699326812746e-05, "loss": 0.77960453, "memory(GiB)": 67.73, "step": 3085, "train_speed(iter/s)": 0.039803 }, { "acc": 0.79598751, "epoch": 2.125171939477304, "grad_norm": 1.2205495834350586, "learning_rate": 6.623948210700017e-05, "loss": 0.70987749, "memory(GiB)": 67.73, "step": 3090, "train_speed(iter/s)": 0.039834 }, { "acc": 0.77750764, "epoch": 2.128610729023384, "grad_norm": 1.4609593152999878, "learning_rate": 6.613188695639961e-05, "loss": 0.75586929, "memory(GiB)": 67.73, "step": 3095, "train_speed(iter/s)": 0.039862 }, { "acc": 0.78546953, "epoch": 2.1320495185694637, "grad_norm": 1.264543890953064, "learning_rate": 6.602420837297181e-05, "loss": 0.71513643, "memory(GiB)": 67.73, "step": 3100, "train_speed(iter/s)": 0.039888 }, { "epoch": 2.1320495185694637, "eval_acc": 0.7627378863544338, "eval_loss": 0.8477216958999634, "eval_runtime": 1140.3429, "eval_samples_per_second": 3.756, "eval_steps_per_second": 0.068, "step": 3100 }, { "acc": 0.79360695, "epoch": 2.1354883081155434, "grad_norm": 1.446578025817871, "learning_rate": 6.591644691379445e-05, "loss": 0.69685826, "memory(GiB)": 67.73, "step": 3105, "train_speed(iter/s)": 0.039338 }, { "acc": 0.78709059, "epoch": 2.138927097661623, "grad_norm": 1.3313159942626953, "learning_rate": 6.580860313637395e-05, "loss": 0.71339779, "memory(GiB)": 67.73, "step": 3110, "train_speed(iter/s)": 0.039367 }, { "acc": 0.77740135, "epoch": 2.1423658872077027, "grad_norm": 1.2659940719604492, "learning_rate": 6.57006775986426e-05, "loss": 0.77046852, "memory(GiB)": 67.73, "step": 3115, "train_speed(iter/s)": 0.039389 }, { "acc": 0.77650619, "epoch": 2.145804676753783, "grad_norm": 1.4188055992126465, "learning_rate": 6.559267085895575e-05, "loss": 0.77363644, "memory(GiB)": 67.73, "step": 3120, "train_speed(iter/s)": 0.039413 }, { "acc": 0.7744916, "epoch": 2.1492434662998625, "grad_norm": 1.5709620714187622, "learning_rate": 6.548458347608877e-05, "loss": 0.78516483, "memory(GiB)": 67.73, "step": 3125, "train_speed(iter/s)": 0.039441 }, { "acc": 0.79064846, "epoch": 2.152682255845942, "grad_norm": 1.4528831243515015, "learning_rate": 6.537641600923424e-05, "loss": 0.72166934, "memory(GiB)": 67.73, "step": 3130, "train_speed(iter/s)": 0.039473 }, { "acc": 0.78410482, "epoch": 2.156121045392022, "grad_norm": 1.2021089792251587, "learning_rate": 6.52681690179991e-05, "loss": 0.72740555, "memory(GiB)": 67.73, "step": 3135, "train_speed(iter/s)": 0.039502 }, { "acc": 0.78779097, "epoch": 2.1595598349381016, "grad_norm": 1.3785122632980347, "learning_rate": 6.515984306240171e-05, "loss": 0.7183506, "memory(GiB)": 67.73, "step": 3140, "train_speed(iter/s)": 0.039532 }, { "acc": 0.78875618, "epoch": 2.1629986244841817, "grad_norm": 1.4517391920089722, "learning_rate": 6.505143870286892e-05, "loss": 0.72756548, "memory(GiB)": 67.73, "step": 3145, "train_speed(iter/s)": 0.039559 }, { "acc": 0.77699008, "epoch": 2.1664374140302614, "grad_norm": 1.3971807956695557, "learning_rate": 6.494295650023326e-05, "loss": 0.76216259, "memory(GiB)": 67.73, "step": 3150, "train_speed(iter/s)": 0.039584 }, { "acc": 0.78072176, "epoch": 2.169876203576341, "grad_norm": 1.3789912462234497, "learning_rate": 6.483439701572987e-05, "loss": 0.74354634, "memory(GiB)": 67.73, "step": 3155, "train_speed(iter/s)": 0.039607 }, { "acc": 0.78208303, "epoch": 2.1733149931224207, "grad_norm": 1.4556952714920044, "learning_rate": 6.472576081099384e-05, "loss": 0.74753394, "memory(GiB)": 67.73, "step": 3160, "train_speed(iter/s)": 0.03963 }, { "acc": 0.77552352, "epoch": 2.176753782668501, "grad_norm": 1.3656238317489624, "learning_rate": 6.461704844805711e-05, "loss": 0.76849761, "memory(GiB)": 67.73, "step": 3165, "train_speed(iter/s)": 0.039651 }, { "acc": 0.79463406, "epoch": 2.1801925722145805, "grad_norm": 1.6309324502944946, "learning_rate": 6.450826048934564e-05, "loss": 0.71653328, "memory(GiB)": 67.73, "step": 3170, "train_speed(iter/s)": 0.03968 }, { "acc": 0.78766985, "epoch": 2.18363136176066, "grad_norm": 1.5563981533050537, "learning_rate": 6.439939749767646e-05, "loss": 0.72953587, "memory(GiB)": 67.73, "step": 3175, "train_speed(iter/s)": 0.039706 }, { "acc": 0.78887863, "epoch": 2.18707015130674, "grad_norm": 1.4095101356506348, "learning_rate": 6.429046003625481e-05, "loss": 0.71779909, "memory(GiB)": 67.73, "step": 3180, "train_speed(iter/s)": 0.039734 }, { "acc": 0.78688583, "epoch": 2.19050894085282, "grad_norm": 1.3465501070022583, "learning_rate": 6.418144866867121e-05, "loss": 0.73801022, "memory(GiB)": 67.73, "step": 3185, "train_speed(iter/s)": 0.03976 }, { "acc": 0.78006182, "epoch": 2.1939477303988997, "grad_norm": 1.4523009061813354, "learning_rate": 6.407236395889853e-05, "loss": 0.75504117, "memory(GiB)": 67.73, "step": 3190, "train_speed(iter/s)": 0.039787 }, { "acc": 0.77997456, "epoch": 2.1973865199449794, "grad_norm": 1.4462857246398926, "learning_rate": 6.396320647128904e-05, "loss": 0.7402379, "memory(GiB)": 67.73, "step": 3195, "train_speed(iter/s)": 0.039814 }, { "acc": 0.78546247, "epoch": 2.200825309491059, "grad_norm": 1.3453855514526367, "learning_rate": 6.385397677057158e-05, "loss": 0.74274435, "memory(GiB)": 67.73, "step": 3200, "train_speed(iter/s)": 0.039843 }, { "epoch": 2.200825309491059, "eval_acc": 0.7633396319791245, "eval_loss": 0.8417202234268188, "eval_runtime": 1162.9585, "eval_samples_per_second": 3.683, "eval_steps_per_second": 0.066, "step": 3200 }, { "acc": 0.7864996, "epoch": 2.2042640990371387, "grad_norm": 1.5509644746780396, "learning_rate": 6.374467542184858e-05, "loss": 0.7381216, "memory(GiB)": 67.73, "step": 3205, "train_speed(iter/s)": 0.0393 }, { "acc": 0.7754878, "epoch": 2.207702888583219, "grad_norm": 1.4426201581954956, "learning_rate": 6.363530299059309e-05, "loss": 0.76541862, "memory(GiB)": 67.73, "step": 3210, "train_speed(iter/s)": 0.039326 }, { "acc": 0.78851786, "epoch": 2.2111416781292985, "grad_norm": 1.4319977760314941, "learning_rate": 6.352586004264595e-05, "loss": 0.71185198, "memory(GiB)": 67.73, "step": 3215, "train_speed(iter/s)": 0.039353 }, { "acc": 0.78093901, "epoch": 2.214580467675378, "grad_norm": 1.4418880939483643, "learning_rate": 6.341634714421283e-05, "loss": 0.73851643, "memory(GiB)": 67.73, "step": 3220, "train_speed(iter/s)": 0.039379 }, { "acc": 0.78669729, "epoch": 2.218019257221458, "grad_norm": 1.270719289779663, "learning_rate": 6.330676486186129e-05, "loss": 0.71485491, "memory(GiB)": 67.73, "step": 3225, "train_speed(iter/s)": 0.039408 }, { "acc": 0.79144497, "epoch": 2.221458046767538, "grad_norm": 1.4314090013504028, "learning_rate": 6.31971137625178e-05, "loss": 0.7153636, "memory(GiB)": 67.73, "step": 3230, "train_speed(iter/s)": 0.039435 }, { "acc": 0.77727938, "epoch": 2.2248968363136177, "grad_norm": 1.4725736379623413, "learning_rate": 6.308739441346485e-05, "loss": 0.76273413, "memory(GiB)": 67.73, "step": 3235, "train_speed(iter/s)": 0.039462 }, { "acc": 0.78571582, "epoch": 2.2283356258596974, "grad_norm": 1.512195348739624, "learning_rate": 6.297760738233815e-05, "loss": 0.72405062, "memory(GiB)": 67.73, "step": 3240, "train_speed(iter/s)": 0.039488 }, { "acc": 0.79117641, "epoch": 2.231774415405777, "grad_norm": 1.3315008878707886, "learning_rate": 6.286775323712345e-05, "loss": 0.69954386, "memory(GiB)": 67.73, "step": 3245, "train_speed(iter/s)": 0.039513 }, { "acc": 0.7794364, "epoch": 2.235213204951857, "grad_norm": 1.3924676179885864, "learning_rate": 6.275783254615373e-05, "loss": 0.73521681, "memory(GiB)": 67.73, "step": 3250, "train_speed(iter/s)": 0.039537 }, { "acc": 0.77737999, "epoch": 2.238651994497937, "grad_norm": 1.2619364261627197, "learning_rate": 6.264784587810623e-05, "loss": 0.74972701, "memory(GiB)": 67.73, "step": 3255, "train_speed(iter/s)": 0.039565 }, { "acc": 0.7738802, "epoch": 2.2420907840440165, "grad_norm": 1.5401955842971802, "learning_rate": 6.25377938019996e-05, "loss": 0.79901037, "memory(GiB)": 67.73, "step": 3260, "train_speed(iter/s)": 0.039591 }, { "acc": 0.78558297, "epoch": 2.245529573590096, "grad_norm": 1.2368144989013672, "learning_rate": 6.24276768871908e-05, "loss": 0.72351027, "memory(GiB)": 67.73, "step": 3265, "train_speed(iter/s)": 0.039617 }, { "acc": 0.78128538, "epoch": 2.248968363136176, "grad_norm": 1.4421414136886597, "learning_rate": 6.23174957033723e-05, "loss": 0.72427325, "memory(GiB)": 67.73, "step": 3270, "train_speed(iter/s)": 0.039638 }, { "acc": 0.79050694, "epoch": 2.252407152682256, "grad_norm": 1.473962426185608, "learning_rate": 6.220725082056901e-05, "loss": 0.70825963, "memory(GiB)": 67.73, "step": 3275, "train_speed(iter/s)": 0.039665 }, { "acc": 0.78690662, "epoch": 2.2558459422283357, "grad_norm": 1.4135388135910034, "learning_rate": 6.209694280913539e-05, "loss": 0.71959724, "memory(GiB)": 67.73, "step": 3280, "train_speed(iter/s)": 0.039691 }, { "acc": 0.78945398, "epoch": 2.2592847317744154, "grad_norm": 1.604658842086792, "learning_rate": 6.198657223975257e-05, "loss": 0.71993084, "memory(GiB)": 67.73, "step": 3285, "train_speed(iter/s)": 0.039719 }, { "acc": 0.78186049, "epoch": 2.262723521320495, "grad_norm": 1.5232980251312256, "learning_rate": 6.187613968342524e-05, "loss": 0.75053821, "memory(GiB)": 67.73, "step": 3290, "train_speed(iter/s)": 0.039741 }, { "acc": 0.78012853, "epoch": 2.266162310866575, "grad_norm": 1.2351890802383423, "learning_rate": 6.176564571147882e-05, "loss": 0.76610746, "memory(GiB)": 67.73, "step": 3295, "train_speed(iter/s)": 0.039772 }, { "acc": 0.79100294, "epoch": 2.269601100412655, "grad_norm": 1.2078155279159546, "learning_rate": 6.165509089555647e-05, "loss": 0.69998646, "memory(GiB)": 67.73, "step": 3300, "train_speed(iter/s)": 0.039798 }, { "epoch": 2.269601100412655, "eval_acc": 0.7635139694965583, "eval_loss": 0.8397796750068665, "eval_runtime": 1111.351, "eval_samples_per_second": 3.854, "eval_steps_per_second": 0.069, "step": 3300 }, { "acc": 0.78393035, "epoch": 2.2730398899587345, "grad_norm": 1.453913688659668, "learning_rate": 6.154447580761612e-05, "loss": 0.72859416, "memory(GiB)": 67.73, "step": 3305, "train_speed(iter/s)": 0.039299 }, { "acc": 0.78495998, "epoch": 2.276478679504814, "grad_norm": 1.4776540994644165, "learning_rate": 6.143380101992752e-05, "loss": 0.7363111, "memory(GiB)": 67.73, "step": 3310, "train_speed(iter/s)": 0.039328 }, { "acc": 0.78339643, "epoch": 2.2799174690508943, "grad_norm": 1.5362030267715454, "learning_rate": 6.132306710506926e-05, "loss": 0.7379076, "memory(GiB)": 67.73, "step": 3315, "train_speed(iter/s)": 0.039357 }, { "acc": 0.7797461, "epoch": 2.283356258596974, "grad_norm": 1.5111163854599, "learning_rate": 6.121227463592585e-05, "loss": 0.76058264, "memory(GiB)": 67.73, "step": 3320, "train_speed(iter/s)": 0.039381 }, { "acc": 0.77675905, "epoch": 2.2867950481430537, "grad_norm": 1.4410961866378784, "learning_rate": 6.11014241856847e-05, "loss": 0.7604672, "memory(GiB)": 67.73, "step": 3325, "train_speed(iter/s)": 0.039401 }, { "acc": 0.78640685, "epoch": 2.2902338376891334, "grad_norm": 1.7038127183914185, "learning_rate": 6.099051632783327e-05, "loss": 0.72746906, "memory(GiB)": 67.73, "step": 3330, "train_speed(iter/s)": 0.039426 }, { "acc": 0.78558645, "epoch": 2.293672627235213, "grad_norm": 1.2854883670806885, "learning_rate": 6.0879551636155894e-05, "loss": 0.74064126, "memory(GiB)": 67.73, "step": 3335, "train_speed(iter/s)": 0.039449 }, { "acc": 0.781954, "epoch": 2.297111416781293, "grad_norm": 1.7075494527816772, "learning_rate": 6.076853068473104e-05, "loss": 0.73064299, "memory(GiB)": 67.73, "step": 3340, "train_speed(iter/s)": 0.039476 }, { "acc": 0.7864274, "epoch": 2.300550206327373, "grad_norm": 1.3785117864608765, "learning_rate": 6.065745404792816e-05, "loss": 0.72167091, "memory(GiB)": 67.73, "step": 3345, "train_speed(iter/s)": 0.039497 }, { "acc": 0.77905812, "epoch": 2.3039889958734525, "grad_norm": 1.2673250436782837, "learning_rate": 6.054632230040489e-05, "loss": 0.75723281, "memory(GiB)": 67.73, "step": 3350, "train_speed(iter/s)": 0.039522 }, { "acc": 0.78323727, "epoch": 2.307427785419532, "grad_norm": 1.5396391153335571, "learning_rate": 6.043513601710391e-05, "loss": 0.75284595, "memory(GiB)": 67.73, "step": 3355, "train_speed(iter/s)": 0.039546 }, { "acc": 0.77735343, "epoch": 2.310866574965612, "grad_norm": 1.4035242795944214, "learning_rate": 6.032389577325004e-05, "loss": 0.76564093, "memory(GiB)": 67.73, "step": 3360, "train_speed(iter/s)": 0.039572 }, { "acc": 0.78433137, "epoch": 2.314305364511692, "grad_norm": 1.3705512285232544, "learning_rate": 6.0212602144347295e-05, "loss": 0.74389467, "memory(GiB)": 67.73, "step": 3365, "train_speed(iter/s)": 0.039597 }, { "acc": 0.7952363, "epoch": 2.3177441540577717, "grad_norm": 1.601040005683899, "learning_rate": 6.010125570617587e-05, "loss": 0.69709463, "memory(GiB)": 67.73, "step": 3370, "train_speed(iter/s)": 0.039624 }, { "acc": 0.78479719, "epoch": 2.3211829436038514, "grad_norm": 1.6512783765792847, "learning_rate": 5.998985703478916e-05, "loss": 0.7351841, "memory(GiB)": 67.73, "step": 3375, "train_speed(iter/s)": 0.039648 }, { "acc": 0.7886528, "epoch": 2.324621733149931, "grad_norm": 1.4211089611053467, "learning_rate": 5.9878406706510834e-05, "loss": 0.72703929, "memory(GiB)": 67.73, "step": 3380, "train_speed(iter/s)": 0.039672 }, { "acc": 0.79997978, "epoch": 2.328060522696011, "grad_norm": 1.5382604598999023, "learning_rate": 5.976690529793175e-05, "loss": 0.68571553, "memory(GiB)": 67.73, "step": 3385, "train_speed(iter/s)": 0.039698 }, { "acc": 0.78704443, "epoch": 2.331499312242091, "grad_norm": 1.4559470415115356, "learning_rate": 5.9655353385907055e-05, "loss": 0.7418499, "memory(GiB)": 67.73, "step": 3390, "train_speed(iter/s)": 0.039721 }, { "acc": 0.77985802, "epoch": 2.3349381017881705, "grad_norm": 1.4432960748672485, "learning_rate": 5.954375154755316e-05, "loss": 0.75312023, "memory(GiB)": 67.73, "step": 3395, "train_speed(iter/s)": 0.039747 }, { "acc": 0.79101877, "epoch": 2.33837689133425, "grad_norm": 1.3332661390304565, "learning_rate": 5.9432100360244816e-05, "loss": 0.71166148, "memory(GiB)": 67.73, "step": 3400, "train_speed(iter/s)": 0.039772 }, { "epoch": 2.33837689133425, "eval_acc": 0.7662640032393035, "eval_loss": 0.8363276124000549, "eval_runtime": 1137.5857, "eval_samples_per_second": 3.765, "eval_steps_per_second": 0.068, "step": 3400 }, { "acc": 0.77607446, "epoch": 2.3418156808803303, "grad_norm": 1.4800801277160645, "learning_rate": 5.9320400401612055e-05, "loss": 0.75290956, "memory(GiB)": 67.73, "step": 3405, "train_speed(iter/s)": 0.039276 }, { "acc": 0.79126697, "epoch": 2.34525447042641, "grad_norm": 1.4802907705307007, "learning_rate": 5.9208652249537224e-05, "loss": 0.71252222, "memory(GiB)": 67.73, "step": 3410, "train_speed(iter/s)": 0.039305 }, { "acc": 0.79908376, "epoch": 2.3486932599724897, "grad_norm": 1.5732394456863403, "learning_rate": 5.909685648215198e-05, "loss": 0.68212581, "memory(GiB)": 67.73, "step": 3415, "train_speed(iter/s)": 0.039334 }, { "acc": 0.79249792, "epoch": 2.3521320495185694, "grad_norm": 1.5170937776565552, "learning_rate": 5.8985013677834376e-05, "loss": 0.69914331, "memory(GiB)": 67.73, "step": 3420, "train_speed(iter/s)": 0.039361 }, { "acc": 0.76918459, "epoch": 2.355570839064649, "grad_norm": 1.4346693754196167, "learning_rate": 5.887312441520577e-05, "loss": 0.7888124, "memory(GiB)": 67.73, "step": 3425, "train_speed(iter/s)": 0.039379 }, { "acc": 0.77460504, "epoch": 2.359009628610729, "grad_norm": 1.5379953384399414, "learning_rate": 5.876118927312788e-05, "loss": 0.77686357, "memory(GiB)": 67.73, "step": 3430, "train_speed(iter/s)": 0.039402 }, { "acc": 0.78930125, "epoch": 2.362448418156809, "grad_norm": 1.5620882511138916, "learning_rate": 5.8649208830699776e-05, "loss": 0.72846537, "memory(GiB)": 67.73, "step": 3435, "train_speed(iter/s)": 0.039427 }, { "acc": 0.7783771, "epoch": 2.3658872077028885, "grad_norm": 1.3905718326568604, "learning_rate": 5.85371836672549e-05, "loss": 0.74923038, "memory(GiB)": 67.73, "step": 3440, "train_speed(iter/s)": 0.039449 }, { "acc": 0.77820864, "epoch": 2.369325997248968, "grad_norm": 1.2664304971694946, "learning_rate": 5.842511436235807e-05, "loss": 0.77669792, "memory(GiB)": 67.73, "step": 3445, "train_speed(iter/s)": 0.039476 }, { "acc": 0.78637152, "epoch": 2.3727647867950483, "grad_norm": 1.3347089290618896, "learning_rate": 5.831300149580245e-05, "loss": 0.72091188, "memory(GiB)": 67.73, "step": 3450, "train_speed(iter/s)": 0.039501 }, { "acc": 0.77844844, "epoch": 2.376203576341128, "grad_norm": 1.4759833812713623, "learning_rate": 5.820084564760657e-05, "loss": 0.76853113, "memory(GiB)": 67.73, "step": 3455, "train_speed(iter/s)": 0.039528 }, { "acc": 0.7886765, "epoch": 2.3796423658872077, "grad_norm": 1.7147775888442993, "learning_rate": 5.808864739801136e-05, "loss": 0.73901868, "memory(GiB)": 67.73, "step": 3460, "train_speed(iter/s)": 0.039549 }, { "acc": 0.78849745, "epoch": 2.3830811554332874, "grad_norm": 1.4572715759277344, "learning_rate": 5.797640732747707e-05, "loss": 0.71194096, "memory(GiB)": 67.73, "step": 3465, "train_speed(iter/s)": 0.039573 }, { "acc": 0.79340534, "epoch": 2.3865199449793675, "grad_norm": 1.8968569040298462, "learning_rate": 5.7864126016680354e-05, "loss": 0.69921117, "memory(GiB)": 67.73, "step": 3470, "train_speed(iter/s)": 0.039599 }, { "acc": 0.7905076, "epoch": 2.389958734525447, "grad_norm": 1.6759856939315796, "learning_rate": 5.7751804046511185e-05, "loss": 0.71475925, "memory(GiB)": 67.73, "step": 3475, "train_speed(iter/s)": 0.039625 }, { "acc": 0.78112564, "epoch": 2.393397524071527, "grad_norm": 1.301619052886963, "learning_rate": 5.763944199806991e-05, "loss": 0.74565101, "memory(GiB)": 67.73, "step": 3480, "train_speed(iter/s)": 0.039651 }, { "acc": 0.77826796, "epoch": 2.3968363136176065, "grad_norm": 1.726420283317566, "learning_rate": 5.7527040452664215e-05, "loss": 0.76559715, "memory(GiB)": 67.73, "step": 3485, "train_speed(iter/s)": 0.039679 }, { "acc": 0.78291936, "epoch": 2.400275103163686, "grad_norm": 1.7139410972595215, "learning_rate": 5.741459999180612e-05, "loss": 0.74492068, "memory(GiB)": 67.73, "step": 3490, "train_speed(iter/s)": 0.039704 }, { "acc": 0.7825911, "epoch": 2.4037138927097663, "grad_norm": 1.6379047632217407, "learning_rate": 5.7302121197209e-05, "loss": 0.7245317, "memory(GiB)": 67.73, "step": 3495, "train_speed(iter/s)": 0.039726 }, { "acc": 0.78516874, "epoch": 2.407152682255846, "grad_norm": 1.6368329524993896, "learning_rate": 5.7189604650784524e-05, "loss": 0.71679149, "memory(GiB)": 67.73, "step": 3500, "train_speed(iter/s)": 0.039751 }, { "epoch": 2.407152682255846, "eval_acc": 0.7661683987942592, "eval_loss": 0.8319239020347595, "eval_runtime": 1159.9932, "eval_samples_per_second": 3.692, "eval_steps_per_second": 0.066, "step": 3500 }, { "acc": 0.77984338, "epoch": 2.4105914718019257, "grad_norm": 1.3497166633605957, "learning_rate": 5.707705093463969e-05, "loss": 0.76508818, "memory(GiB)": 67.73, "step": 3505, "train_speed(iter/s)": 0.039259 }, { "acc": 0.77530251, "epoch": 2.4140302613480054, "grad_norm": 1.4699667692184448, "learning_rate": 5.6964460631073736e-05, "loss": 0.7737175, "memory(GiB)": 67.73, "step": 3510, "train_speed(iter/s)": 0.039282 }, { "acc": 0.77435117, "epoch": 2.417469050894085, "grad_norm": 1.3567484617233276, "learning_rate": 5.685183432257529e-05, "loss": 0.76602321, "memory(GiB)": 67.73, "step": 3515, "train_speed(iter/s)": 0.039309 }, { "acc": 0.77687979, "epoch": 2.420907840440165, "grad_norm": 1.6473392248153687, "learning_rate": 5.6739172591819187e-05, "loss": 0.76530232, "memory(GiB)": 67.73, "step": 3520, "train_speed(iter/s)": 0.039334 }, { "acc": 0.78237972, "epoch": 2.424346629986245, "grad_norm": 1.5507371425628662, "learning_rate": 5.662647602166351e-05, "loss": 0.75080051, "memory(GiB)": 67.73, "step": 3525, "train_speed(iter/s)": 0.039356 }, { "acc": 0.78154564, "epoch": 2.4277854195323245, "grad_norm": 1.4991931915283203, "learning_rate": 5.65137451951466e-05, "loss": 0.75352793, "memory(GiB)": 67.73, "step": 3530, "train_speed(iter/s)": 0.039378 }, { "acc": 0.77997007, "epoch": 2.4312242090784046, "grad_norm": 1.6739842891693115, "learning_rate": 5.640098069548404e-05, "loss": 0.76081572, "memory(GiB)": 67.73, "step": 3535, "train_speed(iter/s)": 0.039401 }, { "acc": 0.78768792, "epoch": 2.4346629986244843, "grad_norm": 1.7248750925064087, "learning_rate": 5.628818310606561e-05, "loss": 0.7255187, "memory(GiB)": 67.73, "step": 3540, "train_speed(iter/s)": 0.039426 }, { "acc": 0.79025412, "epoch": 2.438101788170564, "grad_norm": 1.62557053565979, "learning_rate": 5.617535301045228e-05, "loss": 0.70803857, "memory(GiB)": 67.73, "step": 3545, "train_speed(iter/s)": 0.039454 }, { "acc": 0.78769302, "epoch": 2.4415405777166437, "grad_norm": 1.4635558128356934, "learning_rate": 5.606249099237318e-05, "loss": 0.73414497, "memory(GiB)": 67.73, "step": 3550, "train_speed(iter/s)": 0.039479 }, { "acc": 0.79148045, "epoch": 2.4449793672627234, "grad_norm": 1.3263139724731445, "learning_rate": 5.594959763572263e-05, "loss": 0.71763167, "memory(GiB)": 67.73, "step": 3555, "train_speed(iter/s)": 0.039504 }, { "acc": 0.79330978, "epoch": 2.4484181568088035, "grad_norm": 1.5712461471557617, "learning_rate": 5.5836673524557e-05, "loss": 0.71670427, "memory(GiB)": 67.73, "step": 3560, "train_speed(iter/s)": 0.039529 }, { "acc": 0.78728065, "epoch": 2.451856946354883, "grad_norm": 1.610227108001709, "learning_rate": 5.572371924309188e-05, "loss": 0.71835189, "memory(GiB)": 67.73, "step": 3565, "train_speed(iter/s)": 0.039555 }, { "acc": 0.7980279, "epoch": 2.455295735900963, "grad_norm": 1.3638176918029785, "learning_rate": 5.5610735375698863e-05, "loss": 0.68947468, "memory(GiB)": 67.73, "step": 3570, "train_speed(iter/s)": 0.039583 }, { "acc": 0.79382896, "epoch": 2.4587345254470425, "grad_norm": 1.671080470085144, "learning_rate": 5.549772250690264e-05, "loss": 0.6997715, "memory(GiB)": 67.73, "step": 3575, "train_speed(iter/s)": 0.039609 }, { "acc": 0.78277836, "epoch": 2.462173314993122, "grad_norm": 1.5335805416107178, "learning_rate": 5.538468122137791e-05, "loss": 0.75039816, "memory(GiB)": 67.73, "step": 3580, "train_speed(iter/s)": 0.039637 }, { "acc": 0.78852177, "epoch": 2.4656121045392023, "grad_norm": 1.5238432884216309, "learning_rate": 5.527161210394645e-05, "loss": 0.72055502, "memory(GiB)": 67.73, "step": 3585, "train_speed(iter/s)": 0.039663 }, { "acc": 0.77950158, "epoch": 2.469050894085282, "grad_norm": 1.3826063871383667, "learning_rate": 5.515851573957397e-05, "loss": 0.74620533, "memory(GiB)": 67.73, "step": 3590, "train_speed(iter/s)": 0.039684 }, { "acc": 0.78283319, "epoch": 2.4724896836313617, "grad_norm": 1.637787938117981, "learning_rate": 5.504539271336714e-05, "loss": 0.75348463, "memory(GiB)": 67.73, "step": 3595, "train_speed(iter/s)": 0.039709 }, { "acc": 0.79472337, "epoch": 2.4759284731774414, "grad_norm": 1.4630149602890015, "learning_rate": 5.493224361057062e-05, "loss": 0.70524812, "memory(GiB)": 67.73, "step": 3600, "train_speed(iter/s)": 0.039738 }, { "epoch": 2.4759284731774414, "eval_acc": 0.7665226976200117, "eval_loss": 0.8275927901268005, "eval_runtime": 1123.227, "eval_samples_per_second": 3.813, "eval_steps_per_second": 0.069, "step": 3600 }, { "acc": 0.79677072, "epoch": 2.4793672627235215, "grad_norm": 1.354331374168396, "learning_rate": 5.481906901656389e-05, "loss": 0.70224314, "memory(GiB)": 67.73, "step": 3605, "train_speed(iter/s)": 0.039277 }, { "acc": 0.78522711, "epoch": 2.482806052269601, "grad_norm": 1.4437576532363892, "learning_rate": 5.470586951685842e-05, "loss": 0.74459286, "memory(GiB)": 67.73, "step": 3610, "train_speed(iter/s)": 0.039302 }, { "acc": 0.79154515, "epoch": 2.486244841815681, "grad_norm": 1.358216643333435, "learning_rate": 5.4592645697094434e-05, "loss": 0.70617638, "memory(GiB)": 67.73, "step": 3615, "train_speed(iter/s)": 0.039323 }, { "acc": 0.78887815, "epoch": 2.4896836313617605, "grad_norm": 1.6288851499557495, "learning_rate": 5.447939814303803e-05, "loss": 0.72137556, "memory(GiB)": 67.73, "step": 3620, "train_speed(iter/s)": 0.039345 }, { "acc": 0.7799171, "epoch": 2.4931224209078406, "grad_norm": 1.4252561330795288, "learning_rate": 5.4366127440578063e-05, "loss": 0.75225086, "memory(GiB)": 67.73, "step": 3625, "train_speed(iter/s)": 0.039369 }, { "acc": 0.78901777, "epoch": 2.4965612104539203, "grad_norm": 1.3052763938903809, "learning_rate": 5.42528341757232e-05, "loss": 0.73022747, "memory(GiB)": 67.73, "step": 3630, "train_speed(iter/s)": 0.039392 }, { "acc": 0.77776222, "epoch": 2.5, "grad_norm": 1.3259241580963135, "learning_rate": 5.413951893459877e-05, "loss": 0.74716744, "memory(GiB)": 67.73, "step": 3635, "train_speed(iter/s)": 0.039413 }, { "acc": 0.79203482, "epoch": 2.5034387895460797, "grad_norm": 1.491448998451233, "learning_rate": 5.4026182303443826e-05, "loss": 0.71442933, "memory(GiB)": 67.73, "step": 3640, "train_speed(iter/s)": 0.039433 }, { "acc": 0.78454857, "epoch": 2.5068775790921594, "grad_norm": 1.6916753053665161, "learning_rate": 5.391282486860809e-05, "loss": 0.74134259, "memory(GiB)": 67.73, "step": 3645, "train_speed(iter/s)": 0.039458 }, { "acc": 0.78789535, "epoch": 2.5103163686382395, "grad_norm": 1.5004796981811523, "learning_rate": 5.3799447216548907e-05, "loss": 0.7244381, "memory(GiB)": 67.73, "step": 3650, "train_speed(iter/s)": 0.039483 }, { "acc": 0.80307121, "epoch": 2.513755158184319, "grad_norm": 1.3776211738586426, "learning_rate": 5.368604993382822e-05, "loss": 0.67283263, "memory(GiB)": 67.73, "step": 3655, "train_speed(iter/s)": 0.039509 }, { "acc": 0.78840837, "epoch": 2.517193947730399, "grad_norm": 1.489513635635376, "learning_rate": 5.357263360710951e-05, "loss": 0.73468142, "memory(GiB)": 67.73, "step": 3660, "train_speed(iter/s)": 0.039533 }, { "acc": 0.78454609, "epoch": 2.5206327372764785, "grad_norm": 1.5130376815795898, "learning_rate": 5.345919882315481e-05, "loss": 0.74815798, "memory(GiB)": 67.73, "step": 3665, "train_speed(iter/s)": 0.039558 }, { "acc": 0.77992659, "epoch": 2.524071526822558, "grad_norm": 1.5401512384414673, "learning_rate": 5.3345746168821634e-05, "loss": 0.74576526, "memory(GiB)": 67.73, "step": 3670, "train_speed(iter/s)": 0.039581 }, { "acc": 0.78538713, "epoch": 2.5275103163686383, "grad_norm": 1.6626590490341187, "learning_rate": 5.3232276231059905e-05, "loss": 0.72729344, "memory(GiB)": 67.73, "step": 3675, "train_speed(iter/s)": 0.039608 }, { "acc": 0.77599993, "epoch": 2.530949105914718, "grad_norm": 1.4081122875213623, "learning_rate": 5.311878959690906e-05, "loss": 0.76209216, "memory(GiB)": 67.73, "step": 3680, "train_speed(iter/s)": 0.039628 }, { "acc": 0.78826327, "epoch": 2.5343878954607977, "grad_norm": 1.474022626876831, "learning_rate": 5.3005286853494854e-05, "loss": 0.71333871, "memory(GiB)": 67.73, "step": 3685, "train_speed(iter/s)": 0.039652 }, { "acc": 0.78787079, "epoch": 2.537826685006878, "grad_norm": 1.648646354675293, "learning_rate": 5.289176858802634e-05, "loss": 0.72448759, "memory(GiB)": 67.73, "step": 3690, "train_speed(iter/s)": 0.039677 }, { "acc": 0.78479404, "epoch": 2.5412654745529575, "grad_norm": 1.4439847469329834, "learning_rate": 5.277823538779295e-05, "loss": 0.72407675, "memory(GiB)": 67.73, "step": 3695, "train_speed(iter/s)": 0.039702 }, { "acc": 0.77956858, "epoch": 2.544704264099037, "grad_norm": 1.2535481452941895, "learning_rate": 5.2664687840161364e-05, "loss": 0.74480648, "memory(GiB)": 67.73, "step": 3700, "train_speed(iter/s)": 0.039725 }, { "epoch": 2.544704264099037, "eval_acc": 0.7681029828586854, "eval_loss": 0.8239570260047913, "eval_runtime": 1156.7503, "eval_samples_per_second": 3.703, "eval_steps_per_second": 0.067, "step": 3700 }, { "acc": 0.78182096, "epoch": 2.548143053645117, "grad_norm": 1.328555703163147, "learning_rate": 5.255112653257247e-05, "loss": 0.75617981, "memory(GiB)": 67.73, "step": 3705, "train_speed(iter/s)": 0.039261 }, { "acc": 0.78516607, "epoch": 2.5515818431911965, "grad_norm": 1.5017790794372559, "learning_rate": 5.243755205253834e-05, "loss": 0.73223658, "memory(GiB)": 67.73, "step": 3710, "train_speed(iter/s)": 0.039286 }, { "acc": 0.78861194, "epoch": 2.5550206327372766, "grad_norm": 1.309441089630127, "learning_rate": 5.232396498763923e-05, "loss": 0.7213201, "memory(GiB)": 67.73, "step": 3715, "train_speed(iter/s)": 0.039308 }, { "acc": 0.78652673, "epoch": 2.5584594222833563, "grad_norm": 2.0742311477661133, "learning_rate": 5.2210365925520445e-05, "loss": 0.73911443, "memory(GiB)": 67.73, "step": 3720, "train_speed(iter/s)": 0.039335 }, { "acc": 0.78357706, "epoch": 2.561898211829436, "grad_norm": 1.4650071859359741, "learning_rate": 5.2096755453889404e-05, "loss": 0.74594064, "memory(GiB)": 67.73, "step": 3725, "train_speed(iter/s)": 0.039357 }, { "acc": 0.78125381, "epoch": 2.5653370013755157, "grad_norm": 1.7474429607391357, "learning_rate": 5.198313416051257e-05, "loss": 0.75290685, "memory(GiB)": 67.73, "step": 3730, "train_speed(iter/s)": 0.039381 }, { "acc": 0.77420011, "epoch": 2.5687757909215954, "grad_norm": 1.6091666221618652, "learning_rate": 5.186950263321233e-05, "loss": 0.79236693, "memory(GiB)": 67.73, "step": 3735, "train_speed(iter/s)": 0.039403 }, { "acc": 0.77931113, "epoch": 2.5722145804676755, "grad_norm": 1.9077335596084595, "learning_rate": 5.1755861459864064e-05, "loss": 0.74636703, "memory(GiB)": 67.73, "step": 3740, "train_speed(iter/s)": 0.039425 }, { "acc": 0.7796699, "epoch": 2.575653370013755, "grad_norm": 1.6318970918655396, "learning_rate": 5.164221122839306e-05, "loss": 0.76515536, "memory(GiB)": 67.73, "step": 3745, "train_speed(iter/s)": 0.039444 }, { "acc": 0.77925997, "epoch": 2.579092159559835, "grad_norm": 1.563817024230957, "learning_rate": 5.1528552526771425e-05, "loss": 0.74128981, "memory(GiB)": 67.73, "step": 3750, "train_speed(iter/s)": 0.039469 }, { "acc": 0.78557882, "epoch": 2.582530949105915, "grad_norm": 1.3365668058395386, "learning_rate": 5.141488594301512e-05, "loss": 0.72270107, "memory(GiB)": 67.73, "step": 3755, "train_speed(iter/s)": 0.039493 }, { "acc": 0.78748364, "epoch": 2.5859697386519946, "grad_norm": 1.33451247215271, "learning_rate": 5.1301212065180895e-05, "loss": 0.74060202, "memory(GiB)": 67.73, "step": 3760, "train_speed(iter/s)": 0.039516 }, { "acc": 0.79184856, "epoch": 2.5894085281980743, "grad_norm": 1.4293380975723267, "learning_rate": 5.118753148136318e-05, "loss": 0.7231204, "memory(GiB)": 67.73, "step": 3765, "train_speed(iter/s)": 0.039538 }, { "acc": 0.7917345, "epoch": 2.592847317744154, "grad_norm": 1.4640839099884033, "learning_rate": 5.107384477969117e-05, "loss": 0.72228947, "memory(GiB)": 67.73, "step": 3770, "train_speed(iter/s)": 0.039558 }, { "acc": 0.78101654, "epoch": 2.5962861072902337, "grad_norm": 1.5235430002212524, "learning_rate": 5.0960152548325676e-05, "loss": 0.75548983, "memory(GiB)": 67.73, "step": 3775, "train_speed(iter/s)": 0.039578 }, { "acc": 0.79459238, "epoch": 2.599724896836314, "grad_norm": 1.3068392276763916, "learning_rate": 5.08464553754561e-05, "loss": 0.70593162, "memory(GiB)": 67.73, "step": 3780, "train_speed(iter/s)": 0.039602 }, { "acc": 0.79126248, "epoch": 2.6031636863823935, "grad_norm": 1.3516395092010498, "learning_rate": 5.0732753849297434e-05, "loss": 0.72088032, "memory(GiB)": 67.73, "step": 3785, "train_speed(iter/s)": 0.039626 }, { "acc": 0.79167919, "epoch": 2.606602475928473, "grad_norm": 1.7003644704818726, "learning_rate": 5.06190485580872e-05, "loss": 0.68689594, "memory(GiB)": 67.73, "step": 3790, "train_speed(iter/s)": 0.039647 }, { "acc": 0.78057427, "epoch": 2.610041265474553, "grad_norm": 1.7799345254898071, "learning_rate": 5.0505340090082376e-05, "loss": 0.75313406, "memory(GiB)": 67.73, "step": 3795, "train_speed(iter/s)": 0.03967 }, { "acc": 0.78115511, "epoch": 2.6134800550206325, "grad_norm": 1.3012539148330688, "learning_rate": 5.039162903355639e-05, "loss": 0.75619287, "memory(GiB)": 67.73, "step": 3800, "train_speed(iter/s)": 0.039687 }, { "epoch": 2.6134800550206325, "eval_acc": 0.7681029828586854, "eval_loss": 0.8214000463485718, "eval_runtime": 1118.8883, "eval_samples_per_second": 3.828, "eval_steps_per_second": 0.069, "step": 3800 }, { "acc": 0.78926849, "epoch": 2.6169188445667126, "grad_norm": 1.3302139043807983, "learning_rate": 5.027791597679603e-05, "loss": 0.72202902, "memory(GiB)": 67.73, "step": 3805, "train_speed(iter/s)": 0.039252 }, { "acc": 0.788554, "epoch": 2.6203576341127923, "grad_norm": 1.3796292543411255, "learning_rate": 5.0164201508098486e-05, "loss": 0.73341327, "memory(GiB)": 67.73, "step": 3810, "train_speed(iter/s)": 0.039275 }, { "acc": 0.78986712, "epoch": 2.623796423658872, "grad_norm": 1.5008918046951294, "learning_rate": 5.00504862157682e-05, "loss": 0.70993729, "memory(GiB)": 67.73, "step": 3815, "train_speed(iter/s)": 0.0393 }, { "acc": 0.79516368, "epoch": 2.627235213204952, "grad_norm": 1.3220473527908325, "learning_rate": 4.9936770688113924e-05, "loss": 0.70671806, "memory(GiB)": 67.73, "step": 3820, "train_speed(iter/s)": 0.039326 }, { "acc": 0.77930651, "epoch": 2.6306740027510314, "grad_norm": 1.3324934244155884, "learning_rate": 4.982305551344558e-05, "loss": 0.76113019, "memory(GiB)": 67.73, "step": 3825, "train_speed(iter/s)": 0.039345 }, { "acc": 0.78931274, "epoch": 2.6341127922971115, "grad_norm": 1.561617374420166, "learning_rate": 4.970934128007131e-05, "loss": 0.73203354, "memory(GiB)": 67.73, "step": 3830, "train_speed(iter/s)": 0.039369 }, { "acc": 0.78196325, "epoch": 2.637551581843191, "grad_norm": 1.5396491289138794, "learning_rate": 4.959562857629432e-05, "loss": 0.74629278, "memory(GiB)": 67.73, "step": 3835, "train_speed(iter/s)": 0.039389 }, { "acc": 0.79152188, "epoch": 2.640990371389271, "grad_norm": 1.5757373571395874, "learning_rate": 4.948191799041e-05, "loss": 0.71405354, "memory(GiB)": 67.73, "step": 3840, "train_speed(iter/s)": 0.039408 }, { "acc": 0.78608985, "epoch": 2.644429160935351, "grad_norm": 1.47767174243927, "learning_rate": 4.936821011070271e-05, "loss": 0.72424574, "memory(GiB)": 67.73, "step": 3845, "train_speed(iter/s)": 0.039432 }, { "acc": 0.78729639, "epoch": 2.6478679504814306, "grad_norm": 1.2262262105941772, "learning_rate": 4.925450552544281e-05, "loss": 0.72731237, "memory(GiB)": 67.73, "step": 3850, "train_speed(iter/s)": 0.039453 }, { "acc": 0.78679304, "epoch": 2.6513067400275103, "grad_norm": 1.4017452001571655, "learning_rate": 4.914080482288365e-05, "loss": 0.71175966, "memory(GiB)": 67.73, "step": 3855, "train_speed(iter/s)": 0.039476 }, { "acc": 0.79027119, "epoch": 2.65474552957359, "grad_norm": 1.5579813718795776, "learning_rate": 4.902710859125846e-05, "loss": 0.71102552, "memory(GiB)": 67.98, "step": 3860, "train_speed(iter/s)": 0.039498 }, { "acc": 0.79366422, "epoch": 2.6581843191196697, "grad_norm": 1.3325603008270264, "learning_rate": 4.8913417418777377e-05, "loss": 0.69916854, "memory(GiB)": 67.98, "step": 3865, "train_speed(iter/s)": 0.039517 }, { "acc": 0.78973618, "epoch": 2.66162310866575, "grad_norm": 1.4464627504348755, "learning_rate": 4.879973189362433e-05, "loss": 0.72573528, "memory(GiB)": 67.98, "step": 3870, "train_speed(iter/s)": 0.039542 }, { "acc": 0.78937593, "epoch": 2.6650618982118295, "grad_norm": 1.4809215068817139, "learning_rate": 4.8686052603954065e-05, "loss": 0.72520885, "memory(GiB)": 67.98, "step": 3875, "train_speed(iter/s)": 0.039562 }, { "acc": 0.7916564, "epoch": 2.668500687757909, "grad_norm": 1.4060372114181519, "learning_rate": 4.857238013788902e-05, "loss": 0.71384468, "memory(GiB)": 67.98, "step": 3880, "train_speed(iter/s)": 0.039586 }, { "acc": 0.78981237, "epoch": 2.671939477303989, "grad_norm": 1.481585144996643, "learning_rate": 4.845871508351637e-05, "loss": 0.72426672, "memory(GiB)": 67.98, "step": 3885, "train_speed(iter/s)": 0.039608 }, { "acc": 0.79329553, "epoch": 2.6753782668500685, "grad_norm": 1.6132746934890747, "learning_rate": 4.834505802888493e-05, "loss": 0.70904198, "memory(GiB)": 67.98, "step": 3890, "train_speed(iter/s)": 0.039631 }, { "acc": 0.78727617, "epoch": 2.6788170563961486, "grad_norm": 1.845495343208313, "learning_rate": 4.8231409562002164e-05, "loss": 0.72750425, "memory(GiB)": 67.98, "step": 3895, "train_speed(iter/s)": 0.039654 }, { "acc": 0.78334684, "epoch": 2.6822558459422283, "grad_norm": 1.6697547435760498, "learning_rate": 4.811777027083104e-05, "loss": 0.74594717, "memory(GiB)": 67.98, "step": 3900, "train_speed(iter/s)": 0.039676 }, { "epoch": 2.6822558459422283, "eval_acc": 0.7695764160705448, "eval_loss": 0.8164530396461487, "eval_runtime": 1094.9986, "eval_samples_per_second": 3.911, "eval_steps_per_second": 0.07, "step": 3900 }, { "acc": 0.78133011, "epoch": 2.685694635488308, "grad_norm": 1.5049043893814087, "learning_rate": 4.80041407432871e-05, "loss": 0.74013877, "memory(GiB)": 67.98, "step": 3905, "train_speed(iter/s)": 0.039262 }, { "acc": 0.78779163, "epoch": 2.689133425034388, "grad_norm": 1.292845606803894, "learning_rate": 4.7890521567235375e-05, "loss": 0.73777471, "memory(GiB)": 67.98, "step": 3910, "train_speed(iter/s)": 0.039285 }, { "acc": 0.78793478, "epoch": 2.692572214580468, "grad_norm": 1.6969997882843018, "learning_rate": 4.7776913330487335e-05, "loss": 0.72460685, "memory(GiB)": 67.98, "step": 3915, "train_speed(iter/s)": 0.039309 }, { "acc": 0.78481874, "epoch": 2.6960110041265475, "grad_norm": 1.6642791032791138, "learning_rate": 4.766331662079784e-05, "loss": 0.73782244, "memory(GiB)": 67.98, "step": 3920, "train_speed(iter/s)": 0.039331 }, { "acc": 0.77672281, "epoch": 2.699449793672627, "grad_norm": 1.464065670967102, "learning_rate": 4.754973202586213e-05, "loss": 0.77285328, "memory(GiB)": 67.98, "step": 3925, "train_speed(iter/s)": 0.039351 }, { "acc": 0.78013086, "epoch": 2.702888583218707, "grad_norm": 1.6267447471618652, "learning_rate": 4.7436160133312756e-05, "loss": 0.77444224, "memory(GiB)": 67.98, "step": 3930, "train_speed(iter/s)": 0.039372 }, { "acc": 0.79396415, "epoch": 2.706327372764787, "grad_norm": 1.377986192703247, "learning_rate": 4.7322601530716593e-05, "loss": 0.69987969, "memory(GiB)": 67.98, "step": 3935, "train_speed(iter/s)": 0.039392 }, { "acc": 0.78015747, "epoch": 2.7097661623108666, "grad_norm": 1.5132167339324951, "learning_rate": 4.72090568055717e-05, "loss": 0.73972359, "memory(GiB)": 67.98, "step": 3940, "train_speed(iter/s)": 0.039413 }, { "acc": 0.78305364, "epoch": 2.7132049518569463, "grad_norm": 1.3939101696014404, "learning_rate": 4.709552654530438e-05, "loss": 0.74475136, "memory(GiB)": 67.98, "step": 3945, "train_speed(iter/s)": 0.039437 }, { "acc": 0.79228973, "epoch": 2.716643741403026, "grad_norm": 1.5657391548156738, "learning_rate": 4.69820113372661e-05, "loss": 0.70100541, "memory(GiB)": 67.98, "step": 3950, "train_speed(iter/s)": 0.039459 }, { "acc": 0.79344339, "epoch": 2.7200825309491057, "grad_norm": 1.480087399482727, "learning_rate": 4.686851176873045e-05, "loss": 0.70072994, "memory(GiB)": 67.98, "step": 3955, "train_speed(iter/s)": 0.039483 }, { "acc": 0.79308243, "epoch": 2.723521320495186, "grad_norm": 1.5921666622161865, "learning_rate": 4.6755028426890096e-05, "loss": 0.70272703, "memory(GiB)": 67.98, "step": 3960, "train_speed(iter/s)": 0.039506 }, { "acc": 0.79001474, "epoch": 2.7269601100412655, "grad_norm": 1.3979772329330444, "learning_rate": 4.664156189885376e-05, "loss": 0.69688091, "memory(GiB)": 67.98, "step": 3965, "train_speed(iter/s)": 0.039529 }, { "acc": 0.77611008, "epoch": 2.730398899587345, "grad_norm": 1.5082849264144897, "learning_rate": 4.65281127716432e-05, "loss": 0.774436, "memory(GiB)": 67.98, "step": 3970, "train_speed(iter/s)": 0.039554 }, { "acc": 0.78162088, "epoch": 2.7338376891334253, "grad_norm": 1.5324316024780273, "learning_rate": 4.64146816321901e-05, "loss": 0.73829603, "memory(GiB)": 67.98, "step": 3975, "train_speed(iter/s)": 0.039575 }, { "acc": 0.78739605, "epoch": 2.737276478679505, "grad_norm": 1.5039098262786865, "learning_rate": 4.630126906733315e-05, "loss": 0.73118725, "memory(GiB)": 67.98, "step": 3980, "train_speed(iter/s)": 0.039598 }, { "acc": 0.7873105, "epoch": 2.7407152682255846, "grad_norm": 1.6895498037338257, "learning_rate": 4.6187875663814886e-05, "loss": 0.72477093, "memory(GiB)": 67.98, "step": 3985, "train_speed(iter/s)": 0.039618 }, { "acc": 0.78072052, "epoch": 2.7441540577716643, "grad_norm": 1.350480318069458, "learning_rate": 4.607450200827874e-05, "loss": 0.73954563, "memory(GiB)": 67.98, "step": 3990, "train_speed(iter/s)": 0.039639 }, { "acc": 0.78461032, "epoch": 2.747592847317744, "grad_norm": 1.5248438119888306, "learning_rate": 4.596114868726598e-05, "loss": 0.7439085, "memory(GiB)": 67.98, "step": 3995, "train_speed(iter/s)": 0.039656 }, { "acc": 0.7952045, "epoch": 2.751031636863824, "grad_norm": 1.2919889688491821, "learning_rate": 4.5847816287212645e-05, "loss": 0.70409346, "memory(GiB)": 67.98, "step": 4000, "train_speed(iter/s)": 0.03968 }, { "epoch": 2.751031636863824, "eval_acc": 0.7708024024834661, "eval_loss": 0.8120156526565552, "eval_runtime": 1144.2771, "eval_samples_per_second": 3.743, "eval_steps_per_second": 0.067, "step": 4000 }, { "acc": 0.78138909, "epoch": 2.754470426409904, "grad_norm": 1.685054063796997, "learning_rate": 4.57345053944466e-05, "loss": 0.76331453, "memory(GiB)": 67.98, "step": 4005, "train_speed(iter/s)": 0.039257 }, { "acc": 0.79411173, "epoch": 2.7579092159559835, "grad_norm": 2.0349268913269043, "learning_rate": 4.562121659518438e-05, "loss": 0.71027813, "memory(GiB)": 67.98, "step": 4010, "train_speed(iter/s)": 0.039282 }, { "acc": 0.78988757, "epoch": 2.761348005502063, "grad_norm": 1.3015258312225342, "learning_rate": 4.5507950475528236e-05, "loss": 0.71334782, "memory(GiB)": 67.98, "step": 4015, "train_speed(iter/s)": 0.039304 }, { "acc": 0.79387317, "epoch": 2.764786795048143, "grad_norm": 1.4291696548461914, "learning_rate": 4.539470762146308e-05, "loss": 0.70652847, "memory(GiB)": 67.98, "step": 4020, "train_speed(iter/s)": 0.03932 }, { "acc": 0.78285937, "epoch": 2.768225584594223, "grad_norm": 1.477131962776184, "learning_rate": 4.5281488618853503e-05, "loss": 0.75896859, "memory(GiB)": 67.98, "step": 4025, "train_speed(iter/s)": 0.039342 }, { "acc": 0.78991375, "epoch": 2.7716643741403026, "grad_norm": 1.352389931678772, "learning_rate": 4.516829405344063e-05, "loss": 0.71030273, "memory(GiB)": 67.98, "step": 4030, "train_speed(iter/s)": 0.039364 }, { "acc": 0.79130993, "epoch": 2.7751031636863823, "grad_norm": 1.5674926042556763, "learning_rate": 4.505512451083922e-05, "loss": 0.71874084, "memory(GiB)": 67.98, "step": 4035, "train_speed(iter/s)": 0.039386 }, { "acc": 0.79276628, "epoch": 2.7785419532324624, "grad_norm": 1.943419098854065, "learning_rate": 4.494198057653455e-05, "loss": 0.71133614, "memory(GiB)": 67.98, "step": 4040, "train_speed(iter/s)": 0.039408 }, { "acc": 0.80222769, "epoch": 2.7819807427785417, "grad_norm": 1.6925394535064697, "learning_rate": 4.482886283587938e-05, "loss": 0.67353868, "memory(GiB)": 67.98, "step": 4045, "train_speed(iter/s)": 0.039433 }, { "acc": 0.80383835, "epoch": 2.785419532324622, "grad_norm": 1.4405827522277832, "learning_rate": 4.471577187409103e-05, "loss": 0.66345797, "memory(GiB)": 67.98, "step": 4050, "train_speed(iter/s)": 0.039457 }, { "acc": 0.80842638, "epoch": 2.7888583218707015, "grad_norm": 1.674682378768921, "learning_rate": 4.460270827624821e-05, "loss": 0.66658139, "memory(GiB)": 67.98, "step": 4055, "train_speed(iter/s)": 0.039479 }, { "acc": 0.79156666, "epoch": 2.792297111416781, "grad_norm": 1.3792381286621094, "learning_rate": 4.4489672627288124e-05, "loss": 0.73030577, "memory(GiB)": 67.98, "step": 4060, "train_speed(iter/s)": 0.039504 }, { "acc": 0.79346962, "epoch": 2.7957359009628613, "grad_norm": 1.404285192489624, "learning_rate": 4.4376665512003304e-05, "loss": 0.70117588, "memory(GiB)": 67.98, "step": 4065, "train_speed(iter/s)": 0.039526 }, { "acc": 0.790658, "epoch": 2.799174690508941, "grad_norm": 1.417019248008728, "learning_rate": 4.4263687515038755e-05, "loss": 0.70299535, "memory(GiB)": 67.98, "step": 4070, "train_speed(iter/s)": 0.039548 }, { "acc": 0.78736067, "epoch": 2.8026134800550206, "grad_norm": 1.5088238716125488, "learning_rate": 4.415073922088876e-05, "loss": 0.73802028, "memory(GiB)": 67.98, "step": 4075, "train_speed(iter/s)": 0.039565 }, { "acc": 0.79492655, "epoch": 2.8060522696011003, "grad_norm": 1.443625569343567, "learning_rate": 4.4037821213893964e-05, "loss": 0.71042171, "memory(GiB)": 67.98, "step": 4080, "train_speed(iter/s)": 0.039586 }, { "acc": 0.79075756, "epoch": 2.80949105914718, "grad_norm": 1.464545726776123, "learning_rate": 4.392493407823832e-05, "loss": 0.70024977, "memory(GiB)": 67.98, "step": 4085, "train_speed(iter/s)": 0.039605 }, { "acc": 0.79780464, "epoch": 2.81292984869326, "grad_norm": 1.6561044454574585, "learning_rate": 4.3812078397946074e-05, "loss": 0.69342613, "memory(GiB)": 67.98, "step": 4090, "train_speed(iter/s)": 0.039624 }, { "acc": 0.78435755, "epoch": 2.81636863823934, "grad_norm": 1.3976974487304688, "learning_rate": 4.369925475687873e-05, "loss": 0.71552553, "memory(GiB)": 67.98, "step": 4095, "train_speed(iter/s)": 0.039648 }, { "acc": 0.7799448, "epoch": 2.8198074277854195, "grad_norm": 1.3698362112045288, "learning_rate": 4.358646373873203e-05, "loss": 0.75982933, "memory(GiB)": 67.98, "step": 4100, "train_speed(iter/s)": 0.039669 }, { "epoch": 2.8198074277854195, "eval_acc": 0.7719721509875377, "eval_loss": 0.8082969784736633, "eval_runtime": 1151.3186, "eval_samples_per_second": 3.72, "eval_steps_per_second": 0.067, "step": 4100 }, { "acc": 0.78468771, "epoch": 2.823246217331499, "grad_norm": 1.7453495264053345, "learning_rate": 4.3473705927032957e-05, "loss": 0.73120604, "memory(GiB)": 67.98, "step": 4105, "train_speed(iter/s)": 0.039254 }, { "acc": 0.77324467, "epoch": 2.826685006877579, "grad_norm": 1.309380292892456, "learning_rate": 4.336098190513667e-05, "loss": 0.7686954, "memory(GiB)": 67.98, "step": 4110, "train_speed(iter/s)": 0.039274 }, { "acc": 0.78504181, "epoch": 2.830123796423659, "grad_norm": 1.3735424280166626, "learning_rate": 4.324829225622355e-05, "loss": 0.72278986, "memory(GiB)": 67.98, "step": 4115, "train_speed(iter/s)": 0.039295 }, { "acc": 0.78531666, "epoch": 2.8335625859697386, "grad_norm": 1.3972020149230957, "learning_rate": 4.3135637563296157e-05, "loss": 0.74182968, "memory(GiB)": 67.98, "step": 4120, "train_speed(iter/s)": 0.039316 }, { "acc": 0.78637772, "epoch": 2.8370013755158183, "grad_norm": 1.5424326658248901, "learning_rate": 4.3023018409176145e-05, "loss": 0.74376593, "memory(GiB)": 67.98, "step": 4125, "train_speed(iter/s)": 0.039338 }, { "acc": 0.79664993, "epoch": 2.8404401650618984, "grad_norm": 1.3284099102020264, "learning_rate": 4.2910435376501365e-05, "loss": 0.67242994, "memory(GiB)": 67.98, "step": 4130, "train_speed(iter/s)": 0.039363 }, { "acc": 0.78375196, "epoch": 2.843878954607978, "grad_norm": 1.4063657522201538, "learning_rate": 4.279788904772275e-05, "loss": 0.73797774, "memory(GiB)": 67.98, "step": 4135, "train_speed(iter/s)": 0.039386 }, { "acc": 0.80310926, "epoch": 2.847317744154058, "grad_norm": 1.6251460313796997, "learning_rate": 4.268538000510139e-05, "loss": 0.67094946, "memory(GiB)": 67.98, "step": 4140, "train_speed(iter/s)": 0.039411 }, { "acc": 0.78242793, "epoch": 2.8507565337001375, "grad_norm": 1.4719781875610352, "learning_rate": 4.257290883070545e-05, "loss": 0.7414422, "memory(GiB)": 67.98, "step": 4145, "train_speed(iter/s)": 0.039435 }, { "acc": 0.79309282, "epoch": 2.854195323246217, "grad_norm": 1.491889238357544, "learning_rate": 4.246047610640717e-05, "loss": 0.69513445, "memory(GiB)": 67.98, "step": 4150, "train_speed(iter/s)": 0.039458 }, { "acc": 0.79532785, "epoch": 2.8576341127922973, "grad_norm": 1.4044826030731201, "learning_rate": 4.2348082413879894e-05, "loss": 0.69395657, "memory(GiB)": 67.98, "step": 4155, "train_speed(iter/s)": 0.039483 }, { "acc": 0.79063025, "epoch": 2.861072902338377, "grad_norm": 1.4058098793029785, "learning_rate": 4.223572833459501e-05, "loss": 0.71690941, "memory(GiB)": 67.98, "step": 4160, "train_speed(iter/s)": 0.039508 }, { "acc": 0.79869499, "epoch": 2.8645116918844566, "grad_norm": 1.6210905313491821, "learning_rate": 4.212341444981898e-05, "loss": 0.6896822, "memory(GiB)": 67.98, "step": 4165, "train_speed(iter/s)": 0.039532 }, { "acc": 0.79149799, "epoch": 2.8679504814305363, "grad_norm": 1.3731998205184937, "learning_rate": 4.2011141340610326e-05, "loss": 0.7168128, "memory(GiB)": 67.98, "step": 4170, "train_speed(iter/s)": 0.039554 }, { "acc": 0.78879414, "epoch": 2.871389270976616, "grad_norm": 1.632126808166504, "learning_rate": 4.189890958781662e-05, "loss": 0.72364569, "memory(GiB)": 67.98, "step": 4175, "train_speed(iter/s)": 0.039576 }, { "acc": 0.78361959, "epoch": 2.874828060522696, "grad_norm": 1.4791241884231567, "learning_rate": 4.178671977207143e-05, "loss": 0.73310771, "memory(GiB)": 67.98, "step": 4180, "train_speed(iter/s)": 0.039599 }, { "acc": 0.79908352, "epoch": 2.878266850068776, "grad_norm": 1.7965590953826904, "learning_rate": 4.1674572473791395e-05, "loss": 0.69370174, "memory(GiB)": 67.98, "step": 4185, "train_speed(iter/s)": 0.039621 }, { "acc": 0.78735409, "epoch": 2.8817056396148555, "grad_norm": 1.6834094524383545, "learning_rate": 4.156246827317322e-05, "loss": 0.72156515, "memory(GiB)": 67.98, "step": 4190, "train_speed(iter/s)": 0.039644 }, { "acc": 0.78155212, "epoch": 2.8851444291609356, "grad_norm": 1.872073769569397, "learning_rate": 4.14504077501906e-05, "loss": 0.74036779, "memory(GiB)": 67.98, "step": 4195, "train_speed(iter/s)": 0.039667 }, { "acc": 0.79145999, "epoch": 2.8885832187070153, "grad_norm": 1.3122477531433105, "learning_rate": 4.133839148459126e-05, "loss": 0.71245356, "memory(GiB)": 67.98, "step": 4200, "train_speed(iter/s)": 0.039688 }, { "epoch": 2.8885832187070153, "eval_acc": 0.7714435146443515, "eval_loss": 0.805468738079071, "eval_runtime": 1087.9192, "eval_samples_per_second": 3.937, "eval_steps_per_second": 0.071, "step": 4200 }, { "acc": 0.79164152, "epoch": 2.892022008253095, "grad_norm": 1.5151678323745728, "learning_rate": 4.122642005589398e-05, "loss": 0.71430082, "memory(GiB)": 67.98, "step": 4205, "train_speed(iter/s)": 0.039306 }, { "acc": 0.79683599, "epoch": 2.8954607977991746, "grad_norm": 1.5568134784698486, "learning_rate": 4.111449404338556e-05, "loss": 0.69535141, "memory(GiB)": 67.98, "step": 4210, "train_speed(iter/s)": 0.039331 }, { "acc": 0.78143187, "epoch": 2.8988995873452543, "grad_norm": 1.6322216987609863, "learning_rate": 4.100261402611785e-05, "loss": 0.74795027, "memory(GiB)": 67.98, "step": 4215, "train_speed(iter/s)": 0.039355 }, { "acc": 0.779213, "epoch": 2.9023383768913344, "grad_norm": 1.479254126548767, "learning_rate": 4.089078058290476e-05, "loss": 0.76658916, "memory(GiB)": 67.98, "step": 4220, "train_speed(iter/s)": 0.039376 }, { "acc": 0.7864768, "epoch": 2.905777166437414, "grad_norm": 1.4543869495391846, "learning_rate": 4.077899429231921e-05, "loss": 0.71652775, "memory(GiB)": 67.98, "step": 4225, "train_speed(iter/s)": 0.039397 }, { "acc": 0.78852596, "epoch": 2.909215955983494, "grad_norm": 1.5353100299835205, "learning_rate": 4.066725573269019e-05, "loss": 0.7080534, "memory(GiB)": 67.98, "step": 4230, "train_speed(iter/s)": 0.039421 }, { "acc": 0.78499179, "epoch": 2.9126547455295735, "grad_norm": 1.7298237085342407, "learning_rate": 4.055556548209975e-05, "loss": 0.73987002, "memory(GiB)": 67.98, "step": 4235, "train_speed(iter/s)": 0.039446 }, { "acc": 0.79733381, "epoch": 2.916093535075653, "grad_norm": 1.3336453437805176, "learning_rate": 4.044392411838003e-05, "loss": 0.6844718, "memory(GiB)": 67.98, "step": 4240, "train_speed(iter/s)": 0.039468 }, { "acc": 0.78939738, "epoch": 2.9195323246217333, "grad_norm": 1.5154653787612915, "learning_rate": 4.033233221911023e-05, "loss": 0.72056727, "memory(GiB)": 67.98, "step": 4245, "train_speed(iter/s)": 0.039491 }, { "acc": 0.78145633, "epoch": 2.922971114167813, "grad_norm": 1.6946913003921509, "learning_rate": 4.022079036161366e-05, "loss": 0.74741826, "memory(GiB)": 67.98, "step": 4250, "train_speed(iter/s)": 0.039515 }, { "acc": 0.78894501, "epoch": 2.9264099037138926, "grad_norm": 1.7859429121017456, "learning_rate": 4.0109299122954716e-05, "loss": 0.71477051, "memory(GiB)": 67.98, "step": 4255, "train_speed(iter/s)": 0.039538 }, { "acc": 0.80096769, "epoch": 2.9298486932599723, "grad_norm": 1.550113558769226, "learning_rate": 3.999785907993594e-05, "loss": 0.66986256, "memory(GiB)": 67.98, "step": 4260, "train_speed(iter/s)": 0.039561 }, { "acc": 0.79326687, "epoch": 2.933287482806052, "grad_norm": 1.3913989067077637, "learning_rate": 3.9886470809095015e-05, "loss": 0.70431404, "memory(GiB)": 67.98, "step": 4265, "train_speed(iter/s)": 0.039585 }, { "acc": 0.78397541, "epoch": 2.936726272352132, "grad_norm": 1.7210358381271362, "learning_rate": 3.9775134886701754e-05, "loss": 0.74710093, "memory(GiB)": 67.98, "step": 4270, "train_speed(iter/s)": 0.039608 }, { "acc": 0.79305878, "epoch": 2.940165061898212, "grad_norm": 1.7996710538864136, "learning_rate": 3.966385188875515e-05, "loss": 0.70518632, "memory(GiB)": 67.98, "step": 4275, "train_speed(iter/s)": 0.039629 }, { "acc": 0.79449868, "epoch": 2.9436038514442915, "grad_norm": 1.8419127464294434, "learning_rate": 3.9552622390980425e-05, "loss": 0.69353704, "memory(GiB)": 67.98, "step": 4280, "train_speed(iter/s)": 0.039653 }, { "acc": 0.79312563, "epoch": 2.9470426409903716, "grad_norm": 1.6806973218917847, "learning_rate": 3.944144696882598e-05, "loss": 0.70997305, "memory(GiB)": 67.98, "step": 4285, "train_speed(iter/s)": 0.039675 }, { "acc": 0.77975159, "epoch": 2.9504814305364513, "grad_norm": 1.5093615055084229, "learning_rate": 3.9330326197460466e-05, "loss": 0.7535347, "memory(GiB)": 67.98, "step": 4290, "train_speed(iter/s)": 0.039698 }, { "acc": 0.77885957, "epoch": 2.953920220082531, "grad_norm": 1.7408277988433838, "learning_rate": 3.921926065176977e-05, "loss": 0.75995541, "memory(GiB)": 67.98, "step": 4295, "train_speed(iter/s)": 0.03972 }, { "acc": 0.78874741, "epoch": 2.9573590096286106, "grad_norm": 1.6146240234375, "learning_rate": 3.9108250906354117e-05, "loss": 0.71309519, "memory(GiB)": 67.98, "step": 4300, "train_speed(iter/s)": 0.039742 }, { "epoch": 2.9573590096286106, "eval_acc": 0.7727257389661223, "eval_loss": 0.8019844889640808, "eval_runtime": 1140.1199, "eval_samples_per_second": 3.757, "eval_steps_per_second": 0.068, "step": 4300 }, { "acc": 0.78785725, "epoch": 2.9607977991746903, "grad_norm": 1.555442452430725, "learning_rate": 3.8997297535525026e-05, "loss": 0.72890291, "memory(GiB)": 67.98, "step": 4305, "train_speed(iter/s)": 0.039349 }, { "acc": 0.78933182, "epoch": 2.9642365887207704, "grad_norm": 1.710303783416748, "learning_rate": 3.888640111330235e-05, "loss": 0.73036714, "memory(GiB)": 67.98, "step": 4310, "train_speed(iter/s)": 0.039373 }, { "acc": 0.79446011, "epoch": 2.96767537826685, "grad_norm": 1.7401241064071655, "learning_rate": 3.877556221341133e-05, "loss": 0.70017486, "memory(GiB)": 67.98, "step": 4315, "train_speed(iter/s)": 0.039398 }, { "acc": 0.78983717, "epoch": 2.97111416781293, "grad_norm": 1.5789563655853271, "learning_rate": 3.866478140927961e-05, "loss": 0.70362015, "memory(GiB)": 67.98, "step": 4320, "train_speed(iter/s)": 0.039419 }, { "acc": 0.79765377, "epoch": 2.9745529573590095, "grad_norm": 2.0560176372528076, "learning_rate": 3.8554059274034246e-05, "loss": 0.68930745, "memory(GiB)": 67.98, "step": 4325, "train_speed(iter/s)": 0.039442 }, { "acc": 0.79753799, "epoch": 2.977991746905089, "grad_norm": 1.5742462873458862, "learning_rate": 3.844339638049885e-05, "loss": 0.68201818, "memory(GiB)": 67.98, "step": 4330, "train_speed(iter/s)": 0.039466 }, { "acc": 0.7916151, "epoch": 2.9814305364511693, "grad_norm": 1.7083474397659302, "learning_rate": 3.8332793301190456e-05, "loss": 0.6970108, "memory(GiB)": 67.98, "step": 4335, "train_speed(iter/s)": 0.03949 }, { "acc": 0.7908206, "epoch": 2.984869325997249, "grad_norm": 1.6145273447036743, "learning_rate": 3.822225060831669e-05, "loss": 0.72308092, "memory(GiB)": 67.98, "step": 4340, "train_speed(iter/s)": 0.039512 }, { "acc": 0.79732313, "epoch": 2.9883081155433286, "grad_norm": 1.3791991472244263, "learning_rate": 3.8111768873772757e-05, "loss": 0.68552351, "memory(GiB)": 67.98, "step": 4345, "train_speed(iter/s)": 0.039537 }, { "acc": 0.78215866, "epoch": 2.9917469050894088, "grad_norm": 1.587035059928894, "learning_rate": 3.800134866913852e-05, "loss": 0.74166784, "memory(GiB)": 67.98, "step": 4350, "train_speed(iter/s)": 0.03956 }, { "acc": 0.7990098, "epoch": 2.9951856946354884, "grad_norm": 1.8290317058563232, "learning_rate": 3.7890990565675476e-05, "loss": 0.68875532, "memory(GiB)": 67.98, "step": 4355, "train_speed(iter/s)": 0.039584 }, { "acc": 0.78591781, "epoch": 2.998624484181568, "grad_norm": 1.8819842338562012, "learning_rate": 3.778069513432386e-05, "loss": 0.72816386, "memory(GiB)": 67.98, "step": 4360, "train_speed(iter/s)": 0.039604 }, { "acc": 0.80687866, "epoch": 3.002063273727648, "grad_norm": 1.3995342254638672, "learning_rate": 3.767046294569967e-05, "loss": 0.64414482, "memory(GiB)": 67.98, "step": 4365, "train_speed(iter/s)": 0.039611 }, { "acc": 0.80390854, "epoch": 3.0055020632737275, "grad_norm": 1.5679051876068115, "learning_rate": 3.75602945700917e-05, "loss": 0.66774035, "memory(GiB)": 67.98, "step": 4370, "train_speed(iter/s)": 0.039629 }, { "acc": 0.79944701, "epoch": 3.0089408528198076, "grad_norm": 1.531205177307129, "learning_rate": 3.7450190577458635e-05, "loss": 0.67704058, "memory(GiB)": 67.98, "step": 4375, "train_speed(iter/s)": 0.039649 }, { "acc": 0.80703545, "epoch": 3.0123796423658873, "grad_norm": 6.210807800292969, "learning_rate": 3.734015153742605e-05, "loss": 0.64957862, "memory(GiB)": 67.98, "step": 4380, "train_speed(iter/s)": 0.039672 }, { "acc": 0.80491982, "epoch": 3.015818431911967, "grad_norm": 1.6315518617630005, "learning_rate": 3.7230178019283506e-05, "loss": 0.65046768, "memory(GiB)": 67.98, "step": 4385, "train_speed(iter/s)": 0.039693 }, { "acc": 0.8061985, "epoch": 3.0192572214580466, "grad_norm": 1.478652000427246, "learning_rate": 3.712027059198157e-05, "loss": 0.64048343, "memory(GiB)": 67.98, "step": 4390, "train_speed(iter/s)": 0.039708 }, { "acc": 0.81162281, "epoch": 3.0226960110041263, "grad_norm": 1.623420238494873, "learning_rate": 3.701042982412889e-05, "loss": 0.62963314, "memory(GiB)": 67.98, "step": 4395, "train_speed(iter/s)": 0.03973 }, { "acc": 0.80488195, "epoch": 3.0261348005502064, "grad_norm": 1.6778922080993652, "learning_rate": 3.690065628398926e-05, "loss": 0.65336089, "memory(GiB)": 67.98, "step": 4400, "train_speed(iter/s)": 0.039751 }, { "epoch": 3.0261348005502064, "eval_acc": 0.7711454537274486, "eval_loss": 0.8140049576759338, "eval_runtime": 1141.0798, "eval_samples_per_second": 3.753, "eval_steps_per_second": 0.067, "step": 4400 }, { "acc": 0.80764694, "epoch": 3.029573590096286, "grad_norm": 1.6117892265319824, "learning_rate": 3.679095053947864e-05, "loss": 0.6384645, "memory(GiB)": 67.98, "step": 4405, "train_speed(iter/s)": 0.039366 }, { "acc": 0.80960245, "epoch": 3.033012379642366, "grad_norm": 1.5972310304641724, "learning_rate": 3.668131315816228e-05, "loss": 0.63809519, "memory(GiB)": 67.98, "step": 4410, "train_speed(iter/s)": 0.039387 }, { "acc": 0.80579681, "epoch": 3.0364511691884455, "grad_norm": 1.6774109601974487, "learning_rate": 3.657174470725173e-05, "loss": 0.64105072, "memory(GiB)": 67.98, "step": 4415, "train_speed(iter/s)": 0.039406 }, { "acc": 0.81135626, "epoch": 3.0398899587345256, "grad_norm": 1.710260033607483, "learning_rate": 3.646224575360194e-05, "loss": 0.6407239, "memory(GiB)": 67.98, "step": 4420, "train_speed(iter/s)": 0.039428 }, { "acc": 0.81669779, "epoch": 3.0433287482806053, "grad_norm": 1.5772171020507812, "learning_rate": 3.635281686370832e-05, "loss": 0.61197987, "memory(GiB)": 67.98, "step": 4425, "train_speed(iter/s)": 0.039449 }, { "acc": 0.81082649, "epoch": 3.046767537826685, "grad_norm": 2.1017799377441406, "learning_rate": 3.624345860370379e-05, "loss": 0.63282819, "memory(GiB)": 67.98, "step": 4430, "train_speed(iter/s)": 0.039468 }, { "acc": 0.80507336, "epoch": 3.0502063273727646, "grad_norm": 1.904692530632019, "learning_rate": 3.613417153935585e-05, "loss": 0.63742828, "memory(GiB)": 67.98, "step": 4435, "train_speed(iter/s)": 0.039486 }, { "acc": 0.79859557, "epoch": 3.0536451169188448, "grad_norm": 1.673336148262024, "learning_rate": 3.60249562360637e-05, "loss": 0.67739854, "memory(GiB)": 67.98, "step": 4440, "train_speed(iter/s)": 0.039507 }, { "acc": 0.8053956, "epoch": 3.0570839064649244, "grad_norm": 1.6409105062484741, "learning_rate": 3.591581325885528e-05, "loss": 0.64070592, "memory(GiB)": 67.98, "step": 4445, "train_speed(iter/s)": 0.039524 }, { "acc": 0.81321754, "epoch": 3.060522696011004, "grad_norm": 1.599678874015808, "learning_rate": 3.5806743172384325e-05, "loss": 0.62494526, "memory(GiB)": 67.98, "step": 4450, "train_speed(iter/s)": 0.039544 }, { "acc": 0.81660137, "epoch": 3.063961485557084, "grad_norm": 1.527250051498413, "learning_rate": 3.569774654092749e-05, "loss": 0.61917772, "memory(GiB)": 67.98, "step": 4455, "train_speed(iter/s)": 0.039565 }, { "acc": 0.80815334, "epoch": 3.0674002751031635, "grad_norm": 1.9215754270553589, "learning_rate": 3.5588823928381385e-05, "loss": 0.64416943, "memory(GiB)": 67.98, "step": 4460, "train_speed(iter/s)": 0.039584 }, { "acc": 0.81522007, "epoch": 3.0708390646492436, "grad_norm": 1.771016240119934, "learning_rate": 3.54799758982597e-05, "loss": 0.62254939, "memory(GiB)": 67.98, "step": 4465, "train_speed(iter/s)": 0.039604 }, { "acc": 0.81300201, "epoch": 3.0742778541953233, "grad_norm": 1.5185010433197021, "learning_rate": 3.537120301369029e-05, "loss": 0.63570495, "memory(GiB)": 67.98, "step": 4470, "train_speed(iter/s)": 0.039623 }, { "acc": 0.79795976, "epoch": 3.077716643741403, "grad_norm": 1.7474913597106934, "learning_rate": 3.526250583741219e-05, "loss": 0.67301879, "memory(GiB)": 67.98, "step": 4475, "train_speed(iter/s)": 0.039644 }, { "acc": 0.80364552, "epoch": 3.0811554332874826, "grad_norm": 1.611039638519287, "learning_rate": 3.51538849317728e-05, "loss": 0.6553544, "memory(GiB)": 67.98, "step": 4480, "train_speed(iter/s)": 0.039664 }, { "acc": 0.80711832, "epoch": 3.0845942228335628, "grad_norm": 1.956214189529419, "learning_rate": 3.504534085872491e-05, "loss": 0.65441723, "memory(GiB)": 67.98, "step": 4485, "train_speed(iter/s)": 0.039686 }, { "acc": 0.80393448, "epoch": 3.0880330123796425, "grad_norm": 1.7758394479751587, "learning_rate": 3.493687417982382e-05, "loss": 0.63968649, "memory(GiB)": 67.98, "step": 4490, "train_speed(iter/s)": 0.039704 }, { "acc": 0.80570278, "epoch": 3.091471801925722, "grad_norm": 1.878055453300476, "learning_rate": 3.4828485456224454e-05, "loss": 0.64807596, "memory(GiB)": 67.98, "step": 4495, "train_speed(iter/s)": 0.039724 }, { "acc": 0.80985212, "epoch": 3.094910591471802, "grad_norm": 1.647511601448059, "learning_rate": 3.47201752486784e-05, "loss": 0.63398943, "memory(GiB)": 67.98, "step": 4500, "train_speed(iter/s)": 0.039743 }, { "epoch": 3.094910591471802, "eval_acc": 0.7721239933414316, "eval_loss": 0.81331866979599, "eval_runtime": 1133.398, "eval_samples_per_second": 3.779, "eval_steps_per_second": 0.068, "step": 4500 }, { "acc": 0.80513477, "epoch": 3.098349381017882, "grad_norm": 1.8428512811660767, "learning_rate": 3.461194411753105e-05, "loss": 0.64937515, "memory(GiB)": 67.98, "step": 4505, "train_speed(iter/s)": 0.03937 }, { "acc": 0.81019039, "epoch": 3.1017881705639616, "grad_norm": 1.6519265174865723, "learning_rate": 3.450379262271869e-05, "loss": 0.63972459, "memory(GiB)": 67.98, "step": 4510, "train_speed(iter/s)": 0.039392 }, { "acc": 0.8086174, "epoch": 3.1052269601100413, "grad_norm": 1.7133119106292725, "learning_rate": 3.439572132376563e-05, "loss": 0.64712973, "memory(GiB)": 67.98, "step": 4515, "train_speed(iter/s)": 0.039407 }, { "acc": 0.79508266, "epoch": 3.108665749656121, "grad_norm": 1.6571804285049438, "learning_rate": 3.428773077978125e-05, "loss": 0.68026247, "memory(GiB)": 67.98, "step": 4520, "train_speed(iter/s)": 0.039425 }, { "acc": 0.8028862, "epoch": 3.1121045392022006, "grad_norm": 2.0089550018310547, "learning_rate": 3.4179821549457166e-05, "loss": 0.66466484, "memory(GiB)": 67.98, "step": 4525, "train_speed(iter/s)": 0.039441 }, { "acc": 0.7982996, "epoch": 3.1155433287482808, "grad_norm": 1.620611548423767, "learning_rate": 3.407199419106429e-05, "loss": 0.67201767, "memory(GiB)": 67.98, "step": 4530, "train_speed(iter/s)": 0.03946 }, { "acc": 0.81185446, "epoch": 3.1189821182943605, "grad_norm": 1.5307915210723877, "learning_rate": 3.396424926244999e-05, "loss": 0.62855453, "memory(GiB)": 67.98, "step": 4535, "train_speed(iter/s)": 0.039478 }, { "acc": 0.80879059, "epoch": 3.12242090784044, "grad_norm": 1.9358049631118774, "learning_rate": 3.3856587321035206e-05, "loss": 0.63443809, "memory(GiB)": 67.98, "step": 4540, "train_speed(iter/s)": 0.0395 }, { "acc": 0.81181793, "epoch": 3.12585969738652, "grad_norm": 1.9281483888626099, "learning_rate": 3.374900892381146e-05, "loss": 0.62519212, "memory(GiB)": 67.98, "step": 4545, "train_speed(iter/s)": 0.039518 }, { "acc": 0.79986091, "epoch": 3.1292984869326, "grad_norm": 1.8126670122146606, "learning_rate": 3.3641514627338166e-05, "loss": 0.67471228, "memory(GiB)": 67.98, "step": 4550, "train_speed(iter/s)": 0.039538 }, { "acc": 0.81441412, "epoch": 3.1327372764786796, "grad_norm": 1.9482190608978271, "learning_rate": 3.353410498773954e-05, "loss": 0.62350183, "memory(GiB)": 67.98, "step": 4555, "train_speed(iter/s)": 0.039558 }, { "acc": 0.80743856, "epoch": 3.1361760660247593, "grad_norm": 1.8278954029083252, "learning_rate": 3.342678056070189e-05, "loss": 0.65586147, "memory(GiB)": 67.98, "step": 4560, "train_speed(iter/s)": 0.03958 }, { "acc": 0.80009956, "epoch": 3.139614855570839, "grad_norm": 1.9203051328659058, "learning_rate": 3.331954190147065e-05, "loss": 0.67459331, "memory(GiB)": 67.98, "step": 4565, "train_speed(iter/s)": 0.039599 }, { "acc": 0.79797955, "epoch": 3.1430536451169186, "grad_norm": 1.78507399559021, "learning_rate": 3.321238956484752e-05, "loss": 0.68094501, "memory(GiB)": 67.98, "step": 4570, "train_speed(iter/s)": 0.039619 }, { "acc": 0.80958462, "epoch": 3.1464924346629988, "grad_norm": 1.8514398336410522, "learning_rate": 3.310532410518765e-05, "loss": 0.63833261, "memory(GiB)": 67.98, "step": 4575, "train_speed(iter/s)": 0.039639 }, { "acc": 0.8103529, "epoch": 3.1499312242090785, "grad_norm": 2.1083662509918213, "learning_rate": 3.2998346076396664e-05, "loss": 0.63392391, "memory(GiB)": 67.98, "step": 4580, "train_speed(iter/s)": 0.039659 }, { "acc": 0.80684109, "epoch": 3.153370013755158, "grad_norm": 2.081134080886841, "learning_rate": 3.289145603192793e-05, "loss": 0.65391574, "memory(GiB)": 67.98, "step": 4585, "train_speed(iter/s)": 0.03968 }, { "acc": 0.80696983, "epoch": 3.156808803301238, "grad_norm": 1.7588388919830322, "learning_rate": 3.2784654524779587e-05, "loss": 0.65089002, "memory(GiB)": 67.98, "step": 4590, "train_speed(iter/s)": 0.039697 }, { "acc": 0.80319796, "epoch": 3.160247592847318, "grad_norm": 1.8731495141983032, "learning_rate": 3.267794210749173e-05, "loss": 0.66944408, "memory(GiB)": 67.98, "step": 4595, "train_speed(iter/s)": 0.039715 }, { "acc": 0.80229826, "epoch": 3.1636863823933976, "grad_norm": 1.863386869430542, "learning_rate": 3.2571319332143516e-05, "loss": 0.67615876, "memory(GiB)": 67.98, "step": 4600, "train_speed(iter/s)": 0.039734 }, { "epoch": 3.1636863823933976, "eval_acc": 0.7733612273361228, "eval_loss": 0.8061870336532593, "eval_runtime": 1127.8615, "eval_samples_per_second": 3.797, "eval_steps_per_second": 0.068, "step": 4600 }, { "acc": 0.81224995, "epoch": 3.1671251719394773, "grad_norm": 1.7116352319717407, "learning_rate": 3.2464786750350434e-05, "loss": 0.62269239, "memory(GiB)": 67.98, "step": 4605, "train_speed(iter/s)": 0.03937 }, { "acc": 0.80045443, "epoch": 3.170563961485557, "grad_norm": 1.838098406791687, "learning_rate": 3.235834491326126e-05, "loss": 0.65012379, "memory(GiB)": 67.98, "step": 4610, "train_speed(iter/s)": 0.039391 }, { "acc": 0.80470877, "epoch": 3.1740027510316366, "grad_norm": 1.8031960725784302, "learning_rate": 3.225199437155532e-05, "loss": 0.65979033, "memory(GiB)": 67.98, "step": 4615, "train_speed(iter/s)": 0.039409 }, { "acc": 0.80560265, "epoch": 3.1774415405777168, "grad_norm": 1.7068849802017212, "learning_rate": 3.214573567543964e-05, "loss": 0.63796139, "memory(GiB)": 67.98, "step": 4620, "train_speed(iter/s)": 0.039427 }, { "acc": 0.79233809, "epoch": 3.1808803301237965, "grad_norm": 1.7398771047592163, "learning_rate": 3.203956937464607e-05, "loss": 0.67283368, "memory(GiB)": 67.98, "step": 4625, "train_speed(iter/s)": 0.039446 }, { "acc": 0.81366425, "epoch": 3.184319119669876, "grad_norm": 1.7081953287124634, "learning_rate": 3.1933496018428446e-05, "loss": 0.62146492, "memory(GiB)": 67.98, "step": 4630, "train_speed(iter/s)": 0.039466 }, { "acc": 0.80281668, "epoch": 3.187757909215956, "grad_norm": 1.6009129285812378, "learning_rate": 3.1827516155559786e-05, "loss": 0.66720371, "memory(GiB)": 67.98, "step": 4635, "train_speed(iter/s)": 0.039486 }, { "acc": 0.80487442, "epoch": 3.191196698762036, "grad_norm": 1.8239426612854004, "learning_rate": 3.1721630334329366e-05, "loss": 0.64386883, "memory(GiB)": 67.98, "step": 4640, "train_speed(iter/s)": 0.039505 }, { "acc": 0.80696297, "epoch": 3.1946354883081156, "grad_norm": 1.906916856765747, "learning_rate": 3.161583910253998e-05, "loss": 0.64987645, "memory(GiB)": 67.98, "step": 4645, "train_speed(iter/s)": 0.039524 }, { "acc": 0.80997219, "epoch": 3.1980742778541953, "grad_norm": 2.060511350631714, "learning_rate": 3.1510143007505016e-05, "loss": 0.63655567, "memory(GiB)": 67.98, "step": 4650, "train_speed(iter/s)": 0.039543 }, { "acc": 0.79812059, "epoch": 3.201513067400275, "grad_norm": 1.793277382850647, "learning_rate": 3.14045425960457e-05, "loss": 0.68602118, "memory(GiB)": 67.98, "step": 4655, "train_speed(iter/s)": 0.039564 }, { "acc": 0.79850287, "epoch": 3.204951856946355, "grad_norm": 1.6924282312393188, "learning_rate": 3.129903841448827e-05, "loss": 0.67275462, "memory(GiB)": 67.98, "step": 4660, "train_speed(iter/s)": 0.039583 }, { "acc": 0.80627632, "epoch": 3.2083906464924348, "grad_norm": 1.678781509399414, "learning_rate": 3.119363100866106e-05, "loss": 0.65286617, "memory(GiB)": 67.98, "step": 4665, "train_speed(iter/s)": 0.039602 }, { "acc": 0.81515961, "epoch": 3.2118294360385145, "grad_norm": 1.9915016889572144, "learning_rate": 3.108832092389172e-05, "loss": 0.59764929, "memory(GiB)": 67.98, "step": 4670, "train_speed(iter/s)": 0.039621 }, { "acc": 0.80686855, "epoch": 3.215268225584594, "grad_norm": 1.8249253034591675, "learning_rate": 3.098310870500448e-05, "loss": 0.64462824, "memory(GiB)": 67.98, "step": 4675, "train_speed(iter/s)": 0.03964 }, { "acc": 0.80929985, "epoch": 3.218707015130674, "grad_norm": 1.7660592794418335, "learning_rate": 3.087799489631721e-05, "loss": 0.6324172, "memory(GiB)": 67.98, "step": 4680, "train_speed(iter/s)": 0.039659 }, { "acc": 0.80256157, "epoch": 3.222145804676754, "grad_norm": 1.9033777713775635, "learning_rate": 3.077298004163865e-05, "loss": 0.67533493, "memory(GiB)": 67.98, "step": 4685, "train_speed(iter/s)": 0.039677 }, { "acc": 0.80715237, "epoch": 3.2255845942228336, "grad_norm": 1.6797436475753784, "learning_rate": 3.066806468426561e-05, "loss": 0.64756646, "memory(GiB)": 67.98, "step": 4690, "train_speed(iter/s)": 0.039699 }, { "acc": 0.79631739, "epoch": 3.2290233837689133, "grad_norm": 1.6722263097763062, "learning_rate": 3.056324936698014e-05, "loss": 0.68136206, "memory(GiB)": 67.98, "step": 4695, "train_speed(iter/s)": 0.03972 }, { "acc": 0.81018467, "epoch": 3.232462173314993, "grad_norm": 1.9351452589035034, "learning_rate": 3.0458534632046766e-05, "loss": 0.63391657, "memory(GiB)": 67.98, "step": 4700, "train_speed(iter/s)": 0.039741 }, { "epoch": 3.232462173314993, "eval_acc": 0.7737605164889548, "eval_loss": 0.806867241859436, "eval_runtime": 1123.2355, "eval_samples_per_second": 3.813, "eval_steps_per_second": 0.069, "step": 4700 }, { "acc": 0.81098757, "epoch": 3.235900962861073, "grad_norm": 1.8516818284988403, "learning_rate": 3.0353921021209598e-05, "loss": 0.63078384, "memory(GiB)": 67.98, "step": 4705, "train_speed(iter/s)": 0.039386 }, { "acc": 0.80097027, "epoch": 3.2393397524071528, "grad_norm": 1.9018975496292114, "learning_rate": 3.02494090756896e-05, "loss": 0.67307758, "memory(GiB)": 67.98, "step": 4710, "train_speed(iter/s)": 0.039403 }, { "acc": 0.78721581, "epoch": 3.2427785419532325, "grad_norm": 1.8945331573486328, "learning_rate": 3.014499933618176e-05, "loss": 0.71489978, "memory(GiB)": 67.98, "step": 4715, "train_speed(iter/s)": 0.039419 }, { "acc": 0.81447954, "epoch": 3.246217331499312, "grad_norm": 1.873476505279541, "learning_rate": 3.004069234285235e-05, "loss": 0.62558355, "memory(GiB)": 67.98, "step": 4720, "train_speed(iter/s)": 0.039438 }, { "acc": 0.80101833, "epoch": 3.2496561210453923, "grad_norm": 1.9191193580627441, "learning_rate": 2.993648863533602e-05, "loss": 0.65777245, "memory(GiB)": 67.98, "step": 4725, "train_speed(iter/s)": 0.039458 }, { "acc": 0.81086941, "epoch": 3.253094910591472, "grad_norm": 1.973708987236023, "learning_rate": 2.983238875273308e-05, "loss": 0.63210435, "memory(GiB)": 67.98, "step": 4730, "train_speed(iter/s)": 0.039477 }, { "acc": 0.80301018, "epoch": 3.2565337001375516, "grad_norm": 1.7471644878387451, "learning_rate": 2.9728393233606715e-05, "loss": 0.66623907, "memory(GiB)": 67.98, "step": 4735, "train_speed(iter/s)": 0.039495 }, { "acc": 0.80888157, "epoch": 3.2599724896836313, "grad_norm": 1.7000857591629028, "learning_rate": 2.9624502615980177e-05, "loss": 0.64117575, "memory(GiB)": 67.98, "step": 4740, "train_speed(iter/s)": 0.039513 }, { "acc": 0.81031885, "epoch": 3.263411279229711, "grad_norm": 1.760911226272583, "learning_rate": 2.9520717437334024e-05, "loss": 0.65109177, "memory(GiB)": 67.98, "step": 4745, "train_speed(iter/s)": 0.039529 }, { "acc": 0.81288662, "epoch": 3.266850068775791, "grad_norm": 1.7138432264328003, "learning_rate": 2.941703823460329e-05, "loss": 0.62872763, "memory(GiB)": 67.98, "step": 4750, "train_speed(iter/s)": 0.039549 }, { "acc": 0.80744476, "epoch": 3.2702888583218708, "grad_norm": 1.9831231832504272, "learning_rate": 2.9313465544174756e-05, "loss": 0.63904066, "memory(GiB)": 67.98, "step": 4755, "train_speed(iter/s)": 0.039566 }, { "acc": 0.80821819, "epoch": 3.2737276478679505, "grad_norm": 1.79635488986969, "learning_rate": 2.9209999901884165e-05, "loss": 0.64807615, "memory(GiB)": 67.98, "step": 4760, "train_speed(iter/s)": 0.039585 }, { "acc": 0.81542759, "epoch": 3.27716643741403, "grad_norm": 1.7462048530578613, "learning_rate": 2.910664184301346e-05, "loss": 0.61296053, "memory(GiB)": 67.98, "step": 4765, "train_speed(iter/s)": 0.039602 }, { "acc": 0.81904421, "epoch": 3.28060522696011, "grad_norm": 1.6204197406768799, "learning_rate": 2.900339190228796e-05, "loss": 0.60652199, "memory(GiB)": 67.98, "step": 4770, "train_speed(iter/s)": 0.039622 }, { "acc": 0.8142024, "epoch": 3.28404401650619, "grad_norm": 2.0843801498413086, "learning_rate": 2.890025061387362e-05, "loss": 0.61951303, "memory(GiB)": 67.98, "step": 4775, "train_speed(iter/s)": 0.039638 }, { "acc": 0.80437889, "epoch": 3.2874828060522696, "grad_norm": 1.9848445653915405, "learning_rate": 2.879721851137438e-05, "loss": 0.65048337, "memory(GiB)": 67.98, "step": 4780, "train_speed(iter/s)": 0.039655 }, { "acc": 0.8088932, "epoch": 3.2909215955983493, "grad_norm": 1.7368524074554443, "learning_rate": 2.8694296127829177e-05, "loss": 0.64408207, "memory(GiB)": 67.98, "step": 4785, "train_speed(iter/s)": 0.039674 }, { "acc": 0.79394779, "epoch": 3.294360385144429, "grad_norm": 1.72417414188385, "learning_rate": 2.8591483995709407e-05, "loss": 0.68265638, "memory(GiB)": 67.98, "step": 4790, "train_speed(iter/s)": 0.039689 }, { "acc": 0.81347179, "epoch": 3.297799174690509, "grad_norm": 1.7844178676605225, "learning_rate": 2.8488782646916024e-05, "loss": 0.61397967, "memory(GiB)": 67.98, "step": 4795, "train_speed(iter/s)": 0.039709 }, { "acc": 0.80788279, "epoch": 3.3012379642365888, "grad_norm": 1.7968957424163818, "learning_rate": 2.838619261277686e-05, "loss": 0.64608054, "memory(GiB)": 67.98, "step": 4800, "train_speed(iter/s)": 0.039729 }, { "epoch": 3.3012379642365888, "eval_acc": 0.7751046025104602, "eval_loss": 0.804237425327301, "eval_runtime": 1089.1926, "eval_samples_per_second": 3.932, "eval_steps_per_second": 0.071, "step": 4800 }, { "acc": 0.81613159, "epoch": 3.3046767537826685, "grad_norm": 1.6313848495483398, "learning_rate": 2.828371442404386e-05, "loss": 0.62472601, "memory(GiB)": 67.98, "step": 4805, "train_speed(iter/s)": 0.039393 }, { "acc": 0.81500292, "epoch": 3.308115543328748, "grad_norm": 1.8520140647888184, "learning_rate": 2.8181348610890345e-05, "loss": 0.62366076, "memory(GiB)": 67.98, "step": 4810, "train_speed(iter/s)": 0.039414 }, { "acc": 0.79354863, "epoch": 3.3115543328748283, "grad_norm": 1.8981624841690063, "learning_rate": 2.8079095702908214e-05, "loss": 0.69254041, "memory(GiB)": 67.98, "step": 4815, "train_speed(iter/s)": 0.03943 }, { "acc": 0.8094223, "epoch": 3.314993122420908, "grad_norm": 1.9359115362167358, "learning_rate": 2.7976956229105322e-05, "loss": 0.64053526, "memory(GiB)": 67.98, "step": 4820, "train_speed(iter/s)": 0.039449 }, { "acc": 0.79914575, "epoch": 3.3184319119669876, "grad_norm": 1.8818870782852173, "learning_rate": 2.7874930717902603e-05, "loss": 0.68240814, "memory(GiB)": 67.98, "step": 4825, "train_speed(iter/s)": 0.039467 }, { "acc": 0.80387897, "epoch": 3.3218707015130673, "grad_norm": 1.9390044212341309, "learning_rate": 2.7773019697131435e-05, "loss": 0.65107994, "memory(GiB)": 67.98, "step": 4830, "train_speed(iter/s)": 0.039486 }, { "acc": 0.81106586, "epoch": 3.325309491059147, "grad_norm": 1.6695841550827026, "learning_rate": 2.767122369403088e-05, "loss": 0.63033338, "memory(GiB)": 67.98, "step": 4835, "train_speed(iter/s)": 0.039505 }, { "acc": 0.82067537, "epoch": 3.328748280605227, "grad_norm": 1.7732053995132446, "learning_rate": 2.756954323524491e-05, "loss": 0.61327543, "memory(GiB)": 67.98, "step": 4840, "train_speed(iter/s)": 0.039521 }, { "acc": 0.80741024, "epoch": 3.3321870701513068, "grad_norm": 1.7796927690505981, "learning_rate": 2.7467978846819775e-05, "loss": 0.63265486, "memory(GiB)": 67.98, "step": 4845, "train_speed(iter/s)": 0.039541 }, { "acc": 0.80555611, "epoch": 3.3356258596973865, "grad_norm": 1.739590048789978, "learning_rate": 2.7366531054201243e-05, "loss": 0.64431, "memory(GiB)": 67.98, "step": 4850, "train_speed(iter/s)": 0.03956 }, { "acc": 0.79774094, "epoch": 3.339064649243466, "grad_norm": 1.9023163318634033, "learning_rate": 2.726520038223182e-05, "loss": 0.68374538, "memory(GiB)": 67.98, "step": 4855, "train_speed(iter/s)": 0.03958 }, { "acc": 0.81651649, "epoch": 3.3425034387895463, "grad_norm": 1.862848162651062, "learning_rate": 2.716398735514812e-05, "loss": 0.62106805, "memory(GiB)": 67.98, "step": 4860, "train_speed(iter/s)": 0.039595 }, { "acc": 0.8125948, "epoch": 3.345942228335626, "grad_norm": 1.7548292875289917, "learning_rate": 2.7062892496578096e-05, "loss": 0.62365727, "memory(GiB)": 67.98, "step": 4865, "train_speed(iter/s)": 0.039615 }, { "acc": 0.81203623, "epoch": 3.3493810178817056, "grad_norm": 1.8868883848190308, "learning_rate": 2.696191632953835e-05, "loss": 0.63214188, "memory(GiB)": 67.98, "step": 4870, "train_speed(iter/s)": 0.039636 }, { "acc": 0.7978539, "epoch": 3.3528198074277853, "grad_norm": 1.854641318321228, "learning_rate": 2.6861059376431485e-05, "loss": 0.66800289, "memory(GiB)": 67.98, "step": 4875, "train_speed(iter/s)": 0.039653 }, { "acc": 0.80050983, "epoch": 3.3562585969738654, "grad_norm": 2.1327366828918457, "learning_rate": 2.6760322159043293e-05, "loss": 0.68278418, "memory(GiB)": 67.98, "step": 4880, "train_speed(iter/s)": 0.039669 }, { "acc": 0.80420437, "epoch": 3.359697386519945, "grad_norm": 1.9406790733337402, "learning_rate": 2.6659705198540137e-05, "loss": 0.65569339, "memory(GiB)": 67.98, "step": 4885, "train_speed(iter/s)": 0.039687 }, { "acc": 0.81132812, "epoch": 3.3631361760660248, "grad_norm": 2.0002591609954834, "learning_rate": 2.6559209015466198e-05, "loss": 0.64171629, "memory(GiB)": 67.98, "step": 4890, "train_speed(iter/s)": 0.039704 }, { "acc": 0.80664577, "epoch": 3.3665749656121045, "grad_norm": 1.8464481830596924, "learning_rate": 2.6458834129740834e-05, "loss": 0.63870592, "memory(GiB)": 67.98, "step": 4895, "train_speed(iter/s)": 0.039724 }, { "acc": 0.80081406, "epoch": 3.370013755158184, "grad_norm": 1.992497444152832, "learning_rate": 2.635858106065588e-05, "loss": 0.67669377, "memory(GiB)": 67.98, "step": 4900, "train_speed(iter/s)": 0.039739 }, { "epoch": 3.370013755158184, "eval_acc": 0.775284563818779, "eval_loss": 0.8015691637992859, "eval_runtime": 1150.5731, "eval_samples_per_second": 3.722, "eval_steps_per_second": 0.067, "step": 4900 }, { "acc": 0.80654058, "epoch": 3.3734525447042643, "grad_norm": 1.6790952682495117, "learning_rate": 2.625845032687293e-05, "loss": 0.66655011, "memory(GiB)": 67.98, "step": 4905, "train_speed(iter/s)": 0.03939 }, { "acc": 0.81852398, "epoch": 3.376891334250344, "grad_norm": 1.7393443584442139, "learning_rate": 2.6158442446420673e-05, "loss": 0.61265764, "memory(GiB)": 67.98, "step": 4910, "train_speed(iter/s)": 0.039408 }, { "acc": 0.80897388, "epoch": 3.3803301237964236, "grad_norm": 1.757190465927124, "learning_rate": 2.605855793669223e-05, "loss": 0.63301859, "memory(GiB)": 67.98, "step": 4915, "train_speed(iter/s)": 0.039424 }, { "acc": 0.81477318, "epoch": 3.3837689133425033, "grad_norm": 1.7762666940689087, "learning_rate": 2.595879731444242e-05, "loss": 0.63501825, "memory(GiB)": 67.98, "step": 4920, "train_speed(iter/s)": 0.039441 }, { "acc": 0.80826883, "epoch": 3.387207702888583, "grad_norm": 1.8915072679519653, "learning_rate": 2.5859161095785204e-05, "loss": 0.64570541, "memory(GiB)": 67.98, "step": 4925, "train_speed(iter/s)": 0.039458 }, { "acc": 0.80283833, "epoch": 3.390646492434663, "grad_norm": 1.8202823400497437, "learning_rate": 2.5759649796190873e-05, "loss": 0.65588207, "memory(GiB)": 67.98, "step": 4930, "train_speed(iter/s)": 0.039476 }, { "acc": 0.80436974, "epoch": 3.3940852819807428, "grad_norm": 1.8674787282943726, "learning_rate": 2.5660263930483468e-05, "loss": 0.63378534, "memory(GiB)": 67.98, "step": 4935, "train_speed(iter/s)": 0.039489 }, { "acc": 0.80554743, "epoch": 3.3975240715268225, "grad_norm": 1.7539056539535522, "learning_rate": 2.5561004012838067e-05, "loss": 0.65574193, "memory(GiB)": 67.98, "step": 4940, "train_speed(iter/s)": 0.039506 }, { "acc": 0.81471024, "epoch": 3.4009628610729026, "grad_norm": 1.9018100500106812, "learning_rate": 2.5461870556778218e-05, "loss": 0.61126738, "memory(GiB)": 67.98, "step": 4945, "train_speed(iter/s)": 0.039526 }, { "acc": 0.80630493, "epoch": 3.4044016506189823, "grad_norm": 1.7900938987731934, "learning_rate": 2.5362864075173153e-05, "loss": 0.63573794, "memory(GiB)": 67.98, "step": 4950, "train_speed(iter/s)": 0.039543 }, { "acc": 0.80556068, "epoch": 3.407840440165062, "grad_norm": 2.0504183769226074, "learning_rate": 2.526398508023523e-05, "loss": 0.6546957, "memory(GiB)": 67.98, "step": 4955, "train_speed(iter/s)": 0.039561 }, { "acc": 0.80205326, "epoch": 3.4112792297111416, "grad_norm": 1.9150274991989136, "learning_rate": 2.5165234083517246e-05, "loss": 0.64255061, "memory(GiB)": 67.98, "step": 4960, "train_speed(iter/s)": 0.039575 }, { "acc": 0.80601921, "epoch": 3.4147180192572213, "grad_norm": 1.8331859111785889, "learning_rate": 2.5066611595909784e-05, "loss": 0.64326835, "memory(GiB)": 67.98, "step": 4965, "train_speed(iter/s)": 0.039591 }, { "acc": 0.80851765, "epoch": 3.4181568088033014, "grad_norm": 1.8799371719360352, "learning_rate": 2.49681181276386e-05, "loss": 0.63813715, "memory(GiB)": 67.98, "step": 4970, "train_speed(iter/s)": 0.039605 }, { "acc": 0.80517483, "epoch": 3.421595598349381, "grad_norm": 1.8553872108459473, "learning_rate": 2.486975418826196e-05, "loss": 0.66684914, "memory(GiB)": 67.98, "step": 4975, "train_speed(iter/s)": 0.039623 }, { "acc": 0.82046995, "epoch": 3.4250343878954608, "grad_norm": 1.695779800415039, "learning_rate": 2.477152028666798e-05, "loss": 0.60830936, "memory(GiB)": 67.98, "step": 4980, "train_speed(iter/s)": 0.039643 }, { "acc": 0.7990911, "epoch": 3.4284731774415405, "grad_norm": 1.7533307075500488, "learning_rate": 2.4673416931072094e-05, "loss": 0.67933016, "memory(GiB)": 67.98, "step": 4985, "train_speed(iter/s)": 0.039658 }, { "acc": 0.80797586, "epoch": 3.43191196698762, "grad_norm": 2.2120864391326904, "learning_rate": 2.4575444629014292e-05, "loss": 0.65290236, "memory(GiB)": 67.98, "step": 4990, "train_speed(iter/s)": 0.039679 }, { "acc": 0.80715389, "epoch": 3.4353507565337003, "grad_norm": 1.7007701396942139, "learning_rate": 2.447760388735657e-05, "loss": 0.64799299, "memory(GiB)": 67.98, "step": 4995, "train_speed(iter/s)": 0.039694 }, { "acc": 0.80209885, "epoch": 3.43878954607978, "grad_norm": 2.1484506130218506, "learning_rate": 2.4379895212280297e-05, "loss": 0.6714016, "memory(GiB)": 67.98, "step": 5000, "train_speed(iter/s)": 0.039712 }, { "epoch": 3.43878954607978, "eval_acc": 0.7760493993791335, "eval_loss": 0.7988596558570862, "eval_runtime": 1141.4518, "eval_samples_per_second": 3.752, "eval_steps_per_second": 0.067, "step": 5000 }, { "acc": 0.80421772, "epoch": 3.4422283356258596, "grad_norm": 1.683592438697815, "learning_rate": 2.428231910928358e-05, "loss": 0.65520515, "memory(GiB)": 67.98, "step": 5005, "train_speed(iter/s)": 0.039374 }, { "acc": 0.80245571, "epoch": 3.4456671251719393, "grad_norm": 1.8841793537139893, "learning_rate": 2.418487608317867e-05, "loss": 0.67175484, "memory(GiB)": 67.98, "step": 5010, "train_speed(iter/s)": 0.039392 }, { "acc": 0.8125226, "epoch": 3.4491059147180194, "grad_norm": 1.949098825454712, "learning_rate": 2.408756663808937e-05, "loss": 0.61799521, "memory(GiB)": 67.98, "step": 5015, "train_speed(iter/s)": 0.039412 }, { "acc": 0.81072598, "epoch": 3.452544704264099, "grad_norm": 1.7873549461364746, "learning_rate": 2.399039127744836e-05, "loss": 0.64322014, "memory(GiB)": 67.98, "step": 5020, "train_speed(iter/s)": 0.039425 }, { "acc": 0.80515785, "epoch": 3.4559834938101788, "grad_norm": 2.0002734661102295, "learning_rate": 2.389335050399464e-05, "loss": 0.6395524, "memory(GiB)": 67.98, "step": 5025, "train_speed(iter/s)": 0.039445 }, { "acc": 0.80893536, "epoch": 3.4594222833562585, "grad_norm": 1.9517066478729248, "learning_rate": 2.3796444819770926e-05, "loss": 0.63445306, "memory(GiB)": 67.98, "step": 5030, "train_speed(iter/s)": 0.03946 }, { "acc": 0.79918771, "epoch": 3.4628610729023386, "grad_norm": 1.7724376916885376, "learning_rate": 2.3699674726121022e-05, "loss": 0.68629122, "memory(GiB)": 67.98, "step": 5035, "train_speed(iter/s)": 0.039476 }, { "acc": 0.80571623, "epoch": 3.4662998624484183, "grad_norm": 1.769455075263977, "learning_rate": 2.3603040723687315e-05, "loss": 0.65023713, "memory(GiB)": 67.98, "step": 5040, "train_speed(iter/s)": 0.039495 }, { "acc": 0.81652203, "epoch": 3.469738651994498, "grad_norm": 2.4029428958892822, "learning_rate": 2.3506543312408055e-05, "loss": 0.62751317, "memory(GiB)": 67.98, "step": 5045, "train_speed(iter/s)": 0.039514 }, { "acc": 0.8143034, "epoch": 3.4731774415405776, "grad_norm": 1.7803950309753418, "learning_rate": 2.3410182991514863e-05, "loss": 0.62447834, "memory(GiB)": 67.98, "step": 5050, "train_speed(iter/s)": 0.039532 }, { "acc": 0.81082478, "epoch": 3.4766162310866573, "grad_norm": 1.8696342706680298, "learning_rate": 2.3313960259530114e-05, "loss": 0.63704772, "memory(GiB)": 67.98, "step": 5055, "train_speed(iter/s)": 0.039552 }, { "acc": 0.81630154, "epoch": 3.4800550206327374, "grad_norm": 1.9919400215148926, "learning_rate": 2.321787561426436e-05, "loss": 0.61488199, "memory(GiB)": 67.98, "step": 5060, "train_speed(iter/s)": 0.039568 }, { "acc": 0.81280794, "epoch": 3.483493810178817, "grad_norm": 1.9915574789047241, "learning_rate": 2.3121929552813775e-05, "loss": 0.62114315, "memory(GiB)": 67.98, "step": 5065, "train_speed(iter/s)": 0.039585 }, { "acc": 0.80458755, "epoch": 3.4869325997248968, "grad_norm": 1.9132686853408813, "learning_rate": 2.302612257155754e-05, "loss": 0.63852549, "memory(GiB)": 67.98, "step": 5070, "train_speed(iter/s)": 0.039601 }, { "acc": 0.80228262, "epoch": 3.4903713892709765, "grad_norm": 1.638962745666504, "learning_rate": 2.2930455166155325e-05, "loss": 0.65759382, "memory(GiB)": 67.98, "step": 5075, "train_speed(iter/s)": 0.039619 }, { "acc": 0.81052542, "epoch": 3.4938101788170566, "grad_norm": 2.4375152587890625, "learning_rate": 2.2834927831544663e-05, "loss": 0.62842712, "memory(GiB)": 67.98, "step": 5080, "train_speed(iter/s)": 0.039637 }, { "acc": 0.81806412, "epoch": 3.4972489683631363, "grad_norm": 1.8711788654327393, "learning_rate": 2.273954106193851e-05, "loss": 0.59915447, "memory(GiB)": 67.98, "step": 5085, "train_speed(iter/s)": 0.039654 }, { "acc": 0.80885086, "epoch": 3.500687757909216, "grad_norm": 1.7943886518478394, "learning_rate": 2.2644295350822523e-05, "loss": 0.64677639, "memory(GiB)": 67.98, "step": 5090, "train_speed(iter/s)": 0.03967 }, { "acc": 0.80517883, "epoch": 3.5041265474552956, "grad_norm": 1.9428882598876953, "learning_rate": 2.2549191190952614e-05, "loss": 0.64541783, "memory(GiB)": 67.98, "step": 5095, "train_speed(iter/s)": 0.039687 }, { "acc": 0.82121677, "epoch": 3.5075653370013757, "grad_norm": 2.129689931869507, "learning_rate": 2.245422907435237e-05, "loss": 0.59930925, "memory(GiB)": 67.98, "step": 5100, "train_speed(iter/s)": 0.039705 }, { "epoch": 3.5075653370013757, "eval_acc": 0.7766230260493994, "eval_loss": 0.7989464998245239, "eval_runtime": 1103.0791, "eval_samples_per_second": 3.883, "eval_steps_per_second": 0.07, "step": 5100 }, { "acc": 0.80364723, "epoch": 3.5110041265474554, "grad_norm": 1.949704885482788, "learning_rate": 2.2359409492310554e-05, "loss": 0.65982656, "memory(GiB)": 72.17, "step": 5105, "train_speed(iter/s)": 45.411016 }, { "acc": 0.81052856, "epoch": 3.514442916093535, "grad_norm": 1.766641616821289, "learning_rate": 2.2264732935378485e-05, "loss": 0.62573719, "memory(GiB)": 72.17, "step": 5110, "train_speed(iter/s)": 26.201936 }, { "acc": 0.81840916, "epoch": 3.5178817056396148, "grad_norm": 2.0052237510681152, "learning_rate": 2.217019989336754e-05, "loss": 0.60661297, "memory(GiB)": 72.17, "step": 5115, "train_speed(iter/s)": 19.601314 }, { "acc": 0.81169033, "epoch": 3.5213204951856945, "grad_norm": 1.8747566938400269, "learning_rate": 2.2075810855346627e-05, "loss": 0.6164432, "memory(GiB)": 72.17, "step": 5120, "train_speed(iter/s)": 15.593037 }, { "acc": 0.81197557, "epoch": 3.5247592847317746, "grad_norm": 1.8955270051956177, "learning_rate": 2.1981566309639646e-05, "loss": 0.63830528, "memory(GiB)": 72.17, "step": 5125, "train_speed(iter/s)": 13.017298 }, { "acc": 0.80265837, "epoch": 3.5281980742778543, "grad_norm": 1.9690247774124146, "learning_rate": 2.1887466743822955e-05, "loss": 0.66069555, "memory(GiB)": 72.17, "step": 5130, "train_speed(iter/s)": 10.901481 }, { "acc": 0.80450611, "epoch": 3.531636863823934, "grad_norm": 1.9993948936462402, "learning_rate": 2.1793512644722865e-05, "loss": 0.66204972, "memory(GiB)": 72.23, "step": 5135, "train_speed(iter/s)": 9.338386 }, { "acc": 0.80088081, "epoch": 3.5350756533700136, "grad_norm": 2.0130441188812256, "learning_rate": 2.1699704498413108e-05, "loss": 0.67445641, "memory(GiB)": 72.23, "step": 5140, "train_speed(iter/s)": 8.21847 }, { "acc": 0.80354471, "epoch": 3.5385144429160933, "grad_norm": 1.8594011068344116, "learning_rate": 2.1606042790212308e-05, "loss": 0.6569746, "memory(GiB)": 72.28, "step": 5145, "train_speed(iter/s)": 7.396472 }, { "acc": 0.817062, "epoch": 3.5419532324621734, "grad_norm": 1.8149155378341675, "learning_rate": 2.1512528004681535e-05, "loss": 0.63296041, "memory(GiB)": 72.7, "step": 5150, "train_speed(iter/s)": 6.704915 }, { "acc": 0.81256504, "epoch": 3.545392022008253, "grad_norm": 1.9153436422348022, "learning_rate": 2.1419160625621713e-05, "loss": 0.6270606, "memory(GiB)": 72.7, "step": 5155, "train_speed(iter/s)": 6.15874 }, { "acc": 0.80941086, "epoch": 3.5488308115543328, "grad_norm": 1.6500003337860107, "learning_rate": 2.1325941136071155e-05, "loss": 0.63835382, "memory(GiB)": 72.7, "step": 5160, "train_speed(iter/s)": 5.671284 }, { "acc": 0.81136417, "epoch": 3.552269601100413, "grad_norm": 1.8858124017715454, "learning_rate": 2.1232870018303073e-05, "loss": 0.63752775, "memory(GiB)": 72.7, "step": 5165, "train_speed(iter/s)": 5.219987 }, { "acc": 0.80491219, "epoch": 3.5557083906464926, "grad_norm": 1.6921783685684204, "learning_rate": 2.1139947753823062e-05, "loss": 0.64572196, "memory(GiB)": 72.7, "step": 5170, "train_speed(iter/s)": 4.872821 }, { "acc": 0.81005411, "epoch": 3.5591471801925723, "grad_norm": 2.046410322189331, "learning_rate": 2.104717482336666e-05, "loss": 0.63220901, "memory(GiB)": 72.7, "step": 5175, "train_speed(iter/s)": 4.560666 }, { "acc": 0.81424847, "epoch": 3.562585969738652, "grad_norm": 1.6710875034332275, "learning_rate": 2.095455170689679e-05, "loss": 0.61837616, "memory(GiB)": 72.7, "step": 5180, "train_speed(iter/s)": 4.309485 }, { "acc": 0.80108767, "epoch": 3.5660247592847316, "grad_norm": 1.8342450857162476, "learning_rate": 2.0862078883601306e-05, "loss": 0.65744696, "memory(GiB)": 72.7, "step": 5185, "train_speed(iter/s)": 4.065786 }, { "acc": 0.81536474, "epoch": 3.5694635488308117, "grad_norm": 1.8400901556015015, "learning_rate": 2.0769756831890517e-05, "loss": 0.60680361, "memory(GiB)": 72.7, "step": 5190, "train_speed(iter/s)": 3.855262 }, { "acc": 0.80920811, "epoch": 3.5729023383768914, "grad_norm": 2.1009435653686523, "learning_rate": 2.067758602939473e-05, "loss": 0.617168, "memory(GiB)": 72.7, "step": 5195, "train_speed(iter/s)": 3.680601 }, { "acc": 0.81185656, "epoch": 3.576341127922971, "grad_norm": 1.8651849031448364, "learning_rate": 2.058556695296173e-05, "loss": 0.62884312, "memory(GiB)": 72.7, "step": 5200, "train_speed(iter/s)": 3.496681 }, { "epoch": 3.576341127922971, "eval_acc": 0.7778602600440905, "eval_loss": 0.7939268350601196, "eval_runtime": 1134.0299, "eval_samples_per_second": 3.777, "eval_steps_per_second": 0.068, "step": 5200 }, { "acc": 0.80170975, "epoch": 3.5797799174690508, "grad_norm": 1.989461064338684, "learning_rate": 2.0493700078654395e-05, "loss": 0.65876365, "memory(GiB)": 72.7, "step": 5205, "train_speed(iter/s)": 1.934041 }, { "acc": 0.81692247, "epoch": 3.5832187070151305, "grad_norm": 1.9536714553833008, "learning_rate": 2.040198588174813e-05, "loss": 0.60520372, "memory(GiB)": 67.62, "step": 5210, "train_speed(iter/s)": 1.887787 }, { "acc": 0.81152821, "epoch": 3.5866574965612106, "grad_norm": 1.9955531358718872, "learning_rate": 2.0310424836728494e-05, "loss": 0.64239225, "memory(GiB)": 67.62, "step": 5215, "train_speed(iter/s)": 1.844883 }, { "acc": 0.81309061, "epoch": 3.5900962861072903, "grad_norm": 1.7581534385681152, "learning_rate": 2.0219017417288675e-05, "loss": 0.62655144, "memory(GiB)": 67.62, "step": 5220, "train_speed(iter/s)": 1.804023 }, { "acc": 0.79863563, "epoch": 3.59353507565337, "grad_norm": 2.349116086959839, "learning_rate": 2.0127764096327113e-05, "loss": 0.6843668, "memory(GiB)": 67.62, "step": 5225, "train_speed(iter/s)": 1.756569 }, { "acc": 0.82426891, "epoch": 3.59697386519945, "grad_norm": 1.8200994729995728, "learning_rate": 2.0036665345945005e-05, "loss": 0.57460217, "memory(GiB)": 67.62, "step": 5230, "train_speed(iter/s)": 1.719852 }, { "acc": 0.81199923, "epoch": 3.6004126547455297, "grad_norm": 1.760864019393921, "learning_rate": 1.9945721637443855e-05, "loss": 0.63763566, "memory(GiB)": 67.62, "step": 5235, "train_speed(iter/s)": 1.68241 }, { "acc": 0.81704388, "epoch": 3.6038514442916094, "grad_norm": 1.843873143196106, "learning_rate": 1.9854933441323074e-05, "loss": 0.61490622, "memory(GiB)": 67.62, "step": 5240, "train_speed(iter/s)": 1.645642 }, { "acc": 0.80201912, "epoch": 3.607290233837689, "grad_norm": 1.963784098625183, "learning_rate": 1.9764301227277503e-05, "loss": 0.64649305, "memory(GiB)": 67.62, "step": 5245, "train_speed(iter/s)": 1.609859 }, { "acc": 0.79966879, "epoch": 3.6107290233837688, "grad_norm": 2.0832812786102295, "learning_rate": 1.9673825464195065e-05, "loss": 0.68630571, "memory(GiB)": 67.62, "step": 5250, "train_speed(iter/s)": 1.577467 }, { "acc": 0.80603333, "epoch": 3.614167812929849, "grad_norm": 2.1369543075561523, "learning_rate": 1.9583506620154203e-05, "loss": 0.65753994, "memory(GiB)": 67.62, "step": 5255, "train_speed(iter/s)": 1.547258 }, { "acc": 0.80580025, "epoch": 3.6176066024759286, "grad_norm": 1.853987455368042, "learning_rate": 1.9493345162421595e-05, "loss": 0.65103807, "memory(GiB)": 67.62, "step": 5260, "train_speed(iter/s)": 1.514916 }, { "acc": 0.81311512, "epoch": 3.6210453920220083, "grad_norm": 2.1064698696136475, "learning_rate": 1.9403341557449614e-05, "loss": 0.61463804, "memory(GiB)": 67.62, "step": 5265, "train_speed(iter/s)": 1.48572 }, { "acc": 0.79921217, "epoch": 3.624484181568088, "grad_norm": 1.9443074464797974, "learning_rate": 1.9313496270874065e-05, "loss": 0.67477508, "memory(GiB)": 67.62, "step": 5270, "train_speed(iter/s)": 1.458339 }, { "acc": 0.81266232, "epoch": 3.6279229711141676, "grad_norm": 1.8594951629638672, "learning_rate": 1.9223809767511622e-05, "loss": 0.62132969, "memory(GiB)": 67.62, "step": 5275, "train_speed(iter/s)": 1.430819 }, { "acc": 0.81770267, "epoch": 3.6313617606602477, "grad_norm": 1.726508617401123, "learning_rate": 1.913428251135751e-05, "loss": 0.59776912, "memory(GiB)": 67.62, "step": 5280, "train_speed(iter/s)": 1.404985 }, { "acc": 0.81523685, "epoch": 3.6348005502063274, "grad_norm": 1.8356785774230957, "learning_rate": 1.904491496558308e-05, "loss": 0.62854185, "memory(GiB)": 67.62, "step": 5285, "train_speed(iter/s)": 1.381204 }, { "acc": 0.81304836, "epoch": 3.638239339752407, "grad_norm": 2.129279136657715, "learning_rate": 1.8955707592533422e-05, "loss": 0.62155433, "memory(GiB)": 67.62, "step": 5290, "train_speed(iter/s)": 1.359057 }, { "acc": 0.80884018, "epoch": 3.6416781292984868, "grad_norm": 1.8221231698989868, "learning_rate": 1.8866660853724986e-05, "loss": 0.63217707, "memory(GiB)": 67.62, "step": 5295, "train_speed(iter/s)": 1.33386 }, { "acc": 0.81211977, "epoch": 3.6451169188445665, "grad_norm": 1.8867233991622925, "learning_rate": 1.8777775209843136e-05, "loss": 0.62917542, "memory(GiB)": 67.62, "step": 5300, "train_speed(iter/s)": 1.313671 }, { "epoch": 3.6451169188445665, "eval_acc": 0.7788275520763036, "eval_loss": 0.791822075843811, "eval_runtime": 1053.4964, "eval_samples_per_second": 4.066, "eval_steps_per_second": 0.073, "step": 5300 }, { "acc": 0.81195221, "epoch": 3.6485557083906466, "grad_norm": 1.8668956756591797, "learning_rate": 1.868905112073983e-05, "loss": 0.63313217, "memory(GiB)": 67.62, "step": 5305, "train_speed(iter/s)": 1.028242 }, { "acc": 0.80295448, "epoch": 3.6519944979367263, "grad_norm": 1.8957765102386475, "learning_rate": 1.8600489045431255e-05, "loss": 0.6641448, "memory(GiB)": 67.62, "step": 5310, "train_speed(iter/s)": 1.011268 }, { "acc": 0.81735973, "epoch": 3.655433287482806, "grad_norm": 1.843002438545227, "learning_rate": 1.851208944209535e-05, "loss": 0.60693998, "memory(GiB)": 67.62, "step": 5315, "train_speed(iter/s)": 0.999134 }, { "acc": 0.80702572, "epoch": 3.658872077028886, "grad_norm": 1.8155903816223145, "learning_rate": 1.8423852768069548e-05, "loss": 0.65699286, "memory(GiB)": 67.62, "step": 5320, "train_speed(iter/s)": 0.984903 }, { "acc": 0.80685482, "epoch": 3.6623108665749657, "grad_norm": 2.5197625160217285, "learning_rate": 1.8335779479848343e-05, "loss": 0.64485803, "memory(GiB)": 67.62, "step": 5325, "train_speed(iter/s)": 0.97156 }, { "acc": 0.80205936, "epoch": 3.6657496561210454, "grad_norm": 2.0389351844787598, "learning_rate": 1.8247870033080946e-05, "loss": 0.66550064, "memory(GiB)": 67.62, "step": 5330, "train_speed(iter/s)": 0.958682 }, { "acc": 0.79223623, "epoch": 3.669188445667125, "grad_norm": 1.8488144874572754, "learning_rate": 1.8160124882568932e-05, "loss": 0.69218178, "memory(GiB)": 67.62, "step": 5335, "train_speed(iter/s)": 0.946767 }, { "acc": 0.80992165, "epoch": 3.6726272352132048, "grad_norm": 1.701180338859558, "learning_rate": 1.8072544482263918e-05, "loss": 0.63368897, "memory(GiB)": 67.62, "step": 5340, "train_speed(iter/s)": 0.934657 }, { "acc": 0.81273346, "epoch": 3.676066024759285, "grad_norm": 1.870936632156372, "learning_rate": 1.798512928526514e-05, "loss": 0.62342134, "memory(GiB)": 67.62, "step": 5345, "train_speed(iter/s)": 0.922237 }, { "acc": 0.81911898, "epoch": 3.6795048143053646, "grad_norm": 2.1875438690185547, "learning_rate": 1.789787974381717e-05, "loss": 0.60667896, "memory(GiB)": 67.62, "step": 5350, "train_speed(iter/s)": 0.910869 }, { "acc": 0.81839104, "epoch": 3.6829436038514443, "grad_norm": 1.9975168704986572, "learning_rate": 1.7810796309307553e-05, "loss": 0.61631479, "memory(GiB)": 67.62, "step": 5355, "train_speed(iter/s)": 0.898788 }, { "acc": 0.79996266, "epoch": 3.686382393397524, "grad_norm": 2.1356396675109863, "learning_rate": 1.7723879432264454e-05, "loss": 0.65718513, "memory(GiB)": 67.62, "step": 5360, "train_speed(iter/s)": 0.888306 }, { "acc": 0.81604223, "epoch": 3.6898211829436036, "grad_norm": 1.8731410503387451, "learning_rate": 1.763712956235441e-05, "loss": 0.62172794, "memory(GiB)": 67.62, "step": 5365, "train_speed(iter/s)": 0.877305 }, { "acc": 0.80391541, "epoch": 3.6932599724896837, "grad_norm": 2.0950632095336914, "learning_rate": 1.7550547148379887e-05, "loss": 0.66051216, "memory(GiB)": 67.62, "step": 5370, "train_speed(iter/s)": 0.867559 }, { "acc": 0.81647606, "epoch": 3.6966987620357634, "grad_norm": 1.7469427585601807, "learning_rate": 1.7464132638277024e-05, "loss": 0.61341143, "memory(GiB)": 67.62, "step": 5375, "train_speed(iter/s)": 0.85733 }, { "acc": 0.81001339, "epoch": 3.700137551581843, "grad_norm": 1.9832128286361694, "learning_rate": 1.737788647911332e-05, "loss": 0.63573427, "memory(GiB)": 67.62, "step": 5380, "train_speed(iter/s)": 0.846179 }, { "acc": 0.81297035, "epoch": 3.703576341127923, "grad_norm": 1.993898868560791, "learning_rate": 1.72918091170853e-05, "loss": 0.64141645, "memory(GiB)": 67.62, "step": 5385, "train_speed(iter/s)": 0.837314 }, { "acc": 0.81126728, "epoch": 3.707015130674003, "grad_norm": 1.8106107711791992, "learning_rate": 1.72059009975162e-05, "loss": 0.63114452, "memory(GiB)": 67.62, "step": 5390, "train_speed(iter/s)": 0.827419 }, { "acc": 0.80908537, "epoch": 3.7104539202200826, "grad_norm": 2.117880344390869, "learning_rate": 1.71201625648537e-05, "loss": 0.64524364, "memory(GiB)": 67.62, "step": 5395, "train_speed(iter/s)": 0.818052 }, { "acc": 0.80611687, "epoch": 3.7138927097661623, "grad_norm": 1.86283278465271, "learning_rate": 1.7034594262667588e-05, "loss": 0.65121384, "memory(GiB)": 67.62, "step": 5400, "train_speed(iter/s)": 0.809997 }, { "epoch": 3.7138927097661623, "eval_acc": 0.7794068025374544, "eval_loss": 0.7907042503356934, "eval_runtime": 1176.8109, "eval_samples_per_second": 3.639, "eval_steps_per_second": 0.065, "step": 5400 }, { "acc": 0.80929089, "epoch": 3.717331499312242, "grad_norm": 2.175724983215332, "learning_rate": 1.6949196533647456e-05, "loss": 0.63896065, "memory(GiB)": 67.62, "step": 5405, "train_speed(iter/s)": 0.682919 }, { "acc": 0.80504618, "epoch": 3.720770288858322, "grad_norm": 1.7912895679473877, "learning_rate": 1.6863969819600486e-05, "loss": 0.6515821, "memory(GiB)": 67.62, "step": 5410, "train_speed(iter/s)": 0.677418 }, { "acc": 0.82038784, "epoch": 3.7242090784044017, "grad_norm": 1.9168109893798828, "learning_rate": 1.6778914561449068e-05, "loss": 0.60445056, "memory(GiB)": 67.62, "step": 5415, "train_speed(iter/s)": 0.672424 }, { "acc": 0.80365715, "epoch": 3.7276478679504814, "grad_norm": 2.0032663345336914, "learning_rate": 1.669403119922857e-05, "loss": 0.65206861, "memory(GiB)": 67.62, "step": 5420, "train_speed(iter/s)": 0.666879 }, { "acc": 0.81271191, "epoch": 3.731086657496561, "grad_norm": 1.7971467971801758, "learning_rate": 1.660932017208504e-05, "loss": 0.63001757, "memory(GiB)": 67.62, "step": 5425, "train_speed(iter/s)": 0.661349 }, { "acc": 0.80627918, "epoch": 3.7345254470426408, "grad_norm": 2.1404869556427, "learning_rate": 1.6524781918272988e-05, "loss": 0.65701981, "memory(GiB)": 67.62, "step": 5430, "train_speed(iter/s)": 0.656674 }, { "acc": 0.81756916, "epoch": 3.737964236588721, "grad_norm": 1.9488438367843628, "learning_rate": 1.6440416875153035e-05, "loss": 0.62909493, "memory(GiB)": 67.62, "step": 5435, "train_speed(iter/s)": 0.651758 }, { "acc": 0.81080599, "epoch": 3.7414030261348006, "grad_norm": 1.9031460285186768, "learning_rate": 1.6356225479189706e-05, "loss": 0.64159656, "memory(GiB)": 67.62, "step": 5440, "train_speed(iter/s)": 0.647121 }, { "acc": 0.80497589, "epoch": 3.7448418156808803, "grad_norm": 1.9063955545425415, "learning_rate": 1.6272208165949165e-05, "loss": 0.66333132, "memory(GiB)": 67.62, "step": 5445, "train_speed(iter/s)": 0.641945 }, { "acc": 0.81537628, "epoch": 3.7482806052269604, "grad_norm": 1.9544923305511475, "learning_rate": 1.6188365370096938e-05, "loss": 0.60649881, "memory(GiB)": 67.62, "step": 5450, "train_speed(iter/s)": 0.637092 }, { "acc": 0.81485357, "epoch": 3.7517193947730396, "grad_norm": 1.7963929176330566, "learning_rate": 1.61046975253957e-05, "loss": 0.62127781, "memory(GiB)": 67.62, "step": 5455, "train_speed(iter/s)": 0.632455 }, { "acc": 0.80606298, "epoch": 3.7551581843191197, "grad_norm": 1.91194748878479, "learning_rate": 1.6021205064703e-05, "loss": 0.6456295, "memory(GiB)": 67.62, "step": 5460, "train_speed(iter/s)": 0.627638 }, { "acc": 0.80508499, "epoch": 3.7585969738651994, "grad_norm": 1.6945174932479858, "learning_rate": 1.593788841996904e-05, "loss": 0.64310069, "memory(GiB)": 67.62, "step": 5465, "train_speed(iter/s)": 0.622963 }, { "acc": 0.82610073, "epoch": 3.762035763411279, "grad_norm": 1.6806504726409912, "learning_rate": 1.5854748022234422e-05, "loss": 0.57846365, "memory(GiB)": 67.62, "step": 5470, "train_speed(iter/s)": 0.619076 }, { "acc": 0.80571289, "epoch": 3.7654745529573592, "grad_norm": 1.9896758794784546, "learning_rate": 1.5771784301627968e-05, "loss": 0.64995089, "memory(GiB)": 67.62, "step": 5475, "train_speed(iter/s)": 0.614402 }, { "acc": 0.81692181, "epoch": 3.768913342503439, "grad_norm": 1.7746247053146362, "learning_rate": 1.5688997687364408e-05, "loss": 0.61731248, "memory(GiB)": 67.62, "step": 5480, "train_speed(iter/s)": 0.610536 }, { "acc": 0.79758596, "epoch": 3.7723521320495186, "grad_norm": 1.9613304138183594, "learning_rate": 1.560638860774223e-05, "loss": 0.66896119, "memory(GiB)": 67.62, "step": 5485, "train_speed(iter/s)": 0.605861 }, { "acc": 0.80816298, "epoch": 3.7757909215955983, "grad_norm": 1.7979682683944702, "learning_rate": 1.552395749014145e-05, "loss": 0.64903908, "memory(GiB)": 67.62, "step": 5490, "train_speed(iter/s)": 0.601267 }, { "acc": 0.81845226, "epoch": 3.779229711141678, "grad_norm": 1.610510230064392, "learning_rate": 1.5441704761021365e-05, "loss": 0.61122522, "memory(GiB)": 67.62, "step": 5495, "train_speed(iter/s)": 0.59685 }, { "acc": 0.80607834, "epoch": 3.782668500687758, "grad_norm": 1.8088189363479614, "learning_rate": 1.535963084591842e-05, "loss": 0.6456028, "memory(GiB)": 67.62, "step": 5500, "train_speed(iter/s)": 0.59234 }, { "epoch": 3.782668500687758, "eval_acc": 0.7795867638457732, "eval_loss": 0.7851858735084534, "eval_runtime": 1107.2216, "eval_samples_per_second": 3.868, "eval_steps_per_second": 0.07, "step": 5500 }, { "acc": 0.80722027, "epoch": 3.7861072902338377, "grad_norm": 1.7930651903152466, "learning_rate": 1.527773616944393e-05, "loss": 0.65197091, "memory(GiB)": 67.62, "step": 5505, "train_speed(iter/s)": 0.525895 }, { "acc": 0.80907288, "epoch": 3.7895460797799174, "grad_norm": 1.873205542564392, "learning_rate": 1.519602115528191e-05, "loss": 0.63936815, "memory(GiB)": 67.62, "step": 5510, "train_speed(iter/s)": 0.522195 }, { "acc": 0.81462736, "epoch": 3.792984869325997, "grad_norm": 2.1219732761383057, "learning_rate": 1.5114486226186914e-05, "loss": 0.63517313, "memory(GiB)": 67.62, "step": 5515, "train_speed(iter/s)": 0.518863 }, { "acc": 0.81379719, "epoch": 3.796423658872077, "grad_norm": 1.8798179626464844, "learning_rate": 1.5033131803981795e-05, "loss": 0.6165091, "memory(GiB)": 67.62, "step": 5520, "train_speed(iter/s)": 0.516156 }, { "acc": 0.80504332, "epoch": 3.799862448418157, "grad_norm": 2.1897356510162354, "learning_rate": 1.495195830955555e-05, "loss": 0.65493903, "memory(GiB)": 67.62, "step": 5525, "train_speed(iter/s)": 0.512721 }, { "acc": 0.79971151, "epoch": 3.8033012379642366, "grad_norm": 2.3374557495117188, "learning_rate": 1.4870966162861185e-05, "loss": 0.66825953, "memory(GiB)": 67.62, "step": 5530, "train_speed(iter/s)": 0.509778 }, { "acc": 0.81023417, "epoch": 3.8067400275103163, "grad_norm": 2.0296730995178223, "learning_rate": 1.4790155782913446e-05, "loss": 0.6293088, "memory(GiB)": 67.62, "step": 5535, "train_speed(iter/s)": 0.506899 }, { "acc": 0.81744757, "epoch": 3.8101788170563964, "grad_norm": 2.1950666904449463, "learning_rate": 1.4709527587786729e-05, "loss": 0.60644913, "memory(GiB)": 67.62, "step": 5540, "train_speed(iter/s)": 0.504436 }, { "acc": 0.80740032, "epoch": 3.813617606602476, "grad_norm": 1.870073676109314, "learning_rate": 1.4629081994612883e-05, "loss": 0.65674248, "memory(GiB)": 67.62, "step": 5545, "train_speed(iter/s)": 0.501407 }, { "acc": 0.82541618, "epoch": 3.8170563961485557, "grad_norm": 1.814864993095398, "learning_rate": 1.4548819419579082e-05, "loss": 0.59056787, "memory(GiB)": 67.62, "step": 5550, "train_speed(iter/s)": 0.49869 }, { "acc": 0.79932752, "epoch": 3.8204951856946354, "grad_norm": 2.118622303009033, "learning_rate": 1.4468740277925627e-05, "loss": 0.67586517, "memory(GiB)": 67.62, "step": 5555, "train_speed(iter/s)": 0.496257 }, { "acc": 0.80807095, "epoch": 3.823933975240715, "grad_norm": 2.1060431003570557, "learning_rate": 1.4388844983943837e-05, "loss": 0.64639549, "memory(GiB)": 67.62, "step": 5560, "train_speed(iter/s)": 0.493446 }, { "acc": 0.80921745, "epoch": 3.8273727647867952, "grad_norm": 1.9443578720092773, "learning_rate": 1.430913395097388e-05, "loss": 0.63667898, "memory(GiB)": 67.62, "step": 5565, "train_speed(iter/s)": 0.490752 }, { "acc": 0.82145481, "epoch": 3.830811554332875, "grad_norm": 2.1125001907348633, "learning_rate": 1.4229607591402635e-05, "loss": 0.58884673, "memory(GiB)": 67.62, "step": 5570, "train_speed(iter/s)": 0.487797 }, { "acc": 0.82449379, "epoch": 3.8342503438789546, "grad_norm": 1.7490825653076172, "learning_rate": 1.4150266316661623e-05, "loss": 0.60519004, "memory(GiB)": 67.62, "step": 5575, "train_speed(iter/s)": 0.485346 }, { "acc": 0.81672592, "epoch": 3.8376891334250343, "grad_norm": 1.6957894563674927, "learning_rate": 1.407111053722477e-05, "loss": 0.61075163, "memory(GiB)": 67.62, "step": 5580, "train_speed(iter/s)": 0.482577 }, { "acc": 0.79658046, "epoch": 3.841127922971114, "grad_norm": 2.5719101428985596, "learning_rate": 1.3992140662606357e-05, "loss": 0.67197566, "memory(GiB)": 67.62, "step": 5585, "train_speed(iter/s)": 0.480134 }, { "acc": 0.82320576, "epoch": 3.844566712517194, "grad_norm": 2.0491995811462402, "learning_rate": 1.3913357101358865e-05, "loss": 0.59475327, "memory(GiB)": 67.62, "step": 5590, "train_speed(iter/s)": 0.477933 }, { "acc": 0.81174135, "epoch": 3.8480055020632737, "grad_norm": 1.8027801513671875, "learning_rate": 1.3834760261070908e-05, "loss": 0.63737803, "memory(GiB)": 67.62, "step": 5595, "train_speed(iter/s)": 0.475151 }, { "acc": 0.81277132, "epoch": 3.8514442916093534, "grad_norm": 2.0076797008514404, "learning_rate": 1.3756350548365069e-05, "loss": 0.64119816, "memory(GiB)": 67.62, "step": 5600, "train_speed(iter/s)": 0.472763 }, { "epoch": 3.8514442916093534, "eval_acc": 0.7802222522157736, "eval_loss": 0.7851279377937317, "eval_runtime": 1153.3756, "eval_samples_per_second": 3.713, "eval_steps_per_second": 0.067, "step": 5600 }, { "acc": 0.79907894, "epoch": 3.8548830811554335, "grad_norm": 2.066263437271118, "learning_rate": 1.3678128368895824e-05, "loss": 0.66954241, "memory(GiB)": 67.62, "step": 5605, "train_speed(iter/s)": 0.428917 }, { "acc": 0.81123543, "epoch": 3.8583218707015132, "grad_norm": 1.8212432861328125, "learning_rate": 1.3600094127347462e-05, "loss": 0.64494739, "memory(GiB)": 67.62, "step": 5610, "train_speed(iter/s)": 0.427046 }, { "acc": 0.8146841, "epoch": 3.861760660247593, "grad_norm": 1.99728524684906, "learning_rate": 1.3522248227431972e-05, "loss": 0.61559277, "memory(GiB)": 67.62, "step": 5615, "train_speed(iter/s)": 0.42489 }, { "acc": 0.8208971, "epoch": 3.8651994497936726, "grad_norm": 2.206382989883423, "learning_rate": 1.3444591071886931e-05, "loss": 0.61427069, "memory(GiB)": 67.62, "step": 5620, "train_speed(iter/s)": 0.423148 }, { "acc": 0.81454735, "epoch": 3.8686382393397523, "grad_norm": 1.812099575996399, "learning_rate": 1.3367123062473446e-05, "loss": 0.62899446, "memory(GiB)": 67.62, "step": 5625, "train_speed(iter/s)": 0.421338 }, { "acc": 0.81133175, "epoch": 3.8720770288858324, "grad_norm": 1.8373388051986694, "learning_rate": 1.328984459997408e-05, "loss": 0.63102517, "memory(GiB)": 67.62, "step": 5630, "train_speed(iter/s)": 0.419334 }, { "acc": 0.80838804, "epoch": 3.875515818431912, "grad_norm": 1.7026041746139526, "learning_rate": 1.3212756084190767e-05, "loss": 0.63373623, "memory(GiB)": 67.62, "step": 5635, "train_speed(iter/s)": 0.417529 }, { "acc": 0.81233072, "epoch": 3.8789546079779917, "grad_norm": 1.80439293384552, "learning_rate": 1.313585791394274e-05, "loss": 0.63350501, "memory(GiB)": 67.62, "step": 5640, "train_speed(iter/s)": 0.415882 }, { "acc": 0.80420494, "epoch": 3.8823933975240714, "grad_norm": 1.835792064666748, "learning_rate": 1.3059150487064497e-05, "loss": 0.64182324, "memory(GiB)": 67.62, "step": 5645, "train_speed(iter/s)": 0.413931 }, { "acc": 0.82478485, "epoch": 3.885832187070151, "grad_norm": 2.1019296646118164, "learning_rate": 1.2982634200403704e-05, "loss": 0.57977004, "memory(GiB)": 67.62, "step": 5650, "train_speed(iter/s)": 0.41225 }, { "acc": 0.82099762, "epoch": 3.8892709766162312, "grad_norm": 1.7367315292358398, "learning_rate": 1.2906309449819154e-05, "loss": 0.60107656, "memory(GiB)": 67.62, "step": 5655, "train_speed(iter/s)": 0.410363 }, { "acc": 0.81159325, "epoch": 3.892709766162311, "grad_norm": 2.0694830417633057, "learning_rate": 1.2830176630178729e-05, "loss": 0.61608582, "memory(GiB)": 67.62, "step": 5660, "train_speed(iter/s)": 0.408617 }, { "acc": 0.81860466, "epoch": 3.8961485557083906, "grad_norm": 1.9818027019500732, "learning_rate": 1.2754236135357367e-05, "loss": 0.60277052, "memory(GiB)": 67.62, "step": 5665, "train_speed(iter/s)": 0.40679 }, { "acc": 0.8118084, "epoch": 3.8995873452544703, "grad_norm": 1.893306016921997, "learning_rate": 1.2678488358234992e-05, "loss": 0.64575768, "memory(GiB)": 67.62, "step": 5670, "train_speed(iter/s)": 0.405098 }, { "acc": 0.80856295, "epoch": 3.90302613480055, "grad_norm": 1.9855684041976929, "learning_rate": 1.2602933690694502e-05, "loss": 0.65475564, "memory(GiB)": 67.62, "step": 5675, "train_speed(iter/s)": 0.403391 }, { "acc": 0.82089319, "epoch": 3.90646492434663, "grad_norm": 1.8527436256408691, "learning_rate": 1.2527572523619729e-05, "loss": 0.59858413, "memory(GiB)": 67.62, "step": 5680, "train_speed(iter/s)": 0.401656 }, { "acc": 0.81135502, "epoch": 3.9099037138927097, "grad_norm": 1.8112705945968628, "learning_rate": 1.245240524689345e-05, "loss": 0.640869, "memory(GiB)": 67.62, "step": 5685, "train_speed(iter/s)": 0.399915 }, { "acc": 0.81638031, "epoch": 3.9133425034387894, "grad_norm": 1.88164222240448, "learning_rate": 1.2377432249395323e-05, "loss": 0.62925024, "memory(GiB)": 67.62, "step": 5690, "train_speed(iter/s)": 0.398338 }, { "acc": 0.81665897, "epoch": 3.9167812929848695, "grad_norm": 2.2220370769500732, "learning_rate": 1.2302653918999902e-05, "loss": 0.61042566, "memory(GiB)": 67.62, "step": 5695, "train_speed(iter/s)": 0.396628 }, { "acc": 0.81146564, "epoch": 3.9202200825309492, "grad_norm": 2.2208054065704346, "learning_rate": 1.2228070642574637e-05, "loss": 0.62549958, "memory(GiB)": 67.62, "step": 5700, "train_speed(iter/s)": 0.394844 }, { "epoch": 3.9202200825309492, "eval_acc": 0.7811276825482522, "eval_loss": 0.783173680305481, "eval_runtime": 1145.264, "eval_samples_per_second": 3.74, "eval_steps_per_second": 0.067, "step": 5700 }, { "acc": 0.82111177, "epoch": 3.923658872077029, "grad_norm": 2.1147096157073975, "learning_rate": 1.2153682805977849e-05, "loss": 0.61029615, "memory(GiB)": 67.62, "step": 5705, "train_speed(iter/s)": 0.364527 }, { "acc": 0.81873646, "epoch": 3.9270976616231086, "grad_norm": 1.870378851890564, "learning_rate": 1.2079490794056745e-05, "loss": 0.60247025, "memory(GiB)": 67.62, "step": 5710, "train_speed(iter/s)": 0.363318 }, { "acc": 0.8050211, "epoch": 3.9305364511691883, "grad_norm": 2.061549663543701, "learning_rate": 1.2005494990645446e-05, "loss": 0.64639635, "memory(GiB)": 67.62, "step": 5715, "train_speed(iter/s)": 0.361877 }, { "acc": 0.80292168, "epoch": 3.9339752407152684, "grad_norm": 2.088428020477295, "learning_rate": 1.1931695778562984e-05, "loss": 0.66072493, "memory(GiB)": 67.62, "step": 5720, "train_speed(iter/s)": 0.360551 }, { "acc": 0.81333771, "epoch": 3.937414030261348, "grad_norm": 2.195223093032837, "learning_rate": 1.1858093539611302e-05, "loss": 0.62468419, "memory(GiB)": 67.62, "step": 5725, "train_speed(iter/s)": 0.359367 }, { "acc": 0.80784473, "epoch": 3.9408528198074277, "grad_norm": 2.1771881580352783, "learning_rate": 1.1784688654573306e-05, "loss": 0.6561008, "memory(GiB)": 67.62, "step": 5730, "train_speed(iter/s)": 0.35807 }, { "acc": 0.81136761, "epoch": 3.9442916093535074, "grad_norm": 1.9094853401184082, "learning_rate": 1.1711481503210884e-05, "loss": 0.63656788, "memory(GiB)": 67.62, "step": 5735, "train_speed(iter/s)": 0.356785 }, { "acc": 0.81901407, "epoch": 3.947730398899587, "grad_norm": 1.9423341751098633, "learning_rate": 1.1638472464262948e-05, "loss": 0.61632404, "memory(GiB)": 67.62, "step": 5740, "train_speed(iter/s)": 0.355531 }, { "acc": 0.81649647, "epoch": 3.9511691884456672, "grad_norm": 1.8900690078735352, "learning_rate": 1.1565661915443475e-05, "loss": 0.61735368, "memory(GiB)": 67.62, "step": 5745, "train_speed(iter/s)": 0.354181 }, { "acc": 0.80203295, "epoch": 3.954607977991747, "grad_norm": 1.9980183839797974, "learning_rate": 1.1493050233439526e-05, "loss": 0.66276655, "memory(GiB)": 67.62, "step": 5750, "train_speed(iter/s)": 0.352959 }, { "acc": 0.81161861, "epoch": 3.9580467675378266, "grad_norm": 1.8814200162887573, "learning_rate": 1.1420637793909362e-05, "loss": 0.64876308, "memory(GiB)": 67.62, "step": 5755, "train_speed(iter/s)": 0.351641 }, { "acc": 0.80822277, "epoch": 3.9614855570839067, "grad_norm": 2.157858371734619, "learning_rate": 1.1348424971480429e-05, "loss": 0.64273562, "memory(GiB)": 67.62, "step": 5760, "train_speed(iter/s)": 0.350261 }, { "acc": 0.81014862, "epoch": 3.9649243466299864, "grad_norm": 1.8866498470306396, "learning_rate": 1.1276412139747452e-05, "loss": 0.63060379, "memory(GiB)": 67.62, "step": 5765, "train_speed(iter/s)": 0.34897 }, { "acc": 0.80797882, "epoch": 3.968363136176066, "grad_norm": 2.011620283126831, "learning_rate": 1.1204599671270494e-05, "loss": 0.64154892, "memory(GiB)": 67.62, "step": 5770, "train_speed(iter/s)": 0.347683 }, { "acc": 0.81029825, "epoch": 3.9718019257221457, "grad_norm": 2.1030616760253906, "learning_rate": 1.1132987937573052e-05, "loss": 0.62338347, "memory(GiB)": 67.62, "step": 5775, "train_speed(iter/s)": 0.346438 }, { "acc": 0.7949429, "epoch": 3.9752407152682254, "grad_norm": 2.054006338119507, "learning_rate": 1.1061577309140098e-05, "loss": 0.70458865, "memory(GiB)": 67.62, "step": 5780, "train_speed(iter/s)": 0.345233 }, { "acc": 0.8113884, "epoch": 3.9786795048143055, "grad_norm": 2.072899103164673, "learning_rate": 1.0990368155416202e-05, "loss": 0.63724394, "memory(GiB)": 67.62, "step": 5785, "train_speed(iter/s)": 0.344106 }, { "acc": 0.81157551, "epoch": 3.9821182943603852, "grad_norm": 1.9557698965072632, "learning_rate": 1.091936084480358e-05, "loss": 0.62347059, "memory(GiB)": 67.62, "step": 5790, "train_speed(iter/s)": 0.342935 }, { "acc": 0.81167564, "epoch": 3.985557083906465, "grad_norm": 1.9136029481887817, "learning_rate": 1.0848555744660215e-05, "loss": 0.61960039, "memory(GiB)": 67.62, "step": 5795, "train_speed(iter/s)": 0.341839 }, { "acc": 0.83220634, "epoch": 3.9889958734525446, "grad_norm": 1.9021817445755005, "learning_rate": 1.0777953221297932e-05, "loss": 0.56068201, "memory(GiB)": 67.62, "step": 5800, "train_speed(iter/s)": 0.340892 }, { "epoch": 3.9889958734525446, "eval_acc": 0.7817744185000225, "eval_loss": 0.7804912328720093, "eval_runtime": 1111.4903, "eval_samples_per_second": 3.853, "eval_steps_per_second": 0.069, "step": 5800 }, { "acc": 0.80734158, "epoch": 3.9924346629986243, "grad_norm": 2.02614688873291, "learning_rate": 1.0707553639980585e-05, "loss": 0.64991465, "memory(GiB)": 67.62, "step": 5805, "train_speed(iter/s)": 0.319008 }, { "acc": 0.82151909, "epoch": 3.9958734525447044, "grad_norm": 2.07773494720459, "learning_rate": 1.0637357364922026e-05, "loss": 0.58141608, "memory(GiB)": 67.62, "step": 5810, "train_speed(iter/s)": 0.318131 }, { "acc": 0.81823015, "epoch": 3.999312242090784, "grad_norm": 1.7081282138824463, "learning_rate": 1.0567364759284327e-05, "loss": 0.61670866, "memory(GiB)": 67.62, "step": 5815, "train_speed(iter/s)": 0.317189 }, { "acc": 0.82516012, "epoch": 4.002751031636864, "grad_norm": 1.9197478294372559, "learning_rate": 1.0497576185175877e-05, "loss": 0.57296357, "memory(GiB)": 67.62, "step": 5820, "train_speed(iter/s)": 0.315614 }, { "acc": 0.82848129, "epoch": 4.006189821182944, "grad_norm": 1.7602168321609497, "learning_rate": 1.042799200364949e-05, "loss": 0.57674851, "memory(GiB)": 67.62, "step": 5825, "train_speed(iter/s)": 0.314514 }, { "acc": 0.8312006, "epoch": 4.009628610729023, "grad_norm": 1.904069423675537, "learning_rate": 1.0358612574700576e-05, "loss": 0.58140912, "memory(GiB)": 67.62, "step": 5830, "train_speed(iter/s)": 0.313493 }, { "acc": 0.83973274, "epoch": 4.013067400275103, "grad_norm": 1.8969364166259766, "learning_rate": 1.0289438257265218e-05, "loss": 0.5239769, "memory(GiB)": 67.62, "step": 5835, "train_speed(iter/s)": 0.3126 }, { "acc": 0.82053461, "epoch": 4.016506189821183, "grad_norm": 2.004246950149536, "learning_rate": 1.0220469409218385e-05, "loss": 0.58951969, "memory(GiB)": 67.62, "step": 5840, "train_speed(iter/s)": 0.311738 }, { "acc": 0.82730808, "epoch": 4.019944979367263, "grad_norm": 2.120168685913086, "learning_rate": 1.0151706387371993e-05, "loss": 0.57174788, "memory(GiB)": 67.62, "step": 5845, "train_speed(iter/s)": 0.310754 }, { "acc": 0.81913013, "epoch": 4.023383768913343, "grad_norm": 2.083112955093384, "learning_rate": 1.008314954747319e-05, "loss": 0.60139389, "memory(GiB)": 67.62, "step": 5850, "train_speed(iter/s)": 0.309795 }, { "acc": 0.82976856, "epoch": 4.026822558459422, "grad_norm": 1.9144500494003296, "learning_rate": 1.0014799244202362e-05, "loss": 0.56441569, "memory(GiB)": 67.62, "step": 5855, "train_speed(iter/s)": 0.308991 }, { "acc": 0.82570667, "epoch": 4.030261348005502, "grad_norm": 1.9201669692993164, "learning_rate": 9.94665583117142e-06, "loss": 0.58550615, "memory(GiB)": 67.62, "step": 5860, "train_speed(iter/s)": 0.308024 }, { "acc": 0.82395906, "epoch": 4.033700137551582, "grad_norm": 2.058741807937622, "learning_rate": 9.878719660921893e-06, "loss": 0.59208636, "memory(GiB)": 67.62, "step": 5865, "train_speed(iter/s)": 0.30702 }, { "acc": 0.82681818, "epoch": 4.037138927097661, "grad_norm": 2.0860073566436768, "learning_rate": 9.810991084923154e-06, "loss": 0.57163272, "memory(GiB)": 67.62, "step": 5870, "train_speed(iter/s)": 0.306194 }, { "acc": 0.81524105, "epoch": 4.0405777166437415, "grad_norm": 1.9567036628723145, "learning_rate": 9.743470453570575e-06, "loss": 0.62305789, "memory(GiB)": 67.62, "step": 5875, "train_speed(iter/s)": 0.305279 }, { "acc": 0.81588497, "epoch": 4.044016506189821, "grad_norm": 2.048231601715088, "learning_rate": 9.676158116183729e-06, "loss": 0.60361052, "memory(GiB)": 67.62, "step": 5880, "train_speed(iter/s)": 0.304326 }, { "acc": 0.82715149, "epoch": 4.047455295735901, "grad_norm": 1.918243408203125, "learning_rate": 9.609054421004562e-06, "loss": 0.56623569, "memory(GiB)": 67.62, "step": 5885, "train_speed(iter/s)": 0.303437 }, { "acc": 0.82016706, "epoch": 4.050894085281981, "grad_norm": 2.212838888168335, "learning_rate": 9.542159715195614e-06, "loss": 0.60472922, "memory(GiB)": 67.62, "step": 5890, "train_speed(iter/s)": 0.302387 }, { "acc": 0.82020359, "epoch": 4.05433287482806, "grad_norm": 2.029686450958252, "learning_rate": 9.475474344838204e-06, "loss": 0.59589596, "memory(GiB)": 67.62, "step": 5895, "train_speed(iter/s)": 0.301428 }, { "acc": 0.82250319, "epoch": 4.05777166437414, "grad_norm": 2.0857136249542236, "learning_rate": 9.408998654930675e-06, "loss": 0.59207001, "memory(GiB)": 67.62, "step": 5900, "train_speed(iter/s)": 0.300628 }, { "epoch": 4.05777166437414, "eval_acc": 0.7809758401943582, "eval_loss": 0.7950036525726318, "eval_runtime": 1140.5258, "eval_samples_per_second": 3.755, "eval_steps_per_second": 0.068, "step": 5900 }, { "acc": 0.83261538, "epoch": 4.0612104539202205, "grad_norm": 2.129284143447876, "learning_rate": 9.342732989386557e-06, "loss": 0.54631634, "memory(GiB)": 67.62, "step": 5905, "train_speed(iter/s)": 0.283416 }, { "acc": 0.82259159, "epoch": 4.0646492434663, "grad_norm": 2.0771372318267822, "learning_rate": 9.27667769103282e-06, "loss": 0.59988642, "memory(GiB)": 67.62, "step": 5910, "train_speed(iter/s)": 0.282658 }, { "acc": 0.82938833, "epoch": 4.06808803301238, "grad_norm": 2.0288455486297607, "learning_rate": 9.210833101608094e-06, "loss": 0.56707897, "memory(GiB)": 67.62, "step": 5915, "train_speed(iter/s)": 0.281964 }, { "acc": 0.81752338, "epoch": 4.071526822558459, "grad_norm": 2.1337034702301025, "learning_rate": 9.145199561760913e-06, "loss": 0.58798003, "memory(GiB)": 67.62, "step": 5920, "train_speed(iter/s)": 0.281194 }, { "acc": 0.83025227, "epoch": 4.074965612104539, "grad_norm": 1.9078054428100586, "learning_rate": 9.079777411047923e-06, "loss": 0.55221009, "memory(GiB)": 67.62, "step": 5925, "train_speed(iter/s)": 0.280374 }, { "acc": 0.82038937, "epoch": 4.078404401650619, "grad_norm": 2.1154861450195312, "learning_rate": 9.014566987932155e-06, "loss": 0.58884945, "memory(GiB)": 67.62, "step": 5930, "train_speed(iter/s)": 0.279665 }, { "acc": 0.83256464, "epoch": 4.081843191196699, "grad_norm": 2.46669602394104, "learning_rate": 8.949568629781233e-06, "loss": 0.55993681, "memory(GiB)": 67.62, "step": 5935, "train_speed(iter/s)": 0.279022 }, { "acc": 0.82162399, "epoch": 4.085281980742779, "grad_norm": 2.2108795642852783, "learning_rate": 8.884782672865745e-06, "loss": 0.58439035, "memory(GiB)": 67.62, "step": 5940, "train_speed(iter/s)": 0.278353 }, { "acc": 0.81261024, "epoch": 4.088720770288858, "grad_norm": 2.3239004611968994, "learning_rate": 8.820209452357312e-06, "loss": 0.62102919, "memory(GiB)": 67.62, "step": 5945, "train_speed(iter/s)": 0.277507 }, { "acc": 0.82187653, "epoch": 4.092159559834938, "grad_norm": 2.306704521179199, "learning_rate": 8.755849302327025e-06, "loss": 0.58051348, "memory(GiB)": 67.62, "step": 5950, "train_speed(iter/s)": 0.276835 }, { "acc": 0.83303099, "epoch": 4.095598349381018, "grad_norm": 2.3323071002960205, "learning_rate": 8.691702555743604e-06, "loss": 0.54123106, "memory(GiB)": 67.62, "step": 5955, "train_speed(iter/s)": 0.27621 }, { "acc": 0.82155704, "epoch": 4.099037138927097, "grad_norm": 2.2443792819976807, "learning_rate": 8.627769544471766e-06, "loss": 0.57790089, "memory(GiB)": 67.62, "step": 5960, "train_speed(iter/s)": 0.275577 }, { "acc": 0.81777382, "epoch": 4.1024759284731775, "grad_norm": 2.0098752975463867, "learning_rate": 8.564050599270423e-06, "loss": 0.60635762, "memory(GiB)": 67.62, "step": 5965, "train_speed(iter/s)": 0.27489 }, { "acc": 0.82945662, "epoch": 4.105914718019257, "grad_norm": 2.6297407150268555, "learning_rate": 8.50054604979104e-06, "loss": 0.55736432, "memory(GiB)": 67.62, "step": 5970, "train_speed(iter/s)": 0.274231 }, { "acc": 0.83819923, "epoch": 4.109353507565337, "grad_norm": 2.027495861053467, "learning_rate": 8.43725622457589e-06, "loss": 0.53537874, "memory(GiB)": 67.62, "step": 5975, "train_speed(iter/s)": 0.273579 }, { "acc": 0.83728676, "epoch": 4.112792297111417, "grad_norm": 1.9991952180862427, "learning_rate": 8.37418145105636e-06, "loss": 0.52903852, "memory(GiB)": 67.62, "step": 5980, "train_speed(iter/s)": 0.272886 }, { "acc": 0.83919382, "epoch": 4.116231086657496, "grad_norm": 1.941271424293518, "learning_rate": 8.311322055551258e-06, "loss": 0.54152002, "memory(GiB)": 67.62, "step": 5985, "train_speed(iter/s)": 0.272292 }, { "acc": 0.82639074, "epoch": 4.119669876203576, "grad_norm": 2.0080490112304688, "learning_rate": 8.248678363265168e-06, "loss": 0.58616934, "memory(GiB)": 67.62, "step": 5990, "train_speed(iter/s)": 0.271655 }, { "acc": 0.82685022, "epoch": 4.1231086657496565, "grad_norm": 2.1029014587402344, "learning_rate": 8.186250698286685e-06, "loss": 0.57365303, "memory(GiB)": 67.62, "step": 5995, "train_speed(iter/s)": 0.271018 }, { "acc": 0.83401289, "epoch": 4.126547455295736, "grad_norm": 2.381568431854248, "learning_rate": 8.124039383586785e-06, "loss": 0.54990234, "memory(GiB)": 67.62, "step": 6000, "train_speed(iter/s)": 0.27041 }, { "epoch": 4.126547455295736, "eval_acc": 0.7808971071219688, "eval_loss": 0.7977337837219238, "eval_runtime": 1150.4843, "eval_samples_per_second": 3.723, "eval_steps_per_second": 0.067, "step": 6000 }, { "acc": 0.82955971, "epoch": 4.129986244841816, "grad_norm": 2.2076478004455566, "learning_rate": 8.062044741017174e-06, "loss": 0.56549349, "memory(GiB)": 67.62, "step": 6005, "train_speed(iter/s)": 0.256539 }, { "acc": 0.82928619, "epoch": 4.133425034387895, "grad_norm": 2.240816116333008, "learning_rate": 8.00026709130858e-06, "loss": 0.56595135, "memory(GiB)": 67.62, "step": 6010, "train_speed(iter/s)": 0.256006 }, { "acc": 0.81660957, "epoch": 4.136863823933975, "grad_norm": 2.1177453994750977, "learning_rate": 7.938706754069125e-06, "loss": 0.60902424, "memory(GiB)": 67.62, "step": 6015, "train_speed(iter/s)": 0.255365 }, { "acc": 0.81723537, "epoch": 4.140302613480055, "grad_norm": 2.291558265686035, "learning_rate": 7.877364047782646e-06, "loss": 0.59432869, "memory(GiB)": 67.62, "step": 6020, "train_speed(iter/s)": 0.254797 }, { "acc": 0.83183041, "epoch": 4.143741403026135, "grad_norm": 2.1598074436187744, "learning_rate": 7.816239289807078e-06, "loss": 0.56827602, "memory(GiB)": 67.62, "step": 6025, "train_speed(iter/s)": 0.254252 }, { "acc": 0.82450991, "epoch": 4.147180192572215, "grad_norm": 2.316070556640625, "learning_rate": 7.755332796372783e-06, "loss": 0.5860589, "memory(GiB)": 67.62, "step": 6030, "train_speed(iter/s)": 0.253753 }, { "acc": 0.83219881, "epoch": 4.150618982118294, "grad_norm": 2.0400826930999756, "learning_rate": 7.694644882580929e-06, "loss": 0.56074944, "memory(GiB)": 67.62, "step": 6035, "train_speed(iter/s)": 0.253206 }, { "acc": 0.82405052, "epoch": 4.154057771664374, "grad_norm": 2.167229652404785, "learning_rate": 7.634175862401859e-06, "loss": 0.5924716, "memory(GiB)": 67.62, "step": 6040, "train_speed(iter/s)": 0.252704 }, { "acc": 0.83036137, "epoch": 4.157496561210454, "grad_norm": 2.0544652938842773, "learning_rate": 7.5739260486734785e-06, "loss": 0.56387725, "memory(GiB)": 67.62, "step": 6045, "train_speed(iter/s)": 0.252135 }, { "acc": 0.82638521, "epoch": 4.160935350756533, "grad_norm": 2.0944511890411377, "learning_rate": 7.5138957530996e-06, "loss": 0.58068042, "memory(GiB)": 67.62, "step": 6050, "train_speed(iter/s)": 0.251533 }, { "acc": 0.82557564, "epoch": 4.1643741403026135, "grad_norm": 2.206922769546509, "learning_rate": 7.454085286248365e-06, "loss": 0.57935457, "memory(GiB)": 67.62, "step": 6055, "train_speed(iter/s)": 0.250929 }, { "acc": 0.82684364, "epoch": 4.167812929848694, "grad_norm": 2.0836057662963867, "learning_rate": 7.394494957550617e-06, "loss": 0.57276134, "memory(GiB)": 67.62, "step": 6060, "train_speed(iter/s)": 0.250399 }, { "acc": 0.82708397, "epoch": 4.171251719394773, "grad_norm": 2.394265651702881, "learning_rate": 7.335125075298327e-06, "loss": 0.56799402, "memory(GiB)": 67.62, "step": 6065, "train_speed(iter/s)": 0.249822 }, { "acc": 0.82027712, "epoch": 4.174690508940853, "grad_norm": 2.20003080368042, "learning_rate": 7.2759759466429625e-06, "loss": 0.59135399, "memory(GiB)": 67.62, "step": 6070, "train_speed(iter/s)": 0.249261 }, { "acc": 0.83627338, "epoch": 4.178129298486932, "grad_norm": 2.0617763996124268, "learning_rate": 7.217047877593917e-06, "loss": 0.542978, "memory(GiB)": 67.62, "step": 6075, "train_speed(iter/s)": 0.248741 }, { "acc": 0.83200588, "epoch": 4.181568088033012, "grad_norm": 2.2494707107543945, "learning_rate": 7.158341173016954e-06, "loss": 0.54484763, "memory(GiB)": 67.62, "step": 6080, "train_speed(iter/s)": 0.248292 }, { "acc": 0.83537827, "epoch": 4.1850068775790925, "grad_norm": 2.177746295928955, "learning_rate": 7.099856136632578e-06, "loss": 0.54962234, "memory(GiB)": 67.62, "step": 6085, "train_speed(iter/s)": 0.247751 }, { "acc": 0.83314114, "epoch": 4.188445667125172, "grad_norm": 1.8806217908859253, "learning_rate": 7.041593071014495e-06, "loss": 0.55333209, "memory(GiB)": 67.62, "step": 6090, "train_speed(iter/s)": 0.247191 }, { "acc": 0.83060188, "epoch": 4.191884456671252, "grad_norm": 2.284046173095703, "learning_rate": 6.983552277588039e-06, "loss": 0.55391922, "memory(GiB)": 67.62, "step": 6095, "train_speed(iter/s)": 0.246671 }, { "acc": 0.82202473, "epoch": 4.195323246217331, "grad_norm": 2.113684892654419, "learning_rate": 6.925734056628606e-06, "loss": 0.59055824, "memory(GiB)": 67.62, "step": 6100, "train_speed(iter/s)": 0.246148 }, { "epoch": 4.195323246217331, "eval_acc": 0.7810377018940927, "eval_loss": 0.7978992462158203, "eval_runtime": 1151.0505, "eval_samples_per_second": 3.721, "eval_steps_per_second": 0.067, "step": 6100 }, { "acc": 0.82853069, "epoch": 4.198762035763411, "grad_norm": 1.9292495250701904, "learning_rate": 6.8681387072601215e-06, "loss": 0.56650033, "memory(GiB)": 67.62, "step": 6105, "train_speed(iter/s)": 0.234762 }, { "acc": 0.83146677, "epoch": 4.202200825309491, "grad_norm": 2.457911729812622, "learning_rate": 6.8107665274534755e-06, "loss": 0.56592517, "memory(GiB)": 67.62, "step": 6110, "train_speed(iter/s)": 0.234306 }, { "acc": 0.81799488, "epoch": 4.205639614855571, "grad_norm": 2.1064655780792236, "learning_rate": 6.753617814024982e-06, "loss": 0.59414425, "memory(GiB)": 67.62, "step": 6115, "train_speed(iter/s)": 0.233775 }, { "acc": 0.83699923, "epoch": 4.209078404401651, "grad_norm": 2.15045166015625, "learning_rate": 6.696692862634848e-06, "loss": 0.53455338, "memory(GiB)": 67.62, "step": 6120, "train_speed(iter/s)": 0.233246 }, { "acc": 0.82855034, "epoch": 4.212517193947731, "grad_norm": 1.9256818294525146, "learning_rate": 6.639991967785629e-06, "loss": 0.57589531, "memory(GiB)": 67.62, "step": 6125, "train_speed(iter/s)": 0.232841 }, { "acc": 0.82569561, "epoch": 4.21595598349381, "grad_norm": 2.2568438053131104, "learning_rate": 6.583515422820755e-06, "loss": 0.59608107, "memory(GiB)": 67.62, "step": 6130, "train_speed(iter/s)": 0.232428 }, { "acc": 0.83448133, "epoch": 4.21939477303989, "grad_norm": 2.2241194248199463, "learning_rate": 6.527263519922942e-06, "loss": 0.53996773, "memory(GiB)": 67.62, "step": 6135, "train_speed(iter/s)": 0.232016 }, { "acc": 0.82402668, "epoch": 4.222833562585969, "grad_norm": 2.152508020401001, "learning_rate": 6.471236550112733e-06, "loss": 0.5897275, "memory(GiB)": 67.62, "step": 6140, "train_speed(iter/s)": 0.231577 }, { "acc": 0.82092781, "epoch": 4.2262723521320495, "grad_norm": 2.7539846897125244, "learning_rate": 6.415434803246959e-06, "loss": 0.60109167, "memory(GiB)": 67.62, "step": 6145, "train_speed(iter/s)": 0.231131 }, { "acc": 0.82336702, "epoch": 4.22971114167813, "grad_norm": 2.2428319454193115, "learning_rate": 6.359858568017257e-06, "loss": 0.5810329, "memory(GiB)": 67.62, "step": 6150, "train_speed(iter/s)": 0.230762 }, { "acc": 0.8375886, "epoch": 4.233149931224209, "grad_norm": 2.108989715576172, "learning_rate": 6.304508131948601e-06, "loss": 0.54037862, "memory(GiB)": 67.62, "step": 6155, "train_speed(iter/s)": 0.230346 }, { "acc": 0.82534332, "epoch": 4.236588720770289, "grad_norm": 2.159034252166748, "learning_rate": 6.249383781397765e-06, "loss": 0.58905783, "memory(GiB)": 67.62, "step": 6160, "train_speed(iter/s)": 0.229862 }, { "acc": 0.82394867, "epoch": 4.240027510316368, "grad_norm": 2.191835880279541, "learning_rate": 6.194485801551856e-06, "loss": 0.57035618, "memory(GiB)": 67.62, "step": 6165, "train_speed(iter/s)": 0.229394 }, { "acc": 0.83958015, "epoch": 4.243466299862448, "grad_norm": 1.9931029081344604, "learning_rate": 6.139814476426854e-06, "loss": 0.53320942, "memory(GiB)": 67.62, "step": 6170, "train_speed(iter/s)": 0.22895 }, { "acc": 0.81791973, "epoch": 4.2469050894085285, "grad_norm": 2.611358404159546, "learning_rate": 6.085370088866157e-06, "loss": 0.61060858, "memory(GiB)": 67.62, "step": 6175, "train_speed(iter/s)": 0.22846 }, { "acc": 0.81913891, "epoch": 4.250343878954608, "grad_norm": 2.0629124641418457, "learning_rate": 6.031152920539071e-06, "loss": 0.59518094, "memory(GiB)": 67.62, "step": 6180, "train_speed(iter/s)": 0.228031 }, { "acc": 0.83380852, "epoch": 4.253782668500688, "grad_norm": 2.2911267280578613, "learning_rate": 5.977163251939388e-06, "loss": 0.55708628, "memory(GiB)": 67.62, "step": 6185, "train_speed(iter/s)": 0.227608 }, { "acc": 0.83007746, "epoch": 4.257221458046768, "grad_norm": 2.3003599643707275, "learning_rate": 5.9234013623839155e-06, "loss": 0.56224914, "memory(GiB)": 67.62, "step": 6190, "train_speed(iter/s)": 0.227229 }, { "acc": 0.82731237, "epoch": 4.260660247592847, "grad_norm": 2.23395037651062, "learning_rate": 5.869867530011054e-06, "loss": 0.57990241, "memory(GiB)": 67.62, "step": 6195, "train_speed(iter/s)": 0.226782 }, { "acc": 0.82513866, "epoch": 4.264099037138927, "grad_norm": 1.8877415657043457, "learning_rate": 5.816562031779334e-06, "loss": 0.58530903, "memory(GiB)": 67.62, "step": 6200, "train_speed(iter/s)": 0.226378 }, { "epoch": 4.264099037138927, "eval_acc": 0.7814088720925001, "eval_loss": 0.796574592590332, "eval_runtime": 1138.6928, "eval_samples_per_second": 3.761, "eval_steps_per_second": 0.068, "step": 6200 }, { "acc": 0.83331938, "epoch": 4.267537826685007, "grad_norm": 2.007477283477783, "learning_rate": 5.7634851434660045e-06, "loss": 0.55948911, "memory(GiB)": 67.62, "step": 6205, "train_speed(iter/s)": 0.21694 }, { "acc": 0.83156748, "epoch": 4.270976616231087, "grad_norm": 2.2435107231140137, "learning_rate": 5.7106371396655885e-06, "loss": 0.55306296, "memory(GiB)": 67.62, "step": 6210, "train_speed(iter/s)": 0.21658 }, { "acc": 0.82246685, "epoch": 4.274415405777167, "grad_norm": 2.471839427947998, "learning_rate": 5.658018293788461e-06, "loss": 0.58456354, "memory(GiB)": 67.62, "step": 6215, "train_speed(iter/s)": 0.216188 }, { "acc": 0.8260498, "epoch": 4.277854195323246, "grad_norm": 2.342773675918579, "learning_rate": 5.6056288780594584e-06, "loss": 0.58758726, "memory(GiB)": 67.62, "step": 6220, "train_speed(iter/s)": 0.215836 }, { "acc": 0.83068848, "epoch": 4.281292984869326, "grad_norm": 2.36448073387146, "learning_rate": 5.553469163516459e-06, "loss": 0.55812101, "memory(GiB)": 67.62, "step": 6225, "train_speed(iter/s)": 0.21546 }, { "acc": 0.8118145, "epoch": 4.284731774415405, "grad_norm": 2.0966968536376953, "learning_rate": 5.501539420008957e-06, "loss": 0.62151508, "memory(GiB)": 67.62, "step": 6230, "train_speed(iter/s)": 0.215114 }, { "acc": 0.82315483, "epoch": 4.2881705639614855, "grad_norm": 2.090514898300171, "learning_rate": 5.449839916196701e-06, "loss": 0.59569468, "memory(GiB)": 67.62, "step": 6235, "train_speed(iter/s)": 0.214737 }, { "acc": 0.82968979, "epoch": 4.291609353507566, "grad_norm": 2.4561944007873535, "learning_rate": 5.398370919548289e-06, "loss": 0.56410408, "memory(GiB)": 67.62, "step": 6240, "train_speed(iter/s)": 0.214364 }, { "acc": 0.82265596, "epoch": 4.295048143053645, "grad_norm": 2.0787575244903564, "learning_rate": 5.3471326963397644e-06, "loss": 0.59666047, "memory(GiB)": 67.62, "step": 6245, "train_speed(iter/s)": 0.213976 }, { "acc": 0.83164139, "epoch": 4.298486932599725, "grad_norm": 1.96835458278656, "learning_rate": 5.296125511653292e-06, "loss": 0.56099758, "memory(GiB)": 67.62, "step": 6250, "train_speed(iter/s)": 0.213621 }, { "acc": 0.82760611, "epoch": 4.301925722145804, "grad_norm": 2.032607078552246, "learning_rate": 5.245349629375726e-06, "loss": 0.56520452, "memory(GiB)": 67.62, "step": 6255, "train_speed(iter/s)": 0.213246 }, { "acc": 0.81889114, "epoch": 4.305364511691884, "grad_norm": 2.076733112335205, "learning_rate": 5.194805312197261e-06, "loss": 0.60234947, "memory(GiB)": 67.62, "step": 6260, "train_speed(iter/s)": 0.212879 }, { "acc": 0.83639603, "epoch": 4.3088033012379645, "grad_norm": 2.0413177013397217, "learning_rate": 5.144492821610151e-06, "loss": 0.53537364, "memory(GiB)": 67.62, "step": 6265, "train_speed(iter/s)": 0.212585 }, { "acc": 0.84348145, "epoch": 4.312242090784044, "grad_norm": 2.1440134048461914, "learning_rate": 5.094412417907226e-06, "loss": 0.52636375, "memory(GiB)": 67.62, "step": 6270, "train_speed(iter/s)": 0.21228 }, { "acc": 0.81755209, "epoch": 4.315680880330124, "grad_norm": 2.337132692337036, "learning_rate": 5.0445643601806165e-06, "loss": 0.60215778, "memory(GiB)": 67.62, "step": 6275, "train_speed(iter/s)": 0.211939 }, { "acc": 0.81957273, "epoch": 4.319119669876203, "grad_norm": 2.3544983863830566, "learning_rate": 4.994948906320421e-06, "loss": 0.62419033, "memory(GiB)": 67.62, "step": 6280, "train_speed(iter/s)": 0.211611 }, { "acc": 0.83044434, "epoch": 4.322558459422283, "grad_norm": 2.0763583183288574, "learning_rate": 4.945566313013359e-06, "loss": 0.56670027, "memory(GiB)": 67.62, "step": 6285, "train_speed(iter/s)": 0.211304 }, { "acc": 0.83016624, "epoch": 4.325997248968363, "grad_norm": 2.207101583480835, "learning_rate": 4.896416835741426e-06, "loss": 0.57944641, "memory(GiB)": 67.62, "step": 6290, "train_speed(iter/s)": 0.210967 }, { "acc": 0.82729073, "epoch": 4.329436038514443, "grad_norm": 2.1743686199188232, "learning_rate": 4.847500728780591e-06, "loss": 0.57582512, "memory(GiB)": 67.62, "step": 6295, "train_speed(iter/s)": 0.210593 }, { "acc": 0.83406305, "epoch": 4.332874828060523, "grad_norm": 2.1914258003234863, "learning_rate": 4.798818245199488e-06, "loss": 0.56798325, "memory(GiB)": 67.62, "step": 6300, "train_speed(iter/s)": 0.210291 }, { "epoch": 4.332874828060523, "eval_acc": 0.7814426148378099, "eval_loss": 0.7952266335487366, "eval_runtime": 1113.0563, "eval_samples_per_second": 3.848, "eval_steps_per_second": 0.069, "step": 6300 }, { "acc": 0.81289082, "epoch": 4.336313617606603, "grad_norm": 1.9904134273529053, "learning_rate": 4.7503696368580756e-06, "loss": 0.62703791, "memory(GiB)": 67.62, "step": 6305, "train_speed(iter/s)": 0.202425 }, { "acc": 0.82088013, "epoch": 4.339752407152682, "grad_norm": 2.3138110637664795, "learning_rate": 4.702155154406356e-06, "loss": 0.59575286, "memory(GiB)": 67.62, "step": 6310, "train_speed(iter/s)": 0.202102 }, { "acc": 0.82686548, "epoch": 4.343191196698762, "grad_norm": 2.3361921310424805, "learning_rate": 4.654175047283105e-06, "loss": 0.58184552, "memory(GiB)": 67.62, "step": 6315, "train_speed(iter/s)": 0.201819 }, { "acc": 0.81809053, "epoch": 4.346629986244841, "grad_norm": 2.237659215927124, "learning_rate": 4.606429563714522e-06, "loss": 0.61091933, "memory(GiB)": 67.62, "step": 6320, "train_speed(iter/s)": 0.201524 }, { "acc": 0.83638992, "epoch": 4.3500687757909215, "grad_norm": 2.163444995880127, "learning_rate": 4.558918950712983e-06, "loss": 0.53875408, "memory(GiB)": 67.62, "step": 6325, "train_speed(iter/s)": 0.201286 }, { "acc": 0.83066168, "epoch": 4.353507565337002, "grad_norm": 2.1895644664764404, "learning_rate": 4.511643454075753e-06, "loss": 0.54859762, "memory(GiB)": 67.62, "step": 6330, "train_speed(iter/s)": 0.201 }, { "acc": 0.82753067, "epoch": 4.356946354883081, "grad_norm": 2.3949623107910156, "learning_rate": 4.464603318383724e-06, "loss": 0.57942715, "memory(GiB)": 67.62, "step": 6335, "train_speed(iter/s)": 0.20074 }, { "acc": 0.8256155, "epoch": 4.360385144429161, "grad_norm": 2.2843456268310547, "learning_rate": 4.417798787000139e-06, "loss": 0.5838841, "memory(GiB)": 67.62, "step": 6340, "train_speed(iter/s)": 0.200436 }, { "acc": 0.82477741, "epoch": 4.36382393397524, "grad_norm": 2.1748905181884766, "learning_rate": 4.371230102069333e-06, "loss": 0.57569537, "memory(GiB)": 67.62, "step": 6345, "train_speed(iter/s)": 0.200165 }, { "acc": 0.82552452, "epoch": 4.36726272352132, "grad_norm": 2.2806589603424072, "learning_rate": 4.324897504515494e-06, "loss": 0.5679925, "memory(GiB)": 67.62, "step": 6350, "train_speed(iter/s)": 0.199857 }, { "acc": 0.82228546, "epoch": 4.3707015130674005, "grad_norm": 2.5865187644958496, "learning_rate": 4.278801234041395e-06, "loss": 0.60699501, "memory(GiB)": 67.62, "step": 6355, "train_speed(iter/s)": 0.199561 }, { "acc": 0.83120518, "epoch": 4.37414030261348, "grad_norm": 2.1603238582611084, "learning_rate": 4.2329415291271675e-06, "loss": 0.56461072, "memory(GiB)": 67.62, "step": 6360, "train_speed(iter/s)": 0.199251 }, { "acc": 0.82535934, "epoch": 4.37757909215956, "grad_norm": 2.120961904525757, "learning_rate": 4.18731862702908e-06, "loss": 0.57014971, "memory(GiB)": 67.62, "step": 6365, "train_speed(iter/s)": 0.199009 }, { "acc": 0.82253723, "epoch": 4.38101788170564, "grad_norm": 2.091716766357422, "learning_rate": 4.141932763778269e-06, "loss": 0.58944392, "memory(GiB)": 67.62, "step": 6370, "train_speed(iter/s)": 0.198744 }, { "acc": 0.82127199, "epoch": 4.384456671251719, "grad_norm": 2.529238700866699, "learning_rate": 4.09678417417958e-06, "loss": 0.60495977, "memory(GiB)": 67.62, "step": 6375, "train_speed(iter/s)": 0.19843 }, { "acc": 0.82691174, "epoch": 4.387895460797799, "grad_norm": 2.0297234058380127, "learning_rate": 4.051873091810289e-06, "loss": 0.57716408, "memory(GiB)": 67.62, "step": 6380, "train_speed(iter/s)": 0.198164 }, { "acc": 0.82726593, "epoch": 4.391334250343879, "grad_norm": 2.1247737407684326, "learning_rate": 4.007199749018933e-06, "loss": 0.56230278, "memory(GiB)": 67.62, "step": 6385, "train_speed(iter/s)": 0.197892 }, { "acc": 0.83000584, "epoch": 4.394773039889959, "grad_norm": 2.1872763633728027, "learning_rate": 3.962764376924093e-06, "loss": 0.57364516, "memory(GiB)": 67.62, "step": 6390, "train_speed(iter/s)": 0.197621 }, { "acc": 0.80675488, "epoch": 4.398211829436039, "grad_norm": 2.4632184505462646, "learning_rate": 3.918567205413209e-06, "loss": 0.63493814, "memory(GiB)": 67.62, "step": 6395, "train_speed(iter/s)": 0.197376 }, { "acc": 0.83054581, "epoch": 4.401650618982118, "grad_norm": 2.1453042030334473, "learning_rate": 3.8746084631413774e-06, "loss": 0.55714712, "memory(GiB)": 67.62, "step": 6400, "train_speed(iter/s)": 0.197063 }, { "epoch": 4.401650618982118, "eval_acc": 0.7813807531380753, "eval_loss": 0.7940236926078796, "eval_runtime": 1132.2427, "eval_samples_per_second": 3.783, "eval_steps_per_second": 0.068, "step": 6400 }, { "acc": 0.82402639, "epoch": 4.405089408528198, "grad_norm": 2.5457465648651123, "learning_rate": 3.830888377530191e-06, "loss": 0.58401513, "memory(GiB)": 67.62, "step": 6405, "train_speed(iter/s)": 0.190185 }, { "acc": 0.8252965, "epoch": 4.408528198074277, "grad_norm": 2.4412484169006348, "learning_rate": 3.787407174766534e-06, "loss": 0.57594061, "memory(GiB)": 67.62, "step": 6410, "train_speed(iter/s)": 0.189944 }, { "acc": 0.82564621, "epoch": 4.4119669876203575, "grad_norm": 2.4891350269317627, "learning_rate": 3.7441650798014204e-06, "loss": 0.58461208, "memory(GiB)": 67.62, "step": 6415, "train_speed(iter/s)": 0.189741 }, { "acc": 0.82523041, "epoch": 4.415405777166438, "grad_norm": 2.297450065612793, "learning_rate": 3.7011623163488466e-06, "loss": 0.56609049, "memory(GiB)": 67.62, "step": 6420, "train_speed(iter/s)": 0.18951 }, { "acc": 0.82219734, "epoch": 4.418844566712517, "grad_norm": 2.200800657272339, "learning_rate": 3.6583991068846157e-06, "loss": 0.59716201, "memory(GiB)": 67.62, "step": 6425, "train_speed(iter/s)": 0.189276 }, { "acc": 0.83268661, "epoch": 4.422283356258597, "grad_norm": 2.185145378112793, "learning_rate": 3.61587567264519e-06, "loss": 0.56204829, "memory(GiB)": 67.62, "step": 6430, "train_speed(iter/s)": 0.188997 }, { "acc": 0.82503653, "epoch": 4.425722145804677, "grad_norm": 2.043168067932129, "learning_rate": 3.5735922336265567e-06, "loss": 0.5881556, "memory(GiB)": 67.62, "step": 6435, "train_speed(iter/s)": 0.188753 }, { "acc": 0.81975737, "epoch": 4.429160935350756, "grad_norm": 2.160871982574463, "learning_rate": 3.5315490085830724e-06, "loss": 0.6149045, "memory(GiB)": 67.62, "step": 6440, "train_speed(iter/s)": 0.188516 }, { "acc": 0.82297249, "epoch": 4.4325997248968365, "grad_norm": 2.2979509830474854, "learning_rate": 3.489746215026349e-06, "loss": 0.58171053, "memory(GiB)": 67.62, "step": 6445, "train_speed(iter/s)": 0.188237 }, { "acc": 0.81737309, "epoch": 4.436038514442916, "grad_norm": 2.3226141929626465, "learning_rate": 3.4481840692241092e-06, "loss": 0.61316481, "memory(GiB)": 67.62, "step": 6450, "train_speed(iter/s)": 0.18799 }, { "acc": 0.82329559, "epoch": 4.439477303988996, "grad_norm": 2.2420105934143066, "learning_rate": 3.4068627861991034e-06, "loss": 0.60935397, "memory(GiB)": 67.62, "step": 6455, "train_speed(iter/s)": 0.187782 }, { "acc": 0.82957897, "epoch": 4.442916093535076, "grad_norm": 2.281442403793335, "learning_rate": 3.365782579727948e-06, "loss": 0.58194571, "memory(GiB)": 67.62, "step": 6460, "train_speed(iter/s)": 0.187574 }, { "acc": 0.82551146, "epoch": 4.446354883081155, "grad_norm": 2.1205482482910156, "learning_rate": 3.3249436623400493e-06, "loss": 0.57835684, "memory(GiB)": 67.62, "step": 6465, "train_speed(iter/s)": 0.187326 }, { "acc": 0.82712269, "epoch": 4.449793672627235, "grad_norm": 2.2721188068389893, "learning_rate": 3.284346245316513e-06, "loss": 0.57927489, "memory(GiB)": 67.62, "step": 6470, "train_speed(iter/s)": 0.187079 }, { "acc": 0.82159977, "epoch": 4.453232462173315, "grad_norm": 2.2417726516723633, "learning_rate": 3.24399053868902e-06, "loss": 0.57816648, "memory(GiB)": 67.62, "step": 6475, "train_speed(iter/s)": 0.186865 }, { "acc": 0.83896151, "epoch": 4.456671251719395, "grad_norm": 2.33647084236145, "learning_rate": 3.203876751238749e-06, "loss": 0.53038335, "memory(GiB)": 67.62, "step": 6480, "train_speed(iter/s)": 0.186653 }, { "acc": 0.83427839, "epoch": 4.460110041265475, "grad_norm": 2.063394069671631, "learning_rate": 3.1640050904953505e-06, "loss": 0.56539698, "memory(GiB)": 67.62, "step": 6485, "train_speed(iter/s)": 0.186435 }, { "acc": 0.82990141, "epoch": 4.463548830811554, "grad_norm": 2.2717719078063965, "learning_rate": 3.1243757627357668e-06, "loss": 0.55906305, "memory(GiB)": 67.62, "step": 6490, "train_speed(iter/s)": 0.186164 }, { "acc": 0.82575073, "epoch": 4.466987620357634, "grad_norm": 2.5037717819213867, "learning_rate": 3.0849889729832654e-06, "loss": 0.57216806, "memory(GiB)": 67.62, "step": 6495, "train_speed(iter/s)": 0.185961 }, { "acc": 0.82585573, "epoch": 4.470426409903714, "grad_norm": 2.19950795173645, "learning_rate": 3.045844925006326e-06, "loss": 0.57823243, "memory(GiB)": 67.62, "step": 6500, "train_speed(iter/s)": 0.185733 }, { "epoch": 4.470426409903714, "eval_acc": 0.7815775858190489, "eval_loss": 0.7929303646087646, "eval_runtime": 1155.4394, "eval_samples_per_second": 3.707, "eval_steps_per_second": 0.067, "step": 6500 }, { "acc": 0.81569099, "epoch": 4.4738651994497936, "grad_norm": 2.3302502632141113, "learning_rate": 3.0069438213175954e-06, "loss": 0.61277876, "memory(GiB)": 67.62, "step": 6505, "train_speed(iter/s)": 0.179594 }, { "acc": 0.82125263, "epoch": 4.477303988995874, "grad_norm": 2.0833966732025146, "learning_rate": 2.968285863172848e-06, "loss": 0.59841776, "memory(GiB)": 67.62, "step": 6510, "train_speed(iter/s)": 0.179377 }, { "acc": 0.81880264, "epoch": 4.480742778541953, "grad_norm": 2.5484683513641357, "learning_rate": 2.929871250569924e-06, "loss": 0.59419332, "memory(GiB)": 67.62, "step": 6515, "train_speed(iter/s)": 0.179133 }, { "acc": 0.82751369, "epoch": 4.484181568088033, "grad_norm": 2.393644332885742, "learning_rate": 2.891700182247734e-06, "loss": 0.57184334, "memory(GiB)": 67.62, "step": 6520, "train_speed(iter/s)": 0.17891 }, { "acc": 0.81892633, "epoch": 4.487620357634113, "grad_norm": 2.44018292427063, "learning_rate": 2.8537728556851844e-06, "loss": 0.61149454, "memory(GiB)": 67.62, "step": 6525, "train_speed(iter/s)": 0.178678 }, { "acc": 0.82124205, "epoch": 4.491059147180192, "grad_norm": 2.0875890254974365, "learning_rate": 2.8160894671001892e-06, "loss": 0.5891263, "memory(GiB)": 67.62, "step": 6530, "train_speed(iter/s)": 0.178449 }, { "acc": 0.82609577, "epoch": 4.4944979367262725, "grad_norm": 2.057404041290283, "learning_rate": 2.778650211448648e-06, "loss": 0.56262321, "memory(GiB)": 67.62, "step": 6535, "train_speed(iter/s)": 0.178257 }, { "acc": 0.83202305, "epoch": 4.497936726272352, "grad_norm": 2.3149304389953613, "learning_rate": 2.741455282423418e-06, "loss": 0.55560713, "memory(GiB)": 67.62, "step": 6540, "train_speed(iter/s)": 0.178079 }, { "acc": 0.83527908, "epoch": 4.501375515818432, "grad_norm": 2.2315163612365723, "learning_rate": 2.7045048724533295e-06, "loss": 0.54867306, "memory(GiB)": 67.62, "step": 6545, "train_speed(iter/s)": 0.177882 }, { "acc": 0.82490063, "epoch": 4.504814305364512, "grad_norm": 2.0971333980560303, "learning_rate": 2.667799172702211e-06, "loss": 0.58073626, "memory(GiB)": 67.62, "step": 6550, "train_speed(iter/s)": 0.177654 }, { "acc": 0.82666264, "epoch": 4.508253094910591, "grad_norm": 2.328887701034546, "learning_rate": 2.6313383730678536e-06, "loss": 0.58351974, "memory(GiB)": 67.62, "step": 6555, "train_speed(iter/s)": 0.177423 }, { "acc": 0.81643009, "epoch": 4.511691884456671, "grad_norm": 2.3826959133148193, "learning_rate": 2.5951226621810548e-06, "loss": 0.60832229, "memory(GiB)": 67.62, "step": 6560, "train_speed(iter/s)": 0.17721 }, { "acc": 0.83378086, "epoch": 4.5151306740027515, "grad_norm": 2.135087490081787, "learning_rate": 2.5591522274046416e-06, "loss": 0.56533546, "memory(GiB)": 67.62, "step": 6565, "train_speed(iter/s)": 0.177032 }, { "acc": 0.83013229, "epoch": 4.518569463548831, "grad_norm": 2.335890054702759, "learning_rate": 2.523427254832501e-06, "loss": 0.55983028, "memory(GiB)": 67.62, "step": 6570, "train_speed(iter/s)": 0.176797 }, { "acc": 0.82724657, "epoch": 4.522008253094911, "grad_norm": 2.3773765563964844, "learning_rate": 2.487947929288618e-06, "loss": 0.57505946, "memory(GiB)": 67.62, "step": 6575, "train_speed(iter/s)": 0.176609 }, { "acc": 0.82321806, "epoch": 4.52544704264099, "grad_norm": 2.1447110176086426, "learning_rate": 2.4527144343261097e-06, "loss": 0.58117051, "memory(GiB)": 67.62, "step": 6580, "train_speed(iter/s)": 0.176429 }, { "acc": 0.81534252, "epoch": 4.52888583218707, "grad_norm": 2.3002796173095703, "learning_rate": 2.417726952226283e-06, "loss": 0.59847307, "memory(GiB)": 67.62, "step": 6585, "train_speed(iter/s)": 0.176205 }, { "acc": 0.83123646, "epoch": 4.53232462173315, "grad_norm": 2.134842872619629, "learning_rate": 2.382985663997712e-06, "loss": 0.56259084, "memory(GiB)": 67.62, "step": 6590, "train_speed(iter/s)": 0.175983 }, { "acc": 0.82430344, "epoch": 4.5357634112792296, "grad_norm": 2.316795825958252, "learning_rate": 2.348490749375251e-06, "loss": 0.57970629, "memory(GiB)": 67.62, "step": 6595, "train_speed(iter/s)": 0.1758 }, { "acc": 0.83597136, "epoch": 4.53920220082531, "grad_norm": 2.263073444366455, "learning_rate": 2.3142423868191563e-06, "loss": 0.54895492, "memory(GiB)": 67.62, "step": 6600, "train_speed(iter/s)": 0.175618 }, { "epoch": 4.53920220082531, "eval_acc": 0.7819375084356863, "eval_loss": 0.7933745980262756, "eval_runtime": 1098.756, "eval_samples_per_second": 3.898, "eval_steps_per_second": 0.07, "step": 6600 }, { "acc": 0.832055, "epoch": 4.542640990371389, "grad_norm": 2.175189971923828, "learning_rate": 2.2802407535141275e-06, "loss": 0.56409612, "memory(GiB)": 67.62, "step": 6605, "train_speed(iter/s)": 0.170456 }, { "acc": 0.82646189, "epoch": 4.546079779917469, "grad_norm": 2.112194776535034, "learning_rate": 2.246486025368418e-06, "loss": 0.56891632, "memory(GiB)": 67.62, "step": 6610, "train_speed(iter/s)": 0.170302 }, { "acc": 0.82066345, "epoch": 4.549518569463549, "grad_norm": 2.304631233215332, "learning_rate": 2.212978377012892e-06, "loss": 0.60033989, "memory(GiB)": 67.62, "step": 6615, "train_speed(iter/s)": 0.170106 }, { "acc": 0.83084068, "epoch": 4.552957359009628, "grad_norm": 2.2651240825653076, "learning_rate": 2.179717981800164e-06, "loss": 0.55889602, "memory(GiB)": 67.62, "step": 6620, "train_speed(iter/s)": 0.169961 }, { "acc": 0.82767801, "epoch": 4.5563961485557085, "grad_norm": 2.218092918395996, "learning_rate": 2.1467050118036613e-06, "loss": 0.58023634, "memory(GiB)": 67.62, "step": 6625, "train_speed(iter/s)": 0.1698 }, { "acc": 0.82311954, "epoch": 4.559834938101789, "grad_norm": 2.081865072250366, "learning_rate": 2.1139396378167637e-06, "loss": 0.58637218, "memory(GiB)": 67.62, "step": 6630, "train_speed(iter/s)": 0.169621 }, { "acc": 0.82979736, "epoch": 4.563273727647868, "grad_norm": 2.2547144889831543, "learning_rate": 2.08142202935188e-06, "loss": 0.55914106, "memory(GiB)": 67.62, "step": 6635, "train_speed(iter/s)": 0.169453 }, { "acc": 0.82038078, "epoch": 4.566712517193948, "grad_norm": 2.181720495223999, "learning_rate": 2.0491523546396466e-06, "loss": 0.59662962, "memory(GiB)": 67.62, "step": 6640, "train_speed(iter/s)": 0.169266 }, { "acc": 0.8245801, "epoch": 4.570151306740027, "grad_norm": 2.330573558807373, "learning_rate": 2.01713078062797e-06, "loss": 0.58751688, "memory(GiB)": 67.62, "step": 6645, "train_speed(iter/s)": 0.169123 }, { "acc": 0.83173065, "epoch": 4.573590096286107, "grad_norm": 2.1227643489837646, "learning_rate": 1.9853574729812123e-06, "loss": 0.54269109, "memory(GiB)": 67.62, "step": 6650, "train_speed(iter/s)": 0.168939 }, { "acc": 0.83502407, "epoch": 4.577028885832187, "grad_norm": 2.232192277908325, "learning_rate": 1.953832596079319e-06, "loss": 0.5437376, "memory(GiB)": 67.62, "step": 6655, "train_speed(iter/s)": 0.168764 }, { "acc": 0.83107376, "epoch": 4.580467675378267, "grad_norm": 2.2640929222106934, "learning_rate": 1.9225563130169875e-06, "loss": 0.54885445, "memory(GiB)": 67.62, "step": 6660, "train_speed(iter/s)": 0.168622 }, { "acc": 0.83116302, "epoch": 4.583906464924347, "grad_norm": 2.4255106449127197, "learning_rate": 1.8915287856027996e-06, "loss": 0.57933769, "memory(GiB)": 67.62, "step": 6665, "train_speed(iter/s)": 0.168435 }, { "acc": 0.83079157, "epoch": 4.587345254470426, "grad_norm": 2.252610445022583, "learning_rate": 1.8607501743583902e-06, "loss": 0.57562494, "memory(GiB)": 67.62, "step": 6670, "train_speed(iter/s)": 0.168263 }, { "acc": 0.82178955, "epoch": 4.590784044016506, "grad_norm": 2.378258466720581, "learning_rate": 1.8302206385176258e-06, "loss": 0.59762077, "memory(GiB)": 67.62, "step": 6675, "train_speed(iter/s)": 0.168089 }, { "acc": 0.83059864, "epoch": 4.594222833562586, "grad_norm": 2.4089572429656982, "learning_rate": 1.7999403360257766e-06, "loss": 0.57908206, "memory(GiB)": 67.62, "step": 6680, "train_speed(iter/s)": 0.167941 }, { "acc": 0.82545843, "epoch": 4.5976616231086656, "grad_norm": 2.556912660598755, "learning_rate": 1.7699094235386956e-06, "loss": 0.5731123, "memory(GiB)": 67.62, "step": 6685, "train_speed(iter/s)": 0.167785 }, { "acc": 0.83477535, "epoch": 4.601100412654746, "grad_norm": 2.144914150238037, "learning_rate": 1.7401280564220138e-06, "loss": 0.54660711, "memory(GiB)": 67.62, "step": 6690, "train_speed(iter/s)": 0.167611 }, { "acc": 0.82835121, "epoch": 4.604539202200826, "grad_norm": 2.0818796157836914, "learning_rate": 1.7105963887503236e-06, "loss": 0.57266307, "memory(GiB)": 67.62, "step": 6695, "train_speed(iter/s)": 0.167452 }, { "acc": 0.82310772, "epoch": 4.607977991746905, "grad_norm": 2.37752366065979, "learning_rate": 1.6813145733064094e-06, "loss": 0.5846642, "memory(GiB)": 67.62, "step": 6700, "train_speed(iter/s)": 0.167295 }, { "epoch": 4.607977991746905, "eval_acc": 0.7817238043820579, "eval_loss": 0.7931625843048096, "eval_runtime": 1146.6195, "eval_samples_per_second": 3.735, "eval_steps_per_second": 0.067, "step": 6700 }, { "acc": 0.82920761, "epoch": 4.611416781292985, "grad_norm": 2.6180896759033203, "learning_rate": 1.6522827615804277e-06, "loss": 0.55708656, "memory(GiB)": 67.62, "step": 6705, "train_speed(iter/s)": 0.162482 }, { "acc": 0.82743568, "epoch": 4.614855570839064, "grad_norm": 2.1857407093048096, "learning_rate": 1.6235011037691344e-06, "loss": 0.58240447, "memory(GiB)": 67.62, "step": 6710, "train_speed(iter/s)": 0.16232 }, { "acc": 0.81718578, "epoch": 4.6182943603851445, "grad_norm": 2.2875170707702637, "learning_rate": 1.5949697487751052e-06, "loss": 0.61164322, "memory(GiB)": 67.62, "step": 6715, "train_speed(iter/s)": 0.162187 }, { "acc": 0.82232466, "epoch": 4.621733149931224, "grad_norm": 2.1736197471618652, "learning_rate": 1.5666888442059804e-06, "loss": 0.58460808, "memory(GiB)": 67.62, "step": 6720, "train_speed(iter/s)": 0.162055 }, { "acc": 0.82449484, "epoch": 4.625171939477304, "grad_norm": 2.126422643661499, "learning_rate": 1.538658536373673e-06, "loss": 0.57822762, "memory(GiB)": 67.62, "step": 6725, "train_speed(iter/s)": 0.161946 }, { "acc": 0.82507849, "epoch": 4.628610729023384, "grad_norm": 2.2693231105804443, "learning_rate": 1.5108789702936455e-06, "loss": 0.57952757, "memory(GiB)": 67.62, "step": 6730, "train_speed(iter/s)": 0.161804 }, { "acc": 0.8332633, "epoch": 4.632049518569463, "grad_norm": 2.1562063694000244, "learning_rate": 1.4833502896841289e-06, "loss": 0.55239053, "memory(GiB)": 67.62, "step": 6735, "train_speed(iter/s)": 0.161675 }, { "acc": 0.82784958, "epoch": 4.635488308115543, "grad_norm": 2.1214349269866943, "learning_rate": 1.456072636965399e-06, "loss": 0.5708005, "memory(GiB)": 67.62, "step": 6740, "train_speed(iter/s)": 0.161537 }, { "acc": 0.8265028, "epoch": 4.6389270976616235, "grad_norm": 2.2998435497283936, "learning_rate": 1.4290461532590343e-06, "loss": 0.58597693, "memory(GiB)": 67.62, "step": 6745, "train_speed(iter/s)": 0.161423 }, { "acc": 0.82159843, "epoch": 4.642365887207703, "grad_norm": 2.096148729324341, "learning_rate": 1.4022709783871718e-06, "loss": 0.60574317, "memory(GiB)": 67.62, "step": 6750, "train_speed(iter/s)": 0.161279 }, { "acc": 0.82948322, "epoch": 4.645804676753783, "grad_norm": 1.9622774124145508, "learning_rate": 1.375747250871807e-06, "loss": 0.57297769, "memory(GiB)": 67.62, "step": 6755, "train_speed(iter/s)": 0.161116 }, { "acc": 0.82648077, "epoch": 4.649243466299862, "grad_norm": 2.2610554695129395, "learning_rate": 1.3494751079340738e-06, "loss": 0.56792774, "memory(GiB)": 67.62, "step": 6760, "train_speed(iter/s)": 0.16096 }, { "acc": 0.82656231, "epoch": 4.652682255845942, "grad_norm": 2.134491205215454, "learning_rate": 1.3234546854935154e-06, "loss": 0.56553001, "memory(GiB)": 67.62, "step": 6765, "train_speed(iter/s)": 0.160816 }, { "acc": 0.83355551, "epoch": 4.656121045392022, "grad_norm": 2.2637131214141846, "learning_rate": 1.2976861181673923e-06, "loss": 0.55729747, "memory(GiB)": 67.62, "step": 6770, "train_speed(iter/s)": 0.160707 }, { "acc": 0.83004456, "epoch": 4.6595598349381016, "grad_norm": 2.241671323776245, "learning_rate": 1.2721695392699869e-06, "loss": 0.55024014, "memory(GiB)": 67.62, "step": 6775, "train_speed(iter/s)": 0.16056 }, { "acc": 0.82354479, "epoch": 4.662998624484182, "grad_norm": 2.196913480758667, "learning_rate": 1.2469050808119282e-06, "loss": 0.57635975, "memory(GiB)": 67.62, "step": 6780, "train_speed(iter/s)": 0.160424 }, { "acc": 0.814569, "epoch": 4.666437414030261, "grad_norm": 2.4140119552612305, "learning_rate": 1.221892873499479e-06, "loss": 0.61613665, "memory(GiB)": 67.62, "step": 6785, "train_speed(iter/s)": 0.160253 }, { "acc": 0.83262882, "epoch": 4.669876203576341, "grad_norm": 2.239264726638794, "learning_rate": 1.1971330467338833e-06, "loss": 0.55864224, "memory(GiB)": 67.62, "step": 6790, "train_speed(iter/s)": 0.1601 }, { "acc": 0.82022276, "epoch": 4.673314993122421, "grad_norm": 2.135786771774292, "learning_rate": 1.172625728610676e-06, "loss": 0.58857613, "memory(GiB)": 67.62, "step": 6795, "train_speed(iter/s)": 0.15997 }, { "acc": 0.83236532, "epoch": 4.6767537826685, "grad_norm": 1.979997992515564, "learning_rate": 1.1483710459190515e-06, "loss": 0.56562681, "memory(GiB)": 67.62, "step": 6800, "train_speed(iter/s)": 0.159825 }, { "epoch": 4.6767537826685, "eval_acc": 0.7819093894812615, "eval_loss": 0.7931298613548279, "eval_runtime": 1157.2244, "eval_samples_per_second": 3.701, "eval_steps_per_second": 0.067, "step": 6800 }, { "acc": 0.8271327, "epoch": 4.6801925722145805, "grad_norm": 2.349480152130127, "learning_rate": 1.1243691241411644e-06, "loss": 0.58665218, "memory(GiB)": 67.62, "step": 6805, "train_speed(iter/s)": 0.155462 }, { "acc": 0.83063755, "epoch": 4.683631361760661, "grad_norm": 2.1535379886627197, "learning_rate": 1.1006200874515338e-06, "loss": 0.55733638, "memory(GiB)": 67.62, "step": 6810, "train_speed(iter/s)": 0.155314 }, { "acc": 0.81677713, "epoch": 4.68707015130674, "grad_norm": 2.1077511310577393, "learning_rate": 1.0771240587163464e-06, "loss": 0.60006194, "memory(GiB)": 67.62, "step": 6815, "train_speed(iter/s)": 0.155164 }, { "acc": 0.83417349, "epoch": 4.69050894085282, "grad_norm": 2.45220685005188, "learning_rate": 1.0538811594928607e-06, "loss": 0.53521776, "memory(GiB)": 67.62, "step": 6820, "train_speed(iter/s)": 0.155057 }, { "acc": 0.82799282, "epoch": 4.693947730398899, "grad_norm": 2.1742374897003174, "learning_rate": 1.0308915100287642e-06, "loss": 0.56440144, "memory(GiB)": 67.62, "step": 6825, "train_speed(iter/s)": 0.154917 }, { "acc": 0.83087101, "epoch": 4.697386519944979, "grad_norm": 2.1993463039398193, "learning_rate": 1.0081552292615454e-06, "loss": 0.5529726, "memory(GiB)": 67.62, "step": 6830, "train_speed(iter/s)": 0.154819 }, { "acc": 0.83782015, "epoch": 4.7008253094910595, "grad_norm": 2.260230541229248, "learning_rate": 9.856724348178841e-07, "loss": 0.53974109, "memory(GiB)": 67.62, "step": 6835, "train_speed(iter/s)": 0.154699 }, { "acc": 0.83221836, "epoch": 4.704264099037139, "grad_norm": 2.035860061645508, "learning_rate": 9.634432430130399e-07, "loss": 0.54515915, "memory(GiB)": 67.62, "step": 6840, "train_speed(iter/s)": 0.154586 }, { "acc": 0.82770882, "epoch": 4.707702888583219, "grad_norm": 2.026685953140259, "learning_rate": 9.414677688502594e-07, "loss": 0.5836278, "memory(GiB)": 67.62, "step": 6845, "train_speed(iter/s)": 0.154451 }, { "acc": 0.82769499, "epoch": 4.711141678129298, "grad_norm": 2.1812551021575928, "learning_rate": 9.1974612602017e-07, "loss": 0.57010379, "memory(GiB)": 67.62, "step": 6850, "train_speed(iter/s)": 0.154322 }, { "acc": 0.81980133, "epoch": 4.714580467675378, "grad_norm": 2.3447399139404297, "learning_rate": 8.982784269002089e-07, "loss": 0.59749265, "memory(GiB)": 67.62, "step": 6855, "train_speed(iter/s)": 0.154209 }, { "acc": 0.83671551, "epoch": 4.718019257221458, "grad_norm": 2.019040107727051, "learning_rate": 8.770647825540072e-07, "loss": 0.5339366, "memory(GiB)": 67.62, "step": 6860, "train_speed(iter/s)": 0.154098 }, { "acc": 0.83358383, "epoch": 4.7214580467675376, "grad_norm": 2.4504003524780273, "learning_rate": 8.561053027308616e-07, "loss": 0.54877663, "memory(GiB)": 67.62, "step": 6865, "train_speed(iter/s)": 0.153941 }, { "acc": 0.82203579, "epoch": 4.724896836313618, "grad_norm": 2.2956948280334473, "learning_rate": 8.354000958651198e-07, "loss": 0.58671484, "memory(GiB)": 67.62, "step": 6870, "train_speed(iter/s)": 0.153816 }, { "acc": 0.82069569, "epoch": 4.728335625859698, "grad_norm": 2.3851406574249268, "learning_rate": 8.149492690756679e-07, "loss": 0.58018303, "memory(GiB)": 67.62, "step": 6875, "train_speed(iter/s)": 0.153716 }, { "acc": 0.82189007, "epoch": 4.731774415405777, "grad_norm": 2.3761680126190186, "learning_rate": 7.947529281653329e-07, "loss": 0.5802557, "memory(GiB)": 67.62, "step": 6880, "train_speed(iter/s)": 0.153572 }, { "acc": 0.8201951, "epoch": 4.735213204951857, "grad_norm": 2.3680715560913086, "learning_rate": 7.748111776203488e-07, "loss": 0.5941371, "memory(GiB)": 67.62, "step": 6885, "train_speed(iter/s)": 0.153396 }, { "acc": 0.83601265, "epoch": 4.738651994497936, "grad_norm": 2.2949132919311523, "learning_rate": 7.551241206098402e-07, "loss": 0.54753556, "memory(GiB)": 67.62, "step": 6890, "train_speed(iter/s)": 0.153255 }, { "acc": 0.82891521, "epoch": 4.7420907840440165, "grad_norm": 2.6076362133026123, "learning_rate": 7.356918589852512e-07, "loss": 0.56754522, "memory(GiB)": 67.62, "step": 6895, "train_speed(iter/s)": 0.153143 }, { "acc": 0.82609663, "epoch": 4.745529573590097, "grad_norm": 2.297222852706909, "learning_rate": 7.165144932798456e-07, "loss": 0.56647487, "memory(GiB)": 67.62, "step": 6900, "train_speed(iter/s)": 0.153005 }, { "epoch": 4.745529573590097, "eval_acc": 0.7816394475187834, "eval_loss": 0.7943344116210938, "eval_runtime": 1104.7871, "eval_samples_per_second": 3.877, "eval_steps_per_second": 0.07, "step": 6900 }, { "acc": 0.83013258, "epoch": 4.748968363136176, "grad_norm": 2.427417755126953, "learning_rate": 6.975921227081685e-07, "loss": 0.55977812, "memory(GiB)": 67.62, "step": 6905, "train_speed(iter/s)": 0.149232 }, { "acc": 0.8199255, "epoch": 4.752407152682256, "grad_norm": 2.2759101390838623, "learning_rate": 6.789248451655523e-07, "loss": 0.58387136, "memory(GiB)": 67.62, "step": 6910, "train_speed(iter/s)": 0.149131 }, { "acc": 0.82206144, "epoch": 4.755845942228335, "grad_norm": 2.231541395187378, "learning_rate": 6.605127572275894e-07, "loss": 0.59709778, "memory(GiB)": 67.62, "step": 6915, "train_speed(iter/s)": 0.149024 }, { "acc": 0.8175106, "epoch": 4.759284731774415, "grad_norm": 2.4362175464630127, "learning_rate": 6.423559541496492e-07, "loss": 0.6127625, "memory(GiB)": 67.62, "step": 6920, "train_speed(iter/s)": 0.148912 }, { "acc": 0.83411427, "epoch": 4.7627235213204955, "grad_norm": 2.0732574462890625, "learning_rate": 6.244545298663843e-07, "loss": 0.54563398, "memory(GiB)": 67.62, "step": 6925, "train_speed(iter/s)": 0.148809 }, { "acc": 0.8238575, "epoch": 4.766162310866575, "grad_norm": 2.174506187438965, "learning_rate": 6.068085769912308e-07, "loss": 0.58828888, "memory(GiB)": 67.62, "step": 6930, "train_speed(iter/s)": 0.148728 }, { "acc": 0.82762337, "epoch": 4.769601100412655, "grad_norm": 2.551449775695801, "learning_rate": 5.894181868159313e-07, "loss": 0.57614126, "memory(GiB)": 67.62, "step": 6935, "train_speed(iter/s)": 0.148607 }, { "acc": 0.82847862, "epoch": 4.773039889958735, "grad_norm": 2.242396354675293, "learning_rate": 5.722834493100845e-07, "loss": 0.58625593, "memory(GiB)": 67.62, "step": 6940, "train_speed(iter/s)": 0.148523 }, { "acc": 0.83427067, "epoch": 4.776478679504814, "grad_norm": 2.2920279502868652, "learning_rate": 5.554044531206463e-07, "loss": 0.55577106, "memory(GiB)": 67.62, "step": 6945, "train_speed(iter/s)": 0.148434 }, { "acc": 0.82505064, "epoch": 4.779917469050894, "grad_norm": 2.4490933418273926, "learning_rate": 5.387812855715081e-07, "loss": 0.57476597, "memory(GiB)": 67.62, "step": 6950, "train_speed(iter/s)": 0.148301 }, { "acc": 0.82180548, "epoch": 4.7833562585969736, "grad_norm": 2.4874212741851807, "learning_rate": 5.224140326630133e-07, "loss": 0.59430389, "memory(GiB)": 67.62, "step": 6955, "train_speed(iter/s)": 0.148156 }, { "acc": 0.81490593, "epoch": 4.786795048143054, "grad_norm": 2.1581063270568848, "learning_rate": 5.063027790715248e-07, "loss": 0.60423484, "memory(GiB)": 67.62, "step": 6960, "train_speed(iter/s)": 0.148057 }, { "acc": 0.82663193, "epoch": 4.790233837689134, "grad_norm": 2.1210756301879883, "learning_rate": 4.904476081489975e-07, "loss": 0.56228495, "memory(GiB)": 67.62, "step": 6965, "train_speed(iter/s)": 0.147956 }, { "acc": 0.83753424, "epoch": 4.793672627235213, "grad_norm": 2.065978527069092, "learning_rate": 4.7484860192252317e-07, "loss": 0.53960943, "memory(GiB)": 67.62, "step": 6970, "train_speed(iter/s)": 0.147817 }, { "acc": 0.83375235, "epoch": 4.797111416781293, "grad_norm": 2.488433837890625, "learning_rate": 4.595058410939305e-07, "loss": 0.55561361, "memory(GiB)": 67.62, "step": 6975, "train_speed(iter/s)": 0.147709 }, { "acc": 0.82256441, "epoch": 4.800550206327372, "grad_norm": 2.134580135345459, "learning_rate": 4.4441940503934173e-07, "loss": 0.59003773, "memory(GiB)": 67.62, "step": 6980, "train_speed(iter/s)": 0.147603 }, { "acc": 0.82549543, "epoch": 4.8039889958734525, "grad_norm": 2.2374000549316406, "learning_rate": 4.295893718088e-07, "loss": 0.57104192, "memory(GiB)": 67.62, "step": 6985, "train_speed(iter/s)": 0.147515 }, { "acc": 0.83225937, "epoch": 4.807427785419533, "grad_norm": 1.948536992073059, "learning_rate": 4.150158181258259e-07, "loss": 0.55912457, "memory(GiB)": 67.62, "step": 6990, "train_speed(iter/s)": 0.147431 }, { "acc": 0.81516037, "epoch": 4.810866574965612, "grad_norm": 2.495556354522705, "learning_rate": 4.0069881938703406e-07, "loss": 0.59933119, "memory(GiB)": 67.62, "step": 6995, "train_speed(iter/s)": 0.147319 }, { "acc": 0.82695866, "epoch": 4.814305364511692, "grad_norm": 2.299910545349121, "learning_rate": 3.866384496617616e-07, "loss": 0.58013859, "memory(GiB)": 67.62, "step": 7000, "train_speed(iter/s)": 0.147213 }, { "epoch": 4.814305364511692, "eval_acc": 0.7819543798083413, "eval_loss": 0.793637216091156, "eval_runtime": 1150.3821, "eval_samples_per_second": 3.723, "eval_steps_per_second": 0.067, "step": 7000 }, { "acc": 0.82657938, "epoch": 4.817744154057772, "grad_norm": 2.2365365028381348, "learning_rate": 3.7283478169165165e-07, "loss": 0.59894753, "memory(GiB)": 67.62, "step": 7005, "train_speed(iter/s)": 0.143635 }, { "acc": 0.82827587, "epoch": 4.821182943603851, "grad_norm": 2.287341833114624, "learning_rate": 3.592878868903036e-07, "loss": 0.56538892, "memory(GiB)": 67.62, "step": 7010, "train_speed(iter/s)": 0.143557 }, { "acc": 0.82892952, "epoch": 4.8246217331499315, "grad_norm": 2.37528920173645, "learning_rate": 3.459978353429071e-07, "loss": 0.56618586, "memory(GiB)": 67.62, "step": 7015, "train_speed(iter/s)": 0.143484 }, { "acc": 0.82298727, "epoch": 4.828060522696011, "grad_norm": 2.1354215145111084, "learning_rate": 3.3296469580584186e-07, "loss": 0.58705649, "memory(GiB)": 67.62, "step": 7020, "train_speed(iter/s)": 0.143374 }, { "acc": 0.81914625, "epoch": 4.831499312242091, "grad_norm": 2.2157156467437744, "learning_rate": 3.201885357063674e-07, "loss": 0.60606232, "memory(GiB)": 67.62, "step": 7025, "train_speed(iter/s)": 0.143299 }, { "acc": 0.82481365, "epoch": 4.83493810178817, "grad_norm": 2.350295066833496, "learning_rate": 3.076694211422452e-07, "loss": 0.59341784, "memory(GiB)": 67.62, "step": 7030, "train_speed(iter/s)": 0.14321 }, { "acc": 0.8327177, "epoch": 4.83837689133425, "grad_norm": 2.483370304107666, "learning_rate": 2.954074168814115e-07, "loss": 0.57141585, "memory(GiB)": 67.62, "step": 7035, "train_speed(iter/s)": 0.1431 }, { "acc": 0.82273092, "epoch": 4.84181568088033, "grad_norm": 2.237597942352295, "learning_rate": 2.8340258636162734e-07, "loss": 0.59980655, "memory(GiB)": 67.62, "step": 7040, "train_speed(iter/s)": 0.142991 }, { "acc": 0.81792231, "epoch": 4.8452544704264096, "grad_norm": 2.165174961090088, "learning_rate": 2.716549916901624e-07, "loss": 0.59414587, "memory(GiB)": 67.62, "step": 7045, "train_speed(iter/s)": 0.142869 }, { "acc": 0.83234692, "epoch": 4.84869325997249, "grad_norm": 2.2582786083221436, "learning_rate": 2.601646936434731e-07, "loss": 0.56242762, "memory(GiB)": 67.62, "step": 7050, "train_speed(iter/s)": 0.142752 }, { "acc": 0.82207642, "epoch": 4.85213204951857, "grad_norm": 2.386744737625122, "learning_rate": 2.4893175166689693e-07, "loss": 0.5899931, "memory(GiB)": 67.62, "step": 7055, "train_speed(iter/s)": 0.142651 }, { "acc": 0.83866978, "epoch": 4.855570839064649, "grad_norm": 2.4052698612213135, "learning_rate": 2.3795622387430887e-07, "loss": 0.52610168, "memory(GiB)": 67.62, "step": 7060, "train_speed(iter/s)": 0.14255 }, { "acc": 0.81752338, "epoch": 4.859009628610729, "grad_norm": 2.0953776836395264, "learning_rate": 2.272381670478657e-07, "loss": 0.60933762, "memory(GiB)": 67.62, "step": 7065, "train_speed(iter/s)": 0.142448 }, { "acc": 0.82258358, "epoch": 4.862448418156809, "grad_norm": 2.4188003540039062, "learning_rate": 2.1677763663768406e-07, "loss": 0.58760223, "memory(GiB)": 67.62, "step": 7070, "train_speed(iter/s)": 0.142309 }, { "acc": 0.82635889, "epoch": 4.8658872077028885, "grad_norm": 2.2215888500213623, "learning_rate": 2.0657468676155762e-07, "loss": 0.58528147, "memory(GiB)": 67.62, "step": 7075, "train_speed(iter/s)": 0.142201 }, { "acc": 0.84418049, "epoch": 4.869325997248969, "grad_norm": 2.0003366470336914, "learning_rate": 1.9662937020469589e-07, "loss": 0.51888628, "memory(GiB)": 67.62, "step": 7080, "train_speed(iter/s)": 0.142127 }, { "acc": 0.83303547, "epoch": 4.872764786795048, "grad_norm": 2.1851377487182617, "learning_rate": 1.8694173841941928e-07, "loss": 0.55756779, "memory(GiB)": 67.62, "step": 7085, "train_speed(iter/s)": 0.142023 }, { "acc": 0.82351046, "epoch": 4.876203576341128, "grad_norm": 2.3260505199432373, "learning_rate": 1.775118415249201e-07, "loss": 0.58764186, "memory(GiB)": 67.62, "step": 7090, "train_speed(iter/s)": 0.141935 }, { "acc": 0.82887058, "epoch": 4.879642365887207, "grad_norm": 2.2045719623565674, "learning_rate": 1.6833972830699635e-07, "loss": 0.56427956, "memory(GiB)": 67.62, "step": 7095, "train_speed(iter/s)": 0.141825 }, { "acc": 0.81958294, "epoch": 4.883081155433287, "grad_norm": 2.245159149169922, "learning_rate": 1.5942544621777965e-07, "loss": 0.60630999, "memory(GiB)": 67.62, "step": 7100, "train_speed(iter/s)": 0.141715 }, { "epoch": 4.883081155433287, "eval_acc": 0.7818137850362172, "eval_loss": 0.7931898832321167, "eval_runtime": 1152.8663, "eval_samples_per_second": 3.715, "eval_steps_per_second": 0.067, "step": 7100 }, { "acc": 0.83190765, "epoch": 4.8865199449793675, "grad_norm": 2.2760040760040283, "learning_rate": 1.507690413755244e-07, "loss": 0.56932721, "memory(GiB)": 67.62, "step": 7105, "train_speed(iter/s)": 0.138438 }, { "acc": 0.82073574, "epoch": 4.889958734525447, "grad_norm": 2.2572543621063232, "learning_rate": 1.423705585643412e-07, "loss": 0.59770269, "memory(GiB)": 67.62, "step": 7110, "train_speed(iter/s)": 0.138349 }, { "acc": 0.82008648, "epoch": 4.893397524071527, "grad_norm": 2.431645631790161, "learning_rate": 1.342300412339805e-07, "loss": 0.60884895, "memory(GiB)": 67.62, "step": 7115, "train_speed(iter/s)": 0.138261 }, { "acc": 0.83563404, "epoch": 4.896836313617607, "grad_norm": 2.210167646408081, "learning_rate": 1.2634753149959394e-07, "loss": 0.55552473, "memory(GiB)": 67.62, "step": 7120, "train_speed(iter/s)": 0.138175 }, { "acc": 0.83866234, "epoch": 4.900275103163686, "grad_norm": 2.1584184169769287, "learning_rate": 1.1872307014153448e-07, "loss": 0.5373682, "memory(GiB)": 67.62, "step": 7125, "train_speed(iter/s)": 0.138058 }, { "acc": 0.83324118, "epoch": 4.903713892709766, "grad_norm": 2.51465106010437, "learning_rate": 1.1135669660512879e-07, "loss": 0.54701567, "memory(GiB)": 67.62, "step": 7130, "train_speed(iter/s)": 0.137984 }, { "acc": 0.84189644, "epoch": 4.9071526822558456, "grad_norm": 2.2430858612060547, "learning_rate": 1.0424844900048863e-07, "loss": 0.52747626, "memory(GiB)": 67.62, "step": 7135, "train_speed(iter/s)": 0.137875 }, { "acc": 0.82826939, "epoch": 4.910591471801926, "grad_norm": 2.468977451324463, "learning_rate": 9.739836410229431e-08, "loss": 0.56382651, "memory(GiB)": 67.62, "step": 7140, "train_speed(iter/s)": 0.137804 }, { "acc": 0.82850714, "epoch": 4.914030261348006, "grad_norm": 2.1959378719329834, "learning_rate": 9.080647734961705e-08, "loss": 0.5642982, "memory(GiB)": 67.62, "step": 7145, "train_speed(iter/s)": 0.137706 }, { "acc": 0.80990505, "epoch": 4.917469050894085, "grad_norm": 2.1937224864959717, "learning_rate": 8.447282284574144e-08, "loss": 0.64270401, "memory(GiB)": 67.62, "step": 7150, "train_speed(iter/s)": 0.137623 }, { "acc": 0.83124857, "epoch": 4.920907840440165, "grad_norm": 2.274343967437744, "learning_rate": 7.839743335798222e-08, "loss": 0.58021183, "memory(GiB)": 67.62, "step": 7155, "train_speed(iter/s)": 0.137544 }, { "acc": 0.83879738, "epoch": 4.924346629986244, "grad_norm": 2.0642943382263184, "learning_rate": 7.258034031750108e-08, "loss": 0.55038834, "memory(GiB)": 67.62, "step": 7160, "train_speed(iter/s)": 0.137462 }, { "acc": 0.82772274, "epoch": 4.9277854195323245, "grad_norm": 2.236903429031372, "learning_rate": 6.702157381916804e-08, "loss": 0.57812862, "memory(GiB)": 67.62, "step": 7165, "train_speed(iter/s)": 0.137368 }, { "acc": 0.81524467, "epoch": 4.931224209078405, "grad_norm": 2.3664135932922363, "learning_rate": 6.172116262139473e-08, "loss": 0.59173594, "memory(GiB)": 67.62, "step": 7170, "train_speed(iter/s)": 0.137273 }, { "acc": 0.84409065, "epoch": 4.934662998624484, "grad_norm": 1.8912343978881836, "learning_rate": 5.66791341459791e-08, "loss": 0.51706591, "memory(GiB)": 67.62, "step": 7175, "train_speed(iter/s)": 0.13718 }, { "acc": 0.82883434, "epoch": 4.938101788170564, "grad_norm": 2.3647637367248535, "learning_rate": 5.189551447797223e-08, "loss": 0.57346845, "memory(GiB)": 67.62, "step": 7180, "train_speed(iter/s)": 0.137112 }, { "acc": 0.82722406, "epoch": 4.941540577716644, "grad_norm": 2.390969753265381, "learning_rate": 4.7370328365550553e-08, "loss": 0.58734665, "memory(GiB)": 67.62, "step": 7185, "train_speed(iter/s)": 0.137029 }, { "acc": 0.83363981, "epoch": 4.944979367262723, "grad_norm": 2.3286654949188232, "learning_rate": 4.3103599219855e-08, "loss": 0.55134306, "memory(GiB)": 67.62, "step": 7190, "train_speed(iter/s)": 0.13694 }, { "acc": 0.8398654, "epoch": 4.9484181568088035, "grad_norm": 1.95890474319458, "learning_rate": 3.909534911492433e-08, "loss": 0.52122355, "memory(GiB)": 67.62, "step": 7195, "train_speed(iter/s)": 0.136882 }, { "acc": 0.82601204, "epoch": 4.951856946354883, "grad_norm": 2.7101948261260986, "learning_rate": 3.534559878752308e-08, "loss": 0.58264699, "memory(GiB)": 67.62, "step": 7200, "train_speed(iter/s)": 0.13679 }, { "epoch": 4.951856946354883, "eval_acc": 0.7817575471273677, "eval_loss": 0.7930530309677124, "eval_runtime": 1110.3461, "eval_samples_per_second": 3.857, "eval_steps_per_second": 0.069, "step": 7200 }, { "acc": 0.83350286, "epoch": 4.955295735900963, "grad_norm": 2.281674861907959, "learning_rate": 3.185436763708053e-08, "loss": 0.55087848, "memory(GiB)": 67.62, "step": 7205, "train_speed(iter/s)": 0.13388 }, { "acc": 0.83256226, "epoch": 4.958734525447043, "grad_norm": 2.406829357147217, "learning_rate": 2.862167372556297e-08, "loss": 0.55789819, "memory(GiB)": 67.62, "step": 7210, "train_speed(iter/s)": 0.133821 }, { "acc": 0.81454487, "epoch": 4.962173314993122, "grad_norm": 2.2203316688537598, "learning_rate": 2.564753377737945e-08, "loss": 0.60484362, "memory(GiB)": 67.62, "step": 7215, "train_speed(iter/s)": 0.133748 }, { "acc": 0.82130527, "epoch": 4.965612104539202, "grad_norm": 2.130246162414551, "learning_rate": 2.2931963179320628e-08, "loss": 0.59843221, "memory(GiB)": 67.62, "step": 7220, "train_speed(iter/s)": 0.133637 }, { "acc": 0.83102131, "epoch": 4.9690508940852816, "grad_norm": 2.521017551422119, "learning_rate": 2.04749759804478e-08, "loss": 0.55911312, "memory(GiB)": 67.62, "step": 7225, "train_speed(iter/s)": 0.133538 }, { "acc": 0.81951447, "epoch": 4.972489683631362, "grad_norm": 2.495345115661621, "learning_rate": 1.8276584892048502e-08, "loss": 0.59946508, "memory(GiB)": 67.62, "step": 7230, "train_speed(iter/s)": 0.133454 }, { "acc": 0.83500395, "epoch": 4.975928473177442, "grad_norm": 2.169851541519165, "learning_rate": 1.6336801287547673e-08, "loss": 0.55714474, "memory(GiB)": 67.62, "step": 7235, "train_speed(iter/s)": 0.133371 }, { "acc": 0.83105001, "epoch": 4.979367262723521, "grad_norm": 1.9003541469573975, "learning_rate": 1.4655635202457724e-08, "loss": 0.56020293, "memory(GiB)": 67.62, "step": 7240, "train_speed(iter/s)": 0.133301 }, { "acc": 0.82036457, "epoch": 4.982806052269601, "grad_norm": 2.2826859951019287, "learning_rate": 1.3233095334339681e-08, "loss": 0.5854476, "memory(GiB)": 67.62, "step": 7245, "train_speed(iter/s)": 0.133207 }, { "acc": 0.82185326, "epoch": 4.986244841815681, "grad_norm": 2.5508041381835938, "learning_rate": 1.2069189042725465e-08, "loss": 0.58682165, "memory(GiB)": 67.62, "step": 7250, "train_speed(iter/s)": 0.133133 }, { "acc": 0.83299255, "epoch": 4.9896836313617605, "grad_norm": 2.2958316802978516, "learning_rate": 1.1163922349123454e-08, "loss": 0.54637289, "memory(GiB)": 67.62, "step": 7255, "train_speed(iter/s)": 0.133052 }, { "acc": 0.81462727, "epoch": 4.993122420907841, "grad_norm": 2.2949371337890625, "learning_rate": 1.051729993694077e-08, "loss": 0.60125666, "memory(GiB)": 67.62, "step": 7260, "train_speed(iter/s)": 0.132965 }, { "acc": 0.83669167, "epoch": 4.99656121045392, "grad_norm": 2.335374593734741, "learning_rate": 1.0129325151499931e-08, "loss": 0.51913919, "memory(GiB)": 67.62, "step": 7265, "train_speed(iter/s)": 0.132889 }, { "acc": 0.83688688, "epoch": 5.0, "grad_norm": 2.2776167392730713, "learning_rate": 1e-08, "loss": 0.54480848, "memory(GiB)": 67.62, "step": 7270, "train_speed(iter/s)": 0.132804 }, { "epoch": 5.0, "eval_acc": 0.781712556800288, "eval_loss": 0.793134868144989, "eval_runtime": 1106.3573, "eval_samples_per_second": 3.871, "eval_steps_per_second": 0.07, "step": 7270 } ], "logging_steps": 5, "max_steps": 7270, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.939618530083786e+19, "train_batch_size": 14, "trial_name": null, "trial_params": null }