taylanbil · July 20, 2020 19:51
diff --git a/fairseq-trainslation-gpu-1bucket-loss.txt b/fairseq-trainslation-gpu-1bucket-loss.txt
 2020-07-20 19:30:27 | INFO | train_inner | epoch 001:    100 / 648283 loss=15.175, ppl=36985.3, wps=1680.2, ups=6.93, wpb=242.4, bsz=8, num_updates=100, lr=1e-06, gnorm=8.917, loss_scale=128, train_wall=15, wall=95
 RAWLOSS @ 200 tensor(1351.2732, device='cuda:0')
 2020-07-20 19:30:42 | INFO | train_inner | epoch 001:    200 / 648283 loss=13.532, ppl=11843.4, wps=1672.1, ups=6.92, wpb=241.6, bsz=8, num_updates=200, lr=2e-06, gnorm=6.148, loss_scale=128, train_wall=14, wall=110
 RAWLOSS @ 300 tensor(2289.2922, device='cuda:0')
 2020-07-20 19:30:56 | INFO | train_inner | epoch 001:    300 / 648283 loss=12.885, ppl=7566.95, wps=1622.2, ups=6.92, wpb=234.5, bsz=8, num_updates=300, lr=3e-06, gnorm=5.264, loss_scale=128, train_wall=14, wall=124
 RAWLOSS @ 400 tensor(1715.7347, device='cuda:0')
 2020-07-20 19:31:11 | INFO | train_inner | epoch 001:    400 / 648283 loss=12.564, ppl=6055.17, wps=1642.5, ups=6.92, wpb=237.4, bsz=8, num_updates=400, lr=4e-06, gnorm=4.725, loss_scale=128, train_wall=14, wall=139
 RAWLOSS @ 500 tensor(2465.9827, device='cuda:0')
 2020-07-20 19:31:25 | INFO | train_inner | epoch 001:    500 / 648283 loss=12.171, ppl=4612.85, wps=1613.5, ups=6.88, wpb=234.5, bsz=8, num_updates=500, lr=5e-06, gnorm=4.369, loss_scale=128, train_wall=14, wall=153
 RAWLOSS @ 600 tensor(2054.0337, device='cuda:0')
 2020-07-20 19:31:40 | INFO | train_inner | epoch 001:    600 / 648283 loss=11.87, ppl=3742.86, wps=1668.3, ups=6.91, wpb=241.4, bsz=8, num_updates=600, lr=6e-06, gnorm=4.025, loss_scale=128, train_wall=14, wall=168
 RAWLOSS @ 700 tensor(1702.4202, device='cuda:0')
 2020-07-20 19:31:54 | INFO | train_inner | epoch 001:    700 / 648283 loss=11.598, ppl=3100.37, wps=1625.6, ups=6.91, wpb=235.3, bsz=8, num_updates=700, lr=7e-06, gnorm=3.855, loss_scale=128, train_wall=14, wall=182
 RAWLOSS @ 800 tensor(1390.9583, device='cuda:0')
 2020-07-20 19:32:09 | INFO | train_inner | epoch 001:    800 / 648283 loss=11.283, ppl=2491.04, wps=1632.2, ups=6.91, wpb=236.1, bsz=8, num_updates=800, lr=8e-06, gnorm=3.714, loss_scale=128, train_wall=14, wall=196
 RAWLOSS @ 900 tensor(1542.2812, device='cuda:0')
 2020-07-20 19:32:23 | INFO | train_inner | epoch 001:    900 / 648283 loss=11.052, ppl=2123.46, wps=1692.6, ups=6.91, wpb=244.9, bsz=8, num_updates=900, lr=9e-06, gnorm=3.659, loss_scale=128, train_wall=14, wall=211
 RAWLOSS @ 1000 tensor(1371.7253, device='cuda:0')
 2020-07-20 19:32:38 | INFO | train_inner | epoch 001:   1000 / 648283 loss=10.961, ppl=1992.71, wps=1660.7, ups=6.92, wpb=240.1, bsz=8, num_updates=1000, lr=1e-05, gnorm=3.738, loss_scale=128, train_wall=14, wall=225
 RAWLOSS @ 1100 tensor(1693.6879, device='cuda:0')
 2020-07-20 19:32:52 | INFO | train_inner | epoch 001:   1100 / 648283 loss=10.813, ppl=1799.33, wps=1600.9, ups=6.91, wpb=231.8, bsz=8, num_updates=1100, lr=1.1e-05, gnorm=3.74, loss_scale=128, train_wall=14, wall=240
 RAWLOSS @ 1200 tensor(1228.1172, device='cuda:0')
 2020-07-20 19:33:07 | INFO | train_inner | epoch 001:   1200 / 648283 loss=10.731, ppl=1700.03, wps=1592.4, ups=6.91, wpb=230.3, bsz=8, num_updates=1200, lr=1.2e-05, gnorm=3.842, loss_scale=128, train_wall=14, wall=254
 RAWLOSS @ 1300 tensor(1328.1652, device='cuda:0')
 2020-07-20 19:33:21 | INFO | train_inner | epoch 001:   1300 / 648283 loss=10.609, ppl=1561.78, wps=1678.9, ups=6.9, wpb=243.2, bsz=8, num_updates=1300, lr=1.3e-05, gnorm=3.824, loss_scale=128, train_wall=14, wall=269
 RAWLOSS @ 1400 tensor(1641.3103, device='cuda:0')
 2020-07-20 19:33:36 | INFO | train_inner | epoch 001:   1400 / 648283 loss=10.597, ppl=1548.56, wps=1639.7, ups=6.89, wpb=237.8, bsz=8, num_updates=1400, lr=1.4e-05, gnorm=3.972, loss_scale=128, train_wall=14, wall=283
 RAWLOSS @ 1500 tensor(1422.3258, device='cuda:0')
 2020-07-20 19:33:50 | INFO | train_inner | epoch 001:   1500 / 648283 loss=10.524, ppl=1472.3, wps=1674.4, ups=6.9, wpb=242.6, bsz=8, num_updates=1500, lr=1.5e-05, gnorm=4.026, loss_scale=128, train_wall=14, wall=298
 RAWLOSS @ 1600 tensor(1923.9146, device='cuda:0')
 2020-07-20 19:34:05 | INFO | train_inner | epoch 001:   1600 / 648283 loss=10.502, ppl=1449.97, wps=1603.7, ups=6.89, wpb=232.6, bsz=8, num_updates=1600, lr=1.6e-05, gnorm=4.119, loss_scale=128, train_wall=14, wall=312
 RAWLOSS @ 1700 tensor(1688.3428, device='cuda:0')
 2020-07-20 19:34:19 | INFO | train_inner | epoch 001:   1700 / 648283 loss=10.486, ppl=1434.22, wps=1667.3, ups=6.9, wpb=241.6, bsz=8, num_updates=1700, lr=1.7e-05, gnorm=4.013, loss_scale=128, train_wall=14, wall=327
 RAWLOSS @ 1800 tensor(1570.4878, device='cuda:0')
 2020-07-20 19:34:34 | INFO | train_inner | epoch 001:   1800 / 648283 loss=10.356, ppl=1310.82, wps=1613.9, ups=6.9, wpb=233.8, bsz=8, num_updates=1800, lr=1.8e-05, gnorm=4.107, loss_scale=128, train_wall=14, wall=341
 RAWLOSS @ 1900 tensor(1828.7078, device='cuda:0')
 2020-07-20 19:34:48 | INFO | train_inner | epoch 001:   1900 / 648283 loss=10.368, ppl=1321.44, wps=1606.2, ups=6.91, wpb=232.4, bsz=8, num_updates=1900, lr=1.9e-05, gnorm=4.235, loss_scale=128, train_wall=14, wall=356
 RAWLOSS @ 2000 tensor(1782.3804, device='cuda:0')
 2020-07-20 19:35:03 | INFO | train_inner | epoch 001:   2000 / 648283 loss=10.34, ppl=1296.37, wps=1628.9, ups=6.9, wpb=236.1, bsz=8, num_updates=2000, lr=2e-05, gnorm=4.082, loss_scale=128, train_wall=14, wall=370
 RAWLOSS @ 2100 tensor(2344.2183, device='cuda:0')
 2020-07-20 19:35:17 | INFO | train_inner | epoch 001:   2100 / 648283 loss=10.377, ppl=1329.89, wps=1634.2, ups=6.91, wpb=236.4, bsz=8, num_updates=2100, lr=2.1e-05, gnorm=4.098, loss_scale=128, train_wall=14, wall=385
 RAWLOSS @ 2200 tensor(1709.7332, device='cuda:0')
 2020-07-20 19:35:31 | INFO | train_inner | epoch 001:   2200 / 648283 loss=10.287, ppl=1248.99, wps=1698.8, ups=6.91, wpb=245.8, bsz=8, num_updates=2200, lr=2.2e-05, gnorm=4.092, loss_scale=128, train_wall=14, wall=399
 RAWLOSS @ 2300 tensor(1072.0853, device='cuda:0')
 2020-07-20 19:35:46 | INFO | train_inner | epoch 001:   2300 / 648283 loss=10.127, ppl=1118.58, wps=1593.4, ups=6.91, wpb=230.6, bsz=8, num_updates=2300, lr=2.3e-05, gnorm=4.083, loss_scale=128, train_wall=14, wall=414
 RAWLOSS @ 2400 tensor(1800.6949, device='cuda:0')
 2020-07-20 19:36:00 | INFO | train_inner | epoch 001:   2400 / 648283 loss=10.287, ppl=1249.09, wps=1668.2, ups=6.9, wpb=241.6, bsz=8, num_updates=2400, lr=2.4e-05, gnorm=3.975, loss_scale=128, train_wall=14, wall=428
 RAWLOSS @ 2500 tensor(1392.2799, device='cuda:0')
 2020-07-20 19:36:15 | INFO | train_inner | epoch 001:   2500 / 648283 loss=10.071, ppl=1075.89, wps=1627.9, ups=6.91, wpb=235.7, bsz=8, num_updates=2500, lr=2.5e-05, gnorm=3.887, loss_scale=128, train_wall=14, wall=443
 RAWLOSS @ 2600 tensor(1319.8795, device='cuda:0')
 2020-07-20 19:36:29 | INFO | train_inner | epoch 001:   2600 / 648283 loss=10.063, ppl=1069.83, wps=1709.6, ups=6.92, wpb=247.2, bsz=8, num_updates=2600, lr=2.6e-05, gnorm=3.872, loss_scale=128, train_wall=14, wall=457
 RAWLOSS @ 2700 tensor(1346.2390, device='cuda:0')
 2020-07-20 19:36:44 | INFO | train_inner | epoch 001:   2700 / 648283 loss=9.965, ppl=999.25, wps=1614.2, ups=6.9, wpb=234, bsz=8, num_updates=2700, lr=2.7e-05, gnorm=3.811, loss_scale=128, train_wall=14, wall=472
 RAWLOSS @ 2800 tensor(1433.3136, device='cuda:0')
 2020-07-20 19:36:58 | INFO | train_inner | epoch 001:   2800 / 648283 loss=10.003, ppl=1025.93, wps=1670.2, ups=6.92, wpb=241.4, bsz=8, num_updates=2800, lr=2.8e-05, gnorm=3.786, loss_scale=128, train_wall=14, wall=486
 RAWLOSS @ 2900 tensor(1693.1978, device='cuda:0')
 2020-07-20 19:37:13 | INFO | train_inner | epoch 001:   2900 / 648283 loss=9.855, ppl=926.31, wps=1597.9, ups=6.92, wpb=230.9, bsz=8, num_updates=2900, lr=2.9e-05, gnorm=3.8, loss_scale=128, train_wall=14, wall=500
 RAWLOSS @ 3000 tensor(867.2125, device='cuda:0')
 2020-07-20 19:37:27 | INFO | train_inner | epoch 001:   3000 / 648283 loss=9.838, ppl=915.06, wps=1638.5, ups=6.91, wpb=237.3, bsz=8, num_updates=3000, lr=3e-05, gnorm=3.72, loss_scale=128, train_wall=14, wall=515
 RAWLOSS @ 3100 tensor(1706.2174, device='cuda:0')
 2020-07-20 19:37:42 | INFO | train_inner | epoch 001:   3100 / 648283 loss=9.704, ppl=834.26, wps=1642.4, ups=6.87, wpb=239.2, bsz=8, num_updates=3100, lr=3.1e-05, gnorm=3.666, loss_scale=128, train_wall=14, wall=530
 RAWLOSS @ 3200 tensor(1676.3699, device='cuda:0')
 2020-07-20 19:37:56 | INFO | train_inner | epoch 001:   3200 / 648283 loss=9.703, ppl=833.55, wps=1614.9, ups=6.86, wpb=235.3, bsz=8, num_updates=3200, lr=3.2e-05, gnorm=3.712, loss_scale=128, train_wall=14, wall=544
 RAWLOSS @ 3300 tensor(1000.9597, device='cuda:0')
 2020-07-20 19:38:11 | INFO | train_inner | epoch 001:   3300 / 648283 loss=9.783, ppl=881.1, wps=1613.2, ups=6.88, wpb=234.4, bsz=8, num_updates=3300, lr=3.3e-05, gnorm=3.647, loss_scale=128, train_wall=14, wall=559
 RAWLOSS @ 3400 tensor(1666.6620, device='cuda:0')
 2020-07-20 19:38:25 | INFO | train_inner | epoch 001:   3400 / 648283 loss=9.647, ppl=801.95, wps=1664.7, ups=6.91, wpb=241, bsz=8, num_updates=3400, lr=3.4e-05, gnorm=3.624, loss_scale=128, train_wall=14, wall=573
 RAWLOSS @ 3500 tensor(1426.3322, device='cuda:0')
 2020-07-20 19:38:40 | INFO | train_inner | epoch 001:   3500 / 648283 loss=9.69, ppl=826.04, wps=1596.3, ups=6.91, wpb=231.1, bsz=8, num_updates=3500, lr=3.5e-05, gnorm=3.604, loss_scale=128, train_wall=14, wall=588
 RAWLOSS @ 3600 tensor(1634.4712, device='cuda:0')
 2020-07-20 19:38:54 | INFO | train_inner | epoch 001:   3600 / 648283 loss=9.622, ppl=787.92, wps=1638.5, ups=6.91, wpb=237, bsz=8, num_updates=3600, lr=3.6e-05, gnorm=3.591, loss_scale=128, train_wall=14, wall=602
 RAWLOSS @ 3700 tensor(1551.6877, device='cuda:0')
 2020-07-20 19:39:09 | INFO | train_inner | epoch 001:   3700 / 648283 loss=9.496, ppl=722.11, wps=1708.6, ups=6.92, wpb=247, bsz=8, num_updates=3700, lr=3.7e-05, gnorm=3.442, loss_scale=128, train_wall=14, wall=617
 RAWLOSS @ 3800 tensor(1543.4656, device='cuda:0')
 2020-07-20 19:39:23 | INFO | train_inner | epoch 001:   3800 / 648283 loss=9.584, ppl=767.5, wps=1631.1, ups=6.92, wpb=235.8, bsz=8, num_updates=3800, lr=3.8e-05, gnorm=3.499, loss_scale=128, train_wall=14, wall=631
 RAWLOSS @ 3900 tensor(1488.4733, device='cuda:0')
 2020-07-20 19:39:38 | INFO | train_inner | epoch 001:   3900 / 648283 loss=9.525, ppl=736.96, wps=1601.9, ups=6.92, wpb=231.6, bsz=8, num_updates=3900, lr=3.9e-05, gnorm=3.558, loss_scale=128, train_wall=14, wall=645
 RAWLOSS @ 4000 tensor(1679.9448, device='cuda:0')
 2020-07-20 19:39:52 | INFO | train_inner | epoch 001:   4000 / 648283 loss=9.531, ppl=739.75, wps=1665.1, ups=6.92, wpb=240.8, bsz=8, num_updates=4000, lr=4e-05, gnorm=3.496, loss_scale=128, train_wall=14, wall=660
 RAWLOSS @ 4100 tensor(1940.9502, device='cuda:0')
 2020-07-20 19:40:07 | INFO | train_inner | epoch 001:   4100 / 648283 loss=9.396, ppl=673.76, wps=1640.9, ups=6.89, wpb=238, bsz=8, num_updates=4100, lr=4.1e-05, gnorm=3.467, loss_scale=128, train_wall=14, wall=674
 RAWLOSS @ 4200 tensor(1738.5826, device='cuda:0')
 2020-07-20 19:40:21 | INFO | train_inner | epoch 001:   4200 / 648283 loss=9.444, ppl=696.43, wps=1621.1, ups=6.9, wpb=234.9, bsz=8, num_updates=4200, lr=4.2e-05, gnorm=3.578, loss_scale=128, train_wall=14, wall=689
 RAWLOSS @ 4300 tensor(1848.1174, device='cuda:0')
 2020-07-20 19:40:36 | INFO | train_inner | epoch 001:   4300 / 648283 loss=9.407, ppl=678.78, wps=1628, ups=6.9, wpb=236, bsz=8, num_updates=4300, lr=4.3e-05, gnorm=3.476, loss_scale=128, train_wall=14, wall=703
 RAWLOSS @ 4400 tensor(2160.4170, device='cuda:0')
 2020-07-20 19:40:50 | INFO | train_inner | epoch 001:   4400 / 648283 loss=9.292, ppl=626.93, wps=1629.1, ups=6.91, wpb=235.9, bsz=8, num_updates=4400, lr=4.4e-05, gnorm=3.461, loss_scale=128, train_wall=14, wall=718
 RAWLOSS @ 4500 tensor(1741.8374, device='cuda:0')
 2020-07-20 19:41:05 | INFO | train_inner | epoch 001:   4500 / 648283 loss=9.252, ppl=609.9, wps=1638.2, ups=6.91, wpb=237.1, bsz=8, num_updates=4500, lr=4.5e-05, gnorm=3.434, loss_scale=128, train_wall=14, wall=732
 RAWLOSS @ 4600 tensor(1013.6898, device='cuda:0')
 2020-07-20 19:41:19 | INFO | train_inner | epoch 001:   4600 / 648283 loss=9.289, ppl=625.51, wps=1618.4, ups=6.91, wpb=234.2, bsz=8, num_updates=4600, lr=4.6e-05, gnorm=3.489, loss_scale=128, train_wall=14, wall=747
 RAWLOSS @ 4700 tensor(1215.4442, device='cuda:0')
 2020-07-20 19:41:34 | INFO | train_inner | epoch 001:   4700 / 648283 loss=9.161, ppl=572.44, wps=1595.7, ups=6.92, wpb=230.6, bsz=8, num_updates=4700, lr=4.7e-05, gnorm=3.487, loss_scale=128, train_wall=14, wall=761
 RAWLOSS @ 4800 tensor(1686.5798, device='cuda:0')
 2020-07-20 19:41:48 | INFO | train_inner | epoch 001:   4800 / 648283 loss=9.195, ppl=585.98, wps=1663.3, ups=6.91, wpb=240.9, bsz=8, num_updates=4800, lr=4.8e-05, gnorm=3.403, loss_scale=128, train_wall=14, wall=776
 RAWLOSS @ 4900 tensor(1270.6083, device='cuda:0')
 2020-07-20 19:42:02 | INFO | train_inner | epoch 001:   4900 / 648283 loss=9.156, ppl=570.28, wps=1597.5, ups=6.91, wpb=231.1, bsz=8, num_updates=4900, lr=4.9e-05, gnorm=3.499, loss_scale=128, train_wall=14, wall=790
 RAWLOSS @ 5000 tensor(1934.5712, device='cuda:0')
 2020-07-20 19:42:17 | INFO | train_inner | epoch 001:   5000 / 648283 loss=9.077, ppl=540.15, wps=1633.7, ups=6.92, wpb=236.1, bsz=8, num_updates=5000, lr=5e-05, gnorm=3.412, loss_scale=128, train_wall=14, wall=805
 RAWLOSS @ 5100 tensor(1389.7025, device='cuda:0')
 RAWLOSS @ 5200 tensor(1521.7289, device='cuda:0')
 RAWLOSS @ 5300 tensor(986.0732, device='cuda:0')
 RAWLOSS @ 5400 tensor(1238.0280, device='cuda:0')
 RAWLOSS @ 5500 tensor(1070.5186, device='cuda:0')
 RAWLOSS @ 5600 tensor(2137.5652, device='cuda:0')
 RAWLOSS @ 5700 tensor(1708.1031, device='cuda:0')
 RAWLOSS @ 5800 tensor(1423.8556, device='cuda:0')
 RAWLOSS @ 5900 tensor(1369.5192, device='cuda:0')
 RAWLOSS @ 6000 tensor(1245.5112, device='cuda:0')
 RAWLOSS @ 6100 tensor(2512.7710, device='cuda:0')
 RAWLOSS @ 6200 tensor(1891.8496, device='cuda:0')
 RAWLOSS @ 6300 tensor(1130.3253, device='cuda:0')
 RAWLOSS @ 6400 tensor(961.2222, device='cuda:0')
 RAWLOSS @ 6500 tensor(1826.8185, device='cuda:0')
 RAWLOSS @ 6600 tensor(668.8315, device='cuda:0')
 RAWLOSS @ 6700 tensor(1368.3977, device='cuda:0')
 RAWLOSS @ 6800 tensor(2059.0720, device='cuda:0')
 RAWLOSS @ 6900 tensor(1417.4907, device='cuda:0')
 RAWLOSS @ 7000 tensor(1296.0875, device='cuda:0')
 RAWLOSS @ 7100 tensor(1533.7075, device='cuda:0')
 RAWLOSS @ 7200 tensor(1195.6879, device='cuda:0')
 RAWLOSS @ 7300 tensor(1629.7560, device='cuda:0')
 RAWLOSS @ 7400 tensor(1123.7363, device='cuda:0')
 RAWLOSS @ 7500 tensor(1551.4454, device='cuda:0')
 RAWLOSS @ 7600 tensor(1700.6863, device='cuda:0')
 RAWLOSS @ 7700 tensor(1107.6963, device='cuda:0')
 RAWLOSS @ 7800 tensor(1450.6442, device='cuda:0')
 RAWLOSS @ 7900 tensor(1845.3767, device='cuda:0')
 RAWLOSS @ 8000 tensor(1751.1384, device='cuda:0')
 RAWLOSS @ 8100 tensor(1534.0872, device='cuda:0')
 RAWLOSS @ 8200 tensor(1527.2074, device='cuda:0')
 RAWLOSS @ 8300 tensor(2236.2222, device='cuda:0')
 RAWLOSS @ 8400 tensor(1125.7655, device='cuda:0')
 RAWLOSS @ 8500 tensor(1681.7218, device='cuda:0')
 RAWLOSS @ 8600 tensor(1828.4143, device='cuda:0')
 RAWLOSS @ 8700 tensor(1644.8458, device='cuda:0')
 RAWLOSS @ 8800 tensor(1691.0027, device='cuda:0')
 RAWLOSS @ 8900 tensor(1959.2830, device='cuda:0')
 RAWLOSS @ 9000 tensor(1217.1136, device='cuda:0')
 RAWLOSS @ 9100 tensor(1144.0026, device='cuda:0')
 RAWLOSS @ 9200 tensor(1399.2024, device='cuda:0')
 RAWLOSS @ 9300 tensor(1251.1783, device='cuda:0')
 RAWLOSS @ 9400 tensor(1406.9491, device='cuda:0')
 RAWLOSS @ 9500 tensor(937.4207, device='cuda:0')
 RAWLOSS @ 9600 tensor(1411.9232, device='cuda:0')
 RAWLOSS @ 9700 tensor(1632.3060, device='cuda:0')
 RAWLOSS @ 9800 tensor(2037.6841, device='cuda:0')
 RAWLOSS @ 9900 tensor(1467.7316, device='cuda:0')
 RAWLOSS @ 10000 tensor(1566.2866, device='cuda:0')
 RAWLOSS @ 10100 tensor(1275.3107, device='cuda:0')
 RAWLOSS @ 10200 tensor(1482.7819, device='cuda:0')
 RAWLOSS @ 10300 tensor(1025.0098, device='cuda:0')
 RAWLOSS @ 10400 tensor(1641.3466, device='cuda:0')
 RAWLOSS @ 10500 tensor(1740.3737, device='cuda:0')
 RAWLOSS @ 10600 tensor(1478.7198, device='cuda:0')
 RAWLOSS @ 10700 tensor(1440.9719, device='cuda:0')
 RAWLOSS @ 10800 tensor(1178.1041, device='cuda:0')
 RAWLOSS @ 10900 tensor(1759.4432, device='cuda:0')
 RAWLOSS @ 11000 tensor(2279.4111, device='cuda:0')
 RAWLOSS @ 11100 tensor(1960.0864, device='cuda:0')
 RAWLOSS @ 11200 tensor(1575.8950, device='cuda:0')
 RAWLOSS @ 11300 tensor(1673.9479, device='cuda:0')
 RAWLOSS @ 11400 tensor(1403.0437, device='cuda:0')
 RAWLOSS @ 11500 tensor(2001.8221, device='cuda:0')
 2020-07-20 19:46:22 | INFO | valid | epoch 001 | valid on 'valid' subset | loss 9.117 | ppl 555.32 | wps 6327.2 | wpb 236.3 | bsz 8 | num_updates 5000
	2020-07-20 19:30:27 \| INFO \| train_inner \| epoch 001: 100 / 648283 loss=15.175, ppl=36985.3, wps=1680.2, ups=6.93, wpb=242.4, bsz=8, num_updates=100, lr=1e-06, gnorm=8.917, loss_scale=128, train_wall=15, wall=95
	RAWLOSS @ 200 tensor(1351.2732, device='cuda:0')
	2020-07-20 19:30:42 \| INFO \| train_inner \| epoch 001: 200 / 648283 loss=13.532, ppl=11843.4, wps=1672.1, ups=6.92, wpb=241.6, bsz=8, num_updates=200, lr=2e-06, gnorm=6.148, loss_scale=128, train_wall=14, wall=110
	RAWLOSS @ 300 tensor(2289.2922, device='cuda:0')
	2020-07-20 19:30:56 \| INFO \| train_inner \| epoch 001: 300 / 648283 loss=12.885, ppl=7566.95, wps=1622.2, ups=6.92, wpb=234.5, bsz=8, num_updates=300, lr=3e-06, gnorm=5.264, loss_scale=128, train_wall=14, wall=124
	RAWLOSS @ 400 tensor(1715.7347, device='cuda:0')
	2020-07-20 19:31:11 \| INFO \| train_inner \| epoch 001: 400 / 648283 loss=12.564, ppl=6055.17, wps=1642.5, ups=6.92, wpb=237.4, bsz=8, num_updates=400, lr=4e-06, gnorm=4.725, loss_scale=128, train_wall=14, wall=139
	RAWLOSS @ 500 tensor(2465.9827, device='cuda:0')
	2020-07-20 19:31:25 \| INFO \| train_inner \| epoch 001: 500 / 648283 loss=12.171, ppl=4612.85, wps=1613.5, ups=6.88, wpb=234.5, bsz=8, num_updates=500, lr=5e-06, gnorm=4.369, loss_scale=128, train_wall=14, wall=153
	RAWLOSS @ 600 tensor(2054.0337, device='cuda:0')
	2020-07-20 19:31:40 \| INFO \| train_inner \| epoch 001: 600 / 648283 loss=11.87, ppl=3742.86, wps=1668.3, ups=6.91, wpb=241.4, bsz=8, num_updates=600, lr=6e-06, gnorm=4.025, loss_scale=128, train_wall=14, wall=168
	RAWLOSS @ 700 tensor(1702.4202, device='cuda:0')
	2020-07-20 19:31:54 \| INFO \| train_inner \| epoch 001: 700 / 648283 loss=11.598, ppl=3100.37, wps=1625.6, ups=6.91, wpb=235.3, bsz=8, num_updates=700, lr=7e-06, gnorm=3.855, loss_scale=128, train_wall=14, wall=182
	RAWLOSS @ 800 tensor(1390.9583, device='cuda:0')
	2020-07-20 19:32:09 \| INFO \| train_inner \| epoch 001: 800 / 648283 loss=11.283, ppl=2491.04, wps=1632.2, ups=6.91, wpb=236.1, bsz=8, num_updates=800, lr=8e-06, gnorm=3.714, loss_scale=128, train_wall=14, wall=196
	RAWLOSS @ 900 tensor(1542.2812, device='cuda:0')
	2020-07-20 19:32:23 \| INFO \| train_inner \| epoch 001: 900 / 648283 loss=11.052, ppl=2123.46, wps=1692.6, ups=6.91, wpb=244.9, bsz=8, num_updates=900, lr=9e-06, gnorm=3.659, loss_scale=128, train_wall=14, wall=211
	RAWLOSS @ 1000 tensor(1371.7253, device='cuda:0')
	2020-07-20 19:32:38 \| INFO \| train_inner \| epoch 001: 1000 / 648283 loss=10.961, ppl=1992.71, wps=1660.7, ups=6.92, wpb=240.1, bsz=8, num_updates=1000, lr=1e-05, gnorm=3.738, loss_scale=128, train_wall=14, wall=225
	RAWLOSS @ 1100 tensor(1693.6879, device='cuda:0')
	2020-07-20 19:32:52 \| INFO \| train_inner \| epoch 001: 1100 / 648283 loss=10.813, ppl=1799.33, wps=1600.9, ups=6.91, wpb=231.8, bsz=8, num_updates=1100, lr=1.1e-05, gnorm=3.74, loss_scale=128, train_wall=14, wall=240
	RAWLOSS @ 1200 tensor(1228.1172, device='cuda:0')
	2020-07-20 19:33:07 \| INFO \| train_inner \| epoch 001: 1200 / 648283 loss=10.731, ppl=1700.03, wps=1592.4, ups=6.91, wpb=230.3, bsz=8, num_updates=1200, lr=1.2e-05, gnorm=3.842, loss_scale=128, train_wall=14, wall=254
	RAWLOSS @ 1300 tensor(1328.1652, device='cuda:0')
	2020-07-20 19:33:21 \| INFO \| train_inner \| epoch 001: 1300 / 648283 loss=10.609, ppl=1561.78, wps=1678.9, ups=6.9, wpb=243.2, bsz=8, num_updates=1300, lr=1.3e-05, gnorm=3.824, loss_scale=128, train_wall=14, wall=269
	RAWLOSS @ 1400 tensor(1641.3103, device='cuda:0')
	2020-07-20 19:33:36 \| INFO \| train_inner \| epoch 001: 1400 / 648283 loss=10.597, ppl=1548.56, wps=1639.7, ups=6.89, wpb=237.8, bsz=8, num_updates=1400, lr=1.4e-05, gnorm=3.972, loss_scale=128, train_wall=14, wall=283
	RAWLOSS @ 1500 tensor(1422.3258, device='cuda:0')
	2020-07-20 19:33:50 \| INFO \| train_inner \| epoch 001: 1500 / 648283 loss=10.524, ppl=1472.3, wps=1674.4, ups=6.9, wpb=242.6, bsz=8, num_updates=1500, lr=1.5e-05, gnorm=4.026, loss_scale=128, train_wall=14, wall=298
	RAWLOSS @ 1600 tensor(1923.9146, device='cuda:0')
	2020-07-20 19:34:05 \| INFO \| train_inner \| epoch 001: 1600 / 648283 loss=10.502, ppl=1449.97, wps=1603.7, ups=6.89, wpb=232.6, bsz=8, num_updates=1600, lr=1.6e-05, gnorm=4.119, loss_scale=128, train_wall=14, wall=312
	RAWLOSS @ 1700 tensor(1688.3428, device='cuda:0')
	2020-07-20 19:34:19 \| INFO \| train_inner \| epoch 001: 1700 / 648283 loss=10.486, ppl=1434.22, wps=1667.3, ups=6.9, wpb=241.6, bsz=8, num_updates=1700, lr=1.7e-05, gnorm=4.013, loss_scale=128, train_wall=14, wall=327
	RAWLOSS @ 1800 tensor(1570.4878, device='cuda:0')
	2020-07-20 19:34:34 \| INFO \| train_inner \| epoch 001: 1800 / 648283 loss=10.356, ppl=1310.82, wps=1613.9, ups=6.9, wpb=233.8, bsz=8, num_updates=1800, lr=1.8e-05, gnorm=4.107, loss_scale=128, train_wall=14, wall=341
	RAWLOSS @ 1900 tensor(1828.7078, device='cuda:0')
	2020-07-20 19:34:48 \| INFO \| train_inner \| epoch 001: 1900 / 648283 loss=10.368, ppl=1321.44, wps=1606.2, ups=6.91, wpb=232.4, bsz=8, num_updates=1900, lr=1.9e-05, gnorm=4.235, loss_scale=128, train_wall=14, wall=356
	RAWLOSS @ 2000 tensor(1782.3804, device='cuda:0')
	2020-07-20 19:35:03 \| INFO \| train_inner \| epoch 001: 2000 / 648283 loss=10.34, ppl=1296.37, wps=1628.9, ups=6.9, wpb=236.1, bsz=8, num_updates=2000, lr=2e-05, gnorm=4.082, loss_scale=128, train_wall=14, wall=370
	RAWLOSS @ 2100 tensor(2344.2183, device='cuda:0')
	2020-07-20 19:35:17 \| INFO \| train_inner \| epoch 001: 2100 / 648283 loss=10.377, ppl=1329.89, wps=1634.2, ups=6.91, wpb=236.4, bsz=8, num_updates=2100, lr=2.1e-05, gnorm=4.098, loss_scale=128, train_wall=14, wall=385
	RAWLOSS @ 2200 tensor(1709.7332, device='cuda:0')
	2020-07-20 19:35:31 \| INFO \| train_inner \| epoch 001: 2200 / 648283 loss=10.287, ppl=1248.99, wps=1698.8, ups=6.91, wpb=245.8, bsz=8, num_updates=2200, lr=2.2e-05, gnorm=4.092, loss_scale=128, train_wall=14, wall=399
	RAWLOSS @ 2300 tensor(1072.0853, device='cuda:0')
	2020-07-20 19:35:46 \| INFO \| train_inner \| epoch 001: 2300 / 648283 loss=10.127, ppl=1118.58, wps=1593.4, ups=6.91, wpb=230.6, bsz=8, num_updates=2300, lr=2.3e-05, gnorm=4.083, loss_scale=128, train_wall=14, wall=414
	RAWLOSS @ 2400 tensor(1800.6949, device='cuda:0')
	2020-07-20 19:36:00 \| INFO \| train_inner \| epoch 001: 2400 / 648283 loss=10.287, ppl=1249.09, wps=1668.2, ups=6.9, wpb=241.6, bsz=8, num_updates=2400, lr=2.4e-05, gnorm=3.975, loss_scale=128, train_wall=14, wall=428
	RAWLOSS @ 2500 tensor(1392.2799, device='cuda:0')
	2020-07-20 19:36:15 \| INFO \| train_inner \| epoch 001: 2500 / 648283 loss=10.071, ppl=1075.89, wps=1627.9, ups=6.91, wpb=235.7, bsz=8, num_updates=2500, lr=2.5e-05, gnorm=3.887, loss_scale=128, train_wall=14, wall=443
	RAWLOSS @ 2600 tensor(1319.8795, device='cuda:0')
	2020-07-20 19:36:29 \| INFO \| train_inner \| epoch 001: 2600 / 648283 loss=10.063, ppl=1069.83, wps=1709.6, ups=6.92, wpb=247.2, bsz=8, num_updates=2600, lr=2.6e-05, gnorm=3.872, loss_scale=128, train_wall=14, wall=457
	RAWLOSS @ 2700 tensor(1346.2390, device='cuda:0')
	2020-07-20 19:36:44 \| INFO \| train_inner \| epoch 001: 2700 / 648283 loss=9.965, ppl=999.25, wps=1614.2, ups=6.9, wpb=234, bsz=8, num_updates=2700, lr=2.7e-05, gnorm=3.811, loss_scale=128, train_wall=14, wall=472
	RAWLOSS @ 2800 tensor(1433.3136, device='cuda:0')
	2020-07-20 19:36:58 \| INFO \| train_inner \| epoch 001: 2800 / 648283 loss=10.003, ppl=1025.93, wps=1670.2, ups=6.92, wpb=241.4, bsz=8, num_updates=2800, lr=2.8e-05, gnorm=3.786, loss_scale=128, train_wall=14, wall=486
	RAWLOSS @ 2900 tensor(1693.1978, device='cuda:0')
	2020-07-20 19:37:13 \| INFO \| train_inner \| epoch 001: 2900 / 648283 loss=9.855, ppl=926.31, wps=1597.9, ups=6.92, wpb=230.9, bsz=8, num_updates=2900, lr=2.9e-05, gnorm=3.8, loss_scale=128, train_wall=14, wall=500
	RAWLOSS @ 3000 tensor(867.2125, device='cuda:0')
	2020-07-20 19:37:27 \| INFO \| train_inner \| epoch 001: 3000 / 648283 loss=9.838, ppl=915.06, wps=1638.5, ups=6.91, wpb=237.3, bsz=8, num_updates=3000, lr=3e-05, gnorm=3.72, loss_scale=128, train_wall=14, wall=515
	RAWLOSS @ 3100 tensor(1706.2174, device='cuda:0')
	2020-07-20 19:37:42 \| INFO \| train_inner \| epoch 001: 3100 / 648283 loss=9.704, ppl=834.26, wps=1642.4, ups=6.87, wpb=239.2, bsz=8, num_updates=3100, lr=3.1e-05, gnorm=3.666, loss_scale=128, train_wall=14, wall=530
	RAWLOSS @ 3200 tensor(1676.3699, device='cuda:0')
	2020-07-20 19:37:56 \| INFO \| train_inner \| epoch 001: 3200 / 648283 loss=9.703, ppl=833.55, wps=1614.9, ups=6.86, wpb=235.3, bsz=8, num_updates=3200, lr=3.2e-05, gnorm=3.712, loss_scale=128, train_wall=14, wall=544
	RAWLOSS @ 3300 tensor(1000.9597, device='cuda:0')
	2020-07-20 19:38:11 \| INFO \| train_inner \| epoch 001: 3300 / 648283 loss=9.783, ppl=881.1, wps=1613.2, ups=6.88, wpb=234.4, bsz=8, num_updates=3300, lr=3.3e-05, gnorm=3.647, loss_scale=128, train_wall=14, wall=559
	RAWLOSS @ 3400 tensor(1666.6620, device='cuda:0')
	2020-07-20 19:38:25 \| INFO \| train_inner \| epoch 001: 3400 / 648283 loss=9.647, ppl=801.95, wps=1664.7, ups=6.91, wpb=241, bsz=8, num_updates=3400, lr=3.4e-05, gnorm=3.624, loss_scale=128, train_wall=14, wall=573
	RAWLOSS @ 3500 tensor(1426.3322, device='cuda:0')
	2020-07-20 19:38:40 \| INFO \| train_inner \| epoch 001: 3500 / 648283 loss=9.69, ppl=826.04, wps=1596.3, ups=6.91, wpb=231.1, bsz=8, num_updates=3500, lr=3.5e-05, gnorm=3.604, loss_scale=128, train_wall=14, wall=588
	RAWLOSS @ 3600 tensor(1634.4712, device='cuda:0')
	2020-07-20 19:38:54 \| INFO \| train_inner \| epoch 001: 3600 / 648283 loss=9.622, ppl=787.92, wps=1638.5, ups=6.91, wpb=237, bsz=8, num_updates=3600, lr=3.6e-05, gnorm=3.591, loss_scale=128, train_wall=14, wall=602
	RAWLOSS @ 3700 tensor(1551.6877, device='cuda:0')
	2020-07-20 19:39:09 \| INFO \| train_inner \| epoch 001: 3700 / 648283 loss=9.496, ppl=722.11, wps=1708.6, ups=6.92, wpb=247, bsz=8, num_updates=3700, lr=3.7e-05, gnorm=3.442, loss_scale=128, train_wall=14, wall=617
	RAWLOSS @ 3800 tensor(1543.4656, device='cuda:0')
	2020-07-20 19:39:23 \| INFO \| train_inner \| epoch 001: 3800 / 648283 loss=9.584, ppl=767.5, wps=1631.1, ups=6.92, wpb=235.8, bsz=8, num_updates=3800, lr=3.8e-05, gnorm=3.499, loss_scale=128, train_wall=14, wall=631
	RAWLOSS @ 3900 tensor(1488.4733, device='cuda:0')
	2020-07-20 19:39:38 \| INFO \| train_inner \| epoch 001: 3900 / 648283 loss=9.525, ppl=736.96, wps=1601.9, ups=6.92, wpb=231.6, bsz=8, num_updates=3900, lr=3.9e-05, gnorm=3.558, loss_scale=128, train_wall=14, wall=645
	RAWLOSS @ 4000 tensor(1679.9448, device='cuda:0')
	2020-07-20 19:39:52 \| INFO \| train_inner \| epoch 001: 4000 / 648283 loss=9.531, ppl=739.75, wps=1665.1, ups=6.92, wpb=240.8, bsz=8, num_updates=4000, lr=4e-05, gnorm=3.496, loss_scale=128, train_wall=14, wall=660
	RAWLOSS @ 4100 tensor(1940.9502, device='cuda:0')
	2020-07-20 19:40:07 \| INFO \| train_inner \| epoch 001: 4100 / 648283 loss=9.396, ppl=673.76, wps=1640.9, ups=6.89, wpb=238, bsz=8, num_updates=4100, lr=4.1e-05, gnorm=3.467, loss_scale=128, train_wall=14, wall=674
	RAWLOSS @ 4200 tensor(1738.5826, device='cuda:0')
	2020-07-20 19:40:21 \| INFO \| train_inner \| epoch 001: 4200 / 648283 loss=9.444, ppl=696.43, wps=1621.1, ups=6.9, wpb=234.9, bsz=8, num_updates=4200, lr=4.2e-05, gnorm=3.578, loss_scale=128, train_wall=14, wall=689
	RAWLOSS @ 4300 tensor(1848.1174, device='cuda:0')
	2020-07-20 19:40:36 \| INFO \| train_inner \| epoch 001: 4300 / 648283 loss=9.407, ppl=678.78, wps=1628, ups=6.9, wpb=236, bsz=8, num_updates=4300, lr=4.3e-05, gnorm=3.476, loss_scale=128, train_wall=14, wall=703
	RAWLOSS @ 4400 tensor(2160.4170, device='cuda:0')
	2020-07-20 19:40:50 \| INFO \| train_inner \| epoch 001: 4400 / 648283 loss=9.292, ppl=626.93, wps=1629.1, ups=6.91, wpb=235.9, bsz=8, num_updates=4400, lr=4.4e-05, gnorm=3.461, loss_scale=128, train_wall=14, wall=718
	RAWLOSS @ 4500 tensor(1741.8374, device='cuda:0')
	2020-07-20 19:41:05 \| INFO \| train_inner \| epoch 001: 4500 / 648283 loss=9.252, ppl=609.9, wps=1638.2, ups=6.91, wpb=237.1, bsz=8, num_updates=4500, lr=4.5e-05, gnorm=3.434, loss_scale=128, train_wall=14, wall=732
	RAWLOSS @ 4600 tensor(1013.6898, device='cuda:0')
	2020-07-20 19:41:19 \| INFO \| train_inner \| epoch 001: 4600 / 648283 loss=9.289, ppl=625.51, wps=1618.4, ups=6.91, wpb=234.2, bsz=8, num_updates=4600, lr=4.6e-05, gnorm=3.489, loss_scale=128, train_wall=14, wall=747
	RAWLOSS @ 4700 tensor(1215.4442, device='cuda:0')
	2020-07-20 19:41:34 \| INFO \| train_inner \| epoch 001: 4700 / 648283 loss=9.161, ppl=572.44, wps=1595.7, ups=6.92, wpb=230.6, bsz=8, num_updates=4700, lr=4.7e-05, gnorm=3.487, loss_scale=128, train_wall=14, wall=761
	RAWLOSS @ 4800 tensor(1686.5798, device='cuda:0')
	2020-07-20 19:41:48 \| INFO \| train_inner \| epoch 001: 4800 / 648283 loss=9.195, ppl=585.98, wps=1663.3, ups=6.91, wpb=240.9, bsz=8, num_updates=4800, lr=4.8e-05, gnorm=3.403, loss_scale=128, train_wall=14, wall=776
	RAWLOSS @ 4900 tensor(1270.6083, device='cuda:0')
	2020-07-20 19:42:02 \| INFO \| train_inner \| epoch 001: 4900 / 648283 loss=9.156, ppl=570.28, wps=1597.5, ups=6.91, wpb=231.1, bsz=8, num_updates=4900, lr=4.9e-05, gnorm=3.499, loss_scale=128, train_wall=14, wall=790
	RAWLOSS @ 5000 tensor(1934.5712, device='cuda:0')
	2020-07-20 19:42:17 \| INFO \| train_inner \| epoch 001: 5000 / 648283 loss=9.077, ppl=540.15, wps=1633.7, ups=6.92, wpb=236.1, bsz=8, num_updates=5000, lr=5e-05, gnorm=3.412, loss_scale=128, train_wall=14, wall=805
	RAWLOSS @ 5100 tensor(1389.7025, device='cuda:0')
	RAWLOSS @ 5200 tensor(1521.7289, device='cuda:0')
	RAWLOSS @ 5300 tensor(986.0732, device='cuda:0')
	RAWLOSS @ 5400 tensor(1238.0280, device='cuda:0')
	RAWLOSS @ 5500 tensor(1070.5186, device='cuda:0')
	RAWLOSS @ 5600 tensor(2137.5652, device='cuda:0')
	RAWLOSS @ 5700 tensor(1708.1031, device='cuda:0')
	RAWLOSS @ 5800 tensor(1423.8556, device='cuda:0')
	RAWLOSS @ 5900 tensor(1369.5192, device='cuda:0')
	RAWLOSS @ 6000 tensor(1245.5112, device='cuda:0')
	RAWLOSS @ 6100 tensor(2512.7710, device='cuda:0')
	RAWLOSS @ 6200 tensor(1891.8496, device='cuda:0')
	RAWLOSS @ 6300 tensor(1130.3253, device='cuda:0')
	RAWLOSS @ 6400 tensor(961.2222, device='cuda:0')
	RAWLOSS @ 6500 tensor(1826.8185, device='cuda:0')
	RAWLOSS @ 6600 tensor(668.8315, device='cuda:0')
	RAWLOSS @ 6700 tensor(1368.3977, device='cuda:0')
	RAWLOSS @ 6800 tensor(2059.0720, device='cuda:0')
	RAWLOSS @ 6900 tensor(1417.4907, device='cuda:0')
	RAWLOSS @ 7000 tensor(1296.0875, device='cuda:0')
	RAWLOSS @ 7100 tensor(1533.7075, device='cuda:0')
	RAWLOSS @ 7200 tensor(1195.6879, device='cuda:0')
	RAWLOSS @ 7300 tensor(1629.7560, device='cuda:0')
	RAWLOSS @ 7400 tensor(1123.7363, device='cuda:0')
	RAWLOSS @ 7500 tensor(1551.4454, device='cuda:0')
	RAWLOSS @ 7600 tensor(1700.6863, device='cuda:0')
	RAWLOSS @ 7700 tensor(1107.6963, device='cuda:0')
	RAWLOSS @ 7800 tensor(1450.6442, device='cuda:0')
	RAWLOSS @ 7900 tensor(1845.3767, device='cuda:0')
	RAWLOSS @ 8000 tensor(1751.1384, device='cuda:0')
	RAWLOSS @ 8100 tensor(1534.0872, device='cuda:0')
	RAWLOSS @ 8200 tensor(1527.2074, device='cuda:0')
	RAWLOSS @ 8300 tensor(2236.2222, device='cuda:0')
	RAWLOSS @ 8400 tensor(1125.7655, device='cuda:0')
	RAWLOSS @ 8500 tensor(1681.7218, device='cuda:0')
	RAWLOSS @ 8600 tensor(1828.4143, device='cuda:0')
	RAWLOSS @ 8700 tensor(1644.8458, device='cuda:0')
	RAWLOSS @ 8800 tensor(1691.0027, device='cuda:0')
	RAWLOSS @ 8900 tensor(1959.2830, device='cuda:0')
	RAWLOSS @ 9000 tensor(1217.1136, device='cuda:0')
	RAWLOSS @ 9100 tensor(1144.0026, device='cuda:0')
	RAWLOSS @ 9200 tensor(1399.2024, device='cuda:0')
	RAWLOSS @ 9300 tensor(1251.1783, device='cuda:0')
	RAWLOSS @ 9400 tensor(1406.9491, device='cuda:0')
	RAWLOSS @ 9500 tensor(937.4207, device='cuda:0')
	RAWLOSS @ 9600 tensor(1411.9232, device='cuda:0')
	RAWLOSS @ 9700 tensor(1632.3060, device='cuda:0')
	RAWLOSS @ 9800 tensor(2037.6841, device='cuda:0')
	RAWLOSS @ 9900 tensor(1467.7316, device='cuda:0')
	RAWLOSS @ 10000 tensor(1566.2866, device='cuda:0')
	RAWLOSS @ 10100 tensor(1275.3107, device='cuda:0')
	RAWLOSS @ 10200 tensor(1482.7819, device='cuda:0')
	RAWLOSS @ 10300 tensor(1025.0098, device='cuda:0')
	RAWLOSS @ 10400 tensor(1641.3466, device='cuda:0')
	RAWLOSS @ 10500 tensor(1740.3737, device='cuda:0')
	RAWLOSS @ 10600 tensor(1478.7198, device='cuda:0')
	RAWLOSS @ 10700 tensor(1440.9719, device='cuda:0')
	RAWLOSS @ 10800 tensor(1178.1041, device='cuda:0')
	RAWLOSS @ 10900 tensor(1759.4432, device='cuda:0')
	RAWLOSS @ 11000 tensor(2279.4111, device='cuda:0')
	RAWLOSS @ 11100 tensor(1960.0864, device='cuda:0')
	RAWLOSS @ 11200 tensor(1575.8950, device='cuda:0')
	RAWLOSS @ 11300 tensor(1673.9479, device='cuda:0')
	RAWLOSS @ 11400 tensor(1403.0437, device='cuda:0')
	RAWLOSS @ 11500 tensor(2001.8221, device='cuda:0')
	2020-07-20 19:46:22 \| INFO \| valid \| epoch 001 \| valid on 'valid' subset \| loss 9.117 \| ppl 555.32 \| wps 6327.2 \| wpb 236.3 \| bsz 8 \| num_updates 5000