Taylan Bilal (taylanbil)
taylanbil@dlrm-gpu-8:~/kkissmart-fairseq$ paste <( grep RAWLOS gpu-repro.txt ) <( grep RAWLOS tpulog.txt )
RAWLOSS @ 100 tensor(1410.1447, device='cuda:0') RAWLOSS @ 100 tensor(1408.6381, device='xla:1')
RAWLOSS @ 200 tensor(1351.2732, device='cuda:0') RAWLOSS @ 200 tensor(1351.1304, device='xla:1')
RAWLOSS @ 300 tensor(2289.2922, device='cuda:0') RAWLOSS @ 300 tensor(2289.3340, device='xla:1')
RAWLOSS @ 400 tensor(1715.7347, device='cuda:0') RAWLOSS @ 400 tensor(1715.6587, device='xla:1')
RAWLOSS @ 500 tensor(2465.9827, device='cuda:0') RAWLOSS @ 500 tensor(2465.9453, device='xla:1')
RAWLOSS @ 600 tensor(2054.0337, device='cuda:0') RAWLOSS @ 600 tensor(2054.0742, device='xla:1')
RAWLOSS @ 700 tensor(1702.4202, device='cuda:0') RAWLOSS @ 700 tensor(1702.3367, device='xla:1')
RAWLOSS @ 800 tensor(1390.9583, device='cuda:0') RAWLOSS @ 800 tensor(1390.9158, device='xla:1')
RAWLOSS @ 900 tensor(1542.2812, device='cuda:0') RAWLOSS @ 900 tensor(1542.2
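The `paste <(grep …) <(grep …)` trick above lines the GPU and TPU raw losses up side by side for eyeballing. The same check can be automated; below is a small dependency-free sketch (`parse_rawloss` is a hypothetical helper, sample values copied from the log above) that parses paired lines and verifies they agree within a relative tolerance:

```python
import re

def parse_rawloss(line):
    """Extract (step, value) from a "RAWLOSS @ N tensor(V, device=...)" line."""
    m = re.search(r"RAWLOSS @ (\d+) tensor\(([\d.]+)", line)
    return int(m.group(1)), float(m.group(2))

gpu_lines = ["RAWLOSS @ 100 tensor(1410.1447, device='cuda:0')"]
tpu_lines = ["RAWLOSS @ 100 tensor(1408.6381, device='xla:1')"]

for g, t in zip(gpu_lines, tpu_lines):
    (step_g, val_g), (step_t, val_t) = parse_rawloss(g), parse_rawloss(t)
    assert step_g == step_t
    # relative difference; the logs above differ only in the 4th significant digit
    rel = abs(val_g - val_t) / max(abs(val_g), 1e-9)
    assert rel < 2e-3
```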
2020-07-20 19:26:21 | INFO | train_inner | epoch 001: 100 / 648283 loss=14.211, ppl=18970, wps=0, ups=0, wpb=143, bsz=8, num_updates=100, lr=1e-06, gnorm=8.634, train_wall=11, wall=130
RAWLOSS @ 200 tensor(1351.1304, device='xla:1')
2020-07-20 19:26:46 | INFO | train_inner | epoch 001: 200 / 648283 loss=12.824, ppl=7251.87, wps=6, ups=0.04, wpb=152, bsz=8, num_updates=200, lr=2e-06, gnorm=6.779, train_wall=10, wall=156
RAWLOSS @ 300 tensor(2289.3340, device='xla:1')
2020-07-20 19:27:11 | INFO | train_inner | epoch 001: 300 / 648283 loss=12.463, ppl=5647.65, wps=10.5, ups=0.04, wpb=265, bsz=8, num_updates=300, lr=3e-06, gnorm=4.417, train_wall=10, wall=181
RAWLOSS @ 400 tensor(1715.6587, device='xla:1')
2020-07-20 19:27:38 | INFO | train_inner | epoch 001: 400 / 648283 loss=11.9, ppl=3821.35, wps=7.8, ups=0.04, wpb=208, bsz=8, num_updates=400, lr=4e-06, gnorm=4.5, train_wall=11, wall=208
RAWLOSS @ 500 tensor(2465.9453, device='xla:1')
2020-07-20 19:28:03 | INFO | train_inner | epoch 001: 500 / 6
2020-07-20 19:30:27 | INFO | train_inner | epoch 001: 100 / 648283 loss=15.175, ppl=36985.3, wps=1680.2, ups=6.93, wpb=242.4, bsz=8, num_updates=100, lr=1e-06, gnorm=8.917, loss_scale=128, train_wall=15, wall=95
RAWLOSS @ 200 tensor(1351.2732, device='cuda:0')
2020-07-20 19:30:42 | INFO | train_inner | epoch 001: 200 / 648283 loss=13.532, ppl=11843.4, wps=1672.1, ups=6.92, wpb=241.6, bsz=8, num_updates=200, lr=2e-06, gnorm=6.148, loss_scale=128, train_wall=14, wall=110
RAWLOSS @ 300 tensor(2289.2922, device='cuda:0')
2020-07-20 19:30:56 | INFO | train_inner | epoch 001: 300 / 648283 loss=12.885, ppl=7566.95, wps=1622.2, ups=6.92, wpb=234.5, bsz=8, num_updates=300, lr=3e-06, gnorm=5.264, loss_scale=128, train_wall=14, wall=124
RAWLOSS @ 400 tensor(1715.7347, device='cuda:0')
2020-07-20 19:31:11 | INFO | train_inner | epoch 001: 400 / 648283 loss=12.564, ppl=6055.17, wps=1642.5, ups=6.92, wpb=237.4, bsz=8, num_updates=400, lr=4e-06, gnorm=4.725, loss_scale=128, train_wall=14, wall=139
RAWLOSS @ 500
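Each `train_inner` line above is a timestamp plus comma-separated `key=value` metrics. To diff GPU and TPU runs beyond the raw loss, it helps to pull those metrics into a dict; a minimal pure-Python parsing sketch (`parse_train_inner` is a hypothetical helper, field names taken from the logs above):

```python
def parse_train_inner(line):
    """Split a fairseq train_inner log line into its key=value metrics."""
    # everything after the last '| ' is 'epoch 001: 100 / 648283 loss=..., ppl=..., ...'
    body = line.rsplit("| ", 1)[-1]
    # drop the 'epoch 001: 100 / 648283 ' prefix; metrics start at 'loss='
    metrics = "loss=" + body.split("loss=", 1)[1]
    fields = dict(kv.split("=") for kv in metrics.split(", "))
    return {k: float(v) for k, v in fields.items()}

line = ("2020-07-20 19:26:21 | INFO | train_inner | epoch 001: 100 / 648283 "
        "loss=14.211, ppl=18970, wps=0, ups=0, wpb=143, bsz=8, "
        "num_updates=100, lr=1e-06, gnorm=8.634, train_wall=11, wall=130")
stats = parse_train_inner(line)
```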
$ git diff d45342e tpu-criteo-kaggle
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a81c8ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
taylanbil / alltoall.py
Last active July 13, 2020 22:49
Bug in all to all
import torch
import torch.nn as nn
import sys
#sys.path.insert(0, '/usr/share/torch-xla-nightly/pytorch/xla/')
import torch_xla.distributed.xla_multiprocessing as xmp
def main(*a):
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
# TPU CLI
tpu=dlrm-init
TPU_IP_ADDRESS=$(gcloud compute tpus describe --zone=europe-west4-a "$tpu" | grep ipAddress | cut -d ':' -f2 | head -1 | sed 's/ //g')
export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470"
python dlrm/dlrm_tpu_runner.py \
--arch-embedding-size=1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000 \
--arch-sparse-feature-size=64 \
--arch-mlp-bot=512-512-64 \
--arch-mlp-top=1024-1024-1024-1 \
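The dash-separated `--arch-*` flags above encode layer sizes: eight embedding tables of 1M rows each, a bottom MLP of 512-512-64, and a top MLP of 1024-1024-1024-1. DLRM parses these specs into integer arrays internally; a dependency-free equivalent sketch (`parse_arch` is a hypothetical helper, not DLRM's actual parser):

```python
def parse_arch(spec):
    """Turn a DLRM-style dash-separated size spec into a list of ints."""
    return [int(x) for x in spec.split("-")]

emb_sizes = parse_arch("1000000-1000000-1000000-1000000-"
                       "1000000-1000000-1000000-1000000")
mlp_bot = parse_arch("512-512-64")
# DLRM requires the bottom MLP's last layer to match --arch-sparse-feature-size
assert mlp_bot[-1] == 64
```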
git diff HEAD~1
diff --git a/dlrm_data_pytorch.py b/dlrm_data_pytorch.py
index 6cbe382..6f1c849 100644
--- a/dlrm_data_pytorch.py
+++ b/dlrm_data_pytorch.py
@@ -266,7 +266,7 @@ class CriteoDataset(Dataset):
if self.memory_map:
if self.split == 'none' or self.split == 'train':
- # check if need to swicth to next day and load data
taylanbil / embbag.py
Created June 13, 2020 00:27
EmbeddingBag backward error repro
import torch
import torch.nn as nn
import torch_xla.core.xla_model as xm
device = xm.xla_device()
d = nn.EmbeddingBag(10, 10, mode="sum", sparse=False).to(device)
inp = torch.LongTensor([1, 5, 9]).to(device)
x = d(inp, offsets=torch.LongTensor([0]).to(device))
loss = x.sum()
loss.backward()  # this backward call triggers the error being reproduced
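For reference, with `mode="sum"` and a single offset of 0, the EmbeddingBag forward in the repro just sums the weight rows selected by the indices into one bag. A dependency-free sketch of that semantics (`embedding_bag_sum` is a hypothetical helper with toy weights, no torch required):

```python
def embedding_bag_sum(weight, indices, offsets):
    """Pure-Python EmbeddingBag(mode='sum') forward: sum selected rows per bag."""
    bags = []
    bounds = list(offsets) + [len(indices)]
    for start, end in zip(bounds, bounds[1:]):
        rows = [weight[i] for i in indices[start:end]]
        bags.append([sum(col) for col in zip(*rows)])
    return bags

weight = [[float(r)] * 3 for r in range(10)]  # 10 rows, embedding dim 3
out = embedding_bag_sum(weight, [1, 5, 9], [0])  # one bag, as in the repro
assert out == [[15.0, 15.0, 15.0]]  # rows 1 + 5 + 9
```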
taylanbil / dlrm.diff
Created June 12, 2020 23:13
[wip] dlrm on tpu
git diff HEAD~1 .
diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
index 1955bb9..e9ff88a 100644
--- a/dlrm_s_pytorch.py
+++ b/dlrm_s_pytorch.py
@@ -177,9 +177,11 @@ class DLRM_Net(nn.Module):
n = ln[i]
# construct embedding operator
if self.qr_flag and n > self.qr_threshold:
+ # XXX: code path not hit with current tpu tests.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp