Taylan Bilal (taylanbil)

taylanbil / gist:dfe0010c906f9dfb3dc582f3bbb2933b
Last active May 5, 2020 20:32
Diff to get Myle's branch in a better state
diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py
index ab82ea45..646545bc 100644
--- a/fairseq/data/data_utils.py
+++ b/fairseq/data/data_utils.py
@@ -199,7 +199,7 @@ def filter_by_size(indices, dataset, max_positions, raise_exception=False):
def batch_by_size(
indices, num_tokens_fn, max_tokens=None, max_sentences=None,
- required_batch_size_multiple=1,
+ required_batch_size_multiple=1, tpu=False
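The `tpu=False` flag above points at the core TPU constraint: XLA compiles a fresh graph for every distinct input shape, so batch sizes need to be snapped to a small fixed set rather than varying freely. A minimal sketch of that rounding, with hypothetical helper names (not fairseq's actual `batch_by_size` implementation):

```python
# Illustrative only: bucket arbitrary batch sizes onto a fixed grid so the
# XLA compiler sees few distinct shapes and recompiles rarely.

def round_up(n, multiple):
    """Round n up to the nearest multiple."""
    return ((n + multiple - 1) // multiple) * multiple

def bucket_batch_sizes(batch_sizes, multiple=8):
    """Map each batch size onto the fixed grid {multiple, 2*multiple, ...}."""
    return [round_up(b, multiple) for b in batch_sizes]

print(bucket_batch_sizes([3, 8, 13], multiple=8))  # [8, 8, 16]
```

With a multiple of 8, any batch between 9 and 16 sentences compiles against the same size-16 graph, trading some padding waste for far fewer compilations.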
taylanbil / tpu-vs-robertatpu.diff
Created December 6, 2019 18:25
Fairseq changes on top of the tpu branch to make RoBERTa work well with TPUs
diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py
index eb2fcf3..df27187 100644
--- a/fairseq/criterions/masked_lm.py
+++ b/fairseq/criterions/masked_lm.py
@@ -29,9 +29,14 @@ class MaskedLmLoss(FairseqCriterion):
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
+ # FIXME: proving reduce is always True
+ assert reduce, 'OMG NON REDUCE'
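The invariant behind that `assert reduce` guard can be shown with a toy criterion. The sketch below is not fairseq's `MaskedLmLoss`; it only illustrates why always reducing matters on TPU: a scalar loss keeps the compiled graph's output shape constant, whereas a per-token loss tensor changes shape with the batch.

```python
# Toy sketch (hypothetical names): enforce that the loss is always reduced
# to a single scalar, mirroring the `assert reduce` guard in the diff above.

def masked_lm_loss(per_token_losses, reduce=True):
    assert reduce, "non-reduced (per-token) loss not supported"
    return sum(per_token_losses)

print(masked_lm_loss([0.5, 1.5, 2.0]))  # 4.0
```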
taylanbil / upstreammaster_a0f7599-vs-tpu.diff
Created December 6, 2019 18:07
Fairseq changes to make Transformer + translation task work well with TPUs
diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py
index 10de955..a6a6187 100644
--- a/fairseq/checkpoint_utils.py
+++ b/fairseq/checkpoint_utils.py
@@ -17,6 +17,8 @@ from torch.serialization import default_restore_location
from fairseq.models import FairseqEncoder, FairseqDecoder
+import torch_xla.core.xla_model as xm
+
taylanbil / gist:6fc69ce632aabfd03eeb3b4903d51ddf
Created November 26, 2019 00:19
RoBERTa metrics report, 2019-11-25
Metric: CompileTime
TotalSamples: 4
Accumulator: 02m10s739ms169.854us
ValueRate: 762ms277.314us / second
Rate: 0.0235018 / second
Percentiles: 1%=25s766ms505.593us; 5%=25s766ms505.593us; 10%=25s766ms505.593us; 20%=25s766ms505.593us; 50%=39s737ms929.180us; 80%=41s729ms347.105us; 90%=41s729ms347.105us; 95%=41s729ms347.105us; 99%=41s729ms347.105us
Metric: ExecuteTime
TotalSamples: 139
Accumulator: 07m06s439ms701.634us
ValueRate: 831ms927.457us / second
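Reports like this are easier to compare across runs once parsed. Below is a hypothetical parser (not part of torch_xla) that pulls the headline numbers out of one metric block; the field names follow the printed format above:

```python
import re

# Illustrative helper: extract Metric name, TotalSamples, and Rate from one
# block of a torch_xla-style metrics report. Anchored to line starts so
# "Rate:" does not accidentally match inside "ValueRate:".
def parse_metric(block):
    name = re.search(r"^Metric: (\w+)", block, re.M).group(1)
    samples = int(re.search(r"^TotalSamples: (\d+)", block, re.M).group(1))
    rate = float(re.search(r"^Rate: ([\d.]+)", block, re.M).group(1))
    return {"name": name, "samples": samples, "rate": rate}

report = """Metric: CompileTime
TotalSamples: 4
Rate: 0.0235018 / second"""
print(parse_metric(report))  # {'name': 'CompileTime', 'samples': 4, 'rate': 0.0235018}
```

A low CompileTime sample count (4 here) relative to ExecuteTime samples (139) is the healthy pattern: the graph compiled a handful of times and was then reused.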
taylanbil / tpuchanges.20191118.diff
Created November 18, 2019 19:46
Fairseq changes so it works well with TPUs
diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py
index 10de955..aa8160f 100644
--- a/fairseq/checkpoint_utils.py
+++ b/fairseq/checkpoint_utils.py
@@ -17,6 +17,8 @@ from torch.serialization import default_restore_location
from fairseq.models import FairseqEncoder, FairseqDecoder
+import torch_xla.core.xla_model as xm
+
taylanbil / gist:f7e5d631a4a92811d16fbebfcc675349
Created October 3, 2019 18:46
2019-09-11 / fairseq transformer metrics report wmt18, minor change to loss function branch, end of epoch 5
Epoch 5 end 23:40:43
Metric: CompileTime
TotalSamples: 103
Counter: 12h10m19s368ms61.206us
ValueRate: 01s026ms336.810us / second
Rate: 0.00590595 / second
Percentiles: 1%=014ms574.635us; 5%=051ms968.017us; 10%=252ms244.650us; 20%=538ms284.546us; 50%=31s090ms150.944us; 80%=06m51s432ms266.183us; 90%=06m26s349ms452.444us; 95%=08m46s288ms682.970us; 99%=01h03m02s697ms309.382us
Metric: ExecuteTime
TotalSamples: 60976
Counter: 01d20h12m35s140ms552.464us
taylanbil / gist:ced8c407f415eee08f85bac156bcce26
Last active October 3, 2019 18:49
2019-10-03 / fairseq transformer metrics report wmt18, master branch, end of epoch 5
Epoch 5 end 07:32:37
Metric: CompileTime
TotalSamples: 98
Accumulator: 11h12m28s413ms977.979us
ValueRate: 790ms91.790us / second
Rate: 0.00511136 / second
Percentiles: 1%=012ms455.058us; 5%=036ms797.399us; 10%=309ms487.393us; 20%=01s073ms779.463us; 50%=02m34s048ms533.549us; 80%=06m43s037ms491.406us; 90%=06m59s355ms429.817us; 95%=06m24s546ms665.777us; 99%=21m42s139ms386.139us
Metric: ExecuteTime
TotalSamples: 60976
Accumulator: 02d36h02m12s990ms414.578us
taylanbil / gist:bbfec9307a2f4c35833d70976fd96bf8
Created September 27, 2019 21:55
[fseq][transformer] warmed-up run
Epoch 1 begin 21:38:10
training/ 21:39:31, device xla:1, step 1, Rate=19.64, GlobalRate=19.64
training/ 21:39:31, device xla:2, step 1, Rate=19.52, GlobalRate=19.52
training/ 21:39:31, device xla:5, step 1, Rate=19.37, GlobalRate=19.37
training/ 21:39:31, device xla:8, step 1, Rate=38.76, GlobalRate=38.76
training/ 21:39:31, device xla:4, step 1, Rate=19.31, GlobalRate=19.31
training/ 21:39:31, device xla:6, step 1, Rate=38.53, GlobalRate=38.53
training/ 21:39:31, device xla:7, step 1, Rate=76.98, GlobalRate=76.98
training/ 21:39:31, device xla:3, step 1, Rate=38.31, GlobalRate=38.31
training/ 21:39:51, device xla:8, step 2, Rate=45.96, GlobalRate=46.01
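These `training/` lines are easiest to compare against the fresh-run gist below after parsing out the per-device rates. A hypothetical parser (field names taken from the log format above, not from any fairseq utility):

```python
import re

# Illustrative parser for the "training/" progress lines: pulls out the
# device, step number, instantaneous Rate, and GlobalRate.
LINE = re.compile(r"device (xla:\d+), step (\d+), Rate=([\d.]+), GlobalRate=([\d.]+)")

def parse_line(line):
    dev, step, rate, grate = LINE.search(line).groups()
    return dev, int(step), float(rate), float(grate)

line = "training/ 21:39:31, device xla:1, step 1, Rate=19.64, GlobalRate=19.64"
print(parse_line(line))  # ('xla:1', 1, 19.64, 19.64)
```

Parsed this way, the warmed-up run's step-1 rates (roughly 19 to 77) stand out against the fresh run's (around 1 to 2), since the warmed-up run reuses cached compilations.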
taylanbil / gist:64aaa74c59745fce84aa217057f421d8
Created September 27, 2019 21:53
[fairseq][transformer] Fresh run w/ 3 large shapes.
Epoch 1 begin 19:52:45
training/ 19:55:52, device xla:5, step 1, Rate=2.19, GlobalRate=2.19
training/ 19:55:52, device xla:4, step 1, Rate=2.19, GlobalRate=2.19
training/ 19:56:05, device xla:2, step 1, Rate=1.97, GlobalRate=1.97
training/ 19:56:05, device xla:1, step 1, Rate=1.97, GlobalRate=1.97
training/ 19:59:14, device xla:8, step 1, Rate=1.60, GlobalRate=1.60
training/ 19:59:14, device xla:3, step 1, Rate=1.60, GlobalRate=1.60
training/ 19:59:16, device xla:6, step 1, Rate=1.60, GlobalRate=1.60
training/ 20:12:01, device xla:7, step 1, Rate=0.94, GlobalRate=0.94
training/ 20:17:46, device xla:5, step 2, Rate=1.11, GlobalRate=0.54
taylanbil / gist:8ff73e6b8cb26e550c8d47be1a844f48
Created September 18, 2019 03:30
error while dumping graphs
| WARNING: 240829 samples have invalid sizes and will be skipped, max_positions=(64, 64), first few sample ids=[1422704, 2718830, 2897878, 3673048, 2016896, 2200333, 3886976, 2097242, 3124502, 2871279]
Epoch 1 begin 00:17:55
training/ 00:19:08, device xla:1, step 1, Rate=132.04, GlobalRate=132.04, loss=15.8125, nll_loss=15.8750
training/ 00:20:21, device xla:1, step 2, Rate=54.94, GlobalRate=6.89, loss=15.8125, nll_loss=15.8125
training/ 00:25:46, device xla:1, step 3, Rate=22.92, GlobalRate=2.56, loss=16.0000, nll_loss=16.0000
training/ 00:40:56, device xla:1, step 4, Rate=9.34, GlobalRate=0.98, loss=15.9375, nll_loss=15.9375
training/ 01:58:50, device xla:1, step 5, Rate=3.77, GlobalRate=0.26, loss=15.7500, nll_loss=15.8125
2019-09-18 03:13:04.411218: E tensorflow/compiler/xla/xla_client/tf_logging.cc:11] Check failed: session_work.first->session()->Run( session_work.second.feed_inputs, session_work.second.outputs_handles, &outputs) == ::tensorflow::Status::OK() (Unavailable: From /job:tpu_worker/replica:0/
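The rate collapse in this log (132 down to roughly 3.8 over five steps) is the signature of per-step recompilation: each new batch shape forces XLA to compile a new graph before executing it. A quick illustrative check for that pattern, using hypothetical helper names:

```python
# Illustrative only: flag a throughput collapse like the one in the log
# above, where Rate falls from 132.04 at step 1 to 3.77 by step 5.

def rate_collapse(rates, threshold=10.0):
    """True if throughput fell by more than `threshold`x since the first step."""
    return rates[0] / rates[-1] > threshold

print(rate_collapse([132.04, 54.94, 22.92, 9.34, 3.77]))  # True
```

A run with stable shapes would hold a roughly constant Rate instead, as in the warmed-up gist above.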