Taylan Bilal (taylanbil)
taylanbil / gist:ced8c407f415eee08f85bac156bcce26
Last active October 3, 2019 18:49
2019-10-03 / fairseq transformer metrics report wmt18, master branch, end of epoch 5
Epoch 5 end 07:32:37
Metric: CompileTime
TotalSamples: 98
Accumulator: 11h12m28s413ms977.979us
ValueRate: 790ms91.790us / second
Rate: 0.00511136 / second
Percentiles: 1%=012ms455.058us; 5%=036ms797.399us; 10%=309ms487.393us; 20%=01s073ms779.463us; 50%=02m34s048ms533.549us; 80%=06m43s037ms491.406us; 90%=06m59s355ms429.817us; 95%=06m24s546ms665.777us; 99%=21m42s139ms386.139us
Metric: ExecuteTime
TotalSamples: 60976
Accumulator: 02d36h02m12s990ms414.578us
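For context, reports like the one above come from torch_xla's built-in debug metrics. A minimal sketch of producing one (the training steps themselves are assumed to have already run on the XLA device):

import torch_xla.debug.metrics as met

# ... run training steps on the XLA device ...
print(met.metrics_report())  # includes CompileTime, ExecuteTime, percentiles

CompileTime counts XLA graph compilations; a healthy run compiles a handful of times early and then mostly executes, so a multi-hour CompileTime accumulator like the one above points at recompilation, typically caused by changing tensor shapes.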
taylanbil / gist:f7e5d631a4a92811d16fbebfcc675349
Created October 3, 2019 18:46
2019-09-11 / fairseq transformer metrics report wmt18, minor change to loss function branch, end of epoch 5
Epoch 5 end 23:40:43
Metric: CompileTime
TotalSamples: 103
Counter: 12h10m19s368ms61.206us
ValueRate: 01s026ms336.810us / second
Rate: 0.00590595 / second
Percentiles: 1%=014ms574.635us; 5%=051ms968.017us; 10%=252ms244.650us; 20%=538ms284.546us; 50%=31s090ms150.944us; 80%=06m51s432ms266.183us; 90%=06m26s349ms452.444us; 95%=08m46s288ms682.970us; 99%=01h03m02s697ms309.382us
Metric: ExecuteTime
TotalSamples: 60976
Counter: 01d20h12m35s140ms552.464us
taylanbil / tpuchanges.20191118.diff
Created November 18, 2019 19:46
Fairseq changes to make it work well with TPUs
diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py
index 10de955..aa8160f 100644
--- a/fairseq/checkpoint_utils.py
+++ b/fairseq/checkpoint_utils.py
@@ -17,6 +17,8 @@ from torch.serialization import default_restore_location
from fairseq.models import FairseqEncoder, FairseqDecoder
+import torch_xla.core.xla_model as xm
+
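The lone added import above is the visible tip of the TPU checkpointing change; a hedged sketch of the usual pattern it enables (the function name below is hypothetical, not fairseq's actual code):

import torch_xla.core.xla_model as xm

def save_checkpoint_on_tpu(model, path):
    # xm.save moves tensors to CPU and, by default, writes from the
    # master process only, with a rendezvous so workers stay in sync.
    xm.save(model.state_dict(), path)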
taylanbil / gist:6fc69ce632aabfd03eeb3b4903d51ddf
Created November 26, 2019 00:19
roberta metrics report 20191125
Metric: CompileTime
TotalSamples: 4
Accumulator: 02m10s739ms169.854us
ValueRate: 762ms277.314us / second
Rate: 0.0235018 / second
Percentiles: 1%=25s766ms505.593us; 5%=25s766ms505.593us; 10%=25s766ms505.593us; 20%=25s766ms505.593us; 50%=39s737ms929.180us; 80%=41s729ms347.105us; 90%=41s729ms347.105us; 95%=41s729ms347.105us; 99%=41s729ms347.105us
Metric: ExecuteTime
TotalSamples: 139
Accumulator: 07m06s439ms701.634us
ValueRate: 831ms927.457us / second
taylanbil / upstreammaster_a0f7599-vs-tpu.diff
Created December 6, 2019 18:07
Fairseq changes to make Transformer + translation task work well with TPUs
diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py
index 10de955..a6a6187 100644
--- a/fairseq/checkpoint_utils.py
+++ b/fairseq/checkpoint_utils.py
@@ -17,6 +17,8 @@ from torch.serialization import default_restore_location
from fairseq.models import FairseqEncoder, FairseqDecoder
+import torch_xla.core.xla_model as xm
+
taylanbil / tpu-vs-robertatpu.diff
Created December 6, 2019 18:25
Fairseq changes on top of tpu branch to make RoBERTa work well with TPUs
diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py
index eb2fcf3..df27187 100644
--- a/fairseq/criterions/masked_lm.py
+++ b/fairseq/criterions/masked_lm.py
@@ -29,9 +29,14 @@ class MaskedLmLoss(FairseqCriterion):
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
+ # FIXME: temporary assert to prove reduce is always True
+ assert reduce, 'OMG NON REDUCE'
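The diff pins reduce=True before relying on it. A hedged, self-contained illustration of the contract the docstring above describes (illustrative names, not fairseq's actual code):

import torch
import torch.nn.functional as F

def masked_lm_forward(logits, target, reduce=True):
    assert reduce, 'OMG NON REDUCE'  # mirrors the change in the diff
    # 1) the loss
    loss = F.cross_entropy(
        logits.view(-1, logits.size(-1)), target.view(-1), reduction='sum')
    # 2) the sample size, used as the denominator for the gradient
    sample_size = target.numel()
    # 3) logging outputs to display while training
    logging_output = {'loss': loss.item(), 'sample_size': sample_size}
    return loss, sample_size, logging_output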
taylanbil / gist:dfe0010c906f9dfb3dc582f3bbb2933b
Last active May 5, 2020 20:32
Diff to get Myle's branch in a better state
diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py
index ab82ea45..646545bc 100644
--- a/fairseq/data/data_utils.py
+++ b/fairseq/data/data_utils.py
@@ -199,7 +199,7 @@ def filter_by_size(indices, dataset, max_positions, raise_exception=False):
def batch_by_size(
indices, num_tokens_fn, max_tokens=None, max_sentences=None,
- required_batch_size_multiple=1,
+ required_batch_size_multiple=1, tpu=False
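The added tpu=False flag exists because every distinct batch shape costs an XLA recompilation (compare the CompileTime percentiles in the reports above), so TPU batching has to produce fixed shapes. A hypothetical illustration of the idea, not fairseq's implementation:

def pad_batch(indices, batch_size, pad_index=0):
    # Pad a short final batch up to the fixed size with a dummy index so
    # the compiled graph sees the same shape every step.
    return indices + [pad_index] * (batch_size - len(indices))

print(pad_batch([3, 7, 11], 8))  # [3, 7, 11, 0, 0, 0, 0, 0]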
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
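These imports are the standard preamble for multi-core TPU scripts; a minimal sketch of how they fit together (the worker body is a placeholder):

def _mp_fn(index):
    device = xm.xla_device()
    print('process', index, 'using', device)  # one process per TPU core

if __name__ == '__main__':
    xmp.spawn(_mp_fn, args=(), nprocs=8)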
taylanbil / dlrm.diff
Created June 12, 2020 23:13
[wip] dlrm on tpu
git diff HEAD~1 .
diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
index 1955bb9..e9ff88a 100644
--- a/dlrm_s_pytorch.py
+++ b/dlrm_s_pytorch.py
@@ -177,9 +177,11 @@ class DLRM_Net(nn.Module):
n = ln[i]
# construct embedding operator
if self.qr_flag and n > self.qr_threshold:
+ # XXX: code path not hit with current tpu tests.
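For reference, the quotient-remainder (QR) trick behind this qr_flag path replaces one large embedding table with two much smaller ones, indexed by idx // m and idx % m. A sketch of the idea (hypothetical names, not DLRM's code):

import torch
import torch.nn as nn

class QREmbedding(nn.Module):
    def __init__(self, n, dim, m):
        super().__init__()
        self.m = m
        self.quot = nn.Embedding((n + m - 1) // m, dim)  # quotient table
        self.rem = nn.Embedding(m, dim)                  # remainder table

    def forward(self, idx):
        # 'mult' combine: elementwise product of the two lookups
        return self.quot(idx // self.m) * self.rem(idx % self.m)

emb = QREmbedding(1000, 16, m=32)
print(emb(torch.tensor([5, 999])).shape)  # torch.Size([2, 16])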
taylanbil / embbag.py
Created June 13, 2020 00:27
EmbeddingBag backward error repro
import torch
import torch.nn as nn
import torch_xla.core.xla_model as xm
device = xm.xla_device()
d = nn.EmbeddingBag(10, 10, mode="sum", sparse=False).to(device)
inp = torch.LongTensor([1, 5, 9]).to(device)
x = d(inp, offsets=torch.LongTensor([0]).to(device))  # a single bag over all three indices
loss = x.sum()
loss.backward()  # the backward pass is where the reproduced error surfaces