When using multiple GPUs, we get a weird error regarding the logger. PyTorch Lightning creates a DummyLogger for processes with rank != 0, but that object fails an isinstance check for DummyLogger - it instead tests true as a WandbLogger instance, which messes with the code. This script is a minimal reproducible example for this bug.
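In a nutshell, the failing pattern looks like the minimal sketch below (illustrative only; the full reproducer script follows). Under DDP, the rank != 0 process still passes the WandbLogger isinstance check, while its `experiment` attribute is a DummyExperiment rather than a real wandb run, so code that branches on the logger type or uses `experiment.dir` as a path breaks:

from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.loggers.base import DummyExperiment

wb_logger = pl_loggers.WandbLogger(save_dir="./")
# Expected on rank != 0: a DummyLogger, so this check should be False there.
# Observed: it is True on every rank...
print(isinstance(wb_logger, pl_loggers.WandbLogger))
# ...while the underlying experiment is only a real wandb run on rank 0:
print(isinstance(wb_logger.experiment, DummyExperiment))
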
"""When using multiple GPUs, getting weird error regarding logger. It creates a DummyLogger for every GPU except | |
the main one, which messes with the code. This script is a minimal reproducible example for this bug""" | |
import os | |
import warnings | |
import torch | |
import wandb | |
from torch.utils.data import DataLoader, Dataset | |
from pytorch_lightning import LightningModule, Trainer | |
from pytorch_lightning.plugins import DDPPlugin | |
from pytorch_lightning import loggers as pl_loggers | |
from pytorch_lightning.utilities import rank_zero_only | |


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def get_logger(save_dir: str = "./"):
    wb_logger = pl_loggers.WandbLogger(name=None, id=None, save_dir=str(save_dir))

    # Does not work: checking the logger type. On rank != 0 this should be a DummyLogger,
    # but the WandbLogger check below still passes.
    if isinstance(wb_logger, pl_loggers.WandbLogger):
        print("Debug logger 1 (A): ", type(wb_logger))
    else:
        print("Debug logger 1 (B): Warning - got DummyLogger", type(wb_logger))

    # Workaround: catch DummyLogger instances, which give a weird value instead of a path.
    if isinstance(wb_logger.experiment.dir, str):
        print("Debug logger 2 (A): Got path to exp dir from WandB logger: ", wb_logger.experiment.dir)
        # In my training script, I would save a config file to the exp dir created by the WandbLogger.
    else:
        print("Debug logger 2 (B): Warning - got DummyLogger obj instead of path: ", wb_logger.experiment.dir)

    return wb_logger


# Alternate workaround: run func only on rank zero?
@rank_zero_only
def get_logger_2(save_dir: str = "./"):
    wb_logger = pl_loggers.WandbLogger(name=None, id=None, save_dir=str(save_dir))

    # Normally, save file to wb_logger.experiment.dir
    # Use workaround instead to catch DummyLogger instances giving weird path.
    if isinstance(wb_logger.experiment.dir, str):
        print("Debug logger 3 (A): Got path to exp dir from WandB logger: ", wb_logger.experiment.dir)
    else:
        print("Debug logger 3 (B): Warning - got DummyLogger obj instead of path: ", wb_logger.experiment.dir)

    return wb_logger


def run():
    warnings.filterwarnings("ignore", "^The dataloader, train_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The dataloader, val_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The dataloader, test_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The number of training samples .* is smaller than the logging interval.*")

    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    # Does not work: enable an experimental wandb feature behind a flag to see if it resolves the issue.
    wandb.require(experiment="service")

    wb_logger = get_logger()
    # wb_logger = get_logger_2()  # Alternate workaround - use rank_zero_only

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        enable_model_summary=False,
        gpus=2,
        # DDPPlugin is used to disable `find_unused_parameters`. This annoying warning pops up even
        # in the BoringModel example.
        strategy=DDPPlugin(find_unused_parameters=False),
        logger=wb_logger,
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()


# ---
"""Analyze Outputs (wandb==0.12.7, latest as of Nov 21 2021):

Expected Output:
    Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
    Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
    Debug logger 1 (B): Warning - got DummyLogger <class 'pytorch_lightning.loggers.base.DummyLogger'>
    Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>

Got Output:
    Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
    Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
    Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
    Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>

Somehow, the 2nd GPU's logger is getting detected as a WandbLogger instance.
"""


# ---
"""Full Sample Output:

Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
wandb: Currently logged in as: ... <ignored>
Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>
initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 7.35it/s, loss=-1.15, v_num=mira]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': -0.055130764842033386}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 16/16 [00:00<00:00, 1187.89it/s]
wandb: Waiting for W&B process to finish, PID 20907... (success).
"""