When using multiple GPUs, we get a weird error regarding the logger. PyTorch Lightning creates a DummyLogger for processes with rank != 0, but that object fails an isinstance check for DummyLogger - it instead tests true as a WandbLogger instance, which messes with the code. This script is a minimal reproducible example for this bug.
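In a nutshell, the failing pattern looks like the minimal sketch below (illustrative only; the full reproducer script follows). Under DDP, the rank != 0 process still passes the WandbLogger isinstance check, while its `experiment` attribute is a DummyExperiment rather than a real wandb run, so code that branches on the logger type or uses `experiment.dir` as a path breaks:

from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.loggers.base import DummyExperiment

wb_logger = pl_loggers.WandbLogger(save_dir="./")
# Expected on rank != 0: a DummyLogger, so this check should be False there.
# Observed: it is True on every rank...
print(isinstance(wb_logger, pl_loggers.WandbLogger))
# ...while the underlying experiment is only a real wandb run on rank 0:
print(isinstance(wb_logger.experiment, DummyExperiment))
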
"""When using multiple GPUs, getting weird error regarding logger. It creates a DummyLogger for every GPU except | |
the main one, which messes with the code. This script is a minimal reproducible example for this bug""" | |
import os | |
import warnings | |
import torch | |
import wandb | |
from torch.utils.data import DataLoader, Dataset | |
from pytorch_lightning import LightningModule, Trainer | |
from pytorch_lightning.plugins import DDPPlugin | |
from pytorch_lightning import loggers as pl_loggers | |
from pytorch_lightning.utilities import rank_zero_only | |


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def get_logger(save_dir: str = "./"):
    wb_logger = pl_loggers.WandbLogger(name=None, id=None, save_dir=str(save_dir))

    # Does not work: checking the logger type. On rank != 0 this should be a DummyLogger,
    # but the WandbLogger check below still passes.
    if isinstance(wb_logger, pl_loggers.WandbLogger):
        print("Debug logger 1 (A): ", type(wb_logger))
    else:
        print("Debug logger 1 (B): Warning - got DummyLogger", type(wb_logger))

    # Workaround: catch DummyLogger instances, which give a weird value instead of a path.
    if isinstance(wb_logger.experiment.dir, str):
        print("Debug logger 2 (A): Got path to exp dir from WandB logger: ", wb_logger.experiment.dir)
        # In my training script, I would save a config file to the exp dir created by the WandbLogger.
    else:
        print("Debug logger 2 (B): Warning - got DummyLogger obj instead of path: ", wb_logger.experiment.dir)

    return wb_logger


# Alternate workaround: run func only on rank zero?
@rank_zero_only
def get_logger_2(save_dir: str = "./"):
    wb_logger = pl_loggers.WandbLogger(name=None, id=None, save_dir=str(save_dir))

    # Normally, save file to wb_logger.experiment.dir
    # Use workaround instead to catch DummyLogger instances giving weird path.
    if isinstance(wb_logger.experiment.dir, str):
        print("Debug logger 3 (A): Got path to exp dir from WandB logger: ", wb_logger.experiment.dir)
    else:
        print("Debug logger 3 (B): Warning - got DummyLogger obj instead of path: ", wb_logger.experiment.dir)

    return wb_logger


def run():
    warnings.filterwarnings("ignore", "^The dataloader, train_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The dataloader, val_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The dataloader, test_dataloader.*, does not have many workers.*")
    warnings.filterwarnings("ignore", "^The number of training samples .* is smaller than the logging interval.*")

    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    # Does not work: enable an experimental wandb feature behind a flag to see if it resolves the issue.
    wandb.require(experiment="service")

    wb_logger = get_logger()
    # wb_logger = get_logger_2()  # Alternate workaround - use rank_zero_only

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        enable_model_summary=False,
        gpus=2,
        # DDPPlugin is used to disable `find_unused_parameters`. This annoying warning pops up even
        # in the BoringModel example.
        strategy=DDPPlugin(find_unused_parameters=False),
        logger=wb_logger,
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()


# ---
"""Analyze Outputs (wandb==0.12.7, latest as of Nov 21 2021):

Expected Output:
    Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
    Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
    Debug logger 1 (B): Warning - got DummyLogger <class 'pytorch_lightning.loggers.base.DummyLogger'>
    Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>

Got Output:
    Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
    Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
    Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
    Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>

Somehow, the 2nd GPU's logger is getting detected as a WandbLogger instance.
"""


# ---
"""Full Sample Output:

Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
wandb: Currently logged in as: ... <ignored>
Debug logger 2 (A): Got path to exp dir from WandB logger: ./wandb/run-20211121_012407-3of6fq7j/files
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Debug logger 1 (A): <class 'pytorch_lightning.loggers.wandb.WandbLogger'>
Debug logger 2 (B): Warning - got DummyLogger obj instead of path: <bound method DummyExperiment.nop of <pytorch_lightning.loggers.base.DummyExperiment object at 0x7f9293cf9490>>
initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 7.35it/s, loss=-1.15, v_num=mira]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': -0.055130764842033386}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 16/16 [00:00<00:00, 1187.89it/s]
wandb: Waiting for W&B process to finish, PID 20907... (success).
"""