python main.py data=in32_pickle sg.params.condition_method=cluster dynamic=unet_fast dynamic.params.model_channels=128 data.params.batch_size=128 sg.params.cond_dim=10000 data.h5_file=sg_data/cluster/v3_in32p_cluster10000_iter30minp200_nns-1_dino_vitb16_2022-08-17T21_7b919c8.h5 sg.params.cond_drop_prob=0.1 sg.params.cond_scale=2 data.trainer.max_epochs=800 data.fid_every_n_epoch=10 name=aaaa_ep800_4gpu pl.trainer.strategy=ddp devices=4 debug=0
python main.py data=in32_pickle dynamic=unet_fast sg.params.condition_method=label sg.params.cond_drop_prob=0.1 sg.params.cond_scale=2 dynamic.params.model_channels=128 sg.params.cond_dim=1000 data.params.batch_size=128 data.trainer.max_epochs=8800 data.fid_every_n_epoch=10 name=aaaa_v1.6.2_label_in32p_unet_fast_ep8800 pl.trainer.strategy=ddp devices=4 debug=0
python main.py data=in32_pickle dynamic=unet_fast sg.params.condition_method=label sg.params.cond_drop_prob=0.1 sg.params.cond_scale=2 dynamic.params.model_channels=128 sg.params.cond_dim=1000 data.params.batch_size=128 data.trainer.max_epochs=8888 data.fid_every_n_epoch=10 name=aaaa_v1.6.2_label_in32p_unet_fast_ep8888 debug=0
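A minimal sbatch wrapper for the 4-GPU ddp run above, as a sketch: the #SBATCH values mirror the interactive 4-GPU allocation further down, and the job name, log path, and script filename are placeholders to adapt.
#!/bin/bash
#SBATCH --job-name=aaaa_ep800_4gpu
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=20
#SBATCH --mem=120G
#SBATCH --time=6-23
#SBATCH --output=slurm-%j.out
python main.py data=in32_pickle sg.params.condition_method=cluster dynamic=unet_fast \
  dynamic.params.model_channels=128 data.params.batch_size=128 sg.params.cond_dim=10000 \
  data.h5_file=sg_data/cluster/v3_in32p_cluster10000_iter30minp200_nns-1_dino_vitb16_2022-08-17T21_7b919c8.h5 \
  sg.params.cond_drop_prob=0.1 sg.params.cond_scale=2 data.trainer.max_epochs=800 \
  data.fid_every_n_epoch=10 name=aaaa_ep800_4gpu pl.trainer.strategy=ddp devices=4 debug=0
Submit with `sbatch train_in32_cluster.sh` (filename is arbitrary).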
- /ssdstore for ivi-cn022
- /local for das6
- /var/scratch for das5
- /local-ssd for das5
name | local | ssd |
---|---|---|
das5 | /var/scratch | /local-ssd |
das6 | /local/thu | |
ivi | . | /ssdstore |
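A small shell sketch for picking the scratch dir per machine; the hostname patterns are assumptions, adjust them to the real node names:
case "$(hostname)" in
  *das5*)     SCRATCH=/var/scratch ;;   # node-local SSD would be /local-ssd
  *das6*)     SCRATCH=/local/thu ;;
  ivi-cn022*) SCRATCH=/ssdstore ;;
  *)          SCRATCH=$HOME ;;          # fallback
esac
echo "using scratch: $SCRATCH"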
srun -u --pty --gres=gpu:4 --time=6-23 bash -i
srun -u --pty -p fatq --gres=gpu:4 --time=6-23 bash -i
1GPU:
srun -u --pty --gres=gpu:1 --mem=60G --cpus-per-task=10 --time=6-23 bash -i
4GPU:
srun -u --pty --gres=gpu:4 --mem=120G --cpus-per-task=20 --time=6-23 bash -i
srun -u --pty --gres=gpu:4 --account=ceesusers --nodelist=ivi-cn022 --time=6-23 --mem=250G --cpus-per-task=110 -p cees bash -i
8GPU:
srun -u --pty --gres=gpu:8 --account=ceesusers --nodelist=ivi-cn022 --time=6-23 --mem=250G --cpus-per-task=110 -p cees bash -i
srun -u --pty --nodelist=ivi-cn019 --gres=gpu:8 --mem=200G --cpus-per-task=42 --time=6-23 -p biggpu bash -i
srun -u --pty --gres=gpu:8 --account=quvausers --nodelist=ivi-cn023 --time=6-23 --mem=250G --cpus-per-task=110 -p quva bash -i
srun -u --pty --gres=gpu:4 --account=quvausers --nodelist=ivi-cn023 --time=6-23 --mem=110G --cpus-per-task=55 -p quva bash -i
srun -p gpu -t 4-23 --mail-user=[email protected] --mail-type=ALL --cpus-per-task=32 --gres=gpu:2 -u --pty bash -i
srun -p gpu -t 4-23 --mail-user=[email protected] --mail-type=ALL --cpus-per-task=72 --gres=gpu:4 -u --pty bash -i
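Once the interactive shell starts, a quick sanity check of what was actually granted (uses standard SLURM env vars and nvidia-smi):
echo "job $SLURM_JOB_ID on $(hostname), cpus=$SLURM_CPUS_PER_TASK"
nvidia-smi --query-gpu=index,name,memory.total --format=csv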
accinfo to check the budget
"squeue -u thu32" to check your current jobs
use
/scratch-local/
/scratch-shared/
You have several TB of quota in those paths,
but data is deleted after 6 days (/scratch-local) or 14 days (/scratch-shared).
scp -r lsun thu32@snellius:/scratch-shared/thu/data/
scp -r ffhq thu32@snellius:/scratch-shared/thu/data/
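rsync is a resumable alternative to scp for these large copies (same remote paths as above; -avP keeps partial transfers and shows progress):
rsync -avP lsun thu32@snellius:/scratch-shared/thu/data/
rsync -avP ffhq thu32@snellius:/scratch-shared/thu/data/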
ivi:
df -h|grep thu
das6:
quota -sv
das5: go to the jumphost, then
quota -sv
check current dir disk usage
du -hd 1
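to rank subdirectories by size (assumes GNU sort with -h):
du -hd 1 | sort -h | tail -n 10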
module avail|grep cuda1
check your loaded modules
module list
https://curc.readthedocs.io/en/latest/compute/modules.html
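typical module flow, as a sketch; the cuda module name is a placeholder, take the exact one from `module avail`:
module purge                  # start from a clean environment
module load cuda11.3/toolkit  # placeholder name
module list                   # confirm what is loaded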
- das5: /home/koelma/pytorch_work/ilsvrc2012_{train,val}.zip
- ivi:
- das6:
how to resize-pane:
https://michaelsoolee.com/resize-tmux-panes/
Ctrl+b, then :, then resize-pane -R 10
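optional: repeatable resize bindings, a sketch using the default Ctrl+b prefix; run these once in a session, or put the same lines (without the leading `tmux`) in ~/.tmux.conf:
tmux bind -r H resize-pane -L 10
tmux bind -r J resize-pane -D 10
tmux bind -r K resize-pane -U 10
tmux bind -r L resize-pane -R 10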
go to the beginning of the line:
Ctrl+A