@AmericanPresidentJimmyCarter · Last active September 27, 2024
SimpleTuner LoRA config (runs on an 80GB A100; will run on less with a smaller batch size)
# Configure these values.
# 'lora' or 'full'
# lora - trains a small adapter network for a character or style (or both); quite versatile.
# full - requires lots of VRAM, trains slowly, and needs a lot of data and concepts.
export MODEL_TYPE='lora'
# Set this to 'true' if you are training a Stable Diffusion 3 checkpoint.
# Use MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
export STABLE_DIFFUSION_3=false
# Similarly, this is to train PixArt Sigma (1K or 2K) models.
# Use MODEL_NAME="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
export PIXART_SIGMA=false
# For older Stable Diffusion 1.x/2.x models, enable this.
# Use MODEL_NAME="stabilityai/stable-diffusion-2-1"
export STABLE_DIFFUSION_LEGACY=false
# For Kwai-Kolors, enable KOLORS.
# Use MODEL_NAME="kwai-kolors/kolors-diffusers"
export KOLORS=false
# For Flux, enable this; full-model training may require 8 GPUs and DeepSpeed configured.
# Use MODEL_NAME="black-forest-labs/FLUX.1-dev"
export FLUX=true
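# For example, to train Stable Diffusion 3 instead of Flux, you would flip the flags
# above (illustrative sketch; presumably only one model-family flag is true at a time):
#export FLUX=false
#export STABLE_DIFFUSION_3=true
#export MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"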
# ControlNet model training is only supported when MODEL_TYPE='full'
# See this document for more information: https://github.com/bghira/SimpleTuner/blob/main/documentation/CONTROLNET.md
# DeepFloyd, PixArt, and SD3 do not currently support ControlNet model training.
export CONTROLNET=false
# DoRA enhances the training style of LoRA, but it will run more slowly at the same rank.
# See: https://arxiv.org/abs/2402.09353
# See: https://github.com/huggingface/peft/pull/1474
export USE_DORA=false
# The BitFit freeze strategy freezes everything in the u-net except the biases.
# This may help retain the full model's underlying capabilities. Using BitFit with LoRA is currently untested and not known to work.
#if [[ "$MODEL_TYPE" == "full" ]]; then
# # When training a full model, we will rely on BitFit to keep the u-net intact.
# export USE_BITFIT=true
#elif [[ "$MODEL_TYPE" == "lora" ]]; then
# # LoRA can not use BitFit.
# export USE_BITFIT=false
#elif [[ "$MODEL_TYPE" == "deepfloyd-full" ]]; then
# export USE_BITFIT=true
#fi
# Restart where we left off. Change this to "checkpoint-1234" to start from a specific checkpoint.
export RESUME_CHECKPOINT="latest"
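# For example, to resume from a specific save instead of the most recent one
# (the step number here is just an illustration):
#export RESUME_CHECKPOINT="checkpoint-1234"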
# How often to checkpoint. Depending on your learning rate, you may wish to change this.
# For the default settings with 10 gradient accumulations, more frequent checkpoints might be preferable at first.
export CHECKPOINTING_STEPS=50
# This is how many checkpoints we will keep. Two is safe, but three is safer.
export CHECKPOINTING_LIMIT=2
# This is a relatively conservative 'constant' learning rate.
# Adjust higher or lower depending on how burnt your model becomes.
#
# Originally the loona lora was trained with 5e-5, but that seems really slow.
# ptx0 says 1e-4 to 5e-4 works better.
export LEARNING_RATE=1e-4 #@param {type:"number"}
# Using a Huggingface Hub model:
export MODEL_NAME="black-forest-labs/FLUX.1-dev"
# Using a local path to a huggingface hub model or saved checkpoint:
#export MODEL_NAME="/datasets/models/pipeline"
# Make DEBUG_EXTRA_ARGS empty to disable wandb.
export DEBUG_EXTRA_ARGS="--report_to=wandb"
export TRACKER_PROJECT_NAME="flux-training-losercity"
export TRACKER_RUN_NAME="losercity-flux-loona-more"
# Use either a max number of steps OR a number of epochs, not both.
export MAX_NUM_STEPS=200000
# Will likely overtrain, but that's fine.
export NUM_EPOCHS=0
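# For example, to bound training by epochs instead of steps (assuming a value of 0
# disables the step limit, mirroring NUM_EPOCHS=0 above; the epoch count is illustrative):
#export MAX_NUM_STEPS=0
#export NUM_EPOCHS=25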
# A convenient prefix for all of your training paths.
# These may be absolute or relative paths. Here, we are using relative paths.
# The output will just be in a folder called "output/models" by default.
export DATALOADER_CONFIG="config/multidatabackend.json"
export OUTPUT_DIR="/root/SimpleTuner/output/models"
# Set this to "true" to push your model to Hugging Face Hub.
export PUSH_TO_HUB="true"
# If PUSH_TO_HUB and PUSH_CHECKPOINTS are both enabled, every saved checkpoint will be pushed to Hugging Face Hub.
export PUSH_CHECKPOINTS="true"
# This will be the model name for your final hub upload, eg. "yourusername/yourmodelname"
# It defaults to the wandb project name, but you can override this here.
export HUB_MODEL_NAME=$TRACKER_PROJECT_NAME
# By default, images will be resized so their SMALLER EDGE is 1024 pixels, maintaining aspect ratio.
# Setting this value to 768px might result in more reasonable training data sizes for SDXL.
export RESOLUTION=1024
# If you want to have the training data resized by pixel area (Megapixels) rather than edge length,
# set this value to "area" instead of "pixel", and uncomment the next RESOLUTION declaration.
export RESOLUTION_TYPE="pixel"
#export RESOLUTION=1 # 1.0 Megapixel training sizes
# If RESOLUTION_TYPE="pixel", the minimum resolution specifies the smaller edge length, measured in pixels. Recommended: 1024.
# If RESOLUTION_TYPE="area", the minimum resolution specifies the total image area, measured in megapixels. Recommended: 1.
export MINIMUM_RESOLUTION=$RESOLUTION
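# For example, an area-based setup would pair RESOLUTION=1 (see the commented line
# above) with a matching megapixel minimum (illustrative sketch):
#export RESOLUTION_TYPE="area"
#export MINIMUM_RESOLUTION=1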
# How many decimals to round aspect buckets to.
#export ASPECT_BUCKET_ROUNDING=2
# Use this to append an instance prompt to each caption, used for adding trigger words.
# This has not been tested in SDXL.
#export INSTANCE_PROMPT="lotr style "
# If you also supply a user prompt library or `--use_prompt_library`, this will be added to those lists.
export VALIDATION_PROMPT="loona from helluva boss is eating a donut"
export VALIDATION_GUIDANCE=3.5
# You'll want to set this to 0.7 if you are training a terminal SNR model.
export VALIDATION_GUIDANCE_RESCALE=0.0
# For flux training, you may want to do validation with classifier free guidance.
# You can set this to be >1.0 to do so. If you don't enable CFG, results may be unreliable or look very bad.
# Flux LoRAs have a side-effect of requiring a blank negative prompt, so be sure to set VALIDATION_NEGATIVE_PROMPT="" as well.
export VALIDATION_GUIDANCE_REAL=4.5
# When sampling validations with CFG, skip CFG until this timestep (Flux only, default=2).
export VALIDATION_NO_CFG_UNTIL_TIMESTEP=2
# How frequently we will save and run a pipeline for validations.
export VALIDATION_STEPS=25
export VALIDATION_NUM_INFERENCE_STEPS=15
export VALIDATION_NEGATIVE_PROMPT=""
export VALIDATION_SEED=42
export VALIDATION_RESOLUTION=$RESOLUTION
# Adjust this for your GPU memory size. This, and resolution, are the biggest VRAM killers.
export TRAIN_BATCH_SIZE=6
# Accumulate your update gradient over many steps, to save VRAM while still having higher effective batch size:
# effective batch size = ($TRAIN_BATCH_SIZE * $GRADIENT_ACCUMULATION_STEPS).
export GRADIENT_ACCUMULATION_STEPS=1
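# For example, the values above give an effective batch size of 6 * 1 = 6.
# To cut per-step VRAM while keeping the same effective batch size, you could try
# (illustrative):
#export TRAIN_BATCH_SIZE=3
#export GRADIENT_ACCUMULATION_STEPS=2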
# Use any standard scheduler type: constant, polynomial, constant_with_warmup.
export LR_SCHEDULE="constant_with_warmup"
# A warmup period allows the model, and more importantly the EMA weights, to familiarise themselves with the current data.
# For the cosine or sine type schedules, the warmup period defines the interval between peaks or valleys.
# Use a sine schedule to simulate a warmup period, or a cosine schedule to simulate a polynomial start.
#export LR_WARMUP_STEPS=$((MAX_NUM_STEPS / 10))
export LR_WARMUP_STEPS=50
# Caption dropout probability. Set to 0.1 for 10% of captions dropped out. Set to 0 to disable.
# You may wish to disable dropout if you want to limit your changes strictly to the prompts you show the model.
# You may wish to increase the rate of dropout if you want to more broadly adopt your changes across the model.
export CAPTION_DROPOUT_PROBABILITY=0.1
export METADATA_UPDATE_INTERVAL=65
# How many workers to use for VAE caching.
export MAX_WORKERS=32
# Read and write batch sizes for VAE caching.
export READ_BATCH_SIZE=25
export WRITE_BATCH_SIZE=64
# How many images to encode at once with the VAE. Can increase VRAM use.
export VAE_BATCH_SIZE=12
# How many images to process at once (resize, crop, transform) during VAE caching.
export IMAGE_PROCESSING_BATCH_SIZE=32
# When using large batch sizes, you'll need to increase the pool connection limit.
export AWS_MAX_POOL_CONNECTIONS=128
# For very large systems, setting this can reduce CPU overhead of torch spawning an unnecessarily large number of threads.
export TORCH_NUM_THREADS=8
# If this is set, any images that fail to open will be DELETED to avoid re-checking them every time.
export DELETE_ERRORED_IMAGES=0
# If this is set, any images that are too small for the minimum resolution size will be DELETED.
export DELETE_SMALL_IMAGES=0
# ByteDance recommends these be set to "trailing" so that inference and training behave in a more congruent manner.
# To follow the original SDXL training strategy, use "leading" instead, though results are generally worse.
export TRAINING_SCHEDULER_TIMESTEP_SPACING="trailing"
export INFERENCE_SCHEDULER_TIMESTEP_SPACING="trailing"
# Removing this option or unsetting it uses vanilla training. Setting it reweights the loss by the position of the timestep in the noise schedule.
# A value of 5 is recommended by the researchers. A value of 20 has the least impact, and 1 has the most impact.
export MIN_SNR_GAMMA=5
# Set this to an explicit value of "false" to disable Xformers. Probably required for AMD users.
export USE_XFORMERS=true
# There's basically no reason to unset this. However, to disable it, use an explicit value of "false".
# This will save a lot of memory consumption when enabled.
export USE_GRADIENT_CHECKPOINTING=true
##
# Options below here may require a bit more complicated configuration, so they are not simple variables.
##
# TF32 is great on Ampere or Ada, not sure about earlier generations.
export ALLOW_TF32=true
# AdamW 8Bit is a robust and lightweight choice. Adafactor might reduce memory consumption, and Dadaptation is slow and experimental.
# AdamW is the default optimizer, but it uses a lot of memory and is slower than AdamW8Bit or Adafactor.
# When training a quantised base model, you can't use adamw_bf16. Instead, try adafactor or adamw.
# Choices: adamw, adamw8bit, adafactor, dadaptation, adamw_bf16
export OPTIMIZER="adamw_bf16"
# EMA is a strong regularisation method that uses a lot of extra VRAM to hold a second copy of the weights.
# This is worthwhile on large training runs, but not so much for smaller training runs.
export USE_EMA=false
export EMA_DECAY=0.999
export TRAINER_EXTRA_ARGS="--flux_lora_target=all+ffs --lora_rank=64 --keep_vae_loaded --adam_weight_decay=0.1 --max_grad_norm=0.01 --lr_scheduler=constant_with_warmup --validation_torch_compile=false --lora_init_type=default"
## For offset noise training:
# Not recommended for terminal SNR models.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --offset_noise --noise_offset=0.02"
## For terminal SNR training:
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --prediction_type=v_prediction --rescale_betas_zero_snr"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --training_scheduler_timestep_spacing=trailing --inference_scheduler_timestep_spacing=trailing"
## You may benefit from directing training toward a specific weighted subset of timesteps.
# In this example, we train the final 25% of the timestep schedule with a 3x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=later --timestep_bias_portion=0.25 --timestep_bias_multiplier=3"
# In this example, we train the earliest 25% of the timestep schedule with a 5x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=earlier --timestep_bias_portion=0.25 --timestep_bias_multiplier=5"
# Here, we designate that specifically, timesteps 200 to 500 should be prioritised.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=range --timestep_bias_begin=200 --timestep_bias_end=500 --timestep_bias_multiplier=3"
## For experimental min-SNR weighted loss training (5 is suggested value by the original researchers):
# Not recommended for terminal SNR models.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --snr_gamma=5.0"
# For Wasabi S3 filesystem backend (experimental)
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --data_backend=aws --aws_bucket_name=test123"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_endpoint_url=https://s3.wasabisys.com"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_access_key=1234567890"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_secret_access_key=0987654321"
# Reproducible training. Set to -1 to disable.
export TRAINING_SEED=42
# Mixed precision is the best. You honestly might need to YOLO it in fp16 mode for Google Colab type setups.
export MIXED_PRECISION="bf16" # Might not be supported on all GPUs. fp32 will be needed for others.
export PURE_BF16=true
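# For example, on a GPU without bf16 support you might fall back to fp16, as the
# comment above suggests (sketch; disabling PURE_BF16 alongside it is an assumption):
#export MIXED_PRECISION="fp16"
#export PURE_BF16=false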
# This has to be changed if you're training with multiple GPUs.
export TRAINING_NUM_PROCESSES=1
export TRAINING_NUM_MACHINES=1
export ACCELERATE_EXTRA_ARGS="" # --multi_gpu or other similar flags for huggingface accelerate
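# For example, a single machine with 8 GPUs (--multi_gpu is the standard huggingface
# accelerate flag; adjust the process count to your hardware):
#export TRAINING_NUM_PROCESSES=8
#export ACCELERATE_EXTRA_ARGS="--multi_gpu"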
# With Pytorch 2.1, you might have pretty good luck here.
# If you're using aspect bucketing however, each resolution change will recompile. Seriously, just don't do it.
# Well, then again... Pytorch 2.2 has support for dynamic shapes. Why not?
export TRAINING_DYNAMO_BACKEND='no' # 'no' disables torch compile, in case of performance issues or lack of support (eg. AMD); 'inductor' enables it
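# For example, to brave torch compile (assuming 'inductor' is the accelerate dynamo
# backend name; expect recompiles with aspect bucketing, as noted above):
#export TRAINING_DYNAMO_BACKEND='inductor'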
export TOKENIZERS_PARALLELISM=false