Jaid · August 17, 2024 10:34
diff --git a/kohya-sd-scripts-help.txt b/kohya-sd-scripts-help.txt
 usage: sdxl_train_network.py [-h] [--console_log_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
                             [--console_log_file CONSOLE_LOG_FILE] [--console_log_simple] [--v2]
                             [--v_parameterization]
                             [--pretrained_model_name_or_path PRETRAINED_MODEL_NAME_OR_PATH]
                             [--tokenizer_cache_dir TOKENIZER_CACHE_DIR]
                             [--train_data_dir TRAIN_DATA_DIR] [--cache_info] [--shuffle_caption]
                             [--caption_separator CAPTION_SEPARATOR]
                             [--caption_extension CAPTION_EXTENSION]
                             [--caption_extention CAPTION_EXTENTION] [--keep_tokens KEEP_TOKENS]
                             [--keep_tokens_separator KEEP_TOKENS_SEPARATOR]
                             [--secondary_separator SECONDARY_SEPARATOR] [--enable_wildcard]
                             [--caption_prefix CAPTION_PREFIX] [--caption_suffix CAPTION_SUFFIX]
                             [--color_aug] [--flip_aug]
                             [--face_crop_aug_range FACE_CROP_AUG_RANGE] [--random_crop]
                             [--debug_dataset] [--resolution RESOLUTION] [--cache_latents]
                             [--vae_batch_size VAE_BATCH_SIZE] [--cache_latents_to_disk]
                             [--enable_bucket] [--min_bucket_reso MIN_BUCKET_RESO]
                             [--max_bucket_reso MAX_BUCKET_RESO]
                             [--bucket_reso_steps BUCKET_RESO_STEPS] [--bucket_no_upscale]
                             [--token_warmup_min TOKEN_WARMUP_MIN]
                             [--token_warmup_step TOKEN_WARMUP_STEP] [--alpha_mask]
                             [--dataset_class DATASET_CLASS]
                             [--caption_dropout_rate CAPTION_DROPOUT_RATE]
                             [--caption_dropout_every_n_epochs CAPTION_DROPOUT_EVERY_N_EPOCHS]
                             [--caption_tag_dropout_rate CAPTION_TAG_DROPOUT_RATE]
                             [--reg_data_dir REG_DATA_DIR] [--in_json IN_JSON]
                             [--dataset_repeats DATASET_REPEATS] [--output_dir OUTPUT_DIR]
                             [--output_name OUTPUT_NAME]
                             [--huggingface_repo_id HUGGINGFACE_REPO_ID]
                             [--huggingface_repo_type HUGGINGFACE_REPO_TYPE]
                             [--huggingface_path_in_repo HUGGINGFACE_PATH_IN_REPO]
                             [--huggingface_token HUGGINGFACE_TOKEN]
                             [--huggingface_repo_visibility HUGGINGFACE_REPO_VISIBILITY]
                             [--save_state_to_huggingface] [--resume_from_huggingface]
                             [--async_upload] [--save_precision {None,float,fp16,bf16}]
                             [--save_every_n_epochs SAVE_EVERY_N_EPOCHS]
                             [--save_every_n_steps SAVE_EVERY_N_STEPS]
                             [--save_n_epoch_ratio SAVE_N_EPOCH_RATIO]
                             [--save_last_n_epochs SAVE_LAST_N_EPOCHS]
                             [--save_last_n_epochs_state SAVE_LAST_N_EPOCHS_STATE]
                             [--save_last_n_steps SAVE_LAST_N_STEPS]
                             [--save_last_n_steps_state SAVE_LAST_N_STEPS_STATE] [--save_state]
                             [--save_state_on_train_end] [--resume RESUME]
                             [--train_batch_size TRAIN_BATCH_SIZE]
                             [--max_token_length {None,150,225}] [--mem_eff_attn]
                             [--torch_compile]
                             [--dynamo_backend {eager,aot_eager,inductor,aot_ts_nvfuser,nvprims_nvfuser,cudagraphs,ofi,fx2trt,onnxrt}]
                             [--xformers] [--sdpa] [--vae VAE]
                             [--max_train_steps MAX_TRAIN_STEPS]
                             [--max_train_epochs MAX_TRAIN_EPOCHS]
                             [--max_data_loader_n_workers MAX_DATA_LOADER_N_WORKERS]
                             [--persistent_data_loader_workers] [--seed SEED]
                             [--gradient_checkpointing]
                             [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
                             [--mixed_precision {no,fp16,bf16}] [--full_fp16] [--full_bf16]
                             [--fp8_base] [--ddp_timeout DDP_TIMEOUT]
                             [--ddp_gradient_as_bucket_view] [--ddp_static_graph]
                             [--clip_skip CLIP_SKIP] [--logging_dir LOGGING_DIR]
                             [--log_with {tensorboard,wandb,all}] [--log_prefix LOG_PREFIX]
                             [--log_tracker_name LOG_TRACKER_NAME]
                             [--wandb_run_name WANDB_RUN_NAME]
                             [--log_tracker_config LOG_TRACKER_CONFIG]
                             [--wandb_api_key WANDB_API_KEY] [--log_config]
                             [--noise_offset NOISE_OFFSET] [--noise_offset_random_strength]
                             [--multires_noise_iterations MULTIRES_NOISE_ITERATIONS]
                             [--ip_noise_gamma IP_NOISE_GAMMA]
                             [--ip_noise_gamma_random_strength]
                             [--multires_noise_discount MULTIRES_NOISE_DISCOUNT]
                             [--adaptive_noise_scale ADAPTIVE_NOISE_SCALE] [--zero_terminal_snr]
                             [--min_timestep MIN_TIMESTEP] [--max_timestep MAX_TIMESTEP]
                             [--loss_type {l1,l2,huber,smooth_l1}]
                             [--huber_schedule {constant,exponential,snr}] [--huber_c HUBER_C]
                             [--lowram] [--highvram]
                             [--sample_every_n_steps SAMPLE_EVERY_N_STEPS] [--sample_at_first]
                             [--sample_every_n_epochs SAMPLE_EVERY_N_EPOCHS]
                             [--sample_prompts SAMPLE_PROMPTS]
                             [--sample_sampler {ddim,pndm,lms,euler,euler_a,heun,dpm_2,dpm_2_a,dpmsolver,dpmsolver++,dpmsingle,k_lms,k_euler,k_euler_a,k_dpm_2,k_dpm_2_a}]
                             [--config_file CONFIG_FILE] [--output_config]
                             [--metadata_title METADATA_TITLE] [--metadata_author METADATA_AUTHOR]
                             [--metadata_description METADATA_DESCRIPTION]
                             [--metadata_license METADATA_LICENSE]
                             [--metadata_tags METADATA_TAGS]
                             [--prior_loss_weight PRIOR_LOSS_WEIGHT]
                             [--conditioning_data_dir CONDITIONING_DATA_DIR] [--masked_loss]
                             [--deepspeed] [--zero_stage {0,1,2,3}]
                             [--offload_optimizer_device {None,cpu,nvme}]
                             [--offload_optimizer_nvme_path OFFLOAD_OPTIMIZER_NVME_PATH]
                             [--offload_param_device {None,cpu,nvme}]
                             [--offload_param_nvme_path OFFLOAD_PARAM_NVME_PATH]
                             [--zero3_init_flag] [--zero3_save_16bit_model]
                             [--fp16_master_weights_and_gradients]
                             [--optimizer_type OPTIMIZER_TYPE] [--use_8bit_adam]
                             [--use_lion_optimizer] [--learning_rate LEARNING_RATE]
                             [--max_grad_norm MAX_GRAD_NORM]
                             [--optimizer_args [OPTIMIZER_ARGS ...]]
                             [--lr_scheduler_type LR_SCHEDULER_TYPE]
                             [--lr_scheduler_args [LR_SCHEDULER_ARGS ...]]
                             [--lr_scheduler LR_SCHEDULER] [--lr_warmup_steps LR_WARMUP_STEPS]
                             [--lr_scheduler_num_cycles LR_SCHEDULER_NUM_CYCLES]
                             [--lr_scheduler_power LR_SCHEDULER_POWER] [--fused_backward_pass]
                             [--dataset_config DATASET_CONFIG] [--min_snr_gamma MIN_SNR_GAMMA]
                             [--scale_v_pred_loss_like_noise_pred]
                             [--v_pred_like_loss V_PRED_LIKE_LOSS] [--debiased_estimation_loss]
                             [--weighted_captions] [--no_metadata]
                             [--save_model_as {None,ckpt,pt,safetensors}] [--unet_lr UNET_LR]
                             [--text_encoder_lr TEXT_ENCODER_LR]
                             [--network_weights NETWORK_WEIGHTS]
                             [--network_module NETWORK_MODULE] [--network_dim NETWORK_DIM]
                             [--network_alpha NETWORK_ALPHA] [--network_dropout NETWORK_DROPOUT]
                             [--network_args [NETWORK_ARGS ...]] [--network_train_unet_only]
                             [--network_train_text_encoder_only]
                             [--training_comment TRAINING_COMMENT] [--dim_from_weights]
                             [--scale_weight_norms SCALE_WEIGHT_NORMS]
                             [--base_weights [BASE_WEIGHTS ...]]
                             [--base_weights_multiplier [BASE_WEIGHTS_MULTIPLIER ...]]
                             [--no_half_vae] [--skip_until_initial_step]
                             [--initial_epoch INITIAL_EPOCH] [--initial_step INITIAL_STEP]
                             [--cache_text_encoder_outputs]
                             [--cache_text_encoder_outputs_to_disk]
                             [--disable_mmap_load_safetensors]

 options:
  -h, --help            show this help message and exit
  --console_log_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
                        Set the logging level
  --console_log_file CONSOLE_LOG_FILE
                        Log to a file instead of stderr
  --console_log_simple  Simple log output
  --v2                  load Stable Diffusion v2.0 model
  --v_parameterization  enable v-parameterization training
  --pretrained_model_name_or_path PRETRAINED_MODEL_NAME_OR_PATH
                        pretrained model to train, directory to Diffusers model or StableDiffusion
                        checkpoint
  --tokenizer_cache_dir TOKENIZER_CACHE_DIR
                        directory for caching Tokenizer (for offline training)
  --train_data_dir TRAIN_DATA_DIR
                        directory for train images
  --cache_info          cache meta information (caption and image size) for faster dataset
                        loading. only available for DreamBooth
  --shuffle_caption     shuffle separated caption
  --caption_separator CAPTION_SEPARATOR
                        separator for caption
  --caption_extension CAPTION_EXTENSION
                        extension of caption files
  --caption_extention CAPTION_EXTENTION
                        extension of caption files (backward compatibility)
  --keep_tokens KEEP_TOKENS
                        keep the first N tokens in place when shuffling caption tokens (a token is
                        a comma-separated string)
  --keep_tokens_separator KEEP_TOKENS_SEPARATOR
                        A custom separator to divide the caption into fixed and flexible parts.
                        Tokens before this separator will not be shuffled. If not specified,
                        '--keep_tokens' will be used to determine the fixed number of tokens.
  --secondary_separator SECONDARY_SEPARATOR
                        a secondary separator for caption. This separator is replaced with
                        caption_separator after dropping/shuffling caption
  --enable_wildcard     enable wildcard for caption (e.g. '{image|picture|rendition}')
  --caption_prefix CAPTION_PREFIX
                        prefix for caption text
  --caption_suffix CAPTION_SUFFIX
                        suffix for caption text
  --color_aug           enable weak color augmentation
  --flip_aug            enable horizontal flip augmentation
  --face_crop_aug_range FACE_CROP_AUG_RANGE
                        enable face-centered crop augmentation and its range (e.g. 2.0,4.0)
  --random_crop         enable random crop (for style training in face-centered crop augmentation)
  --debug_dataset       show images for debugging (do not train)
  --resolution RESOLUTION
                        resolution in training ('size' or 'width,height')
  --cache_latents       cache latents to main memory to reduce VRAM usage (augmentations must be
                        disabled)
  --vae_batch_size VAE_BATCH_SIZE
                        batch size for caching latents
  --cache_latents_to_disk
                        cache latents to disk to reduce VRAM usage (augmentations must be
                        disabled)
  --enable_bucket       enable buckets for multi aspect ratio training
  --min_bucket_reso MIN_BUCKET_RESO
                        minimum resolution for buckets
  --max_bucket_reso MAX_BUCKET_RESO
                        maximum resolution for buckets
  --bucket_reso_steps BUCKET_RESO_STEPS
                        steps of resolution for buckets, divisible by 8 is recommended
  --bucket_no_upscale   make bucket for each image without upscaling
  --token_warmup_min TOKEN_WARMUP_MIN
                        start learning at N tags (token means comma separated strings)
  --token_warmup_step TOKEN_WARMUP_STEP
                        tag length reaches maximum on N steps (or N*max_train_steps if N<1)
  --alpha_mask          use alpha channel as mask for training
  --dataset_class DATASET_CLASS
                        dataset class for arbitrary dataset (package.module.Class)
  --caption_dropout_rate CAPTION_DROPOUT_RATE
                        Rate of caption dropout (0.0~1.0)
  --caption_dropout_every_n_epochs CAPTION_DROPOUT_EVERY_N_EPOCHS
                        Dropout all captions every N epochs
  --caption_tag_dropout_rate CAPTION_TAG_DROPOUT_RATE
                        Rate of dropout for comma-separated tokens (0.0~1.0)
  --reg_data_dir REG_DATA_DIR
                        directory for regularization images
  --in_json IN_JSON     json metadata for dataset
  --dataset_repeats DATASET_REPEATS
                        repeat dataset when training with captions
  --output_dir OUTPUT_DIR
                        directory to output trained model
  --output_name OUTPUT_NAME
                        base name of trained model file
  --huggingface_repo_id HUGGINGFACE_REPO_ID
                        huggingface repo name to upload
  --huggingface_repo_type HUGGINGFACE_REPO_TYPE
                        huggingface repo type to upload
  --huggingface_path_in_repo HUGGINGFACE_PATH_IN_REPO
                        huggingface model path to upload files
  --huggingface_token HUGGINGFACE_TOKEN
                        huggingface token
  --huggingface_repo_visibility HUGGINGFACE_REPO_VISIBILITY
                        huggingface repository visibility ('public' for public, 'private' or None
                        for private)
  --save_state_to_huggingface
                        save state to huggingface
  --resume_from_huggingface
                        resume from huggingface (ex: --resume
                        {repo_id}/{path_in_repo}:{revision}:{repo_type})
  --async_upload        upload to huggingface asynchronously
  --save_precision {None,float,fp16,bf16}
                        precision in saving
  --save_every_n_epochs SAVE_EVERY_N_EPOCHS
                        save checkpoint every N epochs
  --save_every_n_steps SAVE_EVERY_N_STEPS
                        save checkpoint every N steps
  --save_n_epoch_ratio SAVE_N_EPOCH_RATIO
                        epoch ratio for saving checkpoints (for example, 5 means save at least 5
                        files in total)
  --save_last_n_epochs SAVE_LAST_N_EPOCHS
                        save last N checkpoints when saving every N epochs (remove older
                        checkpoints)
  --save_last_n_epochs_state SAVE_LAST_N_EPOCHS_STATE
                        save last N checkpoints of state (overrides the value of
                        --save_last_n_epochs)
  --save_last_n_steps SAVE_LAST_N_STEPS
                        save checkpoints until N steps elapsed (remove older checkpoints if N
                        steps elapsed)
  --save_last_n_steps_state SAVE_LAST_N_STEPS_STATE
                        save states until N steps elapsed (remove older states if N steps elapsed,
                        overrides --save_last_n_steps)
  --save_state          save training state additionally (including optimizer states etc.) when
                        saving model
  --save_state_on_train_end
                        save training state (including optimizer states etc.) on train end
  --resume RESUME       saved state to resume training
  --train_batch_size TRAIN_BATCH_SIZE
                        batch size for training
  --max_token_length {None,150,225}
                        max token length of text encoder (default is 75; 150 or 225 can be set)
  --mem_eff_attn        use memory efficient attention for CrossAttention
  --torch_compile       use torch.compile (requires PyTorch 2.0)
  --dynamo_backend {eager,aot_eager,inductor,aot_ts_nvfuser,nvprims_nvfuser,cudagraphs,ofi,fx2trt,onnxrt}
                        dynamo backend type (default is inductor)
  --xformers            use xformers for CrossAttention
  --sdpa                use sdpa for CrossAttention (requires PyTorch 2.0)
  --vae VAE             path to checkpoint of vae to replace
  --max_train_steps MAX_TRAIN_STEPS
                        training steps
  --max_train_epochs MAX_TRAIN_EPOCHS
                        training epochs (overrides max_train_steps)
  --max_data_loader_n_workers MAX_DATA_LOADER_N_WORKERS
                        max num workers for DataLoader (lower is less main RAM usage, faster epoch
                        start and slower data loading)
  --persistent_data_loader_workers
                        persistent DataLoader workers (useful for reducing the time gap between
                        epochs, but may use more memory)
  --seed SEED           random seed for training
  --gradient_checkpointing
                        enable gradient checkpointing
  --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
                        Number of update steps to accumulate before performing a backward/update
                        pass
  --mixed_precision {no,fp16,bf16}
                        use mixed precision
  --full_fp16           fp16 training including gradients
  --full_bf16           bf16 training including gradients
  --fp8_base            use fp8 for base model
  --ddp_timeout DDP_TIMEOUT
                        DDP timeout in minutes (None for the default of accelerate)
  --ddp_gradient_as_bucket_view
                        enable gradient_as_bucket_view for DDP
  --ddp_static_graph    enable static_graph for DDP
  --clip_skip CLIP_SKIP
                        use output of nth layer from back of text encoder (n>=1)
  --logging_dir LOGGING_DIR
                        enable logging and output TensorBoard log to this directory
  --log_with {tensorboard,wandb,all}
                        what logging tool(s) to use (if 'all', TensorBoard and WandB are both
                        used)
  --log_prefix LOG_PREFIX
                        add prefix for each log directory
  --log_tracker_name LOG_TRACKER_NAME
                        name of tracker to use for logging, default is script-specific default
                        name
  --wandb_run_name WANDB_RUN_NAME
                        The name of the specific wandb session
  --log_tracker_config LOG_TRACKER_CONFIG
                        path to tracker config file to use for logging
  --wandb_api_key WANDB_API_KEY
                        specify WandB API key to log in before starting training (optional).
  --log_config          log training configuration
  --noise_offset NOISE_OFFSET
                        enable noise offset with this value (if enabled, around 0.1 is
                        recommended)
  --noise_offset_random_strength
                        use random strength between 0~noise_offset for noise offset.
  --multires_noise_iterations MULTIRES_NOISE_ITERATIONS
                        enable multires noise with this number of iterations (if enabled, around
                        6-10 is recommended)
  --ip_noise_gamma IP_NOISE_GAMMA
                        enable input perturbation noise. used for regularization. recommended
                        value: around 0.1 (from arxiv.org/abs/2301.11706)
  --ip_noise_gamma_random_strength
                        Use random strength between 0~ip_noise_gamma for input perturbation noise.
  --multires_noise_discount MULTIRES_NOISE_DISCOUNT
                        set discount value for multires noise (has no effect without
                        --multires_noise_iterations)
  --adaptive_noise_scale ADAPTIVE_NOISE_SCALE
                        add `latent mean absolute value * this value` to noise_offset (disabled if
                        None, default)
  --zero_terminal_snr   fix noise scheduler betas to enforce zero terminal SNR
  --min_timestep MIN_TIMESTEP
                        set minimum time step for U-Net training (0~999, default is 0)
  --max_timestep MAX_TIMESTEP
                        set maximum time step for U-Net training (1~1000, default is 1000)
  --loss_type {l1,l2,huber,smooth_l1}
                        The type of loss function to use (L1, L2, Huber, or smooth L1), default is
                        L2
  --huber_schedule {constant,exponential,snr}
                        The scheduling method for Huber loss (constant, exponential, or
                        SNR-based). Only used when loss_type is 'huber' or 'smooth_l1'.
                        default is snr
  --huber_c HUBER_C     The huber loss parameter. Only used if one of the huber loss modes (huber
                        or smooth l1) is selected with loss_type. default is 0.1
  --lowram              enable low RAM optimization. e.g. load models to VRAM instead of RAM (for
                        machines which have bigger VRAM than RAM such as Colab and Kaggle)
  --highvram            disable low VRAM optimization. e.g. do not clear CUDA cache after each
                        latent caching (for machines which have bigger VRAM)
  --sample_every_n_steps SAMPLE_EVERY_N_STEPS
                        generate sample images every N steps
  --sample_at_first     generate sample images before training
  --sample_every_n_epochs SAMPLE_EVERY_N_EPOCHS
                        generate sample images every N epochs (overrides n_steps)
  --sample_prompts SAMPLE_PROMPTS
                        file for prompts to generate sample images
  --sample_sampler {ddim,pndm,lms,euler,euler_a,heun,dpm_2,dpm_2_a,dpmsolver,dpmsolver++,dpmsingle,k_lms,k_euler,k_euler_a,k_dpm_2,k_dpm_2_a}
                        sampler (scheduler) type for sample images
  --config_file CONFIG_FILE
                        use a .toml file instead of args to pass hyperparameters
  --output_config       output command line args to given .toml file
  --metadata_title METADATA_TITLE
                        title for model metadata (default is output_name)
  --metadata_author METADATA_AUTHOR
                        author name for model metadata
  --metadata_description METADATA_DESCRIPTION
                        description for model metadata
  --metadata_license METADATA_LICENSE
                        license for model metadata
  --metadata_tags METADATA_TAGS
                        tags for model metadata, separated by comma
  --prior_loss_weight PRIOR_LOSS_WEIGHT
                        loss weight for regularization images
  --conditioning_data_dir CONDITIONING_DATA_DIR
                        conditioning data directory
  --masked_loss         apply mask for calculating loss. conditioning_data_dir is required for
                        dataset.
  --deepspeed           enable deepspeed training
  --zero_stage {0,1,2,3}
                        Possible options are 0,1,2,3.
  --offload_optimizer_device {None,cpu,nvme}
                        Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and
                        3.
  --offload_optimizer_nvme_path OFFLOAD_OPTIMIZER_NVME_PATH
                        Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
  --offload_param_device {None,cpu,nvme}
                        Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.
  --offload_param_nvme_path OFFLOAD_PARAM_NVME_PATH
                        Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.
  --zero3_init_flag     Flag to indicate whether to enable `deepspeed.zero.Init` for constructing
                        massive models. Only applicable with ZeRO Stage-3.
  --zero3_save_16bit_model
                        Flag to indicate whether to save 16-bit model. Only applicable with ZeRO
                        Stage-3.
  --fp16_master_weights_and_gradients
                        fp16_master_weights_and_gradients requires the optimizer to support
                        keeping fp16 master weights and gradients while keeping the optimizer
                        states in fp32.
  --optimizer_type OPTIMIZER_TYPE
                        Optimizer to use: AdamW (default), AdamW8bit, PagedAdamW, PagedAdamW8bit,
                        PagedAdamW32bit, Lion8bit, PagedLion8bit, Lion, SGDNesterov,
                        SGDNesterov8bit, DAdaptation(DAdaptAdamPreprint), DAdaptAdaGrad,
                        DAdaptAdam, DAdaptAdan, DAdaptAdanIP, DAdaptLion, DAdaptSGD, AdaFactor
  --use_8bit_adam       use 8bit AdamW optimizer (requires bitsandbytes)
  --use_lion_optimizer  use Lion optimizer (requires lion-pytorch)
  --learning_rate LEARNING_RATE
                        learning rate
  --max_grad_norm MAX_GRAD_NORM
                        Max gradient norm, 0 for no clipping
  --optimizer_args [OPTIMIZER_ARGS ...]
                        additional arguments for optimizer (like "weight_decay=0.01
                        betas=0.9,0.999 ...")
  --lr_scheduler_type LR_SCHEDULER_TYPE
                        custom scheduler module
  --lr_scheduler_args [LR_SCHEDULER_ARGS ...]
                        additional arguments for scheduler (like "T_max=100")
  --lr_scheduler LR_SCHEDULER
                        scheduler to use for learning rate: linear, cosine,
                        cosine_with_restarts, polynomial, constant (default),
                        constant_with_warmup, adafactor
  --lr_warmup_steps LR_WARMUP_STEPS
                        Number of steps for the warmup in the lr scheduler (default is 0)
  --lr_scheduler_num_cycles LR_SCHEDULER_NUM_CYCLES
                        Number of restarts for cosine scheduler with restarts
  --lr_scheduler_power LR_SCHEDULER_POWER
                        Polynomial power for polynomial scheduler
  --fused_backward_pass
                        Combines backward pass and optimizer step to reduce VRAM usage. Only
                        available in SDXL
  --dataset_config DATASET_CONFIG
                        config file for detail settings
  --min_snr_gamma MIN_SNR_GAMMA
                        gamma for reducing the weight of high loss timesteps. Lower numbers have
                        stronger effect. 5 is recommended by paper.
  --scale_v_pred_loss_like_noise_pred
                        scale v-prediction loss like noise prediction loss
  --v_pred_like_loss V_PRED_LIKE_LOSS
                        add v-prediction like loss multiplied by this value
  --debiased_estimation_loss
                        debiased estimation loss
  --weighted_captions   Enable weighted captions in the standard style (token:1.3). No commas
                        inside parens, or shuffle/dropout may break the decoder.
  --no_metadata         do not save metadata in output model
  --save_model_as {None,ckpt,pt,safetensors}
                        format to save the model (default is .safetensors)
  --unet_lr UNET_LR     learning rate for U-Net
  --text_encoder_lr TEXT_ENCODER_LR
                        learning rate for Text Encoder
  --network_weights NETWORK_WEIGHTS
                        pretrained weights for network
  --network_module NETWORK_MODULE
                        network module to train
  --network_dim NETWORK_DIM
                        network dimensions (depends on each network)
  --network_alpha NETWORK_ALPHA
                        alpha for LoRA weight scaling, default 1 (same as network_dim for same
                        behavior as old version)
  --network_dropout NETWORK_DROPOUT
                        Drops neurons out of training every step (0 or None is default behavior
                        (no dropout), 1 would drop all neurons)
  --network_args [NETWORK_ARGS ...]
                        additional arguments for network (key=value)
  --network_train_unet_only
                        only training U-Net part
  --network_train_text_encoder_only
                        only training Text Encoder part
  --training_comment TRAINING_COMMENT
                        arbitrary comment string stored in metadata
  --dim_from_weights    automatically determine dim (rank) from network_weights
  --scale_weight_norms SCALE_WEIGHT_NORMS
                        Scale the weight of each key pair to help prevent overtraining via
                        exploding gradients. (1 is a good starting point)
  --base_weights [BASE_WEIGHTS ...]
                        network weights to merge into the model before training
  --base_weights_multiplier [BASE_WEIGHTS_MULTIPLIER ...]
                        multiplier for network weights to merge into the model before training
  --no_half_vae         do not use fp16/bf16 VAE in mixed precision (use float VAE)
  --skip_until_initial_step
                        skip training until initial_step is reached
  --initial_epoch INITIAL_EPOCH
                        initial epoch number, 1 means first epoch (same as not specifying). NOTE:
                        initial_epoch/step doesn't affect the lr scheduler, which means the lr
                        scheduler will start from 0 without `--resume`.
  --initial_step INITIAL_STEP
                        initial step number including all epochs, 0 means first step (same as not
                        specifying). overrides initial_epoch.
  --cache_text_encoder_outputs
                        cache text encoder outputs
  --cache_text_encoder_outputs_to_disk
                        cache text encoder outputs to disk