"""Default configuration for VideoComposer.

Importing this module builds the module-level ``cfg`` (an ``EasyDict``)
holding dataset, dataloader, diffusion, model, checkpoint and logging
settings. As a side effect of the import, the ``WORLD_SIZE`` environment
variable is read and ``torch.cuda.device_count()`` is queried.
"""
import logging
import os
import os.path as osp
from datetime import datetime

import torch
from easydict import EasyDict

cfg = EasyDict(__name__='Config: VideoComposer')

# World size is derived as WORLD_SIZE (default 1) times the number of CUDA
# devices visible on this machine.
# NOTE(review): this treats WORLD_SIZE as a machine count; with launchers
# that export WORLD_SIZE as the total process count the product over-counts
# -- confirm against the launch script.
pmi_world_size = int(os.getenv('WORLD_SIZE', 1))
gpus_per_machine = torch.cuda.device_count()
world_size = pmi_world_size * gpus_per_machine

# Names of the conditions listed for this configuration.
cfg.video_compositions = [
    'text', 'mask', 'depthmap', 'sketch', 'motion', 'image', 'local_image',
    'single_sketch'
]

# dataset
cfg.root_dir = 'webvid10m/'
cfg.alpha = 0.7
cfg.misc_size = 384
cfg.depth_std = 20.0
cfg.depth_clamp = 10.0
cfg.hist_sigma = 10.0
cfg.use_image_dataset = False
cfg.alpha_img = 0.7
cfg.resolution = 256
cfg.mean = [0.5, 0.5, 0.5]
cfg.std = [0.5, 0.5, 0.5]

# sketch
cfg.sketch_mean = [0.485, 0.456, 0.406]
cfg.sketch_std = [0.229, 0.224, 0.225]

# dataloader
cfg.max_words = 1000
cfg.frame_lens = [
    16,
    16,
    16,
    16,
]
cfg.feature_framerates = [
    4,
]
cfg.feature_framerate = 4
# Keys are the string forms of 1, 4, 8 and 16; every entry maps to 1.
# NOTE(review): presumably keyed by frame count -- confirm with the loader.
cfg.batch_sizes = {
    str(1): 1,
    str(4): 1,
    str(8): 1,
    str(16): 1,
}
cfg.chunk_size = 64
cfg.num_workers = 8
cfg.prefetch_factor = 2
cfg.seed = 8888

# diffusion
cfg.num_timesteps = 1000
cfg.mean_type = 'eps'
cfg.var_type = 'fixed_small'
cfg.loss_type = 'mse'
cfg.ddim_timesteps = 50
cfg.ddim_eta = 0.0
cfg.clamp = 1.0
cfg.share_noise = False
cfg.use_div_loss = False

# classifier-free guidance
cfg.p_zero = 0.9
cfg.guide_scale = 6.0

# stable diffusion
cfg.sd_checkpoint = 'v2-1_512-ema-pruned.ckpt'

# clip vision encoder
cfg.vit_image_size = 336
cfg.vit_patch_size = 14
cfg.vit_dim = 1024
cfg.vit_out_dim = 768
cfg.vit_heads = 16
cfg.vit_layers = 24
cfg.vit_mean = [0.48145466, 0.4578275, 0.40821073]
cfg.vit_std = [0.26862954, 0.26130258, 0.27577711]
cfg.clip_checkpoint = 'open_clip_pytorch_model.bin'
cfg.mvs_visual = False

# unet
cfg.unet_in_dim = 4
cfg.unet_concat_dim = 8
# Tied to the CLIP vision encoder output width set above.
cfg.unet_y_dim = cfg.vit_out_dim
cfg.unet_context_dim = 1024
# 8 when var_type starts with 'learned', otherwise 4; with the
# 'fixed_small' default above this evaluates to 4.
cfg.unet_out_dim = 8 if cfg.var_type.startswith('learned') else 4
cfg.unet_dim = 320
cfg.unet_dim_mult = [1, 2, 4, 4]
cfg.unet_res_blocks = 2
cfg.unet_num_heads = 8
cfg.unet_head_dim = 64
cfg.unet_attn_scales = [1 / 1, 1 / 2, 1 / 4]
cfg.unet_dropout = 0.1
cfg.misc_dropout = 0.5
cfg.p_all_zero = 0.1
cfg.p_all_keep = 0.1
cfg.temporal_conv = False
cfg.temporal_attn_times = 1
cfg.temporal_attention = True
cfg.use_fps_condition = False
cfg.use_sim_mask = False

# Default: load 2d pretrain
cfg.pretrained = False
cfg.fix_weight = False

# Default resume
cfg.resume = True
cfg.resume_step = 148000
cfg.resume_check_dir = '.'
# The path is built once, here; changing resume_step or resume_check_dir
# afterwards does not update resume_checkpoint.
cfg.resume_checkpoint = os.path.join(
    cfg.resume_check_dir,
    f'step_{cfg.resume_step}/non_ema_{cfg.resume_step}.pth')
# False by default, so the branch below is skipped; when set to True here
# it is replaced by the optimizer state path for resume_step.
cfg.resume_optimizer = False
if cfg.resume_optimizer:
    cfg.resume_optimizer = os.path.join(
        cfg.resume_check_dir, f'optimizer_step_{cfg.resume_step}.pt')

# acceleration
cfg.use_ema = True
# for debug, no ema
# world_size < 2 also covers a machine with no visible CUDA device, where
# the device count (and therefore the product) is 0.
if world_size < 2:
    cfg.use_ema = False
cfg.load_from = None
cfg.use_checkpoint = True
cfg.use_sharded_ddp = False
cfg.use_fsdp = False
cfg.use_fp16 = True

# training
cfg.ema_decay = 0.9999
cfg.viz_interval = 1000
cfg.save_ckp_interval = 1000

# logging
cfg.log_interval = 100
# Underscore-joined condition names; not referenced again in this section.
composition_strings = '_'.join(cfg.video_compositions)

# Default log_dir
cfg.log_dir = 'outputs/'