Skip to content

Commit 59ef5a5

Browse files
committed
add interp training
1 parent 26e665c commit 59ef5a5

File tree

4 files changed

+225
-7
lines changed

4 files changed

+225
-7
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,8 @@ From CUHK and Tencent AI Lab.
277277

278278

279279
## 📝 Changelog
280-
- __[2024.05.24]__: 🔥🔥 Release WebVid10M-motion annotations.
280+
- __[2024.06.14]__: 🔥🔥 Release training code for interpolation.
281+
- __[2024.05.24]__: Release WebVid10M-motion annotations.
281282
- __[2024.05.05]__: Release training code.
282283
- __[2024.03.14]__: Release generative frame interpolation and looping video models (320x512).
283284
- __[2024.02.05]__: Release high-resolution models (320x512 & 576x1024).
@@ -361,6 +362,13 @@ We adopt `DDPShardedStrategy` by default for training, please make sure it is av
361362
```
362363
5. All the checkpoints, tensorboard records, and log info will be saved in `<YOUR_SAVE_ROOT_DIR>`.
363364

365+
### Generative Frame Interpolation
366+
Download the pretrained model DynamiCrafter512_interp and put `model.ckpt` at `checkpoints/dynamicrafter_512_interp_v1/model.ckpt`. Follow the same fine-tuning procedure as in "Image-to-Video Generation", and run the script below:
367+
```bash
368+
sh configs/training_512_v1.0/run_interp.sh
369+
```
370+
371+
364372
## 🎁 WebVid-10M-motion annotations (~2.6M)
365373
The annotations of our WebVid-10M-motion are available on [Huggingface Dataset](https://huggingface.co/datasets/Doubiiu/webvid10m_motion). In addition to the original annotations, we add three more motion-related annotations: `dynamic_confidence`, `dynamic_wording`, and `dynamic_source_category`. Please refer to our [supplementary document](https://arxiv.org/pdf/2310.12190) (Section D) for more details.
366374

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
model:
2+
pretrained_checkpoint: checkpoints/dynamicrafter_512_interp_v1/model.ckpt
3+
base_learning_rate: 1.0e-05
4+
scale_lr: False
5+
target: lvdm.models.ddpm3d.LatentVisualDiffusion
6+
params:
7+
rescale_betas_zero_snr: True
8+
parameterization: "v"
9+
linear_start: 0.00085
10+
linear_end: 0.012
11+
num_timesteps_cond: 1
12+
log_every_t: 200
13+
timesteps: 1000
14+
first_stage_key: video
15+
cond_stage_key: caption
16+
cond_stage_trainable: False
17+
image_proj_model_trainable: True
18+
conditioning_key: hybrid
19+
image_size: [40, 64]
20+
channels: 4
21+
scale_by_std: False
22+
scale_factor: 0.18215
23+
use_ema: False
24+
uncond_prob: 0.05
25+
uncond_type: 'empty_seq'
26+
rand_cond_frame: false
27+
use_dynamic_rescale: true
28+
base_scale: 0.7
29+
fps_condition_type: 'fps'
30+
perframe_ae: true
31+
interp_mode: true
32+
33+
unet_config:
34+
target: lvdm.modules.networks.openaimodel3d.UNetModel
35+
params:
36+
in_channels: 8
37+
out_channels: 4
38+
model_channels: 320
39+
attention_resolutions:
40+
- 4
41+
- 2
42+
- 1
43+
num_res_blocks: 2
44+
channel_mult:
45+
- 1
46+
- 2
47+
- 4
48+
- 4
49+
dropout: 0.1
50+
num_head_channels: 64
51+
transformer_depth: 1
52+
context_dim: 1024
53+
use_linear: true
54+
use_checkpoint: True
55+
temporal_conv: True
56+
temporal_attention: True
57+
temporal_selfatt_only: true
58+
use_relative_position: false
59+
use_causal_attention: False
60+
temporal_length: 16
61+
addition_attention: true
62+
image_cross_attention: true
63+
default_fs: 10
64+
fs_condition: true
65+
66+
first_stage_config:
67+
target: lvdm.models.autoencoder.AutoencoderKL
68+
params:
69+
embed_dim: 4
70+
monitor: val/rec_loss
71+
ddconfig:
72+
double_z: True
73+
z_channels: 4
74+
resolution: 256
75+
in_channels: 3
76+
out_ch: 3
77+
ch: 128
78+
ch_mult:
79+
- 1
80+
- 2
81+
- 4
82+
- 4
83+
num_res_blocks: 2
84+
attn_resolutions: []
85+
dropout: 0.0
86+
lossconfig:
87+
target: torch.nn.Identity
88+
89+
cond_stage_config:
90+
target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
91+
params:
92+
freeze: true
93+
layer: "penultimate"
94+
95+
img_cond_stage_config:
96+
target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
97+
params:
98+
freeze: true
99+
100+
image_proj_stage_config:
101+
target: lvdm.modules.encoders.resampler.Resampler
102+
params:
103+
dim: 1024
104+
depth: 4
105+
dim_head: 64
106+
heads: 12
107+
num_queries: 16
108+
embedding_dim: 1280
109+
output_dim: 1024
110+
ff_mult: 4
111+
video_length: 16
112+
113+
data:
114+
target: utils_data.DataModuleFromConfig
115+
params:
116+
batch_size: 2
117+
num_workers: 12
118+
wrap: false
119+
train:
120+
target: lvdm.data.webvid.WebVid
121+
params:
122+
data_dir: <WebVid10M DATA>
123+
meta_path: <.csv FILE>
124+
video_length: 16
125+
frame_stride: 6
126+
load_raw_resolution: true
127+
resolution: [320, 512]
128+
spatial_transform: resize_center_crop
129+
random_fs: true ## if true, we uniformly sample fs with max_fs=frame_stride (above)
130+
131+
lightning:
132+
precision: 16
133+
# strategy: deepspeed_stage_2
134+
trainer:
135+
benchmark: True
136+
accumulate_grad_batches: 2
137+
max_steps: 100000
138+
# logger
139+
log_every_n_steps: 50
140+
# val
141+
val_check_interval: 0.5
142+
gradient_clip_algorithm: 'norm'
143+
gradient_clip_val: 0.5
144+
callbacks:
145+
model_checkpoint:
146+
target: pytorch_lightning.callbacks.ModelCheckpoint
147+
params:
148+
every_n_train_steps: 9000 #1000
149+
filename: "{epoch}-{step}"
150+
save_weights_only: True
151+
metrics_over_trainsteps_checkpoint:
152+
target: pytorch_lightning.callbacks.ModelCheckpoint
153+
params:
154+
filename: '{epoch}-{step}'
155+
save_weights_only: True
156+
every_n_train_steps: 10000 #20000 # 3s/step*2w=
157+
batch_logger:
158+
target: callbacks.ImageLogger
159+
params:
160+
batch_frequency: 500
161+
to_local: False
162+
max_images: 8
163+
log_images_kwargs:
164+
ddim_steps: 50
165+
unconditional_guidance_scale: 7.5
166+
timestep_spacing: uniform_trailing
167+
guidance_rescale: 0.7
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# NCCL configuration
2+
# export NCCL_DEBUG=INFO
3+
# export NCCL_IB_DISABLE=0
4+
# export NCCL_IB_GID_INDEX=3
5+
# export NCCL_NET_GDR_LEVEL=3
6+
# export NCCL_TOPO_FILE=/tmp/topo.txt
7+
8+
# args
9+
name="training_512_v1.0"
10+
config_file=configs/${name}/config_interp.yaml
11+
12+
# save root dir for logs, checkpoints, tensorboard record, etc.
13+
save_root="<YOUR_SAVE_ROOT_DIR>"
14+
15+
mkdir -p $save_root/${name}_interp
16+
17+
## run
18+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch \
19+
--nproc_per_node=$HOST_GPU_NUM --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
20+
./main/trainer.py \
21+
--base $config_file \
22+
--train \
23+
--name ${name}_interp \
24+
--logdir $save_root \
25+
--devices $HOST_GPU_NUM \
26+
lightning.trainer.num_nodes=1
27+
28+
## debugging
29+
# CUDA_VISIBLE_DEVICES=2,3,4,5,6,7 python3 -m torch.distributed.launch \
30+
# --nproc_per_node=6 --nnodes=1 --master_addr=127.0.0.1 --master_port=12352 --node_rank=0 \
31+
# ./main/trainer.py \
32+
# --base $config_file \
33+
# --train \
34+
# --name ${name}_interp \
35+
# --logdir $save_root \
36+
# --devices 6 \
37+
# lightning.trainer.num_nodes=1

lvdm/models/ddpm3d.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ def __init__(self,
481481
use_dynamic_rescale=False,
482482
base_scale=0.7,
483483
turning_step=400,
484-
loop_video=False,
484+
interp_mode=False,
485485
fps_condition_type='fs',
486486
perframe_ae=False,
487487
# added
@@ -502,7 +502,7 @@ def __init__(self,
502502
self.cond_stage_key = cond_stage_key
503503
self.noise_strength = noise_strength
504504
self.use_dynamic_rescale = use_dynamic_rescale
505-
self.loop_video = loop_video
505+
self.interp_mode = interp_mode
506506
self.fps_condition_type = fps_condition_type
507507
self.perframe_ae = perframe_ae
508508

@@ -1093,10 +1093,16 @@ def get_batch_input(self, batch, random_uncond, return_first_stage_outputs=False
10931093
img_emb = self.image_proj_model(img_emb)
10941094

10951095
if self.model.conditioning_key == 'hybrid':
1096-
## simply repeat the cond_frame to match the seq_len of z
1097-
img_cat_cond = z[:,:,cond_frame_index,:,:]
1098-
img_cat_cond = img_cat_cond.unsqueeze(2)
1099-
img_cat_cond = repeat(img_cat_cond, 'b c t h w -> b c (repeat t) h w', repeat=z.shape[2])
1096+
if self.interp_mode:
1097+
## starting frame + (L-2 empty frames) + ending frame
1098+
img_cat_cond = torch.zeros_like(z)
1099+
img_cat_cond[:,:,0,:,:] = z[:,:,0,:,:]
1100+
img_cat_cond[:,:,-1,:,:] = z[:,:,-1,:,:]
1101+
else:
1102+
## simply repeat the cond_frame to match the seq_len of z
1103+
img_cat_cond = z[:,:,cond_frame_index,:,:]
1104+
img_cat_cond = img_cat_cond.unsqueeze(2)
1105+
img_cat_cond = repeat(img_cat_cond, 'b c t h w -> b c (repeat t) h w', repeat=z.shape[2])
11001106

11011107
cond["c_concat"] = [img_cat_cond] # b c t h w
11021108
cond["c_crossattn"] = [torch.cat([prompt_imb, img_emb], dim=1)] ## concat in the seq_len dim

0 commit comments

Comments
 (0)