
Commit c9d5698

feat: hugging face publish (#33)
* feat: hugging face publish
* edit docs

Co-authored-by: ControlNet <smczx@hotmail.com>
1 parent 398060f commit c9d5698

File tree: 12 files changed, +736 −1 lines changed

README.md

Lines changed: 28 additions & 1 deletion
@@ -21,7 +21,7 @@
   <a href="https://arxiv.org/abs/2211.06627">
     <img src="https://img.shields.io/badge/arXiv-2211.06627-b31b1b.svg?style=flat-square">
   </a>
-  <a href="https://huggingface.co/ControlNet/MARLIN">
+  <a href="https://huggingface.co/collections/ControlNet/marlin-67e79296284080c98d95e3d9">
     <img src="https://img.shields.io/badge/huggingface-model-FFD21E?style=flat-square&logo=huggingface">
   </a>
 </div>
@@ -50,6 +50,7 @@ This repo is the official PyTorch implementation for the paper

 The repository contains 2 parts:
 - `marlin-pytorch`: The PyPI package for MARLIN used for inference.
+- The HuggingFace wrapper for MARLIN used for inference.
 - The implementation for the paper including training and evaluation scripts.

 ```
@@ -70,6 +71,9 @@ The repository contains 2 parts:
 ├── init.py
 ├── version.txt

+# below is for the huggingface wrapper
+├── hf_src
+
 # below is for the paper implementation
 ├── configs        # Configs for experiments settings
 ├── model          # Marlin models
@@ -150,6 +154,29 @@ features = model.extract_features(x)  # torch.Size([B, k, 768])
 features = model.extract_features(x, keep_seq=False)  # torch.Size([B, 768])
 ```

+## Use `transformers` (HuggingFace) for Feature Extraction
+
+Requirements:
+- Python
+- PyTorch
+- transformers
+- einops
+
+Currently, the HuggingFace model only supports direct feature extraction, without any video pre-processing (e.g. face detection, cropping, or strided windowing).
+
+```python
+import torch
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained(
+    "ControlNet/marlin_vit_base_ytf",  # or other variants
+    trust_remote_code=True
+)
+tensor = torch.rand([1, 3, 16, 224, 224])  # (B, C, T, H, W)
+output = model(tensor)  # torch.Size([1, 1568, 768])
+```
+
 ## Paper Implementation

 ### Requirements
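
A usage note on the new README snippet: the wrapper consumes a raw (B, C, T, H, W) float tensor, so face cropping and frame sampling must happen upstream. Below is a minimal sketch of building that tensor from an already face-cropped clip; the `clip.mp4` path, the uniform 16-frame sampling, and the [0, 1] scaling are illustrative assumptions, not part of this commit.

```python
import torch
import torch.nn.functional as F
from torchvision.io import read_video

# Decode an already face-cropped clip; read_video returns (T, H, W, C) uint8.
frames, _, _ = read_video("clip.mp4", pts_unit="sec")

# Uniformly sample 16 frames across the clip (n_frames=16 in all configs).
idx = torch.linspace(0, frames.shape[0] - 1, steps=16).long()
clip = frames[idx].permute(0, 3, 1, 2).float() / 255.0  # (16, 3, H, W), [0, 1]

# Resize to 224x224, then reorder to the (B, C, T, H, W) layout the model expects.
clip = F.interpolate(clip, size=(224, 224), mode="bilinear", align_corners=False)
tensor = clip.permute(1, 0, 2, 3).unsqueeze(0)  # (1, 3, 16, 224, 224)
```

The [0, 1] scaling mirrors the `torch.rand` example above; check the model card for the exact normalization the published checkpoints expect.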

hf_src/marlin_configs/vit_base.py

Lines changed: 22 additions & 0 deletions
```python
from marlin_huggingface import MarlinConfig


vit_base_config = MarlinConfig(
    img_size=224,
    patch_size=16,
    n_frames=16,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    norm_layer="LayerNorm",
    init_values=0.0,
    tubelet_size=2,
    encoder_embed_dim=768,
    encoder_depth=12,
    encoder_num_heads=12,
    decoder_embed_dim=384,
    decoder_depth=4,
    decoder_num_heads=6,
)
```

hf_src/marlin_configs/vit_large.py

Lines changed: 22 additions & 0 deletions
```python
from marlin_huggingface import MarlinConfig


vit_large_config = MarlinConfig(
    img_size=224,
    patch_size=16,
    n_frames=16,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    norm_layer="LayerNorm",
    init_values=0.0,
    tubelet_size=2,
    encoder_embed_dim=1024,
    encoder_depth=24,
    encoder_num_heads=16,
    decoder_embed_dim=512,
    decoder_depth=12,
    decoder_num_heads=8,
)
```

hf_src/marlin_configs/vit_small.py

Lines changed: 21 additions & 0 deletions
```python
from marlin_huggingface import MarlinConfig


vit_small_config = MarlinConfig(
    img_size=224,
    patch_size=16,
    n_frames=16,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    norm_layer="LayerNorm",
    init_values=0.0,
    tubelet_size=2,
    encoder_embed_dim=384,
    encoder_depth=12,
    encoder_num_heads=6,
    decoder_embed_dim=192,
    decoder_depth=4,
    decoder_num_heads=3,
)
```
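
All three configs share the same patch geometry and differ only in encoder/decoder width and depth. As a sanity check that a config materializes a model, here is a minimal sketch, assuming `MarlinModel` follows the standard `transformers` convention of being constructed from its config (the convention `AutoModel.register` in the package `__init__.py` below relies on) and that its forward returns the raw feature tensor as in the README example:

```python
import torch
from marlin_huggingface import MarlinModel
from marlin_configs.vit_base import vit_base_config

# Randomly initialised ViT-Base wrapper; real weights would come from
# AutoModel.from_pretrained("ControlNet/marlin_vit_base_ytf", ...) instead.
model = MarlinModel(vit_base_config)
output = model(torch.rand(1, 3, 16, 224, 224))  # expected (1, 1568, 768)
```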
hf_src/marlin_huggingface/__init__.py

Lines changed: 11 additions & 0 deletions

```python
from transformers import AutoModel, AutoConfig

from .config import MarlinConfig
from .marlin import Marlin, MarlinModel

MarlinConfig.register_for_auto_class()
MarlinModel.register_for_auto_class()
AutoConfig.register("marlin", MarlinConfig)
AutoModel.register(MarlinConfig, MarlinModel)

__all__ = ["Marlin", "MarlinModel", "MarlinConfig"]
```
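
These registration calls are what make the `Auto*` factories work: `register_for_auto_class` stamps an `auto_map` entry into the exported `config.json` so Hub checkpoints load with `trust_remote_code=True`, while `AutoConfig.register`/`AutoModel.register` wire the `"marlin"` model type into the local `transformers` installation. A quick local check (a sketch; it only exercises the registration, not any weights):

```python
import marlin_huggingface  # importing the package runs the registrations above
from transformers import AutoConfig

# AutoConfig can now resolve the "marlin" model type without remote code.
cfg = AutoConfig.for_model("marlin")
print(type(cfg).__name__, cfg.model_type)  # MarlinConfig marlin
```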
hf_src/marlin_huggingface/config.py

Lines changed: 27 additions & 0 deletions

```python
from transformers import PretrainedConfig


class MarlinConfig(PretrainedConfig):
    model_type = "marlin"

    def __init__(self, **kwargs):
        self.img_size = kwargs.pop("img_size", None)
        self.patch_size = kwargs.pop("patch_size", None)
        self.n_frames = kwargs.pop("n_frames", None)
        self.encoder_embed_dim = kwargs.pop("encoder_embed_dim", None)
        self.encoder_depth = kwargs.pop("encoder_depth", None)
        self.encoder_num_heads = kwargs.pop("encoder_num_heads", None)
        self.decoder_embed_dim = kwargs.pop("decoder_embed_dim", None)
        self.decoder_depth = kwargs.pop("decoder_depth", None)
        self.decoder_num_heads = kwargs.pop("decoder_num_heads", None)
        self.mlp_ratio = kwargs.pop("mlp_ratio", None)
        self.qkv_bias = kwargs.pop("qkv_bias", None)
        self.qk_scale = kwargs.pop("qk_scale", None)
        self.drop_rate = kwargs.pop("drop_rate", None)
        self.attn_drop_rate = kwargs.pop("attn_drop_rate", None)
        self.norm_layer = kwargs.pop("norm_layer", None)
        self.init_values = kwargs.pop("init_values", None)
        self.tubelet_size = kwargs.pop("tubelet_size", None)
        self.as_feature_extractor = kwargs.pop("as_feature_extractor", True)

        super().__init__(**kwargs)
```
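
Because every hyper-parameter is popped from `**kwargs` with a `None` default, a `MarlinConfig` round-trips through the standard `PretrainedConfig` serialization machinery. A minimal sketch (the `./marlin_vit_base` directory name is illustrative):

```python
from marlin_huggingface import MarlinConfig

cfg = MarlinConfig(img_size=224, patch_size=16, n_frames=16, tubelet_size=2,
                   encoder_embed_dim=768, encoder_depth=12, encoder_num_heads=12)

# Standard PretrainedConfig round-trip: writes and re-reads config.json.
cfg.save_pretrained("./marlin_vit_base")
restored = MarlinConfig.from_pretrained("./marlin_vit_base")
assert restored.encoder_embed_dim == 768
```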
hf_src/marlin_huggingface/decoder.py

Lines changed: 87 additions & 0 deletions

```python
import torch
from einops import rearrange
from torch import nn, Tensor
from torch.nn import LayerNorm, Linear, ModuleList

from .modules import Block, no_grad_trunc_normal_
from .positional_embedding import SinCosPositionalEmbedding


class MarlinDecoder(nn.Module):

    def __init__(self, img_size=224, patch_size=16, n_frames=16, embed_dim=384, depth=8,
        num_heads=6, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
        norm_layer="LayerNorm", init_values=1., tubelet_size=2
    ):
        super().__init__()
        output_dim = 3 * tubelet_size * patch_size * patch_size
        self.patch_size = patch_size
        self.tubelet_size = tubelet_size
        self.n_patch_h = img_size // patch_size
        self.n_patch_w = img_size // patch_size
        self.embed_dim = embed_dim
        if norm_layer == "LayerNorm":
            self.norm_layer = LayerNorm
            self.norm = self.norm_layer(embed_dim)
        else:
            raise NotImplementedError("Only LayerNorm is supported")

        # sine-cosine positional embeddings
        self.pos_embedding = SinCosPositionalEmbedding(
            (self.n_patch_h * self.n_patch_w * (n_frames // tubelet_size), embed_dim), dropout_rate=0.)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        self.blocks = ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=self.norm_layer,
                init_values=init_values
            ) for _ in range(depth)])

        self.head = Linear(embed_dim, output_dim)
        self.apply(self._init_weights)
        no_grad_trunc_normal_(self.mask_token, mean=0., std=0.02, a=-0.02, b=0.02)

    @staticmethod
    def _init_weights(m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def unpatch_to_img(self, x: Tensor) -> Tensor:
        # x: (B, N_tokens, C * prod of cube size)
        x = rearrange(x, "b n (c p) -> b n p c", c=3)
        # x: (B, N_tokens, prod of cube size, C)
        x = rearrange(x, "b (t h w) (p0 p1 p2) c -> b c (t p0) (h p1) (w p2)", p0=self.tubelet_size,
            p1=self.patch_size, p2=self.patch_size, h=self.n_patch_h, w=self.n_patch_w)
        # x: (B, C, T, H, W)
        return x

    def forward_features(self, x, return_token_num=0):
        for block in self.blocks:
            x = block(x)

        if return_token_num > 0:
            x = x[:, -return_token_num:]

        x = self.norm(x)
        x = self.head(x)
        # x: (B, N_mask, C)
        return x

    def forward(self, x, mask):
        # mask: 0 -> masked, 1 -> visible
        b, n, c = x.shape
        expand_pos_embed = self.pos_embedding.emb.data.expand(b, -1, -1)
        pos_emb_vis = expand_pos_embed[mask].view(b, -1, c)
        pos_emb_mask = expand_pos_embed[~mask].view(b, -1, c)
        x = torch.cat([x + pos_emb_vis, self.mask_token + pos_emb_mask], dim=1)

        mask_num = pos_emb_mask.shape[1]

        x = self.forward_features(x, return_token_num=mask_num)
        return x
```
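
For the default ViT-Base geometry (224×224 input, 16×16 patches, 16 frames, tubelet size 2) the decoder operates on 14 × 14 × 8 = 1568 tokens, and `head` projects each reconstructed token to 3 × 2 × 16 × 16 = 1536 pixel values. A shape walk-through of `forward` (a sketch; the `marlin_huggingface.decoder` module path and the half-and-half mask are assumptions for illustration):

```python
import torch
from marlin_huggingface.decoder import MarlinDecoder  # module path assumed

dec = MarlinDecoder(img_size=224, patch_size=16, n_frames=16, embed_dim=384)
b, n = 2, 14 * 14 * 8                      # 1568 tokens per 16-frame clip
mask = torch.zeros(b, n, dtype=torch.bool)
mask[:, : n // 2] = True                   # True -> visible, False -> masked
visible_tokens = torch.rand(b, n // 2, 384)

out = dec(visible_tokens, mask)            # reconstructs masked tokens only
print(out.shape)                           # (2, 784, 1536)
```

`unpatch_to_img` would then fold those 1536-value tokens back into a (B, 3, T, H, W) pixel volume.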
hf_src/marlin_huggingface/encoder.py

Lines changed: 78 additions & 0 deletions

```python
from torch import nn, Tensor
from torch.nn import ModuleList, LayerNorm

from .modules import PatchEmbedding3d, Block
from .positional_embedding import SinCosPositionalEmbedding


class MarlinEncoder(nn.Module):

    def __init__(self, img_size=224, patch_size=16, n_frames=16, embed_dim=768, depth=12,
        num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
        norm_layer="LayerNorm", init_values=0., tubelet_size=2
    ):
        super().__init__()

        self.embed_dim = embed_dim
        self.patch_embedding = PatchEmbedding3d(
            input_size=(3, n_frames, img_size, img_size),
            patch_size=(tubelet_size, patch_size, patch_size),
            embedding=embed_dim
        )
        num_patches = (img_size // patch_size) * (img_size // patch_size) * (n_frames // tubelet_size)

        # sine-cosine positional embeddings
        self.pos_embedding = SinCosPositionalEmbedding((num_patches, embed_dim), dropout_rate=0.)

        if norm_layer == "LayerNorm":
            self.norm_layer = LayerNorm
            self.norm = self.norm_layer(embed_dim)
        else:
            raise NotImplementedError("Only LayerNorm is supported")

        self.blocks = ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=self.norm_layer,
                init_values=init_values)
            for _ in range(depth)
        ])

        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward_features(self, x):
        for block in self.blocks:
            x = block(x)
        x = self.norm(x)
        return x

    def forward(self, x: Tensor, mask: Tensor) -> Tensor:
        # mask: (B, N) boolean token mask over the flattened patch sequence,
        # 0 -> masked, 1 -> visible
        assert len(x.shape) == 5, "x must be 5D"
        emb = self.patch_embedding(x)
        emb = self.pos_embedding(emb)
        b, _, c = emb.shape
        emb = emb[mask].view(b, -1, c)  # only visible patches are used
        emb = self.forward_features(emb)
        return emb

    def extract_features(self, x: Tensor, seq_mean_pool: bool) -> Tensor:
        x = self.patch_embedding(x)
        x = self.pos_embedding(x)
        for block in self.blocks:
            x = block(x)

        if seq_mean_pool:
            x = x.mean(dim=1)
        x = self.norm(x)
        return x
```
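
`forward` implements the pre-training path (only visible tokens enter the transformer blocks), while `extract_features` embeds the full 1568-token sequence and optionally mean-pools it. A minimal sketch of both calls, assuming the `marlin_huggingface.encoder` module path; note that boolean-mask indexing followed by `.view(b, -1, c)` requires the same visible-token count in every sample:

```python
import torch
from marlin_huggingface.encoder import MarlinEncoder  # module path assumed

enc = MarlinEncoder(img_size=224, patch_size=16, n_frames=16, embed_dim=768)
video = torch.rand(2, 3, 16, 224, 224)  # (B, C, T, H, W)

# Inference path: features for all 1568 tokens, mean-pooled to one vector.
pooled = enc.extract_features(video, seq_mean_pool=True)  # (2, 768)

# Pre-training path: encode only ~10% visible tokens (same count per sample).
n = 14 * 14 * 8
mask = torch.zeros(2, n, dtype=torch.bool)
mask[:, torch.randperm(n)[: n // 10]] = True  # True -> visible
visible = enc(video, mask)  # (2, 156, 768)
```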
