Skip to content

Commit 07a6f9a

Browse files
authored
feat: scan layers + gradient checkpointing (borisdayma#161)
* scan layers for faster compilation
* support gradient checkpointing
1 parent 0199604 commit 07a6f9a

File tree

5 files changed

+252
-64
lines changed

5 files changed

+252
-64
lines changed

src/dalle_mini/model/configuration.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,23 +51,24 @@ def __init__(
5151
activation_dropout=0.0,
5252
init_std=0.02,
5353
scale_embedding=False,
54-
gradient_checkpointing=False,
54+
gradient_checkpointing=True,
55+
use_scan=None,
5556
use_cache=True,
5657
is_encoder_decoder=True,
5758
forced_eos_token_id=None,
5859
tie_word_embeddings=False, # different modalities and sizes
5960
do_sample=True,
6061
# transformer variants
6162
use_bias=False, # use bias in attention and dense layers (except for lm_head)
62-
ln_type="layernorm", # layer normalization type, "rmsnorm", "layernorm"
63+
ln_type="rmsnorm", # layer normalization type, "rmsnorm", "layernorm"
6364
ln_positions="normformer", # layer normalization positions, "normformer", "swinv2", "cogview", "postln", "preln", "deepnet" (same as postln)
6465
use_head_scale=False, # used in NormFormer
6566
use_cosine_attention=False, # used in Swin v2
6667
tau_init=0.05, # used only in cosine attention (Swin v2)
6768
use_absolute_position_embeddings=True, # default
6869
use_swin_position_embeddings=False, # used in Swin v1/v2
6970
use_deepnet_scaling=False, # used in Deepnet
70-
use_glu=False, # "GLU Variants Improve Transformer"
71+
use_glu=True, # "GLU Variants Improve Transformer"
7172
use_alibi=False, # Not implemented yet - from "Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation"
7273
sinkhorn_iters=1, # used in SinkFormers
7374
use_final_ln_encoder=True, # final layer normalization in encoder
@@ -136,6 +137,11 @@ def __init__(
136137
self.init_std = init_std
137138
self.use_cache = use_cache
138139
self.gradient_checkpointing = gradient_checkpointing
140+
# all layers are the same in most configurations
141+
self.use_scan = use_scan if use_scan is not None else ln_positions != "swinv2"
142+
assert not (
143+
self.use_scan and ln_positions == "swinv2"
144+
), "scan cannot be used with 'swinv2'"
139145
self.scale_embedding = (
140146
scale_embedding # scale factor will be sqrt(d_model) if True
141147
)

src/dalle_mini/model/modeling.py

Lines changed: 158 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,9 @@ def __call__(
619619
deterministic: bool = True,
620620
) -> Tuple[jnp.ndarray]:
621621

622+
if self.config.use_scan:
623+
hidden_states = hidden_states[0]
624+
622625
res_gain = (
623626
deepnet_gain["encoder"]["alpha"](self.config)
624627
if self.config.use_deepnet_scaling
@@ -679,12 +682,8 @@ def __call__(
679682
)
680683
hidden_states = ff_block(hidden_states, deterministic=deterministic)
681684
hidden_states = residual * res_gain + hidden_states
682-
if self.add_norm or self.config.ln_positions in ["postln"]:
683-
use_scale = (
684-
self.use_scale
685-
or self.config.ln_positions == "postln"
686-
or self.config.force_ln_scale
687-
)
685+
if self.add_norm:
686+
use_scale = self.use_scale or self.config.force_ln_scale
688687
hidden_states = norm(
689688
self.config.ln_type,
690689
dtype=self.dtype,
@@ -697,6 +696,9 @@ def __call__(
697696
if output_attentions:
698697
outputs += (attn_weights,)
699698

699+
if self.config.use_scan:
700+
outputs = (outputs, None)
701+
700702
return outputs
701703

702704

@@ -710,7 +712,7 @@ class FlaxBartDecoderLayer(nn.Module):
710712
config: DalleBartConfig
711713
dtype: jnp.dtype = jnp.float32
712714
add_norm: bool = False
713-
use_scale: bool = False
715+
use_scale: bool = True
714716

715717
@nn.compact
716718
def __call__(
@@ -724,6 +726,9 @@ def __call__(
724726
deterministic: bool = True,
725727
) -> Tuple[jnp.ndarray]:
726728

729+
if self.config.use_scan:
730+
hidden_states = hidden_states[0]
731+
727732
res_gain = (
728733
deepnet_gain["decoder"]["alpha"](self.config)
729734
if self.config.use_deepnet_scaling
@@ -831,12 +836,8 @@ def __call__(
831836
)
832837
hidden_states = ff_block(hidden_states, deterministic=deterministic)
833838
hidden_states = residual * res_gain + hidden_states
834-
if self.add_norm or self.config.ln_positions in ["postln"]:
835-
use_scale = (
836-
self.use_scale
837-
or self.config.ln_positions == "postln"
838-
or self.config.force_ln_scale
839-
)
839+
if self.add_norm:
840+
use_scale = self.use_scale or self.config.force_ln_scale
840841
hidden_states = norm(
841842
self.config.ln_type,
842843
dtype=self.dtype,
@@ -849,6 +850,9 @@ def __call__(
849850
if output_attentions:
850851
outputs += (attn_weights, cross_attn_weights)
851852

853+
if self.config.use_scan:
854+
outputs = (outputs, None)
855+
852856
return outputs
853857

854858

@@ -876,35 +880,80 @@ def __call__(
876880

877881
n_layers = self.config.encoder_layers
878882
layer = (
879-
remat(FlaxBartEncoderLayer, static_argnums=(2, 3))
883+
remat(
884+
FlaxBartEncoderLayer,
885+
static_argnums=(2, 3),
886+
prevent_cse=not self.config.use_scan,
887+
)
880888
if self.config.gradient_checkpointing
881889
else FlaxBartEncoderLayer
882890
)
883-
for i in range(n_layers):
884-
if output_hidden_states:
885-
all_hidden_states += (hidden_states,)
886-
# final layernorm on the output of the last layer
887-
# or every 6 layers for Swin v2
888-
add_norm = (
889-
self.config.ln_positions == "swinv2" and ((i + 1) % 6 == 0)
890-
) or (self.config.use_final_ln_encoder and (i == n_layers - 1))
891-
# we don't need to scale the norm for the last layer
892-
use_scale = i != n_layers - 1
893-
layer_outputs = layer(
894-
self.config, dtype=self.dtype, add_norm=add_norm, use_scale=use_scale
891+
892+
if self.config.use_scan:
893+
# all blocks are the same so we use nn.scan
894+
assert not output_attentions, "cannot scan with output_attentions"
895+
assert not output_hidden_states, "cannot scan with output_hidden_states"
896+
hidden_states = (hidden_states,)
897+
# we use a scale on all norms (even last layer) to allow scanning
898+
hidden_states, _ = nn.scan(
899+
layer,
900+
variable_axes={"params": 0},
901+
split_rngs={"params": True, "dropout": True},
902+
in_axes=(nn.broadcast, nn.broadcast, nn.broadcast),
903+
length=n_layers,
904+
)(
905+
self.config,
906+
dtype=self.dtype,
907+
add_norm=self.config.ln_positions == "postln",
908+
name="FlaxBartEncoderLayers",
895909
)(
896910
hidden_states,
897911
attention_mask,
898912
output_attentions,
899913
deterministic,
900914
)
901-
hidden_states = layer_outputs[0]
902-
if output_attentions:
903-
all_self_attns += (layer_outputs[1],)
915+
hidden_states = hidden_states[0]
916+
else:
917+
for i in range(n_layers):
918+
if output_hidden_states:
919+
all_hidden_states += (hidden_states,)
920+
# final layernorm on the output of the last layer
921+
# or every 6 layers for Swin v2
922+
add_norm = self.config.ln_positions == "postln" or (
923+
self.config.ln_positions == "swinv2"
924+
and ((i + 1) % 6 == 0)
925+
and (i != n_layers - 1)
926+
)
927+
# we don't need to scale the norm for the last layer
928+
use_scale = i != n_layers - 1
929+
layer_outputs = layer(
930+
self.config,
931+
dtype=self.dtype,
932+
add_norm=add_norm,
933+
use_scale=use_scale,
934+
name=f"FlaxBartEncoderLayer_{i}",
935+
)(
936+
hidden_states,
937+
attention_mask,
938+
output_attentions,
939+
deterministic,
940+
)
941+
hidden_states = layer_outputs[0]
942+
if output_attentions:
943+
all_self_attns += (layer_outputs[1],)
904944

905-
# add hidden states from the last layer
906-
if output_hidden_states:
907-
all_hidden_states += (hidden_states,)
945+
# add hidden states from the last layer
946+
if output_hidden_states:
947+
all_hidden_states += (hidden_states,)
948+
949+
# postln is already applied in every layer
950+
if self.config.use_final_ln_encoder and self.config.ln_positions != "postln":
951+
hidden_states = norm(
952+
self.config.ln_type,
953+
dtype=self.dtype,
954+
epsilon=1e-05,
955+
use_scale=self.config.force_ln_scale,
956+
)(hidden_states)
908957

909958
outputs = [
910959
hidden_states,
@@ -953,22 +1002,39 @@ def __call__(
9531002

9541003
n_layers = self.config.decoder_layers
9551004
layer = (
956-
remat(FlaxBartDecoderLayer, static_argnums=(4, 5, 6))
1005+
remat(
1006+
FlaxBartDecoderLayer,
1007+
static_argnums=(4, 5, 6),
1008+
prevent_cse=not self.config.use_scan,
1009+
)
9571010
if self.config.gradient_checkpointing
9581011
else FlaxBartDecoderLayer
9591012
)
960-
for i in range(n_layers):
961-
if output_hidden_states:
962-
all_hidden_states += (hidden_states,)
963-
# final layernorm on the output of the last layer
964-
# or every 6 layers for Swin v2
965-
add_norm = (
966-
self.config.ln_positions == "swinv2" and ((i + 1) % 6 == 0)
967-
) or (self.config.use_final_ln_decoder and (i == n_layers - 1))
968-
# we don't need to scale the norm for the last layer
969-
use_scale = i != n_layers - 1
970-
layer_outputs = layer(
971-
self.config, dtype=self.dtype, add_norm=add_norm, use_scale=use_scale
1013+
1014+
if self.config.use_scan:
1015+
# all blocks are the same so we use nn.scan
1016+
assert not output_attentions, "cannot scan with output_attentions"
1017+
assert not output_hidden_states, "cannot scan with output_hidden_states"
1018+
hidden_states = (hidden_states,)
1019+
# we use a scale on all norms (even last layer) to allow scanning
1020+
hidden_states, _ = nn.scan(
1021+
layer,
1022+
variable_axes={"params": 0},
1023+
split_rngs={"params": True, "dropout": True},
1024+
in_axes=(
1025+
nn.broadcast,
1026+
nn.broadcast,
1027+
nn.broadcast,
1028+
nn.broadcast,
1029+
nn.broadcast,
1030+
nn.broadcast,
1031+
),
1032+
length=n_layers,
1033+
)(
1034+
self.config,
1035+
dtype=self.dtype,
1036+
add_norm=self.config.ln_positions == "postln",
1037+
name="FlaxBartEncoderLayers",
[NOTE(review): this is the decoder scan, yet the commit names the scanned module "FlaxBartEncoderLayers" — presumably a copy-paste from the encoder block; the partition rules in set_partitions match on "FlaxBartDecoderLayers", so this looks inconsistent — verify against the upstream repository before relying on this commit's partitioning of decoder params.]
9721038
)(
9731039
hidden_states,
9741040
attention_mask,
@@ -978,17 +1044,56 @@ def __call__(
9781044
output_attentions,
9791045
deterministic,
9801046
)
1047+
hidden_states = hidden_states[0]
9811048

982-
hidden_states = layer_outputs[0]
983-
if output_attentions:
984-
all_self_attns += (layer_outputs[1],)
1049+
else:
1050+
for i in range(n_layers):
1051+
if output_hidden_states:
1052+
all_hidden_states += (hidden_states,)
1053+
# final layernorm on the output of the last layer
1054+
# or every 6 layers for Swin v2
1055+
add_norm = self.config.ln_positions == "postln" or (
1056+
self.config.ln_positions == "swinv2"
1057+
and ((i + 1) % 6 == 0)
1058+
and (i != n_layers - 1)
1059+
)
1060+
# we don't need to scale the norm for the last layer
1061+
use_scale = i != n_layers - 1
1062+
layer_outputs = layer(
1063+
self.config,
1064+
dtype=self.dtype,
1065+
add_norm=add_norm,
1066+
use_scale=use_scale,
1067+
name=f"FlaxBartDecoderLayer_{i}",
1068+
)(
1069+
hidden_states,
1070+
attention_mask,
1071+
encoder_hidden_states,
1072+
encoder_attention_mask,
1073+
init_cache,
1074+
output_attentions,
1075+
deterministic,
1076+
)
1077+
1078+
hidden_states = layer_outputs[0]
1079+
if output_attentions:
1080+
all_self_attns += (layer_outputs[1],)
1081+
1082+
if encoder_hidden_states is not None:
1083+
all_cross_attentions += (layer_outputs[2],)
9851084

986-
if encoder_hidden_states is not None:
987-
all_cross_attentions += (layer_outputs[2],)
1085+
# add hidden states from the last decoder layer
1086+
if output_hidden_states:
1087+
all_hidden_states += (hidden_states,)
9881088

989-
# add hidden states from the last decoder layer
990-
if output_hidden_states:
991-
all_hidden_states += (hidden_states,)
1089+
# postln is already applied in every layer
1090+
if self.config.use_final_ln_decoder and self.config.ln_positions != "postln":
1091+
hidden_states = norm(
1092+
self.config.ln_type,
1093+
dtype=self.dtype,
1094+
epsilon=1e-05,
1095+
use_scale=self.config.force_ln_scale,
1096+
)(hidden_states)
9921097

9931098
outputs = [
9941099
hidden_states,

src/dalle_mini/model/partitions.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,22 @@ def _get_partition_rules():
5555
]
5656

5757

58-
def set_partitions(in_dict):
58+
def set_partitions(in_dict, use_scan):
5959
rules = _get_partition_rules()
6060
replace = _replacement_rules(rules)
6161
initd = {k: _unmatched for k in flatten_dict(in_dict)}
6262
result = {k: replace(k, v) for k, v in initd.items()}
6363
for k, v in result.items():
6464
if v == _unmatched:
6565
print(f"Unmatched -> {k}")
66+
l = list(result.keys())
67+
if use_scan:
68+
# add None dimension to scanned layers
69+
result = {
70+
k: (P(*(None,) + v) if v is not None else None)
71+
if any(x in k for x in ["FlaxBartEncoderLayers", "FlaxBartDecoderLayers"])
72+
else v
73+
for k, v in result.items()
74+
}
6675
assert _unmatched not in result.values(), "Incomplete partition spec."
6776
return freeze(unflatten_dict(result))

tools/train/config/mega/config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,14 @@
77
"decoder_attention_heads": 32,
88
"decoder_ffn_dim": 4096,
99
"decoder_layerdrop": 0.0,
10-
"decoder_layers": 25,
10+
"decoder_layers": 26,
1111
"decoder_start_token_id": 16384,
1212
"do_sample": true,
1313
"dropout": 0.0,
1414
"encoder_attention_heads": 32,
1515
"encoder_ffn_dim": 4096,
1616
"encoder_layerdrop": 0.0,
17-
"encoder_layers": 25,
17+
"encoder_layers": 26,
1818
"encoder_vocab_size": 50272,
1919
"eos_token_id": 16385,
2020
"force_ln_scale": false,

0 commit comments

Comments (0)