# Module-level logger, namespaced to this module per the stdlib logging convention.
logger = logging.getLogger(__name__)
4242
4343
44- class MTDNNPretrainedModel (BertPreTrainedModel ):
44+ class MTDNNPretrainedModel (nn . Module ):
4545 config_class = MTDNNConfig
4646 pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
4747 load_tf_weights = lambda model , config , path : None
4848 base_model_prefix = "mtdnn"
4949
5050 def __init__ (self , config ):
51- super (MTDNNPretrainedModel , self ).__init__ (config )
51+ super (MTDNNPretrainedModel , self ).__init__ ()
5252 if not isinstance (config , PretrainedConfig ):
5353 raise ValueError (
5454 "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
@@ -61,43 +61,69 @@ def __init__(self, config):
6161 self .config = config
6262
6363
class MTDNNModel(MTDNNPretrainedModel):
    """Multi-task DNN model: an encoder (pretrained MT-DNN, BERT, or RoBERTa)
    feeding a SAN answer network, with optimizer state wired up for training.

    NOTE(review): this view is a diff chunk; __init__ may continue past the
    visible lines — confirm against the full file.
    """

    def __init__(
        self,
        config: MTDNNConfig,
        pretrained_model_name: str = "mtdnn-base-uncased",
        num_train_step: int = -1,
    ):
        # NOTE(review): `assert` is stripped under `python -O`; a ValueError
        # would validate init_checkpoint unconditionally — consider changing.
        assert (
            config.init_checkpoint in self.supported_init_checkpoints()
        ), f"Initial checkpoint must be in {self.supported_init_checkpoints()}"
        super(MTDNNModel, self).__init__(config)
        self.config = config

        # Setup the baseline network
        # - Define the encoder based on config options
        # - Set state dictionary based on configuration setting
        # - Download pretrained model if flag is set
        # TODO - Use Model.pretrained_model() after configuration file is hosted.
        if self.config.use_pretrained_model:
            # Fetch the full MT-DNN checkpoint and take its saved state dict.
            with download_path() as file_path:
                path = pathlib.Path(file_path)
                self.local_model_path = maybe_download(
                    url=self.pretrained_model_archive_map[pretrained_model_name]
                )
            self.mtdnn_model = MTDNNCommonUtils.load_pytorch_model(self.local_model_path)
            self.state_dict = self.mtdnn_model["state"]
        else:
            # Set the config base on encoder type set for initial checkpoint
            if config.encoder_type == EncoderModelType.BERT:
                self.bert_config = BertConfig.from_dict(self.config.to_dict())
                self.bert_model = BertModel(self.bert_config)
                self.state_dict = self.bert_model.state_dict()
            if config.encoder_type == EncoderModelType.ROBERTA:
                # Download and extract from PyTorch hub if not downloaded before
                self.bert_model = torch.hub.load("pytorch/fairseq", config.init_checkpoint)
                # fairseq exposes embedding width via args, not a hidden_size field.
                self.config.hidden_size = self.bert_model.args.encoder_embed_dim
                self.pooler = LinearPooler(self.config.hidden_size)
                # Re-key fairseq parameters under a "bert." prefix so they match
                # the names SANNetwork expects when loading the state dict.
                new_state_dict = {}
                for key, val in self.bert_model.state_dict().items():
                    if key.startswith("model.decoder.sentence_encoder") or key.startswith(
                        "model.classification_heads"
                    ):
                        key = f"bert.{key}"
                        new_state_dict[key] = val
                    # backward compatibility PyTorch <= 1.0.0
                    if key.startswith("classification_heads"):
                        key = f"bert.model.{key}"
                        new_state_dict[key] = val
                self.state_dict = new_state_dict

        # Resume the update counter when the loaded state carries one.
        self.updates = (
            self.state_dict["updates"] if self.state_dict and "updates" in self.state_dict else 0
        )
        self.local_updates = 0
        self.train_loss = AverageMeter()
        # NOTE(review): self.pooler is assigned only in the ROBERTA branch above;
        # the pretrained-model and BERT paths reach this line without it and
        # would raise AttributeError — confirm and initialize a default.
        self.network = SANNetwork(self.config, self.pooler)
        if self.state_dict:
            # strict=False: encoder-only state dicts may lack task-head weights.
            self.network.load_state_dict(self.state_dict, strict=False)
        self.mnetwork = nn.DataParallel(self.network) if self.config.multi_gpu_on else self.network
        self.total_param = sum([p.nelement() for p in self.network.parameters() if p.requires_grad])

        # Move network to GPU if device available and flag set
        if self.config.cuda:
            self.network.cuda()
        self.optimizer_parameters = self._get_param_groups()
        self._setup_optim(self.optimizer_parameters, self.state_dict, num_train_step)
@@ -383,3 +409,16 @@ def load(self, checkpoint):
383409
def cuda(self):
    """Move the wrapped SAN network onto the GPU in place."""
    net = self.network
    net.cuda()
412+
def supported_init_checkpoints(self):
    """Return the list of checkpoint names accepted as `init_checkpoint`."""
    bert_family = ["bert-base-uncased", "bert-base-cased", "bert-large-uncased"]
    mtdnn_family = ["mtdnn-base-uncased", "mtdnn-large-uncased"]
    roberta_family = ["roberta.base", "roberta.large"]
    return bert_family + mtdnn_family + roberta_family
0 commit comments