Add support for CausalML models (#282)

imfaruqi · web-flow · commit 1e257f98d57b · 2024-05-07T15:09:47.000+01:00
* Add new manager for CausalML models

* Update requirements-dev1.txt

* Add unit tests for CausalML model manager

* Add examples for Causal ML model manager

* Fix formatting

* Add verbose -1 param for LGBM to filter training logs

* Update unit test for managers

* Add tf-keras as dev dependency to resolve transformers compatibility issue

* Pin numpy version compatible with mxnet

* Update optional dependencies for causal ML manager
diff --git a/examples/examples-by-ml-library/libraries/causalml_example.py b/examples/examples-by-ml-library/libraries/causalml_example.py
@@ -0,0 +1,84 @@
+#    Copyright 2024 Neal Lathia
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import numpy as np
+import pandas as pd
+
+from libraries.util.datasets import load_causal_regression_dataset
+from libraries.util.domains import DIABETES_DOMAIN
+from causalml.inference.meta import XGBTRegressor, BaseSRegressor
+from causalml.metrics import qini_score
+from lightgbm.sklearn import LGBMRegressor
+
+from modelstore.model_store import ModelStore
+
+
+def _train_example_model() -> XGBTRegressor:
+    X_train, X_test, y_train, y_test, treatment_vector_train, treatment_vector_test = load_causal_regression_dataset()
+
+    params = {
+        "n_estimators": 250,
+        "max_depth": 4,
+        "learning_rate": 0.01,
+        "n_jobs": 1,
+        "verbose": -1
+    }
+
+    # Train causal regressor
+    lgbm = LGBMRegressor(**params)
+    model = BaseSRegressor(learner=lgbm)
+    model.fit(X_train, treatment_vector_train, y_train)
+
+    X_test = pd.DataFrame(X_test)
+    X_test["causal_scores"] = model.predict(X_test)
+    X_test["outcomes"] = y_test
+    X_test["treatment"] = treatment_vector_test
+
+    result = qini_score(
+        X_test[["causal_scores", "outcomes", "treatment"]],
+        outcome_col="outcomes",
+        treatment_col="treatment",
+    )
+    print(f"🔍  Trained model Qini score={result}.")
+    return model
+
+
+def train_and_upload(modelstore: ModelStore) -> dict:
+    # Train a causalml regressor
+    model = _train_example_model()
+
+    # Upload the model to the model store
+    print(f'⤴️  Uploading the causalml model to the "{DIABETES_DOMAIN}" domain.')
+    meta_data = modelstore.upload(DIABETES_DOMAIN, model=model)
+    return meta_data
+
+
+def load_and_test(modelstore: ModelStore, model_domain: str, model_id: str):
+    # Load the model back into memory!
+    print(f'⤵️  Loading the causalml "{model_domain}" domain model={model_id}')
+    model = modelstore.load(model_domain, model_id)
+
+    # Run some example predictions
+    _, X_test, _, y_test, _, treatment_vector_test = load_causal_regression_dataset()
+
+    X_test = pd.DataFrame(X_test)
+    X_test["causal_scores"] = model.predict(X_test)
+    X_test["outcomes"] = y_test
+    X_test["treatment"] = treatment_vector_test
+
+    result = qini_score(
+        X_test[["causal_scores", "outcomes", "treatment"]],
+        outcome_col="outcomes",
+        treatment_col="treatment",
+    )
+    print(f"🔍  Loaded model Qini score={result}.")
diff --git a/examples/examples-by-ml-library/libraries/util/datasets.py b/examples/examples-by-ml-library/libraries/util/datasets.py
@@ -12,6 +12,7 @@
 #    See the License for the specific language governing permissions and
 #    limitations under the License.
 import pandas as pd
+import numpy as np
 import torch
 from sklearn.datasets import (
     fetch_20newsgroups,
@@ -50,6 +51,23 @@ def load_regression_dataframe():
     return df
 
 
+def load_causal_regression_dataset():
+    X_train, X_test, y_train, y_test = load_regression_dataset()
+
+    # Dummy treatment vector to simulate experiment
+    treatment_vector_train = np.zeros(X_train.shape[0])
+    treatment_vector_test = np.zeros(X_test.shape[0])
+
+    # Simulating a 50% treatment / control split
+    treatment_mask_train = int(X_train.shape[0] * 0.5)
+    treatment_mask_test = int(X_test.shape[0] * 0.5)
+
+    treatment_vector_train[:treatment_mask_train] = 1
+    treatment_vector_test[:treatment_mask_test] = 1
+
+    return X_train, X_test, y_train, y_test, treatment_vector_train, treatment_vector_test
+
+
 def load_classification_dataset():
     print(f"🔍  Loading the breast cancer dataset")
     databunch = load_breast_cancer()
diff --git a/examples/examples-by-ml-library/main.py b/examples/examples-by-ml-library/main.py
@@ -17,6 +17,7 @@
 from libraries import (
     annoy_example,
     catboost_example,
+    causalml_example,
     fastai_example,
     gensim_example,
     keras_example,
@@ -50,6 +51,7 @@
 EXAMPLES = {
     "annoy": annoy_example,
     "catboost": catboost_example,
+    "causalml": causalml_example,
     "dpt": dpt,
     "fastai": fastai_example,
     "file": raw_file_example,
diff --git a/examples/examples-by-ml-library/run-all.sh b/examples/examples-by-ml-library/run-all.sh
@@ -1,6 +1,6 @@
 set -e
 backends=( filesystem aws-s3 google-cloud-storage azure-container minio )
-frameworks=( annoy catboost fastai file gensim keras lightgbm \
+frameworks=( annoy catboost causalml fastai file gensim keras lightgbm \
 	onnx-sklearn onnx-lightgbm prophet pyspark pytorch pytorch-lightning \
 	sklearn sklearn-with-explainer sklearn-with-extras skorch xgboost xgboost-booster \
 	tensorflow hf-distilbert hf-gpt2-pt hf-gpt2-tf segment-anything yolov5 )
diff --git a/modelstore/models/causalml.py b/modelstore/models/causalml.py
@@ -0,0 +1,71 @@
+#    Copyright 2024 Neal Lathia
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import os
+from functools import partial
+from typing import Any
+
+from modelstore.metadata import metadata
+from modelstore.models.common import load_joblib, save_joblib
+from modelstore.models.model_manager import ModelManager
+from modelstore.storage.storage import CloudStorage
+
+MODEL_FILE = "model.joblib"
+
+
+class CausalMLManager(ModelManager):
+
+    """
+    Model persistence for Causal ML models:
+    https://causalml.readthedocs.io/en/latest/index.html
+    """
+
+    NAME = "causalml"
+
+    def __init__(self, storage: CloudStorage = None):
+        super().__init__(self.NAME, storage)
+
+    def required_dependencies(self) -> list:
+        return ["causalml"]
+
+    def optional_dependencies(self) -> list:
+        deps = super().optional_dependencies()
+        return deps + ["Cython", "joblib"]
+
+    def _required_kwargs(self):
+        return ["model"]
+
+    def matches_with(self, **kwargs) -> bool:
+        # pylint: disable=import-outside-toplevel
+        import causalml
+
+        return any(
+            [
+                isinstance(kwargs.get("model"), causalml.inference.meta.base.BaseLearner),
+                isinstance(kwargs.get("model"), causalml.propensity.PropensityModel),
+            ]
+        )
+
+    def _get_functions(self, **kwargs) -> list:
+        if not self.matches_with(**kwargs):
+            raise TypeError("This model is not a Causal ML model!")
+
+        return [partial(save_joblib, model=kwargs["model"], file_name=MODEL_FILE)]
+
+    def load(self, model_path: str, meta_data: metadata.Summary) -> Any:
+        super().load(model_path, meta_data)
+
+        # @Future: check if loading into same version of joblib
+        # as was used for saving
+        file_name = os.path.join(model_path, MODEL_FILE)
+        return load_joblib(file_name)
diff --git a/modelstore/models/managers.py b/modelstore/models/managers.py
@@ -16,6 +16,7 @@
 from modelstore.metadata.code.dependencies import module_exists
 from modelstore.models.annoy import AnnoyManager
 from modelstore.models.catboost import CatBoostManager
+from modelstore.models.causalml import CausalMLManager
 from modelstore.models.fastai import FastAIManager
 from modelstore.models.gensim import GensimManager
 from modelstore.models.lightgbm import LightGbmManager
@@ -41,6 +42,7 @@
     for m in [
         AnnoyManager,
         CatBoostManager,
+        CausalMLManager,
         FastAIManager,
         ModelFileManager,
         GensimManager,
@@ -97,4 +99,3 @@ def get_manager(name: str, storage: CloudStorage = None) -> ModelManager:
         if not module_exists(x):
             raise ValueError(f"could not create manager for {name}: {x} not installed")
     return manager
-    
diff --git a/requirements-dev1.txt b/requirements-dev1.txt
@@ -11,6 +11,7 @@ minio
 # Machine Learning
 annoy
 catboost
+causalml
 fastai  # Note: 1.0.61 has different import paths!
 gensim
 Keras-Preprocessing
@@ -26,6 +27,7 @@ skl2onnx
 skorch
 tf-keras
 tensorflow
+tf-keras
 torch
 torchvision
 transformers
diff --git a/requirements.txt b/requirements.txt
@@ -5,4 +5,4 @@ joblib>=1.0.0
 requests>=2.23.0
 tqdm>=4.54.1
 click>=7.0
-numpy 
+numpy==1.23.1
diff --git a/tests/models/test_causalml.py b/tests/models/test_causalml.py
diff --git a/tests/models/test_managers.py b/tests/models/test_managers.py