From 76f4b4e84ef8b736bfb6aedaa950e25d1c1d92a7 Mon Sep 17 00:00:00 2001
From: Tom Veasey <tveasey@users.noreply.github.com>
Date: Tue, 5 May 2020 17:30:54 +0100
Subject: [PATCH] [ML] Add a constant to the prediction which minimises the
 unregularised loss for classification and regression (#1192)

---
 docs/CHANGELOG.asciidoc                            |  3 +++
 include/maths/CBoostedTreeImpl.h                   |  1 +
 .../CDataFrameAnalyzerFeatureImportanceTest.cc     |  4 ++--
 lib/maths/CBoostedTreeImpl.cc                      | 14 +++++++-------
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index 096b7a5ff..ef031738c 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -59,6 +59,9 @@
 * Improve false positive rates from periodicity test for time series anomaly detection.
   (See {ml-pull}1177[#1177].)
 * Break progress reporting of data frame analyses into multiple phases. (See {ml-pull}1179[#1179].)
+* Properly centre the data before training for classification and regression begins by adding
+  the constant which minimises the unregularised loss to the predictions. This means we can choose
+  a better smoothing bias and should reduce the number of trees. (See {ml-pull}1192[#1192].)
 
 === Bug Fixes
 
diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h
index d8b99e0fe..292311975 100644
--- a/include/maths/CBoostedTreeImpl.h
+++ b/include/maths/CBoostedTreeImpl.h
@@ -247,6 +247,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
                                               const core::CPackedBitVector& trainingRowMask,
                                               const core::CPackedBitVector& testingRowMask,
                                               double eta,
+                                              double lambda,
                                               TNodeVec& tree) const;
 
     //! Compute the mean of the loss function on the masked rows of \p frame.
diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc
index afa83fc61..6594549af 100644
--- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc
+++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc
@@ -485,11 +485,11 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceNoImportance, SFixture) {
             double c1{readShapValue(result, "c1")};
             double prediction{
                 result["row_results"]["results"]["ml"]["target_prediction"].GetDouble()};
-            // c1 explains 94% of the prediction value, i.e. the difference from the prediction is less than 2%.
+            // c1 explains 94% of the prediction value, i.e. the difference from the prediction is less than 6%.
             BOOST_REQUIRE_CLOSE(c1, prediction, 6.0);
             for (const auto& feature : {"c2", "c3", "c4"}) {
                 double c = readShapValue(result, feature);
-                BOOST_REQUIRE_SMALL(c, 2.0);
+                BOOST_REQUIRE_SMALL(c, 3.0);
                 cNoImportanceMean.add(std::fabs(c));
             }
         }
diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc
index 0c8995afa..b07275fc0 100644
--- a/lib/maths/CBoostedTreeImpl.cc
+++ b/lib/maths/CBoostedTreeImpl.cc
@@ -523,8 +523,8 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat
 
     // At the start we will centre the data w.r.t. the given loss function.
     TNodeVec tree{CBoostedTreeNode{m_Loss->numberParameters()}};
-    this->refreshPredictionsAndLossDerivatives(frame, trainingRowMask,
-                                               testingRowMask, 1.0, tree);
+    this->refreshPredictionsAndLossDerivatives(frame, trainingRowMask, testingRowMask,
+                                               1.0 /*eta*/, 0.0 /*lambda*/, tree);
 
     return tree;
 }
@@ -585,8 +585,9 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame,
 
         if (tree.size() > 1) {
             scopeMemoryUsage.add(tree);
-            this->refreshPredictionsAndLossDerivatives(frame, trainingRowMask,
-                                                       testingRowMask, eta, tree);
+            this->refreshPredictionsAndLossDerivatives(
+                frame, trainingRowMask, testingRowMask, eta,
+                m_Regularization.leafWeightPenaltyMultiplier(), tree);
             forest.push_back(std::move(tree));
             eta = std::min(1.0, m_EtaGrowthRatePerTree * eta);
             retries = 0;
@@ -990,13 +991,12 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr
                                                             const core::CPackedBitVector& trainingRowMask,
                                                             const core::CPackedBitVector& testingRowMask,
                                                             double eta,
+                                                            double lambda,
                                                             TNodeVec& tree) const {
 
     using TArgMinLossVec = std::vector<CArgMinLoss>;
 
-    TArgMinLossVec leafValues(
-        tree.size(),
-        m_Loss->minimizer(m_Regularization.leafWeightPenaltyMultiplier(), m_Rng));
+    TArgMinLossVec leafValues(tree.size(), m_Loss->minimizer(lambda, m_Rng));
     auto nextPass = [&] {
         bool done{true};
         for (const auto& value : leafValues) {