From 76f4b4e84ef8b736bfb6aedaa950e25d1c1d92a7 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 5 May 2020 17:30:54 +0100 Subject: [PATCH] [ML] Add a constant to the prediction which minimises the unregularised loss for classification and regression (#1192) --- docs/CHANGELOG.asciidoc | 3 +++ include/maths/CBoostedTreeImpl.h | 1 + .../CDataFrameAnalyzerFeatureImportanceTest.cc | 4 ++-- lib/maths/CBoostedTreeImpl.cc | 14 +++++++------- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 096b7a5ff..ef031738c 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -59,6 +59,9 @@ * Improve false positive rates from periodicity test for time series anomaly detection. (See {ml-pull}1177[#1177].) * Break progress reporting of data frame analyses into multiple phases. (See {ml-pull}1179[#1179].) +* Really centre the data before training for classification and regression begins. This + means we can choose more optimal smoothing bias and should reduce the number of trees. + (See {ml-pull}1192[#1192].) === Bug Fixes diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index d8b99e0fe..292311975 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -247,6 +247,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { const core::CPackedBitVector& trainingRowMask, const core::CPackedBitVector& testingRowMask, double eta, + double lambda, TNodeVec& tree) const; //! Compute the mean of the loss function on the masked rows of \p frame. 
diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index afa83fc61..6594549af 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -485,11 +485,11 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceNoImportance, SFixture) { double c1{readShapValue(result, "c1")}; double prediction{ result["row_results"]["results"]["ml"]["target_prediction"].GetDouble()}; - // c1 explains 94% of the prediction value, i.e. the difference from the prediction is less than 2%. + // c1 explains 94% of the prediction value, i.e. the difference from the prediction is less than 6%. BOOST_REQUIRE_CLOSE(c1, prediction, 6.0); for (const auto& feature : {"c2", "c3", "c4"}) { double c = readShapValue(result, feature); - BOOST_REQUIRE_SMALL(c, 2.0); + BOOST_REQUIRE_SMALL(c, 3.0); cNoImportanceMean.add(std::fabs(c)); } } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 0c8995afa..b07275fc0 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -523,8 +523,8 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat // At the start we will centre the data w.r.t. the given loss function. 
TNodeVec tree{CBoostedTreeNode{m_Loss->numberParameters()}}; - this->refreshPredictionsAndLossDerivatives(frame, trainingRowMask, - testingRowMask, 1.0, tree); + this->refreshPredictionsAndLossDerivatives(frame, trainingRowMask, testingRowMask, + 1.0 /*eta*/, 0.0 /*lambda*/, tree); return tree; } @@ -585,8 +585,9 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame, if (tree.size() > 1) { scopeMemoryUsage.add(tree); - this->refreshPredictionsAndLossDerivatives(frame, trainingRowMask, - testingRowMask, eta, tree); + this->refreshPredictionsAndLossDerivatives( + frame, trainingRowMask, testingRowMask, eta, + m_Regularization.leafWeightPenaltyMultiplier(), tree); forest.push_back(std::move(tree)); eta = std::min(1.0, m_EtaGrowthRatePerTree * eta); retries = 0; @@ -990,13 +991,12 @@ void CBoostedTreeImpl::refreshPredictionsAndLossDerivatives(core::CDataFrame& fr const core::CPackedBitVector& trainingRowMask, const core::CPackedBitVector& testingRowMask, double eta, + double lambda, TNodeVec& tree) const { using TArgMinLossVec = std::vector<CArgMinLoss>; - TArgMinLossVec leafValues( - tree.size(), - m_Loss->minimizer(m_Regularization.leafWeightPenaltyMultiplier(), m_Rng)); + TArgMinLossVec leafValues(tree.size(), m_Loss->minimizer(lambda, m_Rng)); auto nextPass = [&] { bool done{true}; for (const auto& value : leafValues) {