From 4f8f6bf865620cbc73c5b5d216d3f3ebdeb22405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20P=C3=B6tz?= Date: Fri, 21 Jan 2022 00:26:30 +0100 Subject: [PATCH 1/4] add testfiles, first draft without binning --- scripts/builtin/impurityMeasures.dml | 111 ++++++++++++++++++ .../org/apache/sysds/common/Builtins.java | 1 + .../part1/BuiltinImpurityMeasuresTest.java | 83 +++++++++++++ .../functions/builtin/impurityMeasures.dml | 27 +++++ 4 files changed, 222 insertions(+) create mode 100644 scripts/builtin/impurityMeasures.dml create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java create mode 100644 src/test/scripts/functions/builtin/impurityMeasures.dml diff --git a/scripts/builtin/impurityMeasures.dml b/scripts/builtin/impurityMeasures.dml new file mode 100644 index 00000000000..db3ac780a5d --- /dev/null +++ b/scripts/builtin/impurityMeasures.dml @@ -0,0 +1,111 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This function computes the measure of impurity for the given dataset based on the passed method (gini or entropy). +# The current version expects the target vector to contain only 0 or 1 values. +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------------------------------------------------- +# X Matrix[Double] --- Feature matrix. +# Y Matrix[Double] --- Target vector containing 0 and 1 values +# R Matrix[Double] --- Vector indicating whether a feature is categorical or continuous. +# 1 denotes a continuous feature, larger values indicate the number +# of categories. +# method String --- String indicating the method to use; either "entropy" or "gini". +# ---------------------------------------------------------------------------------------------------------------------- + +# Output(s) +# ---------------------------------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------------------------------------------------- +# impurity_measures Matrix[Double] --- Row vector containing information/gini gain for each feature of +# the dataset. +# In case of gini, the values denote the gini gains, i.e. how much +# impurity was "removed" with the respective split. The higher the +# value, the better the split. +# In case of entropy, the values denote the information gains, i.e. +# how much entropy was removed. The higher the information gain, +# the better the split. +# ---------------------------------------------------------------------------------------------------------------------- + +m_impurityMeasures = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] R, String method) + return (Matrix[Double] impurity_measures) +{ + if (method != "entropy" & method != "gini") { + stop("Please specify the correct method - should be either entropy or gini.") + } + + imp_measures = matrix(numeric(), nrow = 1, ncol = ncol(X)) + + for (i in 1:ncol(X)) { + feature_measure = getMeasure(X[,i], Y, R[,i], method) + imp_measures[1,i] = feature_measure + } + impurity_measures = imp_measures +} + +getMeasure = function(Matrix[Double] feature, Matrix[Double] Y, Double n_categories, String method) + return (double impurity) +{ + n_true_labels = sum(Y) + n_false_labels = length(Y) - n_true_labels + parent_impurity = calcImpurity(n_true_labels, n_false_labels, length(feature), method) + + # calculate the impurity after the split + new_impurity = 0 + for (i in 1:n_categories) { + count_true = 0 + count_false = 0 + for (j in 1:length(feature)) { + if (feature[j] == i) { + if (Y[j] == 0) { + count_false = count_false + 1 + } else { + count_true = count_true + 1 + } + } + } + new_impurity = new_impurity + calcImpurity(count_true, count_false, length(feature), method) + } + impurity = parent_impurity - new_impurity +} + +calcImpurity = function(Double n_true, Double n_false, Integer n_vars, String method) + return (double impurity) +{ + prob_true = n_true / (n_true + n_false) + prob_false = n_false / (n_true + n_false) + weight = (n_true + n_false) / n_vars + + if (method == "entropy") { + if (prob_true == 1 | prob_false == 1) { + impurity = 0 + } + else { + impurity = (-1) * weight * (prob_true * log2(prob_true) + prob_false * log2(prob_false)) + } + } else if (method == "gini") { + impurity = weight * (prob_true * prob_true + prob_false * prob_false) + } + impurity = 0 +} diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index aa9c58c0b35..75188c1cbda 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -157,6 +157,7 @@ public enum Builtins { IMG_SAMPLE_PAIRING("img_sample_pairing", true), IMG_INVERT("img_invert", true), IMG_POSTERIZE("img_posterize", true), + IMPURITY_MEASURES("impurityMeasures", true), IMPUTE_BY_MEAN("imputeByMean", true), IMPUTE_BY_MEAN_APPLY("imputeByMeanApply", true), IMPUTE_BY_MEDIAN("imputeByMedian", true), diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java new file mode 100644 index 00000000000..67da0cb6ec5 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin.part1; + +import java.util.HashMap; + +import org.apache.sysds.common.Types; +import org.apache.sysds.common.Types.ExecType; +import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +public class BuiltinImpurityMeasuresTest extends AutomatedTestBase { + private final static String TEST_NAME = "impurityMeasures"; + private final static String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinImpurityMeasuresTest.class.getSimpleName() + "/"; + + private final static double eps = 1e-10; + + @Override + public void setUp() { + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"C"})); + } + + @Test + public void basicTest() { + double[][] X = {{1, 1}, {2, 2}}; + double[][] Y = {{1}, {0}}; + double[][] R = {{2, 2}}; + String method = "hi"; + String method2 = "gini"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method); + } + + private void runImpurityMeasuresTest(ExecType exec_type, double[][] X, double[][] Y, double[][] R, String method) { + Types.ExecMode platform_old = setExecMode(exec_type); + + try { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[] {"-args", input("X"), input("Y"), input("R"), method, output("impurity_measures")}; + HashMap expected_measures = new HashMap<>(); + expected_measures.put(new MatrixValue.CellIndex(1,1), 1.0); + expected_measures.put(new MatrixValue.CellIndex(1, 2), 1.0); + + writeInputMatrixWithMTD("X", X, true); + writeInputMatrixWithMTD("Y", Y, true); + writeInputMatrixWithMTD("R", R, true); + + runTest(true, false, null, -1); + //runTest(); + + HashMap actual_measures = readDMLMatrixFromOutputDir("impurity_measures"); + + TestUtils.compareMatrices(expected_measures, actual_measures, eps, "Expected measures", "Actual measures"); + } + finally { + rtplatform = platform_old; + } + } +} diff --git a/src/test/scripts/functions/builtin/impurityMeasures.dml b/src/test/scripts/functions/builtin/impurityMeasures.dml new file mode 100644 index 00000000000..d0402332cdf --- /dev/null +++ b/src/test/scripts/functions/builtin/impurityMeasures.dml @@ -0,0 +1,27 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($1) +Y = read($2) +R = read($3) +impurity_measures = impurityMeasures(X = X, Y = Y, R = R, method = $4); + +write(impurity_measures, $5); From 02e6ccfc8fc4749ab181bf0c4c7da0b2ca336a16 Mon Sep 17 00:00:00 2001 From: Florian Poetz Date: Fri, 21 Jan 2022 07:23:01 +0100 Subject: [PATCH 2/4] fixed bugs in impuritymeasures builtin, extended tests --- scripts/builtin/impurityMeasures.dml | 48 +++++++++--------- .../part1/BuiltinImpurityMeasuresTest.java | 49 ++++++++++++++++--- .../functions/builtin/impurityMeasures.dml | 4 +- 3 files changed, 67 insertions(+), 34 deletions(-) diff --git a/scripts/builtin/impurityMeasures.dml b/scripts/builtin/impurityMeasures.dml index db3ac780a5d..a32cb75f68a 100644 --- a/scripts/builtin/impurityMeasures.dml +++ b/scripts/builtin/impurityMeasures.dml @@ -38,7 +38,7 @@ # ---------------------------------------------------------------------------------------------------------------------- # NAME TYPE DEFAULT MEANING # ---------------------------------------------------------------------------------------------------------------------- -# impurity_measures Matrix[Double] --- Row vector containing information/gini gain for each feature of +# IM Matrix[Double] --- Row vector containing information/gini gain for each feature of # the dataset. # In case of gini, the values denote the gini gains, i.e. how much # impurity was "removed" with the respective split. The higher the @@ -49,63 +49,61 @@ # ---------------------------------------------------------------------------------------------------------------------- m_impurityMeasures = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] R, String method) - return (Matrix[Double] impurity_measures) + return (Matrix[Double] IM) { if (method != "entropy" & method != "gini") { stop("Please specify the correct method - should be either entropy or gini.") } - imp_measures = matrix(numeric(), nrow = 1, ncol = ncol(X)) + IM = matrix(0.0, rows = 1, cols = ncol(X)) for (i in 1:ncol(X)) { - feature_measure = getMeasure(X[,i], Y, R[,i], method) - imp_measures[1,i] = feature_measure + feature_measure = getMeasure(X[,i], Y, as.scalar(R[1,i]), method) + IM[1,i] = feature_measure } - impurity_measures = imp_measures } getMeasure = function(Matrix[Double] feature, Matrix[Double] Y, Double n_categories, String method) - return (double impurity) + return (Double gain) { n_true_labels = sum(Y) n_false_labels = length(Y) - n_true_labels parent_impurity = calcImpurity(n_true_labels, n_false_labels, length(feature), method) # calculate the impurity after the split - new_impurity = 0 + children_impurity = 0 for (i in 1:n_categories) { count_true = 0 count_false = 0 for (j in 1:length(feature)) { - if (feature[j] == i) { - if (Y[j] == 0) { - count_false = count_false + 1 + if (as.scalar(feature[j]) == i) { + if (as.scalar(Y[j]) == 0) { + count_false += 1 } else { - count_true = count_true + 1 + count_true += 1 } } } - new_impurity = new_impurity + calcImpurity(count_true, count_false, length(feature), method) + children_impurity = children_impurity + calcImpurity(count_true, count_false, length(feature), method) } - impurity = parent_impurity - new_impurity + gain = parent_impurity - children_impurity } -calcImpurity = function(Double n_true, Double n_false, Integer n_vars, String method) - return (double impurity) +calcImpurity = function(Double n_true, Double n_false, Double n_vars, String method) + return (Double impurity) { + impurity = 0 prob_true = n_true / (n_true + n_false) prob_false = n_false / (n_true + n_false) weight = (n_true + n_false) / n_vars - if (method == "entropy") { - if (prob_true == 1 | prob_false == 1) { - impurity = 0 - } - else { - impurity = (-1) * weight * (prob_true * log2(prob_true) + prob_false * log2(prob_false)) + if (prob_true != 1 & prob_false != 1) { # if there is more than one class, calculate new impurity according to method. + if (method == "entropy") { + scale_log = log(2) # doesn't change the result, just brings the calculation on the right scale for easier testing. + impurity = (-1) * weight * (prob_true * log(prob_true)/scale_log + prob_false * log(prob_false)/scale_log) + # impurity = (-1) * weight * (prob_true * log(prob_true) + prob_false * log(prob_false)) + } else if (method == "gini") { + impurity = weight * (1 - (prob_true^2 + prob_false^2)) } - } else if (method == "gini") { - impurity = weight * (prob_true * prob_true + prob_false * prob_false) } - impurity = 0 } diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java index 67da0cb6ec5..101da2f34a3 100644 --- a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java @@ -42,12 +42,33 @@ public void setUp() { } @Test - public void basicTest() { - double[][] X = {{1, 1}, {2, 2}}; + public void basicGiniTest() { + /*double[][] X = {{1, 1}, {2, 2}}; double[][] Y = {{1}, {0}}; - double[][] R = {{2, 2}}; - String method = "hi"; - String method2 = "gini"; + double[][] R = {{2, 2}};*/ + double[][] X = {{1,1,2,1}, {1,3,1,2}, {2,1,1,2}, {3,2,1,1}, {1,3,2,1}}; + double[][] Y = {{0}, {0}, {1}, {1}, {1}}; + double[][] R = {{3, 3, 2, 2}}; + /*double[][] X = {{1},{1},{1},{1},{1},{1},{2},{2},{2},{2}}; + double[][] Y = {{0}, {0}, {0}, {0}, {0}, {1}, {1}, {1}, {1}, {1}}; + double[][] R = {{2}};*/ + String method = "gini"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method); + } + + @Test + public void basicEntropyTest() { + /*double[][] X = {{1, 1}, {2, 2}}; + double[][] Y = {{1}, {0}}; + double[][] R = {{2, 2}};*/ + double[][] X = {{1,1,2,1}, {1,3,1,2}, {2,1,1,2}, {3,2,1,1}, {1,3,2,1}}; + double[][] Y = {{0}, {0}, {1}, {1}, {1}}; + double[][] R = {{3, 3, 2, 2}}; + /*double[][] X = {{1},{1},{1},{1},{1},{1},{2},{2},{2},{2}}; + double[][] Y = {{0}, {0}, {0}, {0}, {0}, {1}, {1}, {1}, {1}, {1}}; + double[][] R = {{2}};*/ + String method = "entropy"; runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method); } @@ -61,9 +82,21 @@ private void runImpurityMeasuresTest(ExecType exec_type, double[][] X, double[][ String HOME = SCRIPT_DIR + TEST_DIR; fullDMLScriptName = HOME + TEST_NAME + ".dml"; programArgs = new String[] {"-args", input("X"), input("Y"), input("R"), method, output("impurity_measures")}; + HashMap expected_measures = new HashMap<>(); - expected_measures.put(new MatrixValue.CellIndex(1,1), 1.0); - expected_measures.put(new MatrixValue.CellIndex(1, 2), 1.0); + if(method.equals("gini")) { + expected_measures.put(new MatrixValue.CellIndex(1, 1), 0.2133333333); + expected_measures.put(new MatrixValue.CellIndex(1, 2), 0.0799999999); + expected_measures.put(new MatrixValue.CellIndex(1, 3), 0.0133333333); + expected_measures.put(new MatrixValue.CellIndex(1, 4), 0.0133333333); + } + // comparing with values from https://planetcalc.com/8421/ + if(method.equals("entropy")) { + expected_measures.put(new MatrixValue.CellIndex(1, 1), 0.4199730940); + expected_measures.put(new MatrixValue.CellIndex(1, 2), 0.1709505945); + expected_measures.put(new MatrixValue.CellIndex(1, 3), 0.0199730940); + expected_measures.put(new MatrixValue.CellIndex(1, 4), 0.0199730940); + } writeInputMatrixWithMTD("X", X, true); writeInputMatrixWithMTD("Y", Y, true); @@ -74,6 +107,8 @@ private void runImpurityMeasuresTest(ExecType exec_type, double[][] X, double[][ HashMap actual_measures = readDMLMatrixFromOutputDir("impurity_measures"); + System.out.println(actual_measures); + System.out.println(expected_measures); TestUtils.compareMatrices(expected_measures, actual_measures, eps, "Expected measures", "Actual measures"); } finally { diff --git a/src/test/scripts/functions/builtin/impurityMeasures.dml b/src/test/scripts/functions/builtin/impurityMeasures.dml index d0402332cdf..01ab4cbf214 100644 --- a/src/test/scripts/functions/builtin/impurityMeasures.dml +++ b/src/test/scripts/functions/builtin/impurityMeasures.dml @@ -22,6 +22,6 @@ X = read($1) Y = read($2) R = read($3) -impurity_measures = impurityMeasures(X = X, Y = Y, R = R, method = $4); +IM = impurityMeasures(X = X, Y = Y, R = R, method = $4); -write(impurity_measures, $5); +write(IM, $5); From 23b0d9dca6a62a92722682b4747305c0b01cbb0e Mon Sep 17 00:00:00 2001 From: Florian Poetz Date: Sat, 22 Jan 2022 00:11:06 +0100 Subject: [PATCH 3/4] removed bugs from impurityMeasures builtin, added tests --- scripts/builtin/impurityMeasures.dml | 53 ++++++-- .../part1/BuiltinImpurityMeasuresTest.java | 123 +++++++++++++----- 2 files changed, 132 insertions(+), 44 deletions(-) diff --git a/scripts/builtin/impurityMeasures.dml b/scripts/builtin/impurityMeasures.dml index a32cb75f68a..ee39634a43d 100644 --- a/scripts/builtin/impurityMeasures.dml +++ b/scripts/builtin/impurityMeasures.dml @@ -38,8 +38,8 @@ # ---------------------------------------------------------------------------------------------------------------------- # NAME TYPE DEFAULT MEANING # ---------------------------------------------------------------------------------------------------------------------- -# IM Matrix[Double] --- Row vector containing information/gini gain for each feature of -# the dataset. +# IM Matrix[Double] --- (1 x ncol(X)) row vector containing information/gini gain for +# each feature of the dataset. # In case of gini, the values denote the gini gains, i.e. how much # impurity was "removed" with the respective split. The higher the # value, the better the split. @@ -58,12 +58,20 @@ m_impurityMeasures = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] IM = matrix(0.0, rows = 1, cols = ncol(X)) for (i in 1:ncol(X)) { - feature_measure = getMeasure(X[,i], Y, as.scalar(R[1,i]), method) - IM[1,i] = feature_measure + if (as.scalar(R[,i]) == 1) { + bins = 4 # todo: decide number of bins + binning_feature = applyBinning(X[,i], bins) + feature_measure = getMeasure(binning_feature, Y, bins, method) + IM[,i] = feature_measure + } else { + max_cat = max(X[,i]) + feature_measure = getMeasure(X[,i], Y, max_cat, method) + IM[,i] = feature_measure + } } } -getMeasure = function(Matrix[Double] feature, Matrix[Double] Y, Double n_categories, String method) +getMeasure = function(Matrix[Double] feature, Matrix[Double] Y, Double max_cat, String method) return (Double gain) { n_true_labels = sum(Y) @@ -72,19 +80,21 @@ getMeasure = function(Matrix[Double] feature, Matrix[Double] Y, Double n_categor # calculate the impurity after the split children_impurity = 0 - for (i in 1:n_categories) { + for (i in 1:max_cat) { count_true = 0 count_false = 0 for (j in 1:length(feature)) { - if (as.scalar(feature[j]) == i) { - if (as.scalar(Y[j]) == 0) { + if (as.scalar(feature[j,]) == i) { + if (as.scalar(Y[j,]) == 0) { count_false += 1 } else { count_true += 1 } } } - children_impurity = children_impurity + calcImpurity(count_true, count_false, length(feature), method) + if (!(count_true == 0 & count_false == 0)) { + children_impurity = children_impurity + calcImpurity(count_true, count_false, length(feature), method) + } } gain = parent_impurity - children_impurity } @@ -99,11 +109,32 @@ calcImpurity = function(Double n_true, Double n_false, Double n_vars, String met if (prob_true != 1 & prob_false != 1) { # if there is more than one class, calculate new impurity according to method. if (method == "entropy") { - scale_log = log(2) # doesn't change the result, just brings the calculation on the right scale for easier testing. + scale_log = log(2) # scales the result for easier testing. impurity = (-1) * weight * (prob_true * log(prob_true)/scale_log + prob_false * log(prob_false)/scale_log) - # impurity = (-1) * weight * (prob_true * log(prob_true) + prob_false * log(prob_false)) } else if (method == "gini") { impurity = weight * (1 - (prob_true^2 + prob_false^2)) } } } + +# for now not very efficient equal width binning... +applyBinning = function(Matrix[Double] feature, Double bins) + return (Matrix[Double] output_f) +{ + n_bins = max(bins, nrow(feature)) + max_v = max(feature) + min_v = min(feature) + width = (max_v - min_v) / n_bins + output_f = matrix(1, rows = nrow(feature), cols = 1) + + for (i in 1:nrow(feature)) { + c_value = as.scalar(feature[i,]) + filled = 0 + for (j in 1:n_bins) { + if (c_value <= (min_v + j * width) & filled == 0) { + output_f[i,] = j + filled = 1 + } + } + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java index 101da2f34a3..c59601f9e64 100644 --- a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java @@ -42,38 +42,111 @@ public void setUp() { } @Test - public void basicGiniTest() { - /*double[][] X = {{1, 1}, {2, 2}}; + public void GiniTest1() { + double[][] X = {{1, 1}, {2, 2}}; double[][] Y = {{1}, {0}}; - double[][] R = {{2, 2}};*/ + double[][] R = {{2, 2}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.5); + expected_m.put(new MatrixValue.CellIndex(1, 2), 0.5); + String method = "gini"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + @Test + public void GiniTest2() { + double[][] X = {{1},{1},{1},{1},{1},{1},{2},{2},{2},{2}}; + double[][] Y = {{0}, {0}, {0}, {0}, {0}, {1}, {1}, {1}, {1}, {1}}; + double[][] R = {{2}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.3333333333); + String method = "gini"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + @Test + public void GiniTest3() { double[][] X = {{1,1,2,1}, {1,3,1,2}, {2,1,1,2}, {3,2,1,1}, {1,3,2,1}}; double[][] Y = {{0}, {0}, {1}, {1}, {1}}; double[][] R = {{3, 3, 2, 2}}; - /*double[][] X = {{1},{1},{1},{1},{1},{1},{2},{2},{2},{2}}; - double[][] Y = {{0}, {0}, {0}, {0}, {0}, {1}, {1}, {1}, {1}, {1}}; - double[][] R = {{2}};*/ + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.2133333333); + expected_m.put(new MatrixValue.CellIndex(1, 2), 0.0799999999); + expected_m.put(new MatrixValue.CellIndex(1, 3), 0.0133333333); + expected_m.put(new MatrixValue.CellIndex(1, 4), 0.0133333333); + String method = "gini"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + @Test + public void GiniWithContinuousValues1() { + double[][] X = {{1.5}, {12.6}, {3.4}, {14.2}}; + double[][] Y = {{0}, {1}, {0}, {1}}; + double[][] R = {{1}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.5); String method = "gini"; - runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method); + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); } @Test - public void basicEntropyTest() { - /*double[][] X = {{1, 1}, {2, 2}}; + public void GiniWithContinuousValues2() { + double[][] X = {{1.5}, {12.6}, {3.4}, {14.2}}; + double[][] Y = {{0}, {1}, {0}, {1}}; + double[][] R = {{1}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.5); + String method = "gini"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + // comparing with values from https://planetcalc.com/8421/ + @Test + public void EntropyTest1() { + double[][] X = {{1, 1}, {2, 2}}; double[][] Y = {{1}, {0}}; - double[][] R = {{2, 2}};*/ + double[][] R = {{2, 2}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 1.0); + expected_m.put(new MatrixValue.CellIndex(1, 2), 1.0); + String method = "entropy"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + @Test + public void EntropyTest2() { + double[][] X = {{1},{1},{1},{1},{1},{1},{2},{2},{2},{2}}; + double[][] Y = {{0},{0},{0},{0},{0},{1},{1},{1},{1},{1}}; + double[][] R = {{2}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.6099865470); + String method = "entropy"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + @Test + public void EntropyTest3() { double[][] X = {{1,1,2,1}, {1,3,1,2}, {2,1,1,2}, {3,2,1,1}, {1,3,2,1}}; double[][] Y = {{0}, {0}, {1}, {1}, {1}}; double[][] R = {{3, 3, 2, 2}}; - /*double[][] X = {{1},{1},{1},{1},{1},{1},{2},{2},{2},{2}}; - double[][] Y = {{0}, {0}, {0}, {0}, {0}, {1}, {1}, {1}, {1}, {1}}; - double[][] R = {{2}};*/ + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.4199730940); + expected_m.put(new MatrixValue.CellIndex(1, 2), 0.1709505945); + expected_m.put(new MatrixValue.CellIndex(1, 3), 0.0199730940); + expected_m.put(new MatrixValue.CellIndex(1, 4), 0.0199730940); String method = "entropy"; - runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method); + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); } - private void runImpurityMeasuresTest(ExecType exec_type, double[][] X, double[][] Y, double[][] R, String method) { + private void runImpurityMeasuresTest(ExecType exec_type, double[][] X, double[][] Y, double[][] R, String method, HashMap expected_m) { Types.ExecMode platform_old = setExecMode(exec_type); try { @@ -83,33 +156,17 @@ private void runImpurityMeasuresTest(ExecType exec_type, double[][] X, double[][ fullDMLScriptName = HOME + TEST_NAME + ".dml"; programArgs = new String[] {"-args", input("X"), input("Y"), input("R"), method, output("impurity_measures")}; - HashMap expected_measures = new HashMap<>(); - if(method.equals("gini")) { - expected_measures.put(new MatrixValue.CellIndex(1, 1), 0.2133333333); - expected_measures.put(new MatrixValue.CellIndex(1, 2), 0.0799999999); - expected_measures.put(new MatrixValue.CellIndex(1, 3), 0.0133333333); - expected_measures.put(new MatrixValue.CellIndex(1, 4), 0.0133333333); - } - // comparing with values from https://planetcalc.com/8421/ - if(method.equals("entropy")) { - expected_measures.put(new MatrixValue.CellIndex(1, 1), 0.4199730940); - expected_measures.put(new MatrixValue.CellIndex(1, 2), 0.1709505945); - expected_measures.put(new MatrixValue.CellIndex(1, 3), 0.0199730940); - expected_measures.put(new MatrixValue.CellIndex(1, 4), 0.0199730940); - } - writeInputMatrixWithMTD("X", X, true); writeInputMatrixWithMTD("Y", Y, true); writeInputMatrixWithMTD("R", R, true); runTest(true, false, null, -1); - //runTest(); HashMap actual_measures = readDMLMatrixFromOutputDir("impurity_measures"); System.out.println(actual_measures); - System.out.println(expected_measures); - TestUtils.compareMatrices(expected_measures, actual_measures, eps, "Expected measures", "Actual measures"); + System.out.println(expected_m); + TestUtils.compareMatrices(expected_m, actual_measures, eps, "Expected measures", "Actual measures"); } finally { rtplatform = platform_old; From a295933fba447bff1612f7c7e47bd80286c64664 Mon Sep 17 00:00:00 2001 From: Florian Poetz Date: Tue, 8 Feb 2022 08:02:10 +0100 Subject: [PATCH 4/4] added builtin instructions, added tests, code refactoring --- docs/site/builtins-reference.md | 45 +++++++++ scripts/builtin/impurityMeasures.dml | 53 +++++------ .../part1/BuiltinImpurityMeasuresTest.java | 94 ++++++++++++++++++- 3 files changed, 160 insertions(+), 32 deletions(-) diff --git a/docs/site/builtins-reference.md b/docs/site/builtins-reference.md index 9c9bb325277..11d19701ca6 100644 --- a/docs/site/builtins-reference.md +++ b/docs/site/builtins-reference.md @@ -50,6 +50,7 @@ limitations under the License. * [`img_brightness`-Function](#img_brightness-function) * [`img_crop`-Function](#img_crop-function) * [`img_mirror`-Function](#img_mirror-function) + * [`impurityMeasures`-Function](#impurityMeasures-function) * [`imputeByFD`-Function](#imputeByFD-function) * [`intersect`-Function](#intersect-function) * [`KMeans`-Function](#KMeans-function) @@ -1018,6 +1019,50 @@ B = img_mirror(img_in = A, horizontal_axis = TRUE) ``` +## `impurityMeasures`-Function + +`impurityMeasures()` computes the measure of impurity for each feature of the given dataset based on the passed method (gini or entropy). + +### Usage + +```r +IM = impurityMeasures(X = X, Y = Y, R = R, n_bins = 20, method = "gini"); +``` + +### Arguments + +| Name | Type | Default | Description | +| :--------- | :-------------- | :------ | :---------- | +| X | Matrix[Double] | --- | Feature matrix X | +| Y | Matrix[Double] | --- | Target vector Y containing only 0 or 1 values | +| R | Matrix[Double] | --- | Row vector R indicating whether a feature is categorical or continuous. 1 denotes a continuous feature, 2 denotes a categorical feature. | +| n_bins | Integer | `20` | Number of equi-width bins for binning in case of scale features. | +| method | String | --- | String indicating the method to use; either "entropy" or "gini". | + +### Returns + +| Name | Type | Description | +| :--- | :------------- | :---------- | +| IM | Matrix[Double] | (1 x ncol(X)) row vector containing information/gini gain for each feature of the dataset. In case of gini, the values denote the gini gains, i.e. how much impurity was removed with the respective split. The higher the value, the better the split. In case of entropy, the values denote the information gain, i.e. how much entropy was removed. The higher the information gain, the better the split. | + +### Example + +```r +X = matrix("4.0 3.0 2.8 3.5 + 2.4 1.0 3.4 2.9 + 1.1 1.0 4.9 3.4 + 5.0 2.0 1.4 1.8 + 1.1 3.0 1.0 1.9", rows=5, cols=4) +Y = matrix("1.0 + 0.0 + 0.0 + 1.0 + 0.0", rows=5, cols=1) +R = matrix("1.0 2.0 1.0 1.0", rows=1, cols=4) +IM = impurityMeasures(X = X, Y = Y, R = R, method = "entropy") +``` + + ## `imputeByFD`-Function The `imputeByFD`-function imputes missing values from observed values (if exist) diff --git a/scripts/builtin/impurityMeasures.dml b/scripts/builtin/impurityMeasures.dml index ee39634a43d..860bc629f6b 100644 --- a/scripts/builtin/impurityMeasures.dml +++ b/scripts/builtin/impurityMeasures.dml @@ -27,10 +27,10 @@ # NAME TYPE DEFAULT MEANING # ---------------------------------------------------------------------------------------------------------------------- # X Matrix[Double] --- Feature matrix. -# Y Matrix[Double] --- Target vector containing 0 and 1 values +# Y Matrix[Double] --- Target vector containing 0 and 1 values. # R Matrix[Double] --- Vector indicating whether a feature is categorical or continuous. -# 1 denotes a continuous feature, larger values indicate the number -# of categories. +# 1 denotes a continuous feature, 2 denotes a categorical feature. +# n_bins Integer 20 Number of bins for binning in case of scale features. # method String --- String indicating the method to use; either "entropy" or "gini". # ---------------------------------------------------------------------------------------------------------------------- @@ -41,14 +41,14 @@ # IM Matrix[Double] --- (1 x ncol(X)) row vector containing information/gini gain for # each feature of the dataset. # In case of gini, the values denote the gini gains, i.e. how much -# impurity was "removed" with the respective split. The higher the +# impurity was removed with the respective split. The higher the # value, the better the split. -# In case of entropy, the values denote the information gains, i.e. +# In case of entropy, the values denote the information gain, i.e. # how much entropy was removed. The higher the information gain, # the better the split. # ---------------------------------------------------------------------------------------------------------------------- -m_impurityMeasures = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] R, String method) +m_impurityMeasures = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] R, Integer n_bins = 20, String method) return (Matrix[Double] IM) { if (method != "entropy" & method != "gini") { @@ -57,21 +57,17 @@ m_impurityMeasures = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] IM = matrix(0.0, rows = 1, cols = ncol(X)) - for (i in 1:ncol(X)) { + parfor (i in 1:ncol(X)) { if (as.scalar(R[,i]) == 1) { - bins = 4 # todo: decide number of bins - binning_feature = applyBinning(X[,i], bins) - feature_measure = getMeasure(binning_feature, Y, bins, method) - IM[,i] = feature_measure + binned_feature = applyBinning(X[,i], n_bins) + IM[,i] = getImpurityMeasure(binned_feature, Y, n_bins, method) } else { - max_cat = max(X[,i]) - feature_measure = getMeasure(X[,i], Y, max_cat, method) - IM[,i] = feature_measure + IM[,i] = getImpurityMeasure(X[,i], Y, max(X[,i]), method) } } } -getMeasure = function(Matrix[Double] feature, Matrix[Double] Y, Double max_cat, String method) +getImpurityMeasure = function(Matrix[Double] feature, Matrix[Double] Y, Double max_cat, String method) return (Double gain) { n_true_labels = sum(Y) @@ -108,33 +104,36 @@ calcImpurity = function(Double n_true, Double n_false, Double n_vars, String met weight = (n_true + n_false) / n_vars if (prob_true != 1 & prob_false != 1) { # if there is more than one class, calculate new impurity according to method. - if (method == "entropy") { - scale_log = log(2) # scales the result for easier testing. - impurity = (-1) * weight * (prob_true * log(prob_true)/scale_log + prob_false * log(prob_false)/scale_log) + if (method == "entropy") { # dividing by log(2) to obtain the information gain in bits + impurity = (-1) * weight * (prob_true * log(prob_true)/log(2) + prob_false * log(prob_false)/log(2)) } else if (method == "gini") { impurity = weight * (1 - (prob_true^2 + prob_false^2)) } } } -# for now not very efficient equal width binning... -applyBinning = function(Matrix[Double] feature, Double bins) +applyBinning = function(Matrix[Double] feature, Double n_bins) return (Matrix[Double] output_f) { - n_bins = max(bins, nrow(feature)) + # equi-width binning. + + if (length(feature) < n_bins) { + n_bins = length(feature) + } max_v = max(feature) min_v = min(feature) width = (max_v - min_v) / n_bins output_f = matrix(1, rows = nrow(feature), cols = 1) - for (i in 1:nrow(feature)) { - c_value = as.scalar(feature[i,]) - filled = 0 - for (j in 1:n_bins) { - if (c_value <= (min_v + j * width) & filled == 0) { + parfor (i in 1:length(feature)) { + binned = FALSE + j = 1 + while (binned == FALSE) { + if (as.scalar(feature[i,]) <= min_v + j * width) { output_f[i,] = j - filled = 1 + binned = TRUE } + j += 1 } } } diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java index c59601f9e64..d67768b68d7 100644 --- a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java +++ b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImpurityMeasuresTest.java @@ -70,7 +70,7 @@ public void GiniTest2() { public void GiniTest3() { double[][] X = {{1,1,2,1}, {1,3,1,2}, {2,1,1,2}, {3,2,1,1}, {1,3,2,1}}; double[][] Y = {{0}, {0}, {1}, {1}, {1}}; - double[][] R = {{3, 3, 2, 2}}; + double[][] R = {{2, 2, 2, 2}}; HashMap expected_m = new HashMap<>(); expected_m.put(new MatrixValue.CellIndex(1, 1), 0.2133333333); expected_m.put(new MatrixValue.CellIndex(1, 2), 0.0799999999); @@ -81,9 +81,37 @@ public void GiniTest3() { runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); } + @Test + public void GiniPlayTennisTest() { + double[][] X = {{1,1,1,1}, + {1,1,1,2}, + {2,1,1,1}, + {3,2,1,1}, + {3,3,2,1}, + {3,3,2,2}, + {2,3,2,2}, + {1,2,1,1}, + {1,3,2,1}, + {3,2,2,1}, + {1,2,2,2}, + {2,2,1,2}, + {2,1,2,1}, + {3,2,1,2}}; + double[][] Y = {{0}, {0}, {1}, {1}, {1}, {0}, {1}, {0}, {1}, {1}, {1}, {1}, {1}, {0}}; + double[][] R = {{2, 2, 2, 2}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.1163265306); + expected_m.put(new MatrixValue.CellIndex(1, 2), 0.0187074829); + expected_m.put(new MatrixValue.CellIndex(1, 3), 0.0918367346); + expected_m.put(new MatrixValue.CellIndex(1, 4), 0.0306122448); + String method = "gini"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + @Test public void GiniWithContinuousValues1() { - double[][] X = {{1.5}, {12.6}, {3.4}, {14.2}}; + double[][] X = {{10.3}, {31.2}, {9.5}, {34.3}}; double[][] Y = {{0}, {1}, {0}, {1}}; double[][] R = {{1}}; HashMap expected_m = new HashMap<>(); @@ -95,11 +123,13 @@ public void GiniWithContinuousValues1() { @Test public void GiniWithContinuousValues2() { - double[][] X = {{1.5}, {12.6}, {3.4}, {14.2}}; + double[][] X = {{1.5, 23.7, 2929.6}, {12.6, 80.2, 2823.3}, {3.4, 238.2, 832.2}, {14.2, 282.1, 23.1}}; double[][] Y = {{0}, {1}, {0}, {1}}; - double[][] R = {{1}}; + double[][] R = {{1, 1, 1}}; HashMap expected_m = new HashMap<>(); expected_m.put(new MatrixValue.CellIndex(1, 1), 0.5); + expected_m.put(new MatrixValue.CellIndex(1, 2), 0.0); + expected_m.put(new MatrixValue.CellIndex(1, 3), 0.25); String method = "gini"; runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); @@ -135,7 +165,7 @@ public void EntropyTest2() { public void EntropyTest3() { double[][] X = {{1,1,2,1}, {1,3,1,2}, {2,1,1,2}, {3,2,1,1}, {1,3,2,1}}; double[][] Y = {{0}, {0}, {1}, {1}, {1}}; - double[][] R = {{3, 3, 2, 2}}; + double[][] R = {{2, 2, 2, 2}}; HashMap expected_m = new HashMap<>(); expected_m.put(new MatrixValue.CellIndex(1, 1), 0.4199730940); expected_m.put(new MatrixValue.CellIndex(1, 2), 0.1709505945); @@ -146,6 +176,60 @@ public void EntropyTest3() { runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); } + @Test + public void EntropyPlayTennisTest() { + double[][] X = {{1,1,1,1}, + {1,1,1,2}, + {2,1,1,1}, + {3,2,1,1}, + {3,3,2,1}, + {3,3,2,2}, + {2,3,2,2}, + {1,2,1,1}, + {1,3,2,1}, + {3,2,2,1}, + {1,2,2,2}, + {2,2,1,2}, + {2,1,2,1}, + {3,2,1,2}}; + double[][] Y = {{0}, {0}, {1}, {1}, {1}, {0}, {1}, {0}, {1}, {1}, {1}, {1}, {1}, {0}}; + double[][] R = {{2, 2, 2, 2}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 0.2467498198); + expected_m.put(new MatrixValue.CellIndex(1, 2), 0.0292225657); + expected_m.put(new MatrixValue.CellIndex(1, 3), 0.1518355014); + expected_m.put(new MatrixValue.CellIndex(1, 4), 0.0481270304); + String method = "entropy"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + @Test + public void EntropyWithContinuousValues1() { + double[][] X = {{10.3}, {31.2}, {9.5}, {34.3}}; + double[][] Y = {{0}, {1}, {0}, {1}}; + double[][] R = {{1}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 1.0); + String method = "entropy"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + + @Test + public void EntropyWithContinuousValues2() { + double[][] X = {{1.5, 23.7, 2929.6}, {12.6, 80.2, 2823.3}, {3.4, 238.2, 832.2}, {14.2, 282.1, 23.1}}; + double[][] Y = {{0}, {1}, {0}, {1}}; + double[][] R = {{1, 1, 1}}; + HashMap expected_m = new HashMap<>(); + expected_m.put(new MatrixValue.CellIndex(1, 1), 1.0); + expected_m.put(new MatrixValue.CellIndex(1, 2), 0.0); + expected_m.put(new MatrixValue.CellIndex(1, 3), 0.5); + String method = "entropy"; + + runImpurityMeasuresTest(ExecType.SPARK, X, Y, R, method, expected_m); + } + private void runImpurityMeasuresTest(ExecType exec_type, double[][] X, double[][] Y, double[][] R, String method, HashMap expected_m) { Types.ExecMode platform_old = setExecMode(exec_type);