diff --git a/scripts/builtin/applyAndEvaluate.dml b/scripts/builtin/applyAndEvaluate.dml index 646c71891c8..96e199d803a 100644 --- a/scripts/builtin/applyAndEvaluate.dml +++ b/scripts/builtin/applyAndEvaluate.dml @@ -99,10 +99,10 @@ return (Matrix[Double] result) if(max(eYtrain) == min(eYtrain)) stop("Y contains only one class") - score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE)) + score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp)) trainAccuracy = as.scalar(score[1, 1]) - score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE)) + score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) testAccuracy = as.scalar(score[1, 1]) @@ -172,9 +172,9 @@ return(Double dirtyScore) mask = as.matrix(metaList['mask']) mask = ifelse(sum(mask == dmask) < ncol(mask), matrix(1, rows=1, cols=ncol(mask)), mask) [eXtrain, eXtest] = recodeData(X, Xtest, mask, FALSE, "recode") - eXtrain = replace(target=eXtrain, pattern=NaN, replacement=1) - eXtest = replace(target=eXtest, pattern=NaN, replacement=1) + eXtrain = replace(target=eXtrain, pattern=NaN, replacement=0) + eXtest = replace(target=eXtest, pattern=NaN, replacement=0) [eXtrain, eXtest] = recodeData(as.frame(eXtrain), as.frame(eXtest), mask, FALSE, "dummycode") - score = eval(evaluationFunc, list(X=eXtrain, Y=Y, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE)) + score = eval(evaluationFunc, list(X=eXtrain, Y=Y, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) dirtyScore = as.scalar(score[1, 1]) } diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml index 74c1aebf104..22ad7b58e53 100644 --- a/scripts/builtin/bandit.dml +++ 
b/scripts/builtin/bandit.dml @@ -300,16 +300,17 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer { pipList = list(lp = lp, ph = ph_pip[i], hp = hp_matrix, flags = no_of_flag_vars) [evalFunOutput, hpForPruning, changesByOp] = crossV(X=X, y=Y, cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList, hpForPruning=hpForPruning, - changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = 0) + changesByOp=changesByOp, evalFunc=evaluationFunc) } else { - [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList, - hyperParameters=hp_matrix, hpForPruning=hpForPruning, changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE) + [eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] = executePipeline(logical=lp, pipeline=ph_pip[i], + X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList, hyperParameters=hp_matrix, hpForPruning=hpForPruning, + changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE) if(max(eYtrain) == min(eYtrain)) print("Y contains only one class") else - evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = 0)) + evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) } # evalFunOutput = eval(evaluationFunc, argList) @@ -564,9 +565,8 @@ return (Matrix[Double] features) # OHE features OHE = sum(colMaxs(X) * mask) features[1, 10] = OHE - tab = table(Y, 1) - distVal = nrow(tab) - if(nrow(Y) > 1 & distVal <= 10) + + if(nrow(Y) > 1 & min(Y) >= 1) { ctab = table(Y, 1) features[1, 11] = nrow(ctab) # number of classes @@ -630,7 +630,7 @@ return (String s) } crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double] evalFunHp, List[Unknown] pipList, List[Unknown] 
metaList, - Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Integer trainML = 0) + Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc) return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] changesByOp) { accuracyMatrix = matrix(0, cvk, 1) @@ -679,7 +679,7 @@ return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] chang changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE) } # print("test out: "+nrow(testy)) - res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = trainML)) + res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp)) accuracyMatrix[i] = res[1, 1] evalFunHp = res[, 2:ncol(res)] } diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml index 3a0358a2697..5eacc558158 100644 --- a/scripts/builtin/executePipeline.dml +++ b/scripts/builtin/executePipeline.dml @@ -83,7 +83,7 @@ s_executePipeline = function(Frame[String] logical = as.frame("NULL"), Frame[Str op = as.scalar(pipeline[1,i]) lgOp = as.scalar(logical[1,i]) - if(test == FALSE | lgOp != "CI") { + if(lgOp != "CI") { Xclone = X [hp, dataFlag, yFlag, executeFlag] = matrixToList(X, Y, mask, FD, hyperParameters[i], flagsCount, op) if(executeFlag == 1) { @@ -228,6 +228,9 @@ return (Matrix[Double] X) cat = removeEmpty(target=X, margin="cols", select = mask) # round categorical (if there is any floating point) cat = round(cat) + less_than_1_mask = cat < 1 + less_than_1 = less_than_1_mask * 9999 + cat = (cat * (less_than_1_mask == 0)) + less_than_1 # reconstruct original X X = X * (mask == 0) q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", @@ -485,11 +488,9 @@ return(Matrix[Double] hpForPruning, Matrix[Double] changesByOp) 
######################################################## # The function will flip the noisy labels ######################################################## -flipLabels = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Boolean verbose = FALSE) +flipLabels = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Integer maxIter = 10, Boolean verbose = FALSE) return (Matrix[Double] XY) { - - print("---- starting flip labels ---") max_y = max(Y) if(min(Y) != max(Y)) { @@ -498,7 +499,7 @@ return (Matrix[Double] XY) inc = ((yhat != Y) & (rowMaxs(prob) > threshold)) Xcor = removeEmpty(target = X, margin = "rows", select = (inc==0)) Ycor = removeEmpty(target = Y, margin = "rows", select = (inc==0)) - while(sum(inc) > 0) + while(sum(inc) > 0 & maxIter > 0) { # print("inc vector "+toString(inc)) Xinc = removeEmpty(target = X, margin = "rows", select = inc) @@ -512,9 +513,95 @@ return (Matrix[Double] XY) Ycor = rbind(Ycor, YcorI) X = Xinc Y = Yinc + if(verbose) print("maxIter: "+maxIter) + maxIter = maxIter - 1 } XY = cbind(Xcor, Ycor) } else XY = cbind(X, Y) +} + +####################################################################### +# function frequency conversion +# Inputs: The input dataset X, and mask of the columns +# Output: categorical columns are replaced with their frequencies +####################################################################### + +frequencyEncoding = function(Matrix[Double] X, Matrix[Double] mask) +return (Matrix[Double] freqX) { + + freqX = X + X = replace(target=X, pattern=NaN, replacement=1) + if(sum(mask) > 0) + { + + parfor(i in 1:ncol(mask)) + { + if(as.scalar(mask[1, i]) == 1) + { + Y = X[, i] + # occurrences of each category value (assumes recoded values 1..max(Y) - TODO confirm) + valueCount = table(Y, 1) + resp = matrix(0, nrow(Y), max(Y)) + resp = (resp + t(seq(1, max(Y)))) == Y + # resp is the one-hot indicator of each row's category value; weight it by the counts + # while(FALSE){} + resp = resp * t(valueCount) + freqX[, i] = rowSums(resp) + } + + } 
+ } +} + +####################################################################### +# function Weight of evidence / information gain (implemented as per-category label entropy) +# Inputs: The input dataset X, the labels Y, and mask of the columns +# Output: categorical columns are replaced with the label entropy of their category, Y is appended as last column +####################################################################### + +WoE = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask) +return (Matrix[Double] output) { + + freqX = X + X = replace(target=X, pattern=NaN, replacement=1) + if(sum(mask) > 0) + { + parfor(i in 1:ncol(mask)) + { + if(as.scalar(mask[1, i]) == 1) + { + L = X[, i] + entropy = getEntropy(L, Y) + resp = matrix(0, nrow(L), max(L)) + resp = (resp + t(seq(1, max(L)))) == L + resp = resp * entropy + freqX[, i] = rowSums(resp) + } + + } + } + output = cbind(freqX, Y) +} + + +getEntropy = function(Matrix[Double] eX, Matrix[Double] eY) +return(Matrix[Double] entropyMatrix) +{ + + tab = table(eX, eY) + # contingency table: rows = category values of eX, cols = classes of eY + entropyMatrix = matrix(0, rows=1, cols=nrow(tab)) + catTotal = rowSums(tab) + for(i in 1:nrow(tab)) + { + # class distribution within category i + entropy = (tab[i,]/catTotal[i]) + # entropy in bits; 0*log(0) terms (and empty categories) evaluate to NaN and are counted as 0 + catEntropy = sum(replace(target=-entropy * log(entropy, 2), pattern=NaN, replacement=0)) + catEntropy = ifelse(is.na(catEntropy), 0, catEntropy) + # store the entropy of category i + entropyMatrix[1, i] = catEntropy + } } \ No newline at end of file diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index e9aebafe76e..07bf98b96ad 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -113,36 +113,39 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # # # create logical pipeline seeds logicalSeedCI = frame([ "4", "ED", "MVI", "OTLR", "EC", "0", "0", "0", "0", - "5", "ED", "MVI", "CI", "SCALE","DUMMY","0", "0", "0", - "5", "OTLR", "EC", "CI", "SCALE", "DUMMY", "0","0", "0", - "7", "MVI", "OTLR", "ED", "EC", "SCALE", "CI", "DUMMY", "0", + 
"5", "ED", "EC", "SCALE", "CI","DUMMY","0", "0", "0", + "5", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0","0", "0", + "8", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "CI", "DUMMY", "5", "ED", "MVI", "SCALE", "CI", "DUMMY", "0", "0", "0", "4", "MVI", "SCALE", "CI", "DUMMY", "0", "0", "0", "0", - "4", "ED", "EC", "CI", "DUMMY", "0", "0", "0", "0", - "4", "MVI", "OTLR", "CI", "DUMMY", "0", "0", "0", "0", - "6", "MVI", "OTLR", "EC", "CI", "SCALE", "DUMMY", "0", "0", + "6", "ED", "MVI", "EC", "SCALE", "CI", "DUMMY", "0", "0", + "6", "MVI", "OTLR","EC", "SCALE", "CI", "DUMMY", "0", "0", + "7", "OTLR", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0", "7", "ED", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0" ], rows=10, cols=9) - logicalSeedNoCI = frame([ - "4", "ED", "MVI", "OTLR", "EC", "0", "0", - "3", "ED", "MVI", "DUMMY", "0","0","0", - "3", "OTLR", "EC", "DUMMY", "0","0","0", - "5", "MVI", "OTLR", "ED", "EC", "DUMMY", "0", - "3", "ED", "MVI", "DUMMY", "0", "0", "0", - "3", "MVI", "SCALE", "DUMMY", "0", "0", "0", - "3", "ED", "EC", "DUMMY", "0", "0", "0", - "3", "MVI", "OTLR", "DUMMY", "0", "0", "0", - "4", "MVI", "OTLR", "EC", "DUMMY", "0", "0", - "6", "ED", "MVI", "OTLR", "EC", "SCALE", "DUMMY" - ], rows=10, cols=7) + logicalSeedNoCI = frame([ + "4", "ED", "MVI", "OTLR", "EC", "0", "0", "0", + "4", "ED", "EC", "SCALE", "DUMMY","0", "0", "0", + "4", "OTLR", "EC", "SCALE", "DUMMY", "0","0", "0", + "7", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "DUMMY", + "4", "ED", "MVI", "SCALE", "DUMMY", "0", "0", "0", + "3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0", + "5", "ED", "MVI", "EC", "SCALE", "DUMMY", "0", "0", + "5", "MVI", "OTLR","EC", "SCALE", "DUMMY", "0", "0", + "6", "OTLR", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0", + "6", "ED", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0" + ], rows=10, cols=8) - tab = table(eYtrain, 1) - dist = nrow(tab) - if(nrow(eYtrain) > 0 & dist < 15) + dist = 0; if(min(eYtrain) >= 1) { + tab = table(eYtrain, 1) + dist = nrow(tab) + } + if(nrow(eYtrain) 
> 0 & min(eYtrain) >= 1 & dist <= 15) logical = logicalSeedCI - else + else { logical = logicalSeedNoCI + } idx = as.integer(as.scalar(logical[1, 1])) + 1 category = logical[1, 2:idx] @@ -150,7 +153,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a [bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, cmr=cmr, cat=category, population=logical[2:nrow(logical)], max_iter=ceil(resource_val/topK), metaList = metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters, - num_inst=3 , num_exec=2, cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx) + num_inst=nrow(primitives), num_exec=ceil(resource_val/topK), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx) t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s"); topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL") @@ -231,8 +234,8 @@ return(Double dirtyScore, Matrix[Double] evalFunHp) mask = as.matrix(metaList['mask']) mask = ifelse(sum(mask == dmask) < ncol(mask), matrix(1, rows=1, cols=ncol(mask)), mask) [eXtrain, eXtest] = recodeData(X, Xtest, mask, cv, "recode") - eXtrain = replace(target=eXtrain, pattern=NaN, replacement = 1) - eXtest = replace(target=eXtest, pattern=NaN, replacement = 1) + eXtrain = replace(target=eXtrain, pattern=NaN, replacement = 0) + eXtest = replace(target=eXtest, pattern=NaN, replacement = 0) dirtyScore = 100 print(prefix+" sample from train data and dummy code"); [eXtrain, Ytrain] = utils::doSample(eXtrain, Y, sample, TRUE) @@ -242,10 +245,10 @@ return(Double dirtyScore, Matrix[Double] evalFunHp) print(prefix+" hyper-parameter tuning"); if(cv) { score = crossV(X=eXtrain, y=Ytrain, cvk=cvk, evalFunHp=evalFunHp, - pipList=pipList, metaList=metaList, evalFunc=evaluationFunc, trainML = 1) + pipList=pipList, metaList=metaList, evalFunc=evaluationFunc) } else { - score = eval(evaluationFunc, list(X=eXtrain, Y=Ytrain, Xtest=eXtest, 
Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = 1)) + score = eval(evaluationFunc, list(X=eXtrain, Y=Ytrain, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) } dirtyScore = as.scalar(score[1, 1]) diff --git a/scripts/pipelines/properties/param.csv b/scripts/pipelines/properties/param.csv index c99a18eb317..cde9c2605ab 100644 --- a/scripts/pipelines/properties/param.csv +++ b/scripts/pipelines/properties/param.csv @@ -6,15 +6,17 @@ normalize,0,0,0,0,0,0,,,,,,,,,,,, imputeByMean,0,1,0,0,0,2,,,,,,,,,,,, imputeByMedian,0,1,0,0,0,2,,,,,,,,,,,, mice,2,1,0,0,1,2,INT,FP,1,3,0.5,1,,,,,, -abstain,1,0,0,1,1,2,FP,0.6,0.9,,,,,,,,, -flipLabels,1,0,0,1,1,2,FP,0.6,0.9,,,,,,,,, +abstain,1,0,0,1,1,2,FP,0.6,0.8,,,,,,,,, +flipLabels,2,0,0,1,1,2,FP,INT,0.6,0.9,1,20,,,,,, SMOTE,1,1,0,1,1,2,INT,100,500,,,,,,,,, m_pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0,,, ppca,4,0,0,0,1,2,INT,INT,FP,FP,100,200,1,10,1.00E-09,1.00E-06,1.00E-02,1.00E-01 fillDefault,0,0,0,0,0,2,,,,,,,,,,,, dummycoding,0,1,0,0,0,2,,,,,,,,,,,, +frequencyEncoding,0,1,0,0,0,2,,,,,,,,,,,, +WoE,0,1,0,1,0,2,,,,,,,,,,,, scale,2,0,0,0,0,0,BOOL,BOOL,0,1,0,1,,,,,, forward_fill,1,0,0,0,1,2,BOOL,0,1,,,,,,,,, imputeByFd,1,0,1,0,0,1,FP,0.6,0.9,,,,,,,,, -underSampling,1,0,0,1,0,2,FP,0.6,0.99,,,,,,,,, wtomeklink,0,0,0,1,0,2,,,,,,,,,,,, +underSampling,1,0,0,1,0,2,FP,0.1,0.6,,,,,,,,, diff --git a/scripts/pipelines/properties/primitives.csv b/scripts/pipelines/properties/primitives.csv index 0afcc52e2b4..53d916079bd 100644 --- a/scripts/pipelines/properties/primitives.csv +++ b/scripts/pipelines/properties/primitives.csv @@ -1,7 +1,7 @@ ED,MVI,OTLR,EC,SCALE,CI,DUMMY,DIM imputeByFd,imputeByMean,winsorize,imputeByMean,scale,abstain,dummycoding,m_pca -outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,wtomeklink,,ppca -outlierByIQR,mice,outlierByIQR,fillDefault,,SMOTE,, +outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,wtomeklink,frequencyEncoding,ppca 
+outlierByIQR,mice,outlierByIQR,fillDefault,,SMOTE,WoE, ,fillDefault,,,,flipLabels,, -,imputeByFd,,,,,, +,imputeByFd,,,,underSampling,, ,forward_fill,,,,,, diff --git a/scripts/pipelines/properties/testPrimitives.csv b/scripts/pipelines/properties/testPrimitives.csv index c1e743396b3..a5cdc3ed8e2 100644 --- a/scripts/pipelines/properties/testPrimitives.csv +++ b/scripts/pipelines/properties/testPrimitives.csv @@ -1,3 +1,3 @@ ED,MVI,OTLR,EC,SCALE,CI,DUMMY,DIM ,imputeByMean,winsorize,imputeByMean,scale,abstain,dummycoding,m_pca -outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,wtomeklink,,ppca +outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,underSampling,frequencyEncoding,ppca diff --git a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml index 6edd23904ca..a4e1c8c7e9b 100644 --- a/src/test/scripts/functions/pipelines/applyEvaluateTest.dml +++ b/src/test/scripts/functions/pipelines/applyEvaluateTest.dml @@ -60,7 +60,7 @@ trainData = F[1:split,] testData = F[split+1:nrow(F),] -result = applyAndEvaluate(trainData, testData, metaInfo, lg, pip[1,], hp[1,], "evalML", matrix("1 1e-3 1e-9 100", rows=1, cols=4), TRUE, FALSE) +result = applyAndEvaluate(trainData, testData, metaInfo, lg, pip[1,], hp[1,], "evalML", evalHp, TRUE, FALSE) header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3) result = as.frame(result) @@ -75,13 +75,13 @@ write(result, $6) # UDF for evaluation # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp, Boolean trainML = FALSE) + Matrix[Double] evalFunHp) return(Matrix[Double] accuracy) { beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), 
tol=as.scalar(evalFunHp[1,3]), - maxi=as.scalar(evalFunHp[1,4]), maxii=50, verbose=FALSE); + maxi=1000, maxii=100, verbose=FALSE); [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE) a = getAccuracy(Ytest, yhat, TRUE) print("accuracy: "+ accuracy+", accuracy weighted: "+a) diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv index 8350d69b5a7..746303da873 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv @@ -1,3 +1,3 @@ -77.42222222222223 -77.15555555555555 -76.97777777777777 +93.69369369369369 +93.69369369369369 +93.69369369369369 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv.mtd deleted file mode 100644 index d3f8f295c66..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv.mtd +++ /dev/null @@ -1,12 +0,0 @@ -{ - "data_type": "matrix", - "value_type": "double", - "rows": 3, - "cols": 1, - "nnz": 3, - "format": "csv", - "author": "olga_ovcharenko", - "header": false, - "sep": ",", - "created": "2021-09-15 13:08:58 CEST" -} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv index 3c0b94009e2..14992b730f3 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv @@ -1 +1 @@ -74.13333333333333 \ No newline at end of file +90.990990990991 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv.mtd 
b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv.mtd deleted file mode 100644 index 4689778cab9..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv.mtd +++ /dev/null @@ -1,7 +0,0 @@ -{ - "data_type": "scalar", - "value_type": "double", - "format": "text", - "author": "olga_ovcharenko", - "created": "2021-09-15 13:08:58 CEST" -} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv index b0891774a7a..c3223bad408 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv @@ -1 +1 @@ -10.0,0.001,1.0E-9,1000.0 +2.0,0.001,1.0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv.mtd deleted file mode 100644 index 98f02f09ea8..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv.mtd +++ /dev/null @@ -1,12 +0,0 @@ -{ - "data_type": "matrix", - "value_type": "double", - "rows": 1, - "cols": 4, - "nnz": 4, - "format": "csv", - "author": "olga_ovcharenko", - "header": false, - "sep": ",", - "created": "2021-09-15 13:08:58 CEST" -} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv.mtd deleted file mode 100644 index f73d79482b1..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv.mtd +++ /dev/null @@ -1,11 +0,0 @@ -{ - "data_type": "frame", - "schema": 
"STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,STRING,", - "rows": 1, - "cols": 18, - "format": "csv", - "author": "olga_ovcharenko", - "header": false, - "sep": ",", - "created": "2021-09-15 13:08:58 CEST" -} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv index c1201c5b388..51db8062157 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv @@ -1,3 +1,3 @@ -48.0,1.0,0.6455927908212413,0,0,1.0,0,0,1.0,0,0,0,1.0,0,0,0,2.0,1.0,0.7028229812430514,0,0,1.0,0,0,1.0,2.0,0,0,0,0,0,0,0,1.0,0.7518372764174678,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -48.0,1.0,0.6687888403388711,0,0,1.0,0,0,1.0,0,0,0,1.0,0,0,0,2.0,1.0,0.8636413728699717,0,0,1.0,0,0,1.0,2.0,0,1.0,0,0,0,0,0,1.0,0.6999444414086964,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -54.0,1.0,0.8858480964079888,0,0,0,1.0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,3.0,7.0,1.0,1.0,0,0,0,1.0,0,2.0,0,0,0,0,0,0,0,0,1.0,0.8436419752757551,0,0,0,0,1.0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +56.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,2.0,0.01012948685771077,0.9700112361003191,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,2.0,1.0,1.0,0,0,0,0,0,1.0,0.7879135917206637,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
+56.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,2.0,0.03120261172075603,0.9862240788883125,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,1.0,0,0,0,0,0,1.0,0.6444173997759863,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +56.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,2.0,0.03697717557557067,0.9732999162362644,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,2.0,0,1.0,0,0,0,0,0,1.0,0.6848186130743412,0,0,0,1.0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv.mtd deleted file mode 100644 index 80fe788bfb1..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv.mtd +++ /dev/null @@ -1,12 +0,0 @@ -{ - "data_type": "matrix", - "value_type": "double", - "rows": 3, - "cols": 60, - "nnz": 28, - "format": "csv", - "author": "olga_ovcharenko", - "header": false, - "sep": ",", - "created": "2021-09-15 13:08:58 CEST" -} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv index b6c716bf08d..e2f5bc4d986 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv @@ -1 +1 @@ -ED,MVI,ED,SCALE,CI,DUMMY +EC,MVI,OTLR,EC,SCALE,CI,DUMMY diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv.mtd deleted file mode 100644 index 241a6a065db..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv.mtd +++ /dev/null @@ -1,11 +0,0 @@ -{ - "data_type": "frame", - "schema": 
"STRING,STRING,STRING,STRING,", - "rows": 1, - "cols": 4, - "format": "csv", - "author": "olga_ovcharenko", - "header": false, - "sep": ",", - "created": "2021-09-15 13:08:58 CEST" -} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv index 1367ddfbcd7..37f2ffbfff4 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv @@ -1,3 +1,3 @@ -imputeByFd,imputeByMean,imputeByFd,scale,flipLabels,dummycoding -imputeByFd,imputeByMean,imputeByFd,scale,flipLabels,dummycoding -imputeByFd,imputeByMean,outlierBySd,scale,abstain,dummycoding +imputeByMean,imputeByMean,winsorize,imputeByMedian,scale,abstain,dummycoding +imputeByMean,imputeByMean,winsorize,imputeByMedian,scale,abstain,dummycoding +imputeByMean,imputeByMean,winsorize,imputeByMedian,scale,abstain,dummycoding diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv.mtd deleted file mode 100644 index 33bc1d4d7e5..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv.mtd +++ /dev/null @@ -1,11 +0,0 @@ -{ - "data_type": "frame", - "schema": "STRING,STRING,STRING,STRING,", - "rows": 3, - "cols": 4, - "format": "csv", - "author": "olga_ovcharenko", - "header": false, - "sep": ",", - "created": "2021-09-15 13:08:58 CEST" -} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml b/src/test/scripts/functions/pipelines/topkLogicalTest.dml index 481fb66135b..fdabe020ac5 100644 --- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml +++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml @@ -107,7 +107,7 @@ write(result , $O) # UDF for evaluation # choice of parameters provided by 
API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp, Boolean trainML = FALSE) + Matrix[Double] evalFunHp) return(Matrix[Double] accuracy) { diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml index 1ba5bdaca67..91d186c1334 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml +++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml @@ -24,7 +24,7 @@ source("scripts/pipelines/scripts/utils.dml") as utils; # read the inputs F = read($dirtyData, data_type="frame", format="csv", header=TRUE, - naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]); + naStrings= ["NA", "null"," ","NaN", "nan", "", " ", "_nan_", "inf", "?", "NAN", "99999"]); metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE); primitives = read($primitives, data_type = "frame", format="csv", header= TRUE) @@ -57,7 +57,7 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)] # [topKPipelines, topKHyperParams, topKScores, bestLogical, features, dirtyScore, evalHp] = result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, - cmr=matrix("2 0.7 1", rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=as.matrix(0), + cmr=matrix("2 0.7 1", rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN), topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) write(result, $O) @@ -66,15 +66,15 @@ write(result, $O) # UDF for evaluation # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) 
evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp, Integer trainML) + Matrix[Double] evalFunHp) return(Matrix[Double] output) { - if(trainML == 1) + if(is.na(as.scalar(evalFunHp[1,1]))) { - params = list("icpt", "reg", "tol", "maxii") - paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5), 10^seq(1,3)); - trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=-1, maxi=100, maxii=-1, verbose=FALSE); + params = list("icpt", "reg", "tol") + paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5)); + trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=-1, maxi=1000, maxii=100, verbose=FALSE); [B1, opt] = utils::topk_gridSearch(X=X, y=Y, Xtest=Xtest, ytest=Ytest, train="multiLogReg", predict="accuracy", numB=ncol(X)+1, cv=FALSE, cvk=0, params=params, paramValues=paramRanges, trainArgs=trainArgs, verbose=FALSE); evalFunHp = as.matrix(opt) @@ -86,7 +86,7 @@ return(Matrix[Double] output) } else { beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]), - maxi=as.scalar(evalFunHp[1,4]), maxii=50, verbose=FALSE); + maxi=1000, maxii=100, verbose=FALSE); [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE) a = getAccuracy(Ytest, yhat, TRUE) print("accuracy: "+toString(accuracy)+" weighted accuracy: "+a) diff --git a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml index a797db206db..7682ae444b1 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml +++ b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml @@ -23,7 +23,7 @@ source("scripts/pipelines/scripts/utils.dml") as utils; # read the inputs F = read($dirtyData, data_type="frame", format="csv", header=TRUE, - naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]); + naStrings= ["NA", 
"null"," ","NaN", "nan", "", " ", "_nan_", "inf", "?", "NAN", "99999"]); F = F[,2:ncol(F)] primitives = read($primitives, data_type = "frame", format="csv", header= TRUE) param = read($parameters, data_type = "frame", format="csv", header= TRUE) @@ -59,10 +59,10 @@ write(result, $O) # UDF for evaluation # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) evalRegression = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp, Boolean trainML = FALSE) + Matrix[Double] evalFunHp) return(Matrix[Double] output) { - if(trainML == 1) + if(is.na(as.scalar(evalFunHp[1,1]))) { # do the gridsearch for hyper-parameters params = list("icpt","reg", "tol", "maxi"); @@ -82,5 +82,5 @@ wmape = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B, Integer i # loss = as.matrix(sum((y - X%*%B)^2)); pred = lmPredict(X=X, B=B, ytest=y, icpt=icpt); WMAPE = sum(abs(y - pred))/sum(abs(y)) #this will give the lose into range of [0,1] - loss = as.matrix(WMAPE) + loss = ifelse(is.na(as.matrix(WMAPE)), as.matrix(0), as.matrix(WMAPE)) }