Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions scripts/builtin/applyAndEvaluate.dml
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ return (Matrix[Double] result)
if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")

score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE))
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
trainAccuracy = as.scalar(score[1, 1])

score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE))
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])


Expand Down Expand Up @@ -172,9 +172,9 @@ return(Double dirtyScore)
mask = as.matrix(metaList['mask'])
mask = ifelse(sum(mask == dmask) < ncol(mask), matrix(1, rows=1, cols=ncol(mask)), mask)
[eXtrain, eXtest] = recodeData(X, Xtest, mask, FALSE, "recode")
eXtrain = replace(target=eXtrain, pattern=NaN, replacement=1)
eXtest = replace(target=eXtest, pattern=NaN, replacement=1)
eXtrain = replace(target=eXtrain, pattern=NaN, replacement=0)
eXtest = replace(target=eXtest, pattern=NaN, replacement=0)
[eXtrain, eXtest] = recodeData(as.frame(eXtrain), as.frame(eXtest), mask, FALSE, "dummycode")
score = eval(evaluationFunc, list(X=eXtrain, Y=Y, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE))
score = eval(evaluationFunc, list(X=eXtrain, Y=Y, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
dirtyScore = as.scalar(score[1, 1])
}
18 changes: 9 additions & 9 deletions scripts/builtin/bandit.dml
Original file line number Diff line number Diff line change
Expand Up @@ -300,16 +300,17 @@ run_with_hyperparam = function(Frame[Unknown] lp, Frame[Unknown] ph_pip, Integer
{
pipList = list(lp = lp, ph = ph_pip[i], hp = hp_matrix, flags = no_of_flag_vars)
[evalFunOutput, hpForPruning, changesByOp] = crossV(X=X, y=Y, cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList, hpForPruning=hpForPruning,
changesByOp=changesByOp, evalFunc=evaluationFunc, trainML = 0)
changesByOp=changesByOp, evalFunc=evaluationFunc)
}
else
{
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] = executePipeline(logical=lp, pipeline=ph_pip[i], X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList,
hyperParameters=hp_matrix, hpForPruning=hpForPruning, changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp] = executePipeline(logical=lp, pipeline=ph_pip[i],
X=X, Y=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList, hyperParameters=hp_matrix, hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
if(max(eYtrain) == min(eYtrain))
print("Y contains only one class")
else
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = 0))
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
}

# evalFunOutput = eval(evaluationFunc, argList)
Expand Down Expand Up @@ -564,9 +565,8 @@ return (Matrix[Double] features)
# OHE features
OHE = sum(colMaxs(X) * mask)
features[1, 10] = OHE
tab = table(Y, 1)
distVal = nrow(tab)
if(nrow(Y) > 1 & distVal <= 10)

if(nrow(Y) > 1 & min(Y) >= 1)
{
ctab = table(Y, 1)
features[1, 11] = nrow(ctab) # number of classes
Expand Down Expand Up @@ -630,7 +630,7 @@ return (String s)
}

crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double] evalFunHp, List[Unknown] pipList, List[Unknown] metaList,
Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Integer trainML = 0)
Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc)
return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
{
accuracyMatrix = matrix(0, cvk, 1)
Expand Down Expand Up @@ -679,7 +679,7 @@ return (Matrix[Double] output, Matrix[Double] hpForPruning, Matrix[Double] chang
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
}
# print("test out: "+nrow(testy))
res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = trainML))
res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp))
accuracyMatrix[i] = res[1, 1]
evalFunHp = res[, 2:ncol(res)]
}
Expand Down
97 changes: 92 additions & 5 deletions scripts/builtin/executePipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ s_executePipeline = function(Frame[String] logical = as.frame("NULL"), Frame[Str
op = as.scalar(pipeline[1,i])
lgOp = as.scalar(logical[1,i])

if(test == FALSE | lgOp != "CI") {
if(lgOp != "CI") {
Xclone = X
[hp, dataFlag, yFlag, executeFlag] = matrixToList(X, Y, mask, FD, hyperParameters[i], flagsCount, op)
if(executeFlag == 1) {
Expand Down Expand Up @@ -228,6 +228,9 @@ return (Matrix[Double] X)
cat = removeEmpty(target=X, margin="cols", select = mask)
# round categorical (if there is any floating point)
cat = round(cat)
less_than_1_mask = cat < 1
less_than_1 = less_than_1_mask * 9999
cat = (cat * (less_than_1_mask == 0)) + less_than_1
# reconstruct original X
X = X * (mask == 0)
q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
Expand Down Expand Up @@ -485,11 +488,9 @@ return(Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
########################################################
# The function will flip the noisy labels
########################################################
flipLabels = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Boolean verbose = FALSE)
flipLabels = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Integer maxIter =10, Boolean verbose = FALSE)
return (Matrix[Double] XY)
{

print("---- starting flip labels ---")
max_y = max(Y)
if(min(Y) != max(Y))
{
Expand All @@ -498,7 +499,7 @@ return (Matrix[Double] XY)
inc = ((yhat != Y) & (rowMaxs(prob) > threshold))
Xcor = removeEmpty(target = X, margin = "rows", select = (inc==0))
Ycor = removeEmpty(target = Y, margin = "rows", select = (inc==0))
while(sum(inc) > 0)
while(sum(inc) > 0 & maxIter > 0)
{
# print("inc vector "+toString(inc))
Xinc = removeEmpty(target = X, margin = "rows", select = inc)
Expand All @@ -512,9 +513,95 @@ return (Matrix[Double] XY)
Ycor = rbind(Ycor, YcorI)
X = Xinc
Y = Yinc
print("maxIter: "+maxIter)
maxIter = maxIter - 1
}
XY = cbind(Xcor, Ycor)
}
else
XY = cbind(X, Y)
}

#######################################################################
# function frequency conversion
# Inputs: The input dataset X, and mask of the columns
# Output: categorical columns are replaced with their frequencies
#######################################################################

frequencyEncoding = function(Matrix[Double] X, Matrix[Double] mask)
return (Matrix[Double] freqX) {
  # Replaces every column flagged in `mask` by the occurrence count of each
  # cell's category code within that column.
  #   X     input matrix; flagged columns are used as codes 1..max(column)
  #   mask  1 x ncol row vector, 1 marks a column to encode
  #   freqX copy of X in which the flagged columns hold the counts; all other
  #         columns are passed through untouched (taken before NaN handling)

  freqX = X
  # NaN cells are counted as category code 1
  X = replace(target=X, pattern=NaN, replacement=1)
  hasFlaggedCols = sum(mask) > 0
  if(hasFlaggedCols)
  {
    parfor(j in 1:ncol(mask))
    {
      isFlagged = as.scalar(mask[1, j]) == 1
      if(isFlagged)
      {
        codesCol = X[, j]
        numCodes = max(codesCol)
        # counts[k, 1] = number of rows carrying code k
        counts = table(codesCol, 1)
        # indicator matrix: row r has a 1 in the column of its own code
        codeIds = t(seq(1, numCodes))
        indicator = (matrix(0, nrow(codesCol), numCodes) + codeIds) == codesCol
        # pick the count that belongs to each row's code
        freqX[, j] = rowSums(indicator * t(counts))
      }
    }
  }
}

#######################################################################
# function Weight of evidence / information gain
# Inputs: The input dataset X, and mask of the columns
# Output: categorical columns are replaced with the entropy of the label distribution of each category; Y is appended as the last column
#######################################################################

WoE = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask)
return (Matrix[Double] output) {
  # Replaces every column flagged in `mask` by the value getEntropy(column, Y)
  # reports for each cell's category code, then appends Y.
  #   X      input matrix; flagged columns are used as codes 1..max(column)
  #   Y      label column handed to getEntropy together with each flagged column
  #   mask   1 x ncol row vector, 1 marks a column to encode
  #   output cbind of the encoded matrix and Y; non-flagged columns are passed
  #          through untouched (taken before NaN handling)

  encoded = X
  # NaN cells are treated as category code 1
  X = replace(target=X, pattern=NaN, replacement=1)
  if(sum(mask) > 0)
  {
    parfor(j in 1:ncol(mask))
    {
      if(as.scalar(mask[1, j]) == 1)
      {
        codesCol = X[, j]
        numCodes = max(codesCol)
        # one value per category code, as a row vector
        perCode = getEntropy(codesCol, Y)
        # indicator matrix: row r has a 1 in the column of its own code
        codeIds = t(seq(1, numCodes))
        indicator = (matrix(0, nrow(codesCol), numCodes) + codeIds) == codesCol
        # pick the value that belongs to each row's code
        encoded[, j] = rowSums(indicator * perCode)
      }
    }
  }
  output = cbind(encoded, Y)
}


getEntropy = function(Matrix[Double] eX, Matrix[Double] eY)
return(Matrix[Double] entropyMatrix)
{
  # Computes, for every category code in eX, the base-2 Shannon entropy of the
  # label distribution in eY observed among the rows of that category.
  #   eX            column of category codes (row index of the contingency table)
  #   eY            column of label codes (column index of the contingency table)
  #   entropyMatrix 1 x nrow(table(eX, eY)) row vector, one entropy per category

  # tab[c, k] = number of rows with category c and label k
  tab = table(eX, eY)
  # per-category label probabilities (column vector broadcast over the rows)
  prob = tab / rowSums(tab)
  # element-wise entropy terms; p = 0 gives 0 * log(0) = NaN and an unseen
  # category gives 0/0 = NaN
  terms = -prob * log(prob, 2)
  # Apply the convention 0 * log(0) = 0 per term. Zeroing only after the sum
  # would discard the entropy of every category that misses at least one label.
  terms = replace(target=terms, pattern=NaN, replacement=0)
  entropyMatrix = t(rowSums(terms))
}
57 changes: 30 additions & 27 deletions scripts/builtin/topk_cleaning.dml
Original file line number Diff line number Diff line change
Expand Up @@ -113,44 +113,47 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
# # # create logical pipeline seeds
logicalSeedCI = frame([
"4", "ED", "MVI", "OTLR", "EC", "0", "0", "0", "0",
"5", "ED", "MVI", "CI", "SCALE","DUMMY","0", "0", "0",
"5", "OTLR", "EC", "CI", "SCALE", "DUMMY", "0","0", "0",
"7", "MVI", "OTLR", "ED", "EC", "SCALE", "CI", "DUMMY", "0",
"5", "ED", "EC", "SCALE", "CI","DUMMY","0", "0", "0",
"5", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0","0", "0",
"8", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "CI", "DUMMY",
"5", "ED", "MVI", "SCALE", "CI", "DUMMY", "0", "0", "0",
"4", "MVI", "SCALE", "CI", "DUMMY", "0", "0", "0", "0",
"4", "ED", "EC", "CI", "DUMMY", "0", "0", "0", "0",
"4", "MVI", "OTLR", "CI", "DUMMY", "0", "0", "0", "0",
"6", "MVI", "OTLR", "EC", "CI", "SCALE", "DUMMY", "0", "0",
"6", "ED", "MVI", "EC", "SCALE", "CI", "DUMMY", "0", "0",
"6", "MVI", "OTLR","EC", "SCALE", "CI", "DUMMY", "0", "0",
"7", "OTLR", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0",
"7", "ED", "MVI", "OTLR", "EC", "SCALE", "CI", "DUMMY", "0"
], rows=10, cols=9)

logicalSeedNoCI = frame([
"4", "ED", "MVI", "OTLR", "EC", "0", "0",
"3", "ED", "MVI", "DUMMY", "0","0","0",
"3", "OTLR", "EC", "DUMMY", "0","0","0",
"5", "MVI", "OTLR", "ED", "EC", "DUMMY", "0",
"3", "ED", "MVI", "DUMMY", "0", "0", "0",
"3", "MVI", "SCALE", "DUMMY", "0", "0", "0",
"3", "ED", "EC", "DUMMY", "0", "0", "0",
"3", "MVI", "OTLR", "DUMMY", "0", "0", "0",
"4", "MVI", "OTLR", "EC", "DUMMY", "0", "0",
"6", "ED", "MVI", "OTLR", "EC", "SCALE", "DUMMY"
], rows=10, cols=7)
logicalSeedNoCI = frame([
"3", "ED", "MVI", "OTLR", "EC", "0", "0", "0",
"4", "ED", "EC", "SCALE", "DUMMY","0", "0", "0",
"4", "OTLR", "EC", "SCALE", "DUMMY", "0","0", "0",
"7", "ED", "MVI", "OTLR", "ED", "EC", "SCALE", "DUMMY",
"4", "ED", "MVI", "SCALE", "DUMMY", "0", "0", "0",
"3", "MVI", "SCALE", "DUMMY", "0", "0", "0", "0",
"5", "ED", "MVI", "EC", "SCALE", "DUMMY", "0", "0",
"5", "MVI", "OTLR","EC", "SCALE", "DUMMY", "0", "0",
"6", "OTLR", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0",
"6", "ED", "MVI", "OTLR", "EC", "SCALE", "DUMMY", "0"
], rows=10, cols=8)

tab = table(eYtrain, 1)
dist = nrow(tab)
if(nrow(eYtrain) > 0 & dist < 15)
if(min(eYtrain) >= 1) {
tab = table(eYtrain, 1)
dist = nrow(tab)
}
if(nrow(eYtrain) > 0 & min(eYtrain) >= 1 & dist <= 15)
logical = logicalSeedCI
else
else {
logical = logicalSeedNoCI
}
idx = as.integer(as.scalar(logical[1, 1])) + 1
category = logical[1, 2:idx]

print("-- Cleaning - Enum Logical Pipelines: ");
[bestLogical, score] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, cmr=cmr,
cat=category, population=logical[2:nrow(logical)], max_iter=ceil(resource_val/topK), metaList = metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
num_inst=3 , num_exec=2, cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
num_inst=nrow(primitives), num_exec=ceil(resource_val/topK), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");

topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL")
Expand Down Expand Up @@ -231,8 +234,8 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
mask = as.matrix(metaList['mask'])
mask = ifelse(sum(mask == dmask) < ncol(mask), matrix(1, rows=1, cols=ncol(mask)), mask)
[eXtrain, eXtest] = recodeData(X, Xtest, mask, cv, "recode")
eXtrain = replace(target=eXtrain, pattern=NaN, replacement = 1)
eXtest = replace(target=eXtest, pattern=NaN, replacement = 1)
eXtrain = replace(target=eXtrain, pattern=NaN, replacement = 0)
eXtest = replace(target=eXtest, pattern=NaN, replacement = 0)
dirtyScore = 100
print(prefix+" sample from train data and dummy code");
[eXtrain, Ytrain] = utils::doSample(eXtrain, Y, sample, TRUE)
Expand All @@ -242,10 +245,10 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
print(prefix+" hyper-parameter tuning");
if(cv) {
score = crossV(X=eXtrain, y=Ytrain, cvk=cvk, evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc, trainML = 1)
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
}
else {
score = eval(evaluationFunc, list(X=eXtrain, Y=Ytrain, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = 1))
score = eval(evaluationFunc, list(X=eXtrain, Y=Ytrain, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
}

dirtyScore = as.scalar(score[1, 1])
Expand Down
8 changes: 5 additions & 3 deletions scripts/pipelines/properties/param.csv
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@ normalize,0,0,0,0,0,0,,,,,,,,,,,,
imputeByMean,0,1,0,0,0,2,,,,,,,,,,,,
imputeByMedian,0,1,0,0,0,2,,,,,,,,,,,,
mice,2,1,0,0,1,2,INT,FP,1,3,0.5,1,,,,,,
abstain,1,0,0,1,1,2,FP,0.6,0.9,,,,,,,,,
flipLabels,1,0,0,1,1,2,FP,0.6,0.9,,,,,,,,,
abstain,1,0,0,1,1,2,FP,0.6,0.8,,,,,,,,,
flipLabels,2,0,0,1,1,2,FP,INT,0.6,0.9,1,20,,,,,,
SMOTE,1,1,0,1,1,2,INT,100,500,,,,,,,,,
m_pca,3,0,0,0,0,2,INT,BOOL,BOOL,100,200,0,1,0,0,,,
ppca,4,0,0,0,1,2,INT,INT,FP,FP,100,200,1,10,1.00E-09,1.00E-06,1.00E-02,1.00E-01
fillDefault,0,0,0,0,0,2,,,,,,,,,,,,
dummycoding,0,1,0,0,0,2,,,,,,,,,,,,
frequencyEncoding,0,1,0,0,0,2,,,,,,,,,,,,
WoE,0,1,0,1,0,2,,,,,,,,,,,,
scale,2,0,0,0,0,0,BOOL,BOOL,0,1,0,1,,,,,,
forward_fill,1,0,0,0,1,2,BOOL,0,1,,,,,,,,,
imputeByFd,1,0,1,0,0,1,FP,0.6,0.9,,,,,,,,,
underSampling,1,0,0,1,0,2,FP,0.6,0.99,,,,,,,,,
wtomeklink,0,0,0,1,0,2,,,,,,,,,,,,
underSampling,1,0,0,1,0,2,FP,0.1,0.6,,,,,,,,,
6 changes: 3 additions & 3 deletions scripts/pipelines/properties/primitives.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ED,MVI,OTLR,EC,SCALE,CI,DUMMY,DIM
imputeByFd,imputeByMean,winsorize,imputeByMean,scale,abstain,dummycoding,m_pca
outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,wtomeklink,,ppca
outlierByIQR,mice,outlierByIQR,fillDefault,,SMOTE,,
outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,wtomeklink,frequencyEncoding,ppca
outlierByIQR,mice,outlierByIQR,fillDefault,,SMOTE,WoE,
,fillDefault,,,,flipLabels,,
,imputeByFd,,,,,,
,imputeByFd,,,,underSampling,,
,forward_fill,,,,,,
2 changes: 1 addition & 1 deletion scripts/pipelines/properties/testPrimitives.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
ED,MVI,OTLR,EC,SCALE,CI,DUMMY,DIM
,imputeByMean,winsorize,imputeByMean,scale,abstain,dummycoding,m_pca
outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,wtomeklink,,ppca
outlierBySd,imputeByMedian,outlierBySd,imputeByMedian,,underSampling,frequencyEncoding,ppca
6 changes: 3 additions & 3 deletions src/test/scripts/functions/pipelines/applyEvaluateTest.dml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ trainData = F[1:split,]
testData = F[split+1:nrow(F),]


result = applyAndEvaluate(trainData, testData, metaInfo, lg, pip[1,], hp[1,], "evalML", matrix("1 1e-3 1e-9 100", rows=1, cols=4), TRUE, FALSE)
result = applyAndEvaluate(trainData, testData, metaInfo, lg, pip[1,], hp[1,], "evalML", evalHp, TRUE, FALSE)

header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
result = as.frame(result)
Expand All @@ -75,13 +75,13 @@ write(result, $6)
# UDF for evaluation
# choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally )
evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
Matrix[Double] evalFunHp, Boolean trainML = FALSE)
Matrix[Double] evalFunHp)

return(Matrix[Double] accuracy)
{

beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
maxi=as.scalar(evalFunHp[1,4]), maxii=50, verbose=FALSE);
maxi=1000, maxii=100, verbose=FALSE);
[prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
a = getAccuracy(Ytest, yhat, TRUE)
print("accuracy: "+ accuracy+", accuracy weighted: "+a)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
77.42222222222223
77.15555555555555
76.97777777777777
93.69369369369369
93.69369369369369
93.69369369369369

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1 +1 @@
74.13333333333333
90.990990990991
Loading