Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions scripts/builtin/bandit.dml
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
{
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=op,
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList2, hyperParameters=hp_matrix, hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(op))
if(max(eYtrain) == min(eYtrain))
print("Y contains only one class")
else if(changesByPip < ref)
Expand Down Expand Up @@ -540,7 +540,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
{
[trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=as.frame(pipList['ph']),
Xtrain=trainX, Ytrain=trainy, Xtest= testX, Ytest=testy, metaList=metaList, hyperParameters=as.matrix(pipList['hp']), hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(as.frame(pipList['ph'])))
#TODO double check why this is necessary
mincol = min(ncol(cvChanges),ncol(changesByOp))
cvChanges[cvk,1:mincol] = changesByOp[,1:mincol];
Expand All @@ -557,7 +557,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
allChanges = min(allChanges)
changesByOp = colMaxs(cvChanges)
accuracy = mean(accuracyMatrix)
print("cv accuracy: "+toString(accuracy))
print("- cv accuracy: "+toString(accuracy))
}

pruningSignal = function(Matrix[Double] pipPre, Matrix[Double] pipNew, Matrix[Double] hp_matrix, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
Expand Down Expand Up @@ -670,7 +670,7 @@ run_with_hyperparamNested = function(Frame[Unknown] ph_pip, Integer r_i = 1, Mat
{
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=op,
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList2, hyperParameters=hp_matrix, hpForPruning=hpForPruning,
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(op))
if(max(eYtrain) == min(eYtrain))
print("Y contains only one class")
else if(changesByPip < ref)
Expand Down Expand Up @@ -727,4 +727,4 @@ return(Boolean execute)
}
}
execute = !(changeCount > 0)
}
}
10 changes: 7 additions & 3 deletions scripts/builtin/executePipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@

f_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Matrix[Double] Ytrain,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose)
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose,
Integer startInd, Integer endInd)
return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest,
Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double changesAll, List[Unknown] internalStates)
{
Expand All @@ -68,8 +69,11 @@ f_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
print("pipeline in execution "+toString(pipeline))
print("pipeline hps "+toString(hyperParameters))
}
for(i in 1:ncol(pipeline)) {

# for(i in 1:ncol(pipeline)) {
for(i in startInd:endInd) {
op = as.scalar(pipeline[1,i])
print("-- Applying Primitive: "+op);
applyOp = toString(as.scalar(applyFunc[1,i]))
Xclone = Xtrain
XtestClone = Xtest
Expand Down Expand Up @@ -476,4 +480,4 @@ return (Matrix[Double] cmin, Matrix[Double] cmax)
cmin[1, i] = min(vec)
cmax[1, i] = max(vec)
}
}
}
80 changes: 59 additions & 21 deletions scripts/builtin/fit_pipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ source("scripts/builtin/bandit.dml") as bandit;

f_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean allCombinations=FALSE)
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
externalState = list()
Expand Down Expand Up @@ -92,28 +92,66 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)

print("Getting training score using CV")
[trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
print("train score cv: "+toString(trainScore))
print("- train score cv: "+toString(trainScore))


# # # now test accuracy
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)

if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")
print("----------------------------");
print("Getting test accuracy")
primitives = matrix(0, rows=0, cols=0);
if (allCombinations) {
# Count number of subsets of consecutive primitives
totCount = 0;
n = ncol(pip);
for (i in 1:n) {
for (j in i:n)
totCount = totCount + 1;
}
# List start and end indices of all those subsets
primitives = matrix(0, rows=totCount, cols=2);
r = 1;
for (start in 1:n) {
for (end in start:n) {
primitives[r,1] = start;
primitives[r,2] = end;
r = r + 1;
}
}
}
else {
# Include all primitives
primitives = matrix(0, rows=1, cols=2);
primitives[1,1] = 1;
primitives[1,2] = ncol(pip);
}

# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
# trainAccuracy = as.scalar(score[1, 1])

score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])

scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
# scores[1, 2] = trainAccuracy
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
for (r in 1:nrow(primitives)) {
startInd = as.scalar(primitives[r,1]);
endInd = as.scalar(primitives[r,2]);
# # # now test accuracy
[eXtrain_cl, eYtrain_cl, eXtest_cl, eYtest_cl, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=startInd, endInd=endInd)

if(max(eYtrain_cl) == min(eYtrain_cl))
stop("Y contains only one class")

# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
# trainAccuracy = as.scalar(score[1, 1])

score = eval(evaluationFunc, list(X=eXtrain_cl, Y=eYtrain_cl, Xtest=eXtest_cl, Ytest=eYtest_cl, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])

scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
# scores[1, 2] = trainAccuracy
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain_cl, eYtrain_cl)
cleanTest = cbind(eXtest, eYtest)

header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
result = as.frame(scores)
writeRes = rbind(header, result)
print(toString(writeRes))
}
}
4 changes: 2 additions & 2 deletions scripts/builtin/sliceLineExtract.dml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

m_sliceLineExtract = function(Matrix[Double] X, Matrix[Double] e,
Matrix[Double] TK, Matrix[Double] TKC, Integer k2 = -1)
return(Matrix[Double] Xtk, Matrix[Double] etk)
return(Matrix[Double] Xtk, Matrix[Double] etk, Matrix[Double] I)
{
# check valid parameters
if( k2 > nrow(TK) )
Expand All @@ -50,7 +50,7 @@ m_sliceLineExtract = function(Matrix[Double] X, Matrix[Double] e,
# extract first k2 slices from X and e
I = matrix(0, k2, nrow(X));
parfor(i in 1:k2) {
I[i,] = t(rowSums(X == TK[i,]) == sum(TK[i,]))
I[i,] = t(rowSums(X == TK[i,]) == sum(TK[i,] > 0))
}
I = t(colSums(I)); #union

Expand Down
7 changes: 4 additions & 3 deletions scripts/builtin/topk_cleaning.dml
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ f_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
# apply sampling on training data for pipeline enumeration
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2) # &
[eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount)
else
# if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2) # &
# [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount)
# else
[eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE)
t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");

Expand Down Expand Up @@ -112,6 +112,7 @@ f_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
metaList['distY'] = dist

print("-- Cleaning - Enum Logical Pipelines: ");
print("---- Data Dimension before Cleaning: "+ nrow(eXtrain) + ", " + ncol(eXtrain));
[bestLogical, bestHp, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter, metaList = metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
Expand Down
3 changes: 2 additions & 1 deletion scripts/pipelines/scripts/utils.dml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D
sampledY = eY
sampled = floor(nrow(eX) * ratio)

if(sampled > MIN_SAMPLE & ratio != 1.0)
# if(sampled > MIN_SAMPLE & ratio != 1.0)
if(ratio != 1.0)
{
sampleVec = sample(nrow(eX), sampled, FALSE, 23)
P = table(seq(1, nrow(sampleVec)), sampleVec, nrow(sampleVec), nrow(eX))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;

public class BuiltinFitPipelineTest extends AutomatedTestBase {
Expand All @@ -42,7 +43,8 @@ public class BuiltinFitPipelineTest extends AutomatedTestBase {
public void setUp() {
addTestConfiguration(TEST_NAME1,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"}));
}


@Ignore
@Test
public void testEvalPipClass() {
evalPip(0.8, "FALSE", INPUT+"/classification/", Types.ExecMode.SINGLE_NODE);
Expand Down
6 changes: 3 additions & 3 deletions src/test/scripts/functions/builtin/sliceLineRealData.dml
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ acc = lmPredictStats(yhat, y, TRUE);
e = (y-yhat)^2;

# model debugging via sliceline
[TK,TKC,D] = sliceLine(X=X, e=e, k=4, alpha=0.95, minSup=32, tpBlksz=16, verbose=TRUE)
[TK,TKC,D] = sliceLine(X=X, e=e, k=4, alpha=0.95, minSup=32, tpBlksz=16, verbose=FALSE)
tfspec2 = "{ ids:true, recode:[1,2,5], bin:[{id:3, method:equi-width, numbins:10},{id:4, method:equi-width, numbins:10}]}"
XYZ = sliceLineDebug(TK=TK, TKC=TKC, tfmeta=meta, tfspec=tfspec2)
[Xtk,etk] = sliceLineExtract(X=X, e=e, TK=TK, TKC=TKC, k2=3);
[Xtk,etk,I] = sliceLineExtract(X=X, e=e, TK=TK, TKC=TKC, k2=3);

acc = acc[3,1];
val = as.matrix((sum(TKC[1,4]) >= nrow(Xtk)) & (nrow(Xtk) == nrow(etk)))
val = as.matrix((sum(TKC[1,4]) <= nrow(Xtk)) & (nrow(Xtk) == nrow(etk)))

write(acc, $3);
write(val, $4);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ hp = matrix("0.000 0.000 1.000 0.000 0.000 0.000 2.000
1.000 0.786 0.000 0.000 1.000 1.000 2.000", rows=2, cols=7)
print("X unchanged "+sum(eXtrain))
[eX, Y, Xtest, Ytest, tr] = executePipeline(pip, eXtrain, eYtrain, eXtest, eYtest, metaList, hp,
as.matrix(0), as.matrix(0), flagsCount, TRUE, FALSE)
as.matrix(0), as.matrix(0), flagsCount, TRUE, FALSE, 1, ncol(pip))


[eXtrain, imp] = imputeByMean(eXtrain, mask)
Expand Down
2 changes: 1 addition & 1 deletion src/test/scripts/functions/pipelines/fit_pipelineTest.dml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ testData = F[split+1:nrow(F),]


print("pipeline: "+toString(pip[1]))
[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE)
[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE, FALSE)
eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE)


Expand Down