Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions scripts/builtin/topk_cleaning.dml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ source("scripts/builtin/bandit.dml") as bandit;
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives,
Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20,
Integer max_iter = 10, Double lq = 0.1, Double uq=0.7, Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2,
Boolean isLastLabel = TRUE,
Boolean isLastLabel = TRUE, Integer rowCount = 3700,
Boolean correctTypos=FALSE, Boolean enablePruning = FALSE)
return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores,
Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc)
Expand All @@ -43,7 +43,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
# prepare meta data
# # keeping the meta list format if we decide to add more stuff in metadata
[schema, mask, fdMask, maskY] = prepareMeta(dataTrain, metaData)
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("null"), distY=0)
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("null"), distY=0, minFold=0)
t2 = time(); print("-- Cleaning - Prepare Metadata: "+(t2-t1)/1e9+"s");

# separate the label
Expand Down Expand Up @@ -79,8 +79,8 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
# apply sampling on training data for pipeline enumeration
# TODO why recoding/sampling twice (within getDirtyScore)
print("---- class-stratified sampling of feature matrix w/ f="+sample);
if(sum(mask) > ncol(mask)/2 & nrow(eYtrain) >= 10000 & sample == 1.0)
[eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, 3500)
if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2) # &
[eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount)
else
[eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE)
t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
Expand Down
65 changes: 34 additions & 31 deletions scripts/pipelines/scripts/utils.dml
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,24 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D
doErrorSample = function(Matrix[Double] eX, Matrix[Double] eY, Double lq, Double uq, Integer rowCount = 3500)
return (Matrix[Double] sampledX, Matrix[Double] sampledY)
{
print("initial number of rows: " +nrow(eX))
print("quantiles: "+lq+" "+uq)
# # # prepare feature vector for NB
beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE);
[trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)
print("Error filtering")
if(nrow(eY) < rowCount)
filterMask = matrix(1, rows=nrow(eY), cols=1)
else {
# # # prepare feature vector for NB
beta = multiLogReg(X=eX, Y=eY, icpt=1, reg=1e-3, tol=1e-6, maxi=20, maxii=20, verbose=FALSE);
[trainProbs, yhat, accuracy] = multiLogRegPredict(eX, beta, eY, FALSE)


print("applying error filter")
filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
delta = 0.001
while(sum(filterMask) < rowCount & nrow(eY) > rowCount)
{
lq = lq + delta
uq = uq - delta
print("applying error filter")
filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
delta = 0.001
while(sum(filterMask) < rowCount & nrow(eY) > rowCount)
{
lq = lq + delta
uq = uq - delta
filterMask = rowMaxs(trainProbs) < quantile(rowMaxs(trainProbs), lq) | rowMaxs(trainProbs) > quantile(rowMaxs(trainProbs), uq)
}
}
sampledX = removeEmpty(target = eX, margin = "rows", select=filterMask)
sampledY = removeEmpty(target = eY, margin = "rows", select=filterMask)
Expand Down Expand Up @@ -205,18 +208,18 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
}
# # step 7 convert date to decimal
dateColIdx = as.matrix(0)
isDate = map(data[1:10], "x -> UtilFunctions.isDateColumn(x)")
isDate = replace(target = as.matrix(isDate), pattern = NaN, replacement = 0)
isDate = (colMaxs(isDate)) & as.matrix(schema == frame("STRING", rows=1, cols=ncol(schema)))
if(sum(isDate) > 0) {
print(prefix+" changing date to timestamp")
dateColIdx = removeEmpty(target = isDate * t(seq(1, ncol(isDate))), margin="cols")
for(i in 1:ncol(dateColIdx))
{
idx = as.scalar(dateColIdx[i])
data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2)
}
}
# isDate = map(data[1:10], "x -> UtilFunctions.isDateColumn(x)")
# isDate = replace(target = as.matrix(isDate), pattern = NaN, replacement = 0)
# isDate = (colMaxs(isDate)) & as.matrix(schema == frame("STRING", rows=1, cols=ncol(schema)))
# if(sum(isDate) > 0) {
# print(prefix+" changing date to timestamp")
# dateColIdx = removeEmpty(target = isDate * t(seq(1, ncol(isDate))), margin="cols")
# for(i in 1:ncol(dateColIdx))
# {
# idx = as.scalar(dateColIdx[i])
# data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2)
# }
# }
# TODO add deduplication
print(prefix+" deduplication via entity resolution");

Expand Down Expand Up @@ -261,11 +264,11 @@ return(Frame[Unknown] data)
}
}
# # step 7 convert date to decimal
if(sum(dateColIdx) > 0) {
for(i in 1:ncol(dateColIdx))
{
idx = as.scalar(dateColIdx[i])
data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2)
}
}
# if(sum(dateColIdx) > 0) {
# for(i in 1:ncol(dateColIdx))
# {
# idx = as.scalar(dateColIdx[i])
# data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2)
# }
# }
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0
forward_fill,winsorizeApply,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0
forward_fill,winsorizeApply,imputeByMedianApply,NA,winsorizeApply,forward_fill,imputeByMeanApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0
outlierBySdApply,winsorizeApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
normalizeApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
imputeByMeanApply,outlierBySdApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
73.731884057971
73.731884057971
73.731884057971
74.87179487179488
74.87179487179488
74.87179487179488
Original file line number Diff line number Diff line change
@@ -1 +1 @@
61.050724637681164
74.87179487179488
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64.0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,1.0,0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,1.0,1.0,0,0,0,0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,2.0,0.05,0.95,0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21.0,0,0,0,0,0,0,0,1.0,0.75,0,0,1.0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27.0,0,0,0,0,1.0,0,0,0,2.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0
forward_fill,winsorize,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0
forward_fill,winsorize,imputeByMedian,tomeklink,winsorize,forward_fill,imputeByMean,dummycoding,0,0,0,0,0,0,0,0,0,0
outlierBySd,winsorize,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
normalize,abstain,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
imputeByMean,outlierBySd,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0