I'm fitting logistic regression and XGBoost models and have converted all variables into their WOE (weight of evidence) values.
This was done for the training data set.
Now I would like to validate my model on my validation & out-of-sample test data.
The WOE values were produced by binning each variable with the Hmisc::cut2
function and then applying InformationValue::WOE:
# Bin the training values of MAILING_DAYS with Hmisc::cut2 and store the
# resulting factor in the working data set.
data.work[["MAILING_DAYS"]] <- cut2(
  training$MAILING_DAYS,
  g = 20,
  cuts = c(16, 23, 27)
)
# Replace every bin by its WOE (InformationValue::WOE), treating SUCCESS == 1
# as the "good" outcome.
data.work.woe[["MAILING_DAYS"]] <- WOE(
  data.work$MAILING_DAYS,
  data.work$SUCCESS,
  valueOfGood = 1
)
The information available per variable (bins, counts, WOE and IV) looks like this:
# Show the per-bin table for MAILING_DAYS (SUCCESS == 1 counts as "good").
WOETable(
  data.work[["MAILING_DAYS"]],
  data.work[["SUCCESS"]],
  valueOfGood = 1
)
CAT GOODS BADS TOTAL PCT_G PCT_B WOE IV
1 [ 0,16) 4827 89389 94216 0.58844325 0.4983581 0.16616157 0.014968688
2 [16,23) 1750 41383 43133 0.21333658 0.2307169 -0.07832034 0.001361233
3 [23,27) 987 27323 28310 0.12032183 0.1523301 -0.23588003 0.007550120
4 [27,30] 639 21272 21911 0.07789833 0.1185948 -0.42030843 0.017105085
# Bin labels created by cut2 for MAILING_DAYS.
levels(data.work[["MAILING_DAYS"]])
[1] "[ 0,16)" "[16,23)" "[23,27)" "[27,30]"
I tried something like this:
# Build a lookup table that turns the WOE bins learned on the training data
# into threshold rules, one row per bin:
#   NAME  - name of the variable
#   COND  - "<"  (value is below VALUE, the upper bound of the bin),
#           ">=" (last bin: value is at least VALUE, its lower bound),
#           "==" (bin that collects the missing values)
#   VALUE - threshold that belongs to COND (NA for the missing-value bin)
#   WOE   - WOE of the bin
# The rows of one variable are meant to be evaluated top-down like an
# if / else-if chain, so that the rules can be applied to new data.
# Note: this data frame shares its name with the function WOE(); R still finds
# the function when it is called, because calls skip non-function objects.
WOE <- data.frame(NAME = character(),
                  COND = character(),
                  VALUE = numeric(),
                  WOE = numeric(),
                  stringsAsFactors = FALSE)

# Lower bound of a bin label: "[16,23)" -> 16; a single-value label such as
# "16" (no comma) is returned as that number.
woe.lower.bound <- function(label) {
  comma <- regexpr(",", label, fixed = TRUE)
  if (comma == -1) {
    as.numeric(label)
  } else {
    as.numeric(substr(label, 2, comma - 1))
  }
}

# Upper bound of an interval label: "[16,23)" -> 23.
woe.upper.bound <- function(label) {
  comma <- regexpr(",", label, fixed = TRUE)
  as.numeric(substr(label, comma + 1, nchar(label) - 1))
}

a <- names(data.work)
WOE.CAT <- c()
WOE.WOE <- c()
k <- 1
# Column positions of the variables to process; widen the range to cover more
# than the single column used here.
for (i in c(4:4)) {
  temp.var <- a[i]
  # Compute the table once per variable instead of once per extracted column.
  woe.table <- WOETable(data.work[, temp.var], data.work$SUCCESS,
                        valueOfGood = 1)
  # CAT may be a factor; nchar() and substr() need plain character labels.
  WOE.CAT <- as.character(woe.table$CAT)
  WOE.WOE <- woe.table$WOE

  # Separate the missing-value bin from the bins that carry bounds, so that
  # the "last bin" is determined among the real bins only.
  is.missing.bin <- is.na(WOE.CAT) | WOE.CAT == "missing"
  real.bins <- which(!is.missing.bin)

  # Visit every bin (seq_along is also safe for an empty table).
  for (j in seq_along(WOE.CAT)) {
    if (is.missing.bin[j]) {
      # Checked first: the label has no comma and would otherwise be parsed
      # as a number.
      cond <- "=="
      value <- NA_real_
    } else if (j == real.bins[length(real.bins)]) {
      # Last real bin: everything from its lower bound upwards.
      cond <- ">="
      value <- woe.lower.bound(WOE.CAT[j])
    } else if (grepl(",", WOE.CAT[j], fixed = TRUE)) {
      # Interval bin: everything below its own upper bound.
      cond <- "<"
      value <- woe.upper.bound(WOE.CAT[j])
    } else {
      # Single-value bin: everything below the lower bound of the next bin.
      next.bin <- real.bins[match(j, real.bins) + 1]
      cond <- "<"
      value <- woe.lower.bound(WOE.CAT[next.bin])
    }
    WOE[k, "NAME"] <- temp.var
    WOE[k, "COND"] <- cond
    WOE[k, "VALUE"] <- value
    WOE[k, "WOE"] <- WOE.WOE[j]
    k <- k + 1
  }
}
Is there a way to transfer the WOE values learned on the training data to the validation data?
The naive way would be a long chain of if / else if statements,
but I have over 250 characteristics, so writing those by hand would take far too much time!
Thanks a lot for any help
I finally found a solution:
the scorecard package is helpful here. In my case I passed the breaks determined on the training data as breaks_list to scorecard::woebin and then created the WOE-transformed data with scorecard::woebin_ply.