I have tried to make a reproducible example although I do not succeed.
I try to do target encoding with the function below from rbloggers:
function(x, y, sigma = NULL) {
d <- aggregate(y, list(factor(x, exclude = NULL)), mean, na.rm = TRUE)
m <- d[is.na(as.character(d[, 1])), 2]
l <- d[, 2]
names(l) <- d[, 1]
l <- l[x]
l[is.na(l)] <- m
if (!is.null(sigma)) {
l <- l * rnorm(length(l), mean = 1, sd = sigma)
}
l
}
It runs fine on the example data:
data1 <- data.frame(factor_var = c("a", "a", "a", "a", "b", "b"),
target = c(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE))
which gives:
> l
a a a a b b
0.5 0.5 0.5 0.5 0.0 0.0
Although when I apply it to my own dataset it gives the error below:
Error in l[is.na(l)] <- m : replacement has length zero
The error is thrown by l[is.na(l)]
m is " numeric (empty) " although this seems not the cause.
Thanks a lot!
my own data:
structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), levels = c("Albania_Albanian Cup",
"Albania_Superliga", "Argentina_Copa Argentina", "Argentina_Copa de la Superliga"), class = "factor")
The issue seems to be that the factor data may have some levels for which the data is not present, thus
l <- l[x]
> l
Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina
0.3333333 0.3333333 0.3333333 0.3333333 0.3333333 0.3333333
Argentina_Copa Argentina <NA> <NA> <NA>
0.3333333 NA NA NA
creates some NA columns for those unused levels. Instead, we may have to use droplevels
to remove the unused levels
f2 <- function(x, y, sigma = NULL) {
x <- droplevels(x)
d <- aggregate(y, list(factor(x, exclude = NULL)), mean, na.rm = TRUE)
m <- d[is.na(as.character(d[, 1])), 2]
l <- d[, 2]
names(l) <- d[, 1]
l <- l[x]
l[is.na(l)] <- m
if (!is.null(sigma)) {
l <- l * rnorm(length(l), mean = 1, sd = sigma)
}
l
}
-testing
> f2(x, y)
Albania_Superliga Albania_Superliga Albania_Superliga Albania_Superliga Albania_Superliga Albania_Superliga
0.7142857 0.7142857 0.7142857 0.7142857 0.7142857 0.7142857
Albania_Superliga Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina
0.7142857 0.3333333 0.3333333 0.3333333
# OP's original function
> f1(x, y)
Error in l[is.na(l)] <- m : replacement has length zero
OR another option is to match with character converted vector instead of factor
> l[as.character(x)]
Albania_Superliga Albania_Superliga Albania_Superliga Albania_Superliga Albania_Superliga Albania_Superliga
0.7142857 0.7142857 0.7142857 0.7142857 0.7142857 0.7142857
Albania_Superliga Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina
0.7142857 0.3333333 0.3333333 0.3333333
> l[x]
Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina Argentina_Copa Argentina
0.3333333 0.3333333 0.3333333 0.3333333 0.3333333 0.3333333
Argentina_Copa Argentina <NA> <NA> <NA>
0.3333333 NA NA NA
f3 <- function(x, y, sigma = NULL) {
d <- aggregate(y, list(factor(x, exclude = NULL)), mean, na.rm = TRUE)
m <- d[is.na(as.character(d[, 1])), 2]
l <- d[, 2]
names(l) <- d[, 1]
l <- l[as.character(x)]
l[is.na(l)] <- m
if (!is.null(sigma)) {
l <- l * rnorm(length(l), mean = 1, sd = sigma)
}
l
}