I have data like this
# Example input: a data frame with a single factor column `data` (13 rows).
# Each entry is either a comma-separated string of counts or a text label.
df <- local({
  lvls <- c(
    "1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "M1yrtr",
    "Mitered"
  )
  # Row i holds level number codes[i].
  codes <- c(8L, 2L, 3L, 2L, 2L, 2L, 2L, 1L, 7L, 5L, 6L, 5L, 4L)
  data.frame(data = factor(lvls[codes], levels = lvls))
})
I am trying to calculate the following for each row.
For example, for the second row, which is
2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
I want to calculate this
# Worked example for the row "2, 1, 1, 1, 0, ...": every non-zero count k
# contributes the term -(k / n) * log2(k / n).
n <- 5
(-(2 / n) * log2(2 / n)) +
  (-(1 / n) * log2(1 / n)) +
  (-(1 / n) * log2(1 / n)) +
  (-(1 / n) * log2(1 / n))
for the third one which is
2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
I will calculate this
# Row "2, 2, 1, 0, ...": the count 2 occurs twice and the count 1 once.
# Uses the value of n assigned earlier.
2 * (-(2 / n) * log2(2 / n)) + (-(1 / n) * log2(1 / n))
so the output looks like this
# Desired output: the input column `data` plus a factor column `X` that holds
# the computed value for count rows and the unchanged label for text rows.
# NOTE(review): the numeric X values match the formula with n = 15 (for
# instance -(1/15) * log2(1/15) is about 0.2604594), whereas the worked
# examples in the text use n = 5 -- confirm which n is intended.
dfout <- local({
  data_lvls <- c(
    "1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0",
    "M1yrtr",
    "Mitered"
  )
  x_lvls <- c(
    "0.2604594", "1.03563", "1.168964", "2.020935", "2.077468", "2.204594",
    "M1yrtr", "Mitered"
  )
  # Row i holds level number *_codes[i] of the corresponding column.
  data_codes <- c(8L, 2L, 3L, 2L, 2L, 2L, 2L, 1L, 7L, 5L, 6L, 5L, 4L)
  x_codes <- c(8L, 3L, 2L, 3L, 3L, 3L, 3L, 1L, 7L, 6L, 4L, 6L, 5L)
  data.frame(
    data = factor(data_lvls[data_codes], levels = data_lvls),
    X = factor(x_lvls[x_codes], levels = x_lvls)
  )
})
In R, all basic operations (addition, subtraction, multiplication, logarithms, ...) are vectorized. This means that, for example, if x
is a vector, then log(x)
is just the componentwise log
function, or 1 / x
is just componentwise division.
Therefore, you can do the following:
# Parse the counts of the second row into a numeric vector. The column is
# selected explicitly so this keeps working after more columns are added to
# df, and the factor entry is converted to character before splitting.
x <- as.numeric(strsplit(as.character(df[2, 1]), ", ", fixed = TRUE)[[1]])
n <- 5
# Only positive counts are summed: 0 * log2(0) evaluates to NaN in R.
sum(-(x[x > 0] / n) * log2(x[x > 0] / n))
[1] 1.921928
If you want to apply this for all rows you can use the sapply
function like this:
#' Compute the sum of -(k / n) * log2(k / n) over the positive counts k
#' encoded in one entry of the data column.
#'
#' @param x A single entry (character or factor). If it contains a comma it is
#'   read as a ", "-separated list of counts; otherwise it is treated as a
#'   label and returned unchanged.
#' @param n Denominator used in k / n. Defaults to 5, the value used in the
#'   worked examples; the X column of `dfout` is reproduced with `n = 15`.
#' @return A length-one character vector: the computed sum converted with
#'   `as.character()`, or the label itself for entries without a comma.
myfun <- function(x, n = 5) {
  x <- as.character(x)
  # Entries without a comma are labels, not count lists; pass them through.
  if (!grepl(",", x, fixed = TRUE)) {
    return(x)
  }
  y <- as.numeric(strsplit(x, ", ", fixed = TRUE)[[1]])
  # Only positive counts are summed: 0 * log2(0) evaluates to NaN in R.
  p <- y[y > 0] / n
  as.character(sum(-p * log2(p)))
}
# Apply myfun to every entry of the first column. vapply is the type-checked
# counterpart of sapply: it enforces exactly one character string per entry.
df$newcol <- vapply(
  as.character(df[[1]]),
  myfun,
  character(1),
  USE.NAMES = FALSE
)