r matrix sample

Sample from a matrix by rows


I have the following matrix:

 Mat1 <- matrix(
   c(
     # Column 1 is the only column whose value changes from row to row
     # (listed five rows per line, 53 rows in total).
     "Procedure_B", "Procedure_C", "Procedure_B", NA, "Procedure_B",
     "Procedure_A", "Procedure_C", "Procedure_B", NA, "Procedure_B",
     NA, "Procedure_B", NA, NA, "Procedure_A",
     "Procedure_A", "Procedure_C", "Procedure_A", "Procedure_A", "Procedure_B",
     "Procedure_C", "Procedure_C", "Procedure_C", "Procedure_B", "Procedure_A",
     "Procedure_A", NA, NA, "Procedure_C", NA,
     "Procedure_C", NA, "Procedure_A", "Procedure_B", "Procedure_A",
     "Procedure_A", "Procedure_A", "Procedure_B", "Procedure_A", "Procedure_B",
     "Procedure_C", "Procedure_B", "Procedure_B", "Procedure_B", "Procedure_C",
     "Procedure_C", "Procedure_A", NA, NA, NA,
     NA, NA, NA,
     # Columns 2-4 each hold a single procedure in every row; column 5 is
     # entirely missing.
     rep(c("Procedure_A", "Procedure_B", "Procedure_C", NA), each = 53L)
   ),
   nrow = 53L,
   ncol = 5L
 )

I want to sample one value from each row, with the following constant probabilities:

P <- c(0.99, 0.002992, 0.003186, 0.003018, 0.000804)  # P[j]: probability of picking column j; sums to 1

i.e., for each row, pick exactly one of its 5 values, where the value in column j is chosen with probability P[j].

The expected output is 53 values.

I tried:

sample(
  Mat1,                       # all cells pooled into one vector (column-major)
  size = nrow(Mat1),          # draws nrow(Mat1) cells from that pool, not one per row
  prob = rep(P, nrow(Mat1)),  # P cycles cell by cell, so it does not line up with columns
  replace = TRUE
)

However, the outcome doesn't fit the expected distribution. I don't want to do this in a loop/apply, as my matrix can have many rows.

What is wrong with this command?


Solution

    • You can sample column indices, and extract values from the matrix according to the sampled column numbers.

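    • This method is vectorized: a single sample() call draws all the column indices, and matrix indexing extracts the values, so no loop or apply() is needed.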

    set.seed(1)  # fixed seed so the draw shown below is reproducible
    
    # Draw one column index per row; column j is picked with probability P[j].
    # seq_len() (rather than 1:n) keeps this correct for a matrix with zero rows.
    cols <- sample(
      seq_len(ncol(Mat1)),
      size = nrow(Mat1), replace = TRUE, prob = P
    )
    # Indexing with a two-column (row, column) matrix extracts one cell per row.
    Mat1[cbind(seq_len(nrow(Mat1)), cols)]
    
    #  [1] "Procedure_B" "Procedure_C" "Procedure_B" NA            "Procedure_B"
    #  [6] "Procedure_A" "Procedure_C" "Procedure_B" NA            "Procedure_B"
    # [11] NA            "Procedure_B" NA            NA            "Procedure_A"
    # [16] "Procedure_A" "Procedure_C" "Procedure_B" "Procedure_A" "Procedure_B"
    # [21] "Procedure_C" "Procedure_C" "Procedure_C" "Procedure_B" "Procedure_A"
    # [26] "Procedure_A" NA            NA            "Procedure_C" NA           
    # [31] "Procedure_C" NA            "Procedure_A" "Procedure_B" "Procedure_A"
    # [36] "Procedure_A" "Procedure_A" "Procedure_B" "Procedure_A" "Procedure_B"
    # [41] "Procedure_C" "Procedure_B" "Procedure_B" "Procedure_B" "Procedure_C"
    # [46] "Procedure_C" "Procedure_A" NA            NA            NA           
    # [51] NA            NA            NA