Search code examples
rfor-loopmatrixresamplingmarkov-chains

Adding and removing columns in a matrix within a for loop


I have this 4 state Markov Chain in R:

enter image description here

I wrote the following (100 iteration) simulation that simulates 100 hospital patients (all patients start in state 1) and tracks which state they are in during each iteration of the simulation. At the start of each iteration, we see which state each patient is in as of the last iteration ... and then move the patient according to the transition matrix probabilities (do this for each patient). E.g.

  • Iteration 1: patient1 moves from state1 to state2, patient2 moves from state1 to state3, ... patient100 movies from state1 to state1
  • Iteration 2: patient1 moves from state2 to state1, patient2 moves from state3 to state4 etc.

I then repeated this entire process 1000 times and plotted the results:

   library(reshape2)
library(ggplot2)
library(dplyr)
library(tidyr)

# transition matrix
P <- matrix(c(1/3, 1/3, 1/3, 0,
              1/2, 1/2, 0,   0,
              0,   0,   1/2, 1/2,
              0,   0,   0,   1),
            nrow = 4, ncol = 4, byrow = TRUE)

n_iterations <- 100
n_people <- 100
n_simulations <- 100

run_simulation <- function(sim_number) {
    state_matrix <- matrix(0, nrow = n_iterations, ncol = n_people)
    state_matrix[1, ] <- 1  # All start in state 1
    
    for (i in 2:n_iterations) {
        for (person in 1:n_people) {
            current_state <- state_matrix[i-1, person]
            state_matrix[i, person] <- sample(1:4, 1, prob = P[current_state, ])
        }
    }
    
    state_counts <- apply(state_matrix, 1, function(x) table(factor(x, levels = 1:4)))
    percentages <- as.data.frame(t(state_counts) / n_people * 100)  # Convert to percentages (0-100)
    
    # Create patient summary
    patient_summary <- data.frame(
        patient = 1:n_people,
        simulation = sim_number,
        state1 = rowSums(state_matrix == 1),
        state2 = rowSums(state_matrix == 2),
        state3 = rowSums(state_matrix == 3),
        state4 = rowSums(state_matrix == 4)
    )
    
    list(percentages = percentages, patient_summary = patient_summary)
}

# Run simulations
set.seed(123)  # For reproducibility
all_simulations <- vector("list", n_simulations)
all_patient_summaries <- vector("list", n_simulations)

for (sim in 1:n_simulations) {
    simulation_result <- run_simulation(sim)
    all_simulations[[sim]] <- simulation_result$percentages
    all_patient_summaries[[sim]] <- simulation_result$patient_summary
    if (sim %% 10 == 0) {  # Print progress every 10 simulations
        cat(sprintf("Completed simulation %d of %d\n", sim, n_simulations))
    }
}

# Combine all patient summaries into one data frame
final_patient_summary <- do.call(rbind, all_patient_summaries)

# Reorder columns to put patient and simulation first
final_patient_summary <- final_patient_summary[, c("patient", "simulation", "state1", "state2", "state3", "state4")]

# calculate the average percentages across all simulations
average_percentages <- Reduce(`+`, all_simulations) / n_simulations
average_percentages$iteration <- 1:n_iterations

# function to calculate confidence intervals
calculate_ci <- function(x) {
    quantile(x, probs = c(0.05, 0.95))
}

# confidence intervals
ci_data <- lapply(1:n_iterations, function(i) {
    data.frame(
        iteration = i,
        state = as.character(1:4),
        lower = sapply(1:4, function(j) calculate_ci(sapply(all_simulations, function(sim) sim[i, j]))[1]),
        upper = sapply(1:4, function(j) calculate_ci(sapply(all_simulations, function(sim) sim[i, j]))[2])
    )
})
ci_data <- do.call(rbind, ci_data)

# reshape 
average_percentages_long <- average_percentages %>%
    pivot_longer(cols = as.character(1:4),
                 names_to = "state",
                 values_to = "percentage")

plot_data <- merge(average_percentages_long, ci_data, by = c("iteration", "state"))

ggplot(plot_data, aes(x = iteration, y = percentage, color = state, fill = state)) +
    geom_line() +
    geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
    scale_color_manual(values = c("1" = "blue", "2" = "red", "3" = "green", "4" = "purple"),
                       labels = paste("State", 1:4)) +
    scale_fill_manual(values = c("1" = "blue", "2" = "red", "3" = "green", "4" = "purple"),
                      labels = paste("State", 1:4)) +
    labs(x = "Iteration", y = "Average Percentage of People", color = "State", fill = "State") +
    ggtitle("Average Percentage of People in Each State Over Time (1000 Simulations)") +
    theme_minimal() +
    scale_y_continuous(limits = c(0, 100))

print(head(final_patient_summary))

enter image description here

Objective: Now, what I am trying to do is the following. At the start of each iteration:

  • I want to introduce a random number of patients (e.g. rpois(1, lambda = 10)) into this simulation each iteration (i.e. into state 1)
  • remove all patients in state 4 from this simulation.

Conceptually, this represents new patients entering the hospital and existing patient exiting the hospital.

Can someone please show me how I can change my work to account for this? I am not sure how to change the for loop to account for this. Perhaps this approach can be useful ? Adding and Removing People within a For Loop in R

Thank you everyone!

  • PS:

R code for visualizing the Markov Chain

grViz("
  digraph {
    rankdir=LR;
    node [shape = circle]
    1 [label = '1', style = filled, fillcolor = green]
    2 [label = '2']
    3 [label = '3']
    4 [label = '4', style = filled, fillcolor = red]
    
    1 -> 1 [label = '1/3']
    1 -> 2 [label = '1/3']
    1 -> 3 [label = '1/3']
    2 -> 1 [label = '1/2']
    2 -> 2 [label = '1/2']
    3 -> 3 [label = '1/2']
    3 -> 4 [label = '1/2']
    4 -> 4 [label = '1']
  }
")

Solution

  • We could add patients at each step, and transition patients already in state 4 to a dummy state, let's call it 5 (see this post for more details: Recording the First Time an Event Happens). We also need to modify the probability matrix accordingly.

    library(reshape2)
    library(ggplot2)
    library(dplyr)
    library(tidyr)
    
    # transition matrix
    P <- matrix(c(1/3, 1/3, 1/3, 0  , 0, 
                  1/2, 1/2, 0,   0  , 0, 
                  0,   0,   1/2, 1/2, 0, 
                  0,   0,   0,   0  , 1,
                  0,   0,   0,   0  , 1), 
                nrow = 5, ncol = 5, byrow = TRUE)
    
    n_iterations <- 100
    n_init_people <- 100
    n_simulations <- 100
    

    Then, within run_simulation functions, we need to add patients using a random function at the beginning of each iteration. However, we should set their state to 1 at that step and not at the iteration = 1 (hence, state_matrix_new[i-1, ] <- 1).

    Moreover, we need to make some changes to account for the new state and the new patients before we can calculate the percentages for each stage-iterations specifically. Basically, any patient with a state 0 or 5 should be "filtered out".

    run_simulation <- function() {
      state_matrix <- matrix(0, nrow = n_iterations, ncol = n_init_people)
      state_matrix[1, ] <- 1  # All start in state 1
      
      for (i in 2:n_iterations) {
        
        ## new patients
        n_people_rnd <- rpois(1, lambda = 10)
        state_matrix_new <- matrix(0, nrow = n_iterations, ncol = n_people_rnd)
        state_matrix_new[i-1, ] <- 1  # All start in state 1
        ## add them to the state matrix
        state_matrix <- cbind(state_matrix, state_matrix_new)
        
        for (person in 1:ncol(state_matrix)) {
          
          current_state <- state_matrix[i-1, person]
          state_matrix[i, person] <- sample(1:5, 1, prob = P[current_state, ])
        }
      }
      
      state_counts <- apply(state_matrix, 1, 
                            function(x) table(factor(x, levels = 1:4)))
      # Convert to percentages (0-100)
      ## filtering out 0s (new patients not in previous iterations)
      ## filtering out 5s (discharged patients)
      as.data.frame(t(state_counts) / rowSums(state_matrix != 5 & 
                                              state_matrix != 0) * 100)  
    }
    
    # Run simulations 
    set.seed(123)  # For reproducibility
    all_simulations <- vector("list", n_simulations)
    
    for (sim in 1:n_simulations) {
      all_simulations[[sim]] <- run_simulation()
      if (sim %% 10 == 0) {  # Print progress every 10 simulations
        cat(sprintf("Completed simulation %d of %d\n", sim, n_simulations))
      }
    }
    #> Completed simulation 10 of 100
    #> Completed simulation 20 of 100
    #> Completed simulation 30 of 100
    #> Completed simulation 40 of 100
    #> Completed simulation 50 of 100
    #> Completed simulation 60 of 100
    #> Completed simulation 70 of 100
    #> Completed simulation 80 of 100
    #> Completed simulation 90 of 100
    #> Completed simulation 100 of 100
    

    Next we can proceed with the same data wrangling/processing steps.

    # calculate the average percentages across all simulations
    average_percentages <- Reduce(`+`, all_simulations) / n_simulations
    average_percentages$iteration <- 1:n_iterations
    
    # function to calculate confidence intervals
    calculate_ci <- function(x) {
      quantile(x, probs = c(0.05, 0.95))
    }
    
    #  confidence intervals
    ci_data <- lapply(1:n_iterations, function(i) {
      data.frame(
        iteration = i,
        state = as.character(1:4),
        lower = sapply(1:4, function(j) calculate_ci(sapply(all_simulations,
                                                     function(sim) sim[i, j]))[1]),
        upper = sapply(1:4, function(j) calculate_ci(sapply(all_simulations,
                                                     function(sim) sim[i, j]))[2])
      )
    })
    ci_data <- do.call(rbind, ci_data)
    
    # reshape 
    average_percentages_long <- average_percentages %>%
      pivot_longer(cols = as.character(1:4), 
                   names_to = "state", 
                   values_to = "percentage")
    
    plot_data <- merge(average_percentages_long, ci_data, 
                        by = c("iteration", "state"))
    

    And finally plot:

    ggplot(plot_data, 
            aes(x = iteration, y = percentage, 
                color = state, fill = state)) +
      geom_line() +
      geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
      scale_color_manual(values = c("1" = "blue", "2" = "red", 
                                    "3" = "green", "4" = "purple"),
                         labels = paste("State", 1:4)) +
      scale_fill_manual(values = c("1" = "blue", "2" = "red", 
                                   "3" = "green", "4" = "purple"),
                        labels = paste("State", 1:4)) +
      labs(x = "Iteration", y = "Average Percentage of People", 
           color = "State", fill = "State") +
      ggtitle(
      "Average Percentage of People in Each State Over Time (1000 Simulations)") +
      theme_minimal() +
      scale_y_continuous(limits = c(0, 100))
    

    Here's the updated diagram to better visualize the concepts used above:

    DiagrammeR::grViz("
      digraph {
        rankdir=LR;
        node [shape = circle]
        0 [label = 'New', style = filled, fillcolor = grey]
        1 [label = '1', style = filled, fillcolor = green]
        2 [label = '2']
        3 [label = '3']
        4 [label = '4', style = filled, fillcolor = yellow]
        5 [label = '5', style = filled, fillcolor = red]
        
        0 -> 1 [label = 'Admitted']
        1 -> 1 [label = '1/3']
        1 -> 2 [label = '1/3']
        1 -> 3 [label = '1/3']
        2 -> 1 [label = '1/2']
        2 -> 2 [label = '1/2']
        3 -> 3 [label = '1/2']
        3 -> 4 [label = '1/2']
        4 -> 5 [label = '1 (discharged)']
        5 -> 5 [label = '1']
      }
    ")
    

    Created on 2024-09-09 with reprex v2.0.2