I have this data:
simulated_states = c("A", "E", "B", "B", "A", "C", "D", "A", "B", "D", "A", "D",
"D", "E", "D", "D", "D", "E", "A", "A", "A", "B", "A", "C", "C",
"D", "A", "A", "D", "A", "D", "A", "A", "A", "C", "C", "D", "A",
"C", "C", "D", "E", "C", "C", "C", "E", "B", "A", "E", "E", "C",
"C", "D", "E", "C", "E", "E", "A", "E", "B", "A", "A", "E", "E",
"C", "E", "C", "C", "C", "D", "E", "D", "C", "D", "A", "B", "B",
"E", "B", "A", "E", "C", "C", "D", "B", "B", "A", "C", "B", "A",
"D", "A", "D", "E", "C", "D", "D", "A", "A", "C")
I know how to calculate transition probabilities :
calculate_transition_probs <- function(states) {
transitions <- data.frame(
from = states,
to = c(states[-1], NA)
)
transition_counts <- table(transitions, useNA = "always")
transition_df <- as.data.frame(transition_counts)
colnames(transition_df) <- c("from", "to", "count")
transition_df <- transition_df[!is.na(transition_df$to), ]
transition_df <- transition_df %>%
group_by(from) %>%
mutate(percent = count / sum(count) * 100) %>%
ungroup()
transition_df <- transition_df[, c("from", "to", "count", "percent")]
transition_df <- transition_df[order(transition_df$from, transition_df$to), ]
return(transition_df)
}
transition_probs <- calculate_transition_probs(simulated_states)
The result looks like this:
from to count percent
A A 7 26.923077
A B 3 11.538462
A C 6 23.076923
A D 5 19.230769
A E 5 19.230769
B A 7 58.333333
B B 3 25.000000
B C 0 0.000000
B D 1 8.333333
B E 1 8.333333
C A 0 0.000000
C B 1 4.545455
C C 9 40.909091
C D 9 40.909091
C E 3 13.636364
D A 9 42.857143
D B 1 4.761905
D C 1 4.761905
D D 4 19.047619
D E 6 28.571429
E A 2 11.111111
E B 4 22.222222
E C 7 38.888889
E D 2 11.111111
E E 3 16.666667
Now, I want to extend this to calculate transition probabilities for n-step probabilities.
E.g.
How can I write a function that does this for n-steps?
E.g. for 5 steps, the output should look like this:
from1 from2 from3 from4 from5 to count percent
A A A A A A 0 0
A A A A A E 0 0
A A A A A B 0 0
A A A A A C 0 0
A A A A A D 0 0
Try the function below. I add another parameter step
to control how many steps you want.
step = 1
, the output is the same as what you have done so far.as.data.frame(stringsAsFactors = TRUE)
+ group_by(.drop = FALSE)
is to prevent the missing combinations with 0 count from being excluding.summarise(count = n(), .groups = "drop_last")
indicates that the last level of grouping, i.e. the to
column, will be dropped after counting each combination. This ensures that the subsequent mutate()
computes conditional probabilities instead of overall probabilities.calculate_transition_probs <- function(states, step = 1) {
nc <- step+1
lagged_mat <- matrix(
states[sequence(rep(length(states), nc), 1:nc)],
ncol = nc
)
trans_prob <- lagged_mat %>%
as.data.frame(stringsAsFactors = TRUE) %>%
head(-step) %>%
group_by(pick(everything()), .drop = FALSE) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(percent = count / sum(count) * 100) %>%
ungroup()
names(trans_prob)[1:nc] <- c(paste0("from", 1:step), "to")
return(trans_prob)
}
calculate_transition_probs(simulated_states, step = 3)
# # A tibble: 625 × 6
# from1 from2 from3 to count percent
# <fct> <fct> <fct> <fct> <int> <dbl>
# 1 A A A A 0 0
# 2 A A A B 1 50
# 3 A A A C 1 50
# 4 A A A D 0 0
# 5 A A A E 0 0
# 6 A A B A 1 100
# 7 A A B B 0 0
# 8 A A B C 0 0
# 9 A A B D 0 0
# 10 A A B E 0 0
# # ℹ 615 more rows