I have this data:
# Example sequence of 100 observed states drawn from the labels "A" to "E".
simulated_states <- c("A", "E", "B", "B", "A", "C", "D", "A", "B", "D", "A", "D",
  "D", "E", "D", "D", "D", "E", "A", "A", "A", "B", "A", "C", "C",
  "D", "A", "A", "D", "A", "D", "A", "A", "A", "C", "C", "D", "A",
  "C", "C", "D", "E", "C", "C", "C", "E", "B", "A", "E", "E", "C",
  "C", "D", "E", "C", "E", "E", "A", "E", "B", "A", "A", "E", "E",
  "C", "E", "C", "C", "C", "D", "E", "D", "C", "D", "A", "B", "B",
  "E", "B", "A", "E", "C", "C", "D", "B", "B", "A", "C", "B", "A",
  "D", "A", "D", "E", "C", "D", "D", "A", "A", "C"
)
I know how to calculate one-step transition probabilities:
#' Tabulate one-step transition counts and row percentages
#'
#' Needs dplyr attached (`%>%`, `group_by()`, `mutate()`, `ungroup()`).
#'
#' @param states Vector of observed states, in the order they occurred.
#' @return A tibble with one row per (`from`, `to`) pair over the distinct
#'   states, holding the number of times `to` directly followed `from`
#'   (`count`) and that count as a percentage of all transitions leaving
#'   `from` (`percent`). Rows are sorted by `from`, then `to`.
calculate_transition_probs <- function(states) {
  # One shared set of levels so that `from` and `to` span the same states and
  # pairs that never occur still show up with a count of 0.
  state_levels <- sort(unique(states))

  # Pair every state with its successor. The last state has no successor and
  # is simply left out: padding it with NA and tabulating with
  # useNA = "always" also creates an NA level for `from`, which leaves
  # spurious `from = NA` rows (count 0, percent NaN) in the result.
  from <- factor(head(states, -1), levels = state_levels)
  to <- factor(states[-1], levels = state_levels)

  transition_df <- as.data.frame(
    table(from = from, to = to),
    responseName = "count"
  )

  transition_df <- transition_df %>%
    group_by(from) %>%
    # A state with no outgoing transitions has sum(count) == 0; dividing by
    # max(., 1) reports 0 percent for it instead of 0 / 0 = NaN.
    mutate(percent = count / max(sum(count), 1) * 100) %>%
    ungroup()

  transition_df[order(transition_df$from, transition_df$to), ]
}
# Tabulate the one-step transitions of the example sequence defined above.
transition_probs <- calculate_transition_probs(simulated_states)
The result looks like this:
from to count percent
A A 7 26.923077
A B 3 11.538462
A C 6 23.076923
A D 5 19.230769
A E 5 19.230769
B A 7 58.333333
B B 3 25.000000
B C 0 0.000000
B D 1 8.333333
B E 1 8.333333
C A 0 0.000000
C B 1 4.545455
C C 9 40.909091
C D 9 40.909091
C E 3 13.636364
D A 9 42.857143
D B 1 4.761905
D C 1 4.761905
D D 4 19.047619
D E 6 28.571429
E A 2 11.111111
E B 4 22.222222
E C 7 38.888889
E D 2 11.111111
E E 3 16.666667
Now I want to extend this to n-step transition probabilities, i.e. the probability of the next state given the previous n states.
How can I write a function that does this for n steps?
E.g. for 5 steps, the output should look like this:
from1 from2 from3 from4 from5 to count percent
A A A A A A 0 0
A A A A A E 0 0
A A A A A B 0 0
A A A A A C 0 0
A A A A A D 0 0
Try the function below. I added another parameter, `step`, to control how many steps you want.
If you set `step = 1`, the output is the same as what you have done so far.
#' Tabulate transition counts and percentages given the previous `step` states
#'
#' Needs dplyr (>= 1.1, for `pick()`) attached.
#'
#' @param states Vector of observed states, in the order they occurred.
#' @param step Number of preceding states to condition on; a single whole
#'   number >= 1. `step = 1` gives ordinary one-step transitions.
#' @return A tibble with factor columns `from1`, ..., `from<step>` and `to`,
#'   plus `count` (how often that run of states occurred) and `percent`
#'   (`count` as a percentage of all runs sharing the same `from*` prefix).
#'   Every combination of the distinct states is listed, so the result has
#'   (number of distinct states)^(step + 1) rows.
calculate_transition_probs <- function(states, step = 1) {
  stopifnot(
    is.numeric(step), length(step) == 1L, !is.na(step),
    step >= 1, step == round(step)
  )
  nc <- step + 1

  # Every column gets the same levels. Deriving levels column by column can
  # give the lagged columns different level sets (e.g. a state seen only at
  # the very start of the sequence), which skews the .drop = FALSE expansion.
  state_levels <- sort(unique(states))

  # Only complete windows of `nc` consecutive states are counted.
  n_windows <- max(length(states) - step, 0)
  window_start <- seq_len(n_windows)

  # Column `lag` holds the state at offset `lag - 1` inside each window.
  lagged_cols <- lapply(
    seq_len(nc),
    function(lag) factor(states[window_start + lag - 1L], levels = state_levels)
  )
  names(lagged_cols) <- c(paste0("from", seq_len(step)), "to")

  as.data.frame(lagged_cols) %>%
    group_by(pick(everything()), .drop = FALSE) %>%
    # "drop_last" keeps the grouping on the from* columns for the next step.
    summarise(count = n(), .groups = "drop_last") %>%
    # A prefix that never occurs has sum(count) == 0; dividing by max(., 1)
    # reports 0 percent for it instead of 0 / 0 = NaN.
    mutate(percent = count / max(sum(count), 1) * 100) %>%
    ungroup()
}
# Example: condition on the previous three states. The printed result lists
# all 5^4 = 625 combinations of from1, from2, from3 and to.
calculate_transition_probs(simulated_states, step = 3)
# # A tibble: 625 × 6
# from1 from2 from3 to count percent
# <fct> <fct> <fct> <fct> <int> <dbl>
# 1 A A A A 0 0
# 2 A A A B 1 50
# 3 A A A C 1 50
# 4 A A A D 0 0
# 5 A A A E 0 0
# 6 A A B A 1 100
# 7 A A B B 0 0
# 8 A A B C 0 0
# 9 A A B D 0 0
# 10 A A B E 0 0
# # ℹ 615 more rows