I wrote a recursive function get_tree
that traces linked documents in a tibble named data
(they are linked by the columns id_from
and it_to
).
However, the function will not stop running. It will repeat certain linked documents over and over again.
#data
data <- structure(list(id_from = c("K202527", "K190593", "K181502", "K163512",
"K163512", "K133701", "K143513", "K143513", "K143513", "K113372",
"K142681", "K091075", "K091075", "K111917", "K111917", "K042463",
"K042463", "K042463", "K042463", "K062282", "K062282", "K062282",
"K072558", "K072558", "K091971", "K091971", "K091971", "K091971",
"K012241", "K012241", "K013717", "K043380", "K043380", "K062282",
"K062282", "K062282", "K063222", "K063222", "K071130", "K072558",
"K072558", "K083287", "K083287", "K083287", "K090696", "K090696",
"K090696", "K090696", "K012241", "K012241", "K013717", "K043380",
"K043380", "K052240", "K063222", "K063222", "K071130", "K072558",
"K072558", "K081365", "K083287", "K083287", "K083287", "K982803",
"K982803", "K982803", "K012241", "K012241", "K013717", "K052240",
"K063222", "K063222", "K072558", "K072558", "K982803", "K982803",
"K982803", "K012241", "K012241", "K052240", "K063222", "K063222",
"K982803", "K982803", "K982803", "K012241", "K012241", "K052240",
"K982803", "K982803", "K982803", "K012241", "K012241", "K982803",
"K982803", "K982803", "K982803", "K982803", "K982803"), id_to = c("K190593",
"K181502", "K163512", "K133701", "K143513", "K113372", "K121319",
"K121628", "K142681", "K111917", "K091075", "K042463", "K062282",
"K072558", "K091971", "K930564", "K012686", "K881905", "K002544",
"K003621", "K013717", "K043380", "K012241", "K063222", "K071130",
"K072558", "K083287", "K090696", "K003707", "K982803", "K003621",
"K003621", "K013717", "K003621", "K013717", "K043380", "K012241",
"K052240", "K033451", "K012241", "K063222", "K072558", "K043604",
"K042113", "K083287", "K042022", "K081365", "K063222", "K003707",
"K982803", "K003621", "K003621", "K013717", "K012241", "K012241",
"K052240", "K033451", "K012241", "K063222", "K994331", "K072558",
"K043604", "K042113", "K843503", "K880626", "K934913", "K003707",
"K982803", "K003621", "K012241", "K012241", "K052240", "K012241",
"K063222", "K843503", "K880626", "K934913", "K003707", "K982803",
"K012241", "K012241", "K052240", "K843503", "K880626", "K934913",
"K003707", "K982803", "K012241", "K843503", "K880626", "K934913",
"K003707", "K982803", "K843503", "K880626", "K934913", "K843503",
"K880626", "K934913"), rec = c(1, 2, 3, 4, 4, 5, 5, 5, 5, 6,
6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
12, 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15
)), row.names = c(NA, -99L), class = c("tbl_df", "tbl", "data.frame"
))
Example data:
head(data)
#> # A tibble: 6 × 3
#> id_from id_to rec
#> <chr> <chr> <dbl>
#> 1 K202527 K190593 1
#> 2 K190593 K181502 2
#> 3 K181502 K163512 3
#> 4 K163512 K133701 4
#> 5 K163512 K143513 4
#> 6 K133701 K113372 5
My funciton
# iterative function
get_tree <- function(id, data) {
cat("Processing node: ", id, "\n")
branch <- filter(data, id_from == id) %>% distinct()
if (nrow(branch) == 0) return()
bind_rows(
branch,
map(branch$id_to, \(x) get_tree(x, data))
) %>% distinct()
}
Function applied
# function applied
library(tidyverse)
library(igraph)
library(R.utils)
withTimeout( #beacuse it will not stop running
{data %>%
get_tree("K202527",.)},timeout=1,onTimeout = "warning")
#> Processing node: K202527
#> Processing node: K190593
#> Processing node: K181502
#> Processing node: K163512
#> Processing node: K133701
#> Processing node: K113372
#> Processing node: K111917
#> Processing node: K072558
#> Processing node: K012241
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K063222
#> Processing node: K012241
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K052240
#> Processing node: K012241
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K012241
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K012241
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K012241
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K003707
#> Processing node: K982803
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
#> Processing node: K880626
#> Processing node: K934913
#> Processing node: K843503
My approach:
I thought this could be caused by a cyclic document structure, however, I checked for that and could not find any cyclic connection.
# Create a graph object from the data frame
g <- graph_from_data_frame(data, directed = TRUE)
# Plot the graph
plot(g)
#check that it is not cyclic
is_dag(g)
#> [1] TRUE
I also manually checked the data and the nodes (K843503, K880626, K934913) that are processed over and over, I could not find a reason that would cause the function to run forever.
The code actually work. It just take longer to run.
When you add more lower level nodes, it will take exponentially more time to run. Try
data[1:50,] %>% get_tree("K202527",.)
vs data[1:80,] %>% get_tree("K202527",.)
.
Another issue is some path occurs multiple times. Therefore, same recursive actions are performed multiple times. Example: id_from="K982803" and id_to="K843503" apprears 6 times. The solution for this is to avoid running get_tree
if the node is already processed.
get_tree <- function(id, data) {
cat("Processing node: ", id, "\n")
if(id %in% processed_id){
cat("already processed. Skipping this node.")
return()
}
processed_id <<- c(processed_id,id)
branch <- filter(data, id_from == id) %>% distinct()
if (nrow(branch) == 0) return()
bind_rows(
branch,
map(branch$id_to, function(x) get_tree(x, data))
) %>% distinct()
}
and run the following
processed_id <- c() #you need to reset this everytime
data %>%
get_tree("K202527",.)
This should finish almost instantly.