Search code examples
rfunctionfor-loopdata-cleaningeulerr

How to make a function to make a complex euler diagram?


I am currently making an euler diagram. I managed to make an euler diagram containing 6 variables using these codes, although I believe these codes are not efficient:

dataset <- data.frame(
        A = rep(c(1, 2, NA), length.out = 100),
        B = rep(c(2, NA, 1), length.out = 100),
        C = rep(c(NA, 1, 2), length.out = 100),
        D = rep(c(NA, 2, 1), length.out = 100),
        E = rep(c(1, NA, 2), length.out = 100),
        F = rep(c(1, 2, NA), length.out = 100)) 

euler_primary <- c("A" = sum(dataset$A == 1, na.rm = TRUE),
                    "B" = sum(dataset$B == 1, na.rm = TRUE),
                    "C" = sum(dataset$C == 1, na.rm = TRUE),
                    "D" = sum(dataset$D == 1, na.rm = TRUE),
                    "E" = sum(dataset$E == 1, na.rm = TRUE),
                    "F" = sum(dataset$F == 1, na.rm = TRUE),
                    "A&B" = sum(dataset$B == 1 & dataset$A == 1, na.rm=TRUE),
                    "A&C" = sum(dataset$C == 1 & dataset$A == 1, na.rm=TRUE),
                    "A&D" = sum(dataset$C == 1 & dataset$D == 1, na.rm = TRUE),
                    "A&E" = sum(dataset$C == 1 & dataset$E == 1, na.rm = TRUE),
                    "A&F" = sum(dataset$C == 1 & dataset$F == 1, na.rm = TRUE),
                    "B&C" = sum(dataset$B == 1 & dataset$C == 1, na.rm=TRUE),
                    "B&D" = sum(dataset$B == 1 & dataset$D == 1, na.rm=TRUE),
                    "B&E" = sum(dataset$B == 1 & dataset$E == 1, na.rm=TRUE),
                    "B&F" = sum(dataset$B == 1 & dataset$F == 1, na.rm=TRUE),
                    "C&D" = sum(dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "C&E" = sum(dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "C&F" = sum(dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "D&E" = sum(dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "D&F" = sum(dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "E&F" = sum(dataset$F == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&C" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1, na.rm=TRUE),
                    "A&B&D" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1, na.rm=TRUE),
                    "A&B&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&D" = sum(dataset$A == 1 & dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "A&C&E" = sum(dataset$B == 1 & dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&C&F" = sum(dataset$A == 1 & dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&D&E" = sum(dataset$A == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&D&F" = sum(dataset$A == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&E&F" = sum(dataset$A == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&C&D" = sum(dataset$B == 1 & dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "B&C&E" = sum(dataset$B == 1 & dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "B&C&F" = sum(dataset$B == 1 & dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&D&E" = sum(dataset$B == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "B&D&F" = sum(dataset$B == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&E&F" = sum(dataset$B == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "C&D&E" = sum(dataset$C == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "C&D&F" = sum(dataset$C == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "C&E&F" = sum(dataset$C == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "D&E&F" = sum(dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&C&D" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "A&B&C&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&C&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&D&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&D&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&D&E" = sum(dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&C&D&F" = sum(dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&E&F" = sum(dataset$A == 1 & dataset$C == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&D&E&F" = sum(dataset$A == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&C&D&E" = sum(dataset$B == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E, na.rm=TRUE),
                    "B&C&D&F" = sum(dataset$B == 1 & dataset$C == 1 & dataset$D == 1 & dataset$F, na.rm=TRUE),
                    "B&C&E&F" = sum(dataset$B == 1 & dataset$C == 1 & dataset$E == 1 & dataset$F, na.rm=TRUE),
                    "B&D&E&F" = sum(dataset$B == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F, na.rm=TRUE),
                    "C&D&E&F" = sum(dataset$C == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F, na.rm=TRUE),
                    "A&B&C&D&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&C&D&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&C&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&D&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&D&E&F" = sum(dataset$C == 1 & dataset$A == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&C&D&E&F" = sum(dataset$C == 1 & dataset$B == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&C&D&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE)
                )

    venn_primary <- euler(euler_primary)
    plot(venn_primary6,
            quantities = list(cex = .75),
            fill = list(c("red", "blue", "green", "violet", "orange", "brown")),
            lty = 1,
            cex = 0.5,
            labels = NULL,
            legend = list(labels = letters[1:6]))

The above codes result in the following plot:

An euler diagram with 6 variables

However, now I need to make an euler diagram with 11 variables. It seems impossible to make such a diagram with 11 variables as the combinations of variables will reach hundreds, if not thousands. I think creating a function to assign the letters and create the list may be the solution. However, as I am a novice in data cleaning and conditionals in R, I am not able to come up with such a function. Can anyone help me to create a function where I can just enter the dataset name and the columns which will be included in the diagram, and the function will do the rest of the cleaning?

#p.s.: I noticed that the eulerr package requires us to add the & between variables to denote overlapping cases. For example, if we want to see the intersection between variable A and B, we will need to create an exact variable of A&B.

Thank you very much in advance


Solution

  • I tried to use tidyverse to create your plot with out any manual input, but the final plot is somehow flawed. Perhaps you are able to create your plot with 11 variables based on this code and you somehow fix the issue of missing labels inside the plot.

    dataset <- data.frame(
      A = rep(c(1, 2, NA), length.out = 100),
      B = rep(c(2, NA, 1), length.out = 100),
      C = rep(c(NA, 1, 2), length.out = 100),
      D = rep(c(NA, 2, 1), length.out = 100),
      E = rep(c(1, NA, 2), length.out = 100),
      F = rep(c(1, 2, NA), length.out = 100)) 
    
    library(tidyverse)
    library(eulerr)
    
    map(1:6, ~combn(names(dataset), .x)) %>% 
      map_df(~.x %>% 
               split(rep(1:ncol(.x), each = nrow(.x))) %>% 
               `names<-`(., paste0(split(.x, rep(1:ncol(.x), each = nrow(.x))))) %>% 
               map_df(~dataset %>% 
                        mutate(rn = row_number()) %>% 
                        pivot_longer(-rn) %>% 
                        filter(name %in% .x, value == 1) %>% 
                        group_by(rn) %>% 
                        filter(n() == length(.x)) %>% 
                        ungroup() %>% 
                        summarise(n = n_distinct(rn)),
                      .id = "cols")) %>% 
      mutate(cols = str_remove(str_replace_all(str_remove_all(cols, "[^[A-Z]]"), "(\\w)", "\\1&"),
                               "&$")) %>% 
      deframe() %>% 
      euler() %>% 
      plot(fill = list(c("red", "blue", "green", "violet", "orange", "brown")),
           lty = 1,
           cex = 0.5,
           labels = NULL,
           legend = list(labels = letters[1:6]))
    

    enter image description here