Search code examples
rperformancefor-loopgroup-concat

Concatenate a column by grouping inefficient and slow


Code does exactly what is needed but is horribly slow (>10k columns at a time) with these nested loops, is there a more effective/simple way to do this?

all_xpath<-unique(dplyr::pull(tibble(all.df['Xpath']), Xpath))
all_section<-unique(dplyr::pull(tibble(all.df['section']), section))
valueS<-unique(dplyr::pull(tibble(all.df['attr.']), attr.))

for (j in all_section){
  for (f in all_xpath){
    for (g in valueS){
      allx <-  data.frame(filter(all.df, section==j & attr. == g & Xpath == f), stringsAsFactors=FALSE)
      if (nrow(allx)>1){
          value<-paste(allx$value. , collapse = ' | ')
          if ( any(all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$elem.!='telecom')){
            all.df[all.df$section==j & all.df$attr.==g & all.df$Xpath==f,]$value. <- value
}}}}}

valueS<-unique(dplyr::pull(tibble(all.df['elem.']), elem.))
#concat by  elem. and xpath within component
for (j in all_section){
  for (f in all_xpath){
    for (g in valueS){
      allx <-  data.frame(filter(all.df, section==j & elem. == g & Xpath == f & attr.==""), stringsAsFactors=FALSE)
      if (nrow(allx)>1){
        value<-paste(allx$value. , collapse = ' | ')
        all.df[all.df$section==j & all.df$elem.==g & all.df$Xpath==f & all.df$attr.=="",]$value. <- value
      }}}}
all.df<-dplyr::distinct(all.df)

The Data is formatted like this:

dput(all.df)
structure(list(section = c("LastFiled", "LastFiled", "LastFiled", 
"LastFiled", "Results", "Results", "History", "History", "Cable", 
"Cable", "Fan", "Fan"), elem. = c("code", "code", "id", "id", 
"code", "code", "effectiveTime", "effectiveTime", "value", "value", 
"code", "code"), attr. = c("code", "code", "root", "root", "code", 
"code", "value", "value", "", "", "", ""), value. = c("8462-4", 
"8462-5", "39156-7", "39156-8", "59408-11", "39156-13", "59408-13", 
"39156-14", "59408-8", "39156-9", "59408-9", "39156-11"), Xpath = c("/Document/othersection/entry/body/sceen/code", 
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/body/sceen/id", 
"/Document/othersection/entry/body/sceen/id", "/Document/othersection/entry/body/sceen/code", 
"/Document/othersection/entry/body/sceen/code", "/Document/othersection/entry/sceen/effectiveTime", 
"/Document/othersection/entry/sceen/effectiveTime", "/Document/othersection/entry/entryRelationship", 
"/Document/othersection/entry/entryRelationship", "/Document/othersection/entry/procedure/entryRelationship/sceen", 
"/Document/othersection/entry/procedure/entryRelationship/sceen"
)), row.names = c(NA, -12L), class = "data.frame")

Results should look like this:

section     elem.       attr.   value.                  Xpath
LastFiled   code        code    8462-4 | 8462-5     /Document/othersection/entry/body/sceen/code
LastFiled   id          root    39156-7 | 39156-8   /Document/othersection/entry/body/sceen/id
Results     code        code    59408-11 | 39156-13 /Document/othersection/entry/body/sceen/code
History     effectiveTime   value   59408-13 | 39156-14     /Document/othersection/entry/sceen/effectiveTime
Cable       value               59408-8 | 39156-9 | 
                                59408-8 | 39156-9   /Document/othersection/entry/entryRelationship
Fan         code                59408-9 | 39156-11 | 
                                59408-9 | 39156-11  /Document/othersection/entry/procedure/entryRelationship

Solution

  • The problem can be solved with two dplyr pipes, one for each data transformation. The results are wrapped in bind_rows.

    library(dplyr)
    
    bind_rows(
      all.df %>%
        filter(elem. != 'telecom') %>%
        group_by(section, attr., elem., Xpath) %>%
        summarise(value. = paste(value., collapse = "|"), .groups = "keep"),
      
      all.df %>%
        filter(attr. == "") %>%
        group_by(section, elem., Xpath) %>%
        summarise(value. = paste(value., collapse = "|"), .groups = "keep")
    ) %>%
      mutate(attr. = ifelse(is.na(attr.), "", attr.)) %>%
      relocate(value., .before = Xpath)
    ## A tibble: 8 x 5
    ## Groups:   section, attr., elem., Xpath [6]
    #  section   attr.   elem.         value.            Xpath                                                         
    #  <chr>     <chr>   <chr>         <chr>             <chr>                                                         
    #1 Cable     ""      value         59408-8|39156-9   /Document/othersection/entry/entryRelationship                
    #2 Fan       ""      code          59408-9|39156-11  /Document/othersection/entry/procedure/entryRelationship/sceen
    #3 History   "value" effectiveTime 59408-13|39156-14 /Document/othersection/entry/sceen/effectiveTime              
    #4 LastFiled "code"  code          8462-4|8462-5     /Document/othersection/entry/body/sceen/code                  
    #5 LastFiled "root"  id            39156-7|39156-8   /Document/othersection/entry/body/sceen/id                    
    #6 Results   "code"  code          59408-11|39156-13 /Document/othersection/entry/body/sceen/code                  
    #7 Cable     ""      value         59408-8|39156-9   /Document/othersection/entry/entryRelationship                
    #8 Fan       ""      code          59408-9|39156-11  /Document/othersection/entry/procedure/entryRelationship/sceen