Search code examples
r sorting names magrittr

Sorting column names by two numbers


I recently got this amazing answer from JBGruber on how to order column names that each contain two numerical values. It works on both datasets at the bottom of the post:

library(magrittr)
# Compute a column ordering for `dat` from the numbers embedded in its
# column names.
#
# dat:     an object with column names (e.g. a data.frame).
# returns: an integer permutation suitable for `colnames(dat)[...]` or
#          `dat[, ...]`.
#
# Note: every column is reduced to ONE key -- the sum of all numbers found in
# its (word-substituted) name -- so two names with different numbers but a
# similar sum are not ordered by their first number.
order_cols <- function(dat) {

  col_names <- colnames(dat)

  # Every distinct alphabetic word used in any column name, alphabetically.
  words <- sort(unique(unlist(
    stringi::stri_extract_all_regex(col_names, "[[:alpha:]]+")
  )))

  # With two or more distinct words, substitute each word by its alphabetical
  # rank so that it contributes to the numeric key; otherwise keep the names.
  keyed_names <- col_names
  if (length(words) > 1) {
    keyed_names <- stringi::stri_replace_all_fixed(
      col_names,
      words,
      seq_along(words),
      vectorise_all = FALSE
    )
  }

  # All digit runs per name (this includes the substituted word ranks).
  digit_runs <- stringi::stri_extract_all_regex(keyed_names, "\\d+")

  # One key per column: the sum of its numbers.
  key_per_column <- unlist(lapply(digit_runs, function(run) {
    sum(as.numeric(run))
  }))

  order(key_per_column)
}

However, I noticed that it does not work completely for the following data, because it relies on the assumption that ordering by the sum of the numbers in each name gives the right order of the columns:

# Single-row data frame (row "Type_A") whose column names are interval labels
# of the form `[lower,upper)` / `[lower,upper]`. The first column, `[25,250)`,
# is the one not present in the "Old Data" version of dat_I.
dat_I <- structure(list(`[25,250)`=3L, `[0,25)` = 5L, `[100,250)` = 43L, `[100,500)` = 0L, 
    `[1000,1000000]` = 20L, `[1000,1500)` = 0L, `[1500,3000)` = 0L, 
    `[25,100)` = 38L, `[25,50)` = 0L, `[250,500)` = 27L, `[3000,1000000]` = 0L, 
    `[50,100)` = 0L, `[500,1000)` = 44L, `[500,1000000]` = 0L), row.names = "Type_A", class = "data.frame")

# Column names rearranged into the order returned by order_cols().
colnames(dat_I )[order_cols(dat_I)]

Is there a way to first order by the first element and then order by the second element?

Old Data

# Single-row data frame (row "Type_A"); column names are interval labels only
# (no trailing word).
dat_I <- structure(list(`[0,25)` = 5L, `[100,250)` = 43L, `[100,500)` = 0L, 
    `[1000,1000000]` = 20L, `[1000,1500)` = 0L, `[1500,3000)` = 0L, 
    `[25,100)` = 38L, `[25,50)` = 0L, `[250,500)` = 27L, `[3000,1000000]` = 0L, 
    `[50,100)` = 0L, `[500,1000)` = 44L, `[500,1000000]` = 0L), row.names = "Type_A", class = "data.frame")

# Two-row data frame (rows "A" and "B"). Each column name is an interval label
# followed by one of the words east / north / south / west, e.g. `[0,25) east`.
dat_II <- structure(list(`[0,25) east` = c(1269L, 85L), `[0,25) north` = c(364L, 
21L), `[0,25) south` = c(1172L, 97L), `[0,25) west` = c(549L, 
49L), `[100,250) east` = c(441L, 149L), `[100,250) north` = c(224L, 
45L), `[100,250) south` = c(521L, 247L), `[100,250) west` = c(770L, 
124L), `[100,500) east` = c(0L, 0L), `[100,500) north` = c(0L, 
0L), `[100,500) south` = c(0L, 0L), `[100,500) west` = c(0L, 
0L), `[1000,1000000] east` = c(53L, 0L), `[1000,1000000] north` = c(82L, 
0L), `[1000,1000000] south` = c(23L, 0L), `[1000,1000000] west` = c(63L, 
0L), `[1000,1500) east` = c(0L, 0L), `[1000,1500) north` = c(0L, 
0L), `[1000,1500) south` = c(0L, 0L), `[1000,1500) west` = c(0L, 
0L), `[1500,3000) east` = c(0L, 0L), `[1500,3000) north` = c(0L, 
0L), `[1500,3000) south` = c(0L, 0L), `[1500,3000) west` = c(0L, 
0L), `[25,100) east` = c(579L, 220L), `[25,100) north` = c(406L, 
58L), `[25,100) south` = c(1048L, 316L), `[25,100) west` = c(764L, 
131L), `[25,50) east` = c(0L, 0L), `[25,50) north` = c(0L, 0L
), `[25,50) south` = c(0L, 0L), `[25,50) west` = c(0L, 0L), `[250,500) east` = c(232L, 
172L), `[250,500) north` = c(207L, 40L), `[250,500) south` = c(202L, 
148L), `[250,500) west` = c(457L, 153L), `[3000,1000000] east` = c(0L, 
0L), `[3000,1000000] north` = c(0L, 0L), `[3000,1000000] south` = c(0L, 
0L), `[3000,1000000] west` = c(0L, 0L), `[50,100) east` = c(0L, 
0L), `[50,100) north` = c(0L, 0L), `[50,100) south` = c(0L, 0L
), `[50,100) west` = c(0L, 0L), `[500,1000) east` = c(103L, 0L
), `[500,1000) north` = c(185L, 0L), `[500,1000) south` = c(66L, 
0L), `[500,1000) west` = c(200L, 0L), `[500,1000000] east` = c(0L, 
288L), `[500,1000000] north` = c(0L, 120L), `[500,1000000] south` = c(0L, 
229L), `[500,1000000] west` = c(0L, 175L)), row.names = c("A", 
"B"), class = "data.frame")

Solution

  • I modified the last three lines of the function so that the order is now based on each element successively.

    # Order the columns of `dat` by the tokens embedded in their names, compared
    # position by position (first token, then second, ...), not by their sum.
    #
    # A token is a run of digits (compared numerically) or a run of letters
    # (compared by its alphabetical rank among all words used in the column
    # names). Words only take part when at least two distinct words occur;
    # with zero or one distinct word the names are ordered by their numbers alone.
    #
    # dat:     an object with column names (e.g. a data.frame).
    # returns: an integer permutation, usable as `colnames(dat)[order_cols(dat)]`
    #          or `dat[, order_cols(dat)]`; `integer(0)` if there are no columns.
    #
    # Names with fewer tokens sort before longer names sharing the same leading
    # tokens; names without any usable token sort last.
    order_cols <- function(dat) {
      col_names <- colnames(dat)
      if (length(col_names) == 0L) {
        return(integer(0))
      }

      # Split every name into its word / number tokens, in order of appearance.
      # Tokenising (instead of replacing words inside the string) keeps a word
      # that contains another word ("seaside" vs "sea") intact and stops a word
      # index from fusing with neighbouring digits ("25east").
      tokens <- regmatches(col_names, gregexpr("[[:alpha:]]+|[0-9]+", col_names))

      all_tokens <- unlist(tokens)
      words <- sort(unique(all_tokens[grepl("^[[:alpha:]]", all_tokens)]))
      use_words <- length(words) > 1L

      # Turn each name's tokens into a numeric key vector.
      keys <- lapply(tokens, function(tok) {
        is_word <- grepl("^[[:alpha:]]", tok)
        key <- numeric(length(tok))
        key[!is_word] <- as.numeric(tok[!is_word])
        key[is_word] <- match(tok[is_word], words)
        if (use_words) key else key[!is_word]
      })

      # Pad to a common length explicitly; rbind() would silently recycle
      # shorter vectors and invent sort keys.
      n_keys <- max(1L, lengths(keys))
      padded <- lapply(keys, function(key) {
        if (length(key) == 0L) {
          return(rep(NA_real_, n_keys)) # no usable token: sorted last by order()
        }
        c(key, rep(-Inf, n_keys - length(key))) # shorter name sorts first
      })

      # One vector per key position, handed to order() as successive tie-breakers.
      key_columns <- lapply(seq_len(n_keys), function(i) {
        vapply(padded, function(key) key[[i]], numeric(1))
      })
      do.call(order, key_columns)
    }
    
    # Column names of dat_II in the order returned by order_cols(): by lower
    # bound, then upper bound, then the trailing word.
    colnames(dat_II)[order_cols(dat_II)]
    # [1] "[0,25) east"          "[0,25) north"         "[0,25) south"        
    # [4] "[0,25) west"          "[25,50) east"         "[25,50) north"       
    # [7] "[25,50) south"        "[25,50) west"         "[25,100) east"       
    # [10] "[25,100) north"       "[25,100) south"       "[25,100) west"       
    # [13] "[50,100) east"        "[50,100) north"       "[50,100) south"      
    # [16] "[50,100) west"        "[100,250) east"       "[100,250) north"     
    # [19] "[100,250) south"      "[100,250) west"       "[100,500) east"      
    # [22] "[100,500) north"      "[100,500) south"      "[100,500) west"      
    # [25] "[250,500) east"       "[250,500) north"      "[250,500) south"     
    # [28] "[250,500) west"       "[500,1000) east"      "[500,1000) north"    
    # [31] "[500,1000) south"     "[500,1000) west"      "[500,1000000] east"  
    # [34] "[500,1000000] north"  "[500,1000000] south"  "[500,1000000] west"  
    # [37] "[1000,1500) east"     "[1000,1500) north"    "[1000,1500) south"   
    # [40] "[1000,1500) west"     "[1000,1000000] east"  "[1000,1000000] north"
    # [43] "[1000,1000000] south" "[1000,1000000] west"  "[1500,3000) east"    
    # [46] "[1500,3000) north"    "[1500,3000) south"    "[1500,3000) west"    
    # [49] "[3000,1000000] east"  "[3000,1000000] north" "[3000,1000000] south"
    # [52] "[3000,1000000] west"