Search code examples
r sorting

Sorting a Vector in R with Mixed Alphanumeric and Non-Alphanumeric Strings


I need to sort a vector of strings in R according to specific criteria. The vector contains both numeric and non-numeric entries. The criteria for sorting are:

  • Non-numeric entries should be sorted in descending alphabetical order.
  • Numeric entries should be sorted as follows:
    • Entries with the same alphabetic part should be ordered by their numeric part in ascending order.

    • Within each alphabetic group, the entry without a numeric part should come first, followed by the numeric entries of that group.

For example, given the vector:

# Example input: entries 2 and 6 have no numeric part
vec <- c("Mezclado 1", "Sin_usar", "Mezclado 3", "Sin_usar 2", "Mezclado 2", "Mezclado")

I've tried

# Attempted sorter for strings with an optional trailing number.
#
# Splits every element of `vec` into an alphabetic part and a trailing numeric
# part, sorts the entries without a number and the entries with a number in two
# separate data frames, stacks them (non-numeric first), and returns a list with
# `sorted_vector` (the reordered strings) and `sorted_indices` (their positions
# in `vec`, looked up with match()).
#
# The NOTE(review) comments below mark the spots that keep this attempt from
# producing the ordering described in the question.
sort_vector <- function(vec) {
    # Extract numeric and alphabetic parts
    extract_parts <- function(x) {
    # Extract numeric part using regex, handle cases where no numeric part is present
    # NOTE(review): regmatches() returns only the substrings that matched, so for
    # an `x` without trailing digits `num_part` is character(0), not "". The
    # ifelse() below therefore never produces NA for such an entry.
    num_part <- regmatches(x, regexpr("(\\d+)$", x))
    num_part <- ifelse(num_part == "", NA, num_part)
    
    # Extract alphabetic part
    # NOTE(review): without perl = TRUE, `\d` inside a bracket expression is
    # presumably not treated as a digit class (it would match a literal
    # backslash or "d") - verify; the trailing digits may not be removed here.
    alpha_part <- sub("([\\d]+)$", "", x)
    
    list(alpha_part = alpha_part, num_part = num_part)
    }
  
    # Apply extraction function to all entries
    parts <- lapply(vec, extract_parts)
  
    # NOTE(review): sapply() only simplifies to a vector when every extracted
    # element has length 1; if any `num_part` is empty it presumably returns a
    # list instead - confirm that is.na() / as.numeric() below cope with that.
    alpha_parts <- sapply(parts, `[[`, "alpha_part")
    num_parts <- sapply(parts, `[[`, "num_part")
  
    # Separate entries into numeric and non-numeric
    is_numeric <- !is.na(num_parts)
    non_numeric_entries <- vec[!is_numeric]
    numeric_entries <- vec[is_numeric]
  
    # Create data frames for sorting
    # (the else branches build zero-row frames with the same columns so the
    # later nrow() checks work on either path)
    if (length(non_numeric_entries) > 0) {
    non_numeric_df <- data.frame(
      original = non_numeric_entries,
      alpha_part = alpha_parts[!is_numeric],
      stringsAsFactors = FALSE
    )
    } else {
    non_numeric_df <- data.frame(original = character(), alpha_part = character(), stringsAsFactors = FALSE)
  }
  
    if (length(numeric_entries) > 0) {
      numeric_df <- data.frame(
        original = numeric_entries,
        alpha_part = alpha_parts[is_numeric],
        num_part = as.numeric(num_parts[is_numeric]),
        stringsAsFactors = FALSE
       )
    } else {
      numeric_df <- data.frame(original = character(), alpha_part = character(), num_part = numeric(), stringsAsFactors = FALSE)
    }
  
    # Sort non-numeric entries in descending alphabetical order
    if (nrow(non_numeric_df) > 0) {
    sorted_non_numeric_df <- non_numeric_df[order(non_numeric_df$alpha_part, decreasing = TRUE), ]
    } else {
      sorted_non_numeric_df <- non_numeric_df
    }
  
  # Sort numeric entries: first by alpha part descending, then by numeric part ascending
  # NOTE(review): a single `decreasing = TRUE` is applied by order() to every
  # sort key, so `num_part` is sorted descending as well - not ascending as the
  # comment above intends.
  if (nrow(numeric_df) > 0) {
    sorted_numeric_df <- numeric_df[order(numeric_df$alpha_part, decreasing = TRUE, numeric_df$num_part), ]
  } else {
    sorted_numeric_df <- numeric_df
  }
  
  # Combine sorted data frames
  # NOTE(review): the two frames have different column sets (`num_part` exists
  # only in the numeric one) - confirm rbind() accepts this when both are non-empty.
  combined_df <- rbind(sorted_non_numeric_df, sorted_numeric_df)
  
  # Final sorted vector
  sorted_vector <- combined_df$original
  
  # Find indices for the sorted vector
  # (match() reports the first position of each value, so duplicated strings in
  # `vec` all map to the same index)
  sorted_indices <- match(sorted_vector, vec)
  
  list(
    sorted_vector = sorted_vector,
    sorted_indices = sorted_indices
  )
}

with the resulting output:

$sorted_vector
[1] "Sin_usar 2" "Sin_usar"   "Mezclado 3" "Mezclado 2" "Mezclado 1" "Mezclado"  

$sorted_indices
[1] 4 2 3 5 1 6

The correct output should be:

Sorted Vector: "Sin_usar" "Sin_usar 2" "Mezclado" "Mezclado 1" "Mezclado 2" "Mezclado 3"

Sorted Indices: [2 4 6 1 5 3]


Solution

  • My approach is to split each string into alpha and num parts using gsub(), then convert the numeric parts with as.numeric(). For the non-numeric elements this gives num equal to NA, so I convert these to 0 so that they appear first.

    Then I convert alpha to an ordered factor (ordered in reverse alphabetical order) so that as.integer will give the correct order.

    Then I use the order() function on this new factor along with num to get the new sorted indices. Then simply subset the original vector with these indices to give the sorted vector.

    # Sort strings of the form "<label>" or "<label> <number>".
    #
    # Labels are ordered in descending alphabetical order (using the current
    # locale's collation). Within one label, the entry without a number comes
    # first and the numbered entries follow in ascending numeric order.
    #
    # Args:
    #   vec: a character vector (or anything as.character() can convert).
    #
    # Returns:
    #   A list with `sorted_vector` (`vec` reordered) and `sorted_indices`
    #   (the positions in `vec` that produce that order).
    sort_vector <- function(vec) {
      txt <- as.character(vec)

      # Locate the trailing run of digits only: digits embedded in the label
      # (e.g. "v2a 10") belong to the label, not to the sort number.
      start <- as.integer(regexpr("[0-9]+$", txt))
      has_num <- !is.na(start) & start > 0L

      # -Inf (rather than 0) keeps un-numbered entries ahead of every numbered
      # one, including a literal "<label> 0".
      num <- rep(-Inf, length(txt))
      num[has_num] <- as.numeric(substring(txt[has_num], start[has_num]))

      # The label is whatever precedes the trailing number, minus padding
      alpha <- trimws(sub("[0-9]+$", "", txt))
      # Levels in reverse alphabetical order, so the integer codes sort descending
      alpha_factor <- factor(alpha, levels = sort(unique(alpha), decreasing = TRUE))

      # Primary key: label (descending); secondary key: number (ascending)
      sorted_indices <- order(as.integer(alpha_factor), num)
      list(sorted_vector = vec[sorted_indices], sorted_indices = sorted_indices)
    }
    
    # Example input: labels with and without a trailing number, in mixed order
    vec <- c(
      "Mezclado 1", "Sin_usar", "Mezclado 3",
      "Sin_usar 2", "Mezclado 2", "Mezclado"
    )

    # Auto-printing the result shows the sorted vector and the index order
    sort_vector(vec)
    #> $sorted_vector
    #> [1] "Sin_usar"   "Sin_usar 2" "Mezclado"   "Mezclado 1" "Mezclado 2"
    #> [6] "Mezclado 3"
    #> 
    #> $sorted_indices
    #> [1] 2 4 6 1 5 3
    

    Created on 2024-09-08 with reprex v2.1.0