Search code examples
rdplyr

How to restore values that were transformed using reciprocal and minmax scaling to their original values?


I have a data frame with 5 indicators per site. I calculated the reciprocal of 2 of the indicators, and then minmax scaled all 5 indicators. Now I would like to restore them to their original values. I managed to get it right, but only for the indicators for which I originally did not calculate reciprocals. Could you help me to find where the error is? Here is a reproducible example:

# Load necessary libraries
library(dplyr)

# Step 1: Create the example dataframe
set.seed(1)
df <- data.frame(
  Region = rep(c("Region1", "Region2"), each = 5),
  Site = rep(1:5, times = 2),
  Indicator_A = sample(1:100, 10, replace = TRUE),
  Indicator_B = sample(1:100, 10, replace = TRUE),
  Indicator_C = sample(0:1000, 10, replace = TRUE),
  Indicator_D = sample(5:500, 10, replace = TRUE),
  Indicator_E = sample(-10:10, 10, replace = TRUE)
)

# define minmax function
norm_minmax <- function(x, na.rm = FALSE) {
  (x - min(x, na.rm = na.rm)) / (max(x, na.rm = na.rm) - min(x, na.rm = na.rm))
}

df_scaled <- 
  df %>% 
  
  # calculate reciprocal of selected indicators  
  mutate(
    across(
      c(Indicator_A, Indicator_B),
      ~ 1 / .)) %>% 
  
  # min-max scale variables per region
  mutate(
    across(
      where(is.numeric),
      ~ norm_minmax(., na.rm = TRUE)),
    .by = Region) 

# revert transformations

# define undo minmax function

undo_minmax <- function(scaled, unscaled, na.rm = FALSE) {
  unscaled_min <- min(unscaled, na.rm = na.rm)
  unscaled_range <- max(unscaled, na.rm = na.rm) - unscaled_min
  original <- scaled * unscaled_range + unscaled_min
  return(original)
}

df_restored <-
  df_scaled %>% 
  filter(Region == "Region1") %>% 
  
  # undo minmax scaling
  mutate(
    across(
      c(Indicator_A:Indicator_E),
      ~ undo_minmax(scaled = .x, 
                    unscaled = df[df$Region == "Region1",][[cur_column()]], 
                    na.rm = TRUE))) %>% 
  
  # calculate reciprocal of reciprocals
  mutate(across(c(Indicator_A, Indicator_B), ~ 1/.)) %>%
  
  # reclassify inf values as 0 resulting from dividing 1/0
  mutate(across(c(Indicator_A, Indicator_B),
                ~ if_else(is.infinite(.), 0, .)))  

all.equal(df_restored, df %>% filter(Region == "Region1"))
print(df_restored)
print(df[df$Region == "Region1", ])

Solution

  • The problem was with your calling of the function to undo the transformation. For Indicator_A and Indicator_B, the input to the transformation isn't the raw values, but the inverse of the raw values. So you have to undo the transformation differently for those variables than the others.

    library(dplyr)
    
    # Step 1: Create the example dataframe
    set.seed(1)
    df <- data.frame(
      Region = rep(c("Region1", "Region2"), each = 5),
      Site = rep(1:5, times = 2),
      Indicator_A = sample(1:100, 10, replace = TRUE),
      Indicator_B = sample(1:100, 10, replace = TRUE),
      Indicator_C = sample(0:1000, 10, replace = TRUE),
      Indicator_D = sample(5:500, 10, replace = TRUE),
      Indicator_E = sample(-10:10, 10, replace = TRUE)
    )
    
    # define minmax function
    norm_minmax <- function(x, na.rm = FALSE) {
      (x - min(x, na.rm = na.rm)) / (max(x, na.rm = na.rm) - min(x, na.rm = na.rm))
    }
    
    
    df_scaled <- 
      df %>% 
      filter(Region == "Region1") %>% 
      # calculate reciprocal of selected indicators  
      mutate(
        across(
          c(Indicator_A, Indicator_B),
          ~ 1 / .x)) %>% 
      
      # min-max scale variables per region
      mutate(
        across(
          starts_with("Indicator"),
          ~ norm_minmax(.x, na.rm = TRUE)),
        .by = Region
        ) 
    
    # revert transformations
    
    # define undo minmax function
    
    undo_minmax <- function(scaled, unscaled, na.rm = FALSE) {
      unscaled_min <- min(unscaled, na.rm = na.rm)
      unscaled_range <- max(unscaled, na.rm = na.rm) - unscaled_min
      original <- scaled * unscaled_range + unscaled_min
      return(original)
    }
    
    inv <- function(x)1/x
    
    
    df_restored <-
      df_scaled %>% 
      filter(Region == "Region1") %>% 
      # undo minmax scaling
      mutate(
        across(
          c(Indicator_C:Indicator_E),
          ~ undo_minmax(scaled = .x, 
                        unscaled = df %>% filter(Region == "Region1") %>%  select(cur_column()) %>% pull(), 
                        na.rm = TRUE)), 
        across(
          c(Indicator_A:Indicator_B), 
          ~1/undo_minmax(scaled = .x, 
                       unscaled = df %>% filter(Region == "Region1") %>%  select(cur_column()) %>% pull() %>% inv(), 
                       na.rm = TRUE)), 
        ) 
    
    all.equal(df_restored, df %>% filter(Region == "Region1"))
    #> [1] TRUE
    
    print(df_restored)
    #>    Region Site Indicator_A Indicator_B Indicator_C Indicator_D Indicator_E
    #> 1 Region1    1          68          97         104         467           9
    #> 2 Region1    2          39          85         728         293           1
    #> 3 Region1    3           1          21         877         344          -5
    #> 4 Region1    4          34          54         484         423          -3
    #> 5 Region1    5          87          74         676         330           1
    
    print(df[df$Region == "Region1", ])
    #>    Region Site Indicator_A Indicator_B Indicator_C Indicator_D Indicator_E
    #> 1 Region1    1          68          97         104         467           9
    #> 2 Region1    2          39          85         728         293           1
    #> 3 Region1    3           1          21         877         344          -5
    #> 4 Region1    4          34          54         484         423          -3
    #> 5 Region1    5          87          74         676         330           1
    

    Created on 2024-06-13 with reprex v2.1.0