I have a data frame with 5 indicators per site. I calculated the reciprocal of 2 of the indicators, and then minmax scaled all 5 indicators. Now I would like to restore them to their original values. I managed to get it right, but only for the indicators for which I originally did not calculate reciprocals. Could you help me to find where the error is? Here is a reproducible example:
# Load necessary libraries
library(dplyr)
# Step 1: Create the example dataframe
set.seed(1)
df <- data.frame(
Region = rep(c("Region1", "Region2"), each = 5),
Site = rep(1:5, times = 2),
Indicator_A = sample(1:100, 10, replace = TRUE),
Indicator_B = sample(1:100, 10, replace = TRUE),
Indicator_C = sample(0:1000, 10, replace = TRUE),
Indicator_D = sample(5:500, 10, replace = TRUE),
Indicator_E = sample(-10:10, 10, replace = TRUE)
)
# define minmax function
norm_minmax <- function(x, na.rm = FALSE) {
(x - min(x, na.rm = na.rm)) / (max(x, na.rm = na.rm) - min(x, na.rm = na.rm))
}
df_scaled <-
df %>%
# calculate reciprocal of selected indicators
mutate(
across(
c(Indicator_A, Indicator_B),
~ 1 / .)) %>%
# min-max scale variables per region
mutate(
across(
where(is.numeric),
~ norm_minmax(., na.rm = TRUE)),
.by = Region)
# revert transformations
# define undo minmax function
undo_minmax <- function(scaled, unscaled, na.rm = FALSE) {
unscaled_min <- min(unscaled, na.rm = na.rm)
unscaled_range <- max(unscaled, na.rm = na.rm) - unscaled_min
original <- scaled * unscaled_range + unscaled_min
return(original)
}
df_restored <-
df_scaled %>%
filter(Region == "Region1") %>%
# undo minmax scaling
mutate(
across(
c(Indicator_A:Indicator_E),
~ undo_minmax(scaled = .x,
unscaled = df[df$Region == "Region1",][[cur_column()]],
na.rm = TRUE))) %>%
# calculate reciprocal of reciprocals
mutate(across(c(Indicator_A, Indicator_B), ~ 1/.)) %>%
# reclassify inf values as 0 resulting from dividing 1/0
mutate(across(c(Indicator_A, Indicator_B),
~ if_else(is.infinite(.), 0, .)))
all.equal(df_restored, df %>% filter(Region == "Region1"))
print(df_restored)
print(df[df$Region == "Region1", ])
The problem was with your calling of the function to undo the transformation. For Indicator_A
and Indicator_B
, the input to the transformation isn't the raw values, but the inverse of the raw values. So you have to undo the transformation differently for those variables than the others.
library(dplyr)
# Step 1: Create the example dataframe
set.seed(1)
df <- data.frame(
Region = rep(c("Region1", "Region2"), each = 5),
Site = rep(1:5, times = 2),
Indicator_A = sample(1:100, 10, replace = TRUE),
Indicator_B = sample(1:100, 10, replace = TRUE),
Indicator_C = sample(0:1000, 10, replace = TRUE),
Indicator_D = sample(5:500, 10, replace = TRUE),
Indicator_E = sample(-10:10, 10, replace = TRUE)
)
# define minmax function
norm_minmax <- function(x, na.rm = FALSE) {
(x - min(x, na.rm = na.rm)) / (max(x, na.rm = na.rm) - min(x, na.rm = na.rm))
}
df_scaled <-
df %>%
filter(Region == "Region1") %>%
# calculate reciprocal of selected indicators
mutate(
across(
c(Indicator_A, Indicator_B),
~ 1 / .x)) %>%
# min-max scale variables per region
mutate(
across(
starts_with("Indicator"),
~ norm_minmax(.x, na.rm = TRUE)),
.by = Region
)
# revert transformations
# define undo minmax function
undo_minmax <- function(scaled, unscaled, na.rm = FALSE) {
unscaled_min <- min(unscaled, na.rm = na.rm)
unscaled_range <- max(unscaled, na.rm = na.rm) - unscaled_min
original <- scaled * unscaled_range + unscaled_min
return(original)
}
inv <- function(x)1/x
df_restored <-
df_scaled %>%
filter(Region == "Region1") %>%
# undo minmax scaling
mutate(
across(
c(Indicator_C:Indicator_E),
~ undo_minmax(scaled = .x,
unscaled = df %>% filter(Region == "Region1") %>% select(cur_column()) %>% pull(),
na.rm = TRUE)),
across(
c(Indicator_A:Indicator_B),
~1/undo_minmax(scaled = .x,
unscaled = df %>% filter(Region == "Region1") %>% select(cur_column()) %>% pull() %>% inv(),
na.rm = TRUE)),
)
all.equal(df_restored, df %>% filter(Region == "Region1"))
#> [1] TRUE
print(df_restored)
#> Region Site Indicator_A Indicator_B Indicator_C Indicator_D Indicator_E
#> 1 Region1 1 68 97 104 467 9
#> 2 Region1 2 39 85 728 293 1
#> 3 Region1 3 1 21 877 344 -5
#> 4 Region1 4 34 54 484 423 -3
#> 5 Region1 5 87 74 676 330 1
print(df[df$Region == "Region1", ])
#> Region Site Indicator_A Indicator_B Indicator_C Indicator_D Indicator_E
#> 1 Region1 1 68 97 104 467 9
#> 2 Region1 2 39 85 728 293 1
#> 3 Region1 3 1 21 877 344 -5
#> 4 Region1 4 34 54 484 423 -3
#> 5 Region1 5 87 74 676 330 1
Created on 2024-06-13 with reprex v2.1.0