Search code examples
rscalenormalizationnormalize

How to obtain normalized data per group using scale() in R?


For example, I have a data like below.

 dataA=structure(list(variety = c("CV1", "CV1", "CV1", "CV1", "CV1", 
                                     "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", 
                                     "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", 
                                     "CV1", "CV1", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", 
                                     "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", 
                                     "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", 
                                     "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", 
                                     "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", 
                                     "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV4", "CV4", 
                                     "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", 
                                     "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", 
                                     "CV4", "CV4", "CV4", "CV4", "CV4"), KN = c(3150, 2646, 3024, 
                                                                                3402, 3260.25, 3276, 3150, 3685.5, 3260.25, 3402, 4672, 4234, 
                                                                                4599, 4526, 4599, 4526, 4234, 4088, 3504, 3942, 3550, 3550, 3124, 
                                                                                4437.5, 3550, 4313.25, 4118, 4153.5, 4473, 3851.75, 4260, 4952.25, 
                                                                                5680, 5112, 5431.5, 3976, 5112, 4828, 4544, 5112, 4632.75, 5271.75, 
                                                                                3976, 4792.5, 4544, 4313.25, 4952.25, 4792.5, 4260, 4952.25, 
                                                                                3192.75, 2580, 2999.25, 2580, 3096, 2902.5, 2805.75, 2332.75, 
                                                                                2999.25, 2666, 3240, 3648, 3360, 3564, 3360, 3360, 3552, 3648, 
                                                                                3456, 4320, 3256, 3564, 2992, 3663, 3168, 3861, 3663, 3861, 3564, 
                                                                                3465, 2886, 3042, 2886, 2886, 3412.5, 2886, 3510, 3246.75, 3159, 
                                                                                3159, 3384, 3760, 4018.5, 3572, 3912.75, 4018.5, 4230, 3572, 
                                                                                3478, 3760), GY = c(9729.7744491255, 8562.20151523044, 9885.45084031151, 
                                                                                                    10741.6709918346, 9729.7744491255, 10118.9654270905, 9963.28903590451, 
                                                                                                    10975.1855786136, 10118.9654270905, 10041.1272314975, 12266.3112039261, 
                                                                                                    11454.5700213133, 12356.5046686608, 12627.0850628651, 13889.7935691516, 
                                                                                                    11995.7308097218, 11364.3765565786, 12085.9242744566, 11725.1504155176, 
                                                                                                    10552.6353739658, 10000.3548433298, 12368.8599378026, 10438.9668978618, 
                                                                                                    11316.1910069258, 10614.4117196746, 12632.0271705218, 13421.5288686794, 
                                                                                                    12456.582348709, 12368.8599378026, 12105.6927050834, 17281.3149485611, 
                                                                                                    17281.3149485611, 16404.090839497, 17719.9270030931, 18158.5390576251, 
                                                                                                    15877.7563740586, 19825.2648648467, 18333.9838794379, 17105.8701267483, 
                                                                                                    19123.4855775955, 14035.5857450242, 15351.4219086202, 14474.1977995562, 
                                                                                                    15702.3115522458, 15351.4219086202, 11930.2478832706, 15702.3115522458, 
                                                                                                    13860.1409232114, 13947.8633341178, 8859.96350154653, 8712.93579875975, 
                                                                                                    8075.40391104562, 8553.55282683122, 8394.16985490268, 9137.95705723583, 
                                                                                                    9403.59534378339, 8606.68048414073, 7969.1485964266, 8819.19111337877, 
                                                                                                    8606.68048414073, 11268.005457273, 13106.4695055649, 10437.7313709476, 
                                                                                                    10674.9525384691, 9844.67845214374, 10971.478997871, 11623.8372085553, 
                                                                                                    11327.3107491533, 10852.8684141103, 11149.3948735122, 11959.9005292108, 
                                                                                                    14134.4278981582, 11851.1741607634, 13101.5273979082, 12068.6268976581, 
                                                                                                    13373.3433190266, 12286.0796345529, 12394.8060030003, 12449.1691872239, 
                                                                                                    13427.7065032503, 11516.3463670221, 10745.3775725771, 11275.418618758, 
                                                                                                    11660.9030159805, 11805.4596649389, 11660.9030159805, 12287.3151614671, 
                                                                                                    12865.5417573008, 11564.5319166749, 12046.387413203, 14691.650536451, 
                                                                                                    14807.7900663834, 15504.6272459779, 14517.4412415523, 14923.9295963158, 
                                                                                                    14807.7900663834, 14923.9295963158, 13936.7435918902, 14459.3714765861, 
                                                                                                    14517.4412415523)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
                                                                                                                                                                                -100L))

Now, I'd like to normalize data per variety about KN and GY values respectively.

Simply, what I did is

CV1=subset(dataA, variety=="CV1")
CV2=subset(dataA, variety=="CV2")
CV3=subset(dataA, variety=="CV3")

CV1$CV1_KN=scale(CV1$KN, center=TRUE, scale=TRUE)
CV2$CV2_KN=scale(CV2$KN, center=TRUE, scale=TRUE)
CV3$CV3_KN=scale(CV3$KN, center=TRUE, scale=TRUE)
CV1$CV1_GY=scale(CV1$GY, center=TRUE, scale=TRUE)
CV2$CV2_GY=scale(CV2$GY, center=TRUE, scale=TRUE)
CV3$CV3_GY=scale(CV3$GY, center=TRUE, scale=TRUE)

Then I download each data and match it to each variety in an Excel file. But in my actual data, I have more than 30 varieties, and it's impossible to manually match. I believe there are simple ways to automatically obtain normalized data per group in R.

Could you let me know how to do it?

Always many thanks!!


Solution

  • Here we will get a list of the different variety:

    library(dplyr)
    
    dataA_normalized <- dataA %>%
      group_by(variety) %>%
      mutate(KN_scaled = scale(KN, center = TRUE, scale = TRUE),
             GY_scaled = scale(GY, center = TRUE, scale = TRUE)) %>% 
      group_split()
    
    <list_of<
      tbl_df<
        variety  : character
        KN       : double
        GY       : double
        KN_scaled: double[,1]
        GY_scaled: double[,1]
      >
    >[4]>
    [[1]]
    # A tibble: 25 × 5
       variety    KN     GY KN_scaled[,1] GY_scaled[,1]
       <chr>   <dbl>  <dbl>         <dbl>         <dbl>
     1 CV1     3150   9730.       -0.980        -1.05  
     2 CV1     2646   8562.       -1.82         -2.01  
     3 CV1     3024   9885.       -1.19         -0.919 
     4 CV1     3402  10742.       -0.558        -0.211 
     5 CV1     3260.  9730.       -0.795        -1.05  
     6 CV1     3276  10119.       -0.769        -0.726 
     7 CV1     3150   9963.       -0.980        -0.854 
     8 CV1     3686. 10975.       -0.0839       -0.0180
     9 CV1     3260. 10119.       -0.795        -0.726 
    10 CV1     3402  10041.       -0.558        -0.790 
    # … with 15 more rows
    # ℹ Use `print(n = ...)` to see more rows
    
    [[2]]
    # A tibble: 25 × 5
       variety    KN     GY KN_scaled[,1] GY_scaled[,1]
       <chr>   <dbl>  <dbl>         <dbl>         <dbl>
     1 CV2     4313. 12632.        -0.703        -0.962
     2 CV2     4118  13422.        -1.10         -0.663
     3 CV2     4154. 12457.        -1.03         -1.03 
     4 CV2     4473  12369.        -0.376        -1.06 
     5 CV2     3852. 12106.        -1.65         -1.16 
     6 CV2     4260  17281.        -0.813         0.798
     7 CV2     4952. 17281.         0.607         0.798
     8 CV2     5680  16404.         2.10          0.466
     9 CV2     5112  17720.         0.935         0.964
    10 CV2     5432. 18159.         1.59          1.13 
    # … with 15 more rows
    # ℹ Use `print(n = ...)` to see more rows
    
    [[3]]
    # A tibble: 25 × 5
       variety    KN    GY KN_scaled[,1] GY_scaled[,1]
       <chr>   <dbl> <dbl>         <dbl>         <dbl>
     1 CV3     3193. 8713.       -0.0445        -0.978
     2 CV3     2580  8075.       -1.45          -1.34 
     3 CV3     2999. 8554.       -0.487         -1.07 
     4 CV3     2580  8394.       -1.45          -1.16 
     5 CV3     3096  9138.       -0.266         -0.736
     6 CV3     2902. 9404.       -0.708         -0.584
     7 CV3     2806. 8607.       -0.930         -1.04 
     8 CV3     2333. 7969.       -2.01          -1.40 
     9 CV3     2999. 8819.       -0.487         -0.918
    10 CV3     2666  8607.       -1.25          -1.04 
    # … with 15 more rows
    # ℹ Use `print(n = ...)` to see more rows
    
    [[4]]
    # A tibble: 25 × 5
       variety    KN     GY KN_scaled[,1] GY_scaled[,1]
       <chr>   <dbl>  <dbl>         <dbl>         <dbl>
     1 CV4     3861  13373.        0.949          0.162
     2 CV4     3663  12286.        0.446         -0.587
     3 CV4     3861  12395.        0.949         -0.512
     4 CV4     3564  12449.        0.194         -0.475
     5 CV4     3465  13428.       -0.0578         0.200
     6 CV4     2886  11516.       -1.53          -1.12 
     7 CV4     3042  10745.       -1.13          -1.65 
     8 CV4     2886  11275.       -1.53          -1.28 
     9 CV4     2886  11661.       -1.53          -1.02 
    10 CV4     3412. 11805.       -0.191         -0.919
    # … with 15 more rows
    # ℹ Use `print(n = ...)` to see more rows