For example, I have a data like below.
dataA=structure(list(variety = c("CV1", "CV1", "CV1", "CV1", "CV1",
"CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1",
"CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1", "CV1",
"CV1", "CV1", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2",
"CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2",
"CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2", "CV2",
"CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3",
"CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3",
"CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV3", "CV4", "CV4",
"CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4",
"CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4", "CV4",
"CV4", "CV4", "CV4", "CV4", "CV4"), KN = c(3150, 2646, 3024,
3402, 3260.25, 3276, 3150, 3685.5, 3260.25, 3402, 4672, 4234,
4599, 4526, 4599, 4526, 4234, 4088, 3504, 3942, 3550, 3550, 3124,
4437.5, 3550, 4313.25, 4118, 4153.5, 4473, 3851.75, 4260, 4952.25,
5680, 5112, 5431.5, 3976, 5112, 4828, 4544, 5112, 4632.75, 5271.75,
3976, 4792.5, 4544, 4313.25, 4952.25, 4792.5, 4260, 4952.25,
3192.75, 2580, 2999.25, 2580, 3096, 2902.5, 2805.75, 2332.75,
2999.25, 2666, 3240, 3648, 3360, 3564, 3360, 3360, 3552, 3648,
3456, 4320, 3256, 3564, 2992, 3663, 3168, 3861, 3663, 3861, 3564,
3465, 2886, 3042, 2886, 2886, 3412.5, 2886, 3510, 3246.75, 3159,
3159, 3384, 3760, 4018.5, 3572, 3912.75, 4018.5, 4230, 3572,
3478, 3760), GY = c(9729.7744491255, 8562.20151523044, 9885.45084031151,
10741.6709918346, 9729.7744491255, 10118.9654270905, 9963.28903590451,
10975.1855786136, 10118.9654270905, 10041.1272314975, 12266.3112039261,
11454.5700213133, 12356.5046686608, 12627.0850628651, 13889.7935691516,
11995.7308097218, 11364.3765565786, 12085.9242744566, 11725.1504155176,
10552.6353739658, 10000.3548433298, 12368.8599378026, 10438.9668978618,
11316.1910069258, 10614.4117196746, 12632.0271705218, 13421.5288686794,
12456.582348709, 12368.8599378026, 12105.6927050834, 17281.3149485611,
17281.3149485611, 16404.090839497, 17719.9270030931, 18158.5390576251,
15877.7563740586, 19825.2648648467, 18333.9838794379, 17105.8701267483,
19123.4855775955, 14035.5857450242, 15351.4219086202, 14474.1977995562,
15702.3115522458, 15351.4219086202, 11930.2478832706, 15702.3115522458,
13860.1409232114, 13947.8633341178, 8859.96350154653, 8712.93579875975,
8075.40391104562, 8553.55282683122, 8394.16985490268, 9137.95705723583,
9403.59534378339, 8606.68048414073, 7969.1485964266, 8819.19111337877,
8606.68048414073, 11268.005457273, 13106.4695055649, 10437.7313709476,
10674.9525384691, 9844.67845214374, 10971.478997871, 11623.8372085553,
11327.3107491533, 10852.8684141103, 11149.3948735122, 11959.9005292108,
14134.4278981582, 11851.1741607634, 13101.5273979082, 12068.6268976581,
13373.3433190266, 12286.0796345529, 12394.8060030003, 12449.1691872239,
13427.7065032503, 11516.3463670221, 10745.3775725771, 11275.418618758,
11660.9030159805, 11805.4596649389, 11660.9030159805, 12287.3151614671,
12865.5417573008, 11564.5319166749, 12046.387413203, 14691.650536451,
14807.7900663834, 15504.6272459779, 14517.4412415523, 14923.9295963158,
14807.7900663834, 14923.9295963158, 13936.7435918902, 14459.3714765861,
14517.4412415523)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-100L))
Now, I'd like to normalize data per variety about KN and GY values respectively.
Simply, what I did is
CV1=subset(dataA, variety=="CV1")
CV2=subset(dataA, variety=="CV2")
CV3=subset(dataA, variety=="CV3")
CV1$CV1_KN=scale(CV1$KN, center=TRUE, scale=TRUE)
CV2$CV2_KN=scale(CV2$KN, center=TRUE, scale=TRUE)
CV3$CV3_KN=scale(CV3$KN, center=TRUE, scale=TRUE)
CV1$CV1_GY=scale(CV1$GY, center=TRUE, scale=TRUE)
CV2$CV2_GY=scale(CV2$GY, center=TRUE, scale=TRUE)
CV3$CV3_GY=scale(CV3$GY, center=TRUE, scale=TRUE)
Then I download each data and match it to each variety in an Excel file. But in my actual data, I have more than 30 varieties, and it's impossible to manually match. I believe there are simple ways to automatically obtain normalized data per group in R.
Could you let me know how to do it?
Always many thanks!!
Here we will get a list of the different variety:
library(dplyr)
dataA_normalized <- dataA %>%
group_by(variety) %>%
mutate(KN_scaled = scale(KN, center = TRUE, scale = TRUE),
GY_scaled = scale(GY, center = TRUE, scale = TRUE)) %>%
group_split()
<list_of<
tbl_df<
variety : character
KN : double
GY : double
KN_scaled: double[,1]
GY_scaled: double[,1]
>
>[4]>
[[1]]
# A tibble: 25 × 5
variety KN GY KN_scaled[,1] GY_scaled[,1]
<chr> <dbl> <dbl> <dbl> <dbl>
1 CV1 3150 9730. -0.980 -1.05
2 CV1 2646 8562. -1.82 -2.01
3 CV1 3024 9885. -1.19 -0.919
4 CV1 3402 10742. -0.558 -0.211
5 CV1 3260. 9730. -0.795 -1.05
6 CV1 3276 10119. -0.769 -0.726
7 CV1 3150 9963. -0.980 -0.854
8 CV1 3686. 10975. -0.0839 -0.0180
9 CV1 3260. 10119. -0.795 -0.726
10 CV1 3402 10041. -0.558 -0.790
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows
[[2]]
# A tibble: 25 × 5
variety KN GY KN_scaled[,1] GY_scaled[,1]
<chr> <dbl> <dbl> <dbl> <dbl>
1 CV2 4313. 12632. -0.703 -0.962
2 CV2 4118 13422. -1.10 -0.663
3 CV2 4154. 12457. -1.03 -1.03
4 CV2 4473 12369. -0.376 -1.06
5 CV2 3852. 12106. -1.65 -1.16
6 CV2 4260 17281. -0.813 0.798
7 CV2 4952. 17281. 0.607 0.798
8 CV2 5680 16404. 2.10 0.466
9 CV2 5112 17720. 0.935 0.964
10 CV2 5432. 18159. 1.59 1.13
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows
[[3]]
# A tibble: 25 × 5
variety KN GY KN_scaled[,1] GY_scaled[,1]
<chr> <dbl> <dbl> <dbl> <dbl>
1 CV3 3193. 8713. -0.0445 -0.978
2 CV3 2580 8075. -1.45 -1.34
3 CV3 2999. 8554. -0.487 -1.07
4 CV3 2580 8394. -1.45 -1.16
5 CV3 3096 9138. -0.266 -0.736
6 CV3 2902. 9404. -0.708 -0.584
7 CV3 2806. 8607. -0.930 -1.04
8 CV3 2333. 7969. -2.01 -1.40
9 CV3 2999. 8819. -0.487 -0.918
10 CV3 2666 8607. -1.25 -1.04
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows
[[4]]
# A tibble: 25 × 5
variety KN GY KN_scaled[,1] GY_scaled[,1]
<chr> <dbl> <dbl> <dbl> <dbl>
1 CV4 3861 13373. 0.949 0.162
2 CV4 3663 12286. 0.446 -0.587
3 CV4 3861 12395. 0.949 -0.512
4 CV4 3564 12449. 0.194 -0.475
5 CV4 3465 13428. -0.0578 0.200
6 CV4 2886 11516. -1.53 -1.12
7 CV4 3042 10745. -1.13 -1.65
8 CV4 2886 11275. -1.53 -1.28
9 CV4 2886 11661. -1.53 -1.02
10 CV4 3412. 11805. -0.191 -0.919
# … with 15 more rows
# ℹ Use `print(n = ...)` to see more rows