I am trying two sampling methods on unbalanced data. I used the "upSample" function of the "caret" package and everything went well. However, when I use the "downSample" function, I get the following error:
Error in sample.int(length(x), size, replace, prob) :
cannot take a sample larger than the population when 'replace = FALSE'
The command I am using is:
# Down-sample trainset_eli by its class label Comportamento (column 16 in the
# str() listing); that column is dropped from the predictors passed as `x`.
# NOTE(review): trainset_eli also contains a predictor column literally named
# `y` (column 12). caret's resampling helpers appear to group rows through an
# internal variable called `y`, so this column may shadow the outcome and lead
# to the "cannot take a sample larger than the population" error -- TODO
# confirm by renaming that column before the call.
downtrain_eli <- downSample(
  x = trainset_eli[, -16],
  y = trainset_eli$Comportamento
)
"trainset_eli" has 34 columns and 70.800 rows
As I am using a Random Forest model to predict a multi-class (6 classes) response variable, I am testing these two functions (upSample and downSample) to balance my data. However, I saw that the "caret" package also contains the "train" function, which has more options for balancing the data. But that function fits a model, and I only want a function that creates a dataset with balanced classes, which I would then use in my Random Forest model. Would it be better for me to continue using the up/down-sampling functions or to use the "train" function? If so, how do I implement this in my Random Forest model?
str(trainset_eli)
$ date : chr "01/10/2019" "24/09/2019" "01/10/2019" "01/10/2019" ...
$ air.temp : num 18.4 32.6 34.5 26.4 32.6 ...
$ relat.u : num 70 30.4 22.2 50.7 30.8 ...
$ wind.sp : num 1.14 2.81 1.51 3.33 2.17 ...
$ wind.dir : num 79.1 341.6 350.1 56.2 294.9 ...
$ solar.rad : num 39.6 741 433.9 621.1 274.6 ...
$ max.raj : num 1.65 5.25 2.85 6.05 4.45 ...
$ time : chr "06:40:00" "14:10:00" "14:40:00" "09:20:00" ...
$ timedate : POSIXct, format: "2019-10-01 06:43:48" "2019-09-24 14:10:45" "2019-10-01 14:48:50" ...
$ sensorid : int 67 65 66 70 70 70 69 68 69 65 ...
$ x : int -56 -49 15 35 -4 27 -40 33 -29 -47 ...
$ y : int -11 0 -4 24 10 34 -43 4 -4 5 ...
$ z : int -27 -37 -56 -20 -16 -44 -51 -49 -53 -41 ...
$ i.date : chr "01/10/2019" "24/09/2019" "01/10/2019" "01/10/2019" ...
$ i.time : chr "06:43:48" "14:10:45" "14:48:50" "09:21:41" ...
$ Comportamento: Factor w/ 6 levels "1","2","4","5",..: 6 3 3 5 2 2 1 1 2 1 ...
$ xg : num -0.875 -0.7656 0.2344 0.5469 -0.0625 ...
$ yg : num -0.1719 0 -0.0625 0.375 0.1562 ...
$ zg : num -0.422 -0.578 -0.875 -0.312 -0.25 ...
$ SMA : num 1.469 1.344 1.172 1.234 0.469 ...
$ SVM : num 0.986 0.959 0.908 0.733 0.301 ...
$ mov.var : num 0.0625 0.1094 0.0469 1.0156 1 ...
$ energy : num 0.94701 0.84715 0.67974 0.28875 0.00825 ...
$ entropy : num 0.2526 0.1219 0.0354 0.8179 0.0172 ...
$ pitch : num 62.5 52.9 -15 -48.2 12 ...
$ roll : num -158 180 -176 130 148 ...
$ inclination : num -64.7 -52.9 -15.5 -64.8 -33.9 ...
$ year : num 2019 2019 2019 2019 2019 ...
$ month : num 10 9 10 10 9 10 10 10 10 10 ...
$ day : int 1 24 1 1 24 1 1 1 1 1 ...
$ dayofweek : num 3 3 3 3 3 3 3 3 3 3 ...
$ hour : int 6 14 14 9 16 13 6 16 7 6 ...
$ minute : int 43 10 48 21 38 35 43 48 20 36 ...
$ second : num 48 45 50 41 45 16 36 13 43 57 ...
> dput(head(trainset_eli))
structure(list(date = c("01/10/2019", "24/09/2019", "01/10/2019",
"01/10/2019", "24/09/2019", "01/10/2019"), air.temp = c(18.42,
32.63, 34.54, 26.42, 32.63, 34.44), relat.u = c(70, 30.45, 22.19,
50.69, 30.83, 25.67), wind.sp = c(1.136, 2.809, 1.512, 3.326,
2.171, 2.04), wind.dir = c(79.1, 341.6, 350.1, 56.22, 294.9,
16.57), solar.rad = c(39.62, 741, 433.9, 621.1, 274.6, 847),
max.raj = c(1.647, 5.247, 2.847, 6.047, 4.447, 4.447), time = c("06:40:00",
"14:10:00", "14:40:00", "09:20:00", "16:30:00", "13:30:00"
), timedate = structure(c(1569912228, 1569334245, 1569941330,
1569921701, 1569343125, 1569936916), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), sensorid = c(67L, 65L, 66L, 70L,
70L, 70L), x = c(-56L, -49L, 15L, 35L, -4L, 27L), y = c(-11L,
0L, -4L, 24L, 10L, 34L), z = c(-27L, -37L, -56L, -20L, -16L,
-44L), i.date = c("01/10/2019", "24/09/2019", "01/10/2019",
"01/10/2019", "24/09/2019", "01/10/2019"), i.time = c("06:43:48",
"14:10:45", "14:48:50", "09:21:41", "16:38:45", "13:35:16"
), Comportamento = structure(c(6L, 3L, 3L, 5L, 2L, 2L), .Label = c("1",
"2", "4", "5", "6", "7"), class = "factor"), xg = c(-0.875,
-0.765625, 0.234375, 0.546875, -0.0625, 0.421875), yg = c(-0.171875,
0, -0.0625, 0.375, 0.15625, 0.53125), zg = c(-0.421875, -0.578125,
-0.875, -0.3125, -0.25, -0.6875), SMA = c(1.46875, 1.34375,
1.171875, 1.234375, 0.46875, 1.640625), SVM = c(0.986480882354037,
0.959380089563047, 0.907999389110477, 0.733044006608744,
0.30136408628103, 0.965847466282849), mov.var = c(0.0625,
0.109375, 0.046875, 1.015625, 1, 0.078125), energy = c(0.947010278701782,
0.847154855728149, 0.679739058017731, 0.288748800754547,
0.00824832916259766, 0.870230257511139), entropy = c(0.252618304422212,
0.121902803377891, 0.0354050216019417, 0.817915633557388,
0.0171719387098626, 0.109209155417093), pitch = c(62.4975813343597,
52.9434718105904, -14.9586823290351, -48.247900416119, 11.9694631246073,
-25.8994130495892), roll = c(-157.833654177918, 180, -175.914383220025,
129.805571092265, 147.994616791916, 142.305759533311), inclination = c(-64.6810700998259,
-52.9434718105904, -15.4942996397858, -64.7667344528855,
-33.9462950277539, -44.6176169165428), year = c(2019, 2019,
2019, 2019, 2019, 2019), month = c(10, 9, 10, 10, 9, 10),
day = c(1L, 24L, 1L, 1L, 24L, 1L), dayofweek = c(3, 3, 3,
3, 3, 3), hour = c(6L, 14L, 14L, 9L, 16L, 13L), minute = c(43L,
10L, 48L, 21L, 38L, 35L), second = c(48, 45, 50, 41, 45,
16)), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x56139e8dcfc0>, class = c("data.table",
"data.frame"))
I am not sure why it doesn't work for you. If I use an example data set with imbalanced classes (here the label column is named `class`), it works:
library(caret)
library(data.table)
# Toy imbalanced data set: two numeric predictors and a 6-level factor label.
# The label is drawn with replacement using weights 0.1, 0.2, ..., 0.6, so
# higher-numbered classes are more frequent.
dt <- data.frame(
  v1 = runif(100),
  v2 = rnorm(100),
  class = sample(
    factor(1:6),
    size = 100,
    replace = TRUE,
    prob = seq(0.1, 0.6, by = 0.1)
  )
)
# Convert to a data.table, the same class as the data in the question.
dt <- data.table(dt)
We check the class counts in the output (caret stores the label in a column named `Class`):
# Class counts after down-sampling; the third column (the label) is removed
# from the predictors and passed separately as the outcome.
table(downSample(x = dt[, -3], y = dt$class)$Class)
1 2 3 4 5 6
4 4 4 4 4 4
# Class counts after up-sampling; the third column (the label) is removed
# from the predictors and passed separately as the outcome.
table(upSample(x = dt[, -3], y = dt$class)$Class)
1 2 3 4 5 6
27 27 27 27 27 27
You can also do the down-sampling manually with a few lines of base R, but I am really not sure why caret doesn't work for you:
# Manual down-sampling: keep the same number of rows from every class.
# n is the size of the rarest class, so each class contributes exactly n rows.
n <- min(table(dt$class))
# Split the row numbers by class and draw n of them per class without
# replacement. The rows are picked by position with sample.int() rather than
# by handing the row numbers to sample() directly: sample(x, n) treats a
# single number x as 1:x, which would select rows from other classes whenever
# a class has exactly one row.
idx <- unlist(tapply(
  seq_len(nrow(dt)),
  dt$class,
  function(rows) rows[sample.int(length(rows), n)]
))
dt[idx, ]
v1 v2 class
1: 0.24056931 0.98202652 1
2: 0.29899859 0.69350666 1
3: 0.05496686 1.32054392 1
4: 0.62017288 1.49824766 1
5: 0.67481604 0.45320585 2
6: 0.79654281 0.49854685 2
7: 0.74180115 0.87424714 2
8: 0.02848226 -0.74332299 2
9: 0.05007267 1.18599816 3
10: 0.94377121 -0.45921234 3
11: 0.63222065 0.77273476 3
12: 0.89684199 -0.74368572 3
13: 0.19782915 -0.62413381 4
14: 0.89286833 0.08664853 4
15: 0.48428538 -0.90199352 4
16: 0.08179512 1.51315151 4
17: 0.89740177 -2.28249763 5
18: 0.35267634 -0.54414029 5
19: 0.68710533 -1.99195471 5
20: 0.76743271 1.17255792 5
21: 0.80106456 0.21315622 6
22: 0.53640778 0.56632657 6
23: 0.38322745 0.74336152 6
24: 0.36704649 -0.43914106 6