I would like to compare data in R
across columns of different size.
This is my data set.
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 5 29 53 AA2 11 56 34 AA2
129 516 34 BB5 44 217 42 BB1 36 190 39 BB1
107 436 44 AA3 29 147 53 CC7 30 155 31 CC1
57 244 53 BB6 21 108 53 BB2 14 77 61 BB4
57 227 29 AA1 21 104 39 AA6 9 48 44 BB6
80 318 47 AA2 18 89 47 CC3 37 200 44 DD3
128 529 56 BB4 43 222 54 CC1 36 202 50 CC3
31 127 53 CC1 7 38 53 DD4
18 73 47 CC2
I used the duplicated
function, appended the column with TRUE
and FALSE
results and then grepped results but it did not work. Here is my code:
v=duplicated(data$pp_filename, data$filename, data$filename)
b=cbind(data, dup=v)
dupl=(b[grep("FALSE", b$dup),])
And this is what I want to get (again in an appropriate format):
pp_value_1 pp_value_2 pp_value_3 pp_filename nn_value_1 nn_value_2 nn_value_3 nn_filename mm_value_1 mm_value_2 mm_value_3 mm_filename
17 73 53 CC3 18 89 47 CC3 11 56 34 AA2
80 318 47 AA2 43 222 54 CC1 30 155 31 CC1
31 127 53 CC1 5 29 53 AA2 36 202 50 CC3
This is more of a suggestion on the format of your data for the sake of easier manipulations. I guess, a better format of your data would be:
spl = split.default(DF, substring(names(DF), 1, 2))
lDF = do.call(rbind,
lapply(seq_along(spl),
function(i)
setNames(cbind(names(spl)[i],
spl[[i]][complete.cases(spl[[i]]), ]),
c("type", gsub("^(.*?)_", "", names(spl[[i]]))))))
lDF
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#2 mm 36 190 39 BB1
#3 mm 30 155 31 CC1
#4 mm 14 77 61 BB4
#5 mm 9 48 44 BB6
#6 mm 37 200 44 DD3
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#9 nn 44 217 42 BB1
#....
Then, you can proceed (at least from what I understand from the question) with:
commons = Reduce(intersect, split(lDF$filename, lDF$type))
lDF[lDF$filename %in% commons, ]
# type value_1 value_2 value_3 filename
#1 mm 11 56 34 AA2
#3 mm 30 155 31 CC1
#7 mm 36 202 50 CC3
#8 nn 5 29 53 AA2
#13 nn 18 89 47 CC3
#14 nn 43 222 54 CC1
#16 pp 17 73 53 CC3
#21 pp 80 318 47 AA2
#23 pp 31 127 53 CC1
If you want the format you show, there are some workarounds you can come with. E.g.:
res = lDF[lDF$filename %in% commons, ]
tmp = split(res[-1], res[[1]])
do.call(cbind,
lapply(seq_along(tmp),
function(i)
setNames(tmp[[i]],
paste(names(tmp)[i], names(tmp[[i]]), sep = "_"))))
"DF" is:
DF = structure(list(pp_value_1 = c(17L, 129L, 107L, 57L, 57L, 80L,
128L, 31L, 18L), pp_value_2 = c(73L, 516L, 436L, 244L, 227L,
318L, 529L, 127L, 73L), pp_value_3 = c(53L, 34L, 44L, 53L, 29L,
47L, 56L, 53L, 47L), pp_filename = structure(c(9L, 5L, 3L, 6L,
1L, 2L, 4L, 7L, 8L), .Label = c("AA1", "AA2", "AA3", "BB4", "BB5",
"BB6", "CC1", "CC2", "CC3"), class = "factor"), nn_value_1 = c(5L,
44L, 29L, 21L, 21L, 18L, 43L, 7L, NA), nn_value_2 = c(29L, 217L,
147L, 108L, 104L, 89L, 222L, 38L, NA), nn_value_3 = c(53L, 42L,
53L, 53L, 39L, 47L, 54L, 53L, NA), nn_filename = structure(c(1L,
3L, 7L, 4L, 2L, 6L, 5L, 8L, NA), .Label = c("AA2", "AA6", "BB1",
"BB2", "CC1", "CC3", "CC7", "DD4"), class = "factor"), mm_value_1 = c(11L,
36L, 30L, 14L, 9L, 37L, 36L, NA, NA), mm_value_2 = c(56L, 190L,
155L, 77L, 48L, 200L, 202L, NA, NA), mm_value_3 = c(34L, 39L,
31L, 61L, 44L, 44L, 50L, NA, NA), mm_filename = structure(c(1L,
2L, 5L, 3L, 4L, 7L, 6L, NA, NA), .Label = c("AA2", "BB1", "BB4",
"BB6", "CC1", "CC3", "DD3"), class = "factor")), .Names = c("pp_value_1",
"pp_value_2", "pp_value_3", "pp_filename", "nn_value_1", "nn_value_2",
"nn_value_3", "nn_filename", "mm_value_1", "mm_value_2", "mm_value_3",
"mm_filename"), class = "data.frame", row.names = c(NA, -9L))