I have the following df:
df_1=data.frame(col_1=c("a;b;c","c;d","e","f","g","h;j"),col_2=c("1;2;3","4","5;6","7","8;9","10;11;12"))
so I want to separate col_1 into separate rows with corresponding values of col_2 if it exists.
For example if number of elements in col_1= number of elements in col_2 then they should be separated with their corresponding values in col_1 and col_2 ( row 1)
if they have differing number of elements, if one column has only has one element, then that can be separated to different rows as well (row 2)
if they have disproportionate number of elements ( more than 1 each and not equal) then it should be left as-is
here is the final_dataset:
df_2=data.frame(col_1=c("a","b","c","c","d","e","e","f","g","g","h;j"),col_2=c("1","2","3","4","4","5","6","7","8","9","10;11;12"))
We can use cSplit
library(splitstackshape)
library(zoo)
cnt1 <- nchar(gsub(";", "", df_1$col_1))
cnt2 <- nchar(gsub(";", "", df_1$col_2))
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
rbind(cSplit(df_1[!i1,], c('col_1', 'col_2'), sep=";", "long")[
!is.na(col_1)|!is.na(col_2), lapply(.SD, na.locf0)], df_1[i1,])
# col_1 col_2
# 1: a 1
# 2: b 2
# 3: c 3
# 4: c 4
# 5: d 4
# 6: e 5
# 7: e 6
# 8: f 7
# 9: g 8
#10: g 9
#11: h;j 10;11;12
Or using base R
with all the constraints
cnt1 <- nchar(gsub(";", "", df_1$col_1))
cnt2 <- nchar(gsub(";", "", df_1$col_2))
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
lst1 <- lapply(df_1[!i1, ], function(x) strsplit(x, ";"))
out <- rbind(do.call(rbind, Map(function(x, y) {
l1 <- length(x)
l2 <- length(y)
mx <- max(l1, l2)
x <- if(l1 != l2 & l1 == 1) rep(x, mx) else x
y <- if(l1 != l2 & l2 == 1) rep(y, mx) else y
data.frame(col_1 = x, col_2 = y) } ,
lst1[[1]], lst1[[2]])), df_1[i1,])
row.names(out) <- NULL
out
# col_1 col_2
#1 a 1
#2 b 2
#3 c 3
#4 c 4
#5 d 4
#6 e 5
#7 e 6
#8 f 7
#9 g 8
#10 g 9
#11 h;j 10;11;12