I am very very new to programming and R and I already got a complex task and I need help.
Let's assume I have this data set
#1 | #2 | #3 | #4 |
---|---|---|---|
NA | a | NA | b |
a | b | c | d |
h | i | a | d |
NA | t | h | NA |
I need to output permutations without repetitions and without exchanges ignoring NA of each row. So first row would provide "ab" value (no "ba" value). Second row would output "ab", "ac", "ad", "bc", "bd", "cd". Output for better understanding:
#1 | #2 | #3 | #4 | perm.1 | perm.2 | perm.3 | perm.4 | perm.5 | perm.6 |
---|---|---|---|---|---|---|---|---|---|
NA | a | NA | b | ab | NA | NA | NA | NA | NA |
a | b | c | d | ab | ac | ad | bc | bd | cd |
h | i | a | d | hi | ha | hd | ai | id | ad |
NA | t | h | c | th | tc | hc | NA | NA | NA |
Also, I would need to do these steps after I have permutations listed but I will sort out myself.
For example in this case we have ab = 2, ad = 2 and all other permutations are = 1
I tried to play around with gtools permutations and combinations functions but I was not able to come close to solution.
Data frame consists of 30k rows and 42 columns.
head(df)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
1 ara urd <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 eng spa rus <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
3 <NA> eng ger <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
4 <NA> fre ger eng <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
5 <NA> eng fre <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
6 <NA> eng fre slo por dut pol <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
7 <NA> eng ger fre ita spa <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
8 <NA> ger eng fre <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
9 <NA> fre slo por dut pol cat <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
10 <NA> eng fre <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
34 35 36 37 38 39 40 41 42 id
1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 1
2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 2
3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 3
4 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 4
5 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 5
6 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 6
7 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 7
8 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 8
9 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 9
10 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 10
dput(head(df, 10)
structure(list(`1` = c("ara", "eng", NA, NA, NA, NA, NA, NA,
NA, NA), `2` = c("urd", "spa", "eng", "fre", "eng", "eng", "eng",
"ger", "fre", "eng"), `3` = c(NA, "rus", "ger", "ger", "fre",
"fre", "ger", "eng", "slo", "fre"), `4` = c(NA, NA, NA, "eng",
NA, "slo", "fre", "fre", "por", NA), `5` = c(NA, NA, NA, NA,
NA, "por", "ita", NA, "dut", NA), `6` = c(NA, NA, NA, NA, NA,
"dut", "spa", NA, "pol", NA), `7` = c(NA, NA, NA, NA, NA, "pol",
NA, NA, "cat", NA), `8` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), `9` = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `10` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
`11` = c(NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), `12` = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), `13` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `14` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `15` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `16` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `17` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `18` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `19` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `20` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `21` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `22` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `23` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `24` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `25` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `26` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `27` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `28` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `29` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `30` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `31` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `32` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `33` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `34` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `35` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `36` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `37` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `38` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `39` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `40` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), `41` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), `42` = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), id = 1:10), row.names = c(NA, 10L), class = "data.frame")
As to the first part of your question to get an ouptut with the combinations, you can do:
# Creating the test data. Note how I added an ID column that we need to the grouping and identification
df <- matrix(data = sample(c(NA, letters), size = 30000*42, replace = TRUE), ncol = 42, nrow = 30000) |>
as.data.frame() |>
as_tibble() |>
mutate(id = row_number())
# Now run everything
df_new <- df |>
pivot_longer(-id) |>
filter(!is.na(value)) |>
group_by(id) |>
summarize(x1 = combn(value, 2, simplify = TRUE)[1,],
x2 = combn(value, 2, simplify = TRUE)[2,]) |>
mutate(perm_number = 1:n()) |>
ungroup() |>
mutate(perms = str_c(x1, x2)) |>
select(-x1, -x2) |>
pivot_wider(values_from = perms,
names_from = perm_number,
names_prefix = "perms.") |>
left_join(x = df,
y = _,
by = "id")
With the above test data, the runtime is ~34s for me which I think is good enough for the size of the task:
user system elapsed
34.34 1.00 35.49
# Using the initial test data with 4 rows, the result would look like:
x1 x2 x3 x4 id perms.1 perms.2 perms.3 perms.4 perms.5 perms.6
1 <NA> a <NA> b 1 ab <NA> <NA> <NA> <NA> <NA>
2 a b c d 2 ab ac ad bc bd cd
3 h i a d 3 hi ha hd ia id ad
4 <NA> t h <NA> 4 th <NA> <NA> <NA> <NA> <NA>
# Create the data
df <- data.frame(x1 = c(NA, "a", "h", NA),
x2 = c("a", "b", "i", "t"),
x3 = c(NA, "c", "a", "h"),
x4 = c("b", "d", "d", NA))
df |>
mutate(perms = apply(across(everything()), 1, function(x) combn(x[!is.na(x)], 2, simplify = FALSE))) |>
mutate(perm_length = max(lengths(perms))) |>
unnest_wider(perms) |>
rename_with(.cols = starts_with("..."),
.fn = ~paste0("perm.", str_remove(., "..."))) |>
mutate(across(starts_with("perm."), ~unlist(map(.x = .,
.f = ~ str_c(unlist(.), collapse = "")))),
across(starts_with("perm."), ~ifelse(. == "", NA_character_, .)))
which gives:
# A tibble: 4 × 11
x1 x2 x3 x4 perm.1 perm.2 perm.3 perm.4 perm.5 perm.6 perm_length
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 NA a NA b ab NA NA NA NA NA 6
2 a b c d ab ac ad bc bd cd 6
3 h i a d hi ha hd ia id ad 6
4 NA t h NA th NA NA NA NA NA 6