I have a large df with Columns: "Gene, variable, value" to which I would like to add a new variable "light_exposure". To assign the value to this new column I would like to see if there is a partial string match of "i13" within the variable column, and assign values of "yes" or "no" accordingly in the "light_exposure" column.
The column "variable" contains subject IDs and a typical one looks like "sem_outdir_NAi13t3n27_S4"
This is the code I thought would work. However it assigns yes in the new column even when there is not a match present.
df_long <- df_long %>%
group_by(Gene) %>%
mutate(Light_exposure = case_when(sum(grep("i13", paste(variable))) > 0 ~ "yes",
TRUE ~ "no")) %>% ungroup()
df <- structure(list(rsem_outdir_NAi13t3n2_S1 = c(756.624, 0, 357.557,
1.1, 0, 0), rsem_outdir_NAi13t3n14_S2 = c(906.464, 0, 352.441,
1.022, 0, 0), rsem_outdir_NAi13t3n26_S3 = c(307.957, 0, 211.422,
1.04, 0, 0), rsem_outdir_NAi13t3n27_S4 = c(739.13, 0, 268.329,
1.243, 0, 0), rsem_outdir_NAi13t3n31_S5 = c(206.39, 0, 205.709,
1.052, 0, 0), rsem_outdir_NAi6t3n1_S6 = c(154.113, 0, 247.21,
1.458, 0, 0), rsem_outdir_NAi6t3n10_S7 = c(220.125, 0, 346.917,
0.378, 0, 0), rsem_outdir_NAi6t3n16_S8 = c(554.147, 0, 252.998,
0.503, 0, 0), rsem_outdir_NAi6t3n24_S9 = c(317.122, 0, 325.901,
1.32, 0, 0), rsem_outdir_NAi6t3n29_S10 = c(183.428, 0, 216.055,
1.47, 0, 0), rsem_outdir_NAi13t12n7_S11 = c(125.687, 0, 157.471,
0.41, 0, 0), rsem_outdir_NAi13t12n10_S12 = c(403.675, 0, 224.673,
1.034, 0, 0), rsem_outdir_NAi13t12n11_S13 = c(1387.289, 0, 429.794,
1.657, 0, 0), rsem_outdir_NAi13t12n16_S14 = c(470.699, 0, 145.129,
0.949, 0, 0), rsem_outdir_NAi13t12n19_S15 = c(401.953, 0, 164.364,
1.143, 0, 0), rsem_outdir_NAi6t12n3_S16 = c(534.054, 0, 240.231,
0.814, 0, 0), rsem_outdir_NAi6t12n4_S17 = c(420.981, 0, 204.186,
0.866, 0, 0), rsem_outdir_NAi6t12n18_S18 = c(509.926, 0, 263.276,
0.716, 0, 0), rsem_outdir_NAi6t12n23_S19 = c(603.937, 0, 249.055,
1.038, 0, 0), rsem_outdir_NAi6t12n27_S20 = c(374.14, 0, 293.112,
1.163, 0, 0)), row.names = c("TRINITY_DN0_c0_g1", "TRINITY_DN0_c13_g1",
"TRINITY_DN0_c1_g1", "TRINITY_DN0_c1_g2", "TRINITY_DN0_c2_g1",
"TRINITY_DN0_c40_g1"), class = "data.frame")))
> df
Not sure if this is helpful but here is the sessionInfo
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] reshape_0.8.9 forcats_0.5.1 stringr_1.5.0 dplyr_1.1.4 purrr_1.0.2
[6] readr_2.1.2 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.5.0 tidyverse_1.3.1
loaded via a namespace (and not attached):
[1] tidyselect_1.2.0 xfun_0.39 haven_2.4.3 colorspace_2.1-0
[5] vctrs_0.6.5 generics_0.1.3 htmltools_0.5.5 yaml_2.3.7
[9] utf8_1.2.3 rlang_1.1.2 pillar_1.9.0 glue_1.6.2
[13] withr_2.5.0 DBI_1.1.1 dbplyr_2.1.1 modelr_0.1.8
[17] readxl_1.3.1 plyr_1.8.6 lifecycle_1.0.3 munsell_0.5.0
[21] gtable_0.3.3 cellranger_1.1.0 rvest_1.0.2 evaluate_0.23
[25] knitr_1.43 tzdb_0.2.0 fastmap_1.1.1 fansi_1.0.4
[29] broom_1.0.1 Rcpp_1.0.10 scales_1.3.0 backports_1.3.0
[33] jsonlite_1.8.8 fs_1.6.1 hms_1.1.1 digest_0.6.33
[37] stringi_1.7.12 grid_4.1.2 cli_3.6.1 tools_4.1.2
[41] magrittr_2.0.3 crayon_1.5.2 pkgconfig_2.0.3 ellipsis_0.3.2
[45] xml2_1.3.3 reprex_2.0.1 lubridate_1.8.0 assertthat_0.2.1
[49] rmarkdown_2.21 httr_1.4.5 rstudioapi_0.13 R6_2.5.1
[53] compiler_4.1.2
I suspect you are looking for
df |>
tibble::rownames_to_column("Gene") |>
tidyr::pivot_longer(cols = -Gene,
names_to = "Variable", values_to = "Value") |>
dplyr::mutate(Light_exposure = grepl("i13", Variable))
giving
# A tibble: 120 × 4
Gene Variable Value Light_exposure
<chr> <chr> <dbl> <lgl>
1 TRINITY_DN0_c0_g1 rsem_out… 757. TRUE
2 TRINITY_DN0_c0_g1 rsem_out… 906. TRUE
3 TRINITY_DN0_c0_g1 rsem_out… 308. TRUE
4 TRINITY_DN0_c0_g1 rsem_out… 739. TRUE
5 TRINITY_DN0_c0_g1 rsem_out… 206. TRUE
6 TRINITY_DN0_c0_g1 rsem_out… 154. FALSE
7 TRINITY_DN0_c0_g1 rsem_out… 220. FALSE
8 TRINITY_DN0_c0_g1 rsem_out… 554. FALSE
9 TRINITY_DN0_c0_g1 rsem_out… 317. FALSE
10 TRINITY_DN0_c0_g1 rsem_out… 183. FALSE
# ℹ 110 more rows
# ℹ Use `print(n = ...)` to see more rows
We first move the row names to a new variable Gene
to keep track of them. We then pivot the data to long format, and finally identify which strings (Variable
) contain i13
.
Notice that, it is easier and more common to work with logical TRUE
/FALSE
than with character "no"
/"yes"
.
Input
df <- structure(list(rsem_outdir_NAi13t3n2_S1 = c(756.624, 0, 357.557,
1.1, 0, 0), rsem_outdir_NAi13t3n14_S2 = c(906.464, 0, 352.441,
1.022, 0, 0), rsem_outdir_NAi13t3n26_S3 = c(307.957, 0, 211.422,
1.04, 0, 0), rsem_outdir_NAi13t3n27_S4 = c(739.13, 0, 268.329,
1.243, 0, 0), rsem_outdir_NAi13t3n31_S5 = c(206.39, 0, 205.709,
1.052, 0, 0), rsem_outdir_NAi6t3n1_S6 = c(154.113, 0, 247.21,
1.458, 0, 0), rsem_outdir_NAi6t3n10_S7 = c(220.125, 0, 346.917,
0.378, 0, 0), rsem_outdir_NAi6t3n16_S8 = c(554.147, 0, 252.998,
0.503, 0, 0), rsem_outdir_NAi6t3n24_S9 = c(317.122, 0, 325.901,
1.32, 0, 0), rsem_outdir_NAi6t3n29_S10 = c(183.428, 0, 216.055,
1.47, 0, 0), rsem_outdir_NAi13t12n7_S11 = c(125.687, 0, 157.471,
0.41, 0, 0), rsem_outdir_NAi13t12n10_S12 = c(403.675, 0, 224.673,
1.034, 0, 0), rsem_outdir_NAi13t12n11_S13 = c(1387.289, 0, 429.794,
1.657, 0, 0), rsem_outdir_NAi13t12n16_S14 = c(470.699, 0, 145.129,
0.949, 0, 0), rsem_outdir_NAi13t12n19_S15 = c(401.953, 0, 164.364,
1.143, 0, 0), rsem_outdir_NAi6t12n3_S16 = c(534.054, 0, 240.231,
0.814, 0, 0), rsem_outdir_NAi6t12n4_S17 = c(420.981, 0, 204.186,
0.866, 0, 0), rsem_outdir_NAi6t12n18_S18 = c(509.926, 0, 263.276,
0.716, 0, 0), rsem_outdir_NAi6t12n23_S19 = c(603.937, 0, 249.055,
1.038, 0, 0), rsem_outdir_NAi6t12n27_S20 = c(374.14, 0, 293.112,
1.163, 0, 0)), row.names = c("TRINITY_DN0_c0_g1", "TRINITY_DN0_c13_g1",
"TRINITY_DN0_c1_g1", "TRINITY_DN0_c1_g2", "TRINITY_DN0_c2_g1",
"TRINITY_DN0_c40_g1"), class = "data.frame")))