Search code examples
rdataframefunctiondateapply

Return the name of a column which matches a date value


I have a data.frame with about 66k rows and 150 columns. For the purposes of this question, this is a sample. All columns are date objects.

structure(list(fixed_date = structure(c(19267, 19239, 19120, 
18035, 19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224, 
19624, 19211, 19211, 19211, 18409, 18409, 18558, 19876, 19267
), class = "Date"), BEGIN1 = structure(c(19240, 19212, 19086, 
18008, 19240, 19177, 19086, 19751, 19086, 19177, 18197, 18197, 
19597, 19177, 19177, 18197, 18197, 18197, 18477, 18477, 19240
), class = "Date"), END1 = structure(c(19267, 19239, 19120, 18035, 
19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224, 19624, 
19211, 19211, 18224, 18224, 18224, 18488, 18488, 19267), class = "Date"), 
    BEGIN2 = structure(c(NA, NA, 18043, NA, NA, NA, NA, NA, 18232, 
    NA, NA, NA, NA, NA, 18232, 18232, 18232, 18232, 18512, 19849, 
    18484), class = "Date"), END2 = structure(c(NA, NA, 18070, 
    NA, NA, NA, NA, NA, 18280, NA, NA, NA, NA, NA, 18280, 18280, 
    18280, 18280, 18523, 19876, 18495), class = "Date"), BEGIN3 = structure(c(NA, 
    NA, 18897, NA, NA, NA, NA, NA, 18323, NA, NA, NA, NA, NA, 
    18354, 18354, 18396, 18396, 18547, 19793, 18519), class = "Date"), 
    END3 = structure(c(NA, NA, 18924, NA, NA, NA, NA, NA, 18350, 
    NA, NA, NA, NA, NA, 18381, 18381, 18409, 18409, 18558, 19827, 
    18530), class = "Date"), BEGIN4 = structure(c(NA, NA, 18113, 
    NA, NA, NA, NA, NA, 18687, NA, NA, NA, NA, NA, 18722, 18372, 
    NA, NA, NA, NA, 18554), class = "Date"), END4 = structure(c(NA, 
    NA, 18140, NA, NA, NA, NA, NA, 18700, NA, NA, NA, NA, NA, 
    18749, 18399, NA, NA, NA, NA, 18565), class = "Date"), BEGIN5 = structure(c(NA, 
    NA, 18148, NA, NA, NA, NA, NA, 18407, NA, NA, NA, NA, NA, 
    18407, 18407, NA, NA, NA, NA, NA), class = "Date"), END5 = structure(c(NA, 
    NA, 18175, NA, NA, NA, NA, NA, 18434, NA, NA, NA, NA, NA, 
    18434, 18434, NA, NA, NA, NA, NA), class = "Date")), row.names = c(NA, 
-21L), class = c("tbl_df", "tbl", "data.frame"))

What I want is a column that gives me the name of the column across columns 2 and 11 which is equal to the value of column 1. This runs with no errors but no column is created:

sample$match_date_<- apply(sample[,c(2:11)], 1, 
        function(x) names(which(x == sample[,1])))  

I don't know why this isn't working as expected and also I want to achieve a faster solution since I guess using apply will be slow. Any advice will be much appreciated.


Solution

  • As a tidyverse / dplyr package user, my suggestion is to use the column names as "variable" for easier manipulations, with pivot_longer(). If you anticipate that there will be a single match on every row, you can use filter(). I don't think that it will slow, some seconds on bigger dataset maybe.

    library(tidyverse) 
    
    df = structure(list(fixed_date = structure(c(19267, 19239, 19120, 
                                                 18035, 19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224, 
                                                 19624, 19211, 19211, 19211, 18409, 18409, 18558, 19876, 19267
    ), class = "Date"), BEGIN1 = structure(c(19240, 19212, 19086, 
                                             18008, 19240, 19177, 19086, 19751, 19086, 19177, 18197, 18197, 
                                             19597, 19177, 19177, 18197, 18197, 18197, 18477, 18477, 19240
    ), class = "Date"), END1 = structure(c(19267, 19239, 19120, 18035, 
                                           19253, 19211, 19120, 19778, 19120, 19211, 18224, 18224, 19624, 
                                           19211, 19211, 18224, 18224, 18224, 18488, 18488, 19267), class = "Date"), 
    BEGIN2 = structure(c(NA, NA, 18043, NA, NA, NA, NA, NA, 18232, 
                         NA, NA, NA, NA, NA, 18232, 18232, 18232, 18232, 18512, 19849, 
                         18484), class = "Date"), END2 = structure(c(NA, NA, 18070, 
                                                                     NA, NA, NA, NA, NA, 18280, NA, NA, NA, NA, NA, 18280, 18280, 
                                                                     18280, 18280, 18523, 19876, 18495), class = "Date"), BEGIN3 = structure(c(NA, 
                                                                                                                                               NA, 18897, NA, NA, NA, NA, NA, 18323, NA, NA, NA, NA, NA, 
                                                                                                                                               18354, 18354, 18396, 18396, 18547, 19793, 18519), class = "Date"), 
    END3 = structure(c(NA, NA, 18924, NA, NA, NA, NA, NA, 18350, 
                       NA, NA, NA, NA, NA, 18381, 18381, 18409, 18409, 18558, 19827, 
                       18530), class = "Date"), BEGIN4 = structure(c(NA, NA, 18113, 
                                                                     NA, NA, NA, NA, NA, 18687, NA, NA, NA, NA, NA, 18722, 18372, 
                                                                     NA, NA, NA, NA, 18554), class = "Date"), END4 = structure(c(NA, 
                                                                                                                                 NA, 18140, NA, NA, NA, NA, NA, 18700, NA, NA, NA, NA, NA, 
                                                                                                                                 18749, 18399, NA, NA, NA, NA, 18565), class = "Date"), BEGIN5 = structure(c(NA, 
                                                                                                                                                                                                             NA, 18148, NA, NA, NA, NA, NA, 18407, NA, NA, NA, NA, NA, 
                                                                                                                                                                                                             18407, 18407, NA, NA, NA, NA, NA), class = "Date"), END5 = structure(c(NA, 
                                                                                                                                                                                                                                                                                    NA, 18175, NA, NA, NA, NA, NA, 18434, NA, NA, NA, NA, NA, 
                                                                                                                                                                                                                                                                                    18434, 18434, NA, NA, NA, NA, NA), class = "Date")), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                      -21L), class = c("tbl_df", "tbl", "data.frame"))
    
    
    df %>% mutate(id = 1:n() ) %>% 
      pivot_longer(-c(fixed_date, id) )  %>% 
      group_by(id) %>% 
      filter(fixed_date ==value)
    #> # A tibble: 20 × 4
    #> # Groups:   id [20]
    #>    fixed_date    id name  value     
    #>    <date>     <int> <chr> <date>    
    #>  1 2022-10-02     1 END1  2022-10-02
    #>  2 2022-09-04     2 END1  2022-09-04
    #>  3 2022-05-08     3 END1  2022-05-08
    #>  4 2019-05-19     4 END1  2019-05-19
    #>  5 2022-09-18     5 END1  2022-09-18
    #>  6 2022-08-07     6 END1  2022-08-07
    #>  7 2022-05-08     7 END1  2022-05-08
    #>  8 2024-02-25     8 END1  2024-02-25
    #>  9 2022-05-08     9 END1  2022-05-08
    #> 10 2022-08-07    10 END1  2022-08-07
    #> 11 2019-11-24    11 END1  2019-11-24
    #> 12 2019-11-24    12 END1  2019-11-24
    #> 13 2023-09-24    13 END1  2023-09-24
    #> 14 2022-08-07    14 END1  2022-08-07
    #> 15 2022-08-07    15 END1  2022-08-07
    #> 16 2020-05-27    17 END3  2020-05-27
    #> 17 2020-05-27    18 END3  2020-05-27
    #> 18 2020-10-23    19 END3  2020-10-23
    #> 19 2024-06-02    20 END2  2024-06-02
    #> 20 2022-10-02    21 END1  2022-10-02
    

    Created on 2024-06-13 with reprex v2.1.0