Search code examples
rdataframemergeconditional-statements

Binding columns of two dataframes of different lengths in R tidyverse with grouped vectors


I'm trying to column bind two dataframes: the first is a time-series with numerous columns and ~10,000 rows. The second is a derivative of the first with correlation coefficients among other relevant vectors.

The 2nd df is obviously much shorter than the first. Both dfs have grouping columns of species and region.

Goal: I would like to add the results of the shorter 2nd df to the first, matching by species and region columns.

I've tried merge, full_join, and other join functions but these either result in NAs or add rows to the first df. How would I go about matching and merging these dfs in a tidy universe?

Below is a minimum reproducible example of the dfs and desired output:

set.seed(123)

ebird <- data.frame(
  
  region = c("one", "one", "one", "one", "one",
             "one", "one", "one", "one", "one",
             "one", "one", "one", "one", "one",
             "two", "two", "two", "two", "two",
             "two", "two", "two", "two", "two",
             "two", "two", "two", "two", "two",
             "three", "three", "three", "three", "three",
             "three", "three", "three", "three", "three",
             "three", "three", "three", "three", "three"),
  Species = c("A", "A", "A", "A", "A",
              "B", "B", "B", "B", "B",
              "C", "C", "C", "C", "C",
              "A", "A", "A", "A", "A",
              "B", "B", "B", "B", "B",
              "C", "C", "C", "C", "C",
              "A", "A", "A", "A", "A",
              "B", "B", "B", "B", "B",
              "C", "C", "C", "C", "C"),
  value = sample(seq(from = 1, to = 45, by = 1), replace = TRUE), # abundance
  date = seq(from = 1, to = 5, by = 1)
  
) 

cor <- data.frame(
  
  region = c("one", "one", "one",
             "two", "two", "two",
             "three", "three", "three"),
  Species = c("A", "B", "C",
              "A", "B", "C",
              "A", "B", "C"),
  rho = sample(seq(from = -1, to = 1, by = 0.1), size = 9, replace = TRUE),
  date_max_value = sample(seq(from = 1, to = 5, by = 1), size = 9, replace = TRUE) # date of max abundance (value)

)

desired_output <- data.frame(
  
  region = c("one", "one", "one", "one", "one",
             "one", "one", "one", "one", "one",
             "one", "one", "one", "one", "one",
             "two", "two", "two", "two", "two",
             "two", "two", "two", "two", "two",
             "two", "two", "two", "two", "two",
             "three", "three", "three", "three", "three",
             "three", "three", "three", "three", "three",
             "three", "three", "three", "three", "three"),
  Species = c("A", "A", "A", "A", "A",
              "B", "B", "B", "B", "B",
              "C", "C", "C", "C", "C",
              "A", "A", "A", "A", "A",
              "B", "B", "B", "B", "B",
              "C", "C", "C", "C", "C",
              "A", "A", "A", "A", "A",
              "B", "B", "B", "B", "B",
              "C", "C", "C", "C", "C"),
  value = sample(seq(from = 1, to = 45, by = 1), replace = TRUE), # abundance
  date = seq(from = 1, to = 5, by = 1),
  rho = c(rep(-0.6, 5), rep(-0.3, 5), rep(0.1, 5), rep(-0.2, 5), rep(0.7, 5),
          rep(-1.0, 5), rep(-0.5, 5), rep(1.0, 5), rep(0.4, 5)),
  date_max_value = c(rep(1, 5), rep(2, 5), rep(4, 5), rep(4, 5), rep(3, 5),
                     rep(1, 5), rep(2, 5), rep(1, 5), rep(2, 5))
  
) 

EDIT Adding dput(head(df1)) and dput(head(df2)) so readers understand structure of the data. Note: I am attempting to left_join on the Species column, not the species column - I understand that could create some confusion given that my species was not capitalized in my toy example:

dput(head(ebird))
structure(list(species = c("Mallard", "Mallard", "Mallard", "Mallard", 
"Mallard", "Mallard"), week = structure(c(19255, 19262, 19276, 
19283, 19290, 19304), class = "Date"), median = c(3.28408646583557, 
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313, 
118.688171863556), region = c("Adams", "Adams", "Adams", "Adams", 
"Adams", "Adams"), month = c(9, 9, 10, 10, 10, 11), lower = c(2.58288896083832, 
2.33652949333191, 5.62750267982483, 15.5113918781281, 24.3405115604401, 
102.083569049835), upper = c(3.94177603721619, 2.92097306251526, 
7.13815760612488, 17.6045758724213, 29.9772963523865, 141.712877750397
), Date = structure(c(19255, 19262, 19276, 19283, 19290, 19304
), class = "Date"), week1 = c(38, 39, 41, 42, 43, 45), Species = c("MALL", 
"MALL", "MALL", "MALL", "MALL", "MALL"), survey = c(0, 0, 0, 
0, 0, 100), year = c(2022, 2022, 2022, 2022, 2022, 2022), day = c(20L, 
27L, 11L, 18L, 25L, 8L), Date1 = structure(c(19255, 19262, 19276, 
19283, 19290, 19304), class = "Date"), difference = c(3.28408646583557, 
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313, 
18.6881718635559), max_survey = c(500, 500, 500, 500, 500, 500
), max_ebird = c(231.35857963562, 231.35857963562, 231.35857963562, 
231.35857963562, 231.35857963562, 231.35857963562), max_ebird_upper = c(262.414058685303, 
262.414058685303, 262.414058685303, 262.414058685303, 262.414058685303, 
262.414058685303), max_ebird_lower = c(211.088241577148, 211.088241577148, 
211.088241577148, 211.088241577148, 211.088241577148, 211.088241577148
), scaled_ebird = c(0.0141947900570961, 0.0115574002394934, 0.0268204184375627, 
0.0715650209420927, 0.115733997124301, 0.513005275406189), scaled_ebird_upper = c(0.0150212075411071, 
0.0111311607203873, 0.0272018871316847, 0.0670870149283174, 0.114236624754684, 
0.540035387053498), scaled_ebird_lower = c(0.0122360627078999, 
0.011068970378807, 0.0266594796459475, 0.0734829745239929, 0.115309651445195, 
0.483606136879613), scaled_survey = c(0, 0, 0, 0, 0, 0.2), zscaled_ebird = c(-1.15368075571164, 
-1.16038986114502, -1.12156314025646, -1.00773989965177, -0.895380961995726, 
0.115214769380754), zscaled_ebird_upper = c(-1.14916660942026, 
-1.15919230741443, -1.11777371864876, -1.01497900584015, -0.893461766987318, 
0.203936283432415), zscaled_ebird_lower = c(-1.15213092209424, 
-1.15506773303076, -1.11583657969332, -0.998012352409944, -0.892761860089624, 
0.0340004185595489), zscaled_survey = c(-0.639688641399654, -0.639688641399654, 
-0.639688641399654, -0.639688641399654, -0.639688641399654, -0.0156021619853574
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
> dput(head(cor))
structure(list(Date1 = structure(c(19255, 19241, 19234, 19234, 
19234, 19234), class = "Date"), survey = c(0, 0, 0, 0, 0, 0), 
    Species = c("ABDU ", "ABDU ", "ABDU ", "ABDU ", "ABDU ", 
    "ABDU "), region = c(" Adams", " Appanoose/Lucas", " Bremer", 
    " Buena Vista", " Butler", " Calhoun"), Parameter1 = c("scaled_ebird", 
    "scaled_ebird", "scaled_ebird", "scaled_ebird", "scaled_ebird", 
    "scaled_ebird"), Parameter2 = c("scaled_survey", "scaled_survey", 
    "scaled_survey", "scaled_survey", "scaled_survey", "scaled_survey"
    ), rho = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), CI = c(0.95, 0.95, 0.95, 0.95, 0.95, 0.95), CI_low = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CI_high = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), S = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), p = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Method = c("NA correlation", 
    "NA correlation", "NA correlation", "NA correlation", "NA correlation", 
    "NA correlation"), n_Obs = c(0L, 0L, 0L, 0L, 0L, 0L), state = c("IA", 
    "IA", "IA", "IA", "IA", "IA")), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))

Solution

  • There are no matching records in the real data sample you provided, so I changed the values of cor$Species to `"MALL ", retaining the trailing space as in the original data.

    I also noted that cor$region has a leading space.

    Try this:

    ebird %>%
      inner_join(
        cor %>%
          mutate(
            region = str_trim(region),
            Species = str_trim(Species)
          ),
        by = c("region", "Species")
      )
    

    It trims leading and trailing whitespace from the key columns in cor.

    This gives reasonable results with the toy data (although the values in the value column do not appear to match between ebird and desired_output).

    With the amended real data it yields only 6 rows (matching rows in ebird) and picks out the corresponding values from cor).

    If this doesn't do the job, perhaps you could provide a subset of the real data where there is a match between records?