I'm trying to column bind two dataframes: the first is a time-series with numerous columns and ~10,000 rows. The second is a derivative of the first with correlation coefficients among other relevant vectors.
The 2nd df is obviously much shorter than the first. Both dfs have grouping columns of species
and region
.
Goal: I would like to add the results of the shorter 2nd df to the first, matching by species
and region
columns.
I've tried merge
, full_join
, and other join
functions but these either result in NAs
or add rows to the first df. How would I go about matching and merging these dfs in a tidy
universe?
Below is a minimum reproducible example of the dfs and desired output:
set.seed(123)
ebird <- data.frame(
region = c("one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three"),
Species = c("A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
value = sample(seq(from = 1, to = 45, by = 1), replace = TRUE), # abundance
date = seq(from = 1, to = 5, by = 1)
)
cor <- data.frame(
region = c("one", "one", "one",
"two", "two", "two",
"three", "three", "three"),
Species = c("A", "B", "C",
"A", "B", "C",
"A", "B", "C"),
rho = sample(seq(from = -1, to = 1, by = 0.1), size = 9, replace = TRUE),
date_max_value = sample(seq(from = 1, to = 5, by = 1), size = 9, replace = TRUE) # date of max abundance (value)
)
desired_output <- data.frame(
region = c("one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"one", "one", "one", "one", "one",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"two", "two", "two", "two", "two",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three",
"three", "three", "three", "three", "three"),
Species = c("A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C",
"A", "A", "A", "A", "A",
"B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
value = sample(seq(from = 1, to = 45, by = 1), replace = TRUE), # abundance
date = seq(from = 1, to = 5, by = 1),
rho = c(rep(-0.6, 5), rep(-0.3, 5), rep(0.1, 5), rep(-0.2, 5), rep(0.7, 5),
rep(-1.0, 5), rep(-0.5, 5), rep(1.0, 5), rep(0.4, 5)),
date_max_value = c(rep(1, 5), rep(2, 5), rep(4, 5), rep(4, 5), rep(3, 5),
rep(1, 5), rep(2, 5), rep(1, 5), rep(2, 5))
)
EDIT Adding dput(head(df1))
and dput(head(df2))
so readers understand structure of the data. Note: I am attempting to left_join
on the Species
column, not the species
column - I understand that could create some confusion given that my species was not capitalized in my toy example:
dput(head(ebird))
structure(list(species = c("Mallard", "Mallard", "Mallard", "Mallard",
"Mallard", "Mallard"), week = structure(c(19255, 19262, 19276,
19283, 19290, 19304), class = "Date"), median = c(3.28408646583557,
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313,
118.688171863556), region = c("Adams", "Adams", "Adams", "Adams",
"Adams", "Adams"), month = c(9, 9, 10, 10, 10, 11), lower = c(2.58288896083832,
2.33652949333191, 5.62750267982483, 15.5113918781281, 24.3405115604401,
102.083569049835), upper = c(3.94177603721619, 2.92097306251526,
7.13815760612488, 17.6045758724213, 29.9772963523865, 141.712877750397
), Date = structure(c(19255, 19262, 19276, 19283, 19290, 19304
), class = "Date"), week1 = c(38, 39, 41, 42, 43, 45), Species = c("MALL",
"MALL", "MALL", "MALL", "MALL", "MALL"), survey = c(0, 0, 0,
0, 0, 100), year = c(2022, 2022, 2022, 2022, 2022, 2022), day = c(20L,
27L, 11L, 18L, 25L, 8L), Date1 = structure(c(19255, 19262, 19276,
19283, 19290, 19304), class = "Date"), difference = c(3.28408646583557,
2.67390370368958, 6.20513391494751, 16.557181596756, 26.7760531902313,
18.6881718635559), max_survey = c(500, 500, 500, 500, 500, 500
), max_ebird = c(231.35857963562, 231.35857963562, 231.35857963562,
231.35857963562, 231.35857963562, 231.35857963562), max_ebird_upper = c(262.414058685303,
262.414058685303, 262.414058685303, 262.414058685303, 262.414058685303,
262.414058685303), max_ebird_lower = c(211.088241577148, 211.088241577148,
211.088241577148, 211.088241577148, 211.088241577148, 211.088241577148
), scaled_ebird = c(0.0141947900570961, 0.0115574002394934, 0.0268204184375627,
0.0715650209420927, 0.115733997124301, 0.513005275406189), scaled_ebird_upper = c(0.0150212075411071,
0.0111311607203873, 0.0272018871316847, 0.0670870149283174, 0.114236624754684,
0.540035387053498), scaled_ebird_lower = c(0.0122360627078999,
0.011068970378807, 0.0266594796459475, 0.0734829745239929, 0.115309651445195,
0.483606136879613), scaled_survey = c(0, 0, 0, 0, 0, 0.2), zscaled_ebird = c(-1.15368075571164,
-1.16038986114502, -1.12156314025646, -1.00773989965177, -0.895380961995726,
0.115214769380754), zscaled_ebird_upper = c(-1.14916660942026,
-1.15919230741443, -1.11777371864876, -1.01497900584015, -0.893461766987318,
0.203936283432415), zscaled_ebird_lower = c(-1.15213092209424,
-1.15506773303076, -1.11583657969332, -0.998012352409944, -0.892761860089624,
0.0340004185595489), zscaled_survey = c(-0.639688641399654, -0.639688641399654,
-0.639688641399654, -0.639688641399654, -0.639688641399654, -0.0156021619853574
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
> dput(head(cor))
structure(list(Date1 = structure(c(19255, 19241, 19234, 19234,
19234, 19234), class = "Date"), survey = c(0, 0, 0, 0, 0, 0),
Species = c("ABDU ", "ABDU ", "ABDU ", "ABDU ", "ABDU ",
"ABDU "), region = c(" Adams", " Appanoose/Lucas", " Bremer",
" Buena Vista", " Butler", " Calhoun"), Parameter1 = c("scaled_ebird",
"scaled_ebird", "scaled_ebird", "scaled_ebird", "scaled_ebird",
"scaled_ebird"), Parameter2 = c("scaled_survey", "scaled_survey",
"scaled_survey", "scaled_survey", "scaled_survey", "scaled_survey"
), rho = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), CI = c(0.95, 0.95, 0.95, 0.95, 0.95, 0.95), CI_low = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CI_high = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), S = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), p = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Method = c("NA correlation",
"NA correlation", "NA correlation", "NA correlation", "NA correlation",
"NA correlation"), n_Obs = c(0L, 0L, 0L, 0L, 0L, 0L), state = c("IA",
"IA", "IA", "IA", "IA", "IA")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
There are no matching records in the real data sample you provided, so I changed the values of cor$Species
to `"MALL ", retaining the trailing space as in the original data.
I also noted that cor$region
has a leading space.
Try this:
ebird %>%
inner_join(
cor %>%
mutate(
region = str_trim(region),
Species = str_trim(Species)
),
by = c("region", "Species")
)
It trims leading and trailing whitespace from the key columns in cor
.
This gives reasonable results with the toy data (although the values in the value
column do not appear to match between ebird
and desired_output
).
With the amended real data it yields only 6 rows (matching rows in ebird
) and picks out the corresponding values from cor
).
If this doesn't do the job, perhaps you could provide a subset of the real data where there is a match between records?