I have used rbind
to combine two datasets into an 828 x 5 data frame called vegetation.
Here is an example of my current dataframe:
site year sos eos vegetation
EPIC_alligatorriver.csv 2016 176 301
EPIC_alligatorriver.csv 2018 164 291
Landsat_alligatorriver.csv 2016 170 303
Pheno_alligatorriver.csv 2017 152 288
Landsat_NEON.BART42.csv 2017 115 290
Pheno_NEON.BART42.csv 2017 120 290
alligatorriver.csv NA NA NA deciduous broadleaf
NEON.BART42.csv NA NA NA mixed forest
Here is what I need
site year sos eos vegetation
EPIC_alligatorriver.csv 2016 176 301 deciduous broadleaf
EPIC_alligatorriver.csv 2018 164 291 deciduous broadleaf
Landsat_alligatorriver.csv 2016 170 303 deciduous broadleaf
Pheno_alligatorriver.csv 2017 152 288 deciduous broadleaf
Landsat_NEON.BART42.csv 2017 115 290 mixed forest
Pheno_NEON.BART42.csv 2017 120 290 mixed forest
alligatorriver.csv NA NA NA deciduous broadleaf
NEON.BART42.csv NA NA NA mixed forest
Basically, I need to fill in the blank vegetation column by matching the correct vegetation type based on the site name.
I can also drop the rbind and keep all of the vegetation data in a separate two-column data frame:
site vegetation
alligatorriver.csv deciduous broadleaf
konza.csv grassland
merbleue.csv wetland
NEON.BART42.csv mixed forest
if that makes it easier. There are 7 vegetation types total, and 99 sites over a three year period (2016, 2017, 2018). Any help is appreciated!
df1 (partial): structure(list(site = c("EPIC_alligatorriver.csv", "EPIC_alligatorriver.csv",
"EPIC_alligatorriver.csv", "EPIC_arbutuslakeinlet.csv", "EPIC_arbutuslakeinlet.csv",
"EPIC_arbutuslakeinlet.csv", "EPIC_archboldavir.csv", "EPIC_archboldavir.csv",
"EPIC_archboldavir.csv", "EPIC_archboldavirx.csv"), year = c(2016L, 2017L, 2018L, 2016L, 2017L, 2018L, 2016L,
2017L, 2018L, 2016L), sos = c(117, 111, 122, 147, 145, 144, 98, 156,
114, 98), eos = c(294, 294, 274, 276, 271, 274, 315, 295, 307, 315), vegetation = c("", "", "", "", "", "", "", "", "", "")), row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
df2: structure(list(vegetation = structure(c(2L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 6L, 3L, 2L, 4L, 2L, 2L, 3L, 4L, 2L, 3L, 2L,
1L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 6L, 6L, 4L, 1L, 4L, 1L, 4L, 2L,
3L, 2L, 4L, 7L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 2L, 4L, 7L, 2L,
4L, 2L, 2L, 5L, 2L, 4L, 2L, 5L, 2L, 5L, 2L, 6L, 2L, 5L, 3L, 2L,
2L, 5L, 2L, 4L, 4L, 2L, 4L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 6L, 2L,
1L, 4L, 2L, 1L, 3L, 2L, 2L, 4L, 3L, 2L, 3L, 2L, 2L), .Label = c("agriculture",
"deciduous broadleaf", "evergreen needlefeaf", "grassland", "mixed forest",
"shrub", "wetland"), class = "factor"), site = structure(1:99, .Label = c("alligatorriver",
"arbutuslakeinlet", "archboldavir", "archboldavirx", "archboldpnot",
"archboldpnotx", "arsmnswanlake1", "ashburnham", "bartlettir",
"bostoncommon", "bullshoals", "burnssagebrush", "canadaOBS",
"caryinstitute", "cperuvb", "downerwoods", "dukehw", "goodnow",
"grandteton", "harvard", "harvardbarn", "harvardbarn2", "harvardfarmsouth",
"harvardhemlock", "harvardhemlock2", "harvardlph", "howland1",
"howland2", "hubbardbrook", "huyckpreserveny", "jerbajada", "jernort",
"kansas", "kelloggcorn", "kendall", "kingmanfarm", "konza", "lacclair",
"laclaflamme", "laurentides", "lethbridge", "lostcreek", "luckyhills",
"mandanh5", "mandani2", "mead1", "mead2", "mead3", "meadpasture",
"merbleue", "missouriozarks", "montebondonegrass", "montebondonepeat",
"morganmonroe", "nationalelkrefuge", "ncssm", "NEON.BART33",
"NEON.BART42", "NEON.DELA33", "NEON.DSNY33", "NEON.HARV33", "NEON.HARV42",
"NEON.JERC33", "NEON.JERC42", "NEON.LENO33", "NEON.ONAQ33", "NEON.ORNL33",
"NEON.ORNL42", "NEON.RMNP33", "NEON.SERC33", "NEON.TREE33", "NEON.TREE42",
"NEON.UNDE33", "NEON.WOOD33", "ninemileprairie", "northattkeboroma",
"oakville", "proctor", "queens", "readingma", "rosemountnprs",
"russellsage", "sanford", "sevilletagrass", "sevilletashrub",
"shalehillsczo", "southerngreatplains", "stjones", "sweetbriar",
"sweetbriargrass", "sylvania", "tonzi", "umichbiological2", "usgseros",
"usmpj", "uwmfieldsta", "warrenwilson", "willowcreek", "worcester"
), class = "factor")), class = "data.frame", row.names = c(NA,
-99L))
This can be done with match()
on the site columns of the two data frames, with sub()
stripping the prefix from the site names in the first data frame:
# Fill df$vegetation by looking up each row's site in df2.
#
# df$site carries a sensor prefix ("EPIC_", "Landsat_", "Pheno_") and a ".csv"
# suffix. df2$site appears both with ".csv" and without it (as bare factor
# labels) in the question, so both sides are reduced to the bare site name
# before matching; otherwise every lookup against the suffix-less form is NA.
# as.character() keeps this working when df2's columns are factors and leaves
# df$vegetation a plain character column.
# local() keeps the intermediate keys out of the workspace.
df$vegetation <- local({
  df_key <- sub("\\.csv$", "", sub("^.*_", "", as.character(df$site)))
  df2_key <- sub("\\.csv$", "", as.character(df2$site))
  # match() returns, for each row of df, the index of the df2 row with the
  # same site name (NA when the site is not present in df2).
  as.character(df2$vegetation)[match(df_key, df2_key)]
})
df
site year sos eos vegetation
1 EPIC_alligatorriver.csv 2016 176 301 deciduous broadleaf
2 EPIC_alligatorriver.csv 2018 164 291 deciduous broadleaf
3 Landsat_alligatorriver.csv 2016 170 303 deciduous broadleaf
4 Pheno_alligatorriver.csv 2017 152 288 deciduous broadleaf
5 Landsat_NEON.BART42.csv 2017 115 290 mixed forest
6 Pheno_NEON.BART42.csv 2017 120 290 mixed forest
Data:
# Example observations: six rows of sensor-prefixed site file names with
# their year, sos and eos values. The vegetation column is already filled
# in here, so it shows the values the lookup is expected to produce.
df <- data.frame(
  site = c(
    "EPIC_alligatorriver.csv", "EPIC_alligatorriver.csv",
    "Landsat_alligatorriver.csv", "Pheno_alligatorriver.csv",
    "Landsat_NEON.BART42.csv", "Pheno_NEON.BART42.csv"
  ),
  year = c(2016L, 2018L, 2016L, 2017L, 2017L, 2017L),
  sos = c(176L, 164L, 170L, 152L, 115L, 120L),
  eos = c(301L, 291L, 303L, 288L, 290L, 290L),
  vegetation = rep(
    c("deciduous broadleaf", "mixed forest"),
    times = c(4L, 2L)
  ),
  stringsAsFactors = FALSE
)
# Lookup table: one row per site file name with its vegetation type.
df2 <- data.frame(
  site = c(
    "alligatorriver.csv", "konza.csv", "merbleue.csv", "NEON.BART42.csv"
  ),
  vegetation = c(
    "deciduous broadleaf", "grassland", "wetland", "mixed forest"
  ),
  stringsAsFactors = FALSE
)