Search code examples
rrbind

Is there a way to fill blank columns by matching terms from one column to another, when the terms are not exact?


I have used rbind to combine two datasets into a 828 x 5 dataframe, called vegetation. Here is an example of my current dataframe:

site                           year    sos    eos    vegetation
EPIC_alligatorriver.csv        2016    176    301
EPIC_alligatorriver.csv        2018    164    291
Landsat_alligatorriver.csv     2016    170    303
Pheno_alligatorriver.csv       2017    152    288
Landsat_NEON.BART42.csv        2017    115    290
Pheno_NEON.BART42.csv          2017    120    290
alligatorriver.csv             NA      NA     NA     deciduous broadleaf
NEON.BART42.csv                NA      NA     NA     mixed forest

Here is what I need

site                           year    sos    eos    vegetation
EPIC_alligatorriver.csv        2016    176    301    deciduous broadleaf
EPIC_alligatorriver.csv        2018    164    291    deciduous broadleaf
Landsat_alligatorriver.csv     2016    170    303    deciduous broadleaf
Pheno_alligatorriver.csv       2017    152    288    deciduous broadleaf
Landsat_NEON.BART42.csv        2017    115    290    mixed forest
Pheno_NEON.BART42.csv          2017    120    290    mixed forest
alligatorriver.csv             NA      NA     NA     deciduous broadleaf
NEON.BART42.csv                NA      NA     NA     mixed forest

Basically, I need to fill in the blank vegetation column by matching the correct vegetation type based on the site name.

I can also remove the rbind and have all of the vegetation data in a completely different two column dataframe

site                     vegetation
alligatorriver.csv       deciduous broadleaf
konza.csv                grassland
merbleue.csv             wetland
NEON.BART42.csv          mixed forest

if that makes it easier. There are 7 vegetation types total, and 99 sites over a three year period (2016, 2017, 2018). Any help is appreciated!

df1 (partial): structure(list(site = c("EPIC_alligatorriver.csv", "EPIC_alligatorriver.csv", 
"EPIC_alligatorriver.csv", "EPIC_arbutuslakeinlet.csv", "EPIC_arbutuslakeinlet.csv", 
"EPIC_arbutuslakeinlet.csv", "EPIC_archboldavir.csv", "EPIC_archboldavir.csv", 
"EPIC_archboldavir.csv", "EPIC_archboldavirx.csv"), year = c(2016L, 2017L, 2018L, 2016L, 2017L, 2018L, 2016L, 
2017L, 2018L, 2016L), sos = c(117, 111, 122, 147, 145, 144, 98, 156, 
114, 98), eos = c(294, 294, 274, 276, 271, 274, 315, 295, 307, 315), vegetation = c("", "", "", "", "", "", "", "", "", "")), row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
df2: structure(list(vegetation = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 6L, 3L, 2L, 4L, 2L, 2L, 3L, 4L, 2L, 3L, 2L, 
1L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 6L, 6L, 4L, 1L, 4L, 1L, 4L, 2L, 
3L, 2L, 4L, 7L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 2L, 4L, 7L, 2L, 
4L, 2L, 2L, 5L, 2L, 4L, 2L, 5L, 2L, 5L, 2L, 6L, 2L, 5L, 3L, 2L, 
2L, 5L, 2L, 4L, 4L, 2L, 4L, 2L, 2L, 2L, 1L, 2L, 2L, 4L, 6L, 2L, 
1L, 4L, 2L, 1L, 3L, 2L, 2L, 4L, 3L, 2L, 3L, 2L, 2L), .Label = c("agriculture", 
"deciduous broadleaf", "evergreen needlefeaf", "grassland", "mixed forest", 
"shrub", "wetland"), class = "factor"), site = structure(1:99, .Label = c("alligatorriver", 
"arbutuslakeinlet", "archboldavir", "archboldavirx", "archboldpnot", 
"archboldpnotx", "arsmnswanlake1", "ashburnham", "bartlettir", 
"bostoncommon", "bullshoals", "burnssagebrush", "canadaOBS", 
"caryinstitute", "cperuvb", "downerwoods", "dukehw", "goodnow", 
"grandteton", "harvard", "harvardbarn", "harvardbarn2", "harvardfarmsouth", 
"harvardhemlock", "harvardhemlock2", "harvardlph", "howland1", 
"howland2", "hubbardbrook", "huyckpreserveny", "jerbajada", "jernort", 
"kansas", "kelloggcorn", "kendall", "kingmanfarm", "konza", "lacclair", 
"laclaflamme", "laurentides", "lethbridge", "lostcreek", "luckyhills", 
"mandanh5", "mandani2", "mead1", "mead2", "mead3", "meadpasture", 
"merbleue", "missouriozarks", "montebondonegrass", "montebondonepeat", 
"morganmonroe", "nationalelkrefuge", "ncssm", "NEON.BART33", 
"NEON.BART42", "NEON.DELA33", "NEON.DSNY33", "NEON.HARV33", "NEON.HARV42", 
"NEON.JERC33", "NEON.JERC42", "NEON.LENO33", "NEON.ONAQ33", "NEON.ORNL33", 
"NEON.ORNL42", "NEON.RMNP33", "NEON.SERC33", "NEON.TREE33", "NEON.TREE42", 
"NEON.UNDE33", "NEON.WOOD33", "ninemileprairie", "northattkeboroma", 
"oakville", "proctor", "queens", "readingma", "rosemountnprs", 
"russellsage", "sanford", "sevilletagrass", "sevilletashrub", 
"shalehillsczo", "southerngreatplains", "stjones", "sweetbriar", 
"sweetbriargrass", "sylvania", "tonzi", "umichbiological2", "usgseros", 
"usmpj", "uwmfieldsta", "warrenwilson", "willowcreek", "worcester"
), class = "factor")), class = "data.frame", row.names = c(NA, 
-99L))

Solution

  • This can be done using match() using the site columns in the two dataframes using sub() to remove the prefix from the first data.frame:

    df$vegetation <- df2$vegetation[match(sub("^.*_", "", df$site), df2$site)]
    
    df
    
                            site year sos eos          vegetation
    1    EPIC_alligatorriver.csv 2016 176 301 deciduous broadleaf
    2    EPIC_alligatorriver.csv 2018 164 291 deciduous broadleaf
    3 Landsat_alligatorriver.csv 2016 170 303 deciduous broadleaf
    4   Pheno_alligatorriver.csv 2017 152 288 deciduous broadleaf
    5    Landsat_NEON.BART42.csv 2017 115 290        mixed forest
    6      Pheno_NEON.BART42.csv 2017 120 290        mixed forest
    

    Data:

    df <- structure(list(site = c("EPIC_alligatorriver.csv", "EPIC_alligatorriver.csv", 
                                  "Landsat_alligatorriver.csv", "Pheno_alligatorriver.csv", "Landsat_NEON.BART42.csv", 
                                  "Pheno_NEON.BART42.csv"), year = c(2016L, 2018L, 2016L, 2017L, 
                                                                     2017L, 2017L), sos = c(176L, 164L, 170L, 152L, 115L, 120L), eos = c(301L, 
                                                                                                                                         291L, 303L, 288L, 290L, 290L), vegetation = c("deciduous broadleaf", 
                                                                                                                                                                                       "deciduous broadleaf", "deciduous broadleaf", "deciduous broadleaf", 
                                                                                                                                                                                       "mixed forest", "mixed forest")), row.names = c(NA, -6L), class = "data.frame")
    
    df2 <- structure(list(site = c("alligatorriver.csv", "konza.csv", "merbleue.csv", 
                                   "NEON.BART42.csv"), vegetation = c("deciduous broadleaf", "grassland", 
                                                                      "wetland", "mixed forest")), class = "data.frame", row.names = c(NA, 
                                                                                                                                       -4L))