Search code examples
Tags: r, for-loop, names

Conditionally replace column names in a dataframe based on values in another dataframe


I have downloaded a table of stream diversion data ("df_downloaded"). The column names of this table are primarily taken from the ID numbers of the gauging stations.

I want to conditionally replace the ID numbers that have been used for column names with text for the station names, which will help make the data more readable when I'm sharing the results. I created a table ("stationIDs") with the ID numbers and station names to use as a reference for changing the column names of "df_downloaded".

I can replace the column names individually, but I want to write a loop of some kind that will address all of the columns of "df_downloaded" and change the names of the columns referenced in the dataframe "stationIDs".

An example of what I'm trying to do is below.

Downloaded Data ("df_downloaded")

A portion of the downloaded data is similar to this:

# Example of the downloaded diversion table: one monthly date column followed
# by seven numeric columns, 12 rows in total. The sample() columns are random,
# so their values differ from run to run.
df_downloaded <- data.frame(
  Var1 = seq(as.Date("2012-01-01"), as.Date("2012-12-01"), by = "month"),
  Var2 = sample(50:150, 12, replace = TRUE),
  Var3 = sample(10:100, 12, replace = TRUE),
  Var4 = sample(15:45, 12, replace = TRUE),
  Var5 = sample(50:200, 12, replace = TRUE),
  Var6 = sample(15:100, 12, replace = TRUE),
  Var7 = c(rep(0, 3), rep(13, 6), rep(0, 3)),
  Var8 = rep(5, 12)
)
# The station IDs are not syntactic R names, so the real headers are assigned
# after construction instead of being used as data.frame() argument names.
colnames(df_downloaded) <- c(
  "Diversion.Date", "360410059", "360410060", "360410209",
  "361000655", "361000656", "Irrigation", "Seep"
)

df_downloaded # not run
# 
#    Diversion.Date 360410059 360410060 360410209 361000655 361000656 Irrigation Seep
# 1      2012-01-01        93        57        28       101        16          0    5
# 2      2012-02-01       102        68        19       124        98          0    5
# 3      2012-03-01       124        93        36       109        56          0    5
# 4      2012-04-01        94        96        23        54        87         13    5
# 5      2012-05-01        83        70        43       119        15         13    5
# 6      2012-06-01        78        63        45       195        15         13    5
# 7      2012-07-01        86        77        20       130        63         13    5
# 8      2012-08-01       118        29        27       118        57         13    5
# 9      2012-09-01       142        18        45       116        27         13    5
# 10     2012-10-01        74        68        34       182        79          0    5
# 11     2012-11-01       106        48        27        95        74          0    5
# 12     2012-12-01        91        41        20       179        55          0    5

Reference Table ("stationIDs")

# Lookup table pairing each gauging-station ID (as used in the column headers
# of the downloaded data) with a readable station name.
# stringsAsFactors = FALSE keeps both columns as character on every R version;
# before R 4.0 the default would have turned them into factors.
stationIDs <- data.frame(
  ID = c("360410059", "360410060", "360410209", "361000655", "361000656"),
  Names = c("RimView", "IPCO", "WMA.Ditch", "RV.Bypass", "LowerFalls"),
  stringsAsFactors = FALSE
)
stationIDs # not run
#
#          ID      Names
# 1 360410059    RimView
# 2 360410060       IPCO
# 3 360410209  WMA.Ditch
# 4 361000655  RV.Bypass
# 5 361000656 LowerFalls

I can replace the column names in "df_downloaded" using individual statements. I show the first three iterations below.
After three iterations "RimView", "IPCO", and "WMA.Ditch" have replaced their respective gauge ID numbers.

# Rename the first station column. `Names` is written out in full so the lookup
# does not depend on `$` partial matching of the column name.
names(df_downloaded) <- gsub(stationIDs$ID[1], stationIDs$Names[1], names(df_downloaded))

# head(df_downloaded)
#   Diversion.Date RimView 360410060 360410209 361000655 361000656 Irrigation Seep
# 1     2012-01-01      93        57        28       101        16          0    5
# 2     2012-02-01     102        68        19       124        98          0    5
# 3     2012-03-01     124        93        36       109        56          0    5
# 4     2012-04-01      94        96        23        54        87         13    5
# 5     2012-05-01      83        70        43       119        15         13    5
# 6     2012-06-01      78        63        45       195        15         13    5

# Rename the second station column. `Names` is written out in full so the lookup
# does not depend on `$` partial matching of the column name.
names(df_downloaded) <- gsub(stationIDs$ID[2], stationIDs$Names[2], names(df_downloaded))

# head(df_downloaded)
#   Diversion.Date RimView IPCO 360410209 361000655 361000656 Irrigation Seep
# 1     2012-01-01      93   57        28       101        16          0    5
# 2     2012-02-01     102   68        19       124        98          0    5
# 3     2012-03-01     124   93        36       109        56          0    5
# 4     2012-04-01      94   96        23        54        87         13    5
# 5     2012-05-01      83   70        43       119        15         13    5
# 6     2012-06-01      78   63        45       195        15         13    5

# Rename the third station column. `Names` is written out in full so the lookup
# does not depend on `$` partial matching of the column name.
names(df_downloaded) <- gsub(stationIDs$ID[3], stationIDs$Names[3], names(df_downloaded))

# head(df_downloaded)
#   Diversion.Date RimView IPCO WMA.Ditch 361000655 361000656 Irrigation Seep
# 1     2012-01-01      93   57        28       101        16          0    5
# 2     2012-02-01     102   68        19       124        98          0    5
# 3     2012-03-01     124   93        36       109        56          0    5
# 4     2012-04-01      94   96        23        54        87         13    5
# 5     2012-05-01      83   70        43       119        15         13    5
# 6     2012-06-01      78   63        45       195        15         13    5

If I try to do the renaming using a for loop, I end up with NAs for column names.

# Attempt to rename every station column in a single loop.
# NOTE: the index runs over the column names of df_downloaded (8 of them), but
# stationIDs has only 5 rows. For i > 5, stationIDs$ID[i] is NA, and gsub()
# with an NA pattern returns NA for every element, so all names become NA.
# `stationIDs$Name` reaches the `Names` column through `$` partial matching.
for(i in seq_along(names(df_downloaded))){
    names(df_downloaded) <- gsub(stationIDs$ID[i],stationIDs$Name[i],names(df_downloaded))
}

# head(df_downloaded)
#           NA  NA NA NA  NA NA NA NA
# 1 2012-01-01  93 57 28 101 16  0  5
# 2 2012-02-01 102 68 19 124 98  0  5
# 3 2012-03-01 124 93 36 109 56  0  5
# 4 2012-04-01  94 96 23  54 87 13  5
# 5 2012-05-01  83 70 43 119 15 13  5
# 6 2012-06-01  78 63 45 195 15 13  5

I really want to be able to change the names with a for loop or something similar, because the number of stations that I download data from changes depending on the years that I am analyzing.

Thanks for taking time to look at my question.


Solution

  • We can use match

    # stationIDs may hold factors (the default before R 4.0); make every column character
    stationIDs[] <- lapply(stationIDs, as.character)
    # Position of each column name within stationIDs$ID; NA when the name is not a station ID
    inds <- match(names(df_downloaded), stationIDs$ID)
    # Rename only the columns that matched an ID; every other column keeps its name
    is_station <- !is.na(inds)
    names(df_downloaded)[is_station] <- stationIDs$Names[inds[is_station]]

    df_downloaded
    #   Diversion.Date RimView IPCO WMA.Ditch RV.Bypass LowerFalls Irrigation Seep
    #1      2012-01-01     142   14        41       200         79          0    5
    #2      2012-02-01      97  100        35       176         22          0    5
    #3      2012-03-01      85   59        26        88         71          0    5
    #4      2012-04-01      68   49        34        63         15         13    5
    #5      2012-05-01      62   58        44        87         16         13    5
    #6      2012-06-01      70   59        33       145         87         13    5
    #7      2012-07-01     112   65        25        52         64         13    5
    #8      2012-08-01      75   12        27       103         19         13    5
    #9      2012-09-01      73   65        36       172         68         13    5
    #10     2012-10-01      87   35        27       146         42          0    5
    #11     2012-11-01     122   17        33       183         32          0    5
    #12     2012-12-01     108   65        15       120         99          0    5