Search code examples
rr-factor

Best way to make factor levels uniform over a number of data frames


I have a number of data.frames that each have a factor. I want to make sure that they all use the same levels. What is the proper way to do this?

In the code below you'll see that I reassign the factor in each data frame, using the levels of the overall set, with a small convenience function, changeFactor. I would expect that there is a better way to do this, though.

# Three reproducible pools of values, each rounded to two decimal places.
set.seed(1234)
b <- round(runif(n = 100, min = 1, max = 10), digits = 2)
set.seed(2345)
b2 <- round(runif(n = 100, min = 11, max = 20), digits = 2)
set.seed(3456)
b3 <- round(runif(n = 50, min = 15, max = 18), digits = 2)

# Every value that can occur in any pool, used as the shared level set.
bt <- factor(sort(c(b, b2, b3)))
lvls <- levels(bt)

# One frequency table per sample, each tagged with its group label `p`.
# The sample() calls stay in the order t1..t6 so the random draws are unchanged.
t1 <- cbind(as.data.frame(table(sample(b, 5))), p = "A")
t2 <- cbind(as.data.frame(table(sample(b, 1))), p = "B")
t3 <- cbind(as.data.frame(table(sample(b, 1))), p = "C")
t4 <- cbind(as.data.frame(table(sample(b, 8))), p = "D")
t5 <- cbind(as.data.frame(table(sample(b2, 20))), p = "E")
t6 <- cbind(as.data.frame(table(sample(b3, 18))), p = "F")

# ggplot2 must be attached before ggplot() can be called.
library(ggplot2)

# Each table still carries its own Var1 levels here, so the combined factor
# is not in numeric order. No empty data frame is needed to start rbind().
d <- rbind(t2, t3, t6, t4, t5, t1)

#.. out of order bins
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("out of order bins")

# Re-express `t` as a factor over the shared level set `lvls`.
#
# t    : factor (or character/numeric vector) holding numeric labels.
# lvls : character vector of all permitted levels, in the desired order.
#
# Returns a factor with exactly the levels `lvls`. Values that are not in
# `lvls` become NA, and a warning reports how many were lost.
changeFactor <- function(t, lvls) {
  # Round-trip through numeric so labels are normalised (e.g. "2.50" -> "2.5")
  # before factor() matches them against `lvls`.
  temp <- as.numeric(as.character(t))
  out <- factor(temp, levels = lvls)

  # Values that were present in the input but vanished during re-levelling.
  lost <- is.na(out) & !is.na(t)
  if (any(lost)) {
    warning(sum(lost), " value(s) not found in 'lvls' were converted to NA",
            call. = FALSE)
  }
  out
}

# Re-level Var1 in every table against the shared level set.
t1$Var1 <- changeFactor(t1$Var1, lvls)
t2$Var1 <- changeFactor(t2$Var1, lvls)
t3$Var1 <- changeFactor(t3$Var1, lvls)
t4$Var1 <- changeFactor(t4$Var1, lvls)
t5$Var1 <- changeFactor(t5$Var1, lvls)
t6$Var1 <- changeFactor(t6$Var1, lvls)

# All tables now share the same levels; no empty data frame is needed to
# start rbind().
d <- rbind(t2, t3, t6, t4, t5, t1)

#.. in order bins
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("in order bins")

Solution

  • short answer: keep your data in lists and learn the *pply family

    # Seeded pools of two-decimal values that the samples are drawn from.
    set.seed(1234)
    b <- round(runif(n = 100, min = 1, max = 10), digits = 2)
    set.seed(2345)
    b2 <- round(runif(n = 100, min = 11, max = 20), digits = 2)
    set.seed(3456)
    b3 <- round(runif(n = 50, min = 15, max = 18), digits = 2)

    # Shared level set: every value from all three pools.
    bt <- factor(sort(c(b, b2, b3)))
    lvls <- levels(bt)
    
    # Keep the label column `p` as character rather than factor.
    options(stringsAsFactors = FALSE)

    # Draw `y` values from `x` without replacement, tabulate them, and tag
    # every row with the label `z`.
    # Returns a data frame with columns Var1 (factor), Freq and p.
    f <- function(x, y, z) {
      # Index with sample.int() instead of calling sample(x, y): for a single
      # number x >= 1, sample() would draw from 1:x rather than from x itself.
      # For longer x the draws are the same. The unnamed expression keeps the
      # tabulated column called Var1.
      cbind(data.frame(table(x[sample.int(length(x), y)])), p = z)
    }
    
    # Call f() once per (pool, sample size, label) triple.
    pools <- list(b, b, b, b, b2, b3)
    sizes <- c(5, 1, 1, 8, 20, 18)
    datl <- Map(f, pools, sizes, LETTERS[1:6])
    
    # Re-express `t` as a factor over the shared level set `lvls`.
    #
    # t    : factor (or character/numeric vector) holding numeric labels.
    # lvls : character vector of all permitted levels, in the desired order.
    #
    # Returns a factor with exactly the levels `lvls`. Values that are not in
    # `lvls` become NA, and a warning reports how many were lost.
    changeFactor <- function(t, lvls) {
      # Round-trip through numeric so labels are normalised
      # (e.g. "2.50" -> "2.5") before factor() matches them against `lvls`.
      temp <- as.numeric(as.character(t))
      out <- factor(temp, levels = lvls)

      # Values that were present in the input but vanished during re-levelling.
      lost <- is.na(out) & !is.na(t)
      if (any(lost)) {
        warning(sum(lost), " value(s) not found in 'lvls' were converted to NA",
                call. = FALSE)
      }
      out
    }
    
    # Re-level only Var1 in each data frame. Targeting the column by name
    # means the result does not depend on which other columns are factors.
    datl <- lapply(datl, function(df) {
      df$Var1 <- changeFactor(df$Var1, lvls)
      df
    })

    # Stack the tables in plotting order.
    d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])
    
    #.. in order bins
    # ggplot2 must be attached before ggplot() can be called.
    library(ggplot2)
    ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
      geom_bar(aes(weight = Freq)) +
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      facet_grid(p ~ ., margins = TRUE) +
      ggtitle("in order bins")
    

    enter image description here

    long answer:

    # Same seeded pools as in the question, rounded to two decimals.
    set.seed(1234)
    b <- round(runif(n = 100, min = 1, max = 10), digits = 2)
    set.seed(2345)
    b2 <- round(runif(n = 100, min = 11, max = 20), digits = 2)
    set.seed(3456)
    b3 <- round(runif(n = 50, min = 15, max = 18), digits = 2)

    # All values from the three pools form the common level set.
    bt <- factor(sort(c(b, b2, b3)))
    lvls <- levels(bt)
    

    First, I don't want any unexpected factors popping up, so I set stringsAsFactors = FALSE. Then write a function, f, to do what you want, and check that it works:

    # Keep the label column `p` as character rather than factor.
    options(stringsAsFactors = FALSE)

    # Draw `y` values from `x` without replacement, tabulate them, and tag
    # every row with the label `z`.
    # Returns a data frame with columns Var1 (factor), Freq and p.
    f <- function(x, y, z) {
      # Index with sample.int() instead of calling sample(x, y): for a single
      # number x >= 1, sample() would draw from 1:x rather than from x itself.
      # For longer x the draws are the same. The unnamed expression keeps the
      # tabulated column called Var1.
      cbind(data.frame(table(x[sample.int(length(x), y)])), p = z)
    }
    
    # Spot-check f() on a single sample of five values from `b`.
    f(x = b, y = 5, z = "A")
    
    #   Var1 Freq p
    # 1 1.13    1 A
    # 2 1.46    1 A
    # 3 2.09    1 A
    # 4  2.5    1 A
    # 5 7.02    1 A
    

    That seems to work, so just Map it over lists of arguments and check the output:

    # Call f() once per (pool, sample size, label) triple.
    datl <- Map(f, list(b, b, b, b, b2, b3), c(5, 1, 1, 8, 20, 18), LETTERS[1:6])
    # Inspect the structure of the resulting list of data frames.
    str(datl)
    
    # List of 6
    # $ :'data.frame':  5 obs. of  3 variables:
    #   ..$ Var1: Factor w/ 5 levels "2.02","3.09",..: 1 2 3 4 5
    #   ..$ Freq: int [1:5] 1 1 1 1 1
    #   ..$ p   : chr [1:5] "A" "A" "A" "A" ...
    # $ :'data.frame':  1 obs. of  3 variables:
    #   ..$ Var1: Factor w/ 1 level "1.63": 1
    #   ..$ Freq: int 1
    #   ..$ p   : chr "B"
    

    so combine everything to use with ggplot

    # Stack the tables in plotting order. Var1 has not been re-levelled yet,
    # so each table still carries its own levels.
    d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])

    library(ggplot2)
    #.. out of order bins
    ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
      geom_bar(aes(weight = Freq)) +
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      facet_grid(p ~ ., margins = TRUE) +
      ggtitle("out of order bins")
    

    enter image description here

    # Re-express `t` as a factor over the shared level set `lvls`.
    #
    # t    : factor (or character/numeric vector) holding numeric labels.
    # lvls : character vector of all permitted levels, in the desired order.
    #
    # Returns a factor with exactly the levels `lvls`. Values that are not in
    # `lvls` become NA, and a warning reports how many were lost.
    changeFactor <- function(t, lvls) {
      # Round-trip through numeric so labels are normalised
      # (e.g. "2.50" -> "2.5") before factor() matches them against `lvls`.
      temp <- as.numeric(as.character(t))
      out <- factor(temp, levels = lvls)

      # Values that were present in the input but vanished during re-levelling.
      lost <- is.na(out) & !is.na(t)
      if (any(lost)) {
        warning(sum(lost), " value(s) not found in 'lvls' were converted to NA",
                call. = FALSE)
      }
      out
    }
    

    again making sure the function does what it is supposed to do on one data frame

    # Spot-check the re-levelling on the first data frame only.
    changeFactor(datl[[1]][["Var1"]], lvls)
    
    # [1] 2.02 3.09 3.79 3.89 8.3 
    # 234 Levels: 1.09 1.12 1.13 1.24 1.36 1.38 1.41 1.46 1.63 1.66 1.81 1.95 ... 19.86
    

    so apply it again to them all at once and check the output

    # Re-level only Var1 in each data frame. Targeting the column by name
    # means the result does not depend on which other columns are factors.
    datl <- lapply(datl, function(df) {
      df$Var1 <- changeFactor(df$Var1, lvls)
      df
    })
    str(datl)
    # List of 6
    # $ :'data.frame':  5 obs. of  3 variables:
    #   ..$ Var1: Factor w/ 234 levels "1.09","1.12",..: 13 28 41 45 81
    #   ..$ Freq: int [1:5] 1 1 1 1 1
    #   ..$ p   : chr [1:5] "A" "A" "A" "A" ...
    # $ :'data.frame':  1 obs. of  3 variables:
    #   ..$ Var1: Factor w/ 234 levels "1.09","1.12",..: 9
    #   ..$ Freq: int 1
    #   ..$ p   : chr "B"
    # ...
    

    combine again and plot

    # Stack the re-levelled tables in plotting order.
    d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])

    #.. in order bins
    ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
      geom_bar(aes(weight = Freq)) +
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      facet_grid(p ~ ., margins = TRUE) +
      ggtitle("in order bins")
    

    enter image description here