Search code examples
rsurveyweightedsample-data

R survey data add weights


I have survey data with weights already assigned. Here is a toy sample. If I don't weight the data, it will not be an accurate representation of the population. Is there a simple way to factor in the weights? Given that the weight is how many people in population X are represented by a given person in my sample, do I multiply obs * weight? I checked on youtube but I don't have strata data referred to in the video, my data are all individuals and did not have any luck searching elsewhere.

Thank you in advance for any help

ID <-  c(1,2,3,4,5,6,7,8,9,10,11,12,
     13,14,15,16,17,18,19,20,21,22,23,24,
     25,26,27,28,29,30,31,32,33,34,35,36)
year <- c(1980,1980,1980,1981,1982,1982,1980,1980,1981,1981,1982,1982,
      1980,1980,1980,1981,1982,1982,1980,1980,1981,1981,1982,1982,
      1980,1980,1980,1981,1982,1982,1980,1980,1981,1981,1982,1982)
city <- c("NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY",
      "NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY",
      "CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA","CA")
district <- c(1,2,1,2,1,2,1,2,1,2,1,2,
          1,2,1,2,1,2,1,2,1,2,1,2,
          1,2,1,2,1,2,1,2,1,2,1,2)
weight <- c(100,17,25,1,100,52,10,5,90,10,10,15,
        13,1,25,1,6,52,10,5,90,7,10,15,
        1,2,3,4,5,6,10,20,3,40,50,6)

df <- data.frame(ID,year,city,district,weight)

df$year <- as.factor(df$year)
df$district <- as.factor(df$district)

print(df)


# aggregate count by year, city and district
a <- aggregate(ID ~ year + city + district, 
            data = df,  
            FUN = length) 

colnames(a)[colnames(a) == 'ID'] <- 'numObs'

# add new Total column, populate total by year
a[a$year == 1980, "TotalforYear"] <-  with(a, sum(numObs[year == 1980]))
a[a$year == 1981, "TotalforYear"] <-  with(a, sum(numObs[year == 1981]))  
a[a$year == 1982, "TotalforYear"] <-  with(a, sum(numObs[year == 1982]))

# add new column called share and initialize it
a[ , 'share'] <- 0
a$share = (a$numObs/a$TotalforYear)

# add group column
a[a$district == 1 & a$city == 'NY', "group"] <- 'NY district1'
a[a$district == 2 & a$city == 'NY', "group"] <- 'NY district2'
a[a$district == 1 & a$city == 'CA', "group"] <- 'CA district1'
a[a$district == 2 & a$city == 'CA', "group"] <- 'CA district2'

a

g1 <- ggplot(a, aes(x=year, y=share, group=group)) +
  geom_line(aes(linetype=group), color = 'red', size = 0.8) +
  geom_point(aes(shape=group), size = 2) +
  theme_classic() +  
  theme(legend.position="bottom",
    panel.grid.major.x = element_line( linewidth=.1, color="grey93" ), 
    panel.grid.major.y = element_line( linewidth=.1, color="grey93" ) )       +
  ylab("Share") +
  scale_linetype_manual(values=c("solid", "dotted", 'dashed', 'longdash')) +
  theme(legend.title=element_blank()) 
g1

Solution

  • Edited to reflect the changes to the question.

    In Stata, you would use "pweight" or "fweight" depending on your analysis needs, and the principle in R is similar: to adjust the data by the weights to reflect the sample's representation in the population.

    In your case you have frequency weights, and you can multiply each observation by its weight to reflect the population count in your aggregation, and then aggregate the weighted counts by the desired variables.

    From the possible partitions (or groups), you can take year, city, and favFood, summing up the weights. The result gives you a representation of the population’s preferences rather than just the sample and one way to do it is:

    library(ggplot2)
    
    ID <- c(
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
      25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36
    )
    year <- c(
      1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
      1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
      1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982
    )
    city <- c(
      "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
      "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
      "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA"
    )
    district <- c(
      1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
      1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
      1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
    )
    weight <- c(
      100, 17, 25, 1, 100, 52, 10, 5, 90, 10, 10, 15,
      13, 1, 25, 1, 6, 52, 10, 5, 90, 7, 10, 15,
      1, 2, 3, 4, 5, 6, 10, 20, 3, 40, 50, 6
    )
    
    df <- data.frame(ID, year, city, district, weight)
    
    df$year <- as.factor(df$year)
    df$district <- as.factor(df$district)
    
    a <- aggregate(weight ~ year + city + district, 
                   data = df, 
                   FUN = sum)
    
    colnames(a)[colnames(a) == 'weight'] <- 'weightedCount'
    
    a[a$year == 1980, "TotalWeighted"] <- with(a, sum(weightedCount[year == 1980]))
    a[a$year == 1981, "TotalWeighted"] <- with(a, sum(weightedCount[year == 1981]))
    a[a$year == 1982, "TotalWeighted"] <- with(a, sum(weightedCount[year == 1982]))
    
    a$share <- a$weightedCount / a$TotalWeighted
    
    a[a$district == 1 & a$city == 'NY', "group"] <- 'NY district1'
    a[a$district == 2 & a$city == 'NY', "group"] <- 'NY district2'
    a[a$district == 1 & a$city == 'CA', "group"] <- 'CA district1'
    a[a$district == 2 & a$city == 'CA', "group"] <- 'CA district2'
    
    # g2: g1 with modifications to include weights
    
    g2 <- ggplot(a, aes(x=year, y=share, group=group)) +
      geom_line(aes(linetype=group), color = 'blue', linewidth = 0.8) +
      geom_point(aes(shape=group), size = 2) +
      theme_classic() +  
      theme(legend.position="bottom",
            panel.grid.major.x = element_line(linewidth=.1, color="grey93"), 
            panel.grid.major.y = element_line(linewidth=.1, color="grey93")) +
      ylab("Weighted Share") +
      scale_linetype_manual(values=c("solid", "dotted", 'dashed', 'longdash')) +
      theme(legend.title=element_blank()) 
    
    g2
    

    I prefer to use the Tidyverse, and that would be

    library(ggplot2)
    library(dplyr)
    
    df <- tibble(
      ID = c(
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
        25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36
      ),
      year = c(
        1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
        1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982,
        1980, 1980, 1980, 1981, 1982, 1982, 1980, 1980, 1981, 1981, 1982, 1982
      ),
      city = c(
        "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
        "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY", "NY",
        "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA"
      ),
      district = c(
        1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
        1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
        1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
      ),
      weight = c(
        100, 17, 25, 1, 100, 52, 10, 5, 90, 10, 10, 15,
        13, 1, 25, 1, 6, 52, 10, 5, 90, 7, 10, 15,
        1, 2, 3, 4, 5, 6, 10, 20, 3, 40, 50, 6
      )
    )
    
    df <- df %>%
      mutate(
        year = as.factor(year),
        district = as.factor(district)
      )
    
    a <- df %>%
      group_by(year, city, district) %>%
      summarise(weightedCount = sum(weight), .groups = "drop") %>%
      mutate(
        TotalWeighted = sum(weightedCount, na.rm = TRUE),
        share = weightedCount / TotalWeighted,
        group = case_when(
          district == 1 & city == "NY" ~ "NY district1",
          district == 2 & city == "NY" ~ "NY district2",
          district == 1 & city == "CA" ~ "CA district1",
          district == 2 & city == "CA" ~ "CA district2"
        )
      )
    
    a
    
    # g2: g1 with modifications to include weights
    
    g2 <- ggplot(a, aes(x=year, y=share, group=group)) +
      geom_line(aes(linetype=group), color = 'blue', linewidth = 0.8) +
      geom_point(aes(shape=group), size = 2) +
      theme_classic() +  
      theme(legend.position="bottom",
            panel.grid.major.x = element_line(linewidth=.1, color="grey93"), 
            panel.grid.major.y = element_line(linewidth=.1, color="grey93")) +
      ylab("Weighted Share") +
      scale_linetype_manual(values=c("solid", "dotted", 'dashed', 'longdash')) +
      theme(legend.title=element_blank()) 
    
    g2
    

    enter image description here