Search code examples
rsubsetlong-format-data

How do I improve this syntax for creating a subset with factors?


EDIT: A moderator has flagged this this question as very similar to mine. As a neophyte, I do not see these as similar. This reflects the discrepancy between my understanding of R and the moderator's. They see the obvious connection because they are experts. I do not, as I am still learning R. I suspect that other neophytes might benefit from my question as it is given: even if trivial, it is explicitly framed.

I am attempting to subset my data.table in long format, named tmp:

tmp <- structure(list(Year = c(1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
2006, 2011, 2016, 2021), variable = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L), .Label = c("FORT ERIE", "GRIMSBY", 
"LINCOLN", "NIAGARA FALLS", "NIAGARA-ON-THE-LAKE", "PELHAM", 
"PORT COLBORNE", "ST. CATHARINES", "THOROLD", "WAINFLEET", "WELLAND", 
"WEST LINCOLN"), class = "factor"), value = c(23113L, 24030L, 
24096L, 23253L, 26006L, 27183L, 28143L, 29925L, 29960L, 30710L, 
32901L, 15770L, 15565L, 15797L, 16956L, 18520L, 19585L, 21297L, 
23937L, 25325L, 27314L, 28883L, 14247L, 14460L, 14196L, 14391L, 
17149L, 18801L, 20612L, 21722L, 22487L, 23787L, 25719L, 67163L, 
69420L, 70960L, 72107L, 75399L, 76917L, 78815L, 82184L, 82997L, 
88071L, 94415L, 12552L, 12485L, 12186L, 12494L, 12945L, 13238L, 
13839L, 14587L, 15400L, 17511L, 19090L, 9997L, 10070L, 11104L, 
12137L, 13328L, 14343L, 15272L, 16155L, 16598L, 17110L, 18192L, 
21420L, 20535L, 19225L, 18281L, 18766L, 18451L, 18450L, 18599L, 
18424L, 18306L, 20033L, 109722L, 123350L, 124018L, 123455L, 129300L, 
130926L, 129170L, 131989L, 131400L, 133113L, 136803L, 15065L, 
14945L, 15412L, 16131L, 17542L, 17883L, 18048L, 18224L, 17931L, 
18801L, 23816L, 5486L, 6065L, 6000L, 5955L, 6203L, 6253L, 6258L, 
6601L, 6356L, 6372L, 6887L, 44397L, 45050L, 45448L, 45054L, 47914L, 
48411L, 48402L, 50331L, 50631L, 52293L, 55750L, 8396L, 9460L, 
9846L, 9918L, 10864L, 11513L, 12268L, 13167L, 13837L, 14500L, 
15454L)), row.names = c(NA, -132L), class = c("data.table", "data.frame"
)

This data.table contains 132 rows and three columns: Year, variable and value. Year is the year data were collected. Variable is the name of the location. Value is the total population. There are 11 rows per location.

I wish to create a subset that excludes the value for NIAGARA FALLS, ST. CATHARINES and WELLAND. This works:

tmpsmall9 <- subset(tmp, variable != 'NIAGARA FALLS') # subset excludes NF
tmpsmall9 <- subset(tmpsmall9, variable != 'ST. CATHARINES') # subset excludes ST. KITTY
tmpsmall9 <- subset(tmpsmall9, variable != "WELLAND") # subset excludes Welland
View(tmpsmall9)

This iterative solution eliminates 33 observations (i.e., rows) associated with those three factors in variable (i.e., Niagara Falls, St. Catharines, Welland), leaving 99 rows. However, I thought there must be a more efficient solution. So, I tried this:

tmpsmall9 <- subset(tmp, variable != c('NIAGARA FALLS','ST. CATHARINES','WELLAND'))
View(tmpsmall9)

This does not perform as expected. It leaves 121 rows. Niagara Falls still has 7 rows, St. Catharines 8, and Welland 7. The other factors (i.e., variable) have all their rows.

What is a more efficient and accurate way to subset factors than my iterative solution? Can my second solution be revised to work? (If not, can someone explain why this second subset() syntax only removes some of the observations for those three factors, and not all of them? Is it that logical operators can only handle one factor at a time?)


Solution

  • One option is to negate the %in% operator, e.g.

    library(data.table)
    
    tmp <- structure(list(Year = c(1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021, 1971, 1976, 1981, 1986, 1991, 1996, 2001, 
                                   2006, 2011, 2016, 2021), variable = structure(c(1L, 1L, 1L, 1L, 
                                                                                   1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                                                                   2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
                                                                                   4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
                                                                                   5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 
                                                                                   7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
                                                                                   8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 
                                                                                   10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 
                                                                                   11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 
                                                                                   12L, 12L, 12L, 12L, 12L, 12L), .Label = c("FORT ERIE", "GRIMSBY", 
                                                                                                                             "LINCOLN", "NIAGARA FALLS", "NIAGARA-ON-THE-LAKE", "PELHAM", 
                                                                                                                             "PORT COLBORNE", "ST. CATHARINES", "THOROLD", "WAINFLEET", "WELLAND", 
                                                                                                                             "WEST LINCOLN"), class = "factor"), value = c(23113L, 24030L, 
                                                                                                                                                                           24096L, 23253L, 26006L, 27183L, 28143L, 29925L, 29960L, 30710L, 
                                                                                                                                                                           32901L, 15770L, 15565L, 15797L, 16956L, 18520L, 19585L, 21297L, 
                                                                                                                                                                           23937L, 25325L, 27314L, 28883L, 14247L, 14460L, 14196L, 14391L, 
                                                                                                                                                                           17149L, 18801L, 20612L, 21722L, 22487L, 23787L, 25719L, 67163L, 
                                                                                                                                                                           69420L, 70960L, 72107L, 75399L, 76917L, 78815L, 82184L, 82997L, 
                                                                                                                                                                           88071L, 94415L, 12552L, 12485L, 12186L, 12494L, 12945L, 13238L, 
                                                                                                                                                                           13839L, 14587L, 15400L, 17511L, 19090L, 9997L, 10070L, 11104L, 
                                                                                                                                                                           12137L, 13328L, 14343L, 15272L, 16155L, 16598L, 17110L, 18192L, 
                                                                                                                                                                           21420L, 20535L, 19225L, 18281L, 18766L, 18451L, 18450L, 18599L, 
                                                                                                                                                                           18424L, 18306L, 20033L, 109722L, 123350L, 124018L, 123455L, 129300L, 
                                                                                                                                                                           130926L, 129170L, 131989L, 131400L, 133113L, 136803L, 15065L, 
                                                                                                                                                                           14945L, 15412L, 16131L, 17542L, 17883L, 18048L, 18224L, 17931L, 
                                                                                                                                                                           18801L, 23816L, 5486L, 6065L, 6000L, 5955L, 6203L, 6253L, 6258L, 
                                                                                                                                                                           6601L, 6356L, 6372L, 6887L, 44397L, 45050L, 45448L, 45054L, 47914L, 
                                                                                                                                                                           48411L, 48402L, 50331L, 50631L, 52293L, 55750L, 8396L, 9460L, 
                                                                                                                                                                           9846L, 9918L, 10864L, 11513L, 12268L, 13167L, 13837L, 14500L, 
                                                                                                                                                                           15454L)), row.names = c(NA, -132L), class = c("data.table", "data.frame"))
    
    tmpsmall9 <- subset(tmp, variable != 'NIAGARA FALLS') # subset excludes NF
    tmpsmall9 <- subset(tmpsmall9, variable != 'ST. CATHARINES') # subset excludes ST. KITTY
    tmpsmall9 <- subset(tmpsmall9, variable != "WELLAND") # subset excludes Welland
    
    tmpsmall9_ver2 <- tmp[!(variable %in% c('NIAGARA FALLS','ST. CATHARINES','WELLAND'))]
    all.equal(tmpsmall9, tmpsmall9_ver2)
    #> [1] TRUE
    

    Created on 2023-05-05 with reprex v2.0.2