Search code examples
rdataframerow-number

add a column to a dataframe in r based on unique values row count


I have a consolidated table as given below:

> dput(data.frame(df))
structure(list(make = c("audi", "audi", "audi", "audi", "bmw", 
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda", 
"honda"), model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series", 
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic", 
"civic"), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L", "1.8L TFSI Quattro", 
"320d", "320d", "4.2L VX AT", "2.4L LE MT", "2.4L LE MT", "2.3L VTI AT", 
"2.3L VTI S", "1.8L SPORT", "1.8L V"), from_year = c(2014, 2008, 
2004, 2011, 2012, 2015, 1998, 2001, 2006, 2001, 2001, 2009, 2006
), to_year = c(2020, 2012, 2008, 2016, 2015, 2020, 2003, 2006, 
2011, 2003, 2003, 2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13)), class = "data.frame", row.names = c(NA, 
-13L))

dataframe structure:

> df
     make        model           variant from_year to_year id
1    audi           A3         1.4L TFSI      2014    2020  1
2    audi           A3         1.6L TFSI      2008    2012  2
3    audi           A4              1.6L      2004    2008  3
4    audi           A4 1.8L TFSI Quattro      2011    2016  4
5     bmw     3 Series              320d      2012    2015  5
6     bmw     3 Series              320d      2015    2020  6
7  toyota Land Cruiser        4.2L VX AT      1998    2003  7
8  toyota        Camry        2.4L LE MT      2001    2006  8
9  toyota        Camry        2.4L LE MT      2006    2011  9
10  honda       Accord       2.3L VTI AT      2001    2003 10
11  honda       Accord        2.3L VTI S      2001    2003 11
12  honda        civic        1.8L SPORT      2009    2012 12
13  honda        civic            1.8L V      2006    2009 13

I need to normalize the table based on make, model and variant, with primary key for each table

Expected Output:

> dput(df1)
structure(list(make = c("audi", "audi", "audi", "audi", "bmw", 
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda", 
"honda"), make_id = c(1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4), 
    model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series", 
    "Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic", 
    "civic"), vehicle_model_id = c(1, 1, 2, 2, 3, 3, 4, 5, 5, 
    6, 6, 7, 7), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L", 
    "1.8L TFSI Quattro", "320d", "320d", "4.2L VX AT", "2.4L LE MT", 
    "2.4L LE MT", "2.3L VTI AT", "2.3L VTI S", "1.8L SPORT", 
    "1.8L V"), from_year = c(2014, 2008, 2004, 2011, 2012, 2015, 
    1998, 2001, 2006, 2001, 2001, 2009, 2006), to_year = c(2020, 
    2012, 2008, 2016, 2015, 2020, 2003, 2006, 2011, 2003, 2003, 
    2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
    13)), row.names = c(NA, -13L), class = c("tbl_df", "tbl", 
"data.frame"))
> df1
# A tibble: 13 x 8
   make   make_id model        vehicle_model_id variant           from_year to_year    id
   <chr>    <dbl> <chr>                   <dbl> <chr>                 <dbl>   <dbl> <dbl>
 1 audi         1 A3                          1 1.4L TFSI              2014    2020     1
 2 audi         1 A3                          1 1.6L TFSI              2008    2012     2
 3 audi         1 A4                          2 1.6L                   2004    2008     3
 4 audi         1 A4                          2 1.8L TFSI Quattro      2011    2016     4
 5 bmw          2 3 Series                    3 320d                   2012    2015     5
 6 bmw          2 3 Series                    3 320d                   2015    2020     6
 7 toyota       3 Land Cruiser                4 4.2L VX AT             1998    2003     7
 8 toyota       3 Camry                       5 2.4L LE MT             2001    2006     8
 9 toyota       3 Camry                       5 2.4L LE MT             2006    2011     9
10 honda        4 Accord                      6 2.3L VTI AT            2001    2003    10
11 honda        4 Accord                      6 2.3L VTI S             2001    2003    11
12 honda        4 civic                       7 1.8L SPORT             2009    2012    12
13 honda        4 civic                       7 1.8L V                 2006    2009    13
> 

Could someone let me know how to get the new 'id' columns as shown above based on the unique value of strings in "make" and "model" columns.

Once I get the above output, I can then get the normalized tables as below.

> manufacturers
id    make
1     audi
2     bmw
3     toyota
4     honda

> models
id    make_id    model
1     1          A3
2     1          A4
3     2          3 Series
4     3          Land Cruiser
5     3          Camry
6     4          Accord
7     4          civic

> variants
id    name                 vehicle_model_id     from_year      to_year
1     1.4 TFSI             1                    2014           2020
2     1.6 TFSI             1                    2008           2012
3     1.6L                 2                    2004           2008
4     1.8L TFSI Quattro    2                    2011           2016
..
..

Solution

  • Here's how you might do this using dplyr:

    Manufacturers

    library(dplyr)
    
     df %>%
        mutate(id = as.numeric(as.factor(make))) %>% 
        group_by(make) %>% 
        summarise(id = mean(id)) %>% 
        select(2:1)
    
    #> # A tibble: 4 x 2
    #>      id make  
    #>   <dbl> <chr> 
    #> 1     1 audi  
    #> 2     2 bmw   
    #> 3     3 honda 
    #> 4     4 toyota
    

    Models

    df %>% mutate(make_id = as.numeric(as.factor(make))) %>%
        group_by(model) %>% 
        summarize(make_id = mean(make_id)) %>%
        arrange(make_id) %>%
        mutate(id = row_number()) %>%
        select(c(3, 1, 2))
    
    #> # A tibble: 7 x 3
    #>      id model        make_id
    #>   <int> <chr>          <dbl>
    #> 1     1 A3                 1
    #> 2     2 A4                 1
    #> 3     3 3 Series           2
    #> 4     4 Accord             3
    #> 5     5 civic              3
    #> 6     6 Camry              4
    #> 7     7 Land Cruiser       4
    

    Variants

    df %>% mutate(model_id = as.numeric(as.factor(model))) %>%
        group_by(variant) %>% 
        summarize(vehicle_model_id = mean(model_id),
                  from_year = from_year,
                  to_year = to_year) %>%
      arrange(vehicle_model_id) %>%
      ungroup() %>%
      mutate(id = row_number()) %>%
      select(5, 1:4)
    
    #> # A tibble: 13 x 5
    #>       id variant           vehicle_model_id from_year to_year
    #>    <int> <chr>                        <dbl>     <dbl>   <dbl>
    #>  1     1 320d                             1      2012    2015
    #>  2     2 320d                             1      2015    2020
    #>  3     3 1.4L TFSI                        2      2014    2020
    #>  4     4 1.6L TFSI                        2      2008    2012
    #>  5     5 1.6L                             3      2004    2008
    #>  6     6 1.8L TFSI Quattro                3      2011    2016
    #>  7     7 2.3L VTI AT                      4      2001    2003
    #>  8     8 2.3L VTI S                       4      2001    2003
    #>  9     9 2.4L LE MT                       5      2001    2006
    #> 10    10 2.4L LE MT                       5      2006    2011
    #> 11    11 1.8L SPORT                       6      2009    2012
    #> 12    12 1.8L V                           6      2006    2009
    #> 13    13 4.2L VX AT                       7      1998    2003
    

    Created on 2020-07-09 by the reprex package (v0.3.0)