I have a consolidated table as given below:
> dput(data.frame(df))
structure(list(make = c("audi", "audi", "audi", "audi", "bmw",
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda",
"honda"), model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series",
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic",
"civic"), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L", "1.8L TFSI Quattro",
"320d", "320d", "4.2L VX AT", "2.4L LE MT", "2.4L LE MT", "2.3L VTI AT",
"2.3L VTI S", "1.8L SPORT", "1.8L V"), from_year = c(2014, 2008,
2004, 2011, 2012, 2015, 1998, 2001, 2006, 2001, 2001, 2009, 2006
), to_year = c(2020, 2012, 2008, 2016, 2015, 2020, 2003, 2006,
2011, 2003, 2003, 2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13)), class = "data.frame", row.names = c(NA,
-13L))
dataframe structure:
> df
make model variant from_year to_year id
1 audi A3 1.4L TFSI 2014 2020 1
2 audi A3 1.6L TFSI 2008 2012 2
3 audi A4 1.6L 2004 2008 3
4 audi A4 1.8L TFSI Quattro 2011 2016 4
5 bmw 3 Series 320d 2012 2015 5
6 bmw 3 Series 320d 2015 2020 6
7 toyota Land Cruiser 4.2L VX AT 1998 2003 7
8 toyota Camry 2.4L LE MT 2001 2006 8
9 toyota Camry 2.4L LE MT 2006 2011 9
10 honda Accord 2.3L VTI AT 2001 2003 10
11 honda Accord 2.3L VTI S 2001 2003 11
12 honda civic 1.8L SPORT 2009 2012 12
13 honda civic 1.8L V 2006 2009 13
I need to normalize the table based on make, model and variant, with primary key for each table
Expected Output:
> dput(df1)
structure(list(make = c("audi", "audi", "audi", "audi", "bmw",
"bmw", "toyota", "toyota", "toyota", "honda", "honda", "honda",
"honda"), make_id = c(1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4),
model = c("A3", "A3", "A4", "A4", "3 Series", "3 Series",
"Land Cruiser", "Camry", "Camry", "Accord", "Accord", "civic",
"civic"), vehicle_model_id = c(1, 1, 2, 2, 3, 3, 4, 5, 5,
6, 6, 7, 7), variant = c("1.4L TFSI", "1.6L TFSI", "1.6L",
"1.8L TFSI Quattro", "320d", "320d", "4.2L VX AT", "2.4L LE MT",
"2.4L LE MT", "2.3L VTI AT", "2.3L VTI S", "1.8L SPORT",
"1.8L V"), from_year = c(2014, 2008, 2004, 2011, 2012, 2015,
1998, 2001, 2006, 2001, 2001, 2009, 2006), to_year = c(2020,
2012, 2008, 2016, 2015, 2020, 2003, 2006, 2011, 2003, 2003,
2012, 2009), id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13)), row.names = c(NA, -13L), class = c("tbl_df", "tbl",
"data.frame"))
> df1
# A tibble: 13 x 8
make make_id model vehicle_model_id variant from_year to_year id
<chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 audi 1 A3 1 1.4L TFSI 2014 2020 1
2 audi 1 A3 1 1.6L TFSI 2008 2012 2
3 audi 1 A4 2 1.6L 2004 2008 3
4 audi 1 A4 2 1.8L TFSI Quattro 2011 2016 4
5 bmw 2 3 Series 3 320d 2012 2015 5
6 bmw 2 3 Series 3 320d 2015 2020 6
7 toyota 3 Land Cruiser 4 4.2L VX AT 1998 2003 7
8 toyota 3 Camry 5 2.4L LE MT 2001 2006 8
9 toyota 3 Camry 5 2.4L LE MT 2006 2011 9
10 honda 4 Accord 6 2.3L VTI AT 2001 2003 10
11 honda 4 Accord 6 2.3L VTI S 2001 2003 11
12 honda 4 civic 7 1.8L SPORT 2009 2012 12
13 honda 4 civic 7 1.8L V 2006 2009 13
>
Could someone let me know how to get the new 'id' columns as shown above based on the unique value of strings in "make" and "model" columns.
Once I get the above output, I can then get the normalized tables as below.
> manufacturers
id make
1 audi
2 bmw
3 toyota
4 honda
> models
id make_id model
1 1 A3
2 1 A4
3 2 3 Series
4 3 Land Cruiser
5 3 Camry
6 4 Accord
7 4 civic
> variants
id name vehicle_model_id from_year to_year
1 1.4 TFSI 1 2014 2020
2 1.6 TFSI 1 2008 2012
3 1.6L 2 2004 2008
4 1.8L TFSI Quattro 2 2011 2016
..
..
Here's how you might do this using dplyr:
Manufacturers
library(dplyr)
df %>%
mutate(id = as.numeric(as.factor(make))) %>%
group_by(make) %>%
summarise(id = mean(id)) %>%
select(2:1)
#> # A tibble: 4 x 2
#> id make
#> <dbl> <chr>
#> 1 1 audi
#> 2 2 bmw
#> 3 3 honda
#> 4 4 toyota
Models
df %>% mutate(make_id = as.numeric(as.factor(make))) %>%
group_by(model) %>%
summarize(make_id = mean(make_id)) %>%
arrange(make_id) %>%
mutate(id = row_number()) %>%
select(c(3, 1, 2))
#> # A tibble: 7 x 3
#> id model make_id
#> <int> <chr> <dbl>
#> 1 1 A3 1
#> 2 2 A4 1
#> 3 3 3 Series 2
#> 4 4 Accord 3
#> 5 5 civic 3
#> 6 6 Camry 4
#> 7 7 Land Cruiser 4
Variants
df %>% mutate(model_id = as.numeric(as.factor(model))) %>%
group_by(variant) %>%
summarize(vehicle_model_id = mean(model_id),
from_year = from_year,
to_year = to_year) %>%
arrange(vehicle_model_id) %>%
ungroup() %>%
mutate(id = row_number()) %>%
select(5, 1:4)
#> # A tibble: 13 x 5
#> id variant vehicle_model_id from_year to_year
#> <int> <chr> <dbl> <dbl> <dbl>
#> 1 1 320d 1 2012 2015
#> 2 2 320d 1 2015 2020
#> 3 3 1.4L TFSI 2 2014 2020
#> 4 4 1.6L TFSI 2 2008 2012
#> 5 5 1.6L 3 2004 2008
#> 6 6 1.8L TFSI Quattro 3 2011 2016
#> 7 7 2.3L VTI AT 4 2001 2003
#> 8 8 2.3L VTI S 4 2001 2003
#> 9 9 2.4L LE MT 5 2001 2006
#> 10 10 2.4L LE MT 5 2006 2011
#> 11 11 1.8L SPORT 6 2009 2012
#> 12 12 1.8L V 6 2006 2009
#> 13 13 4.2L VX AT 7 1998 2003
Created on 2020-07-09 by the reprex package (v0.3.0)