I use str_extract_all()
from the stringr
package to extract all numbers from strings. The dataset I am using is
dat <- data.frame(
X1 = rep(
c(
"DIESEL", "", "Oberösterreich", "", "Salzburg", "", "Steiermark",
""
),
c(1L, 10L, 1L, 15L, 1L, 5L, 1L, 6L)
),
X2 = c(
"Mistelbach Bezirk", "", "Mödling", "Neunkirchen", "Sankt Pölten(Land)",
"Scheibbs", "Tulln", "Waidhofen an der Thaya", "Wiener Neustadt(Land)",
"Wien-Umgebung", "Zwettl", "Linz(Stadt)", "Steyr(Land / Stadt)",
"Wels(Stadt / Land)", "Braunau am Inn", "Eferding", "Freistadt",
"Gmunden", "Grieskirchen", "Kirchdorf an der Krems", "Linz-Land",
"Perg", "Ried im Innkreis", "Rohrbach", "Schärding", "Urfahr-Umgebung",
"Vöcklabruck", "Salzburg(Stadt)", "Hallein", "Salzburg-Umgebung",
"Sankt Johann im Pongau", "Tamsweg", "Zell am See", "Graz(Stadt)",
"Bruck an der Mur", "Deutschlandsberg", "Feldbach", "Fürstenfeld",
"Graz-Umgebung", ""
),
X3 = rep("", 40L),
X4 = c(
"Tank- 33", "", "stellen 30", "22", "36", "11", "31", "13",
"24", "46", "35", "38", "33", "51", "41", "18", "33", "38", "34",
"21", "39", "24", "34", "25", "36", "40", "50", "32", "17", "52",
"36", "9", "38", "55", "22", "24", "37", "18", "80", ""
),
X5 = c(
"Änderun6g.e2n98", "", "(Häufig1k0e.i6t2) 1", "2.834", "8.342",
"869", "3.636", "399", "5.684", "13.599", "4.661", "15.855",
"11.094", "19.068", "5.344", "5.143", "4.133", "11.363", "3.064",
"8.020", "15.607", "4.988", "9.854", "741", "5.144", "9.515",
"11.973", "12.056", "4.913", "16.722", "9.015", "446", "8.888",
"16.007", "5.175", "4.507", "6.796", "1.460", "20.271", ""
),
X6 = c(
"190,85", "Häufigkeit", "354,03", "128,82", "231,72", "79,00",
"117,29", "30,69", "236,83", "295,63", "133,17", "417,24", "336,18",
"373,88", "130,34", "285,72", "125,24", "299,03", "90,12", "381,90",
"400,18", "207,83", "289,82", "29,64", "142,89", "237,88", "239,46",
"376,75", "289,00", "321,58", "250,42", "49,56", "233,89", "291,04",
"235,23", "187,79", "183,68", "81,11", "253,39", ""
),
X7 = c(
"1,205", "", "1,205", "1,249", "1,199", "1,230", "1,240", "1,181",
"1,198", "1,219", "1,190", "1,193", "1,197", "1,192", "1,237",
"1,163", "1,208", "1,203", "1,176", "1,196", "1,184", "1,172",
"1,186", "1,201", "1,215", "1,195", "1,204", "1,228", "1,231",
"1,226", "1,243", "1,318", "1,246", "1,204", "1,215", "1,208",
"1,174", "1,232", "1,209", ""
),
X8 = c(
"1,173", "", "1,173", "1,209", "1,197", "1,229", "1,189", "1,174",
"1,181", "1,189", "1,171", "1,170", "1,181", "1,175", "1,199",
"1,153", "1,188", "1,184", "1,169", "1,181", "1,169", "1,168",
"1,177", "1,194", "1,191", "1,176", "1,179", "1,200", "1,209",
"1,194", "1,199", "1,309", "1,218", "1,172", "1,183", "1,169",
"1,161", "1,232", "1,179", ""
),
X9 = c(
"1,157", "", "1,158", "1,192", "1,180", "1,209", "1,169", "1,159",
"1,164", "1,173", "1,162", "1,155", "1,168", "1,163", "1,179",
"1,141", "1,174", "1,164", "1,159", "1,167", "1,154", "1,154",
"1,164", "1,184", "1,177", "1,163", "1,159", "1,189", "1,194",
"1,179", "1,184", "1,289", "1,198", "1,153", "1,161", "1,149",
"1,146", "1,199", "1,159", ""
),
X10 = c(
"1,207", "", "1,195", "1,239", "1,214", "1,249", "1,219", "1,209",
"1,199", "1,209", "1,182", "1,189", "1,199", "1,189", "1,219",
"1,164", "1,199", "1,204", "1,189", "1,198", "1,189", "1,185",
"1,194", "1,209", "1,209", "1,193", "1,199", "1,224", "1,229",
"1,218", "1,239", "1,339", "1,239", "1,199", "1,199", "1,186",
"1,184", "1,259", "1,199", ""
),
X11 = c(
"0,050", "", "0,037", "0,047", "0,034", "0,040", "0,050", "0,050",
"0,035", "0,036", "0,020", "0,034", "0,031", "0,026", "0,040",
"0,023", "0,025", "0,040", "0,030", "0,031", "0,035", "0,031",
"0,030", "0,025", "0,032", "0,030", "0,040", "0,035", "0,035",
"0,039", "0,055", "0,050", "0,041", "0,046", "0,038", "0,037",
"0,038", "0,060", "0,040", ""
),
X12 = c(
"4,32%", "", "3,20%", "3,94%", "2,88%", "3,31%", "4,28%", "4,31%",
"3,01%", "3,07%", "1,72%", "2,94%", "2,65%", "2,24%", "3,39%",
"2,02%", "2,13%", "3,44%", "2,59%", "2,66%", "3,03%", "2,69%",
"2,58%", "2,11%", "2,72%", "2,58%", "3,45%", "2,94%", "2,93%",
"3,31%", "4,65%", "3,88%", "3,42%", "3,99%", "3,27%", "3,22%",
"3,32%", "5,00%", "3,45%", "27 / 32"
)
)
Then, I apply the following code to clean the dataset dat
library(dplyr)
library(string)
dat = dat %>% select(-X3) %>%
slice(-2) %>%
mutate(X1 = '',
X2 = str_remove(X2, ' Bezirk'),
X4 = str_remove(X4, 'Tank- '),
X4 = str_remove(X4, 'stellen '),
X5 = str_extract_all(X5, '[:digit:]'))
and now the problem occurs. The function str_extract_all()
returns a list of the extracted numbers separated by commas, and the list is stored in column X5
. So, I have a list of lists now. My objective is a column X5
consisting of numeric entries which I can further work with such as
X5
6298
10621
...
I tried many different ways and am out of ideas for now. Could anyone help me out how to achieve this?
As a side note: I apply the code to many lists of the same structure using lapply()
.
I am thankful for any hints!!
dat |> mutate(X5 = purrr::map_chr(X5, paste0, collapse = ""))