Search code examples
rdataframelistdplyrstringr

Convert output of str_extract_all to vector of numeric entries instead of list


I use str_extract_all() from the stringr package to extract all numbers from strings. The dataset I am using is

dat <- data.frame(
  X1 = rep(
    c(
      "DIESEL", "", "Oberösterreich", "", "Salzburg", "", "Steiermark",
      ""
    ),
    c(1L, 10L, 1L, 15L, 1L, 5L, 1L, 6L)
  ),
  X2 = c(
    "Mistelbach Bezirk", "", "Mödling", "Neunkirchen", "Sankt Pölten(Land)",
    "Scheibbs", "Tulln", "Waidhofen an der Thaya", "Wiener Neustadt(Land)",
    "Wien-Umgebung", "Zwettl", "Linz(Stadt)", "Steyr(Land / Stadt)",
    "Wels(Stadt / Land)", "Braunau am Inn", "Eferding", "Freistadt",
    "Gmunden", "Grieskirchen", "Kirchdorf an der Krems", "Linz-Land",
    "Perg", "Ried im Innkreis", "Rohrbach", "Schärding", "Urfahr-Umgebung",
    "Vöcklabruck", "Salzburg(Stadt)", "Hallein", "Salzburg-Umgebung",
    "Sankt Johann im Pongau", "Tamsweg", "Zell am See", "Graz(Stadt)",
    "Bruck an der Mur", "Deutschlandsberg", "Feldbach", "Fürstenfeld",
    "Graz-Umgebung", ""
  ),
  X3 = rep("", 40L),
  X4 = c(
    "Tank- 33", "", "stellen 30", "22", "36", "11", "31", "13",
    "24", "46", "35", "38", "33", "51", "41", "18", "33", "38", "34",
    "21", "39", "24", "34", "25", "36", "40", "50", "32", "17", "52",
    "36", "9", "38", "55", "22", "24", "37", "18", "80", ""
  ),
  X5 = c(
    "Änderun6g.e2n98", "", "(Häufig1k0e.i6t2) 1", "2.834", "8.342",
    "869", "3.636", "399", "5.684", "13.599", "4.661", "15.855",
    "11.094", "19.068", "5.344", "5.143", "4.133", "11.363", "3.064",
    "8.020", "15.607", "4.988", "9.854", "741", "5.144", "9.515",
    "11.973", "12.056", "4.913", "16.722", "9.015", "446", "8.888",
    "16.007", "5.175", "4.507", "6.796", "1.460", "20.271", ""
  ),
  X6 = c(
    "190,85", "Häufigkeit", "354,03", "128,82", "231,72", "79,00",
    "117,29", "30,69", "236,83", "295,63", "133,17", "417,24", "336,18",
    "373,88", "130,34", "285,72", "125,24", "299,03", "90,12", "381,90",
    "400,18", "207,83", "289,82", "29,64", "142,89", "237,88", "239,46",
    "376,75", "289,00", "321,58", "250,42", "49,56", "233,89", "291,04",
    "235,23", "187,79", "183,68", "81,11", "253,39", ""
  ),
  X7 = c(
    "1,205", "", "1,205", "1,249", "1,199", "1,230", "1,240", "1,181",
    "1,198", "1,219", "1,190", "1,193", "1,197", "1,192", "1,237",
    "1,163", "1,208", "1,203", "1,176", "1,196", "1,184", "1,172",
    "1,186", "1,201", "1,215", "1,195", "1,204", "1,228", "1,231",
    "1,226", "1,243", "1,318", "1,246", "1,204", "1,215", "1,208",
    "1,174", "1,232", "1,209", ""
  ),
  X8 = c(
    "1,173", "", "1,173", "1,209", "1,197", "1,229", "1,189", "1,174",
    "1,181", "1,189", "1,171", "1,170", "1,181", "1,175", "1,199",
    "1,153", "1,188", "1,184", "1,169", "1,181", "1,169", "1,168",
    "1,177", "1,194", "1,191", "1,176", "1,179", "1,200", "1,209",
    "1,194", "1,199", "1,309", "1,218", "1,172", "1,183", "1,169",
    "1,161", "1,232", "1,179", ""
  ),
  X9 = c(
    "1,157", "", "1,158", "1,192", "1,180", "1,209", "1,169", "1,159",
    "1,164", "1,173", "1,162", "1,155", "1,168", "1,163", "1,179",
    "1,141", "1,174", "1,164", "1,159", "1,167", "1,154", "1,154",
    "1,164", "1,184", "1,177", "1,163", "1,159", "1,189", "1,194",
    "1,179", "1,184", "1,289", "1,198", "1,153", "1,161", "1,149",
    "1,146", "1,199", "1,159", ""
  ),
  X10 = c(
    "1,207", "", "1,195", "1,239", "1,214", "1,249", "1,219", "1,209",
    "1,199", "1,209", "1,182", "1,189", "1,199", "1,189", "1,219",
    "1,164", "1,199", "1,204", "1,189", "1,198", "1,189", "1,185",
    "1,194", "1,209", "1,209", "1,193", "1,199", "1,224", "1,229",
    "1,218", "1,239", "1,339", "1,239", "1,199", "1,199", "1,186",
    "1,184", "1,259", "1,199", ""
  ),
  X11 = c(
    "0,050", "", "0,037", "0,047", "0,034", "0,040", "0,050", "0,050",
    "0,035", "0,036", "0,020", "0,034", "0,031", "0,026", "0,040",
    "0,023", "0,025", "0,040", "0,030", "0,031", "0,035", "0,031",
    "0,030", "0,025", "0,032", "0,030", "0,040", "0,035", "0,035",
    "0,039", "0,055", "0,050", "0,041", "0,046", "0,038", "0,037",
    "0,038", "0,060", "0,040", ""
  ),
  X12 = c(
    "4,32%", "", "3,20%", "3,94%", "2,88%", "3,31%", "4,28%", "4,31%",
    "3,01%", "3,07%", "1,72%", "2,94%", "2,65%", "2,24%", "3,39%",
    "2,02%", "2,13%", "3,44%", "2,59%", "2,66%", "3,03%", "2,69%",
    "2,58%", "2,11%", "2,72%", "2,58%", "3,45%", "2,94%", "2,93%",
    "3,31%", "4,65%", "3,88%", "3,42%", "3,99%", "3,27%", "3,22%",
    "3,32%", "5,00%", "3,45%", "27 / 32"
  )
)

Then, I apply the following code to clean the dataset dat

library(dplyr)
library(string)

dat = dat %>% select(-X3) %>%
             slice(-2) %>%
             mutate(X1 = '',
                    X2 = str_remove(X2, ' Bezirk'),
                    X4 = str_remove(X4, 'Tank- '),
                    X4 = str_remove(X4, 'stellen '),
                    X5 = str_extract_all(X5, '[:digit:]'))

and now the problem occurs. The function str_extract_all() returns a list of the extracted numbers separated by commas, and the list is stored in column X5. So, I have a list of lists now. My objective is a column X5 consisting of numeric entries which I can further work with such as

X5
6298
10621
...

I tried many different ways and am out of ideas for now. Could anyone help me out how to achieve this?

As a side note: I apply the code to many lists of the same structure using lapply().

I am thankful for any hints!!


Solution

  • dat |> mutate(X5 = purrr::map_chr(X5, paste0, collapse = ""))