Search code examples
rdplyrtidyrdata-cleaningstringr

R - extract of one character string column for each row: variable-names an values - data cleaning


How can I extract the variables and their values for each row when they are stuck in a character row like {"variable1":value1,"variable2":value2, ...}?

In my df about activities, there is one "info" column (a string with variables following their values) which should be made to columns themselves. The strings are buit like this: {"variable1":value1,"variable2":value2, ...} and NA rows are: {}).

   activity_type  #There is also the column "activity_type" 
94       Running  #(what activity the measurement was)
95   Multi Sport
96       Running
98       Walking
                                                                                                                                                                                                                                                                                                                
info
94 "{\"calories\":2994,\"intensity\":30,\"manual_distance\":0,\"manual_calories\":0,\"hr_average\":60,\"hr_min\":54,\"hr_max\":66,\"hr_zone_0\":86,\"hr_zone_1\":0,\"hr_zone_2\":0,\"hr_zone_3\":0,\"device_startdate\":1612293557,\"device_enddate\":1612315157,\"pause_duration\":0,\"steps\":767,\"distance\":566,\"elevation\":50,\"metcumul\":50}"                                                                                                                                                                                                                                                                                                            
95 {}
96 "{\"calories\":2994,\"intensity\":30,\"manual_distance\":0,\"manual_calories\":0,\"hr_average\":52,\"hr_min\":49,\"hr_max\":58,\"hr_zone_0\":59,\"hr_zone_1\":0,\"hr_zone_2\":0,\"hr_zone_3\":0,\"device_startdate\":1612336821,\"device_enddate\":1612358421,\"pause_duration\":0,\"steps\":1162,\"distance\":827,\"elevation\":101,\"metcumul\":101}"                                                                                                                                                                                                                                                                                                             
98 "{\"calories\":1208,\"intensity\":30,\"manual_distance\":0,\"manual_calories\":0,\"hr_average\":0,\"hr_min\":0,\"hr_max\":0,\"hr_zone_0\":0,\"hr_zone_1\":0,\"hr_zone_2\":0,\"hr_zone_3\":0,\"device_startdate\":1612457880,\"device_enddate\":1612479480,\"pause_duration\":0,\"steps\":0,\"distance\":0,\"elevation\":0,\"metcumul\":0}"

#what I want in the end
row_number    calories  intensity  manual_distance  manual_calories  ...
94    2994      30             0                0           ... 
95    NA        NA             0                0           ...
96    2994      30             0                0           ...
98    1208      30             0                0           ...
                                                                                                                                                                                                                                                                                                 

I tried:

info2 <-as.data.frame(do.call(rbind, strsplit(infos, ":|,")))

FYI: The sequence of variables are different between rows ("calories", "intensity"...all start similar but the sequences differ). So the resulting df was not "consistent". I thought the resulting df would be constructed like: one col is one type of variable and the next col the corresponding value :

#I thought:
#          V25=steps              V27=distance        V29=evaluation V31=metcul
                V25        V26              V27  V28         V29 V30        V31
1           "steps"          0       "distance"    0 "elevation"  17 "metcumul"
2           "steps"          0       "distance"    0 "elevation"  17 "metcumul"
3          "steps"       2420       "distance" 1971 "elevation" 110 "metcumul"

But as the sequence of variables in each row differ, the result is shifted:

info2 [c(1, 52, 86, 93:95), c(25:36)]
                V25        V26              V27  V28           V29 V30
1           "steps"          0       "distance"    0   "elevation"  17
52          "steps"        828       "distance"  536   "elevation"  33
86           "laps"          0           "mvts"    0 "pool_length"  25
93          "steps"       2420       "distance" 1971   "elevation" 110
94 "device_enddate" 1612315157 "pause_duration"    0       "steps" 767
95               {}         {}               {}   {}            {}  {}
          V31  V32         V33  V34          V35 V36
1  "metcumul"  17} {"calories"  123  "intensity"  30
52 "metcumul"  33} {"calories"   30  "intensity"  30
86     "type"   9} {"calories" 1881 "hr_average"  55
93 "metcumul" 110} {"calories"  218  "intensity"  30
94 "distance"  566 "elevation"   50   "metcumul" 50}
95         {}   {}          {}   {}           {}  {}

FYI: There are even differences within one type of activity (some have more/less variables, so the shorter rows have to repeat the first variables at the end) :

swim2[2:4,c(1,29:34)]
           V1           V29 V30       V31 V32         V33  V34
2 {"calories" "pool_length"  25 "version"   0      "type"   3}
3 {"calories" "pool_length"  25 "version"   0      "type"   4}
4 {"calories" "pool_length"  25    "type"  9} {"calories" 1881

walking2[17:19,c(1:2,21:31)]
            V1   V2         V21 V22                V23        V24
17 {"calories"   30 "hr_zone_3"   0   "pause_duration"          0
18 {"calories"   13 "hr_zone_3"   0   "pause_duration"          0
19 {"calories" 1208 "hr_zone_3"   0 "device_startdate" 1612457880
                V25        V26              V27 V28         V29 V30        V31
17          "steps"        511       "distance" 339 "elevation"  23 "metcumul"
18          "steps"        405       "distance" 282 "elevation"  16 "metcumul"
19 "device_enddate" 1612479480 "pause_duration"   0     "steps"   0 "distance"

Conclusion: How can I extract the variables and their values for each row (when they are stuck in a row like {"variable1":value1,"variable2":value2, ...})?

Maybe with stringr? For example looking for "calories" in the row and then use the next number? Or maybe I should have used the strsplitin a loop/ apply function?

I am absolutly open to other approachs!

Thanks a lot to all R-cracks!


Solution

  • Here is one idea.

    library(tidyverse)
    
    infogroup2 <- infogroup %>%
      mutate(ID = 1:n()) %>%
      mutate(info = str_remove_all(info, pattern = regex("\\{|\\}"))) %>%
      mutate(info = ifelse(info %in% "", NA, info)) %>%
      separate_rows(info, sep = ",") %>%
      separate(info, into = c("parameter", "value"), sep = ":", convert = TRUE) %>%
      mutate(parameter = str_remove_all(parameter, pattern = regex('\\"|\\"'))) %>%
      pivot_wider(names_from = "parameter", values_from = "value", values_fill = 0) %>%
      select(-all_of(c("NA", "ID")))
    

    DATA

    infogroup <- tibble(
      activity_type = c("Running", "Multi Sport", "Running", "Walking"),
      info = c('{"calories":2994,"intensity":30,"manual_distance":0,"manual_calories":0,"hr_average":60,"hr_min":54,"hr_max":66,"hr_zone_0":86,"hr_zone_1":0,"hr_zone_2":0,"hr_zone_3":0,"device_startdate":1612293557,"device_enddate":1612315157,"pause_duration":0,"steps":767,"distance":566,"elevation":50,"metcumul":50}',                                                                                                                                                                        
               '{}',
               '{"calories":2994,"intensity":30,"manual_distance":0,"manual_calories":0,"hr_average":52,"hr_min":49,"hr_max":58,"hr_zone_0":59,"hr_zone_1":0,"hr_zone_2":0,"hr_zone_3":0,"device_startdate":1612336821,"device_enddate":1612358421,"pause_duration":0,"steps":1162,"distance":827,"elevation":101,"metcumul":101}',                                                                                                                                                                                                                                                                                                             
               '{"calories":1208,"intensity":30,"manual_distance":0,"manual_calories":0,"hr_average":0,"hr_min":0,"hr_max":0,"hr_zone_0":0,"hr_zone_1":0,"hr_zone_2":0,"hr_zone_3":0,"device_startdate":1612457880,"device_enddate":1612479480,"pause_duration":0,"steps":0,"distance":0,"elevation":0,"metcumul":0}'
     ))