I originally developed a model to work with a df as df1_ref. Now, I found more data I need in a different format as df2.
df1_ref:
id country point_id lon lat value_A
10195 United States 0 -98.52838821 35.33514183 22
10195 United States 1 -98.52838821 35.33514183 22
10195 United States 2 -98.52838821 35.33514183 22
10195 United States 3 -98.6100762 35.34134382 22
557 United States 0 -98.6100762 35.34134382 33
557 United States 1 -98.68102873 35.32256443 33
557 United States 2 -98.68102873 35.32256443 33
11354 United States 0 -95.4872305 31.85227725 22
11354 United States 1 -95.4872305 31.85227725 22
11354 United States 2 -95.474907 31.90126317 22
10131 United States 0 -98.7424264 35.777792 25
10131 United States 1 -98.58307728 35.74085314 25
10131 United States 2 -98.58307728 35.74085314 25
df2:
id country point_id_0 mid_points last_point_n route_id value_B
[10195, 557, 10123, 10105] United States [35.31514548, -98.46788543] [[35.33514183, -98.52838821], [35.34134382, -98.6100762]] [35.50457366645043, -98.98023683317503] 883 3500.69
[10029, 10027, 11354, 11355] United States [31.77540736, -95.11453227] [[31.83779617, -95.1591275], [31.89157317, -95.20337925]] [31.3187429993161, -95.450949000766] 1156 105.9092
[10131, 10204, 553, 2855] United States [35.77292283, -98.69682372] [[35.777792, -98.7424264], [35.74085314, -98.58307728]] [35.7434880000976, -99.2341759999093] 1186 4478.04
dput(df2[1:3,])
structure(list(id = c("[10195, 557, 10123, 10105, 10088, 10083]",
"[5349, 5369, 5414, 5455]", "[13184, 13217, 13235, 13251]"),
country = c("United States", "United States", "United States"
), point_id_0 = c("[35.31514548, -98.46788543]", "[40.7595028, -80.78622874]",
"[26.8019707, -99.18425714]"), mid_points = c("[[35.33514183, -98.52838821], [35.34134382, -98.6100762], [35.32256443, -98.68102873], [35.314511, -98.74060343], [35.298251, -98.796368], [35.30914456, -98.87914044], [35.40467971, -98.97597341], [35.50457367, -98.98023683]]",
"[[40.803904, -80.757632], [40.835328, -80.761536], [40.832602, -80.728872], [40.824192, -80.628096], [40.844928, -80.588928], [40.870912, -80.51968], [40.887313, -80.441874], [40.911872, -80.325376], [40.935712, -80.146112]]",
"[[26.77083569, -99.16190154], [26.8606385, -99.0916255], [26.99262, -99.04733833], [27.04302367, -99.0116915], [27.24608845, -98.94356648]]"
), last_point_n = c("[35.50457366645043, -98.98023683317503]",
"[40.9265919994913, -79.9925760003392]", "[27.24608845465542, -98.94356647741596]"
), route_id = c(883, 884, 885), value_B = c(3500.69, 20911.7359999998,
1794.77)), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"))
I need to convert df2 into df1_ref for me to run my previous model.
Here I explain what I need:
Various id in df2 come in the same observation, e.g., [10195, 557, 10123, 10105...]; I simplify to 4 id's for the example. So, I need to split it into rows. I share some examples as how it would look like in df1.
point_id_0 from df2 is lat and lon of point_id = 0 of df1. So, I need to split point_id_0 in lat and lon columns, respectively.
mid_points from df2 are the next coordinates lat/lot of my network, each pair of coordinates belongs to one id in the same order (1,2,3...). So, first mid_points of route_id is id = 557, then to become 1 in point_id.
last_point_n is the last coordinate, belonging to the last id, e.g., 10105; point_id = 3.
Therefore, df3 is built from df2, considering df1 as a reference. So, the final df3 is as df1 + value_B + route_id.
I have tried the following and it works:
df3 <- cbind(df2,
do.call(rbind,
lapply(df2$point_id_0, jsonlite::fromJSON)))
However, it did not work for mid_points as it would require first to convert all [lat, lon] into rows and then split it in columns.
This is a partial answer to show how one might work with columns containing JSON strings of (nested) lists and with resulting list columns. I'm afraid there's just not enough information available in the question to assign id
and point_id
values for mid-point coordinates.
Resulting id
is a list column, for first and last points of each group id
list includes a single value; for all midpoints it holds remaining id values.
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(purrr)
library(jsonlite, warn.conflicts = FALSE)
df2 |>
# parse JSON columns with parse_json
mutate(across(c(id, point_id_0:last_point_n), \(x) map(x, parse_json))) |>
# combine point_id_0, mid_points & last_point_n into a single column, rowwise
rowwise() |>
mutate(pts = c(list(point_id_0), mid_points, list(last_point_n)) |> list(),
.keep = "unused", .after = country) |>
ungroup() |>
# unnest newly create pts column, hoist lon & lat to top-level columns
unnest_longer(pts) |>
hoist(pts, lon = 2, lat = 1) |>
select(-pts) |>
# group by id, assign 1st and last id-s, for mid-points leave a list of id candidates
mutate(
id = case_when(row_number() == 1 ~ list(first(id[[1]])),
row_number() == n() ~ list(last (id[[1]])),
.default = id[[1]] |> head(-1) |> tail(-1) |> list()) ,
.by = id) |>
# display list column content and increase number of displayed digits
rowwise() |>
mutate(id_chr = paste0(id, collapse = ","), .after = id, across(lon:lat, \(x) tibble::num(x, digits = 9) )) |>
ungroup() |>
print(n = 28)
Result:
#> # A tibble: 28 × 7
#> id id_chr country lon lat route_id value_B
#> <list> <chr> <chr> <num:.9!> <num:.9!> <dbl> <dbl>
#> 1 <int [1]> 10195 United… -98.467885430 35.315145480 883 3501.
#> 2 <list [4]> 557,10123,101… United… -98.528388210 35.335141830 883 3501.
#> 3 <list [4]> 557,10123,101… United… -98.610076200 35.341343820 883 3501.
#> 4 <list [4]> 557,10123,101… United… -98.681028730 35.322564430 883 3501.
#> 5 <list [4]> 557,10123,101… United… -98.740603430 35.314511000 883 3501.
#> 6 <list [4]> 557,10123,101… United… -98.796368000 35.298251000 883 3501.
#> 7 <list [4]> 557,10123,101… United… -98.879140440 35.309144560 883 3501.
#> 8 <list [4]> 557,10123,101… United… -98.975973410 35.404679710 883 3501.
#> 9 <list [4]> 557,10123,101… United… -98.980236830 35.504573670 883 3501.
#> 10 <int [1]> 10083 United… -98.980236833 35.504573666 883 3501.
#> 11 <int [1]> 5349 United… -80.786228740 40.759502800 884 20912.
#> 12 <list [2]> 5369,5414 United… -80.757632000 40.803904000 884 20912.
#> 13 <list [2]> 5369,5414 United… -80.761536000 40.835328000 884 20912.
#> 14 <list [2]> 5369,5414 United… -80.728872000 40.832602000 884 20912.
#> 15 <list [2]> 5369,5414 United… -80.628096000 40.824192000 884 20912.
#> 16 <list [2]> 5369,5414 United… -80.588928000 40.844928000 884 20912.
#> 17 <list [2]> 5369,5414 United… -80.519680000 40.870912000 884 20912.
#> 18 <list [2]> 5369,5414 United… -80.441874000 40.887313000 884 20912.
#> 19 <list [2]> 5369,5414 United… -80.325376000 40.911872000 884 20912.
#> 20 <list [2]> 5369,5414 United… -80.146112000 40.935712000 884 20912.
#> 21 <int [1]> 5455 United… -79.992576000 40.926591999 884 20912.
#> 22 <int [1]> 13184 United… -99.184257140 26.801970700 885 1795.
#> 23 <list [2]> 13217,13235 United… -99.161901540 26.770835690 885 1795.
#> 24 <list [2]> 13217,13235 United… -99.091625500 26.860638500 885 1795.
#> 25 <list [2]> 13217,13235 United… -99.047338330 26.992620000 885 1795.
#> 26 <list [2]> 13217,13235 United… -99.011691500 27.043023670 885 1795.
#> 27 <list [2]> 13217,13235 United… -98.943566480 27.246088450 885 1795.
#> 28 <int [1]> 13251 United… -98.943566477 27.246088455 885 1795.
Input frame:
df2 <- structure(list(id = c("[10195, 557, 10123, 10105, 10088, 10083]",
"[5349, 5369, 5414, 5455]", "[13184, 13217, 13235, 13251]"),
country = c("United States", "United States", "United States"
), point_id_0 = c("[35.31514548, -98.46788543]", "[40.7595028, -80.78622874]",
"[26.8019707, -99.18425714]"), mid_points = c("[[35.33514183, -98.52838821], [35.34134382, -98.6100762], [35.32256443, -98.68102873], [35.314511, -98.74060343], [35.298251, -98.796368], [35.30914456, -98.87914044], [35.40467971, -98.97597341], [35.50457367, -98.98023683]]",
"[[40.803904, -80.757632], [40.835328, -80.761536], [40.832602, -80.728872], [40.824192, -80.628096], [40.844928, -80.588928], [40.870912, -80.51968], [40.887313, -80.441874], [40.911872, -80.325376], [40.935712, -80.146112]]",
"[[26.77083569, -99.16190154], [26.8606385, -99.0916255], [26.99262, -99.04733833], [27.04302367, -99.0116915], [27.24608845, -98.94356648]]"
), last_point_n = c("[35.50457366645043, -98.98023683317503]",
"[40.9265919994913, -79.9925760003392]", "[27.24608845465542, -98.94356647741596]"
), route_id = c(883, 884, 885), value_B = c(3500.69, 20911.7359999998,
1794.77)), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"))
df2
#> # A tibble: 3 × 7
#> id country point_id_0 mid_points last_point_n route_id value_B
#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 [10195, 557, 1012… United… [35.31514… [[35.3351… [35.5045736… 883 3501.
#> 2 [5349, 5369, 5414… United… [40.75950… [[40.8039… [40.9265919… 884 20912.
#> 3 [13184, 13217, 13… United… [26.80197… [[26.7708… [27.2460884… 885 1795.
glimpse(df2)
#> Rows: 3
#> Columns: 7
#> $ id <chr> "[10195, 557, 10123, 10105, 10088, 10083]", "[5349, 5369,…
#> $ country <chr> "United States", "United States", "United States"
#> $ point_id_0 <chr> "[35.31514548, -98.46788543]", "[40.7595028, -80.78622874…
#> $ mid_points <chr> "[[35.33514183, -98.52838821], [35.34134382, -98.6100762]…
#> $ last_point_n <chr> "[35.50457366645043, -98.98023683317503]", "[40.926591999…
#> $ route_id <dbl> 883, 884, 885
#> $ value_B <dbl> 3500.69, 20911.74, 1794.77
Also note that all 28 coordinate pairs in provided (dput) sample are unique, and there are signs of precision issues. For example consider rows 9 & 10 of resulting frame, corresponding values in input data are:
35.50457367, -98.98023683
35.50457366645043, -98.98023683317503