I’m a newbie to alle the geo functionalities of R, so forgive me if this is trivial.
Im trying to draw a randomized sample from the locations
object (consisting of lat/long coordinates). All of the given coordinates are located in the German province of Bavaria.
What do I want to do exactly? Overlay a grid on the extension of Bavaria and then randomly select a given number of locations within each cell of the grid.
All discussions I found on Stackoverflow don’t seem to include any reference to this exact question.
Below is some code but it is nowhere near to a solution. The data file can be retrieved via my Dropbox: https://www.dropbox.com/s/sw18m8a3168lhbc/coords.csv?dl=0
Thanks a lot for your help!
Oliver
library(sf)
germany <- raster::getData("GADM", country = "Germany", level = 1)
bavaria <- germany[germany$NAME_1 == "Bayern",]
bavaria_mod <- sf::st_as_sf(bavaria)
grid <- bavaria_mod %>%
st_make_grid(cellsize = 0.1, what = "polygons", square = FALSE) %>%
st_intersection(bavaria_mod)
# Specify file path
coord <- read.csv(".../coords.csv", sep=";", row.names=,1, na.strings="")
library(dplyr)
test <- tibble::as_tibble(coord) |>
filter(!is.na(Latitude), !is.na(Longitude))
# Plots
plot(grid)
points(test$Longitude, test$Latitude, col='orange', pch=20, cex=0.75)
You probably have to figure out optimal grid parameters and best sampling approach yourself depending on the context and actual domain. E.g. consider grid cell size and shape (square vs hexagon), should sampling be weighted, should weights be based on a grid cell area, population density or some other factor. Or maybe the regular grid is not suitable at all and it makes sense to use Eurostat NUTS regions, some service area, local municipality borders, distances from main roads or something else to drive that sampling.
Here's one example of how one could first create 100x100km regular square grid, divide points by cells and sample 10% of points from each cell.
library(sf)
#> Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(dplyr)
library(ggplot2)
# Transforming to projected coordinate reference system to create a metric grid
# EPSG:25832, UTM zone 32N -- https://epsg.io/25832 ;
bavaria_sf <- geodata::gadm("Germany", level = 1, path = "gadm_dl") %>%
st_as_sf() %>%
filter(NAME_1 == "Bayern") %>%
st_transform(crs = 25832)
# Create 100x100km grid, intersect with bravia_sf,
# transform back to WGS84 so it could be joined to points
grid_sf <- bavaria_sf %>%
st_make_grid(cellsize = units::set_units(100, km)) %>%
st_intersection(bavaria_sf) %>%
# st_intersection returns geometry column,
# convert to sf so it could store cell_id attribute
st_as_sf() %>%
st_set_geometry("geometry") %>%
# factor : for more convenient ggplot viz
mutate(cell_id = as.factor(row_number())) %>%
st_transform(crs = "WGS84")
grid_sf %>%
ggplot(aes(label = cell_id)) +
geom_sf() +
geom_sf_label(alpha = .5) +
ggtitle("100x100 grid, cell_id values") +
theme_minimal()
## Create sf of points from csv
points_csv <- "https://www.dropbox.com/s/sw18m8a3168lhbc/coords.csv?dl=1"
points_latlon <- readr::read_delim(points_csv, delim = ";", show_col_types = FALSE)
# check data quality,
# quite a few NA values & punctuation issues resulting in out-of-range coordinates
summary(points_latlon)
#> Latitude Longitude
#> Min. : 12 Min. : 11
#> 1st Qu.: 48 1st Qu.: 11
#> Median : 49 Median : 12
#> Mean : 12003 Mean : 2939
#> 3rd Qu.: 49 3rd Qu.: 13
#> Max. :47753699 Max. :11717572
#> NA's :137 NA's :137
# assign point id, remove NA values, convert to sf,
# add grid attributes (cell_id) to to points through spatial join
# left = FALSE : use inner join to remove out-of-range coordinates
points_sf <- points_latlon %>%
mutate(point_id = row_number()) %>%
na.omit() %>%
st_as_sf(coords = c("Longitude", "Latitude"), remove = FALSE, crs = "WGS84") %>%
st_join(grid_sf, left = FALSE)
points_sf
#> Simple feature collection with 3999 features and 4 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: 10.74219 ymin: 47.44295 xmax: 13.81453 ymax: 50.22331
#> Geodetic CRS: WGS 84
#> # A tibble: 3,999 × 5
#> Latitude Longitude point_id geometry cell_id
#> * <dbl> <dbl> <int> <POINT [°]> <fct>
#> 1 47.9 12.0 1 (12.00886 47.86416) 3
#> 2 47.9 11.9 2 (11.86329 47.90276) 3
#> 3 47.8 12.0 3 (12.03605 47.84173) 3
#> 4 47.8 12.0 4 (11.97297 47.84431) 3
#> 5 47.9 12.0 5 (11.9883 47.92656) 3
#> 6 48.0 12.0 6 (11.9559 47.95897) 3
#> 7 47.9 11.9 7 (11.91461 47.88283) 3
#> 8 47.8 12.0 8 (11.96721 47.82401) 3
#> 9 47.9 12.0 9 (12.02283 47.88042) 3
#> 10 47.8 12.0 10 (12.00777 47.77415) 3
#> # ℹ 3,989 more rows
# ~10% sample from each grid cell
points_sf_sample <- points_sf %>%
slice_sample(prop = .1, by = cell_id)
# resulting 395-row sf object
points_sf_sample
#> Simple feature collection with 395 features and 4 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: 10.78363 ymin: 47.49384 xmax: 13.81453 ymax: 50.12196
#> Geodetic CRS: WGS 84
#> # A tibble: 395 × 5
#> Latitude Longitude point_id geometry cell_id
#> <dbl> <dbl> <int> <POINT [°]> <fct>
#> 1 48.0 12.3 1345 (12.30854 47.99197) 3
#> 2 48.0 12.0 282 (11.99309 48.02393) 3
#> 3 47.7 12.1 1017 (12.09874 47.7224) 3
#> 4 47.9 11.8 763 (11.82766 47.87053) 3
#> 5 48.0 12.1 1384 (12.14405 48.01815) 3
#> 6 47.8 11.7 756 (11.67682 47.84605) 3
#> 7 47.9 12.3 1003 (12.33885 47.88012) 3
#> 8 48.1 12.5 847 (12.47459 48.09297) 3
#> 9 47.8 11.7 738 (11.74033 47.79344) 3
#> 10 48.0 12.3 973 (12.32703 47.95384) 3
#> # ℹ 385 more rows
# To use it as a regular tibble / dataframe (i.e. without geometry column)
# pass sf object through st_drop_geometry() first:
# points_sf_sample %>% st_drop_geometry()
Visualise:
# map points
points_plot <- function (points_sf, grid = grid_sf, title = ""){
points_sf %>%
ggplot() +
geom_sf(data = grid_sf ) +
geom_sf(aes(color = cell_id), alpha = .5) +
scale_color_viridis_d() +
ggtitle(title) +
theme_minimal() +
theme(legend.position = "none")
}
# point distribution across grid cells
prop_plot <- function (points_sf, title = ""){
points_sf %>%
st_drop_geometry() %>%
ggplot(aes(x = cell_id, y = after_stat(count/sum(count)))) +
geom_bar(aes(fill = cell_id)) +
geom_text(aes(label = after_stat(count)), nudge_y = .01, stat = "count") +
scale_y_continuous(labels = scales::percent) +
scale_fill_viridis_d() +
ggtitle(title) +
theme_minimal() +
theme(legend.position = "none")
}
patchwork::wrap_plots(
points_plot(points_sf, title = "points_sf"),
points_plot(points_sf_sample, title = "points_sf_sample" ),
prop_plot(points_sf),
prop_plot(points_sf_sample),
guides = "collect"
)
5% sample (207 records incl. NAs) from externally hosted coords.csv
for reproducibility:
points_latlon <- structure(list(Latitude = c(48.460047, 47.835715, 48.35, 50.01895,
48.449453, 49.631708, NA, 48.4027566, 49.044197, 47.8164083,
47.9267455, 47.7376437, 49.196618, 47.855453, 49.8621620897013,
48.2324058, 48.0070554, 48.0239263, 48.736963, 47.729564, 49.109133,
49.234312, 48.4346725, 48.859192, 49.029029, 49.242516, 49.2921066768646,
48.103715, 49.7794212, 48.58469, 47.8261219, 49.235885, 48.000387,
47.745551, 49.489557, 47.90369, 48.1818820197736, 49.1770286,
49.291361, 48.1140289, 49.294193, 48.694438, 49.140107, 49.053483,
48.531191, 49.078276, 48.3203866, 47.775056, 50.199638, 48.156559,
49.05533, 48.673917, 49.228085, 48.712153, 49.057355, 48.078369,
47.927474, 48.4829321, 49.022892, 48.7956624, 47.8246167, 47.7209304,
48.764784, 48.6605773, NA, 49.55748, 48.526303, 49.106989, 49.122224,
NA, 48.446922, 49.57665, 48.981843, 47.94971, 47.879827, 49.357095,
49.793653, 49.177008, 49.468683, 48.544613, 49.403415, 48.361844,
49.522403, 48.683755, 48.026041, 48.27893, 48.004447, 48.793434,
48.017088, 48.0621714, NA, 47.860081, 47.665818, 49.436433, 48.436846,
48.976777, 48.768093, 47.623256, 48.047275, 48.361608, 47.893257,
49.143758, 49.4852016, 47.862639, 49.313177, 49.109211, 50.019534,
47.8735435, 49.580368, 48.2284554571875, 48.823014, 48.308227,
49.416011, 47.8945169, 48.7672231, 49.7339507, 48.8460875, 48.982177,
49.7, 49.168743, 47.734136, 49.1317245, 50.029097, 48.457828,
48.8517832, 48.788989, 49.889998, 49.0827731, 49.450131, 48.01048,
49.112675, 48.482385, 48.769906, 49.724108, 49.6249319, 48.82746,
49.382445, 47.884151, 48.8936502, 49.217479, 48.248994, 49.498321,
48.488178, 48.514002, 48.476623, 48.5806712, 48.4578556, 47.987988,
48.384038, 48.667656, 49.205956, 47725, 47.797177, 49.3261854,
48.5925104, 48.3830804, 48.8806701, 48.504028, 49.100125, 48.004671,
49.150699, 48.861375, 48.285957, 48.908867, 49.401638, 48.0036041,
48.950738, 47.822407, 49.715301, 48.8999367, 49.4258775, 48.474503,
48.319697, 48.0092859, 48.464717, 48.2537608, 49.219499, 48.990985,
48.4789804, 48.943624, 48.41688, 48.580808, 48.086605, 48.559659,
49.7448168, 49.130051, 49.070161, 48.542459, 47.835298, 48.9554435,
47.999043, 50.0327, 48.3310912, 48.8130193, 48.6587758, 47.7012483,
47.6155933, 47.492704, 49.471329, 49.874586, 49.390387, 48.708821,
49.6075579, 48.433305, 47.832461, 49.0699652, 48.216022), Longitude = c(11.706489,
12.233693, 12.333333, 12.028469, 12.137471, 11.817511, NA, 11.1651023,
12.933855, 11.4804346, 12.3799269, 10.7769144, 13.049079, 11.406477,
12.0689065913736, 12.8146204, 12.7486078, 11.993085, 12.713763,
12.881491, 11.161646, 12.657568, 13.1155835, 10.992565, 13.068533,
10.964307, 12.7692206343073, 11.428834, 11.9631656, 10.963977,
11.4121137, 12.324855, 12.6389258, 11.459803, 11.97028, 12.950481,
11.1372185582852, 12.6770617, 12.680287, 11.2921549, 11.836345,
13.419817, 12.367274, 11.782333, 13.176881, 13.2062163, 11.315348,
11.703036, 12.087804, 11.9964917, 13.034749, 11.678925, 12.4166,
11.454315, 11.516181, 11.517031, 11.896899, 12.9001794, 13.275592,
12.6065681, 12.0944248, 12.395497, 11.532544, 12.0317065, NA,
11.490849, 12.82308, 12.3644454, 11.468217, NA, 12.212925, 11.202493,
10.881399, 11.968291, 11.379607, 11.2422, 11.921616, 12.851353,
12.284234, 12.526446, 11.436138, 12.3865, 11.320158, 11.609298,
12.176982, 12.870022, 10.905597, 12.511304, 12.365843, 12.7676484,
NA, 12.625661, 11.048838, 11.646473, 11.8524948, 13.127186, 11.652464,
12.973992, 12.738985, 12.838555, 11.566588, 11.457948, 11.0494148,
12.397666, 12.751713, 11.686648, 12.106724, 12.5886433, 11.615009,
11.4579408700182, 13.6010532, 13.259913, 11.872389, 11.7784633,
11.6239384, 11.61242, 12.3140203, 11.979299, 11.73, 13.089918,
11.540953, 11.2120903, 11.933553, 11.65121, 10.9125143, 12.070335,
12.065993, 11.6208026, 12.375032, 10.881234, 11.816616, 12.03229,
11.965402, 12.385068, 12.1921353, 11.589908, 11.451597, 11.697075,
13.6500874, 12.83647, 11.561471, 11.569279, 11.0710008, 13.073288,
11.379252, 11.3356983, 10.9822507, 11.714249, 10.9270712, 12.79545,
12.926871, 12.0855, 12.168363, 12.1092708, 12.4249371, 12.3062186,
13.1854521, 13.198449, 12.248487, 10.927244, 11.392112, 12.162731,
13.016266, 13.255873, 11.756443, 11.9551674, 11.395847, 11.726202,
12.155797, 13.1832566, 12.3455134, 11.563734, 13.274499, 11.2166513,
11.5825651, 12.8417327, 12.0688, 12.451017, 12.9890533, 12.218339,
12.90243, 13.03024, 11.201369, 13.297408, 11.5600975, 11.334681,
13.099453, 12.239071, 10.829332, 10.9089888, 11.362652, 12.157162,
11.1295818, 12.6118933, 13.790628, 10.9363218, 11.0286344, 11.094968,
11.199696, 11.77106, 12.246808, 11.633561, 11.6323925, 11.5999989,
12.482294, 12.3969462, 11.000662)), row.names = c(NA, -207L), spec = structure(list(
cols = list(Latitude = structure(list(), class = c("collector_number",
"collector")), Longitude = structure(list(), class = c("collector_number",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ";"), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
Created on 2023-06-11 with reprex v2.0.2