Search code examples
rr-sfgeo

Use a grid to create an areally-balanced sample


I’m a newbie to alle the geo functionalities of R, so forgive me if this is trivial.

Im trying to draw a randomized sample from the locations object (consisting of lat/long coordinates). All of the given coordinates are located in the German province of Bavaria.

What do I want to do exactly? Overlay a grid on the extension of Bavaria and then randomly select a given number of locations within each cell of the grid.

All discussions I found on Stackoverflow don’t seem to include any reference to this exact question.

Below is some code but it is nowhere near to a solution. The data file can be retrieved via my Dropbox: https://www.dropbox.com/s/sw18m8a3168lhbc/coords.csv?dl=0

Thanks a lot for your help!

Oliver

library(sf)

germany <- raster::getData("GADM", country = "Germany", level = 1)
bavaria <- germany[germany$NAME_1 == "Bayern",]

bavaria_mod <- sf::st_as_sf(bavaria)

grid <- bavaria_mod %>% 
  st_make_grid(cellsize = 0.1, what = "polygons", square = FALSE) %>%
  st_intersection(bavaria_mod)

# Specify file path

coord <- read.csv(".../coords.csv", sep=";", row.names=,1, na.strings="")


library(dplyr)

test <- tibble::as_tibble(coord) |> 
  filter(!is.na(Latitude), !is.na(Longitude)) 


# Plots

plot(grid)

points(test$Longitude, test$Latitude, col='orange', pch=20, cex=0.75)

Solution

  • You probably have to figure out optimal grid parameters and best sampling approach yourself depending on the context and actual domain. E.g. consider grid cell size and shape (square vs hexagon), should sampling be weighted, should weights be based on a grid cell area, population density or some other factor. Or maybe the regular grid is not suitable at all and it makes sense to use Eurostat NUTS regions, some service area, local municipality borders, distances from main roads or something else to drive that sampling.

    Here's one example of how one could first create 100x100km regular square grid, divide points by cells and sample 10% of points from each cell.

    library(sf)
    #> Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
    library(dplyr)
    library(ggplot2)
    
    # Transforming to projected coordinate reference system to create a metric grid
    # EPSG:25832, UTM zone 32N -- https://epsg.io/25832 ;
    bavaria_sf <- geodata::gadm("Germany", level = 1, path = "gadm_dl") %>% 
      st_as_sf() %>% 
      filter(NAME_1 == "Bayern") %>% 
      st_transform(crs = 25832)  
      
    # Create 100x100km grid, intersect with bravia_sf, 
    # transform back to WGS84 so it could be joined to points
    grid_sf <- bavaria_sf %>% 
      st_make_grid(cellsize = units::set_units(100, km)) %>% 
      st_intersection(bavaria_sf) %>% 
      # st_intersection returns geometry column, 
      # convert to sf so it could store cell_id attribute
      st_as_sf() %>% 
      st_set_geometry("geometry") %>% 
      # factor : for more convenient ggplot viz 
      mutate(cell_id = as.factor(row_number())) %>% 
      st_transform(crs = "WGS84")
    
    grid_sf %>% 
      ggplot(aes(label = cell_id)) +
      geom_sf() +
      geom_sf_label(alpha = .5) +
      ggtitle("100x100 grid, cell_id values") +
      theme_minimal()
    

    
    ##  Create sf of points from csv
    points_csv <- "https://www.dropbox.com/s/sw18m8a3168lhbc/coords.csv?dl=1"
    points_latlon <- readr::read_delim(points_csv, delim = ";", show_col_types = FALSE)
    # check data quality,
    # quite a few NA values & punctuation issues resulting in out-of-range coordinates
    summary(points_latlon)
    #>     Latitude          Longitude       
    #>  Min.   :      12   Min.   :      11  
    #>  1st Qu.:      48   1st Qu.:      11  
    #>  Median :      49   Median :      12  
    #>  Mean   :   12003   Mean   :    2939  
    #>  3rd Qu.:      49   3rd Qu.:      13  
    #>  Max.   :47753699   Max.   :11717572  
    #>  NA's   :137        NA's   :137
    
    # assign point id, remove NA values, convert to sf, 
    # add grid attributes (cell_id) to to points through spatial join
    # left = FALSE : use inner join to remove out-of-range coordinates
    points_sf <- points_latlon %>%
      mutate(point_id = row_number()) %>% 
      na.omit() %>% 
      st_as_sf(coords = c("Longitude", "Latitude"), remove = FALSE, crs = "WGS84") %>% 
      st_join(grid_sf, left = FALSE) 
    
    points_sf
    #> Simple feature collection with 3999 features and 4 fields
    #> Geometry type: POINT
    #> Dimension:     XY
    #> Bounding box:  xmin: 10.74219 ymin: 47.44295 xmax: 13.81453 ymax: 50.22331
    #> Geodetic CRS:  WGS 84
    #> # A tibble: 3,999 × 5
    #>    Latitude Longitude point_id            geometry cell_id
    #>  *    <dbl>     <dbl>    <int>         <POINT [°]> <fct>  
    #>  1     47.9      12.0        1 (12.00886 47.86416) 3      
    #>  2     47.9      11.9        2 (11.86329 47.90276) 3      
    #>  3     47.8      12.0        3 (12.03605 47.84173) 3      
    #>  4     47.8      12.0        4 (11.97297 47.84431) 3      
    #>  5     47.9      12.0        5  (11.9883 47.92656) 3      
    #>  6     48.0      12.0        6  (11.9559 47.95897) 3      
    #>  7     47.9      11.9        7 (11.91461 47.88283) 3      
    #>  8     47.8      12.0        8 (11.96721 47.82401) 3      
    #>  9     47.9      12.0        9 (12.02283 47.88042) 3      
    #> 10     47.8      12.0       10 (12.00777 47.77415) 3      
    #> # ℹ 3,989 more rows
    
    # ~10% sample from each grid cell
    points_sf_sample <- points_sf %>% 
      slice_sample(prop = .1, by = cell_id)
    
    # resulting 395-row sf object
    points_sf_sample
    #> Simple feature collection with 395 features and 4 fields
    #> Geometry type: POINT
    #> Dimension:     XY
    #> Bounding box:  xmin: 10.78363 ymin: 47.49384 xmax: 13.81453 ymax: 50.12196
    #> Geodetic CRS:  WGS 84
    #> # A tibble: 395 × 5
    #>    Latitude Longitude point_id            geometry cell_id
    #>       <dbl>     <dbl>    <int>         <POINT [°]> <fct>  
    #>  1     48.0      12.3     1345 (12.30854 47.99197) 3      
    #>  2     48.0      12.0      282 (11.99309 48.02393) 3      
    #>  3     47.7      12.1     1017  (12.09874 47.7224) 3      
    #>  4     47.9      11.8      763 (11.82766 47.87053) 3      
    #>  5     48.0      12.1     1384 (12.14405 48.01815) 3      
    #>  6     47.8      11.7      756 (11.67682 47.84605) 3      
    #>  7     47.9      12.3     1003 (12.33885 47.88012) 3      
    #>  8     48.1      12.5      847 (12.47459 48.09297) 3      
    #>  9     47.8      11.7      738 (11.74033 47.79344) 3      
    #> 10     48.0      12.3      973 (12.32703 47.95384) 3      
    #> # ℹ 385 more rows
    
    # To use it as a regular tibble / dataframe (i.e. without geometry column)
    # pass sf object through st_drop_geometry() first:
    # points_sf_sample %>% st_drop_geometry()
    

    Visualise:

    # map points
    points_plot <- function (points_sf, grid = grid_sf, title = ""){
      points_sf %>%
        ggplot() +
        geom_sf(data = grid_sf ) +
        geom_sf(aes(color = cell_id), alpha = .5) +
        scale_color_viridis_d() +
        ggtitle(title) +
        theme_minimal() +
        theme(legend.position = "none")
    }
    
    # point distribution across grid cells
    prop_plot <- function (points_sf, title = ""){
      points_sf %>%  
        st_drop_geometry() %>% 
        ggplot(aes(x = cell_id, y = after_stat(count/sum(count)))) +
        geom_bar(aes(fill = cell_id)) +
        geom_text(aes(label = after_stat(count)), nudge_y = .01, stat = "count") +
        scale_y_continuous(labels = scales::percent) +
        scale_fill_viridis_d() +
        ggtitle(title) +
        theme_minimal() +
        theme(legend.position = "none")
    }
    
    patchwork::wrap_plots(
      points_plot(points_sf, title = "points_sf"),
      points_plot(points_sf_sample, title = "points_sf_sample" ),
      prop_plot(points_sf),
      prop_plot(points_sf_sample), 
      guides = "collect"
    )
    

    5% sample (207 records incl. NAs) from externally hosted coords.csv for reproducibility:

    points_latlon <- structure(list(Latitude = c(48.460047, 47.835715, 48.35, 50.01895, 
    48.449453, 49.631708, NA, 48.4027566, 49.044197, 47.8164083, 
    47.9267455, 47.7376437, 49.196618, 47.855453, 49.8621620897013, 
    48.2324058, 48.0070554, 48.0239263, 48.736963, 47.729564, 49.109133, 
    49.234312, 48.4346725, 48.859192, 49.029029, 49.242516, 49.2921066768646, 
    48.103715, 49.7794212, 48.58469, 47.8261219, 49.235885, 48.000387, 
    47.745551, 49.489557, 47.90369, 48.1818820197736, 49.1770286, 
    49.291361, 48.1140289, 49.294193, 48.694438, 49.140107, 49.053483, 
    48.531191, 49.078276, 48.3203866, 47.775056, 50.199638, 48.156559, 
    49.05533, 48.673917, 49.228085, 48.712153, 49.057355, 48.078369, 
    47.927474, 48.4829321, 49.022892, 48.7956624, 47.8246167, 47.7209304, 
    48.764784, 48.6605773, NA, 49.55748, 48.526303, 49.106989, 49.122224, 
    NA, 48.446922, 49.57665, 48.981843, 47.94971, 47.879827, 49.357095, 
    49.793653, 49.177008, 49.468683, 48.544613, 49.403415, 48.361844, 
    49.522403, 48.683755, 48.026041, 48.27893, 48.004447, 48.793434, 
    48.017088, 48.0621714, NA, 47.860081, 47.665818, 49.436433, 48.436846, 
    48.976777, 48.768093, 47.623256, 48.047275, 48.361608, 47.893257, 
    49.143758, 49.4852016, 47.862639, 49.313177, 49.109211, 50.019534, 
    47.8735435, 49.580368, 48.2284554571875, 48.823014, 48.308227, 
    49.416011, 47.8945169, 48.7672231, 49.7339507, 48.8460875, 48.982177, 
    49.7, 49.168743, 47.734136, 49.1317245, 50.029097, 48.457828, 
    48.8517832, 48.788989, 49.889998, 49.0827731, 49.450131, 48.01048, 
    49.112675, 48.482385, 48.769906, 49.724108, 49.6249319, 48.82746, 
    49.382445, 47.884151, 48.8936502, 49.217479, 48.248994, 49.498321, 
    48.488178, 48.514002, 48.476623, 48.5806712, 48.4578556, 47.987988, 
    48.384038, 48.667656, 49.205956, 47725, 47.797177, 49.3261854, 
    48.5925104, 48.3830804, 48.8806701, 48.504028, 49.100125, 48.004671, 
    49.150699, 48.861375, 48.285957, 48.908867, 49.401638, 48.0036041, 
    48.950738, 47.822407, 49.715301, 48.8999367, 49.4258775, 48.474503, 
    48.319697, 48.0092859, 48.464717, 48.2537608, 49.219499, 48.990985, 
    48.4789804, 48.943624, 48.41688, 48.580808, 48.086605, 48.559659, 
    49.7448168, 49.130051, 49.070161, 48.542459, 47.835298, 48.9554435, 
    47.999043, 50.0327, 48.3310912, 48.8130193, 48.6587758, 47.7012483, 
    47.6155933, 47.492704, 49.471329, 49.874586, 49.390387, 48.708821, 
    49.6075579, 48.433305, 47.832461, 49.0699652, 48.216022), Longitude = c(11.706489, 
    12.233693, 12.333333, 12.028469, 12.137471, 11.817511, NA, 11.1651023, 
    12.933855, 11.4804346, 12.3799269, 10.7769144, 13.049079, 11.406477, 
    12.0689065913736, 12.8146204, 12.7486078, 11.993085, 12.713763, 
    12.881491, 11.161646, 12.657568, 13.1155835, 10.992565, 13.068533, 
    10.964307, 12.7692206343073, 11.428834, 11.9631656, 10.963977, 
    11.4121137, 12.324855, 12.6389258, 11.459803, 11.97028, 12.950481, 
    11.1372185582852, 12.6770617, 12.680287, 11.2921549, 11.836345, 
    13.419817, 12.367274, 11.782333, 13.176881, 13.2062163, 11.315348, 
    11.703036, 12.087804, 11.9964917, 13.034749, 11.678925, 12.4166, 
    11.454315, 11.516181, 11.517031, 11.896899, 12.9001794, 13.275592, 
    12.6065681, 12.0944248, 12.395497, 11.532544, 12.0317065, NA, 
    11.490849, 12.82308, 12.3644454, 11.468217, NA, 12.212925, 11.202493, 
    10.881399, 11.968291, 11.379607, 11.2422, 11.921616, 12.851353, 
    12.284234, 12.526446, 11.436138, 12.3865, 11.320158, 11.609298, 
    12.176982, 12.870022, 10.905597, 12.511304, 12.365843, 12.7676484, 
    NA, 12.625661, 11.048838, 11.646473, 11.8524948, 13.127186, 11.652464, 
    12.973992, 12.738985, 12.838555, 11.566588, 11.457948, 11.0494148, 
    12.397666, 12.751713, 11.686648, 12.106724, 12.5886433, 11.615009, 
    11.4579408700182, 13.6010532, 13.259913, 11.872389, 11.7784633, 
    11.6239384, 11.61242, 12.3140203, 11.979299, 11.73, 13.089918, 
    11.540953, 11.2120903, 11.933553, 11.65121, 10.9125143, 12.070335, 
    12.065993, 11.6208026, 12.375032, 10.881234, 11.816616, 12.03229, 
    11.965402, 12.385068, 12.1921353, 11.589908, 11.451597, 11.697075, 
    13.6500874, 12.83647, 11.561471, 11.569279, 11.0710008, 13.073288, 
    11.379252, 11.3356983, 10.9822507, 11.714249, 10.9270712, 12.79545, 
    12.926871, 12.0855, 12.168363, 12.1092708, 12.4249371, 12.3062186, 
    13.1854521, 13.198449, 12.248487, 10.927244, 11.392112, 12.162731, 
    13.016266, 13.255873, 11.756443, 11.9551674, 11.395847, 11.726202, 
    12.155797, 13.1832566, 12.3455134, 11.563734, 13.274499, 11.2166513, 
    11.5825651, 12.8417327, 12.0688, 12.451017, 12.9890533, 12.218339, 
    12.90243, 13.03024, 11.201369, 13.297408, 11.5600975, 11.334681, 
    13.099453, 12.239071, 10.829332, 10.9089888, 11.362652, 12.157162, 
    11.1295818, 12.6118933, 13.790628, 10.9363218, 11.0286344, 11.094968, 
    11.199696, 11.77106, 12.246808, 11.633561, 11.6323925, 11.5999989, 
    12.482294, 12.3969462, 11.000662)), row.names = c(NA, -207L), spec = structure(list(
        cols = list(Latitude = structure(list(), class = c("collector_number", 
        "collector")), Longitude = structure(list(), class = c("collector_number", 
        "collector"))), default = structure(list(), class = c("collector_guess", 
        "collector")), delim = ";"), class = "col_spec"), class = c("spec_tbl_df", 
    "tbl_df", "tbl", "data.frame"))
    

    Created on 2023-06-11 with reprex v2.0.2