I'm trying to scrape all .tif URLs from a public-facing website (https://prd-tnm.s3.amazonaws.com/index.html?prefix=StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/). I receive a list of .tif URLs back, but it only returns 1,000 when there are 15,626.
library(tidyverse)
library(RCurl)
library(xml2)
library(rvest)
# HTML index page for the bucket prefix. The gsub() below rewrites
# "index.html?" into "?delimiter=/&" so the request goes straight to
# the S3 ListObjects REST endpoint, which returns XML with one <Key>
# element per object.
start_url <- 'https://prd-tnm.s3.amazonaws.com/index.html?prefix=StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/'
# NOTE(review): S3's ListObjects API returns at most 1000 keys per
# response. This is a single request with no pagination (no `marker`
# / continuation-token handling), which is exactly why only 1000 of
# the 15,626 objects come back below.
files <- read_html(gsub('index\\.html\\?','?delimiter=/&', start_url)) %>%
html_elements('key') %>%
html_text() %>%
url_absolute(start_url)
length(files)
[1] 1000
There should be 15,626 files (or urls).
You could always use paws. Paws is an AWS SDK for R.
library(paws)
# S3 service client. NOTE(review): presumably this relies on ambient
# AWS credentials or anonymous access to the public bucket — confirm
# which applies in your environment.
client <- s3()
bucket <- "prd-tnm"
prefix <- "StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF"
# list all elements in bucket and prefix:
# paginate over all pages to get all keys
# paginate_lapply() captures the list_objects_v2() call lazily (the
# native |> pipe passes it unevaluated as the first argument) and
# re-issues it with the continuation token until the listing is no
# longer truncated, so the 1000-keys-per-page API limit is handled
# transparently.
keys <- client$list_objects_v2(
Bucket = bucket,
Prefix = prefix
) |> paginate_lapply(\(resp) {
# each resp is one page; extract the object key from every Contents entry
vapply(resp$Contents, \(pg) pg$Key, FUN.VALUE = "")
}) |> unlist()
length(keys)
#> [1] 15627
# Build s3:// URIs for the listed keys and show the first five
head(paste0("s3://", bucket, "/", keys), 5)
#> [1] "s3://prd-tnm/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514270.tif"
#> [2] "s3://prd-tnm/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514271.tif"
#> [3] "s3://prd-tnm/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514272.tif"
#> [4] "s3://prd-tnm/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514273.tif"
#> [5] "s3://prd-tnm/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514274.tif"
# Build direct https:// download links for the same keys
head(paste0("https://", bucket, ".s3.amazonaws.com/", keys), 5)
#> [1] "https://prd-tnm.s3.amazonaws.com/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514270.tif"
#> [2] "https://prd-tnm.s3.amazonaws.com/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514271.tif"
#> [3] "https://prd-tnm.s3.amazonaws.com/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514272.tif"
#> [4] "https://prd-tnm.s3.amazonaws.com/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514273.tif"
#> [5] "https://prd-tnm.s3.amazonaws.com/StagedProducts/Elevation/OPR/Projects/MN_RainyLake_2020_B20/MN_RainyLake_1_2020/TIFF/USGS_OPR_MN_RainyLake_2020_B20_15TWN514274.tif"
Created on 2024-09-25 with reprex v2.1.1
I hope this helps.