I'm trying to download some data and have the following:
library(RJSONIO)
first.day <- as.Date(paste0("202201","01"), "%Y%m%d")
today <- Sys.Date()
months <- format(seq(first.day, today,by="month"), "%Y%m")
length(months)
months.df.list <- vector("list", length = length(months))
for (m in months) {
  print(m)
  filename <- paste0("./monthly_bike_data/", m, ".csv")
  query <- paste0("https://data.urbansharing.com/bergenbysykkel.no/trips/v1/",
                  substr(m, 1, 4), "/", substr(m, 5, 6), ".json")
  month.df.raw <- fromJSON(query)           # retrieve data
  month.df <- do.call(rbind, month.df.raw)  # convert to a proper dataframe
  write.csv(month.df, filename)
}
However, when I run class(month.df) I get
[1] "list"
When I simply run the code
twentytwenty.raw <- fromJSON("https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2020/01.json")
twentytwenty <- do.call(rbind, twentytwenty.raw)
class(twentytwenty)
outside of the loop, I get
[1] "matrix" "array"
As far as I can see, the only difference here is that do.call() is run inside vs. outside a loop, and that seems to change the output behavior.
Does anyone know why this is the case and a suggestion for a fix?
Assuming this is more about getting those files than about loops and do.call(), I'd suggest a slightly different approach: build a dataframe from the S3 ListBucketResult XML and use those URLs to fetch the CSV files directly (the bucket lists both .json and .csv for every month). Sorry for the tidyverse overload.
library(xml2)
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
library(lubridate)
# S3 ListBucketResult xml
bucket_url <- "https://data.urbansharing.com/"
urbansharing_xml <- read_xml(bucket_url) %>% xml_ns_strip()
# xml -> dataframe with file dates and urls, for both json and csv
trip_data <- urbansharing_xml %>%
  xml_find_all(".//Contents") %>%
  as_list() %>%
  bind_rows() %>%
  unnest(cols = c(Key, LastModified, Size)) %>%
  select(!where(is.list)) %>%
  mutate(month = str_extract(Key, "\\d{4}/\\d+(?=\\.)") %>% ym(),
         url = paste0(bucket_url, Key) %>% tibble::char(shorten = "front"),
         .keep = "unused",
         .before = 1)
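If you'd rather keep the dependency count down for this step, roughly the same table can be built with xml2 alone. A minimal sketch, assuming the standard S3 ListBucketResult layout (Contents/Key, Contents/LastModified, Contents/Size) used above; trip_data_base is just an illustrative name:
# non-tidyverse sketch of the same listing step
doc  <- xml_ns_strip(read_xml(bucket_url))
keys <- xml_text(xml_find_all(doc, ".//Contents/Key"))
trip_data_base <- data.frame(
  Key          = keys,
  LastModified = xml_text(xml_find_all(doc, ".//Contents/LastModified")),
  Size         = xml_text(xml_find_all(doc, ".//Contents/Size")),
  url          = paste0(bucket_url, keys)
)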
# select only bergenbysykkel csv files, since 2022-01-01
bergen_csv <- trip_data %>%
  filter(str_detect(url, "bergenbysykkel.*csv"),
         month >= date("2022-01-01"))
print(bergen_csv, n = 5)
#> # A tibble: 19 × 4
#> month url LastModified Size
#> <date> <char> <chr> <chr>
#> 1 2022-01-01 …ring.com/bergenbysykkel.no/trips/v1/2022/01.csv 2022-02-28T… 6050…
#> 2 2022-02-01 …ring.com/bergenbysykkel.no/trips/v1/2022/02.csv 2022-03-31T… 6531…
#> 3 2022-03-01 …ring.com/bergenbysykkel.no/trips/v1/2022/03.csv 2022-04-30T… 1155…
#> 4 2022-04-01 …ring.com/bergenbysykkel.no/trips/v1/2022/04.csv 2022-05-31T… 1148…
#> 5 2022-05-01 …ring.com/bergenbysykkel.no/trips/v1/2022/05.csv 2022-06-30T… 1296…
#> # ℹ 14 more rows
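The listing covers both file types, so if you'd rather stick with JSON the same filter can be pointed at the .json keys instead. A sketch (bergen_json is an illustrative name):
# same selection, but for the JSON endpoints
bergen_json <- trip_data %>%
  filter(str_detect(url, "bergenbysykkel.*json"),
         month >= date("2022-01-01"))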
# fetch first 3 files, set local names to year_month.csv;
# local file names are stored as vector names for iwalk
fetch_list <- bergen_csv$url[1:3] %>%
  set_names(str_replace(., ".*(\\d{4})/(.*)", "\\1_\\2"))
fetch_list
#> <pillar_char<[3]>
#> 2022_01.csv
#> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/01.csv
#> 2022_02.csv
#> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/02.csv
#> 2022_03.csv
#> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/03.csv
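The [1:3] subset is only there to keep the example small; to grab every month since 2022-01, the same naming step works on the full url column. A sketch (fetch_list_all is an illustrative name):
# all matching months instead of just the first 3
fetch_list_all <- bergen_csv$url %>%
  set_names(str_replace(., ".*(\\d{4})/(.*)", "\\1_\\2"))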
# walk through fetch_list, download all files
# iwalk() passes each value(url) and name(local file name) to its function
iwalk(fetch_list, \(url, filename) download.file(url, filename, mode = "wb"))
fs::dir_info(glob = "2022*.csv")[,1:3]
#> # A tibble: 3 × 3
#> path type size
#> <fs::path> <fct> <fs::bytes>
#> 1 2022_01.csv file 5.77M
#> 2 2022_02.csv file 6.23M
#> 3 2022_03.csv file 11.02M
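If purrr isn't already in your toolbox, the download step also has a direct base-R equivalent. A sketch of the same walk over url/name pairs:
# base-R equivalent of the iwalk() download step
for (i in seq_along(fetch_list)) {
  download.file(as.character(fetch_list[[i]]), destfile = names(fetch_list)[i], mode = "wb")
}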
# import all downloaded csv file to a single dataframe,
# filename column stores source file name
readr::read_csv(names(fetch_list), id = "filename", show_col_types = FALSE)
#> # A tibble: 104,998 × 14
#> filename started_at ended_at duration start_station_id
#> <chr> <dttm> <dttm> <dbl> <dbl>
#> 1 2022_01.csv 2022-01-01 04:01:22 2022-01-01 04:19:41 1099 33
#> 2 2022_01.csv 2022-01-01 04:01:37 2022-01-01 04:19:38 1081 33
#> 3 2022_01.csv 2022-01-01 04:02:24 2022-01-01 04:10:08 463 1899
#> 4 2022_01.csv 2022-01-01 04:04:04 2022-01-01 04:06:10 125 644
#> 5 2022_01.csv 2022-01-01 04:11:38 2022-01-01 04:23:24 705 368
#> 6 2022_01.csv 2022-01-01 04:45:47 2022-01-01 04:54:35 528 1887
#> 7 2022_01.csv 2022-01-01 05:20:44 2022-01-01 05:27:50 425 814
#> 8 2022_01.csv 2022-01-01 06:09:17 2022-01-01 06:16:29 431 217
#> 9 2022_01.csv 2022-01-01 06:37:29 2022-01-01 06:54:24 1014 1816
#> 10 2022_01.csv 2022-01-01 06:40:14 2022-01-01 06:48:20 486 806
#> # ℹ 104,988 more rows
#> # ℹ 9 more variables: start_station_name <chr>,
#> # start_station_description <chr>, start_station_latitude <dbl>,
#> # start_station_longitude <dbl>, end_station_id <dbl>,
#> # end_station_name <chr>, end_station_description <chr>,
#> # end_station_latitude <dbl>, end_station_longitude <dbl>
Created on 2023-07-01 with reprex v2.0.2
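If a proper date column is useful downstream, it can be derived from the filename column that read_csv() adds via id =. A sketch, assuming the "YYYY_MM.csv" local names built above (lubridate's ym() should handle the underscore separator):
# derive a month column from the source file names
trips <- readr::read_csv(names(fetch_list), id = "filename", show_col_types = FALSE)
trips <- dplyr::mutate(trips, month = lubridate::ym(substr(filename, 1, 7)))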