do.call(rbind, ... ) exhibits unexpected behaviour inside a loop


I'm trying to download some data and have the following code:

library(RJSONIO)

first.day <- as.Date(paste0("202201","01"), "%Y%m%d")
today <- Sys.Date()
months <- format(seq(first.day, today,by="month"), "%Y%m")
length(months)

months.df.list <- vector("list", length = length(months))

for (m in months){
  print(m)
  filename <- paste0("./monthly_bike_data/", m, ".csv")
  query <- paste0("https://data.urbansharing.com/bergenbysykkel.no/trips/v1/", substr(m,0,4),"/", substr(m,5,6), ".json")
  month.df.raw <- fromJSON(query) #retrieve data
  month.df <- do.call(rbind,month.df.raw) # convert to a proper dataframe
  write.csv(month.df, filename)
  
}

However, when I run class(month.df) I get

[1] "list"

When I simply run the code

twentytwenty.raw <- fromJSON("https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2020/01.json")
twentytwenty <- do.call(rbind, twentytwenty.raw)
class(twentytwenty)

outside of the loop, I get

[1] "matrix" "array"

As far as I can see, the only difference is that do.call() runs inside a loop in the first case and outside one in the second, yet that seems to change the class of the output.

Does anyone know why this happens, and can anyone suggest a fix?


Solution
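
  • To the "why" first: a loop doesn't change what do.call() does. The class of do.call(rbind, x) depends only on the structure of x, and after the loop month.df holds only the last month's result, so the list you're seeing presumably comes from whatever fromJSON() returned for that particular month. A quick sketch (with toy data) showing that the loop context itself is irrelevant:

    x <- list(list(a = 1, b = 2), list(a = 3, b = 4))
    class(do.call(rbind, x))
    #> [1] "matrix" "array"
    for (i in 1) y <- do.call(rbind, x)
    class(y)
    #> [1] "matrix" "array"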

  • Assuming this is more about getting those files than about loops and do.call(), I'd suggest a slightly different approach: build a dataframe from the S3 ListBucketResult XML and use the URLs it contains to fetch the CSV files directly. Sorry for the tidyverse overload.

    library(xml2)
    library(dplyr)
    library(tidyr)
    library(purrr)
    library(stringr)
    library(lubridate)
    
    # S3 ListBucketResult xml
    bucket_url <- "https://data.urbansharing.com/"
    urbansharing_xml <- read_xml(bucket_url) %>% xml_ns_strip()
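    # (xml_ns_strip() removes the default S3 namespace, so the plain
    # ".//Contents" XPath below can match)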
    
    # xml -> dataframe with file dates and urls, for both json and csv
    trip_data <- urbansharing_xml %>% 
      xml_find_all(".//Contents") %>% 
      as_list() %>% 
      bind_rows() %>% 
      unnest(cols = c(Key, LastModified, Size))  %>% 
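      # nodes not unnested above remain list-columns; drop them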
      select(!where(is.list)) %>% 
      mutate(month = str_extract(Key, "\\d{4}/\\d+(?=\\.)") %>% ym(), 
             url = paste0(bucket_url, Key) %>% tibble::char(shorten = "front"),
             .keep = "unused",
             .before = 1)
    
    # select only bergenbysykkel csv files, since 2022-01-01 
    bergen_csv <- trip_data %>% 
      filter(str_detect(url, "bergenbysykkel.*csv"),
             month >= date("2022-01-01"))
    print(bergen_csv, n = 5)
    #> # A tibble: 19 × 4
    #>   month      url                                              LastModified Size 
    #>   <date>     <char>                                           <chr>        <chr>
    #> 1 2022-01-01 …ring.com/bergenbysykkel.no/trips/v1/2022/01.csv 2022-02-28T… 6050…
    #> 2 2022-02-01 …ring.com/bergenbysykkel.no/trips/v1/2022/02.csv 2022-03-31T… 6531…
    #> 3 2022-03-01 …ring.com/bergenbysykkel.no/trips/v1/2022/03.csv 2022-04-30T… 1155…
    #> 4 2022-04-01 …ring.com/bergenbysykkel.no/trips/v1/2022/04.csv 2022-05-31T… 1148…
    #> 5 2022-05-01 …ring.com/bergenbysykkel.no/trips/v1/2022/05.csv 2022-06-30T… 1296…
    #> # ℹ 14 more rows
    
    # fetch first 3 files, set local names to year_month.csv; 
    # local file names are stored as vector names for iwalk
    fetch_list <- bergen_csv$url[1:3] %>% 
      set_names(str_replace(., ".*(\\d{4})/(.*)", "\\1_\\2"))
    fetch_list
    #> <pillar_char<[3]>
    #>                                                          2022_01.csv 
    #> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/01.csv 
    #>                                                          2022_02.csv 
    #> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/02.csv 
    #>                                                          2022_03.csv 
    #> https://data.urbansharing.com/bergenbysykkel.no/trips/v1/2022/03.csv
    
    # walk through fetch_list, downloading all files;
    # iwalk() passes each value (url) and name (local file name) to its function
    iwalk(fetch_list, \(url, filename) download.file(url, filename, mode = "wb"))
    fs::dir_info(glob = "2022*.csv")[,1:3]
    #> # A tibble: 3 × 3
    #>   path        type         size
    #>   <fs::path>  <fct> <fs::bytes>
    #> 1 2022_01.csv file        5.77M
    #> 2 2022_02.csv file        6.23M
    #> 3 2022_03.csv file       11.02M
    
    # import all downloaded csv files into a single dataframe;
    # the filename column stores the source file name
    readr::read_csv(names(fetch_list), id = "filename", show_col_types = FALSE)
    #> # A tibble: 104,998 × 14
    #>    filename    started_at          ended_at            duration start_station_id
    #>    <chr>       <dttm>              <dttm>                 <dbl>            <dbl>
    #>  1 2022_01.csv 2022-01-01 04:01:22 2022-01-01 04:19:41     1099               33
    #>  2 2022_01.csv 2022-01-01 04:01:37 2022-01-01 04:19:38     1081               33
    #>  3 2022_01.csv 2022-01-01 04:02:24 2022-01-01 04:10:08      463             1899
    #>  4 2022_01.csv 2022-01-01 04:04:04 2022-01-01 04:06:10      125              644
    #>  5 2022_01.csv 2022-01-01 04:11:38 2022-01-01 04:23:24      705              368
    #>  6 2022_01.csv 2022-01-01 04:45:47 2022-01-01 04:54:35      528             1887
    #>  7 2022_01.csv 2022-01-01 05:20:44 2022-01-01 05:27:50      425              814
    #>  8 2022_01.csv 2022-01-01 06:09:17 2022-01-01 06:16:29      431              217
    #>  9 2022_01.csv 2022-01-01 06:37:29 2022-01-01 06:54:24     1014             1816
    #> 10 2022_01.csv 2022-01-01 06:40:14 2022-01-01 06:48:20      486              806
    #> # ℹ 104,988 more rows
    #> # ℹ 9 more variables: start_station_name <chr>,
    #> #   start_station_description <chr>, start_station_latitude <dbl>,
    #> #   start_station_longitude <dbl>, end_station_id <dbl>,
    #> #   end_station_name <chr>, end_station_description <chr>,
    #> #   end_station_latitude <dbl>, end_station_longitude <dbl>
    

    Created on 2023-07-01 with reprex v2.0.2
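
  • If you'd rather keep the original month loop from the question, the same idea works without the XML listing step: the bucket serves a ready-made .csv next to every .json (see the URLs above), so you can download those directly instead of rebuilding a dataframe from JSON. A minimal sketch, reusing the months vector from the question:

    base_url <- "https://data.urbansharing.com/bergenbysykkel.no/trips/v1/"
    for (m in months) {
      url <- paste0(base_url, substr(m, 1, 4), "/", substr(m, 5, 6), ".csv")
      # the newest month may not be published yet; try() keeps one 404 from stopping the loop
      try(download.file(url, paste0("./monthly_bike_data/", m, ".csv"), mode = "wb"))
    }

    If you do need the JSON endpoints, note that jsonlite::fromJSON() simplifies an array of JSON objects into a data frame by default, which sidesteps the do.call(rbind, ...) step altogether.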