Search code examples
rxmlloopssdmx

Loop URL in SDMX format and combine the result into R dataframe


I'm trying to download World Bank's WITS data and convert them into R dataframe. The site's API (ref. pp. 12-13) does not seem to allow users to call all "reporters," "partners," and "products" at once, so one may need to loop through a list of country (either as "reporters" or "partners") using their XML request format:

http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG

(One will need to loop through the "usa" part by a list of "reporters" (country abbreviation)) My goal is to loop through a list a country abbreviation, generate a dataframe for each run, and then bind them together into a larger dataframe. So I referenced this post and use the following code, but it didn't get me any further. I posted my code at below and it will be really appreciated if someone could take a look at and share some tips on this.

# load required packages
library(RCurl)
library(XML)
devtools::install_github("opensdmx/rsdmx")
library(rsdmx)

# if just for one reporter (usa)
myUrl <- "http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"

dataset <- readSDMX(myUrl)
stats <- as.data.frame(dataset)

dim(stats)
[1] 5142    8

# looks like this
head(stats)
  FREQ REPORTER PARTNER     PRODUCTCODE
1    A      USA     ABW    01-05_Animal
2    A      USA     ABW 06-15_Vegetable
3    A      USA     ABW  16-24_FoodProd

## loop through a list of reporters (countries)

library(rvest)

# teams
reporters <- c("aus", "usa", "ukr")

# init
df <- data.frame()

# loop
for(i in reporters){
    # find url
    myUrl <- paste0("http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/", i,"/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG")
    dataset <- readSDMX(myUrl)
    stats <- as.data.frame(dataset)
    # bind to dataframe
    df <- rbind(df, stats)
}

# view captured data
View(df)

# NOTHING!

Solution

  • Consider R's built-in utils::download.file() and then parse with XML. Because your data is attribute-centric with no element text in <Series> and <Obs> nodes, consider the undocumented xmlAttrsToDataFrame, requiring triple colon qualifier, :::.

    Finally, use an apply function like sapply and avoid the bookkeeping of for loops and inefficiently growing an object in a loop by calling rbind iteratively on same dataframe. Below even wraps download and XML parsing in tryCatch for potential errors like ukr.

    library(XML)
    
    build_df <- function(i) {
      url <- paste0(
        "http://wits.worldbank.org/API/V1/SDMX/V21/datasource/",
        "tradestats-tariff/reporter/", i,
        "/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"
      )
      
      df <- NULL
      tryCatch({
        tmp <- tempfile()          # DOWNLOAD XML TO TEMP FILE
        download.file(url, tmp)
        
        # PARSE XML AND EXTRACT ATTRIBUTES TO DATA FRAME
        doc <- XML::xmlParse(tmp)
        df <- cbind(
          XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Series")),
          XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Obs"))
        )
        unlink(tmp)               # DELETE TEMP FILE
      }, warning = function(w) print(w)
      , error = function(e) print(e)
      )
      
      return(df)
    }
    
    reporters <- c("aus", "usa", "ukr")
    
    # NAMED LIST OF DATA FRAMES
    df_list <- sapply(reporters, build_df)
    
    # COMPILE ALL INTO SINGLE DATA FRAME (CALL rbind ONCE)
    final_df <- do.call(rbind, unname(df_list))
    

    Output

    df_list

    head(df_list$aus)
      FREQ REPORTER PARTNER     PRODUCTCODE      INDICATOR TIME_PERIOD        OBS_VALUE DATASOURCE
    1    A      AUS     AFG 50-63_TextCloth AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    2    A      AUS     AFG           manuf AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    3    A      AUS     AFG        Textiles AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    4    A      AUS     AFG           Total AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    5    A      AUS     AFG     UNCTAD-SoP3 AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    6    A      AUS     AGO 41-43_HidesSkin AHS-WGHTD-AVRG        2000               20   WITS-TRN
    
    head(df_list$usa)
      FREQ REPORTER PARTNER     PRODUCTCODE      INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
    1    A      USA     ABW    01-05_Animal AHS-WGHTD-AVRG        2000         0   WITS-TRN
    2    A      USA     ABW 06-15_Vegetable AHS-WGHTD-AVRG        2000         0   WITS-TRN
    3    A      USA     ABW  16-24_FoodProd AHS-WGHTD-AVRG        2000         0   WITS-TRN
    4    A      USA     ABW     27-27_Fuels AHS-WGHTD-AVRG        2000         0   WITS-TRN
    5    A      USA     ABW 28-38_Chemicals AHS-WGHTD-AVRG        2000         0   WITS-TRN
    6    A      USA     ABW 39-40_PlastiRub AHS-WGHTD-AVRG        2000         0   WITS-TRN
    
    head(df_list$ukr)
    NULL
    

    final_df

    head(final_df)
      FREQ REPORTER PARTNER     PRODUCTCODE      INDICATOR TIME_PERIOD        OBS_VALUE DATASOURCE
    1    A      AUS     AFG 50-63_TextCloth AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    2    A      AUS     AFG           manuf AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    3    A      AUS     AFG        Textiles AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    4    A      AUS     AFG           Total AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    5    A      AUS     AFG     UNCTAD-SoP3 AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
    6    A      AUS     AGO 41-43_HidesSkin AHS-WGHTD-AVRG        2000               20   WITS-TRN
    
    tail(final_df)
         FREQ REPORTER PARTNER PRODUCTCODE      INDICATOR TIME_PERIOD          OBS_VALUE DATASOURCE
    8966    A      USA     ZWE       Total AHS-WGHTD-AVRG        2000   5.65257483778078   WITS-TRN
    8967    A      USA     ZWE      Transp AHS-WGHTD-AVRG        2000  0.249632882835876   WITS-TRN
    8968    A      USA     ZWE UNCTAD-SoP1 AHS-WGHTD-AVRG        2000   29.5531507778593   WITS-TRN
    8969    A      USA     ZWE UNCTAD-SoP2 AHS-WGHTD-AVRG        2000    2.7243509937362   WITS-TRN
    8970    A      USA     ZWE UNCTAD-SoP3 AHS-WGHTD-AVRG        2000   5.12801568054237   WITS-TRN
    8971    A      USA     ZWE UNCTAD-SoP4 AHS-WGHTD-AVRG        2000 0.0082396181212962   WITS-TRN