Search code examples
rweb-scrapingshiny

R Shiny error for data scrape web app "number of items to replace is not a multiple of replacement length"


I am attempting to make a shiny web app using a function that I found to scrape data off of a NOAA website. So I did not create the function, but I have used it and it works well. I want to create the app for others at work who are not willing/able to use R so they can download the data from the app.

I have created some simple apps and thought this would be simple as well but I am running into a problem. I run the app and get the message "number of items to replace is not a multiple of replacement length". I think this has to do with the fact that the function creates lists of csv files (line 163 of helpers)and is then adding them together, but I am not certain. I am hoping that I am missing something about creating an empty data frame in shiny that can handle the new table and not have to edit the function within helpers at all.

The info already entered into the ui should work if the app does so you don't need to go to the noaa website to find a station name/number. I really appreciate the help. Thanks.

server.R

library(XML)
library(RCurl)
library(plyr)
source("helpers.R")

shinyServer(
  function(input, output) {
    
    
    
    output$stageData <- renderTable({
      
      input$goButton
      stage <- isolate(data.frame(noaa(begindate = input$begindate, enddate = input$enddate, station = input$station,
                         units = input$units, datum = input$datum, interval = input$interval, time = input$time, continuous = "TRUE")))
      return(stage)
    })
    
    
  }
) 

ui.R

shinyUI(fluidPage(
  titlePanel("NOAA"),
  
  sidebarLayout(
    sidebarPanel(
     
      
      textInput("station", label = h5("Station"), value = "Astoria, OR"),  
      
      textInput("begindate", label = h5("Begin date, must be format: YYYYMMDD"), value = "20140101"),
      
      textInput("enddate",  label = h5("End date, must be format: YYYYMMDD"), value = "20140201"),
      
      selectInput("units", label = h5("Units"), 
                  choices = list("Feet" = "feet", "Meters" = "meters"), 
                  selected = "feet"),
      
      selectInput("datum", label = h5("Datum"), 
                  choices = list("NAVD" = "NAVD", "Station" = "station", "MLLM" = "MLLW", "MLW" = "MLW", "MSL" = "MSL", "MTL" = "MTL", "MHW" = "MHW", "MHHW" = "MHHW", "IGLD" = "IGLD"), 
                  selected = "NAVD"),
      
      selectInput("interval", label = h5("Interval"), 
                  choices = list("HL" = "HL", "6 minute" = "6 minute", "Hourly" = "hourly", "Monthly" = "monthly"), 
                  selected = "HL"),
      
      selectInput("time", label = h5("Time"), 
                  choices = list("LST" = "LST", "GMT" = "GMT", "LST/LDT" = "LST/LDT"), 
                  selected = "LST"),
     
     
      actionButton("goButton", "Go!")
      ),
    
    mainPanel(
      
      tableOutput("stageData")
    )
  )
))

helpers.R

noaa <- function(begindate = "begindate", enddate = "enddate", station = "9439040",
                 units = "feet", datum = "NAVD", interval = "hourly", time = "GMT", continuous = "TRUE") {
  
  if(!continuous %in% c("FALSE", "TRUE", "T", "F")) stop("'continuous' must be set to 'TRUE' or 'FALSE'")
  # set units                                                       
  if(units ==  "meters")       {
    u.csv <- u <- "metric"
  } else if(units ==  "feet") {
    u <- "standard"
    u.csv <- "english"
  }  else stop("invalid units: must be 'feet' or 'meters' ")
  
  
  
  # set datum
  if(datum %in% c("STND", "MHHW", "MHW", "MTL", "MSL", "MLW", "MLLW", "NAVD", "IGLD")){
    datum <- datum  
  } else stop("invalid datum: must be 'STND', 'MHHW', 'MHW', 'MTL',
    'MSL', 'MLW', 'MLLW', 'IGLD', or 'NAVD'")
  
  # set measurement time interval        
  if(interval ==  "6 minute")                   {
    ti.csv <- "water_level"
    ti.name <- "Verified 6-Minute Water Level"
  } else if(interval ==  "hourly")             {
    ti.csv <- "hourly_height"
    ti.name <- "Verified Hourly Height Water Level"
  } else if(interval ==  "HL")                 {
    ti.csv <- "high_low"
    ti.name <- "Verified High/Low Water Level"
  } else if(interval ==  "monthly")                 {
    ti.csv <- "monthly_mean"
    ti.name <- "Verified Monthly Mean Water Level"
  }  else stop("invalid time interval: must be '6 minute', 'hourly', or 'HL'")
  
  # set time zone
  if(time %in%  c("LST/LDT", "GMT", "LST"))     {
    tz <- time
  } else stop("invalid time zone: must be 'LST/LDT', 'GMT', or 'LST' ")
  
  
  # set site name/number indicator
  if(regexpr("[0-9]{7}", station)[1] == 1)      {
    site.ind <- c(1)
  } else if(regexpr("[a-zA-Z]+", station)[1] == 1) {
    site.name <- station
    site.ind <- c(0)
  } else stop("Invalid station entry: must use station name or number. Check active stations 
   at: http://co-ops.nos.noaa.gov/stations.html?type=Water+Levels")
  
  
  
  suppressWarnings(stns <- readLines("http://co-ops.nos.noaa.gov/stations.html?type=Water+Levels")) # list of active stations
  
  if(site.ind == 1) {                                                             # Use station number to identify station
    stn1 <- grep(paste(station, " " , sep = ""), stns)                              # station number is followed by a space, then the station name
    if(length(stn1) == 0) {
      stop("Station number appears to be invalid. No match found at
           http://co-ops.nos.noaa.gov/stations.html?type=Water+Levels")
    } else if(length(stn1) > 1) {
      stop("Station number appears to be duplicated. Try using site name:
           http://co-ops.nos.noaa.gov/stations.html?type=Water+Levels")
    } else if(length(stn1) == 1) {
      stn2 <- regexpr("[0-9] .*</a>$", stns[stn1])
      stn3 <- regmatches(stns[stn1], stn2)                                        # extract matches
      site.name <- gsub("[0-9] |</a>", "", stn3)                                  # clean up site name
    }
  } else if(site.ind == 0) {                                                    # Use station name to identify site number
    no1 <- grep(site.name, stns)                                                    
    if(length(no1) == 1){ 
      no2 <- regexpr("[0-9]{7} .*</a>$", stns[no1])
      no3 <- regmatches(stns[no1], no2)                                       
      station <- site.no <- gsub("[A-Za-z]| |,|</a>", "", no3)                           
    } else if(length(no1) > 1){
      stop("Site name found for multiple active NOAA stations. Look up site number at 
           http://co-ops.nos.noaa.gov/stations.html?type=Water+Levels")
    } else if(length(no1) < 1){
      stop("Site name not found on list of active NOAA stations. Look up sites at 
           http://co-ops.nos.noaa.gov/stations.html?type=Water+Levels. 
           Be attentive to spelling or consider using the station number.")
    }
    }
  
  
  
  
  
  doc <- htmlParse(getURL(paste("http://co-ops.nos.noaa.gov/inventory.html?id=", station, sep="")),
                   useInternalNodes = TRUE)
  nodes <- getNodeSet(doc, "//tr")
  date.list <- sapply(nodes, function(x)  xmlValue(getSibling(x)))
  data.line <- grep(ti.name, date.list)          
  
  first.record <- regexpr("[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}[:punct:][0-9]{2}", 
                          date.list[data.line])
  first.rec <- regmatches(date.list[data.line], first.record )
  first.rec <- as.Date(substr(first.rec, 1, 10), format = "%Y-%m-%d")
  last.record <- regexpr("[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}[:punct:][0-9]{2}$", 
                         date.list[data.line])
  last.rec <- regmatches(date.list[data.line], last.record )
  last.rec <- as.Date(substr(last.rec, 1, 10), format = "%Y-%m-%d")
  
  if(length(date.list[data.line]) > 1) {
    first.rec <- first.rec[1]
    last.rec <- last.rec[length(last.rec)]
  }
  
  
  
  # set start/end dates to full period of record, if left as default
  if(begindate ==  "begindate")        {
    sdate <- strptime(as.character(first.rec), "%Y-%m-%d")
    sdate <- gsub("-", "", sdate)
    sdate <- as.Date(as.character(sdate), "%Y%m%d")              
    sdate <- sdate + 1
  } else if(begindate !=  "begindate")                         {
    sdate <- as.Date(as.character(begindate), "%Y%m%d")
  }
  
  if(enddate ==  "enddate")            {
    edate <- strptime(as.character(last.rec), "%Y-%m-%d")
    edate <- gsub("-", "", edate)             
    edate <- as.Date(as.character(edate), "%Y%m%d")              
    edate <- edate - 1
  } else if(enddate !=  "enddate")                             {
    edate <- as.Date(as.character(enddate), "%Y%m%d")
  }
  
  
  
  # check if date range is within period of record, and check if time period
  # requires splitting into smaller units. Interval limit is 1 year for hourly
  # and HL data,  31 days for 6-min data, 10 years for monthly data.
  dates <- sdate
  if(interval == "HL") { if(sdate < first.rec | edate > last.rec) {
    stop("invalid time interval")
  } else if( as.numeric(edate - sdate) > 364) {
    dates <- seq(sdate,edate, 365)
  } else(dates <- c(sdate,edate))}
  if(interval == "hourly") { if(sdate < first.rec | edate > last.rec) {
    stop("invalid time interval")
  } else if( as.numeric(edate - sdate) > 364) {
    dates <- seq(sdate,edate, 365)
  } else(dates <- c(sdate,edate))}
  if(interval == "6 minute") { if(sdate < first.rec | edate > last.rec) {
    stop("invalid time interval")
  } else if( as.numeric(edate - sdate) > 30)  {
    dates <- seq(sdate,edate, 31)
  } else(dates <- c(sdate,edate))}
  
  if(!edate %in% dates[length(dates)]) dates <- c(dates, edate)
  
  dates2 <- format(as.Date(dates), "%Y%m%d")     # re-format dates for the url
  
  #   old version: 
  #   url.temp <- c(paste0("http://co-ops.nos.noaa.gov/api/datagetter?", "begin_date=", dates2[i], 
  #                        "&end_date=", dates2[i+1], "&station=", station, "&product=", ti.csv, 
  #                        "&units=", u.csv, "&time_zone=", tz, "&datum=", datum, 
  #                        "&application=Tides_and_Currents","&format=csv"))
  #   
  
  # create list of csv files
  for(i in 1:(length(dates2) - 1)) {
    url.temp <- c(paste0("http://tidesandcurrents.noaa.gov/api/datagetter?", 
                         "product=", ti.csv, 
                         "&application=NOS.COOPS.TAC.WL",
                         "&begin_date=", dates2[i], 
                         "&end_date=", dates2[i+1], 
                         "&datum=", datum, 
                         "&station=", station,
                         "&time_zone=", tz, 
                         "&units=", u.csv, 
                         "&format=csv"))
    if (!exists("url.list")){
      url.list <- url.temp
    }
    # if the dataset exists, add to it
    if (exists("url.list")){
      url.list[i] <- url.temp
      rm(url.temp)
    }
  }
  
  
  lapply.csv <- lapply(url.list, function(x) read.csv(x))
  data.csv <- do.call(rbind, lapply.csv)
  data.csv$station <- rep(site.name, times = nrow(data.csv))
  
  label <- paste("ver_wtr_lev_", units,"_", datum, sep="")
  t.label <- paste("time (", time, ")", sep = "")
  
  
  # clean up the data
  if(interval == "HL" ) {
    data.csv$datetime <- as.POSIXlt(data.csv[, 1], format = "%Y-%m-%d %H:%M")
    data.csv <- data.csv[, c(7, 2, 3, 6)]
    names(data.csv) <- c(t.label, label, "tide", "station")
    levels(data.csv$tide) <- c("H", "HH", "L", "LL")
  }
  
  if(interval == "6 minute" ) {
    data.csv$datetime <- as.POSIXlt(data.csv[, 1], format = "%Y-%m-%d %H:%M")
    data.csv <- data.csv[, c(10, 2, 9)]
    names(data.csv) <- c(t.label, label, "station")
  }
  
  if(interval == "hourly" ) {
    data.csv$datetime <- as.POSIXlt(data.csv[, 1], format = "%Y-%m-%d %H:%M")
    data.csv <- data.csv[, c(7, 2, 6)]
    names(data.csv) <- c(t.label, label, "station") 
  }
  
  if(interval == "monthly" ) {
    data.csv$datetime <- data.csv$Year + data.csv$Month / 12
  }
  
  
  if(interval == "hourly" & (continuous == "TRUE" | continuous == "T")) {
    data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
    time.df <- data.frame(seq(from = data.csv[1, 1], to = data.csv[nrow(data.csv), 1], by = 60*60))
    names(time.df)[1] <- t.label
    data.csv <- join_all(list(time.df, data.csv[!duplicated(data.csv[, 1]), ]))
    
  } else if(interval == "6 minute" & (continuous == "TRUE" | continuous == "T")) {
    data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
    time.df <- data.frame(seq(from = data.csv[1, 1], to = data.csv[nrow(data.csv), 1], by = 60*6))
    names(time.df)[1] <- t.label
    data.csv <- join_all(list(time.df, data.csv[!duplicated(data.csv[, 1]), ]))
    
  } else if(interval == "monthly" & (continuous == "TRUE" | continuous == "T" )) {
    time.df <- data.frame(seq(from = data.csv$datetime[1], to = data.csv$datetime[nrow(data.csv)], by = 1 / 12))
    names(time.df) <- "datetime"
    time.df$datetime <- round(time.df$datetime, 2)
    data.csv$datetime <- round(data.csv$datetime, 2)
    data.csv <- join_all(list(time.df, data.csv))
    data.csv$Year <- as.numeric(data.csv$Year)
    data.csv$Year[is.na(data.csv$station)] <- as.numeric(substr(data.csv$datetime[is.na(data.csv$station)], 1, 4))
    data.csv$Month[is.na(data.csv$station)] <- round((data.csv$datetime[is.na(data.csv$station)] - data.csv$Year[is.na(data.csv$station)]) * 12)
    data.csv$station[is.na(data.csv$station)] <- site.name
  } else data.csv <- data.csv[!duplicated(data.csv[, 1]), ]
  
  invisible(data.csv)
  
    }

Solution

  • It turns out that renderTable has problems with the POSIXlt format.

    Add stage[,'time..LST.'] <- as.character(stage[,'time..LST.']) before return(stage) to fix your problem.