Search code examples
rodbcrodbc

RODBC: Why are the values "NA" for empty and space-only values in sqlQuery()?


I am a newbie when it comes to R. I was looking at RODBC results returned from the following:

> library(RODBC)
> dbcon <- odbcDriverConnect("DRIVER={SQL SERVER};SERVER=MYSERV;DATABASE=SOME", tabQuote='', colQuote='')
> sqlQuery(dbcon, "SELECT 3, 'a', ' ', '', NULL")
    .1 .2 .3 .4
1 3  a NA NA NA

Why is ' ' and '' returning NA? I can't find anything in the documentation for this behavior. What am I missing?


Solution

  • So decided to look at the source code for RODBC's sqlQuery method since I remembered it was open-source:

    sqlQuery <-
        function(channel, query, errors = TRUE, ..., rows_at_time)
    {
        if(!odbcValidChannel(channel))
           stop("first argument is not an open RODBC channel")
        if(missing(query))
            stop("missing argument 'query'")
        ## could argue that 'max' should restrict rows_at_time
        rows_at_time <- if(missing(rows_at_time)) attr(channel, "rows_at_time")
        else max(1, min(1024, rows_at_time))
        stat <- odbcQuery(channel, query, rows_at_time)
        if(stat == -1L) {
            if(errors) return(odbcGetErrMsg(channel))
            else return(invisible(stat))
        } else return(sqlGetResults(channel, errors = errors, ...))
    }
    

    so it calls sqlGetResults:

    sqlGetResults <-
        function (channel, as.is = FALSE,
                  errors = FALSE, max = 0, buffsize = 1000,
                  nullstring = NA_character_, na.strings = "NA",
                  believeNRows = TRUE, dec = getOption("dec"),
                  stringsAsFactors = default.stringsAsFactors())
    {
        if(!odbcValidChannel(channel))
           stop("first argument is not an open RODBC channel")
        as.df <- function(value, colnames) {
            for(i in seq_along(value))
                if(is.list(value[[i]])) class(value[[i]]) <- "ODBC_binary"
            ## convert list to data frame
            class(value) <- "data.frame"
            names(value) <- make.unique(colnames)
            row.names(value) <- seq(along=value[[1L]])
            value
        }
        cols <- .Call(C_RODBCNumCols, attr(channel, "handle_ptr"))
        ## FIXME: should this be <= 0L?
        if(cols < 0L) {
            if(errors) return("No data")
            else return(invisible(-1L))
        }
        cData <- .Call(C_RODBCColData, attr(channel, "handle_ptr"))
        dbdata <- odbcFetchRows(channel,
                                max = max,
                                buffsize = buffsize,
                                nullstring = nullstring,
                                believeNRows = believeNRows)
        if(dbdata$stat < 0L) {
        if(errors) return(odbcGetErrMsg(channel))
        else return(invisible(dbdata$stat))
        }
        data <- as.df(dbdata$data, cData$names)
        if(nrow(data) > 0L) {
            cols <- ncol(data)
            enc <- attr(channel, "encoding")
            if(length(na.strings))
                for (i in 1L:cols)
                    if(is.character(data[,i]))
                        data[data[,i] %in% na.strings, i] <- NA
            if(is.logical(as.is)) {
                as.is <- rep(as.is, length = cols)
            } else if(is.numeric(as.is)) {
                if(any(as.is < 1 | as.is > cols))
                    stop("invalid numeric 'as.is' expression")
                i <- rep(FALSE, cols)
                i[as.is] <- TRUE
                as.is <- i
            } else if(length(as.is) != cols)
                stop("'as.is' has the wrong length ", length(as.is),
                     " != cols = ", cols)
            for (i in seq_len(cols)) {
                if(is.character(data[[i]]) && nchar(enc))
                    data[[i]] <- iconv(data[[i]], from = enc)
                if(as.is[i] || is.list(data[[i]])) next
                if(is.numeric(data[[i]])) next
                if(cData$type[i] == "date")
                    data[[i]] <- as.Date(data[[i]])
                else if(cData$type[i] == "timestamp")
                    data[[i]] <- as.POSIXct(data[[i]])
                else
                    data[[i]] <- type.convert(as.character(data[[i]]),
                                              na.strings = na.strings,
                                              as.is = !stringsAsFactors,
                                              dec = dec)
            }
        }
        data
    }
    

    so for string/varchar values it all comes down to:

    data[[i]] <- type.convert(as.character(data[[i]]),
                                             na.strings = na.strings,
                                              as.is = !stringsAsFactors,
                                              dec = dec)
    

    so let's try type.convert:

    > type.convert("a")
    [1] a
    

    ok, that is expected. Let's try the odd cases now:

    > type.convert("")
    [1] NA
    

    hmmm... OK, so that is how we get NA

    > type.convert("       ")
    [1] NA
    

    ok, that's NA too. Now, why does as.is not return NA?

    if(as.is[i] || is.list(data[[i]])) next
    

    aha, it just exits when as.is is set and doesn't call type.convert() that explains why it doesn't return NA when the as.is flag is set