Search code examples
rr-haven

Hack to include special characters in file path in haven::read_sav()


There seems to be an issue with the haven (1.1.1) package when including any type of special character in the file path, including just the file name.

Assuming this is a real issue I am looking for some kind of neat hack/solution to get around it.

A (not ideal) example would be to have R copy the file to a friendlier path, give it a "better" filename, and then load it with haven. Such as:

# Workaround: copy the file out of the non-ASCII directory straight to an
# ASCII-only file name, using explicit paths instead of setwd() so the
# session's working directory is left untouched.
base_dir <- "c:/temp"
fn <- "randóóm.sav"

# Drop every character that is not an ASCII letter or digit, "-", ".", "/"
# or whitespace ("randóóm.sav" becomes "randm.sav").
safe_fn <- gsub("[^-\\./a-zA-Z0-9[:space:]]", "", fn)
safe_path <- file.path(base_dir, safe_fn)

# Copy directly to the sanitized name; no intermediate copy and rename.
# file.copy() returns FALSE on failure rather than raising, so check it.
copied <- file.copy(
  file.path(base_dir, "äglæpath", fn),
  safe_path,
  overwrite = TRUE
)
if (!copied) {
  stop("Could not copy ", fn, " to ", safe_path, call. = FALSE)
}
# now apply read_sav() to the copy, e.g. haven::read_sav(safe_path)

I'm using:

R version 3.5.0 (2018-04-23)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

Solution

  • Unfortunately, I have been able to reproduce the problem on Windows 10 with both the standard version of haven and the devtools version of haven. This appears to be a known bug in haven (issue #371).

    Recommend Workaround:

    Move the file to a directory whose path and filename contain no non-ASCII characters (umlauts, accented letters, ligatures such as æ, etc.). Thus, your workaround works as stated.

    > file.path(dataFilepath, dtaFilename)
    [1] "äglæpath/randóóm.dta"
    
    > dtaFilename <- gsub("[^-\\./a-zA-Z0-9[:space:]]", "", dtaFilename)
    > bdatFilename <- gsub("[^-\\./a-zA-Z0-9[:space:]]", "", bdatFilename)
    > savFilename <- gsub("[^-\\./a-zA-Z0-9[:space:]]", "", savFilename)
    > dataFilepath <- gsub("[^-\\./a-zA-Z0-9[:space:]]", "", dataFilepath)
    
    > file.path(dataFilepath, dtaFilename)
    [1] "glpath/randm.dta"
    
    > # Stata
    > read_dta(dtaDest)
    # A tibble: 150 x 5
       sepallength sepalwidth petallength petalwidth species
             <dbl>      <dbl>       <dbl>      <dbl> <chr>  
     1        5.10       3.5         1.40      0.200 setosa 
     2        4.90       3           1.40      0.200 setosa 
     3        4.70       3.20        1.30      0.200 setosa 
     4        4.60       3.10        1.5       0.200 setosa 
     5        5          3.60        1.40      0.200 setosa 
     6        5.40       3.90        1.70      0.400 setosa 
     7        4.60       3.40        1.40      0.300 setosa 
     8        5          3.40        1.5       0.200 setosa 
     9        4.40       2.90        1.40      0.200 setosa 
    10        4.90       3.10        1.5       0.100 setosa 
    # ... with 140 more rows
    > 
    

    Github Bug #371

    Read_*() does not work for special characters in file path #371 https://github.com/tidyverse/haven/issues/371

    The problematic code appears to be in df_parse_dta_file(), lines 594-612 of haven/src/DfReader.cpp.

    Code to Reproduce

    # Reproduction script: download haven's example data files into a
    # directory and file names that contain non-ASCII characters, then try to
    # read them back with haven.
    # NOTE(review): require() returns FALSE instead of erroring when a package
    # is missing; library() would fail fast.
    require(haven)
    # NOTE(review): no stringi function is called anywhere in this snippet.
    require(stringi)
    
    # Raw download links for the iris example files in the haven repository
    # (Stata, SAS and SPSS formats).
    dtaURL  <- "https://github.com/tidyverse/haven/blob/master/inst/examples/iris.dta?raw=true"
    bdatURL <- "https://github.com/tidyverse/haven/blob/master/inst/examples/iris.sas7bdat?raw=true"
    savURL  <- "https://github.com/tidyverse/haven/blob/master/inst/examples/iris.sav?raw=true"
    
    # Local file names deliberately contain accented characters. Note that the
    # SAS file is stored with a ".bdata" extension although the downloaded
    # file is iris.sas7bdat.
    dtaFilename   <- "randóóm.dta"
    bdatFilename <- "randóóm.bdata"
    savFilename   <- "randóóm.sav"
    
    # The target directory name also contains non-ASCII characters.
    dataFilepath      <- "äglæpath"
    
    # Create the target directory (relative to the working directory) only if
    # it does not exist yet.
    if (!dir.exists(dataFilepath)) {
      dir.create(file.path(dataFilepath), showWarnings = TRUE)
    }
    
    # Full destination paths: directory + file name.
    dtaDest = file.path(dataFilepath, dtaFilename)
    bdatDest = file.path(dataFilepath, bdatFilename )
    savDest = file.path(dataFilepath, savFilename )
    
    # Fetch each file; mode = "wb" writes the download in binary mode.
    # NOTE(review): method = "wget" presumably needs the external wget tool on
    # the PATH -- confirm on the target machine.
    download.file(dtaURL, destfile = dtaDest, method = "wget", mode = "wb")
    download.file(bdatURL, destfile = bdatDest, method = "wget", mode = "wb")
    download.file(savURL, destfile = savDest, method = "wget", mode = "wb")
    
    
    # Read each file through its non-ASCII path; these calls are the step
    # under test.
    # Stata
    read_dta(dtaDest)
    
    # SAS
    read_sas(bdatDest)
    
    # SPSS
    read_sav(savDest)
    

    Console Output

    > require(haven)
    > require(stringi)
    > dtaURL  <- "https://github.com/tidyverse/haven/blob/master/inst/examples/iris.dta?raw=true"
    > bdatURL <- "https://github.com/tidyverse/haven/blob/master/inst/examples/iris.sas7bdat?raw=true"
    > savURL  <- "https://github.com/tidyverse/haven/blob/master/inst/examples/iris.sav?raw=true"
    > dtaFilename   <- "randóóm.dta"
    > bdatFilename <- "randóóm.bdata"
    > savFilename   <- "randóóm.sav"
    > dataFilepath      <- "äglæpath"
    > if (!dir.exists(dataFilepath)) {
    +   dir.create(file.path(dataFilepath), showWarnings = TRUE)
    + }
    > dtaDest = file.path(dataFilepath, dtaFilename)
    > bdatDest = file.path(dataFilepath, bdatFilename )
    > savDest = file.path(dataFilepath, savFilename )
    > download.file(dtaURL, destfile = dtaDest, method = "wget", mode = "wb")
    --2018-05-29 15:56:59--  https://github.com/tidyverse/haven/blob/master/inst/examples/iris.dta?raw=true
    Resolving github.com (github.com)... 192.30.255.113, 192.30.255.112
    Connecting to github.com (github.com)|192.30.255.113|:443... connected.
    HTTP request sent, awaiting response... 302 Found
    Location: https://github.com/tidyverse/haven/raw/master/inst/examples/iris.dta [following]
    --2018-05-29 15:56:59--  https://github.com/tidyverse/haven/raw/master/inst/examples/iris.dta
    Reusing existing connection to github.com:443.
    HTTP request sent, awaiting response... 302 Found
    Location: https://raw.githubusercontent.com/tidyverse/haven/master/inst/examples/iris.dta [following]
    --2018-05-29 15:56:59--  https://raw.githubusercontent.com/tidyverse/haven/master/inst/examples/iris.dta
    Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.52.133
    Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.52.133|:443... connected.
    HTTP request sent, awaiting response... 200 OK
    Length: 8213 (8.0K) [application/octet-stream]
    Saving to: '\344gl\346path/rand\363\363m.dta'
    
         0K ........                                              100% 1.56M=0.005s
    
    2018-05-29 15:56:59 (1.56 MB/s) - '\344gl\346path/rand\363\363m.dta' saved [8213/8213]
    
    > download.file(bdatURL, destfile = bdatDest, method = "wget", mode = "wb")
    --2018-05-29 15:56:59--  https://github.com/tidyverse/haven/blob/master/inst/examples/iris.sas7bdat?raw=true
    Resolving github.com (github.com)... 192.30.255.113, 192.30.255.112
    Connecting to github.com (github.com)|192.30.255.113|:443... connected.
    HTTP request sent, awaiting response... 302 Found
    Location: https://github.com/tidyverse/haven/raw/master/inst/examples/iris.sas7bdat [following]
    --2018-05-29 15:56:59--  https://github.com/tidyverse/haven/raw/master/inst/examples/iris.sas7bdat
    Reusing existing connection to github.com:443.
    HTTP request sent, awaiting response... 302 Found
    Location: https://raw.githubusercontent.com/tidyverse/haven/master/inst/examples/iris.sas7bdat [following]
    --2018-05-29 15:56:59--  https://raw.githubusercontent.com/tidyverse/haven/master/inst/examples/iris.sas7bdat
    Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.52.133
    Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.52.133|:443... connected.
    HTTP request sent, awaiting response... 200 OK
    Length: 131072 (128K) [application/octet-stream]
    Saving to: '\344gl\346path/rand\363\363m.bdata'
    
         0K .......... .......... .......... .......... .......... 39% 4.05M 0s
        50K .......... .......... .......... .......... .......... 78% 19.7M 0s
       100K .......... .......... ........                        100% 19.3M=0.02s
    
    2018-05-29 15:57:00 (7.83 MB/s) - '\344gl\346path/rand\363\363m.bdata' saved [131072/131072]
    
    > download.file(savURL, destfile = savDest, method = "wget", mode = "wb")
    --2018-05-29 15:57:01--  https://github.com/tidyverse/haven/blob/master/inst/examples/iris.sav?raw=true
    Resolving github.com (github.com)... 192.30.255.113, 192.30.255.112
    Connecting to github.com (github.com)|192.30.255.113|:443... connected.
    HTTP request sent, awaiting response... 302 Found
    Location: https://github.com/tidyverse/haven/raw/master/inst/examples/iris.sav [following]
    --2018-05-29 15:57:01--  https://github.com/tidyverse/haven/raw/master/inst/examples/iris.sav
    Reusing existing connection to github.com:443.
    HTTP request sent, awaiting response... 302 Found
    Location: https://raw.githubusercontent.com/tidyverse/haven/master/inst/examples/iris.sav [following]
    --2018-05-29 15:57:01--  https://raw.githubusercontent.com/tidyverse/haven/master/inst/examples/iris.sav
    Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.52.133
    Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.52.133|:443... connected.
    HTTP request sent, awaiting response... 200 OK
    Length: 6690 (6.5K) [application/octet-stream]
    Saving to: '\344gl\346path/rand\363\363m.sav'
    
         0K ......                                                100% 3.09M=0.002s
    
    2018-05-29 15:57:01 (3.09 MB/s) - '\344gl\346path/rand\363\363m.sav' saved [6690/6690]
    
    > # Stata
    > read_dta(dtaDest)
    Error in df_parse_dta_file(spec, encoding) : 
      Failed to parse <...>/äglæpath/randóóm.dta: Unable to open file.