Search code examples
rlabelexportspss

how to use R to export SPSS variable names and labels to *.txt file


I need to convert SPSS (*.sav) file variable names, labels, and attributes (NOT data values) to *.txt / *.csv in R. The preferred structure in R (for export) is a data.frame in this form, see the example below:

> new.label.dataframe
VARIABLE.NAME   ATTRIBUTE   LABEL
STATUS          1           Complete
STATUS          2           Screen out Q1
STATUS          3           Screen out Q5
SAMPLE          1           Kunín + Hollandia (včetně černobílých)
SAMPLE          2           Mlýn + Krajina
...etc.
Q2_1            NA          dobré složení
Q2_1            NA          Žádné konzervanty
Q2_1            NA          lahodný
...etc.

After importing data to R with haven() I used the R library sjlabelled() to extract labels:

library(haven)    
mydata <- read_sav("DATA_FINAL.sav", encoding = NULL, user_na = FALSE)
library(sjlabelled)
lab <- get_labels(mydata, values=T):
str(lab)

> str(lab)
List of 762

 $ ID          : NULL
 $ STATUS      : Named chr [1:3] "Complete" "Screen out Q1" "Screen out Q5"
  ..- attr(*, "names")= chr [1:3] "1" "2" "3"
 $ SAMPLE      : Named chr [1:2] "Kunín + Hollandia (včetně černobílých)" "Mlýn + Krajina"
  ..- attr(*, "names")= chr [1:2] "1" "2"
 $ ORDER       : Named chr [1:2] "Kunín ==> Hollandia || Mlýn ==> Krajina" "Hollandia ==> Kunín || Krajina ==> Mlýn"
  ..- attr(*, "names")= chr [1:2] "1" "2"
 $ Q1          : Named chr [1:6] "5x týdně nebo častěji" "1x – 4x týdně" "1x za dva týdny" "1x za měsíc" ...
  ..- attr(*, "names")= chr [1:6] "1" "2" "3" "4" ...
 $ Q2_1        : chr [1:473] "dobré složení" "Žádné konzervanty" "lahodný" "Husty" ...
 $ Q2_2        : chr [1:431] "prává chuť" "Žádné Přidatné látky" "vláčný" "Chutny" ...
 $ Q2_3        : chr [1:311] "" "Žádná Zahušťovadla" "konzistentní" "Ovocny" ...


head(lab, 6)

> head(lab, 6)
$ID
NULL

$STATUS
              1               2               3 
     "Complete" "Screen out Q1" "Screen out Q5" 

$SAMPLE
                                       1                                        2 
"Kunín + Hollandia (včetně černobílých)"                         "Mlýn + Krajina" 

$ORDER
                                        1                                         2 
"Kunín ==> Hollandia || Mlýn ==> Krajina" "Hollandia ==> Kunín || Krajina ==> Mlýn" 

$Q1
                      1                       2                       3                       4                       5                       6 
"5x týdně nebo častěji"         "1x – 4x týdně"       "1x za dva týdny"           "1x za měsíc"      "1x za čtvrt roku"            "Méně často" 

$Q2_1
  [1] "dobré složení"                                                         "Žádné konzervanty"                                                    
  [3] "lahodný"                                                               "Husty"                                                                
  [5] "Krémový"                                                               ""                                                                     
  [7] "Musi mi chutnat"                                                       "hustý"                                                                
  [9] "Chutny"                                                                "Hustý"                                                                
 [11] "vysoké procento tuku"                                                  "krémový" 

The class of the "lab" is a list:

> class(lab)
[1] "list"

Note that the variable Q2_1 has just labels ("dobre slozeni" etc.) but no attributes (1,2,3...). It is quite easy to extract vectors of variable names, attributes, and labels:

my.vars <- names(lab)
my.atts <- unlist(lapply(lab, attributes))
my.labs <- unlist(unname(lab))

...but their lengths indeed differ:

> length(my.vars)
[1] 762
> length(my.atts)
[1] 1734
> length(my.labs)
[1] 4775

The best what I am able to get is below - not very handy, however:

> lab.u <- data.frame(unlist(lab))
> head(lab.u, 20)
                                     unlist.lab.
STATUS.1                                Complete
STATUS.2                           Screen out Q1
STATUS.3                           Screen out Q5
SAMPLE.1  Kunín + Hollandia (včetně černobílých)
SAMPLE.2                          Mlýn + Krajina
ORDER.1  Kunín ==> Hollandia || Mlýn ==> Krajina
ORDER.2  Hollandia ==> Kunín || Krajina ==> Mlýn
Q1.1                       5x týdně nebo častěji
Q1.2                               1x – 4x týdně
Q1.3                             1x za dva týdny
Q2_11                              dobré složení
Q2_12                          Žádné konzervanty
Q2_13                                    lahodný

Please, any hints to convert the "lab" to the structure described above, at the beginning of this message? Many thanks! Zdenek Skala


Solution

  • You could write a small Vectorized function that scrapes the "label.table" attributes of your SPSS data and does case handling for "factor" (i.e. with labels) or other classes (without labels). I'm not sure about the benefits of using sjlabelled package, so I don't use it.

    attFun <- Vectorize(function(x) {
      if (is.factor(spss[[x]])) {
        a <- sort(attributes(spss)$label.table[[x]])
        cbind(VARIABLE.NAME=names(spss)[x], ATTRIBUTE=unname(a), LABEL=names(a))
      } else {
        u <- unique(spss[[x]])
        cbind(VARIABLE.NAME=names(spss)[x], ATTRIBUTE=NA, LABEL=u)
      }
    }, SIMPLIFY=F)
    

    Result just needs to be rbinded.

    res <- do.call(rbind.data.frame, attFun(1:5))
    res
    #         VARIABLE.NAME ATTRIBUTE       LABEL
    # 1             vehicle         1         car
    # 2             vehicle         2         LGV
    # 3             vehicle         3         SUV
    # 4             vehicle         4         bus
    # 5             vehicle         5         HGV
    # 6             vehicle         6        taxi
    # 7             vehicle         7         PTW
    # 8              Colour         1        blue
    # 9              Colour         2         red
    # 10             Colour         3 silver/grey
    # 11             Colour         4       white
    # 12             Colour         5       black
    # 13             Colour         6       green
    # 14             Colour         9       other
    # 15             Colour        99     unknown
    # 16               hour      <NA>       57600
    # 17               hour      <NA>       25200
    # 18               hour      <NA>       28800
    # 19               hour      <NA>       32400
    # 20               hour      <NA>       50400
    # 21               hour      <NA>       54000
    # 22               hour      <NA>       39600
    # 23               hour      <NA>       43200
    # 24               hour      <NA>       61200
    # 25               hour      <NA>       28800
    # 26               hour      <NA>       36000
    # 27               hour      <NA>       36000
    # 28               hour      <NA>       39600
    # 29               hour      <NA>       46800
    # 30               hour      <NA>       46800
    # 31            Colour2      <NA>        blue
    # 32            Colour2      <NA>         red
    # 33            Colour2      <NA>     unknown
    # 34            Colour2      <NA>       other
    # 35            Colour2      <NA> silver/grey
    # 36            Colour2      <NA>       white
    # 37            Colour2      <NA>       black
    # 38            Colour2      <NA>       green
    # 39 Distance_from_kerb      <NA>         0.5
    # 40 Distance_from_kerb      <NA>           1
    # 41 Distance_from_kerb      <NA>        0.25
    # 42 Distance_from_kerb      <NA>        1.25
    # 43 Distance_from_kerb      <NA>        0.75 
    

    Example data:

    spss <- foreign::read.spss("http://staff.bath.ac.uk/pssiw/stats2/PsychBike.sav")
    spss[[4]] <- as.character(spss$Colour)
    names(spss)[4] <- "Colour2"