Search code examples
rdictionarytextimportstata

Automate importing data from a text file


I would like to use a Spanish household survey from Instituto Nacional de Estadística (INE) for a research project. The data comes in a text file, where the answers of the survey are stuck in a continuum of numbers, with an auxiliary file that tells you from digit 1 to 5 it corresponds to variable, from 6 to 8 to variable 2, etc. This file is also a text file.

Is there a way making use of this auxiliary file in the import of the data instead of manually telling R, Stata, etc how to delimit the columns?

The auxiliary file looks like this:

enter image description here

Where the first columns is the variable name, the second column the number of digits the variable has and in the third column which position is the variable situated in the data text file, which looks like this:

enter image description here


Solution

  • This was indeed a really silly data file but that made the challenge the more fun. Here is perhaps a solution. I was not able to test it on your files as you posted screenshots, but I tried to replicate the meta data file and the raw data file as best as I could.

    To replicate my code before you test it on your files, start by creating a folder somewhere on your computer. Copy the path to that folder and paste it in the local folder "path/to/your/folder" on the first line in my code.

    Then copy this block into a txt file that you save as meta_data.txt in that folder:

    
    ********************* text *********************
    ********************* text *********************
    
    @1    VAR1      $4.     /*   1     4  */
          VAR2      $5.     /*   5     9  */
          VAR3      $2.     /*  10    11  */
          VAR4      $2.     /*  12    13  */
          VAR5      $1.     /*  14    16  */
    
    

    And then copy this block into a txt file that you save as raw_data.txt in the same folder. I have added an empty row, and empty cell, mixed str/numric etc to make sure my code can handle the most obvious irregularities.

    A11122222334455
    B11122222334455
    C11122222  4455
    
    D11122222334455
    201200001-9AA12
    
    

    Then you should be able to run this code. The comments should explain what each line does. Hopefully it works on your actual files without too much modification. If you get errors that you cannot solve, please explain the errors you get in as much details as possible, and provide sample of your files in code blocks like I did instead of images.

        local folder "path/to/your/folder"
        local meta_data_file "`folder'/meta_data.txt"
        local raw_data_file  "`folder'/raw_data.txt"
        local csv_data_file  "`folder'/csv_data.csv"
     
        tempname meta_data raw_data csv_data
        
        *****************
        * Get column numbers from meta data
        
        file open `meta_data' using "`meta_data_file'", read 
        file read `meta_data' line
        local varnames ""
        local colnums ""
        
        *Loop over meta data file, line by line
        while r(eof)==0 {
                
            display `"`line'"'
            
            *Split the line where it has the first "/"
            gettoken varname_str colnum_str : line, parse("/")
            
            *Lines without "/" do not have col nums so skip them
            if "`colnum_str'" != "" {
                
                * Get the varname (first word that does not have "@" in it)
                tokenize "`varname_str'"
                if strpos("`1'","@") == 0 local varname = "`1'"
                else                      local varname = "`2'"
                local varnames "`varnames',`varname'"
                
                *Split string into each string seprated by space, f.ex. "/*","1","4","*/"
                tokenize "`colnum_str'"
                local colnums "`colnums' `2':`3'"
            }
            local linenum = `linenum' + 1
            file read `meta_data' line
        }   
        file close `meta_data'
        
        display "`varnames'"
        display "`colnums'"
        
        
        *****************
        * Convert silly file to csv file
        
        *Open silly data
        file open `raw_data' using "`raw_data_file'", read 
        file read `raw_data' old_row
        *Prepare csv file to read from
        file open `csv_data' using "`csv_data_file'", write text replace
        
        *Write varnames on first row
        file write `csv_data' "`varnames'" _n
        
        *Loop over silly data file, line by line
        while r(eof)==0 {
                
            local new_row ""
            
            *Loop over the colnums preared above
            foreach varcols of local colnums {
                
                *split each colnum pair on ":" and 
                tokenize "`varcols'", parse(":")
                local startcol = `1'
                local value_len = `3' - `1' + 1
                
                *Get the value from the row in the old dataset
                local value = substr("`old_row'",`startcol',`value_len')
                
                *Prepare the new row in csv format by inserting ","
                local new_row "`new_row',`value'"
            }
            
            *Write the new row in csv format tot he file
            display "old row: `old_row'"
            display "new row: `new_row'"
            display ""
            file write `csv_data' "`new_row'" _n
            
            *Read next line in the old data
            local linenum = `linenum' + 1
            file read `raw_data' old_row
        }   
        
        *Close both files
        file close `raw_data'
        file close `csv_data'
        
        *Import the csv file
        import delimited "`csv_data_file'", clear
        drop v1