Automate importing data from a text file

I would like to use a Spanish household survey from Instituto Nacional de Estadística (INE) for a research project. The data comes in a text file, where the answers of the survey are stuck in a continuum of numbers, with an auxiliary file that tells you from digit 1 to 5 it corresponds to variable, from 6 to 8 to variable 2, etc. This file is also a text file.

Is there a way making use of this auxiliary file in the import of the data instead of manually telling R, Stata, etc how to delimit the columns?

The auxiliary file looks like this:

Where the first columns is the variable name, the second column the number of digits the variable has and in the third column which position is the variable situated in the data text file, which looks like this:

Solution

This was indeed a really silly data file but that made the challenge the more fun. Here is perhaps a solution. I was not able to test it on your files as you posted screenshots, but I tried to replicate the meta data file and the raw data file as best as I could.

To replicate my code before you test it on your files, start by creating a folder somewhere on your computer. Copy the path to that folder and paste it in the local folder "path/to/your/folder" on the first line in my code.

Then copy this block into a txt file that you save as meta_data.txt in that folder:


********************* text *********************
********************* text *********************

@1    VAR1      $4.     /*   1     4  */
      VAR2      $5.     /*   5     9  */
      VAR3      $2.     /*  10    11  */
      VAR4      $2.     /*  12    13  */
      VAR5      $1.     /*  14    16  */

And then copy this block into a txt file that you save as raw_data.txt in the same folder. I have added an empty row, and empty cell, mixed str/numric etc to make sure my code can handle the most obvious irregularities.

A11122222334455
B11122222334455
C11122222  4455

D11122222334455
201200001-9AA12

Then you should be able to run this code. The comments should explain what each line does. Hopefully it works on your actual files without too much modification. If you get errors that you cannot solve, please explain the errors you get in as much details as possible, and provide sample of your files in code blocks like I did instead of images.

    local folder "path/to/your/folder"
    local meta_data_file "`folder'/meta_data.txt"
    local raw_data_file  "`folder'/raw_data.txt"
    local csv_data_file  "`folder'/csv_data.csv"
 
    tempname meta_data raw_data csv_data
    
    *****************
    * Get column numbers from meta data
    
    file open `meta_data' using "`meta_data_file'", read 
    file read `meta_data' line
    local varnames ""
    local colnums ""
    
    *Loop over meta data file, line by line
    while r(eof)==0 {
            
        display `"`line'"'
        
        *Split the line where it has the first "/"
        gettoken varname_str colnum_str : line, parse("/")
        
        *Lines without "/" do not have col nums so skip them
        if "`colnum_str'" != "" {
            
            * Get the varname (first word that does not have "@" in it)
            tokenize "`varname_str'"
            if strpos("`1'","@") == 0 local varname = "`1'"
            else                      local varname = "`2'"
            local varnames "`varnames',`varname'"
            
            *Split string into each string seprated by space, f.ex. "/*","1","4","*/"
            tokenize "`colnum_str'"
            local colnums "`colnums' `2':`3'"
        }
        local linenum = `linenum' + 1
        file read `meta_data' line
    }   
    file close `meta_data'
    
    display "`varnames'"
    display "`colnums'"
    
    
    *****************
    * Convert silly file to csv file
    
    *Open silly data
    file open `raw_data' using "`raw_data_file'", read 
    file read `raw_data' old_row
    *Prepare csv file to read from
    file open `csv_data' using "`csv_data_file'", write text replace
    
    *Write varnames on first row
    file write `csv_data' "`varnames'" _n
    
    *Loop over silly data file, line by line
    while r(eof)==0 {
            
        local new_row ""
        
        *Loop over the colnums preared above
        foreach varcols of local colnums {
            
            *split each colnum pair on ":" and 
            tokenize "`varcols'", parse(":")
            local startcol = `1'
            local value_len = `3' - `1' + 1
            
            *Get the value from the row in the old dataset
            local value = substr("`old_row'",`startcol',`value_len')
            
            *Prepare the new row in csv format by inserting ","
            local new_row "`new_row',`value'"
        }
        
        *Write the new row in csv format tot he file
        display "old row: `old_row'"
        display "new row: `new_row'"
        display ""
        file write `csv_data' "`new_row'" _n
        
        *Read next line in the old data
        local linenum = `linenum' + 1
        file read `raw_data' old_row
    }   
    
    *Close both files
    file close `raw_data'
    file close `csv_data'
    
    *Import the csv file
    import delimited "`csv_data_file'", clear
    drop v1