Search code examples
javascriptangularjscsvencodingsheetjs

Filereader read file using correct encoding when read as readAsArrayBuffer


I am working on reading .csv /xlsx file uploaded using javaScript and get the result as array containing each row . I was able to read the file and get data using FileReader and SheetJs with following code.

// code for the new excel reader
$scope.do_file =  function(files)
{
    $scope.fileContent  = [];
    var X = XLSX;
    var global_wb;
    var f = files[0];
    var reader = new FileReader();
    reader.onload = function(e)
    {
        var data = e.target.result;console.log(data);
        global_wb = X.read(data, {type: 'array'});
        var output = "";
        var result = {};
        global_wb.SheetNames.forEach(function(sheetName) {
            var roa = X.utils.sheet_to_json(global_wb.Sheets[sheetName], {header:1});
            if(roa.length) result[sheetName] = roa;
        });
        $scope.fileContent =  result["Sheet1"];
        if(!result["Sheet1"])
        {
            $scope.fileContent =  result["contacts"].filter(function(el) { return typeof el != "object" || Array.isArray(el) || Object.keys(el).length > 0; });
        }
    };
    reader.readAsArrayBuffer(f);
};

For reading most of the files the code works , but when file containing Hebrew text with Windows-1255 encoding i get gibberish data.

enter image description here

Looking in for more options i tried to read the file as text using reader.readAsText and change the encoding as necessary , check the following code:

function is_Hebrew(data)
{
    var position = data.search(/[\u0590-\u05FF]/);
    return position >= 0;
}

 $scope.do_file =  function(files)
 {
    var fullResult = [];
    var file =files[0];
      var reader = new FileReader();
        reader.onload = function(e){
            var data = e.target.result;
                if(!is_Hebrew(data.toString()))
                {
                  reader.readAsText(file,'ISO-8859-8');   
                }
            };
        reader.readAsText(file);
        reader.onloadend = function(){
            var lines = reader.result.split('\r\n');
            console.log(lines);
            lines.forEach(element => {
                var cell = element.split(',');
                fullResult.push(cell);
            });

             console.log(reader);
        };
    };

but the above code is not suitable as it does not read the file as each row identifying each cell. if any one of cell contains string with coma separated value (for example if a cell contains a string value such as "25,28,29" ) the array output gives wrong data as it considers each values as each cell.

So i decided to stick with first method but i am not able to change the encoding .Is there a possible way to change encoding in the first code where i have used the readAsArrayBuffer to read the file data ?


Solution

  • After going through lot of possible solutions i found that answer to the above question was to combine the above two methods. The first method for reading the xlsx files and second method for reading csv files. Also i have used an additional javaScript library called papaparse in the second method to solve the problem of reading data in each cell

    $scope.is_Hebrew = function($data){
    var position = $data.search(/[\u0590-\u05FF]/);
    return position >= 0;
    }
    
    // code for the new excel reader
    $scope.do_file =  function(files)
    {
        var config = {
        delimiter: "",  // auto-detect
        newline: "",    // auto-detect
        quoteChar: '"',
        escapeChar: '"',
        header: false,
        trimHeader: false,
        dynamicTyping: false,
        preview: 0,
        encoding: "",
        worker: false,
        comments: false,
        step: undefined,
        complete: undefined,
        error: undefined,
        download: false,
        skipEmptyLines: false,
        chunk: undefined,
        fastMode: undefined,
        beforeFirstChunk: undefined,
        withCredentials: undefined
        };
    
        $scope.fileContent  = [];
        var f = files[0];
        var fileExtension = f.name.replace(/^.*\./, '');
        if(fileExtension == 'xlsx')
        {
            var X = XLSX;
            var global_wb;
            var reader = new FileReader();
            reader.onload = function(e)
            {
                var data = e.target.result;
                global_wb = X.read(data, {type: 'array'});
                var result = {};
                global_wb.SheetNames.forEach(function(sheetName) {
                   var roa = X.utils.sheet_to_json(global_wb.Sheets[sheetName], {header:1});
                   if(roa.length) result[sheetName] = roa;
                });
                $scope.fileContent =  result["Sheet1"];
                if(!result["Sheet1"])
                {
                   $scope.fileContent =  result["contacts"].filter(function(el) { return typeof el != "object" || Array.isArray(el) || Object.keys(el).length > 0; });
                }
    
            };
            reader.readAsArrayBuffer(f);
    
        }
        else if(fileExtension == 'csv')
        {
        var reader = new FileReader();
        reader.onload = function(e)
        {
            var data = e.target.result;
            console.log(f);
            console.log($scope.is_Hebrew(data.toString()));
            if(!$scope.is_Hebrew(data.toString()))
            {
               reader.readAsText(f,'ISO-8859-8');   
            }
        };
    
        reader.readAsText(f);
        reader.onloadend = function(e){
            var c =  Papa.parse(reader.result,[ config])
            console.log(c);
            $scope.fileContent =  c["data"].filter(function(el) { return typeof el != "object" || Array.isArray(el) || Object.keys(el).length > 0; });
    
        };
    
        }
        else
        {
           alert("File Not supported!");
        }
    
    $scope.fileContent.push([]);
    };