Search code examples
javascript · parsing · for-loop · timeout

Why does my for loop mess up all the parameters?


I am trying to parse some data from several web pages using javascript. I wrote a small parser for this purpose. The algorithm looks like this:

  1. Open first URL from my .csv file
  2. Find the data I need on the page
  3. Save URL and data to a json file

My code executes 1. and 2. perfectly but sometimes messes up with number 3. Output looks like this:

URL 1 + data from URL 1  (correct line)
URL 2 + data from URL 2  (correct line)
URL 3 + data from URL 3  (correct line)
URL 4 + data from URL 4  (correct line)
URL 6(wrong URL) + data from another URL 
URL 5(wrong URL) + data from another URL
URL 7 + data from URL 7  (correct line)
URL 8 + data from URL 8  (correct line)
URL 9 + data from URL 9  (correct line)

I assume the problem is that some pages load way too long, which messes up the whole process. But I still don't understand why it sometimes saves the wrong data. Here's my code:

var request = require('request');
var cheerio = require('cheerio');
var cloudscraper = require('cloudscraper');
var fs = require('fs');
var path = require('path');
var csvjson = require('csvjson');

//First, we read .csv file with our URL list
function getTheList() {
    // NOTE(review): urlList, end and logs are created as implicit globals here
    // (no var/let/const); logs is deliberately shared with grabTheData below.
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
    var options = {
      delimiter : ';', // optional
      quote     : '"' // optional
     };
    // csvjson.toObject turns the raw CSV text into an array of row objects
    urlList = csvjson.toObject(urlList, options);
    end = urlList.length;
    logs = [];

//here we start the loop reading and saving data from each url
  for (let p = 0; p < end; p += 1){
    // each call only schedules work; the actual request fires after 2000*p ms
    grabTheData(urlList, p)
  }
} 

//this code extracts the data from the page and saves it to a json file
// Schedules one delayed request for row p of urlList, scrapes the page and
// appends { url, dataINeed } to the shared global `logs` array.
function grabTheData(urlList, p){
    setTimeout(function() {

    // Fixed: the original read `url[p].ItemLink`, but `url` is undefined at
    // this point — the row comes from urlList. Note that `url` is still an
    // implicit GLOBAL here, which is exactly what lets a slow response's
    // callback log against a URL that a later iteration has overwritten.
    url = urlList[p].ItemLink;
    cloudscraper.get(url, function(err, res, body){
        if (err) { 
            // NOTE(review): `other`, `colors` and `callback` are not defined
            // in this snippet — presumably they exist in the real script.
            console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
            callback();

        } else {

            var $ = cheerio.load(body);
            /*
            here are the lines which extract the data I need
            dataIneed = ...;
            */

            // push into the shared global log, then rewrite the whole file
            logs.push({
                url, dataINeed
            });
            fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
        }
    });
//here I set a 2 seconds delay between each URL
    }, 2000 * p);
  }

// Entry point: read the CSV and schedule one request per row.
getTheList()

Solution

  • The reason this is happening is that `url` is an implicit global shared by every scheduled callback: by the time a slow response's callback runs, a later iteration of the loop may already have overwritten `url`, so the result gets logged against the wrong URL.

    Now there is a very quick fix for this: simply change the scope of the url variable like so:

    // Schedules one delayed request for row p; block-scoping `url` gives each
    // timeout callback its own binding, so slow responses can no longer be
    // logged against a later iteration's URL.
    function grabTheData(urlList, p){
        setTimeout(function() { 
            // Set scope of url variable to block. It must be read from
            // urlList — the previous `let url = url[p].ItemLink` was a
            // self-reference inside the temporal dead zone and would throw
            // a ReferenceError on every call.
            let url = urlList[p].ItemLink;
            cloudscraper.get(url, function(err, res, body){
                if (err) { 
                    console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                    callback();
    
                } else {
    
                    var $ = cheerio.load(body);
                    /*
                    here are the lines which extract the data I need
                    dataIneed = ...;
                    */
    
                    logs.push({
                        url, dataINeed
                    });
                    fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
                }
            });
    //here I set a 2 seconds delay between each URL
        }, 2000 * p);
    }
    

    This should keep your results in order.

    Here's another (IMHO much better) option, using promises and avoiding the use of setTimeout to separate calls. This should avoid any potential race condition, since the Promise.all call will preserve order:

    // Reads the CSV of URLs, scrapes every page in parallel and writes the
    // collected results to new_logs.json. Promise.all preserves input order,
    // so the output lines always match their URLs.
    async function getTheList() {
        // Keep everything function-local: the original assigned urlList
        // without a declaration, leaking it as an implicit global.
        let urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
        const options = {
            delimiter : ';', // optional
            quote     : '"' // optional
        };
        urlList = csvjson.toObject(urlList, options);
        // One promise per row; failures are absorbed inside grabTheDataUpdated.
        const promiseList = urlList.map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
        const logs = await Promise.all(promiseList);
        fs.writeFileSync('./new_logs.json', JSON.stringify(logs, null, 4));
    }
    
    // Wrap the callback-style cloudscraper.get in a Promise. The url is
    // echoed back with the response so callers can correlate each result
    // with the page it came from.
    function getCloudScraperData(url) {
        return new Promise(function (resolve, reject) {
            cloudscraper.get(url, function (err, res, body) {
                if (err) {
                    reject(err);
                    return;
                }
                resolve({ url: url, res: res, body: body });
            });
        });
    }
    
    // Placeholder extractor: will eventually run cheerio over `body`; for
    // now it returns a deterministic mock string derived from the url.
    function getDataINeed(url, body) {
        return 'data from ' + url;
    }
    
    // Fetch one page and reduce it to { url, dataINeed }. Errors are folded
    // into the dataINeed field rather than rethrown, so a single bad page
    // can never reject the whole Promise.all batch in getTheList.
    async function grabTheDataUpdated(url) {
        try {
            const { body } = await getCloudScraperData(url);
            const dataINeed = getDataINeed(url, body);
            return { url, dataINeed };
        } catch (error) {
            return { url, dataINeed: "Error occurred: " + error.message };
        }
    }