Search code examples
javascript node.js mongodb response cheerio

Upload Node Request-Response Response To MongoDB


I'm playing around with Cheerio in Node. I have a scraper that goes to a list of articles, grabs all the article URLs, and then goes to each article and scrapes the title and URL. Everything works fine, except that when I try to upsert the results to my MongoDB, I get undefined.

I'm assuming it's trying to upsert before the values are defined... But even using request-promise I can't get it working. Any help would be greatly appreciated! Since the code isn't too long, I'll just paste the whole thing so it's easier to see what I'm trying to do. Again, the main issue is getting upsertArticle to actually upsert the variables.

const request = require('request');
const cheerio = require('cheerio');
const rp = require('request-promise');
const mongoose = require('mongoose');
const Article = require('./models/article');

// Scraper script (as posted in the question): loads an index page, collects
// article links into `urls`, then requests every article and tries to upsert
// its title/url through upsertArticle(). The upsert receives undefined values;
// the notes below mark where the scraped values are lost.
var urls = [];
//get the list of articles to scrape
rp('https://www.somesite.com/', function(error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        // Each matched title element contributes the href of its child element(s).
        $('.c-entry-box--compact__title').each(function(i, element) {
            var a = $(this);
            urls.push(a.children().attr('href'));
        });     }  })
    //scrape over each article individually
    // NOTE: `getStuff` is never read; the links travel through the shared `urls` array.
    .then(function(getStuff) {
        var arrayLength = urls.length;
        //get the list of articles to scrape and upsert each one
        for (var i = 0; i < arrayLength; i++) {
            // NOTE: `result` is bound to the promise produced by the whole
            // rp(...).then(...).catch(...) chain, not to any scraped data.
            const result = rp(urls[i], function(error, response, html) {
                if (!error && response.statusCode == 200) {
                    var $ = cheerio.load(html);
                    // NOTE: `parsedResults` is declared but never used.
                    var parsedResults = [];
                    $('.l-main-content').each(function(n, element) {
                        var a = $(this);
                        var title = a.find('.c-page-title').text();
                        var url = response.request.uri.href;
                        //I also tried upserting the variables right here, that didn't work
                        // NOTE: this object is returned from the .each() callback only;
                        // it is not returned from the rp callback and is never assigned
                        // to `result` or passed to the .then() handler below.
                        return { title, url };
                    });
                } else {console.log(error);}
            }).then(function(upsertStuff) {
                    //also tried returning and upserting stuff here... but nothing gets upserted
                    // NOTE: `upsertStuff` is unused. `result` is the promise from above, so
                    // `result.title` and `result.url` are not the scraped values; both are
                    // expected to be undefined here.
                    upsertArticle({
                        title: result.title,
                        source: result.url,
                        dateCrawled: new Date()
                    });
                    console.log('Upserted ' + result.title);
                }).catch(function(err) {console.log(err);   });     }
    })  .catch(function(err) {console.log(err); });

/**
 * Insert or update one Article document, matched by its title.
 *
 * Fire-and-forget: the function returns nothing, so callers cannot tell when
 * (or whether) the write has finished.
 *
 * @param {Object} userObj - Fields to store; `userObj.title` is also used as
 *   the lookup key. Callers in this file pass `title`, `source` and `dateCrawled`.
 */
function upsertArticle(userObj) {
    const DB_URL = 'mongodb://localhost/articles';
    // Connect only when readyState is 0 (presumably "disconnected" - confirm
    // against the Mongoose docs for the version in use).
    if (mongoose.connection.readyState == 0) {
        mongoose.connect(DB_URL, {
            useMongoClient: true
        });
    }
    // Documents are matched on title alone.
    let conditions = {
        title: userObj.title
    };
    let options = {
        upsert: true,
        new: true,
        setDefaultsOnInsert: true
    };
    // NOTE: the error is thrown from inside the query callback, so a caller's
    // try/catch or promise .catch() around upsertArticle() will not see it.
    Article.findOneAndUpdate(conditions, userObj, options, (err, result) => {
        if (err) throw err;
    });
}

Solution

  • I made a few changes to the code provided. Namely, I am using promises instead of callbacks for your logic to stay consistent and to ensure that everything is running when it should be.

    For the for loop, I moved the upsertArticle({...}) back to being inside of the each function, so that the title and url are defined when it runs.

    Lastly, I'm using Bluebird's Promise.all (request-promise has a dependency on Bluebird already) to signal when all of the links have been upserted. This change is optional, but I think it will be useful to get feedback when everything is finished:

    Give this a try:

    const request = require('request');
    const cheerio = require('cheerio');
    const rp = require('request-promise');
    const mongoose = require('mongoose');
    const Article = require('./models/article');
    const Promise = require("bluebird");
    
    // Scrape flow:
    //   1. Fetch the index page and collect the article links.
    //   2. Fetch every article, and upsert one record per '.l-main-content' block.
    //   3. Log "Done upserting!" only after every upsert has settled.
    // Any failure (request, unexpected status, or database write, when
    // upsertArticle returns a promise) ends up in the final .catch().
    rp({uri: 'https://www.somesite.com',  resolveWithFullResponse: true}).then(function(response) {

        // Throw a real Error (not a string) so the log includes a stack trace.
        if (response.statusCode !== 200) throw new Error("Response: " + response.statusCode);

        const $ = cheerio.load(response.body);
        const urls = [];

        $('.c-entry-box--compact__title').each(function(i, element) {
            const href = $(element).children().attr('href');
            // Skip title boxes without a link; an undefined uri cannot be requested.
            if (href) urls.push(href);
        });

        // Hand the links to the next step instead of sharing a module-level array.
        return urls;

    }).then(function(urls) {

        const promiseArray = [];

        for (const articleUrl of urls) {

            const p = rp({uri: articleUrl,  resolveWithFullResponse: true}).then(function(response) {

                if (response.statusCode !== 200) throw new Error("Response: " + response.statusCode);

                const $ = cheerio.load(response.body);
                const url = response.request.uri.href;
                const upserts = [];

                $('.l-main-content').each(function(n, element) {

                    const title = $(element).find('.c-page-title').text();

                    // Promise.resolve() lets this work whether upsertArticle returns
                    // a promise or nothing; the log line runs once the call has settled.
                    const upsert = Promise.resolve(upsertArticle({
                        title: title,
                        source: url,
                        dateCrawled: new Date()
                    })).then(function() {
                        console.log('Upserted ' + title);
                    });

                    upserts.push(upsert);
                });

                // This page is finished only when all of its upserts are.
                return Promise.all(upserts);
            });

            promiseArray.push(p);
        }

        return Promise.all(promiseArray);

    }).then(function() {
        console.log("Done upserting!");
    })
    .catch(function(err) {
        console.log(err);
    });
    
    /**
     * Insert or update one Article document, matched by its title.
     *
     * Opens the Mongoose connection on first use. Returns the query's promise so
     * callers can wait for the write and handle failures in their own chain; a
     * caller that ignores the return value still triggers the upsert.
     *
     * @param {Object} userObj - Fields to store; `userObj.title` is the lookup key.
     * @returns {Promise} Resolves with the stored document (`new: true`), or
     *   rejects with the database error.
     */
    function upsertArticle(userObj) {
        const DB_URL = 'mongodb://localhost/articles';
        // readyState 0 means "disconnected"; connect lazily, and only once.
        if (mongoose.connection.readyState === 0) {
            mongoose.connect(DB_URL, {
                useMongoClient: true
            });
        }
        const conditions = {
            title: userObj.title
        };
        const options = {
            upsert: true,
            new: true,
            setDefaultsOnInsert: true
        };
        // No callback here: throwing inside a query callback cannot be caught by
        // the caller. Returning the promise from exec() reports errors as rejections.
        return Article.findOneAndUpdate(conditions, userObj, options).exec();
    }
    

    I'm unable to test the code without knowing the true value of https://www.somesite.com, so let me know if the code gives you any new errors.