I am currently trying to write a scraper, using Node.js, that will get all the 'p' tags from within a div inside a Facebook post.
Each post on the page lies within a div that has this class: .text_exposed_root
There are sometimes multiple 'p' tags within each post, so ideally I need to grab all of the HTML text within that div if possible. I am using the cheerio and request modules, and my code so far is below:
// Fetch the page at BTTS and print the combined text of every <p> found
// under an element with the class "text_exposed_root".
request(BTTS, function (err, res, html) {
  // Bail out early on a transport-level failure.
  if (err) {
    console.log("We’ve encountered an error: " + err);
    return;
  }

  var $ = cheerio.load(html);
  var paragraphs = $(".text_exposed_root p");
  console.log(paragraphs.text());
});
I have tried using .text, .value and .html, but they all just return a blank response. I'm guessing I would need to grab all the 'p' tags within that div and convert them to a string, maybe?
Thanks in advance.
EDITED:
var url = 'https://www.facebook.com/BothTeamsToScore';

// Fetch the page and print the text of every <p> inside each post container.
// NOTE: `headers` must already be defined before this call (a browser-like
// User-Agent is expected here).
request({url: url, headers: headers}, function (error, response, body) {
  if (error) {
    console.log("We’ve encountered an error: " + error);
    return;
  }

  // The post markup is delivered wrapped in HTML comments. Remove only the
  // comment delimiters: stripping whole comments (/<!--[\s\S]*?-->/g) would
  // throw away the very markup the selector below is looking for.
  var strippedBody = body.replace(/<!--|-->/g, "");
  console.log(strippedBody); // debug: dump the uncommented HTML

  var $ = cheerio.load(strippedBody);
  var post = $(".text_exposed_root p").text();
  console.log(post);
});
First of all, you're going to need to set some headers with your request. Without them, Facebook will respond with an "unsupported browser" page. That's your first problem.
// Browser-like request headers, so that Facebook serves the real page rather
// than its "unsupported browser" page.
var headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36',
  'Content-Type': 'application/x-www-form-urlencoded'
};
var url = BTTS;

request({ url: url, headers: headers }, function (err, res, html) {
  // Report a failed request and stop.
  if (err) {
    console.log("We’ve encountered an error: " + err);
    return;
  }

  // Drop only the comment delimiters so the markup that was inside the
  // comments becomes ordinary, selectable HTML.
  var uncommented = html.replace(/<!--|-->/g, '');
  var $ = cheerio.load(uncommented);
  console.log($('.text_exposed_root p').text());
});
The other thing to note is that the content arrives inside an HTML comment, i.e.:
<code class="hidden_elem"><!--
...
<div class="text_exposed_root">
<p>text</p>
Cheerio will not parse the contents of comment nodes, so you'll most likely need to remove the <!-- and --> delimiters
and load the result back into cheerio to parse the part of the HTML that you want. Good luck!