I want an array of URLs from an HTML string, although only from the following tags: `<link href="...">` and `<script src="...">`.
I would like these urls so I can put them into an appcache manifest file. I use an appcache manifest builder, but it only analyzes static files that I am serving locally. It is working great, but it doesn't automatically include the external static js/css files that I am including in my html.
I would like to be able to parse the HTML string using Node.js.
You can use cheerio. It is an implementation of core jQuery for node.
For example:
// Fetch a page and print the URLs referenced by its <link href> and
// <script src> attributes.
var cheerio = require('cheerio'),
    request = require('request');

request('http://www.stackoverflow.com', function (error, response, body) {
  // Report failures explicitly instead of exiting with no output at all.
  if (error) {
    console.error('request failed:', error);
    return;
  }
  if (response.statusCode !== 200) {
    console.error('unexpected status code: ' + response.statusCode);
    return;
  }

  var $ = cheerio.load(body);

  // cheerio's .map() skips callbacks that return undefined/null, so tags
  // without the attribute (e.g. inline <script> blocks) are left out;
  // .get() then converts the cheerio collection into a plain array.
  var linkHrefs = $('link').map(function (i, el) {
    return $(el).attr('href');
  }).get();

  var scriptSrcs = $('script').map(function (i, el) {
    return $(el).attr('src');
  }).get();

  console.log("links:");
  console.log(linkHrefs);
  console.log("scripts:");
  console.log(scriptSrcs);
});
Outputs:
Victors-MacBook-Pro:a kohl$ node test.js
links:
[ '//cdn.sstatic.net/stackoverflow/img/favicon.ico?v=6cd6089ee7f6',
'//cdn.sstatic.net/stackoverflow/img/apple-touch-icon.png?v=41f6e13ade69',
'/opensearch.xml',
'//cdn.sstatic.net/stackoverflow/all.css?v=317033db9646',
'/feeds' ]
scripts:
[ '//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js',
'//cdn.sstatic.net/Js/stub.en.js?v=e3a448574e16' ]