I'm currently working on a project that involves querying yahoo-finance for many different ticker symbols. The bottleneck is acquiring the data from yahoo, so I was wondering if there is a way I might go about speeding this up.
If I used multiple machines to query and then aggregated the data, would that help? I only have one physical machine; how might I go about doing that?
Thanks!
EDIT: Currently, I'm using Node.js, yahoo-finance, and Q.deferred to ask yahoo for historical data. Then, once all the promises are fulfilled (for each ticker), I'm doing a Q.all() to persist the data.
var data = [];
tickers = ["goog", "aapl", ...];
...
Q.all(_.map(tickers, function(symbol) {
return getYahooPromise(symbol);
}))
.done( function() { persistData(data) });
getYahooPromise retrieves data for the ticker symbol and pushes it into the data array. Once all promises are resolved, the data is persisted in a MySQL database.
SECOND EDIT: More code:
var sequentialCalls = [];
for ( var i = 0; i < tickers.length / chunkSize; i++ ) {
sequentialCalls.push( persistYahooChunk );
}
sequentialCalls.push( function(callback) {
connection.end();
callback();
});
async.series( sequentialCalls )
exports.persistYahooChunk = function(callback) {
console.log("Starting yahoo query");
var currentTickers = tickers.slice(currentTickerIndex,currentTickerIndex + chunkSize);
return yahooFinance.historical( {
symbols: currentTickers,
from: "2015-01-28",
to: "2015-02-05"
}).then( function(result) {
console.log("Query " + currentTickerIndex + "/" + tickers.length + "completed");
currentTickerIndex += chunkSize;
//add valid data
var toPersist = _.map(result, function(quotes, symbol) {
return [symbol, quotes.length != 0 ];
});
var query = "INSERT INTO `ticker` (`symbol`, `valid`) VALUES ?";
connection.query(query, [toPersist], function(err, result) {
if (err) {
console.log (err);
}
//console.log(result);
callback();
});
});
}
The bottleneck is because you are doing one query per ticker.
Depending on the data you need to pull, if you could do a single query that includes all your tickers it would be much faster.
Here is an example if you need to get all current prices for a list of tickers, with a single query :
http://finance.yahoo.com/webservice/v1/symbols/A,B,C,D,E/quote?format=json