I finally figured out how callbacks work in node.js, but I'm trying now to get my code to execute in order.
The goal is to (in order):
- Load the URL into cheerio
- Parse through each
<td>in the<tbody>on the page. - Once text elements are loaded into the data array, callback.
- Call loopThroughData on the full Data array.
- Loop through the data array and call the lookForPlayer array on each one, which:
- Runs a SELECT in my db that matches the player name passed from the text element, and if there is no match in my db, INSERT them (I have it just printing to the console for now for testing purposes).
The end goal is to go through every page (there is a separate URL for each date, so I am looping through the dates) and INSERT players that aren't in my database ONCE. The problem is that it goes through each SELECT before the INSERT queries are executed, so it's inserting them multiple times.
Here is the page I'm parsing, if it helps: http://www.basketball-reference.com/friv/dailyleaders.cgi?month=12&day=29&year=2014
Here is my code:
function loadPage (url, callback){
request(url, function(err, response, body){
if(!err && response.statusCode ==200){
var $ = cheerio.load(body);
rowsRemaining = $.length;
$('td', 'tbody').each(function(){
var text = $(this).text();
data.push(text);
rowsRemaining -= 1;
console.log('rows left: ',rowsRemaining);
});
}
if (rowsRemaining == 0){
console.log('$ length: ',$.length);
callback(data);
}
});
}
function loopThroughData (data, callback){
for(i=1;i<data.length;i+=26){
lookForPlayer(data[i].replace("'",""),function(name){
/* var insertPlayer = connection.query(
'INSERT INTO player (provider_id, team_id, position_id, name) VALUES (1, (SELECT id FROM team WHERE slug = "'+data[i+1]+'"),1,"'+name+'");',function(err,result,fields){
}); */
console.log('i is currently = ',i);
});
}
callback();
}
function lookForPlayer(name, callback){
console.log('Looking for Player...');
var selectPlayer = connection.query(
"SELECT * FROM player WHERE name = '"+name+"'", function(err, rows, fields){
if(err) throw err;
if(rows.length==0){
callback(name);
}
});
}
//loop through every day since the season started
for (d = seasonStart; d <= Date.now(); d.setDate(d.getDate() + 1)){
console.log('d = ',d);
loadPage(baseURL+(d.getMonth()+1)+'&day='+d.getDate()+'&year='+d.getFullYear(),function(data){
console.log('Page loaded...');
loopThroughData(data,function(){
});
});
}
As you can see, I tried adding a rowsRemaining variable that is meant to make sure I've parsed the whole file before calling the callback in the loadPage function, but it never gets to that point. Note that I initialize a lot of these variables before these functions (rowsRemaining, data, etc).
It also seems to loop through every date before fully loading, parsing, and INSERTing the first page, which it should not be doing.
Here is the updated code based off of @Brant's answer
function loadPage (url, callback){
request(url, function(err, response, body){
if(!err && response.statusCode ==200){
var $ = cheerio.load(body);
console.log(url);
$('td', 'tbody').each(function(){
var text = $(this).text();
data.push(text);
});
}
callback(data);
});
}
function loopThroughData (data, callback){
for(i=1;i<data.length;i+=26){
lookForPlayer(data[i].replace("'",""),function(name){
var insertPlayer = connection.query(
'INSERT INTO player (provider_id, team_id, position_id, name) VALUES (1, (SELECT id FROM team WHERE slug = "'+data[i+1]+'"),1,"'+name+'");',function(err,result,fields){
});
});
}
callback(data);
}
function lookForPlayer(name, callback){
var selectPlayer = connection.query(
"SELECT * FROM player WHERE name = '"+name+"'", function(err, rows, fields){
if(err) throw err;
if(rows.length==0){
console.log(name,' was not found in DB!');
callback(name);
}
});
}
//loop through every day since the season started
for (d = seasonStart; d <= Date.now(); d.setDate(d.getDate() + 1)){
validDatesArr.push(d);
}
async.eachSeries(validDatesArr,
function(validDatesArr, callback){
loadPage(baseURL+'/month='+validDatesArr.getMonth()+1+'&day='+validDatesArr.getDate()+'&year='+validDatesArr.getFullYear(),function(data){
loopThroughData(data, function(){
callback();
});
});
}, function(err){
if(!err){
console.log('We processed each date requests one by one');
}
}
);
So now it's loading the pages one by one, but it isn't executing the INSERT function in the loopThroughData function on that data. I would think I would just add another function to the async list, but this particular one is calling a function as opposed to using an anonymous one.