I am attempting to create a web scraper (in node.js) that will pull down information from a site, and write it to a file. I have it built to correctly work for one page, but when I try to use the function in a for loop, to iterate through multiple games, I get bad data in all of the games.
I understand that this is related to Javascript's asynchronous nature, and I have read about callback functions, but I'm not sure I understand how to apply it to my code. Any help would be GREATLY appreciated:
for(x = 4648; x < 4650; x++){ //iterate over a few gameIDs, used in URL for request
scrapeGame(x);
}
function scrapeGame(gameId){
//request from URL, scrape HTML to arrays as necessary
//write final array to file
}
Essentially, what I am looking to do, is within the for loop, tell it to WAIT to finish the scrapeGame(x) function before incrementing x and running it for the next game -- otherwise, the arrays start to overwrite each other and the data becomes a huge mess.
EDIT: I've now included the full code which I am attempting to run! I'm getting errors when looking in the files after they are written. For example, the first file is 8kb, second is ~16, 3rd is ~32, etc. It seems things aren't getting cleared before running the next game.
Idea of the program is to pull Jeopardy questions/answers from the archive site in order to eventually build a quiz app for myself.
//Iterate over arbitrary number of games, scrape each
for(x = 4648; x < 4650; x++){
scrapeGame(x, function(scrapeResult) {
if(scrapeResult){
console.log('Scrape Successful');
} else {
console.log('Scrape ERROR');
}
});
}
function scrapeGame(gameId, callback){
var request = require('request');
cheerio = require('cheerio');
fs = require('fs');
categories = [];
categorylist = [];
ids = [];
clues = [];
values = ['0','$200','$400','$600','$800','$1000','$400','$800','$1200','$1600','$2000'];
valuelist = [];
answers = [];
array = [];
file = [];
status = false;
var showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId;
var showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;
request(showGameURL, function(err, resp, body){
if(!err && resp.statusCode === 200){
var $ = cheerio.load(body);
//add a row to categories to avoid starting at 0
categories.push('Category List');
//pull all categories to use for later
$('td.category_name').each(function(){
var category = $(this).text();
categories.push(category);
});
//pull all clue IDs (coordinates), store to 1d array
//pull any id that has "stuck" in the string, to prevent duplicates
$("[id*='stuck']").each(function(){
var id = $(this).attr('id');
id = id.toString();
id = id.substring(0, id.length - 6);
ids.push(id);
//if single J, pick category 1-6
if (id.indexOf("_J_") !== -1){
var catid = id.charAt(7);
categorylist.push(categories[catid]);
var valId = id.charAt(9);
valuelist.push(values[valId]);
}
//if double J, pick category 7-12
else if (id.indexOf("_DJ_") !== -1){
var catid = parseInt(id.charAt(8)) + 6;
categorylist.push(categories[catid]);
var valId = parseInt(id.charAt(10)) + 5;
valuelist.push(values[valId]);
}
//if final J, pick category 13
else {
categorylist.push(categories[13]);
}
});
//pull all clue texts, store to 1d array
$('td.clue_text').each(function(){
var clue = $(this).text();
clues.push(clue);
});
//push pulled values to big array
array.push(ids);
array.push(categorylist);
array.push(valuelist);
array.push(clues);
//new request to different URL to pull responses
request(showAnswerURL, function(err, resp, body){
if(!err && resp.statusCode === 200){
var $ = cheerio.load(body);
$('.correct_response').each(function(){
var answer = $(this).text();
answers.push(answer);
});
//push answers to big array
array.push(answers);
//combine arrays into 1-d array to prep for writing to file
for(var i = 0; i < array[0].length; i++){
var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
var stringPrint = print.toString();
file.push(stringPrint);
}
//update string, add newlines, etc.
var stringFile = JSON.stringify(file);
stringFile = stringFile.split('\\').join('');
stringFile = stringFile.split('","').join('\n');
//write to file, eventually will append to end of one big file
fs.writeFile('J_GAME_' + gameId +'.txt', stringFile, function(err) {
if(err) {
console.log(err);
} else {
console.log("Game #" + gameId + " has been scraped.");
status = true;
}
});
}
});
}
});
//clear arrays used
valuelist = [];
answers = [];
categories = [];
categorylist = [];
ids = [];
clues = [];
array = [];
file = [];
//feed callback status
callback(status);
}