I am in the market for a new vehicle. Instead of repeatedly searching the dealerships' websites, I thought this would be an interesting and fun opportunity to learn a little Node and MongoDB, so I'm scraping my local dealerships' websites to grab the makes and models that I am interested in.
The problem that I am running into is that Node won't terminate after my final callback has run through.
var cheerio = require('cheerio');
var request = require('request');
var db = require('mongodb');
var S = require('string');
var log = require('console').log;
var async = require('async');
// Shared scraper state: detail-page URLs collected so far, and the listing URL.
var links = [];
var website = 'http://www.yahoo.com';

// Bootstrap: connect to MongoDB first, then fetch the first listing page.
// Every task receives async's completion callback and must invoke it;
// otherwise async.series never advances to the next task or the final callback.
async.series(
[
  // Task 1: open the connection and replace the driver module bound to `db`
  // with the connected database handle that addToDB() writes through.
  function(done){
    log('starting');
    db.connect('mongodb://127.0.0.1:27017/test',
      function(err, base){
        if(err) return done(err);
        db = base;
        done();
      });
  },
  // Task 2: wrapped in a function so the request is issued only after the
  // connection exists (a bare `request(website, start)` in this array would
  // fire while the array literal is being built, before any task runs).
  function(done){
    request(website, function(err, resp, body){
      if(err) return done(err);
      start(err, resp, body);
      // NOTE(review): start() exposes no completion notification, so this
      // task can only report a failed request. Until the scraping pipeline
      // signals when its last write has finished, `done` is not called on
      // success and the final callback below is not reached on that path.
    });
  }
],
function(err){
  // When connecting failed, `db` is still the driver module and has no close().
  if(typeof db.close === 'function'){
    log('closing DB');
    db.close();
  }
  if(err) throw err;
});
/**
 * Handles the response for the first listing page.
 *
 * Pushes one link per `.gbps` match, requests every additional listing page,
 * pushes one link per `.tab` match on each of those, and calls
 * getDetailInfo() once every additional page has answered.
 *
 * @param {?Error} err   error reported by `request`, if any
 * @param {Object} resp  response object (unused)
 * @param {string} body  HTML of the first listing page
 */
function start(err,resp,body){
  if(err){
    // No body to parse and no links to follow: report and stop here.
    log(err);
    return;
  }
  var $ = cheerio.load(body);
  var numPages = 2;
  $('.gbps').each(function(){
    links.push('http://www.yahoo.com');
  });
  var pageURLS = [];
  for (var page = 2; page <= numPages; page++){
    //create URLs for additional pages
    pageURLS.push(website);
  }
  log('getting page URLs');
  // Count outstanding page requests instead of comparing against numPages,
  // so a single-page listing still proceeds to the detail step.
  var pending = pageURLS.length;
  if(pending === 0){
    getDetailInfo();
    return;
  }
  pageURLS.forEach(function(url){
    request(url, function(error, response, bodies){
      if(error){
        // A failed page contributes no links but must still be counted,
        // otherwise the detail step would never start.
        log(error);
      } else {
        var $page = cheerio.load(bodies);
        $page('.tab').each(function(){
          links.push('http://www.yahoo.com');
        });
      }
      pending--;
      if(pending === 0){
        getDetailInfo();
      }
    });
  });
}
/**
 * Logs how many links were collected, then issues one request per link,
 * handing each response to doStuff.
 */
function getDetailInfo(){
  var total = links.length;
  log(total);
  for (var n = 0; n < total; n++) {
    request(links[n], doStuff);
  }
}
/**
 * `request` callback for a single detail page.
 *
 * Logs a failed request and stops; otherwise forwards the response to
 * parseDetailResponse with addToDB as the consumer of the parsed record.
 *
 * @param {?Error} err       error reported by `request`, if any
 * @param {Object} response  response object
 * @param {string} body      HTML of the detail page
 */
function doStuff(err, response, body){
  if(err){
    log(err);
    // A failed request has no body; parsing it would only store an empty record.
    return;
  }
  parseDetailResponse(err,response,body, addToDB);
}
/**
 * Builds the record for one detail page and passes it to `callback`.
 *
 * The record is a five-element array: picture URL, description, price,
 * one {name, value} pair per child of `.specifications`, and the
 * make-and-model value. All values except the number of pairs are
 * fixed placeholders in this version.
 *
 * @param {?Error} err         unused
 * @param {Object} resp        unused
 * @param {string} body        HTML of the detail page
 * @param {Function} callback  receives the record array
 */
function parseDetailResponse(err,resp,body,callback){
  log('parsing');
  var $ = cheerio.load(body);
  var specRows = $('.specifications').children();
  var specs = specRows.map(function(){
    return { name: 'key', value: 'value' };
  });
  callback(['picture url', 'vehicle description', 100, specs, 'makeAndModel']);
}
/**
 * Maps the `.gbps` elements of a page to {name, value} pairs by position.
 *
 * Positions 0-2 are named year, make and model; position 3 keeps its text
 * under the name 'ignore'; any later position becomes the constant pair
 * {name: 'ignore', value: 'ignore'}.
 *
 * @param {string} stuff  HTML to parse
 * @returns the result of cheerio's `.map()` over the `.gbps` matches
 */
function getMakeAndModel(stuff){
  var $ = cheerio.load(stuff);
  // Name for each position whose text is kept.
  var fieldNames = ['year', 'make', 'model', 'ignore'];
  // Returned directly; the previous `temp = ...` assignment created a global.
  return $('.gbps').map(function(i){
    if (i < fieldNames.length) {
      return { name: fieldNames[i], value: $(this).text() };
    }
    return { name: 'ignore', value: 'ignore' };
  });
}
/**
 * Upserts one vehicle record into the `carsTest` collection, keyed by VIN.
 *
 * @param {Array} arr  [picture, description, price, specs, makeAndModel];
 *                     specs and makeAndModel are array-likes of
 *                     {name, value} pairs that are flattened into one lookup
 *                     object before the document is built.
 */
function addToDB(arr){
  // Copies pairs back-to-front, so for duplicate names the earliest pair wins.
  // Entries without a name (e.g. the characters of a placeholder string) are skipped.
  function copyPairs(target, pairs){
    if (!pairs) return;
    for (var i = pairs.length - 1; i >= 0; i--) {
      var pair = pairs[i];
      if (pair && pair.name != null) {
        target[pair.name] = pair.value;
      }
    }
  }

  log('adding to DB');
  // Declared locally; these were previously leaked as globals.
  var pic = arr[0];
  var description = arr[1];
  var price = arr[2];
  var specs = arr[3];
  var makeAndModel = arr[4];

  var obj = {};
  copyPairs(obj, specs);
  copyPairs(obj, makeAndModel);

  // Store null rather than NaN when the stock number is absent or not numeric.
  var stockNumber = null;
  if (obj['Stock Number'] != null) {
    stockNumber = S(obj['Stock Number']).toInt();
    if (isNaN(stockNumber)) {
      stockNumber = null;
    }
  }

  db.collection('carsTest').update(
    {VIN: obj.VIN},
    {
      $set: {
        VIN: obj.VIN,
        make: obj.make,
        model: obj.model,
        year: obj.year,
        price: price,
        engine: obj.Engine,
        interior: obj.Interior,
        exterior: obj.Exterior,
        'model code': obj['Model Code'],
        'stock number': stockNumber,
        transmission: obj.Transmission,
        mileage: obj.Mileage ? obj.Mileage : 0,
        description: description,
        picture: pic
      }
    },
    {upsert: true, safe: true},
    function(err){
      if(err){
        throw err;
      }
      // Logged only once the write has been acknowledged.
      log('finished with this one!');
    });
}
I've omitted and changed a fair amount to turn this into a proof of concept, without much error checking, but even this version adds the document and then won't quit. Node just sits there, waiting for something to happen, and it never calls the final callback to close the DB and exit.
> db.carsTest.find().pretty()
{
"_id" : ObjectId("52139aa7c9b7a39e0f1eb61d"),
"VIN" : null,
"description" : "vehicle description",
"engine" : null,
"exterior" : null,
"interior" : null,
"make" : null,
"mileage" : 0,
"model" : null,
"model code" : null,
"picture" : "picture url",
"price" : 100,
"stock number" : NaN,
"transmission" : null,
"year" : null
}