0

I am in the market for a new vehicle. Instead of repeatedly searching the dealerships' websites, I thought this would be an interesting and fun opportunity to learn a little Node and MongoDB, so I'm scraping my local dealerships' websites to grab the makes and models that I am interested in.

The problem that I am running into is that node won't terminate after my final callback has run through.

var cheerio = require('cheerio');
var request = require('request');
var db = require('mongodb');
var S = require('string');
var log = require('console').log;
var async = require('async');

// Accumulates the detail-page URLs discovered while scraping; start() pushes into it.
var links = [];
// URL that the scrape begins from.
var website = 'http://www.yahoo.com'; 

// Intended flow: connect to MongoDB, scrape, then close the connection in the
// final callback once every task has reported completion.
async.series(
    [
        // Task 1: opens the MongoDB connection. This function declares no
        // callback parameter and never signals completion to async.series.
        function(){
            log('starting');
            db.connect('mongodb://127.0.0.1:27017/test',
                function(err, base){
                    if(err) throw err;
                    // Rebinds the module-level `db` (initially the mongodb
                    // module) to the connection handle used by addToDB().
                    db = base;
                });
        },
        // NOTE(review): request() is called right here, while the array literal
        // is being built, so this array element is request()'s return value
        // rather than a task function.
        request(website, start)
    ],
        // Final callback: meant to run after all tasks have finished.
        function(){
            log('closing DB');
            db.close();
    });

/**
 * Handles the response for the first listing page.
 *
 * Records one placeholder link per `.gbps` element of the first page, then
 * requests every additional listing page, records one placeholder link per
 * `.tab` element of each, and calls getDetailInfo() when the page counter
 * equals the expected number of pages.
 *
 * @param {*} err  Error reported by request (not inspected here).
 * @param {*} resp Response object (unused).
 * @param {*} body Page markup, handed to cheerio.load.
 */
function start(err,resp,body){
    var PLACEHOLDER_LINK = 'http://www.yahoo.com';
    var numPages = 2;
    var pagesSeen = 1; // the first page is the one already delivered in `body`
    var remainingPages = [];
    var pageNo;

    // Pushes one placeholder link for every element matching `selector`.
    function collectLinks($, selector){
        $(selector).each(function(){
            links.push(PLACEHOLDER_LINK);
        });
    }

    // Runs once per additional page; the last one to arrive triggers the detail scrape.
    function onPageFetched(error, response, bodies){
        pagesSeen += 1;
        collectLinks(cheerio.load(bodies), '.tab');
        if (pagesSeen == numPages){
            getDetailInfo();
        }
    }

    collectLinks(cheerio.load(body), '.gbps');

    // Build the URLs for pages 2..numPages.
    for (pageNo = 2; pageNo <= numPages; pageNo += 1){
        remainingPages.push(website);
    }

    log('getting page URLs');
    remainingPages.forEach(function(url){
        request(url, onPageFetched);
    });
}

/**
 * Logs how many links have been collected and issues one request per link,
 * handing each response to doStuff.
 */
function getDetailInfo(){
    var total = links.length;
    var idx;

    log(total);
    for (idx = 0; idx < total; idx += 1){
        request(links[idx], doStuff);
    }
}

/**
 * Request callback for one detail page. Logs the error when there is one and,
 * in every case, forwards the response to parseDetailResponse with addToDB as
 * the continuation.
 *
 * @param {*} err      Error reported by request, if any.
 * @param {*} response Response object, passed through untouched.
 * @param {*} body     Page markup, passed through untouched.
 */
function doStuff(err, response, body){
    var persist = addToDB;

    if (err) { log(err); }
    parseDetailResponse(err, response, body, persist);
}

/**
 * Parses one detail page and passes the extracted record to `callback`.
 *
 * The record is a five-element array: picture URL, description, price, the
 * spec entries (one `{name, value}` pair per child of `.specifications`) and
 * the make-and-model value. In this trimmed-down version every field is a
 * fixed placeholder.
 *
 * @param {*} err  Error reported by request (not inspected here).
 * @param {*} resp Response object (unused).
 * @param {*} body Page markup, handed to cheerio.load.
 * @param {function(Array)} callback Receives the record array.
 */
function parseDetailResponse(err,resp,body,callback){
    log('parsing');

    var $ = cheerio.load(body);
    var specNodes = $('.specifications').children();
    var specs = specNodes.map(function(){
        return { name: 'key', value: 'value' };
    });

    callback(['picture url', 'vehicle description', 100, specs, 'makeAndModel']);
}

/**
 * Maps the `.gbps` elements of a page to `{name, value}` pairs.
 *
 * The first three elements are labelled 'year', 'make' and 'model'; the
 * fourth is labelled 'ignore' but keeps its text; every later element becomes
 * `{name: 'ignore', value: 'ignore'}`.
 *
 * @param {*} stuff Page markup, handed to cheerio.load.
 * @returns {*} Whatever the `.map()` call on the cheerio selection returns.
 */
function getMakeAndModel(stuff){
    // Label for each positional element whose text is kept.
    var FIELD_NAMES = ['year', 'make', 'model', 'ignore'];
    var $ = cheerio.load(stuff);

    // Returned directly: the original stored this in an undeclared `temp`,
    // which leaked a global and throws in strict-mode / ES-module code.
    return $('.gbps').map(function(i){
        if (i >= FIELD_NAMES.length){
            return { name: 'ignore', value: 'ignore' };
        }
        return { name: FIELD_NAMES[i], value: $(this).text() };
    });
}

/**
 * Upserts one vehicle record into the `carsTest` collection, keyed by VIN.
 *
 * @param {Array} arr Record produced by parseDetailResponse:
 *   [picture, description, price, specs, makeAndModel], where `specs` and
 *   `makeAndModel` are indexable lists of `{name, value}` pairs.
 */
function addToDB(arr){
    log('adding to DB');

    // Declared locally: the original assigned these without `var`, creating
    // globals shared by every concurrent call (and a ReferenceError in strict mode).
    var pic = arr[0];
    var description = arr[1];
    var price = arr[2];
    var specs = arr[3];
    var makeAndModel = arr[4];

    // Flatten the {name, value} pairs into one lookup object. Both lists are
    // walked back to front, so on duplicate names the earliest entry wins, and
    // makeAndModel entries override spec entries of the same name.
    var obj = {};
    var i;
    for (i = specs.length - 1; i >= 0; i--) {
        obj[specs[i].name] = specs[i].value;
    }
    for (i = makeAndModel.length - 1; i >= 0; i--) {
        obj[makeAndModel[i].name] = makeAndModel[i].value;
    }

    db.collection('carsTest').update(
        {VIN: obj.VIN},
        {
            $set: {
                VIN: obj.VIN,
                make: obj.make,
                model: obj.model,
                year: obj.year,
                price: price,
                engine: obj.Engine,
                interior: obj.Interior,
                exterior: obj.Exterior,
                'model code': obj['Model Code'],
                'stock number': S(obj['Stock Number']).toInt(),
                transmission: obj.Transmission,
                mileage: obj.Mileage ? obj.Mileage : 0,
                description: description,
                picture: pic
            }
        },
        {upsert: true, safe: true},
        function(err,result){
            if(err){
                throw err;
            }
        });

    // Logged as soon as the update has been issued.
    // NOTE(review): presumably this runs before the driver reports completion - confirm.
    log('finished with this one!');
}

I've omitted and changed a fair amount to make this a proof of concept, without much error checking, but even this stripped-down version adds the document and then won't quit. Node just sits there, waiting for something to happen; it never calls the final callback to close the DB and exit.

> db.carsTest.find().pretty()
{
    "_id" : ObjectId("52139aa7c9b7a39e0f1eb61d"),
    "VIN" : null,
    "description" : "vehicle description",
    "engine" : null,
    "exterior" : null,
    "interior" : null,
    "make" : null,
    "mileage" : 0,
    "model" : null,
    "model code" : null,
    "picture" : "picture url",
    "price" : 100,
    "stock number" : NaN,
    "transmission" : null,
    "year" : null
}

1 Answer 1

2

I think that you misunderstand how async.series works.

The functions you pass to async.series don't take a callback as an argument, and they never call one. Also, request(...) is a function call, so what ends up in the array is its return value, which is probably not a function at all. That's probably why the series never completes. Try this:

async.series(
    [
        // Each task receives a completion callback and must call it when done.
        function connectToDatabase(done) { // <--- the task now accepts the callback
            log('starting');
            db.connect('mongodb://127.0.0.1:27017/test', function(err, connection){
                if (err) {
                    throw err;
                }
                db = connection;
                done(); // <--- signal that this task has finished
            });
        },
        // The request is wrapped in a task function instead of being called
        // while the array is built.
        function fetchFirstPage(done) { // <--- a real task function with a callback
            request(website, function(err, resp, body) {
                // start() is responsible for calling `done` once all scraping is finished.
                start(err, resp, body, done);
            });
        }
    ],
    // Final callback: releases the database connection.
    function closeDatabase(){
        log('closing DB');
        db.close();
    }
);

Note that I've added a callback argument when calling start. You will therefore have to refactor your code heavily so that every function accepts a callback, which it calls at the end, once it knows that all of its jobs are done. For example, you can use async.parallel inside start, and the function may look like this:

/**
 * Sketch of a callback-aware start(): fetches every page in `pageURLS` in
 * parallel and calls `callback` once all of them have been handled.
 *
 * @param {*} err  Error from the first request; forwarded to `callback` when set.
 * @param {*} resp Response object of the first request.
 * @param {*} body Markup of the first page.
 * @param {function(*=)} callback Completion callback; receives an error when
 *   the first request or any page request failed.
 */
function start(err, resp, body, callback) {
    // A failed first request leaves nothing to parse: report it instead of carrying on.
    if (err) {
        return callback(err);
    }
    // some stuff happens here
    var jobs = pageURLS.map(function(url) {
        return function(clb) {
            request(url, function(error, response, bodies) {
                // Hand failures to async.parallel rather than silently dropping them.
                if (error) {
                    return clb(error);
                }
                // some stuff
                clb(); // <--- this refers to the local callback for the job
            });
        };
    });
    async.parallel(jobs, function(jobError) {
        // all jobs are done (or one of them failed), let's finalize everything
        callback(jobError);
    });
}
Sign up to request clarification or add additional context in comments.

1 Comment

Definitely misunderstood how async works. Sorry it took me a few days to get back to you but as it's a side project, I've not had a lot of time to work on it.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.