I'm new to Node and to web scraping in general, but with some research I've written this working code.
My goal is to get all questions from the first X pages of Stack Overflow's question list, sorted by votes, and save that data locally to use in another project for plotting charts and other analysis.
How can I improve it, in both readability and performance?
Did I get any convention or language/technology nuance wrong?
const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;
const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';
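// Example of a generated URL: https://stackoverflow.com/questions?page=1&sort=votes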
if (CLUSTER.isMaster) {
    let nextPage = 1;
    // Spawn one worker per CPU core, each starting at its own page.
    for (let i = 0; i < CORES; i++) {
        CLUSTER.fork({ startingPoint: nextPage });
        nextPage += QUANTITY_OF_PAGES_PER_WORKER;
    }
    CLUSTER.on('online', (worker) => {
        console.log(`Worker ${worker.process.pid} is now working.`);
    });
    CLUSTER.on('exit', (worker, code, signal) => {
        if (code !== 0) {
            // A non-zero exit code is the page the worker should restart from,
            // so restarting only works for pages 1-255 (the valid exit-code range).
            console.log(`Worker ${worker.process.pid} died. Restarting.`);
            CLUSTER.fork({ startingPoint: code });
        } else {
            // Scrape the next batch of pages.
            console.log(`Worker ${worker.process.pid} finished its work successfully.`);
            if (nextPage <= LAST_PAGE_TO_SCRAPE) {
                CLUSTER.fork({ startingPoint: nextPage });
                nextPage += QUANTITY_OF_PAGES_PER_WORKER;
            }
        }
    });
} else {
    // Worker: scrape QUANTITY_OF_PAGES_PER_WORKER pages, starting at startingPoint.
    const workerStartingPoint = parseInt(process.env.startingPoint, 10);
    let pagesWritten = 0;
    for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++) {
        REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html) {
            if (error) {
                // The exit code tells the master where this worker should start again.
                process.exit(workerStartingPoint);
            }
            const $ = CHEERIO.load(html);
            JSONFRAME($); // register the jsonframe plugin on this Cheerio instance
            // jsonframe "frame": _s selects each list item, _d describes the fields to extract.
            const frame = {
                questions: {
                    _s: "#questions .question-summary",
                    _d: [{
                        "votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
                        "answers": ".statscontainer .stats .status strong",
                        "views": ".statscontainer .views",
                        "title": ".summary h3 a",
                        "tags": [".summary .tags a"],
                        "url": ".question-hyperlink @ href",
                        "user": {
                            "name": ".summary .started .user-info .user-details a",
                            "profile-link": ".summary .started .user-info .user-details a @ href",
                            "reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
                        },
                        "date asked": ".summary .started .user-info .user-action-time .relativetime"
                    }]
                }
            };
            const questions = $('body').scrape(frame, { string: true });
            FS.writeFile('page-' + i + '.json', questions, function (error) {
                if (error) {
                    process.exit(workerStartingPoint);
                }
                pagesWritten += 1;
                if (pagesWritten === QUANTITY_OF_PAGES_PER_WORKER) {
                    // Only exit cleanly once every page assigned to this worker is saved.
                    process.exit(0);
                }
            });
        });
    }
}
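For context, here's roughly how I plan to load the saved files in the other project. It's only a sketch, assuming the page-N.json files this script writes and the frame above (the plotting part is left out):

const FS = require('fs');

let allQuestions = [];
for (let page = 1; page <= 10; page++) { // 10 matches LAST_PAGE_TO_SCRAPE above
    const data = JSON.parse(FS.readFileSync('page-' + page + '.json', 'utf8'));
    allQuestions = allQuestions.concat(data.questions);
}
console.log('Loaded ' + allQuestions.length + ' questions.');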
If you're going to run this, don't set LAST_PAGE_TO_SCRAPE too high, because your IP will get temporarily blocked by SO.
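If you do want more pages, one idea is to space the requests out instead of firing them all at once. A minimal sketch (requestWithDelay is just a hypothetical helper, and the 2000 ms delay is an arbitrary guess; SO doesn't document its limits):

// Hypothetical helper: staggers requests instead of sending them all at once.
function requestWithDelay(url, index, callback) {
    const DELAY_MS = 2000; // arbitrary guess, not a documented SO limit
    setTimeout(function () {
        REQUEST(url, callback);
    }, index * DELAY_MS);
}

Inside the worker loop, REQUEST(BASE_URL + i + URL_ORDERING, ...) would then become requestWithDelay(BASE_URL + i + URL_ORDERING, i - workerStartingPoint, ...).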