1

When I parse a static HTML page, my Node.js app works well. However, when the URL points to a JavaScript-generated page, the app doesn't work. How can I scrape a JavaScript-generated web page?

My app.js

var express = require('express'),
  fs = require('fs'),
  request = require('request'),
  cheerio = require('cheerio'),
  app = express();

/**
 * GET /scrape
 *
 * Downloads a fixed page, collects the text of every `.panel-title` element
 * found under <body>, and writes it to `output.json` as `{ "title": ... }`.
 * The HTTP response is sent only after the file write has finished, so a
 * failure is reported to the client instead of being silently ignored.
 *
 * Note: cheerio parses the HTML string it is given and does not run any
 * scripts contained in it.
 */
app.get('/scrape', function( req, res ) {

  // Declared with `var`: the original assigned to an undeclared name,
  // which leaks an implicit global (and throws in strict mode).
  var url = 'http://www.apache.org/';

  request( url, function( error, response, html ) {
    // Guard clause: without a body there is nothing to parse. The original
    // fell through and tried to serialise an undefined `json` value.
    if( error ) {
      console.error( 'Request for ' + url + ' failed:', error );
      res.status(502).send( 'Could not fetch ' + url );
      return;
    }

    var $ = cheerio.load(html);

    // .text() on the matched set concatenates the text of all matches.
    var json = { title : $('body').find('.panel-title').text() };

    fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
      // Only claim success when the write actually succeeded.
      if( err ) {
        console.error( 'Could not write output.json:', err );
        res.status(500).send( 'Could not write output.json' );
        return;
      }

      console.log( 'File successfully written! - Check your project directory for the output.json file' );

      // This app has no UI, so just point the caller at the console.
      res.send( 'Check your console!' );
    });
  });
});

// Start listening on port 8081 and announce it on the console.
app.listen('8081');
console.log('Magic happens on port 8081');

// Expose the Express app to anything that requires this file.
module.exports = app;
exports = module.exports;
0

1 Answer 1

5

Cheerio won't execute the JavaScript on the page, because it is only designed to parse plain HTML.

I'd suggest a different approach using something like PhantomJS: http://phantomjs.org/

Sign up to request clarification or add additional context in comments.

1 Comment

I'm working on a project and am encountering this same issue. I've been looking into phantom.js to help remedy this issue. Anywhere you can point me?

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.