0

How can I get the latest page data (HTML & JavaScript variables) from PhantomJS?

e.g. page.refresh() or something?

I have an interval that checks a variable (on the page) every 200ms. However, this variable and the page content aren't shown to have changed over time (even though I know they have).

So I need an efficient way to check the value of a JS variable every 200ms or so,

then once I've discovered that variable has changed value, I want to request the latest page HTML.

How can I do this?

/**
 * Minimal error record carried in the `error` slot of a DTO.
 * NOTE: declaring this name replaces the built-in `Error` binding for the
 * rest of this script.
 *
 * @param {*} description Text explaining what went wrong.
 */
var Error = function (description) {
    var record = this;
    record.description = description;
    return record;
};

/**
 * Result envelope that gets serialised to JSON for the calling process.
 *
 * @param {*} status  Outcome flag supplied by the caller.
 * @param {*} content Payload supplied by the caller.
 * @param {*} error   Error record supplied by the caller.
 */
var DTO = function (status, content, error) {
    var envelope = this;
    envelope.status = status;
    envelope.content = content;
    envelope.error = error;
    return envelope;
};

/**
 * Prints the given object as a single JSON line on the console and then
 * asks PhantomJS to exit.
 *
 * @param {*} dto Value to serialise with JSON.stringify.
 */
function outputAndExit(dto) {
    var serialized = JSON.stringify(dto);
    console.log(serialized);
    phantom.exit();
}

// For any uncaught exception, just log it out for .NET to capture.
// The individual fields are comma-separated so the resulting description
// stays readable (previously they were concatenated with no delimiter).
// NOTE(review): PhantomJS also exposes `phantom.onError` for errors raised in
// the controller script itself — confirm `window.onerror` fires as expected.
window.onerror = function (errorMsg, url, lineNumber) {
    var description = 'window.onerror caught an error: ' +
        'errorMsg: ' + errorMsg +
        ', url: ' + url +
        ', lineNumber: ' + lineNumber;
    outputAndExit(new DTO(false, null, new Error(description)));
};

/**
 * Factory: constructs a GetDynamicPageResult, runs its initialize() and
 * returns the initialised instance.
 */
var GetDynamicPageResult__ = function () {
    var instance = new GetDynamicPageResult();
    instance.initialize();
    return instance;
};

/**
 * Opens the URL passed on the command line in a PhantomJS page, polls the
 * page's `window.isContentReadyForCrawler` flag on an interval, and reports a
 * DTO (page HTML on success, an error otherwise) through outputAndExit.
 */
var GetDynamicPageResult = function () {
    var self = this;

    /**
     * Resets all state, loads the PhantomJS modules, reads the command-line
     * arguments and, if a URL was supplied, starts loading the page.
     */
    this.initialize = function () {

        this.error = null;
        this.isContentReadyForCrawler = false;

        this.ticker = null;
        this.tickerInterval = 150;  // delay passed to setInterval
        this.tickerElapsed = 0;     // sum of tickerInterval over all ticks so far
        this.tickerTimeout = 7000;  // give up once tickerElapsed exceeds this

        this.url = '';

        this.loadDependencies();
        // A failure has already been reported when processArgs returns false;
        // do not go on to open a page without a URL.
        if (this.processArgs() === false) {
            return;
        }

        this.openPage();

    };

    /** Loads the `system`, `webpage` and `fs` modules onto this instance. */
    this.loadDependencies = function () {
        this.system = require('system');
        this.page = require('webpage').create();
        // NOTE(review): no URL has been opened on the page at this point, so
        // this injection may not survive the later page.open() navigation, and
        // the file name has no ".js" extension — confirm this call is effective.
        this.page.injectJs('jquery-1.10.2.min');
        this.fs = require('fs');
    };

    /**
     * Reads the target URL from the command-line arguments.
     *
     * @returns {boolean} true when a URL was found; false when it is missing
     *     (in which case a failure DTO has been reported).
     */
    this.processArgs = function () {
        // system.args[0] is the name of this script, so the URL requires at
        // least two entries.
        if (this.system.args.length < 2) {
            outputAndExit(new DTO(false, null, new Error('No arguments given')));
            return false;
        }
        this.url = this.system.args[1];
        return true;
    };

    /**
     * Refreshes `isContentReadyForCrawler` from the page's window object.
     * The callback given to page.evaluate runs inside the page, where the
     * outer `self` is not reachable, so the value is returned from the
     * callback and assigned here instead of inside it.
     */
    this.updateIsContentReadyForCrawler = function () {
        var pageFlag = self.page.evaluate(function () {
            return window.isContentReadyForCrawler;
        });
        self.isContentReadyForCrawler = !!pageFlag;
    };

    /** Opens the URL and starts polling once the page has loaded successfully. */
    this.openPage = function () {
        self.page.open(self.url, function (status) { //NB: status = 'success' || 'fail'
            if (status !== 'success') {
                outputAndExit(new DTO(false, null, new Error('page.open received a non-success status')));
                // The failure has been reported; do not start polling.
                return;
            }
            self.initTicker();
        });
    };

    /** Starts the polling interval and remembers its handle in `ticker`. */
    this.initTicker = function () {
        self.ticker = setInterval(self.handleTick, self.tickerInterval);
    };

    /**
     * One polling step: re-reads the readiness flag and either finishes with
     * the current page content, finishes with a timeout error, or waits for
     * the next tick.
     */
    this.handleTick = function () {
        self.tickerElapsed += self.tickerInterval;
        self.updateIsContentReadyForCrawler();

        if (self.isContentReadyForCrawler) {
            clearInterval(self.ticker);
            self.finish(true, self.page.content, null);
            return;
        }

        if (self.tickerElapsed > self.tickerTimeout) {
            clearInterval(self.ticker);
            self.finish(false, null, new Error('Too much time elapsed'));
        }
    };

    /**
     * Reports the final result.
     *
     * @param {boolean} status Whether the content was obtained.
     * @param {?string} content Page HTML; falsy values are replaced by ''.
     * @param {?Object} error Error record; falsy values are replaced by {}.
     */
    this.finish = function (status, content, error) {
        content = content || '';
        error = error || {};
        outputAndExit(new DTO(status, content, error));
    };
};

/**********************************************************************************/
/***************************** Helpers *****************************/
/**********************************************************************************/

/**
 * Factory: constructs a Utility, runs its initialize() and returns the
 * initialised instance.
 */
var Utility__ = function () {
    var helper = new Utility();
    helper.initialize();
    return helper;
};

/**
 * Small collection of emptiness checks.
 */
var Utility = function () {
    var self = this;

    /** No setup is required; kept so the factory can call it. */
    this.initialize = function () {
    };

    /**
     * @param {*} obj Value to test.
     * @returns {boolean} true when obj is null or undefined.
     */
    this.isEmpty = function (obj) {
        return obj === undefined || obj === null;
    };

    /**
     * @param {*} str Value to test; non-strings are converted with String().
     * @returns {boolean} true when str is null, undefined, or contains only
     *     whitespace once converted to a string.
     */
    this.isStringEmpty = function (str) {
        // Must go through self.isEmpty: a local variable named `isEmpty`
        // would hide the method and is not callable.
        if (self.isEmpty(str)) {
            return true;
        }
        // Native trim avoids depending on a global `$` being present.
        return String(str).trim() === '';
    };
};

// Script entry point: the factory constructs the object and calls its
// initialize(), so obtaining the instance is what starts the run.
var getDynamicPageResult = GetDynamicPageResult__();
3
  • Can you show the code you have, running on interval, that is not working. Commented Nov 11, 2013 at 2:36
  • Added it :-), self.page.evaluate part Commented Nov 11, 2013 at 2:40
  • My apologies everyone. It wasn't working for me because my script wasn't retrieving the page properly Commented Nov 11, 2013 at 5:56

1 Answer 1

2

I think you are almost there: you need to be using page.evaluate(), but currently only use it to get window.isContentReadyForCrawler. You need to use page.evaluate() to grab the latest HTML too.

I'm going to shamelessly paste in code from another answer (https://stackoverflow.com/a/12044474/841830):

// Ask the page for its current markup: the <html> element's outerHTML when
// such an element is found, otherwise the body's innerHTML.
var html = page.evaluate(function () {
    var htmlElements = document.getElementsByTagName("html");
    var rootElement = htmlElements[0];
    if (rootElement) {
        return rootElement.outerHTML;
    }
    return document.body.innerHTML;
});
Sign up to request clarification or add additional context in comments.

1 Comment

Thanks Darren! Sorry I should have mentioned, the "window.isContentReadyForCrawler" bool is never updated!! It just keeps its initial value

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.