2

I'm trying to parse webpages generated by js with qtwebkit, I found an example of how to get page source:

import sys
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
class Render(QWebPage):
  def __init__(self, url):
       self.app = QApplication(sys.argv)
       QWebPage.__init__(self)
       self.loadFinished.connect(self._loadFinished)
       self.mainFrame().load(QUrl(url))
       self.app.exec_()

  def _loadFinished(self, result):
       self.frame = self.mainFrame()
       self.app.quit()
url = 'http://www.thesite.gov/search'
r = Render(url)
html = r.frame.toHtml()

But i don't know how to make it work in threads. So, how to do this and if it's not possible - is there another fast way to get wepages generated by js?

1 Answer 1

3

Given QT's async nature, the QtWebkit methods are non-blocking as well, so there is no point running them in threads. You can start them parallelly like this:

from functools import partial

from PySide.QtCore import QUrl
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebView, QWebSettings


TARGET_URLS = (
    'http://stackoverflow.com',
    'http://github.com',
    'http://bitbucket.org',
    'http://news.ycombinator.com',
    'http://slashdot.org',
    'http://www.reddit.com',
    'http://www.dzone.com',
    'http://www.ideone.com',
    'http://jsfiddle.net',
)


class Crawler(object):

    def __init__(self, app):
        self.app = app
        self.results = dict()
        self.browsers = dict()

    def _load_finished(self, browser_id, ok):
        print ok, browser_id
        web_view, _flag = self.browsers[browser_id]
        self.browsers[browser_id] = (web_view, True)

        frame = web_view.page().mainFrame()
        self.results[frame.url()] = frame.toHtml()

        web_view.loadFinished.disconnect()
        web_view.stop()

        if all([closed for bid, closed in self.browsers.values()]):
            print 'all finished'
            self.app.quit()

    def start(self, urls):
        for browser_id, url in enumerate(urls):
            web_view = QWebView()
            web_view.settings().setAttribute(QWebSettings.AutoLoadImages,
                                             False)
            loaded = partial(self._load_finished, browser_id)
            web_view.loadFinished.connect(loaded)
            web_view.load(QUrl(url))
            self.browsers[browser_id] = (web_view, False)


if __name__ == '__main__':
    app = QApplication([])
    crawler = Crawler(app)
    crawler.start(TARGET_URLS)
    app.exec_()
    print 'got:', crawler.results.keys()
Sign up to request clarification or add additional context in comments.

8 Comments

How to parse web page(html +js) with PySide or PtQt? These codes seems that can't parse JS
@user1179442: It can be enabled through: QWebView().settings().setAttribute(QWebSettings.JavascriptEnabled, False)
Thanks for your info. But the 2nd argument should be "True" :)
@user1179442 indeed, sorry about that :) (copy/paste)
I've another question that sometimes the result looks didn't parsed all js. Is it due to asynchronous ?
|

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.