Search code examples
pythonpyqt4qwebpage

QWebpage only fetches HTML once, and cannot be invoked again


I have the following code:

from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication

class TextBrowser(QtCore.QObject):
    """Load one URL in a QWebPage and keep the resulting HTML.

    Loading is started from the constructor. When the page emits
    ``loadFinished``, ``get_html`` stores the HTML in ``html_source`` and
    asks the application to quit.
    """

    def __init__(self, url):
        """Store *url*, create the QWebPage and start loading *url* into it."""
        self.some_url = url

        # Stays None until get_html() has run.
        self.html_source = None

        QtCore.QObject.__init__(self)
        self.page = QWebPage()

        # get_html() is invoked when the page signals that loading finished.
        self.page.loadFinished.connect(self.get_html)

        self.page.mainFrame().load(self.some_url)

    def get_html(self):
        """Store the main frame's HTML as UTF-8 bytes, then quit the application."""
        frame = self.page.mainFrame()
        # ``unicode`` is a Python 2 builtin, so this line does not run on Python 3.
        self.html_source = unicode(frame.toHtml()).encode('utf-8')
        QtCore.QCoreApplication.quit()


def get_html_source(some_url):
    """Return the HTML of *some_url* as collected by a TextBrowser.

    Blocks in ``app.exec_()`` until TextBrowser.get_html() calls quit().
    """
    # NOTE(review): a new QApplication is created on every call; this is
    # presumably why a second call crashes -- confirm against the Qt docs.
    app = QApplication([])
    browser = TextBrowser(QtCore.QUrl(some_url))
    app.exec_()
    return browser.html_source

So now, if I run:

# Python 2 print statement: fetch a single page and print its HTML source.
print get_html_source('http://www.google.com')

This works and returns the HTML source of the page http://www.google.com. But if I run another call right after it, like this:

# Two consecutive calls: the first one prints, the second one is reported
# to never run (the process ends with exit code 139).
print get_html_source('http://www.google.com')
print get_html_source('http://www.yahoo.com/')

This executes only once: it outputs Google's HTML source, but after that PyCharm reports "Process finished with exit code 139" and the second call of get_html_source() is never executed.

I need to iterate through a list of URLs and get the source code of each one using QWebPage, but my implementation doesn't work.

Where can I find some information about this, or what am I doing wrong?


Solution

  • Consider the following. The QApplication is created only once, each dialog's exec_ starts an event loop, and two separate pages are loaded one after the other:

    from PyQt4 import QtCore, QtGui
    from PyQt4.QtWebKit import QWebPage
    from PyQt4.QtGui import QApplication
    
    class TextBrowser(QtGui.QDialog):
        """Dialog that loads one URL in a QWebPage and keeps its HTML.

        Loading is started from the constructor. Once the page emits
        ``loadFinished``, the HTML of the main frame is stored in ``html``
        and the dialog closes itself, which ends a pending ``exec_()`` call.
        """

        def __init__(self, url):
            """Create the dialog and start loading *url* (a QUrl)."""
            # Initialise the class we actually derive from (QDialog); the
            # original called QObject.__init__, which did not match the base.
            QtGui.QDialog.__init__(self)
            self.some_url = url

            # Defined up front so reading ``html`` before the load has
            # finished gives None instead of raising AttributeError.
            self.html = None

            self.page = QWebPage()
            # get_html() runs when the page reports that loading finished.
            self.page.loadFinished.connect(self.get_html)
            self.page.mainFrame().load(self.some_url)

        def get_html(self):
            """Store the main frame's HTML in ``html`` and close the dialog."""
            frame = self.page.mainFrame()
            self.html = frame.toHtml()
            self.close()
    
    
    def get_html_source(urls=None):
        """Fetch the HTML of every URL in *urls*, print the list and return it.

        urls: iterable of URL strings. When omitted, the two example URLs
            of the original snippet are used.

        Returns the list of HTML results, one per URL, in input order.
        """
        # Qt allows only one QApplication per process, so reuse an existing
        # one; this keeps repeated calls from constructing a second instance.
        app = QApplication.instance()
        if app is None:
            app = QApplication([])

        if urls is None:
            urls = ['http://www.google.com', 'http://www.yahoo.com/']

        out = []
        for u in urls:
            t = TextBrowser(QtCore.QUrl(u))
            # Blocks until TextBrowser.get_html() closes the dialog.
            t.exec_()
            out.append(t.html)
        print(out)
        return out
    
    # Run the example only when this file is executed as a script.
    if __name__ == "__main__":
        get_html_source()
    

    This program has no means to exit as it stands - I suppose you wanted to do more with the HTML than print it anyway.