Search code examples
python, python-3.x, beautifulsoup, pyqt4

PyQt Class not working for the second usage


I'm using PyQt to fully load a page (including JS) and get its contents using Beautiful Soup. It works fine on the first iteration, but crashes on the second. I don't have much experience with Python, and even less with PyQt, so any help is very welcome.

Class borrowed from here.

from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import sys
import signal


class Render(QWebPage):
    """Web page that loads ``url`` and keeps the resulting markup.

    Constructing an instance blocks: ``__init__`` starts the load and then
    runs the Qt event loop, which returns once ``_finished_loading`` calls
    ``self.app.quit()``. After construction the page source is stored in
    ``self.html`` and the parsed document in ``self.soup``.
    """

    def __init__(self, url):
        """Create the application and page, then load ``url`` and wait."""
        # NOTE: each instance constructs its own QApplication, so creating
        # several Render objects in one process creates several QApplications.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.html = None
        # Install the default SIGINT handler for the process.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        # Old-style signal connection: call _finished_loading on loadFinished.
        self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
        self.mainFrame().load(QUrl(url))
        # Runs the event loop; returns after _finished_loading() quits it.
        self.app.exec_()

    def _finished_loading(self, result):
        """Slot for ``loadFinished(bool)``: store the markup and stop the loop.

        ``result`` is the bool carried by the signal; it is not inspected here.
        """
        self.html = self.mainFrame().toHtml()
        # self.soup is only assigned here, i.e. once a load has finished.
        self.soup = BeautifulSoup(UnicodeDammit(self.html).unicode_markup)
        self.app.quit() 

###################################################################


# Pages to fetch; one Render instance is created per url.
l = [
    "http://www.google.com/?q=a",
    "http://www.google.com/?q=b",
    "http://www.google.com/?q=c",
]

for page in l:
    # The Render object is a temporary: only its parsed soup is kept.
    soup = Render(page).soup
    print("# soup done: " + page)

enter image description here


Solution

  • The example crashes because the Render class attempts to create a new QApplication and event-loop for every url it tries to load.

    Instead, only one QApplication should be created, and the QWebPage subclass should load a new url after each page has been processed, rather than using a for-loop.

    Here's a re-write of the example which should do what you want:

    import sys, signal
    from bs4 import BeautifulSoup
    from bs4.dammit import UnicodeDammit
    from PyQt4 import QtCore, QtGui, QtWebKit
    
    class WebPage(QtWebKit.QWebPage):
        """Load a sequence of urls one after another on a single page.

        Each url is paired with a callback. When a load finishes, the
        callback is called as ``callback(url, html)`` and the next url is
        loaded. Once no urls remain, the application is asked to quit.
        """

        def __init__(self):
            QtWebKit.QWebPage.__init__(self)
            # State for the load in progress; filled in by fetchNext().
            self._items = iter(())
            self._url = None
            self._func = None
            self.mainFrame().loadFinished.connect(self.handleLoadFinished)

        def process(self, items):
            """Start working through ``items``.

            ``items`` is an iterable of ``(url, callback)`` pairs.
            """
            self._items = iter(items)
            if not self.fetchNext():
                # Nothing to load means loadFinished never fires, so the
                # event loop would otherwise run forever. The quit is
                # deferred with a zero-delay timer because process() is
                # normally called before the event loop has been started.
                QtCore.QTimer.singleShot(0, self._finish)

        def fetchNext(self):
            """Begin loading the next url.

            Returns True if a load was started, False if no items remain.
            """
            try:
                self._url, self._func = next(self._items)
            except StopIteration:
                return False
            self.mainFrame().load(QtCore.QUrl(self._url))
            return True

        def handleLoadFinished(self):
            """Pass the loaded page to its callback, then move to the next url."""
            self._func(self._url, self.mainFrame().toHtml())
            if not self.fetchNext():
                self._finish()

        def _finish(self):
            """Report completion and ask the application to quit."""
            print('# processing complete')
            QtGui.qApp.quit()
    
    
    def funcA(url, html):
        """Example callback: report which url is being processed."""
        print('# processing:', url)
        # Placeholder for real work on the page, for example:
        #   soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    
    def funcB(url, html):
        """Example callback: report which url is being processed."""
        print('# processing:', url)
        # Placeholder for real work on the page, for example:
        #   soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    
    if __name__ == '__main__':
        # Install the default SIGINT handler, then tell the user about Ctrl+C.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        print('Press Ctrl+C to quit\n')

        # Each entry pairs a url with the callback that receives its html.
        items = [
            ('http://stackoverflow.com', funcA),
            ('http://google.com', funcB),
        ]

        # A single QApplication and a single page serve every url.
        app = QtGui.QApplication(sys.argv)
        webpage = WebPage()
        webpage.process(items)

        exit_code = app.exec_()
        sys.exit(exit_code)