Search code examples
python · multithreading · web-scraping · pyqt · qwebengineview

Why does my PyQt code not execute totally when multithreading?


I'm trying to write a web scraper using PyQt5 and multithreading so that I can scrape multiple URLs in parallel. (I'm aware of the existing question "Scrape multiple urls using QWebPage", but I really want to write a parallel version and can't see why mine doesn't work.) I've written this code:

python
import sys
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *

from PyQt5.QtWebEngineWidgets import QWebEnginePage

import time

# URL handed to each Worker that MainWindow creates.
urlb = "https://www.google.fr/"


class Worker(QRunnable, QWebEnginePage):
    '''
    Thread-pool task that tries to load one URL in a QWebEnginePage.

    Inherits from both QRunnable (so it can be passed to
    QThreadPool.start) and QWebEnginePage (so it can load the page itself).
    __init__ only calls super().__init__() and stores the URL; the explicit
    QWebEnginePage.__init__ call is made later, inside run().
    '''
    def __init__(self, url):
        # url: address this worker is supposed to load; kept on the instance.
        super(Worker, self).__init__()
        self.url = url
    
    def _on_load_finished(self):
        # Connected to loadFinished in run(); asks the page for its HTML and
        # passes self.Callable as the callback that receives it.
        print("tfouuu")
        # NOTE(review): the HTML arrives through the callback; self.html is
        # also assigned whatever toHtml() itself returns (presumably None) --
        # confirm this assignment is intended.
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        # Callback given to toHtml(); stores the delivered HTML string.
        self.html = html_str
    
    @pyqtSlot()
    def run(self):
        # Entry point of the QRunnable; the print calls are progress markers.
        print("a") 
        # Blocks the calling thread for two seconds before doing any work.
        time.sleep(2)
        print(self.url)
        print("b")
        # The web-page base class is initialised here rather than in __init__.
        # NOTE(review): run() is presumably executed on a pool thread, not the
        # GUI thread -- confirm the web engine classes may be created there.
        QWebEnginePage.__init__(self)
        print("c")
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        # NOTE(review): this reads the bare name `url`, not `self.url`; the
        # visible code defines no module-level `url` (the global is `urlb`),
        # so unless a star import provides one this looks like a NameError.
        self.load(QUrl(url))
        print("d")

class MainWindow(QMainWindow):
    """Main window whose constructor creates a thread pool and starts two Workers."""


    def __init__(self, *args, **kwargs):
        # *args / **kwargs are forwarded unchanged to QMainWindow.__init__.
        
        # NOTE(review): the pool is created and stored on self before
        # super().__init__() has been called -- confirm this order is intended.
        self.threadpool = QThreadPool()
        print("Multithreading with maximum %d threads" % self.threadpool.maxThreadCount())
        
        super(MainWindow, self).__init__(*args, **kwargs)
        
        # Two workers are built for the same module-level URL and handed to
        # the pool; only the local names `worker` / `worker2` refer to them here.
        worker = Worker(urlb)
        worker2 = Worker(urlb)
        self.threadpool.start(worker)
        self.threadpool.start(worker2)


    
    
# Create the application object, build the main window (its constructor
# starts the two workers), then enter the Qt event loop.
app = QApplication([])
window = MainWindow()
# NOTE(review): nothing in the visible code calls quit() on the application,
# so exec_() presumably never returns on its own -- confirm.
app.exec_()

But I have 2 problems:

  • The first is that my code keeps running without ever stopping (I guess this is because there is no app.quit() call, but I don't really know where to put it).

  • The second, and more important, problem is that my code prints only 'a', 'b', 'c' — it never runs the connect and load part.


Solution

  • QWebEngineView cannot and should not run on another thread.

    Instead, if you want to get the HTML asynchronously, you should use Qt signals:

    from functools import partial
    from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
    
    
    class WebManager(QtCore.QObject):
        """Load several URLs through QWebEnginePage objects and collect their HTML.

        Every call to ``load`` creates one page that is tracked in ``pages``
        until it has either delivered its HTML or reported a failed load.
        Successful pages contribute a ``(url, html)`` tuple to ``results``.
        When the last tracked page is retired, ``QApplication.quit()`` is
        called so that a running event loop can return.
        """

        def __init__(self, parent=None):
            """Create an empty manager; ``parent`` is forwarded to QObject."""
            super(WebManager, self).__init__(parent)
            self.pages = []    # pages that have not finished yet
            self.results = []  # (requested url, html) per successful page

        def load(self, url):
            """Start loading ``url`` in a new page owned by this manager."""
            page = QtWebEngineWidgets.QWebEnginePage(self)
            page.loadFinished.connect(self._on_load_finished)
            # Track the page before starting the load so it counts as pending.
            self.pages.append(page)
            page.load(QtCore.QUrl(url))

        @QtCore.pyqtSlot(bool)
        def _on_load_finished(self, ok):
            """Request the HTML of the page that finished, or retire it on failure."""
            page = self.sender()
            if not isinstance(page, QtWebEngineWidgets.QWebEnginePage):
                return
            if ok:
                # toHtml only passes the HTML to its callback, so bind the
                # page in front of it to know which page the HTML belongs to.
                page.toHtml(partial(self.callable, page))
            else:
                # A failed page must also count as finished; otherwise a
                # failure of the last pending page would never trigger quit().
                self._retire(page)

        def callable(self, page, html):
            """Record ``html`` for ``page`` and retire the page."""
            # Read the URL before the page is scheduled for deletion.
            url = page.requestedUrl().toString()
            self.results.append((url, html))
            self._retire(page)

        def _retire(self, page):
            """Stop tracking ``page``, schedule its deletion, quit when none remain."""
            # Tolerate a page that was already retired instead of raising.
            if page in self.pages:
                self.pages.remove(page)
                page.deleteLater()
            if not self.pages:
                QtWidgets.QApplication.quit()
    
    
    if __name__ == "__main__":
        import sys

        app = QtWidgets.QApplication(sys.argv)

        manager = WebManager()

        # Queue one documentation page for every name exported by
        # QtWebEngineWidgets that starts with "Q".
        format_url = "http://pyqt.sourceforge.net/Docs/PyQt5/%s.html"
        for name in dir(QtWebEngineWidgets):
            if name.startswith("Q"):
                manager.load(format_url % name.lower())

        # Run the event loop; the results are printed only after it returns.
        app.exec_()
        for url, html in manager.results:
            print(url)
            print(html)