Search code examples
python python-3.x pyqt pyqt5 qwebengineview

How to get html of a page loaded in QWebEngineView


I am trying to get the HTML of a page loaded in a PyQt5 QWebEngineView. Here is a simple example:

import sys
from PyQt5.QtCore import *
from PyQt5.QtWebEngineWidgets import *
from PyQt5.QtWidgets import *


def callback_function(html):
    """Print the value that ``runJavaScript`` passes to this callback."""
    print(html)


def on_load_finished():
    """Run a script in the loaded page and pass its result to ``callback_function``."""

    # NOTE: this script evaluates to a DOM element rather than a string, so
    # the callback does not receive the page markup.
    web.page().runJavaScript("document.getElementsByTagName('html')[0]", callback_function)


# Build the application and a web view, start loading the page, and show the view.
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
# Call on_load_finished when the view signals that loading has finished.
web.loadFinished.connect(on_load_finished)

# Run the Qt event loop and use its return code as the process exit status.
sys.exit(app.exec_())

I was hoping to be able to return the HTML from the runJavaScript() call, but I get a blank in the callback function.

What is incorrect in my code and what alternatives are available for obtaining HTML of a page?


Solution

  • Using my old answer written in C++ and translating the solution to Python:

    import sys
    from PyQt5.QtCore import QUrl
    from PyQt5.QtWebEngineWidgets import QWebEngineView
    from PyQt5.QtWidgets import QApplication
    
    
    def callback_function(html):
        """Print the value that ``runJavaScript`` passes to this callback."""
        print(html)
    
    
    def on_load_finished(ok=True):
        """Request the page's HTML once the view has finished loading.

        Intended as the slot for ``loadFinished``, which passes a bool telling
        whether the load succeeded.

        Args:
            ok: ``False`` when the page failed to load. Defaults to ``True`` so
                the function can still be called without arguments.
        """
        if not ok:
            # A failed load leaves no real document to dump (the view may be
            # showing an error page), so report the failure instead.
            print("Page failed to load", file=sys.stderr)
            return
        # outerHTML is a plain string, so it can be handed back to Python.
        web.page().runJavaScript("document.documentElement.outerHTML", callback_function)
    
    
    # Build the application and the web view, then start loading the page.
    app = QApplication(sys.argv)
    web = QWebEngineView()
    web.load(QUrl("https://stackoverflow.com"))
    web.show()
    web.resize(640, 480)
    # Fetch the HTML only after the view signals that loading has finished.
    web.loadFinished.connect(on_load_finished)
    
    # Run the Qt event loop and use its return code as the process exit status.
    sys.exit(app.exec_())
    

    Update:

    The problem in your case is that getElementsByTagName() returns a list of JS elements, and an element taken from that list cannot be exported to Python. What you should do instead is get its innerHTML, which is a string:

    import sys
    from PyQt5.QtCore import QUrl
    from PyQt5.QtWebEngineWidgets import QWebEngineView
    from PyQt5.QtWidgets import QApplication
    
    
    def callback_function(html):
        """Print the value that ``runJavaScript`` passes to this callback."""
        print(html)
    
    
    def on_load_finished(ok=True):
        """Request the page's HTML once the view has finished loading.

        Intended as the slot for ``loadFinished``, which passes a bool telling
        whether the load succeeded.

        Args:
            ok: ``False`` when the page failed to load. Defaults to ``True`` so
                the function can still be called without arguments.
        """
        if not ok:
            # A failed load leaves no real document to dump (the view may be
            # showing an error page), so report the failure instead.
            print("Page failed to load", file=sys.stderr)
            return
        # Ask for the element's innerHTML (a string) rather than the element
        # itself, which cannot be handed back to Python.
        web.page().runJavaScript(
            "document.getElementsByTagName('html')[0].innerHTML", callback_function
        )
        # or document.getElementsByTagName('html')[0].outerHTML
        # (outerHTML also includes the <html> tag itself)
    
    
    # Build the application and the web view, then start loading the page.
    app = QApplication(sys.argv)
    web = QWebEngineView()
    web.load(QUrl("https://stackoverflow.com"))
    web.show()
    web.resize(640, 480)
    # Fetch the HTML only after the view signals that loading has finished.
    web.loadFinished.connect(on_load_finished)
    
    # Run the Qt event loop and use its return code as the process exit status.
    sys.exit(app.exec_())