I am trying to get HTML of a page loaded in PyQT5 QWebEngineView. Here is a simple example:
import sys
from PyQt5.QtCore import *
from PyQt5.QtWebEngineWidgets import *
from PyQt5.QtWidgets import *
def callback_function(html):
print(html)
def on_load_finished():
web.page().runJavaScript("document.getElementsByTagName('html')[0]", callback_function)
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.loadFinished.connect(on_load_finished)
sys.exit(app.exec_())
I was hoping to be able to return html from the runJavaScript() call but i get a blank in the callback function.
What is incorrect in my code and what alternatives are available for obtaining HTML of a page?
Using my old answer written C++ and translating the solution to Python:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def callback_function(html):
print(html)
def on_load_finished():
web.page().runJavaScript("document.documentElement.outerHTML", callback_function)
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)
sys.exit(app.exec_())
Update:
The problem in your case is that getElementsByTagName() returns a list of js elements, and that element cannot be exported to python, what you should do is get the innerHTML:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def callback_function(html):
print(html)
def on_load_finished():
web.page().runJavaScript(
"document.getElementsByTagName('html')[0].innerHTML", callback_function
)
# or document.getElementsByTagName('html')[0].outerHTML
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://stackoverflow.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)
sys.exit(app.exec_())