How can I "render" HTML with with PyQt5 v5.6 QWebEngineView?
I have previously performed the task with PyQt5 v5.4.1 QWebPage, but it was suggested to try the newer QWebEngineView.
Here's that implementation (it generally works as expected, but has a tendency to hang indefinitely for some sites and situations):
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebKitWidgets import QWebPage
class Render(QWebPage):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
self.html = self.mainFrame().toHtml()
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
What follows is my attempt at using QWebEngineView. First, the installation and setup of PyQt5 v5.6 on Ubuntu:
# install PyQt5 v5.6 wheel from PyPI
pip3 install --user pyqt5
# link missing resources
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/
Now for the Python... The following results in a segmentation fault:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# what's going on here? how can I get the HTML from toHtml?
self.page().toHtml(self.callable)
self.app.quit()
def callable(self, data):
self.html = data
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
The trouble appears to lie in the call to asynchronous toHtml(). It seems like it should be fairly simple, but I'm at a loss with what to do with it. I see it's been discussed in the context of C++, but I'm not sure how to translate this to Python. How can I get the HTML out?
Quite a bit of discussion on the topic was made in the following thread: https://riverbankcomputing.com/pipermail/pyqt/2015-January/035324.html
The new QWebEngine interface takes account of the fact that the
underlying Chromium engine is asynchronous. As such we have to turn an asynchronous API into a synchronous one.
Here's how that looks:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
The answer by Six & Veehmot is great, but I found out that for my purpose it was not sufficient, as it did not expand the dropdown elements of the page that I wanted to scrape.
A slight modification fixed this:
def render(url):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop,QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, url):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.load(QUrl(url))
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(url).html
print(render(dummy_url))
As you pointed out, Qt5.4 relies on async calls. It's not necessary to use the Loop (as seen on your answer), since your only mistake was to call quit before the toHtml call finishes.
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# This is an async call, you need to wait for this
# to be called before closing the app
self.page().toHtml(self.callable)
def callable(self, data):
self.html = data
# Data has been stored, it's safe to quit the app
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
It's not entirely clear to me what you mean by "render". I understand it to mean, "display the HTML accordingly on the screen." The following does just that.
# main.py
import sys
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class Browser(QtWebEngineWidgets.QWebEngineView):
def __init__(self):
super().__init__()
html = """
<!DOCTYPE html>
<html>
<head>
<title>Example</title>
<meta charset="utf-8" />
</head>
<body>
<script>alert('Running some Javascript');</script>
<h1>Hello world!</h1>
<p>Goodbye, cruel world...</p>
</body>
</html>
"""
# With QWebEnginePage.setHtml, the html is loaded immediately.
# baseUrl is used to resolve relative URLs in the document.
# For whatever reason, it seems like the baseUrl resolves to
# the parent of the path, not the baseUrl itself. As a
# workaround, either append a dummy directory to the base url
# or start all relative paths in the html with the current
# directory.
# https://doc-snapshots.qt.io/qtforpython-5.15/PySide2/QtWebEngineWidgets/QWebEnginePage.html#PySide2.QtWebEngineWidgets.PySide2.QtWebEngineWidgets.QWebEnginePage.setHtml
here = os.path.dirname(os.path.abspath(__file__)).replace('\\', '/')
base_path = os.path.join(os.path.dirname(here), 'dummy').replace('\\', '/')
self.url = QtCore.QUrl('file:///' + base_path)
self.page().setHtml(html, baseUrl=self.url)
class MainWindow(QtWidgets.QMainWindow):
def __init__(self):
super().__init__()
self.init_widgets()
self.init_layout()
def init_widgets(self):
self.browser = Browser()
self.browser.loadFinished.connect(self.load_finished)
def init_layout(self):
layout = QtWidgets.QVBoxLayout()
layout.addWidget(self.browser)
centralWidget = QtWidgets.QWidget()
centralWidget.setLayout(layout)
self.setCentralWidget(centralWidget)
def load_finished(self, status):
self.msg = QtWidgets.QMessageBox()
self.msg.setIcon(QtWidgets.QMessageBox.Information)
self.msg.setWindowTitle('Load Status')
self.msg.setText(f"It is {str(status)} that the page loaded.")
self.msg.show()
if __name__ == '__main__':
app = QtWidgets.QApplication(sys.argv)
main_window = MainWindow()
main_window.show()
sys.exit(app.exec_())
The setHtml method takes a string so it must be read in first when using an HTML file.
Related
I'm trying to build a PyQT5 class that will execute s embedded within it.
Here's the code:
class Render(QWebPage):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
self.html = self.mainFrame().toHtml()
self.app.quit()
That connect method is unreferencable, here's what I imported:
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebKitWidgets import QWebPage
Am I missing something?
I am trying to loop over a list of URLs using PyQt4 and Beautifulsoup using the following code:
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, pyqtSignal
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, urls, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.urls = urls
self.cb = cb
self.crawl()
self.app.exec_()
def crawl(self):
if self.urls:
url = self.urls.pop(0)
print ('Downloading', url)
self.mainFrame().load(QUrl(url))
else:
self.app.quit()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.crawl()
def scrape(url, html):
pass
soup = BeautifulSoup(unicode(html), "lxml")
t = soup.findAll("div", {"class": "detalhamento_label_valor hidden-print ng-binding"})[0].text
print t
urls = ["http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal" ,
"http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal" ,
"http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000004?modulo=0&sistema=portal" ]
r = Render(urls, cb=scrape)
It seems to work well if the urls are the same [0,1], but it gets stuck once the url changes [2]. I am not really familiar with PyQt4, so I wonder if there is something trivial I might be missing.
EDIT
The program hangs while running the third item of the url list on this operation:
self.mainFrame().load(QUrl(url))
Other than that, the only warning I get is:
libpng warning: iCCP: known incorrect sRGB profile
Though I'm not sure what it means, it does not seem to be connected to the issue.
I am new to Python and am trying to understand why I am getting the following error:
Traceback (most recent call last):
File "WebScraper.py", line 10, in <module>
class Render(QWebPage):
NameError: name 'QWebPage' is not defined
Here is the code:
import sys
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWebEngineWidgets import *
from lxml import html
#Take this class for granted.Just use result of rendering.
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
#This step is important.Converting QString to Ascii for lxml to process
archive_links = html.fromstring(str(result.toAscii()))
print(archive_links)
I understand the __init__ acts as a constructor but why is it not setting it to self? So I need to change it to something like QWebPage.x = self?
You're not importing QWebPage
Try adding this import to the top of your script:
from PyQt5.QtWebKitWidgets import QWebPage
What version of PyQt5 are you using? Please note that starting PyQt5.9, QtWebKitWidgets (and also QtWebKit) is not longer available (deprecated), resulting in the error you are getting.
Let me juxtapose two rendering functions, one using old (PyQt4) and one using latest PyQt5:
Using PyQt4, note the usage of QWebPage and of course the imports
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
"""Render HTML with PyQt4 WebEngine."""
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
Using PyQt5, note the usage of QWebEngineView instead of QWebPage and of course the imports
from PyQt5.QtCore import QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
class Render(QWebEngineView):
"""Render HTML with PyQt5 WebEngine."""
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
while self.html is None:
self.app.processEvents(
QEventLoop.ExcludeUserInputEvents |
QEventLoop.ExcludeSocketNotifiers |
QEventLoop.WaitForMoreEvents)
self.app.quit()
I have a PyQt4 class that downloads webpages that I use for scrapping purposes.
When I pass a list of urls to the Render class while instantiating it works fine(single call) but when I try to loop the [r = Render(url, cb=scrape)]with a multiple list of urls, after the first loop,the execution stops or hangs without any error thrown.
I want to loop the class separately because the urls list belong to different category and will have to store the contents extracted separately.
I also came to know that only one app can be initiated, if that is the case how to exit the app without quitting it. so that new url list can be used by the same app
I am stuck with this issue for a while. Thanks in advance
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
def __init__(self, urls, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.urls = urls
self.cb = cb
self.crawl()
self.app.exec_()
def crawl(self):
if self.urls:
url = self.urls.pop(0)
print 'Downloading', url
self.mainFrame().load(QUrl(url))
else:
self.app.quit()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.crawl()
def scrape(url, html):
pass # have scraping code here
url1 = ['http://webscraping.com', 'http://webscraping.com/blog']
url2 = ['http://webscraping.com', 'http://webscraping.com/blog']
urls =[]
urls.append(url1)
urls.append(url2)
for url in urls:
r = Render(url, cb=scrape)
The problem is you can only instantiate a single QApplication object. Here is an updated version that avoids this and then only runs Qt's execution loop when downloading a URL:
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.cb = cb
def crawl(self, url):
print 'Downloading', url
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.app.quit()
def scrape(url, html):
pass # add scraping code here
print len(html)
r = Render(cb=scrape)
urls = ['http://webscraping.com', 'http://webscraping.com/blog']
for url in urls:
r.crawl(url)
unfortunately, #hoju 's answer did not work for me.
here is what works for me (basically setting up a timer to check if loading has completed).
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QTimer
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, url):
QWebPage.__init__(self)
self.frame = None
self.mainFrame().loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
def _loadFinished(self, result):
self.frame = self.mainFrame()
def go_again():
global r, timer, urls
if(len(urls)>0):
print("loading",urls[0])
r = Render(urls.pop())
timer.start(1000)
else:
print("finished")
sys.exit(app.exec_())
def check_done():
global r, timer
if r.frame is not None:
timer.stop()
html_result = r.frame.toHtml()
#do something with html
print("loaded")
go_again()
app = QApplication(sys.argv)
urls = ['http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class','http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class']
timer = QTimer()
timer.timeout.connect(check_done)
#check every second
go_again()
sys.exit(app.exec_())
I am attempting to render multiple webpages and taking screenshots of them, but I can only get it to work when rendering one webpage, because when I try it on multiple the program will either stop dead in it's tracks and hang forever OR just not do anything with images, css and will extract the text of the site and put it in one long block of text. Usually what's happening is it will hang.
Code I'm using to render the webpage in memory is this:
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
# Settings
s = self.settings()
#s.setAttribute(QWebSettings.AutoLoadImages, False)
s.setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
s.setAttribute(QWebSettings.PluginsEnabled, True)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
#self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
size = self.frame.contentsSize()
size.setWidth(1366)
self.setViewportSize(size)
self.app.quit()
And here's how I'm saving the image:
def run(url):
os.chdir("output")
r = Render(url)
image = QImage(r.viewportSize(), QImage.Format_ARGB32)
painter = QPainter(image)
r.frame.render(painter)
painter.end()
fp = "%s.png" % os_safe_name(url)
image.save(fp)
os.chdir("..")
Anyone know why this is happening?
Pretty much as described in Luke's answer, I shuffled things around to avoid creating a QApplication instance for each Render
Not the tidiest, but works for me:
import re
import sys
import time
# Tested with PySide 1.0.9, changing imports to PyQt should work identically
from PySide.QtCore import Qt, QUrl
from PySide.QtGui import QApplication, QImage, QPainter
from PySide.QtWebKit import QWebPage, QWebSettings
def os_safe_name(url):
url = re.sub("[^a-zA-Z0-9_-]+", "_", url)
url = re.sub("_{2,}", "_", url)
return url
class Render(QWebPage):
def __init__(self, url):
QWebPage.__init__(self)
self.url = url
self.finished = False
# Settings
s = self.settings()
#s.setAttribute(QWebSettings.AutoLoadImages, False)
s.setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
s.setAttribute(QWebSettings.PluginsEnabled, True)
#self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
# When page is loaded, callback saves image to file
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
def _loadFinished(self, result):
frame = self.mainFrame()
size = frame.contentsSize()
size.setWidth(1366)
self.setViewportSize(size)
image = QImage(self.viewportSize(), QImage.Format_ARGB32)
painter = QPainter(image)
frame.render(painter)
painter.end()
self.filepath = "output/%s.png" % os_safe_name(self.url)
image.save(self.filepath)
self.finished = True
def run(url, app = None):
if app is None:
app = QApplication(sys.argv)
r = Render(url)
while not r.finished:
app.processEvents()
time.sleep(0.01)
return r.filepath
if __name__ == '__main__':
app = QApplication(sys.argv)
print run("http://stackoverflow.com", app=app)
print run("http://google.com", app=app)
I presume you are creating multiple instances of your Render class. If this is the case, then you are most likely having problems because you create multiple QApplication instances. Instead, create a single QApplication and share it between all of your Render instances.
You'll also probably need to stop using app.quit() since you want the QApplication to continue functioning. Furthermore, since app.exec_() won't exit until you call quit(), you'll need to make your own event loop instead. Something like this:
while not self.finished:
self.app.processEvents()
time.sleep(0.01)