Screenshot of multiple webpages in a headerless QtWebKit browser in Python

Screenshot of multiple webpages in a headerless QtWebKit browser in Python - python

I am attempting to render multiple webpages and take screenshots of them, but I can only get it to work when rendering one webpage. When I try it on multiple pages, the program will either stop dead in its tracks and hang forever, OR ignore the images and CSS and just extract the text of the site and put it in one long block of text. Usually it hangs.
Code I'm using to render the webpage in memory is this:
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
# Settings
s = self.settings()
#s.setAttribute(QWebSettings.AutoLoadImages, False)
s.setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
s.setAttribute(QWebSettings.PluginsEnabled, True)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
#self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
size = self.frame.contentsSize()
size.setWidth(1366)
self.setViewportSize(size)
self.app.quit()
And here's how I'm saving the image:
def run(url):
os.chdir("output")
r = Render(url)
image = QImage(r.viewportSize(), QImage.Format_ARGB32)
painter = QPainter(image)
r.frame.render(painter)
painter.end()
fp = "%s.png" % os_safe_name(url)
image.save(fp)
os.chdir("..")
Anyone know why this is happening?

Pretty much as described in Luke's answer: I shuffled things around to avoid creating a QApplication instance for each Render.
Not the tidiest, but works for me:
import re
import sys
import time
# Tested with PySide 1.0.9, changing imports to PyQt should work identically
from PySide.QtCore import Qt, QUrl
from PySide.QtGui import QApplication, QImage, QPainter
from PySide.QtWebKit import QWebPage, QWebSettings
def os_safe_name(url):
    """Return *url* reduced to characters that are safe in a file name.

    Every run of characters other than ASCII letters, digits and ``-`` is
    replaced by a single ``_``.
    """
    # "_" is deliberately excluded from the kept set: an existing underscore
    # next to an unsafe character then belongs to the same run, so the result
    # never contains two consecutive underscores.
    return re.sub(r"[^a-zA-Z0-9-]+", "_", url)
class Render(QWebPage):
    """Load one URL off-screen and save a full-page screenshot as a PNG.

    Loading starts as soon as the object is constructed. The caller is
    expected to keep processing Qt events until ``finished`` becomes True.

    Attributes:
        url: The URL passed to the constructor.
        finished: True once the load callback has run, whether or not an
            image was written.
        filepath: Path of the saved image, or None if nothing was saved.
    """

    def __init__(self, url):
        QWebPage.__init__(self)
        self.url = url
        self.finished = False
        # Defined up front so callers can read it even when the load fails.
        self.filepath = None

        # Settings
        s = self.settings()
        # Uncomment to skip image downloads:
        #s.setAttribute(QWebSettings.AutoLoadImages, False)
        s.setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
        s.setAttribute(QWebSettings.PluginsEnabled, True)
        #self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
        self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)

        # When page is loaded, callback saves image to file
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Render the loaded page to ``output/<safe-url>.png``.

        ``result`` is the success flag delivered by the ``loadFinished``
        signal; nothing is rendered when it is false.
        """
        try:
            if not result:
                return
            frame = self.mainFrame()
            # Use the full document height, at a fixed 1366 px width.
            size = frame.contentsSize()
            size.setWidth(1366)
            self.setViewportSize(size)
            image = QImage(self.viewportSize(), QImage.Format_ARGB32)
            painter = QPainter(image)
            frame.render(painter)
            painter.end()
            filepath = "output/%s.png" % os_safe_name(self.url)
            # save() reports failure (e.g. a missing "output" directory) via
            # its return value; only publish the path when the file exists.
            if image.save(filepath):
                self.filepath = filepath
        finally:
            # Always signal completion, otherwise a caller polling
            # ``finished`` would loop forever after a failure.
            self.finished = True
def run(url, app = None):
    """Screenshot *url* and return the path of the saved image.

    Args:
        url: Address of the page to render.
        app: The QApplication to pump events on. When omitted, the existing
            application instance is reused, and one is created only if none
            exists yet.

    Returns:
        Whatever ``Render`` stored in its ``filepath`` attribute.
    """
    if app is None:
        # Creating a second QApplication is exactly what made the original
        # code hang, so reuse the running instance whenever there is one.
        app = QApplication.instance() or QApplication(sys.argv)
    r = Render(url)
    # Hand-rolled event loop: keep Qt working until the page callback fires,
    # sleeping briefly so the wait does not burn a full CPU core.
    while not r.finished:
        app.processEvents()
        time.sleep(0.01)
    return r.filepath
if __name__ == '__main__':
app = QApplication(sys.argv)
print run("http://stackoverflow.com", app=app)
print run("http://google.com", app=app)

I presume you are creating multiple instances of your Render class. If this is the case, then you are most likely having problems because you create multiple QApplication instances. Instead, create a single QApplication and share it between all of your Render instances.
You'll also probably need to stop using app.quit() since you want the QApplication to continue functioning. Furthermore, since app.exec_() won't exit until you call quit(), you'll need to make your own event loop instead. Something like this:
while not self.finished:
self.app.processEvents()
time.sleep(0.01)

Related

`mime.hasImage()` returns `true` but `mime.imageData()` returns `None` on Linux

I'm trying to run a simple PyQt5 application on Linux, the code is as follows:
#!/usr/bin/python
import sys
from PyQt5.QtWidgets import QApplication, QWidget
def main():
app = QApplication(sys.argv)
w = QWidget()
w.resize(250, 150)
w.move(300, 300)
w.setWindowTitle('Simple')
w.show()
mime = app.clipboard().mimeData()
print(mime.hasImage()) # True
print(mime.imageData()) # None
sys.exit(app.exec_())
if __name__ == '__main__':
main()
Before running it, I copied an image into the clipboard, so mime.hasImage() should return True. No problem, that's also the case. But what's weird is that mime.imageData() sometimes returns None. That shouldn't happen: mime.imageData() should contain the image that I copied instead of None. Is there anything wrong with the code?
By the way, this seems to only happen on Linux, mime.imageData() never returns None on Windows. I'm using python3

The fact that hasImage() returns True does not imply that imageData() returns a QImage: it only indicates that the user copied an image to the clipboard. And in what format was the image copied? It could be PNG, JPG, etc., or the source application could provide a URL for the client application to download, or HTML for the client application to insert and then obtain the image by rendering that HTML.
So, in general, the application from which the image was copied is responsible for the format it sends. There is no restrictive standard for that format, but there are common formats.
The following example shows the logic to handle the images that come from urls and HTML:
#!/usr/bin/python
import sys
from functools import cached_property
from PyQt5.QtCore import pyqtSignal, QObject, QUrl
from PyQt5.QtNetwork import QNetworkAccessManager, QNetworkRequest, QNetworkReply
from PyQt5.QtGui import QGuiApplication, QImage, QPixmap
from PyQt5.QtWidgets import QApplication, QWidget, QLabel
from bs4 import BeautifulSoup
class ImageDownloader(QObject):
    """Download an image from a URL and publish it through ``finished``."""

    # Emitted with the decoded image after a successful download.
    finished = pyqtSignal(QImage)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.manager.finished.connect(self.handle_finished)

    @cached_property
    def manager(self):
        """The network manager, created on first access and then reused."""
        return QNetworkAccessManager()

    def start_download(self, url):
        """Start an asynchronous GET request for *url* (a QUrl)."""
        self.manager.get(QNetworkRequest(url))

    def handle_finished(self, reply):
        """Decode a finished reply and emit ``finished`` with the image."""
        # The reply object is owned by the caller; schedule its deletion so
        # replies do not accumulate over repeated downloads.
        reply.deleteLater()
        if reply.error() != QNetworkReply.NoError:
            print("error: ", reply.errorString())
            return
        image = QImage()
        # The URL may not point at an image at all; do not emit a null image.
        if not image.loadFromData(reply.readAll()):
            print("error: ", "downloaded data is not a supported image")
            return
        self.finished.emit(image)
class ClipboardManager(QObject):
    """Watch the clipboard and emit ``imageChanged`` for each copied image.

    When the clipboard advertises an image but carries no image data, the
    image is fetched from the URL or the HTML ``<img>`` tag that the source
    application provided instead.
    """

    # Emitted with the image found on, or downloaded for, the clipboard.
    imageChanged = pyqtSignal(QImage)

    def __init__(self, parent=None):
        super().__init__(parent)
        QGuiApplication.clipboard().dataChanged.connect(
            self.handle_clipboard_datachanged
        )
        # Forward downloaded images through the same signal as direct ones.
        self.downloader.finished.connect(self.imageChanged)

    @cached_property
    def downloader(self):
        """The image downloader, created on first access and then reused."""
        return ImageDownloader()

    def handle_clipboard_datachanged(self):
        """Extract an image from the current clipboard contents."""
        mime = QGuiApplication.clipboard().mimeData()
        if mime.hasImage():
            image = mime.imageData()
            if image is not None:
                self.imageChanged.emit(image)
            elif mime.hasUrls():
                # Only the first URL is used.
                url = mime.urls()[0]
                self.downloader.start_download(url)
            elif mime.hasHtml():
                html = mime.html()
                soup = BeautifulSoup(html, features="lxml")
                imgs = soup.findAll("img")
                if imgs:
                    url = QUrl.fromUserInput(imgs[0]["src"])
                    self.downloader.start_download(url)
            else:
                # Unknown representation: dump what is available to help
                # add support for it.
                for fmt in mime.formats():
                    print(fmt, mime.data(fmt))
def main():
app = QApplication(sys.argv)
label = QLabel(scaledContents=True)
label.resize(250, 150)
label.move(300, 300)
label.setWindowTitle("Simple")
label.show()
manager = ClipboardManager()
manager.imageChanged.connect(
lambda image: label.setPixmap(QPixmap.fromImage(image))
)
sys.exit(app.exec_())
if __name__ == "__main__":
main()

How to execute QWebEngine in Python function

I have a QWebEngine class to read webpages and create BeautifulSoup objects for them.
Here is the code:
import sys
from bs4 import BeautifulSoup
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
def __init__(self):
super(WebPage, self).__init__()
self.loadFinished.connect(self.handleLoadFinished)
self.soup = []
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QtCore.QUrl(url))
return True
def processCurrentPage(self, html):
url = self.url().toString()
self.soup.append(BeautifulSoup(html, 'lxml'))
if not self.fetchNext():
QtWidgets.qApp.quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
Here is another function to call WebPage class:
def get_soup(urls):
app = QtWidgets.QApplication(sys.argv)
webpage = WebPage()
webpage.start(urls)
return webpage.soup
Here is the main:
if __name__ == "__main__":
urls = ["http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sh", "http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sz"]
soups = get_soup(urls)
However, the program restarts when I execute it.
What should be changed?

This is a problem that I have run into before. Analyzing it, I found that the QApplication is destroyed before the QWebEnginePage, which causes the QWebEngineProfile to be deleted and, in this case, the QWebEnginePage to crash. The solution is to give the app a greater scope by making it a global variable.
On the other hand, you have to call exec_() to start the event loop that allows the signals to work:
# ...
# Kept at module level so the QApplication outlives every WebPage; it must
# not be destroyed before the page is.
app = None


def get_soup(urls):
    """Load every URL in *urls* and return a list of BeautifulSoup objects.

    Blocks in the Qt event loop until the last page has been processed.
    """
    global app
    urls = list(urls)
    if not urls:
        # With nothing to load, no page ever finishes and nothing would
        # stop the event loop, so return before starting it.
        return []
    # Only one QApplication may exist per process; reuse it on later calls.
    app = QtWidgets.QApplication.instance() or QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    # Runs until WebPage quits the application after the last URL.
    app.exec_()
    return webpage.soup
# ...
Note: It seems that the QTBUG-75547 related to this problem has been solved for Qt5>=5.12.4 so probably in a next release of PyQtWebEngine that bug will no longer be observed.

How to "render" HTML with PyQt5's QWebEngineView

How can I "render" HTML with PyQt5 v5.6 QWebEngineView?
I have previously performed the task with PyQt5 v5.4.1 QWebPage, but it was suggested to try the newer QWebEngineView.
Here's that implementation (it generally works as expected, but has a tendency to hang indefinitely for some sites and situations):
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebKitWidgets import QWebPage
class Render(QWebPage):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
self.html = self.mainFrame().toHtml()
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
What follows is my attempt at using QWebEngineView. First, the installation and setup of PyQt5 v5.6 on Ubuntu:
# install PyQt5 v5.6 wheel from PyPI
pip3 install --user pyqt5
# link missing resources
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/
Now for the Python... The following results in a segmentation fault:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# what's going on here? how can I get the HTML from toHtml?
self.page().toHtml(self.callable)
self.app.quit()
def callable(self, data):
self.html = data
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
The trouble appears to lie in the call to asynchronous toHtml(). It seems like it should be fairly simple, but I'm at a loss with what to do with it. I see it's been discussed in the context of C++, but I'm not sure how to translate this to Python. How can I get the HTML out?

Quite a bit of discussion on the topic was made in the following thread: https://riverbankcomputing.com/pipermail/pyqt/2015-January/035324.html
The new QWebEngine interface takes account of the fact that the
underlying Chromium engine is asynchronous. As such we have to turn an asynchronous API into a synchronous one.
Here's how that looks:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

The answer by Six & Veehmot is great, but I found out that for my purpose it was not sufficient, as it did not expand the dropdown elements of the page that I wanted to scrape.
A slight modification fixed this:
def render(url):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop,QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, url):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.load(QUrl(url))
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(url).html
print(render(dummy_url))

As you pointed out, Qt5.4 relies on async calls. It's not necessary to use the Loop (as seen on your answer), since your only mistake was to call quit before the toHtml call finishes.
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# This is an async call, you need to wait for this
# to be called before closing the app
self.page().toHtml(self.callable)
def callable(self, data):
self.html = data
# Data has been stored, it's safe to quit the app
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

It's not entirely clear to me what you mean by "render". I understand it to mean, "display the HTML accordingly on the screen." The following does just that.
# main.py
import sys
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class Browser(QtWebEngineWidgets.QWebEngineView):
def __init__(self):
super().__init__()
html = """
<!DOCTYPE html>
<html>
<head>
<title>Example</title>
<meta charset="utf-8" />
</head>
<body>
<script>alert('Running some Javascript');</script>
<h1>Hello world!</h1>
<p>Goodbye, cruel world...</p>
</body>
</html>
"""
# With QWebEnginePage.setHtml, the html is loaded immediately.
# baseUrl is used to resolve relative URLs in the document.
# For whatever reason, it seems like the baseUrl resolves to
# the parent of the path, not the baseUrl itself. As a
# workaround, either append a dummy directory to the base url
# or start all relative paths in the html with the current
# directory.
# https://doc-snapshots.qt.io/qtforpython-5.15/PySide2/QtWebEngineWidgets/QWebEnginePage.html#PySide2.QtWebEngineWidgets.PySide2.QtWebEngineWidgets.QWebEnginePage.setHtml
here = os.path.dirname(os.path.abspath(__file__)).replace('\\', '/')
base_path = os.path.join(os.path.dirname(here), 'dummy').replace('\\', '/')
self.url = QtCore.QUrl('file:///' + base_path)
self.page().setHtml(html, baseUrl=self.url)
class MainWindow(QtWidgets.QMainWindow):
def __init__(self):
super().__init__()
self.init_widgets()
self.init_layout()
def init_widgets(self):
self.browser = Browser()
self.browser.loadFinished.connect(self.load_finished)
def init_layout(self):
layout = QtWidgets.QVBoxLayout()
layout.addWidget(self.browser)
centralWidget = QtWidgets.QWidget()
centralWidget.setLayout(layout)
self.setCentralWidget(centralWidget)
def load_finished(self, status):
self.msg = QtWidgets.QMessageBox()
self.msg.setIcon(QtWidgets.QMessageBox.Information)
self.msg.setWindowTitle('Load Status')
self.msg.setText(f"It is {str(status)} that the page loaded.")
self.msg.show()
if __name__ == '__main__':
app = QtWidgets.QApplication(sys.argv)
main_window = MainWindow()
main_window.show()
sys.exit(app.exec_())
The setHtml method takes a string so it must be read in first when using an HTML file.

pyqt4: Loop main Render class?

I have a PyQt4 class that downloads webpages, which I use for scraping purposes.
When I pass a list of urls to the Render class while instantiating it, it works fine (single call), but when I try to loop [r = Render(url, cb=scrape)] over multiple lists of urls, the execution stops or hangs after the first loop without any error being thrown.
I want to loop the class separately because the url lists belong to different categories and I have to store the extracted contents separately.
I also came to know that only one app can be instantiated. If that is the case, how do I exit the app without quitting it, so that a new url list can be used by the same app?
I have been stuck on this issue for a while. Thanks in advance.
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
def __init__(self, urls, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.urls = urls
self.cb = cb
self.crawl()
self.app.exec_()
def crawl(self):
if self.urls:
url = self.urls.pop(0)
print 'Downloading', url
self.mainFrame().load(QUrl(url))
else:
self.app.quit()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.crawl()
def scrape(url, html):
pass # have scraping code here
url1 = ['http://webscraping.com', 'http://webscraping.com/blog']
url2 = ['http://webscraping.com', 'http://webscraping.com/blog']
urls =[]
urls.append(url1)
urls.append(url2)
for url in urls:
r = Render(url, cb=scrape)

The problem is you can only instantiate a single QApplication object. Here is an updated version that avoids this and then only runs Qt's execution loop when downloading a URL:
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.cb = cb
def crawl(self, url):
print 'Downloading', url
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.app.quit()
def scrape(url, html):
pass # add scraping code here
print len(html)
r = Render(cb=scrape)
urls = ['http://webscraping.com', 'http://webscraping.com/blog']
for url in urls:
r.crawl(url)

Unfortunately, @hoju's answer did not work for me.
Here is what works for me (basically setting up a timer to check whether loading has completed).
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QTimer
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, url):
QWebPage.__init__(self)
self.frame = None
self.mainFrame().loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
def _loadFinished(self, result):
self.frame = self.mainFrame()
def go_again():
    """Start loading the next queued URL, or stop the event loop when done."""
    global r, timer, urls
    if urls:
        # Take the URL from the front of the queue so that the one that is
        # logged is the one that is actually loaded.
        next_url = urls.pop(0)
        print("loading",next_url)
        r = Render(next_url)
        timer.start(1000)
    else:
        print("finished")
        # Ask the running event loop to stop instead of starting it again
        # from inside a callback. The zero-delay timer queues the request,
        # so it also takes effect if the loop has not started yet.
        QTimer.singleShot(0, app.quit)
def check_done():
global r, timer
if r.frame is not None:
timer.stop()
html_result = r.frame.toHtml()
#do something with html
print("loaded")
go_again()
app = QApplication(sys.argv)
urls = ['http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class','http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class']
timer = QTimer()
timer.timeout.connect(check_done)
#check every second
go_again()
sys.exit(app.exec_())

PyQt4 & flask : Cannot create children for a parent that is in a different thread

I am trying to save image of a page on a http request to a flask server.
This is the message that I get on running the thing
QObject: Cannot create children for a parent that is in a different thread.
here is my code
import sys
import time
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
import Image
from flask import Flask, Response, jsonify,request
app=Flask(__name__)
class Screenshot(QWebView):
def __init__(self):
self.app = QApplication(sys.argv)
QWebView.__init__(self)
self._loaded = False
self.loadFinished.connect(self._loadFinished)
def capture(self,url,width,output_file):
self.resize(width,300)
self.load(QUrl(url))
self.wait_load()
# set to webpage size
frame = self.page().mainFrame()
self.page().setViewportSize(frame.contentsSize())
# render image
image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
painter = QPainter(image)
frame.render(painter)
painter.end()
print 'saving', output_file
image.save(output_file)
def wait_load(self, delay=0):
# process app events until page loaded
while not self._loaded:
self.app.processEvents()
time.sleep(delay)
self._loaded = False
def _loadFinished(self, result):
self._loaded = True
if __name__=='__main__':
s = Screenshot()
@app.route('/image', methods=['GET','POST'])
def getPicture():
    """Screenshot the requested URL, crop it and return the output file name.

    Query parameters: ``url``, ``sw`` (browser width), and ``top``, ``left``,
    ``width``, ``height`` describing the crop box in pixels.
    """
    # NOTE(review): this handler calls into the module-level Screenshot
    # object; the Qt warnings quoted below indicate it runs on a different
    # thread than the one that created that object.
    #reading args
    url=request.args.get('url')
    screenWidth=int(request.args.get('sw'))
    top=int(request.args.get('top'))
    left=int(request.args.get('left'))
    width=int(request.args.get('width'))
    height=int(request.args.get('height'))
    #cropping image
    s.capture(url,screenWidth,"temp.png")
    img=Image.open("temp.png")
    box=(left, top, left+width, top+height)
    area=img.crop(box)
    # Save under the same name that is returned to the client.
    area.save("output.png","png")
    return "output.png"
@app.route('/')
def api_root():
    """Return a fixed greeting for the site root."""
    return 'Welcome'
app.run(host='0.0.0.0',port=3000,debug=True)
whenever I hit this using
curl http://0.0.0.0:3000/image?url=googlecom&sw=1920&top=100&left=100&width=200&height=200
I get the following error message,
QObject: Cannot create children for a parent that is in a different thread.
(Parent is Screenshot(0x7f9dbf121b90), parent's thread is QThread(0x7f9dbdb92240), current thread is QThread(0x7f9dbf11ed50)
QPixmap: It is not safe to use pixmaps outside the GUI thread
QPixmap: It is not safe to use pixmaps outside the GUI thread

I'm not sure that is the reason, but I don't see a self.app.exec_() call in Screenshot.__init__(). Without it, the QApplication instance never enters the main loop. BTW, I would instantiate QApplication outside of Screenshot but somewhere in your 'main' section. Not sure if it matters in this particular case, but it may.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Screenshot of multiple webpages in a headerless QtWebKit browser in Python - python

Related

`mime.hasImage()` returns `true` but `mime.imageData()` returns `None` on Linux

How to execute QWebEngine in Python function

How to "render" HTML with PyQt5's QWebEngineView

pyqt4: Loop main Render class?

PyQt4 & flask : Cannot create children for a parent that is in a different thread

Categories

Resources