I have a QWebEnginePage class to read webpages and create BeautifulSoup objects for them.
Here is the code:
import sys
from bs4 import BeautifulSoup
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)
        self.soup = []

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        url = self.url().toString()
        self.soup.append(BeautifulSoup(html, 'lxml'))
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)
Here is another function that calls the WebPage class:
def get_soup(urls):
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    return webpage.soup
Here is the main:
if __name__ == "__main__":
    urls = ["http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sh",
            "http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sz"]
    soups = get_soup(urls)
However, the program restarts when I execute it.
What should be changed?
This is a problem I have run into before: when the QApplication is destroyed before the QWebEnginePage, the QWebEngineProfile is deleted, which causes QWebEnginePage to crash. The solution is to give the app a greater scope by making it a global variable.
You also have to call exec_() so that the event loop that allows the signals to work is running:
# ...

app = None

def get_soup(urls):
    global app
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    app.exec_()
    return webpage.soup

# ...
Note: It seems that QTBUG-75547, which is related to this problem, has been fixed for Qt5 >= 5.12.4, so the bug will probably no longer be observed in an upcoming PyQtWebEngine release.
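For completeness, here is a minimal usage sketch of the fixed get_soup(); the way the returned soups are inspected (printing each page's title) is only an illustration, not part of the original answer.

# Illustrative only: feed the two HKEX URLs through the fixed get_soup()
# and print each page's <title> to confirm the pages were rendered.
if __name__ == "__main__":
    urls = [
        "http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sh",
        "http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sz",
    ]
    soups = get_soup(urls)
    for url, soup in zip(urls, soups):
        title = soup.title.string if soup.title else "<no title>"
        print(url, "->", title)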
Related
I'm trying to write a PyQt QWebEngineView that opens a website, does a SAML login to AAD, returns, and once it sees a specific cookie (the openconnect webvpn cookie), grabs the value and returns it to the "console" script, which can continue processing and/or return to the command prompt.
I've glued together enough code that I can pop a browser window, step through my SAML authorization and see the cookie and cookie value. I don't know how to auto-close / exit the WebView window and "return" that cookie value and/or just the array to Python itself so I can keep processing it and/or exit. Not quite sure how to "clean up" my objects either.
I did probably fudge up my classes, initiators, and object variables. It's a kludge.
Thoughts? Ideas?
This is Arch Linux with latest Python and pyqt via package repo.
The code:
#!/usr/bin/python

#core python
import sys

#PyQT libraries
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtNetwork import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngineWidgets import *

#functions / classes
class OpenconnectSamlAuth(QMainWindow):
    #init self object
    def __init__(self):
        #inherit parents functions, classes, etc....
        super(OpenconnectSamlAuth, self).__init__()

        #create webview object
        self.webview = QWebEngineView()

        #grab profile
        self.profile = QWebEngineProfile("storage", self.webview)
        self.cookie_store = self.profile.cookieStore()
        self.cookie_store.cookieAdded.connect(self.onCookieAdded)

        #empty array of cookies
        self.samlcookies = []

        #set some window options
        #window width x height
        self.resize(1024, 768)

        #default settings
        self.mySettings = QWebEngineSettings.defaultSettings()
        self.mySettings.setAttribute(QWebEngineSettings.JavascriptEnabled, True)

    #load URL / process login
    def samlLogin(self, url):
        #create page and load URL
        webpage = QWebEnginePage(self.profile, self.webview)
        self.webview.setPage(webpage)
        self.webview.load(QUrl(url))

        #windows options
        self.setCentralWidget(self.webview)

        #window title
        self.webview.setWindowTitle('Loading...')
        self.webview.titleChanged.connect(self.updateTitle)

    #update title window
    def updateTitle(self):
        self.webview.setWindowTitle(self.webview.title())

    #handle cookies being added
    def onCookieAdded(self, cookie):
        #check if cookies exists
        #for c in self.cookies:
        #    if c.hasSameIdentifier(cookie):
        #        return
        #self.cookies.append(QNetworkCookie(cookie)); return
        #bytearray(c.name()).decode()
        print(bytearray(QNetworkCookie(cookie).name()).decode())
        print(bytearray(QNetworkCookie(cookie).value()).decode())
        return

#main loop
def main():
    #initialize QT application object
    App = QApplication(sys.argv)

    #setup webkit window / browser session
    OpenconnectWebObj = OpenconnectSamlAuth()

    #load URL
    OpenconnectWebObj.samlLogin("https://vpnserverurl/groupname")

    #show connection window
    OpenconnectWebObj.show()

    #execute the app and grab the returned cookie
    cookie = App.exec_()
    print(cookie)

    #exit
    sys.exit()

#if called via command line; run this
if __name__ == '__main__':
    main()
If you want to close the window then you must call the close() method, but in this case it also seems to require terminating the Qt event loop, so the QCoreApplication.quit() method should be used. On the other hand, the cookie can be stored as an attribute and then used after the event loop returns:
import sys

from PyQt5.QtCore import QCoreApplication, QUrl
from PyQt5.QtNetwork import QNetworkCookie
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtWebEngineWidgets import (
    QWebEnginePage,
    QWebEngineProfile,
    QWebEngineSettings,
    QWebEngineView,
)

class OpenconnectSamlAuth(QMainWindow):
    def __init__(self, parent=None):
        super(OpenconnectSamlAuth, self).__init__(parent)

        self._cookie = None

        self.webview = QWebEngineView()
        self.profile = QWebEngineProfile("storage", self.webview)
        self.cookie_store = self.profile.cookieStore()
        self.cookie_store.cookieAdded.connect(self.handle_cookie_added)
        self.profile.settings().setAttribute(QWebEngineSettings.JavascriptEnabled, True)

        webpage = QWebEnginePage(self.profile, self)
        self.webview.setPage(webpage)
        self.webview.titleChanged.connect(self.update_title)

        self.setCentralWidget(self.webview)
        self.resize(1024, 768)

    @property
    def cookie(self):
        return self._cookie

    def login(self, url):
        self.webview.load(QUrl.fromUserInput(url))
        self.webview.setWindowTitle("Loading...")

    def update_title(self):
        self.webview.setWindowTitle(self.webview.title())

    def handle_cookie_added(self, cookie):
        print("added {name} : {value}".format(name=cookie.name(), value=cookie.value()))
        if cookie.name() == b"name_of_cookie":
            self._cookie = QNetworkCookie(cookie)
            QCoreApplication.quit()

# main loop
def main():
    app = QApplication(sys.argv)

    openconnect_webobj = OpenconnectSamlAuth()
    openconnect_webobj.login("https://vpnserverurl/groupname")
    openconnect_webobj.show()

    ret = app.exec_()

    cookie = openconnect_webobj.cookie
    if cookie is not None:
        print("results:", cookie.name(), cookie.value(), cookie.toRawForm())

    sys.exit(ret)

if __name__ == "__main__":
    main()
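If the captured cookie needs to be handed back to a console workflow (for example to feed openconnect), its QByteArray fields can be decoded after exec_() returns. A small sketch under the assumption that the cookie was found; the helper name is mine, not part of the answer above.

# Hypothetical helper: turn the captured QNetworkCookie into plain strings
# for use by the rest of the console script.
def cookie_as_strings(cookie):
    if cookie is None:
        return None
    name = bytes(cookie.name()).decode("utf-8", errors="replace")
    value = bytes(cookie.value()).decode("utf-8", errors="replace")
    return name, value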
How can I "render" HTML with PyQt5 v5.6 QWebEngineView?
I have previously performed the task with PyQt5 v5.4.1 QWebPage, but it was suggested to try the newer QWebEngineView.
Here's that implementation (it generally works as expected, but has a tendency to hang indefinitely for some sites and situations):
def render(source_html):
    """Fully render HTML, JavaScript and all."""
    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebKitWidgets import QWebPage

    class Render(QWebPage):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebPage.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.mainFrame().setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            self.html = self.mainFrame().toHtml()
            self.app.quit()

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
What follows is my attempt at using QWebEngineView. First, the installation and setup of PyQt5 v5.6 on Ubuntu:
# install PyQt5 v5.6 wheel from PyPI
pip3 install --user pyqt5
# link missing resources
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/
Now for the Python... The following results in a segmentation fault:
def render(source_html):
    """Fully render HTML, JavaScript and all."""
    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            # what's going on here? how can I get the HTML from toHtml?
            self.page().toHtml(self.callable)
            self.app.quit()

        def callable(self, data):
            self.html = data

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
The trouble appears to lie in the call to the asynchronous toHtml(). It seems like it should be fairly simple, but I'm at a loss as to what to do with it. I see it's been discussed in the context of C++, but I'm not sure how to translate that to Python. How can I get the HTML out?
Quite a bit of discussion on the topic was made in the following thread: https://riverbankcomputing.com/pipermail/pyqt/2015-January/035324.html
The new QWebEngine interface takes account of the fact that the underlying Chromium engine is asynchronous. As such we have to turn an asynchronous API into a synchronous one.
Here's how that looks:
def render(source_html):
    """Fully render HTML, JavaScript and all."""
    import sys
    from PyQt5.QtCore import QEventLoop
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            while self.html is None:
                self.app.processEvents(
                    QEventLoop.ExcludeUserInputEvents
                    | QEventLoop.ExcludeSocketNotifiers
                    | QEventLoop.WaitForMoreEvents)
            self.app.quit()

        def _callable(self, data):
            self.html = data

        def _loadFinished(self, result):
            self.page().toHtml(self._callable)

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
The answer by Six & Veehmot is great, but I found out that for my purpose it was not sufficient, as it did not expand the dropdown elements of the page that I wanted to scrape.
A slight modification fixed this:
def render(url):
    """Fully render HTML, JavaScript and all."""
    import sys
    from PyQt5.QtCore import QEventLoop, QUrl
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, url):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.load(QUrl(url))
            while self.html is None:
                self.app.processEvents(
                    QEventLoop.ExcludeUserInputEvents
                    | QEventLoop.ExcludeSocketNotifiers
                    | QEventLoop.WaitForMoreEvents)
            self.app.quit()

        def _callable(self, data):
            self.html = data

        def _loadFinished(self, result):
            self.page().toHtml(self._callable)

    return Render(url).html

print(render(dummy_url))
As you pointed out, Qt 5.4 relies on async calls. It's not necessary to use the loop (as seen in your answer), since your only mistake was to call quit before the toHtml call finished.
def render(source_html):
    """Fully render HTML, JavaScript and all."""
    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            # This is an async call, you need to wait for this
            # to be called before closing the app
            self.page().toHtml(self.callable)

        def callable(self, data):
            self.html = data
            # Data has been stored, it's safe to quit the app
            self.app.quit()

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
It's not entirely clear to me what you mean by "render". I understand it to mean, "display the HTML accordingly on the screen." The following does just that.
# main.py
import sys
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class Browser(QtWebEngineWidgets.QWebEngineView):
    def __init__(self):
        super().__init__()
        html = """
        <!DOCTYPE html>
        <html>
            <head>
                <title>Example</title>
                <meta charset="utf-8" />
            </head>
            <body>
                <script>alert('Running some Javascript');</script>
                <h1>Hello world!</h1>
                <p>Goodbye, cruel world...</p>
            </body>
        </html>
        """
        # With QWebEnginePage.setHtml, the html is loaded immediately.
        # baseUrl is used to resolve relative URLs in the document.
        # For whatever reason, it seems like the baseUrl resolves to
        # the parent of the path, not the baseUrl itself. As a
        # workaround, either append a dummy directory to the base url
        # or start all relative paths in the html with the current
        # directory.
        # https://doc-snapshots.qt.io/qtforpython-5.15/PySide2/QtWebEngineWidgets/QWebEnginePage.html#PySide2.QtWebEngineWidgets.PySide2.QtWebEngineWidgets.QWebEnginePage.setHtml
        here = os.path.dirname(os.path.abspath(__file__)).replace('\\', '/')
        base_path = os.path.join(os.path.dirname(here), 'dummy').replace('\\', '/')
        self.url = QtCore.QUrl('file:///' + base_path)
        self.page().setHtml(html, baseUrl=self.url)

class MainWindow(QtWidgets.QMainWindow):
    def __init__(self):
        super().__init__()
        self.init_widgets()
        self.init_layout()

    def init_widgets(self):
        self.browser = Browser()
        self.browser.loadFinished.connect(self.load_finished)

    def init_layout(self):
        layout = QtWidgets.QVBoxLayout()
        layout.addWidget(self.browser)
        centralWidget = QtWidgets.QWidget()
        centralWidget.setLayout(layout)
        self.setCentralWidget(centralWidget)

    def load_finished(self, status):
        self.msg = QtWidgets.QMessageBox()
        self.msg.setIcon(QtWidgets.QMessageBox.Information)
        self.msg.setWindowTitle('Load Status')
        self.msg.setText(f"It is {str(status)} that the page loaded.")
        self.msg.show()

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    main_window = MainWindow()
    main_window.show()
    sys.exit(app.exec_())
The setHtml method takes a string, so when using an HTML file the file must be read in first.
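A minimal sketch of that, assuming a local file named page.html; only the reading step differs from the example above.

# Sketch: read an HTML file into a string before handing it to setHtml.
# "page.html" is a placeholder filename.
with open("page.html", encoding="utf-8") as f:
    html = f.read()
# then, exactly as in the Browser class above:
# self.page().setHtml(html, baseUrl=self.url)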
I have a PyQt4 class that downloads webpages, which I use for scraping purposes.
When I pass a list of urls to the Render class while instantiating it, it works fine (a single call), but when I try to loop r = Render(url, cb=scrape) over multiple lists of urls, the execution stops or hangs after the first loop without any error being thrown.
I want to loop over the class separately because the url lists belong to different categories and the extracted contents have to be stored separately.
I also came to know that only one app can be initiated; if that is the case, how do I exit the app without quitting it, so that a new url list can be used by the same app?
I have been stuck on this issue for a while. Thanks in advance.
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Render(QWebPage):
    def __init__(self, urls, cb):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.urls = urls
        self.cb = cb
        self.crawl()
        self.app.exec_()

    def crawl(self):
        if self.urls:
            url = self.urls.pop(0)
            print 'Downloading', url
            self.mainFrame().load(QUrl(url))
        else:
            self.app.quit()

    def _loadFinished(self, result):
        frame = self.mainFrame()
        url = str(frame.url().toString())
        html = frame.toHtml()
        self.cb(url, html)
        self.crawl()

def scrape(url, html):
    pass # have scraping code here

url1 = ['http://webscraping.com', 'http://webscraping.com/blog']
url2 = ['http://webscraping.com', 'http://webscraping.com/blog']

urls = []
urls.append(url1)
urls.append(url2)

for url in urls:
    r = Render(url, cb=scrape)
The problem is you can only instantiate a single QApplication object. Here is an updated version that avoids this and then only runs Qt's execution loop when downloading a URL:
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage

class Render(QWebPage):
    def __init__(self, cb):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.cb = cb

    def crawl(self, url):
        print 'Downloading', url
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        frame = self.mainFrame()
        url = str(frame.url().toString())
        html = frame.toHtml()
        self.cb(url, html)
        self.app.quit()

def scrape(url, html):
    pass # add scraping code here
    print len(html)

r = Render(cb=scrape)
urls = ['http://webscraping.com', 'http://webscraping.com/blog']
for url in urls:
    r.crawl(url)
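As a side note, here is a hedged sketch of how the callback could collect results instead of discarding them; the results dict is my illustrative addition, not part of the answer, and it keeps the Python 2 print style of the code above.

# Illustrative variation: collect the downloaded HTML into a dict keyed by URL.
results = {}

def scrape(url, html):
    results[url] = html

r = Render(cb=scrape)
urls = ['http://webscraping.com', 'http://webscraping.com/blog']
for url in urls:
    r.crawl(url)

for url, html in results.items():
    print url, len(html)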
Unfortunately, @hoju's answer did not work for me.
Here is what works for me (basically setting up a timer to check whether loading has completed).
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QTimer
from PyQt4.QtWebKit import QWebPage

class Render(QWebPage):
    def __init__(self, url):
        QWebPage.__init__(self)
        self.frame = None
        self.mainFrame().loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        self.frame = self.mainFrame()

def go_again():
    global r, timer, urls
    if len(urls) > 0:
        print("loading", urls[0])
        r = Render(urls.pop())
        timer.start(1000)
    else:
        print("finished")
        sys.exit(app.exec_())

def check_done():
    global r, timer
    if r.frame is not None:
        timer.stop()
        html_result = r.frame.toHtml()
        #do something with html
        print("loaded")
        go_again()

app = QApplication(sys.argv)
urls = ['http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class',
        'http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class']
timer = QTimer()
timer.timeout.connect(check_done)
#check every second
go_again()
sys.exit(app.exec_())
I have this code:
from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication

class TextBrowser(QtCore.QObject):
    def __init__(self, url):
        self.some_url = url
        self.html_source = None
        QtCore.QObject.__init__(self)
        self.page = QWebPage()
        self.page.loadFinished.connect(self.get_html)
        self.page.mainFrame().load(self.some_url)

    def get_html(self):
        frame = self.page.mainFrame()
        self.html_source = unicode(frame.toHtml()).encode('utf-8')
        QtCore.QCoreApplication.quit()

def get_html_source(some_url):
    app = QApplication([])
    browser = TextBrowser(QtCore.QUrl(some_url))
    app.exec_()
    return browser.html_source
So now, if I run:
print get_html_source('http://www.google.com')
It works and returns the HTML source of the page http://www.google.com. But if I run two calls one after the other, like this:
print get_html_source('http://www.google.com')
print get_html_source('http://www.yahoo.com/')
This executes only once and outputs Google's HTML source, but after that PyCharm reports "Process finished with exit code 139" and the second call to get_html_source() doesn't execute.
I need to iterate through a list of urls and get the source code of each using QWebPage, but my implementation doesn't work.
Where can I find some information about this, or what am I doing wrong?
Consider the following: exec_ starts the event loop, and the two pages are run one after the other:
from PyQt4 import QtCore, QtGui
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication

class TextBrowser(QtGui.QDialog):
    def __init__(self, url):
        self.some_url = url
        QtGui.QDialog.__init__(self)
        self.page = QWebPage()
        self.page.loadFinished.connect(self.get_html)
        self.page.mainFrame().load(self.some_url)

    def get_html(self):
        frame = self.page.mainFrame()
        self.html = frame.toHtml()
        self.close()

def get_html_source():
    app = QApplication([])
    urls = ['http://www.google.com', 'http://www.yahoo.com/']
    out = []
    for u in urls:
        t = TextBrowser(QtCore.QUrl(u))
        t.exec_()
        out.append(t.html)
    print(out)

if __name__ == "__main__":
    get_html_source()
This program has no means to exit as it stands - I suppose you wanted to do more with the HTML than print it anyway.
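If you do want the script to terminate cleanly, one small variation (mine, not part of the answer above) is to return the collected HTML and let the caller decide what to do with it, so the script simply falls off the end of main:

# Hedged variation: return the HTML list instead of printing inside the loop.
def get_html_source(urls):
    app = QApplication([])
    out = []
    for u in urls:
        t = TextBrowser(QtCore.QUrl(u))
        t.exec_()
        out.append(t.html)
    return out

if __name__ == "__main__":
    pages = get_html_source(['http://www.google.com', 'http://www.yahoo.com/'])
    for html in pages:
        print(len(html))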
I am attempting to render multiple webpages and take screenshots of them, but I can only get it to work when rendering a single webpage. When I try it on multiple pages, the program will either stop dead in its tracks and hang forever, OR do nothing with the images and CSS and just extract the text of the site into one long block. Usually it hangs.
The code I'm using to render the webpage in memory is this:
class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # Settings
        s = self.settings()
        #s.setAttribute(QWebSettings.AutoLoadImages, False)
        s.setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
        s.setAttribute(QWebSettings.PluginsEnabled, True)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        #self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
        self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        size = self.frame.contentsSize()
        size.setWidth(1366)
        self.setViewportSize(size)
        self.app.quit()
And here's how I'm saving the image:
def run(url):
    os.chdir("output")
    r = Render(url)
    image = QImage(r.viewportSize(), QImage.Format_ARGB32)
    painter = QPainter(image)
    r.frame.render(painter)
    painter.end()
    fp = "%s.png" % os_safe_name(url)
    image.save(fp)
    os.chdir("..")
Anyone know why this is happening?
Pretty much as described in Luke's answer, I shuffled things around to avoid creating a QApplication instance for each Render.
Not the tidiest, but works for me:
import re
import sys
import time

# Tested with PySide 1.0.9, changing imports to PyQt should work identically
from PySide.QtCore import Qt, QUrl
from PySide.QtGui import QApplication, QImage, QPainter
from PySide.QtWebKit import QWebPage, QWebSettings

def os_safe_name(url):
    url = re.sub("[^a-zA-Z0-9_-]+", "_", url)
    url = re.sub("_{2,}", "_", url)
    return url

class Render(QWebPage):
    def __init__(self, url):
        QWebPage.__init__(self)

        self.url = url
        self.finished = False

        # Settings
        s = self.settings()
        #s.setAttribute(QWebSettings.AutoLoadImages, False)
        s.setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
        s.setAttribute(QWebSettings.PluginsEnabled, True)

        #self.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
        self.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)

        # When page is loaded, callback saves image to file
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        frame = self.mainFrame()
        size = frame.contentsSize()
        size.setWidth(1366)
        self.setViewportSize(size)

        image = QImage(self.viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        frame.render(painter)
        painter.end()

        self.filepath = "output/%s.png" % os_safe_name(self.url)
        image.save(self.filepath)

        self.finished = True

def run(url, app=None):
    if app is None:
        app = QApplication(sys.argv)
    r = Render(url)
    while not r.finished:
        app.processEvents()
        time.sleep(0.01)
    return r.filepath

if __name__ == '__main__':
    app = QApplication(sys.argv)
    print run("http://stackoverflow.com", app=app)
    print run("http://google.com", app=app)
I presume you are creating multiple instances of your Render class. If this is the case, then you are most likely having problems because you create multiple QApplication instances. Instead, create a single QApplication and share it between all of your Render instances.
You'll also probably need to stop using app.quit() since you want the QApplication to continue functioning. Furthermore, since app.exec_() won't exit until you call quit(), you'll need to make your own event loop instead. Something like this:
while not self.finished:
    self.app.processEvents()
    time.sleep(0.01)