QWebpage only fetches HTML once, and cannot be invoked again

QWebpage only fetches HTML once, and cannot be invoked again - python

I have the following code:
from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
class TextBrowser(QtCore.QObject):
    """Load one URL in a QWebPage and capture the resulting HTML.

    Loading starts in the constructor. When the page emits ``loadFinished``
    the main frame's HTML is stored UTF-8 encoded in ``html_source`` and
    ``QCoreApplication.quit()`` is called.
    """

    def __init__(self, url):
        """Create the page and start loading ``url``."""
        super(TextBrowser, self).__init__()
        self.html_source = None  # set by get_html()
        self.some_url = url
        self.page = QWebPage()
        self.page.loadFinished.connect(self.get_html)
        self.page.mainFrame().load(self.some_url)

    def get_html(self):
        """Store the main frame's HTML and ask the application to quit."""
        markup = self.page.mainFrame().toHtml()
        self.html_source = unicode(markup).encode('utf-8')
        QtCore.QCoreApplication.quit()
def get_html_source(some_url):
    """Return the HTML captured by a ``TextBrowser`` for ``some_url``.

    Every call constructs its own ``QApplication`` and runs its event loop
    until the browser quits it.
    """
    app = QApplication([])
    target = QtCore.QUrl(some_url)
    browser = TextBrowser(target)
    app.exec_()
    return browser.html_source
So now, if i run:
print get_html_source('http://www.google.com')
This works and returns the HTML source of the page http://www.google.com. But if I then make a second call, like this:
print get_html_source('http://www.google.com')
print get_html_source('http://www.yahoo.com/')
Only the first call executes: it outputs Google's HTML source, but after that PyCharm reports "Process finished with exit code 139" and the second call of get_html_source() is never executed.
I need to iterate through a list of URLs and get the source code of each one using QWebPage, but my implementation doesn't work.
Where can I find information about how to do this, or what am I doing wrong?

Consider the following. exec_ starts the event loops (once), and two separate pages are running:
from PyQt4 import QtCore, QtGui
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
class TextBrowser(QtGui.QDialog):
    """Dialog that loads one URL and keeps the HTML of its main frame.

    The load is started in the constructor. Callers run the dialog with
    ``exec_()``; when the page emits ``loadFinished`` the HTML is stored in
    ``html`` and the dialog closes itself.
    """

    def __init__(self, url):
        """Initialise the dialog and start loading ``url``.

        ``url`` is handed unchanged to ``QWebFrame.load``.
        """
        # Initialise the actual base class (QDialog) rather than QObject.
        super(TextBrowser, self).__init__()
        self.some_url = url
        # Defined up front so reading ``html`` never raises AttributeError,
        # even if the dialog is closed before the load finishes.
        self.html = None
        self.page = QWebPage()
        self.page.loadFinished.connect(self.get_html)
        self.page.mainFrame().load(self.some_url)

    def get_html(self):
        """Store the main frame's HTML in ``html`` and close the dialog."""
        frame = self.page.mainFrame()
        self.html = frame.toHtml()
        self.close()
def get_html_source():
    """Fetch two hard-coded URLs one after another and print their HTML.

    One ``QApplication`` is created for the whole run; each URL gets its own
    ``TextBrowser`` whose ``exec_()`` is called before its ``html`` is read.
    """
    app = QApplication([])  # created once, before any dialog
    collected = []
    for address in ('http://www.google.com', 'http://www.yahoo.com/'):
        browser = TextBrowser(QtCore.QUrl(address))
        browser.exec_()
        collected.append(browser.html)
    print(collected)
# Run the fetch only when this file is executed as a script.
if __name__ == "__main__":
    get_html_source()
This program has no means to exit as it stands - I suppose you wanted to do more with the HTML than print it anyway.

Related

Using QWebEngine to login to a SAML authorization page, wait for a cookie, and then cleanup / exit

I'm trying to write a PyQT QWebEngineView that opens a website, does a SAML login to AAD, returns, and once it sees a specific cookie (openconnect webvpn cookie), grabs the value and returns it to the "console" script which can continue processing and/or return to the command prompt.
I've glued together enough code that I can pop a browser window, step through my SAML authorization and see the cookie and cookie value. I don't know how to auto-close / exit the WebView window and "return" that cookie value and/or just the array to Python itself so I can keep processing it and/or exit. Not quite sure how to "clean up" my objects either.
I did probably fudge up my classes, initiators, and object variables. It's a kludge.
Thoughts? Ideas?
This is Arch Linux with latest Python and pyqt via package repo.
The code:
#!/usr/bin/python
#core python
import sys
#PyQT libraries
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtNetwork import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngineWidgets import *
#functions / classes
class OpenconnectSamlAuth(QMainWindow):
    """Main window hosting the web view used for the SAML login.

    Each cookie added to the view's profile is printed: name first, then
    value.
    """

    def __init__(self):
        """Build the web view, its profile and the cookie hook."""
        super(OpenconnectSamlAuth, self).__init__()

        # Web view plus a profile named "storage" that is parented to it.
        self.webview = QWebEngineView()
        self.profile = QWebEngineProfile("storage", self.webview)

        # Be notified about every cookie added to the profile's store.
        self.cookie_store = self.profile.cookieStore()
        self.cookie_store.cookieAdded.connect(self.onCookieAdded)

        # Starts empty; nothing in this class appends to it.
        self.samlcookies = []

        # Window size: width x height.
        self.resize(1024, 768)

        # Enable JavaScript on the default web-engine settings.
        self.mySettings = QWebEngineSettings.defaultSettings()
        self.mySettings.setAttribute(QWebEngineSettings.JavascriptEnabled, True)

    def samlLogin(self, url):
        """Create a page on the profile, load ``url`` and show the view."""
        self.webview.setPage(QWebEnginePage(self.profile, self.webview))
        self.webview.load(QUrl(url))
        self.setCentralWidget(self.webview)
        self.webview.setWindowTitle('Loading...')
        self.webview.titleChanged.connect(self.updateTitle)

    def updateTitle(self):
        """Copy the current page title to the view's window title."""
        self.webview.setWindowTitle(self.webview.title())

    def onCookieAdded(self, cookie):
        """Print the name and then the value of a newly added cookie."""
        # NOTE: de-duplicating cookies via hasSameIdentifier() and collecting
        # them in a list was left disabled by the author.
        added = QNetworkCookie(cookie)
        for raw in (added.name(), added.value()):
            print(bytearray(raw).decode())
#main loop
def main():
    """Create the application, open the login window and run the event loop."""
    app = QApplication(sys.argv)

    # Browser window for the SAML session.
    window = OpenconnectSamlAuth()
    window.samlLogin("https://vpnserverurl/groupname")
    window.show()

    # exec_() blocks until the event loop ends; whatever it returns is printed.
    result = app.exec_()
    print(result)
    sys.exit()
#if called via command line; run this
# Script entry point.
if __name__ == '__main__':
    main()

If you want to close the window then you must call the close() method, but in this case it seems that it requires terminating the Qt eventloop so the QCoreApplication.quit() method should be used. On the other hand, the cookie can be stored as an attribute and then used:
import sys
from PyQt5.QtCore import QCoreApplication, QUrl
from PyQt5.QtNetwork import QNetworkCookie
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtWebEngineWidgets import (
QWebEnginePage,
QWebEngineProfile,
QWebEngineSettings,
QWebEngineView,
)
class OpenconnectSamlAuth(QMainWindow):
    """Main window that performs the login and captures one named cookie.

    When a cookie named ``name_of_cookie`` is added to the profile's cookie
    store, a copy is kept (readable through the ``cookie`` property) and the
    application is asked to quit.
    """

    def __init__(self, parent=None):
        """Build the view, the "storage" profile, the page and the hooks."""
        super(OpenconnectSamlAuth, self).__init__(parent)
        self._cookie = None  # set by handle_cookie_added()
        self.webview = QWebEngineView()
        self.profile = QWebEngineProfile("storage", self.webview)
        self.cookie_store = self.profile.cookieStore()
        self.cookie_store.cookieAdded.connect(self.handle_cookie_added)
        self.profile.settings().setAttribute(QWebEngineSettings.JavascriptEnabled, True)
        webpage = QWebEnginePage(self.profile, self)
        self.webview.setPage(webpage)
        self.webview.titleChanged.connect(self.update_title)
        self.setCentralWidget(self.webview)
        self.resize(1024, 768)

    # Must be a real property: callers read ``obj.cookie`` as an attribute
    # and compare it with None.
    @property
    def cookie(self):
        """The captured ``QNetworkCookie``, or ``None`` if none was seen."""
        return self._cookie

    def login(self, url):
        """Load ``url`` (parsed with ``QUrl.fromUserInput``) in the view."""
        self.webview.load(QUrl.fromUserInput(url))
        self.webview.setWindowTitle("Loading...")

    def update_title(self):
        """Copy the current page title to the view's window title."""
        self.webview.setWindowTitle(self.webview.title())

    def handle_cookie_added(self, cookie):
        """Log every added cookie; keep the wanted one and quit."""
        print("added {name} : {value}".format(name=cookie.name(), value=cookie.value()))
        if cookie.name() == b"name_of_cookie":
            # Store a copy of the cookie passed to this slot.
            self._cookie = QNetworkCookie(cookie)
            QCoreApplication.quit()
# main loop
def main():
    """Run the login window and print the captured cookie, if any.

    The process exits with the value returned by ``exec_()``.
    """
    app = QApplication(sys.argv)
    window = OpenconnectSamlAuth()
    window.login("https://vpnserverurl/groupname")
    window.show()
    exit_code = app.exec_()

    # Read the window's ``cookie`` attribute once the event loop has ended.
    found = window.cookie
    if found is not None:
        print("results:", found.name(), found.value(), found.toRawForm())
    sys.exit(exit_code)
# Script entry point.
if __name__ == "__main__":
    main()

How to execute QWebEngine in Python function

I have a QWebEngine class to read webpages and create a BeautifulSoup object for each of them.
Here is the code:
import sys
from bs4 import BeautifulSoup
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Load a series of URLs one by one and parse each into BeautifulSoup.

    Parsed documents are appended to ``soup`` in load order. When no URL is
    left, ``QtWidgets.qApp.quit()`` is called.
    """

    def __init__(self):
        """Connect the load hook and start with an empty result list."""
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)
        self.soup = []
        # Empty iterator so fetchNext() is safe even before start() is called.
        self._urls = iter(())

    def start(self, urls):
        """Begin loading ``urls`` (any iterable of URL strings)."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next URL; return False when none is left, else True."""
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        """Parse ``html`` with lxml, store it, then continue or quit."""
        self.soup.append(BeautifulSoup(html, 'lxml'))
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Request the page HTML; it is delivered to processCurrentPage."""
        self.toHtml(self.processCurrentPage)
Here is another function to call WebPage class:
def get_soup(urls):
    """Start a ``WebPage`` over ``urls`` and return its ``soup`` list.

    No event loop is run here: the function returns right after ``start()``.
    """
    app = QtWidgets.QApplication(sys.argv)
    page = WebPage()
    page.start(urls)
    return page.soup
Here is the main:
# Script entry point: fetch the two pages and keep the parsed results.
if __name__ == "__main__":
    urls = [
        "http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sh",
        "http://www.hkexnews.hk/sdw/search/mutualmarket_c.aspx?t=sz",
    ]
    soups = get_soup(urls)
However, the program restarts when I execute it.
What should be changed?

This is a problem that I have run into before. Analyzing it, I found that the QApplication is destroyed before the QWebEnginePage, which causes the QWebEngineProfile to be deleted and, in turn, the QWebEnginePage to crash. The solution is to give the app a greater scope by making it a global variable.
On the other hand, you have to call exec_() so that the event loop, which allows the signals to operate, is running:
# ...
# Module-level reference so the QApplication is not destroyed when
# get_soup() returns.
app = None


def get_soup(urls):
    """Load every URL in ``urls`` and return the list of parsed pages.

    The application object is created on the first call only; later calls
    reuse it (or any QApplication that already exists) instead of
    constructing a second one.
    """
    global app
    if app is None:
        app = QtWidgets.QApplication.instance() or QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    # Run the event loop so the page's signals are delivered.
    app.exec_()
    return webpage.soup
# ...
Note: It seems that the QTBUG-75547 related to this problem has been solved for Qt5>=5.12.4 so probably in a next release of PyQtWebEngine that bug will no longer be observed.

pyqt4: Loop main Render class?

I have a PyQt4 class that downloads webpages that I use for scrapping purposes.
When I pass a list of URLs to the Render class while instantiating it, it works fine (single call), but when I try to loop over [r = Render(url, cb=scrape)] with multiple lists of URLs, execution stops or hangs after the first loop without any error being thrown.
I want to loop over the class separately because the URL lists belong to different categories and the extracted contents have to be stored separately.
I also learned that only one app can be instantiated. If that is the case, how can I leave the app's event loop without quitting the app, so that a new URL list can be used by the same app?
I am stuck with this issue for a while. Thanks in advance
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
def __init__(self, urls, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.urls = urls
self.cb = cb
self.crawl()
self.app.exec_()
def crawl(self):
if self.urls:
url = self.urls.pop(0)
print 'Downloading', url
self.mainFrame().load(QUrl(url))
else:
self.app.quit()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.crawl()
def scrape(url, html):
    """Placeholder callback: receives a URL and its HTML and does nothing."""
    return None  # have scraping code here
# Two URL batches; a separate Render is constructed for each batch.
url1 = ['http://webscraping.com', 'http://webscraping.com/blog']
url2 = ['http://webscraping.com', 'http://webscraping.com/blog']
urls = [url1, url2]
for url in urls:
    r = Render(url, cb=scrape)

The problem is you can only instantiate a single QApplication object. Here is an updated version that avoids this and then only runs Qt's execution loop when downloading a URL:
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.cb = cb
def crawl(self, url):
print 'Downloading', url
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.app.quit()
def scrape(url, html):
    """Example callback: print the length of the downloaded HTML."""
    # add scraping code here
    print(len(html))
# One Render instance is shared by all downloads.
r = Render(cb=scrape)
urls = ['http://webscraping.com', 'http://webscraping.com/blog']
for url in urls:
    r.crawl(url)

Unfortunately, @hoju's answer did not work for me.
Here is what works for me (basically setting up a timer to check whether loading has completed).
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QTimer
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
    """Load one URL; ``frame`` stays ``None`` until the load has finished."""

    def __init__(self, url):
        """Start loading ``url`` in the page's main frame."""
        QWebPage.__init__(self)
        self.frame = None
        main_frame = self.mainFrame()
        main_frame.loadFinished.connect(self._loadFinished)
        main_frame.load(QUrl(url))

    def _loadFinished(self, result):
        """Expose the main frame through ``frame`` once loading is done."""
        self.frame = self.mainFrame()
def go_again():
    """Start loading the next queued URL, or quit when the queue is empty.

    Reads the module-level ``urls``, ``timer`` and ``app`` and rebinds the
    module-level ``r`` to the new ``Render``.
    """
    global r
    if urls:
        # Take the URL from the front so the one logged is the one loaded.
        next_url = urls.pop(0)
        print("loading", next_url)
        r = Render(next_url)
        timer.start(1000)
    else:
        print("finished")
        # Ask the event loop to stop instead of starting a nested exec_().
        # The zero-delay single shot also works if the loop has not started.
        QTimer.singleShot(0, app.quit)
def check_done():
    """Timer slot: once the current page has loaded, move on to the next."""
    if r.frame is None:
        # Not loaded yet; this slot runs again on the next timer tick.
        return
    timer.stop()
    html_result = r.frame.toHtml()
    # do something with html
    print("loaded")
    go_again()
app = QApplication(sys.argv)
urls = [
    'http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class',
    'http://stackoverflow.com/questions/34603886/pyqt4-loop-main-render-class',
]
# The timer's timeout drives check_done().
timer = QTimer()
timer.timeout.connect(check_done)
# Kick off the first download, then enter the event loop.
go_again()
sys.exit(app.exec_())

Filling out a form using PyQt and QWebview

I would like to use PyQt/QWebview to 1) load a specific url, 2) enter information into a form, 3) click buttons/links. Mechanize does not work because I need an actual browser.
Here's my code:
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from PyQt4 import QtCore
def fillForm():
    """Set the login fields of the loaded page and click the submit button."""
    doc = web.page().mainFrame().documentElement()
    user = doc.findFirst("input[id=master_username]")
    user.setAttribute("value", "email#email.com")
    passwd = doc.findFirst("input[id=master_password]")
    passwd.setAttribute("value", "password")
    button = doc.findFirst("input[id=master_sign-in-submit]")
    button.evaluateJavaScript("click()")


app = QApplication(sys.argv)
web = QWebView()
web.load(QUrl("https://www.lendingclub.com/account/gotoLogin.action"))
# Old-style connection of the view's "loadFinished" signal to fillForm.
QtCore.QObject.connect(web, QtCore.SIGNAL("loadFinished"), fillForm)
web.show()
sys.exit(app.exec_())
The page loads correctly, but no input is entered and the form is not submitted. Any ideas?

This helped me to make it work:
user.setAttribute("value", "email#email.com")
-->
user.evaluateJavaScript("this.value = 'email#email.com'")
Attribute and property are different things.
One more fix:
click() --> this.click()

For anyone looking to do this with PyQt5, this example may help as several things have changed. Obviously the javascript needs to be adjusted based on the contents of the website.
import os
import sys
from PyQt5.QtWidgets import QApplication, QVBoxLayout, QWidget
from PyQt5.QtCore import QUrl, QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEngineView
class WebPage(QWebEngineView):
    """Web view that fills in and submits a login form after the page loads."""

    def __init__(self):
        """Start loading the target URL and hook the load-finished signal."""
        super(WebPage, self).__init__()
        self.load(QUrl("https://www.url.com"))
        self.loadFinished.connect(self._on_load_finished)

    def _on_load_finished(self):
        """Request the page HTML; it is delivered to ``Callable``."""
        print("Finished Loading")
        self.page().toHtml(self.Callable)

    def Callable(self, html_str):
        """Keep the HTML in ``html`` and run the form-filling JavaScript."""
        self.html = html_str
        # Run in order: set the two fields, then click the sign-in element.
        scripts = (
            "document.getElementsByName('loginid')[0].value = 'email#email.com'",
            "document.getElementsByName('password')[0].value = 'test'",
            "document.getElementById('signin').click()",
        )
        for script in scripts:
            self.page().runJavaScript(script)
if __name__ == "__main__":
    # only need one app, one running event loop
    app = QApplication(sys.argv)
    web = WebPage()
    web.show()
    sys.exit(app.exec_())

You might be able to do it with Webkit/QWebView but what about using selenium: http://code.google.com/p/selenium/ ? It is designed for exactly this kind of browser automation and has nice python bindings.

Python QtWebKit save webpage to file

What's the best and simplest way to save a webpage displayed with QWebView() to file?
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4.QtGui import *
from PyQt4.QtScript import *
import sys
import time
# File the page content is written to.
currentfile = "test.htm"
app = QApplication(sys.argv)
web = QWebView()
# Request the page and show the view.
web.load(QUrl("http://news.google.com"))
web.show()
# The document is read and written here, before app.exec_() below has
# started the event loop.
data = web.page().currentFrame().documentElement().toInnerXml()
open(currentfile,"w").write(data)
sys.exit(app.exec_())

As the page loading is asynchronous, you have to wait for the loadFinished signal before trying to save it.
Then you can retrieve the page content with web.page().currentFrame().toHtml() which returns a python unicode string, which you can write to a file with the codecs module:
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
import sys
import codecs
class Downloader(QObject):
    """Download pages one after another and save each as a UTF-8 file.

    ``urlList`` is indexed as ``urlList[i][0]`` for the URL and
    ``urlList[i][1]`` for the output filename. The first download starts in
    the constructor; ``done`` is emitted after the last item.
    """

    # Emitted once every item has been processed
    done = Signal()

    def __init__(self, urlList, parent=None):
        super(Downloader, self).__init__(parent)
        self.counter = 0
        self.urlList = urlList
        # A QWebPage is used rather than a QWebView, since the page
        # probably does not need to be displayed
        self.page = QWebPage(self)
        self.page.loadFinished.connect(self.save)
        self.startNext()

    def currentUrl(self):
        """URL of the item currently being processed."""
        entry = self.urlList[self.counter]
        return entry[0]

    def currentFilename(self):
        """Output filename of the item currently being processed."""
        entry = self.urlList[self.counter]
        return entry[1]

    def startNext(self):
        """Announce and start loading the current item."""
        print("Downloading %s..." % self.currentUrl())
        self.page.mainFrame().load(self.currentUrl())

    def save(self, ok):
        """Write the loaded page to disk (or report failure), then advance."""
        if not ok:
            print("Error while downloading %s\nSkipping." % self.currentUrl())
        else:
            html = self.page.mainFrame().toHtml()
            with codecs.open(self.currentFilename(), encoding="utf-8", mode="w") as out:
                out.write(html)
            print("Saving %s to %s." % (self.currentUrl(), self.currentFilename()))
        self.counter += 1
        if self.counter >= len(self.urlList):
            self.done.emit()
        else:
            self.startNext()
# (url, output filename) pairs handed to the Downloader
urlList = [("http://news.google.com", "google.html"),
("http://www.stackoverflow.com","stack.html"),
("http://www.imdb.com", "imdb.html")]
app = QApplication(sys.argv)
downloader = Downloader(urlList)
# Quit the application when the downloader emits done
downloader.done.connect(app.quit)
# Optional view that displays the downloader's page
web = QWebView()
# Disabled to prevent user action that would interrupt the current page loading
web.setDisabled(True)
web.setPage(downloader.page)
web.show()
sys.exit(app.exec_())

Is there a reason that the page needs to be loaded with QtWebKit first? Simply using the command-line utility wget, or curl, would do the job.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

QWebpage only fetches HTML once, and cannot be invoked again - python

Related

Using QWebEngine to login to a SAML authorization page, wait for a cookie, and then cleanup / exit

How to execute QWebEngine in Python function

pyqt4: Loop main Render class?

Filling out a form using PyQt and QWebview

Python QtWebKit save webpage to file

Categories

Resources