Python QtWebKit save webpage to file - python

What's the best and simplest way to save a webpage displayed with QWebView() to file?
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4.QtGui import *
from PyQt4.QtScript import *
import sys
import time
# Path the page markup is written to.
currentfile = "test.htm"
app = QApplication(sys.argv)
web = QWebView()
web.load(QUrl("http://news.google.com"))
web.show()
# NOTE(review): this read runs before app.exec_() (last line) has started the
# event loop, so the page has presumably not been loaded yet -- this is the
# behaviour the question is asking about.
data = web.page().currentFrame().documentElement().toInnerXml()
# NOTE(review): the file object is never closed explicitly.
open(currentfile,"w").write(data)
sys.exit(app.exec_())

As the page loading is asynchronous, you have to wait for the loadFinished signal before trying to save it.
Then you can retrieve the page content with web.page().currentFrame().toHtml() which returns a python unicode string, which you can write to a file with the codecs module:
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
import sys
import codecs
class Downloader(QObject):
    """Load a list of URLs one after another and save each page's HTML.

    ``urlList`` is a sequence of ``(url, filename)`` pairs.  The first
    download is started from the constructor; each ``loadFinished`` signal
    saves (or skips) the current item and starts the next one.  ``done`` is
    emitted after the last item has been handled.
    """

    # Emitted when every item has been downloaded (or skipped).
    done = Signal()

    def __init__(self, urlList, parent=None):
        super(Downloader, self).__init__(parent)
        self.urlList = urlList
        # Index of the item currently being downloaded.
        self.counter = 0
        # The page does not need to be displayed to be fetched,
        # so a QWebPage is used instead of a QWebView.
        self.page = QWebPage(self)
        self.page.loadFinished.connect(self.save)
        self.startNext()

    def currentUrl(self):
        """Return the URL of the item at the current position."""
        return self.urlList[self.counter][0]

    def currentFilename(self):
        """Return the output filename of the item at the current position."""
        return self.urlList[self.counter][1]

    def startNext(self):
        """Start loading the current URL in the page's main frame."""
        # Parenthesised single-argument print works as a Python 2 statement
        # and as a Python 3 function call.
        print("Downloading %s..." % self.currentUrl())
        # Wrap explicitly instead of relying on implicit str -> QUrl conversion.
        self.page.mainFrame().load(QUrl(self.currentUrl()))

    def save(self, ok):
        """Slot for ``loadFinished``: write the page out, then move on.

        ``ok`` is the success flag delivered by the signal; a failed load is
        reported and skipped.
        """
        if ok:
            data = self.page.mainFrame().toHtml()
            with codecs.open(self.currentFilename(), encoding="utf-8", mode="w") as f:
                f.write(data)
            print("Saving %s to %s." % (self.currentUrl(), self.currentFilename()))
        else:
            print("Error while downloading %s\nSkipping." % self.currentUrl())
        self.counter += 1
        if self.counter < len(self.urlList):
            self.startNext()
        else:
            self.done.emit()
# (url, output filename) pairs, processed in order.
urlList = [
    ("http://news.google.com", "google.html"),
    ("http://www.stackoverflow.com", "stack.html"),
    ("http://www.imdb.com", "imdb.html"),
]

app = QApplication(sys.argv)
downloader = Downloader(urlList)

# Leave the event loop once the downloader reports completion.
downloader.done.connect(app.quit)

# Optional viewer so the pages can be watched while they load; it is
# disabled so user interaction cannot interrupt the page being loaded.
web = QWebView()
web.setDisabled(True)
web.setPage(downloader.page)
web.show()

sys.exit(app.exec_())

Is there a reason that the page needs to be loaded with QtWebKit first? Simply using the command-line utility wget, or curl, would do the job.

Related

how to get html source code from QWebEnginePage in pyqt5 [duplicate]

I am trying to create my own browser using Python's QWebEngineView. I followed a tutorial that worked with a previous version of PyQt5 (around 2015), but due to recent updates, some parts of that code no longer work.
I have fixed most errors but I am unable to perform html file opening/saving. I always receive a system error when I click on the save button. The following is my code for file saving:
(QMainWindow class)
# Fragment from inside the QMainWindow subclass: creates the "Save Page As..."
# action and adds it to file_menu (file_menu is defined outside this fragment).
save_file_action = QAction(QIcon("disk--pencil.png"), "Save Page As...", self)
save_file_action.setStatusTip("Save current page to file")
file_menu.addAction(save_file_action)
(save_file function)
# Asks for a target path and writes the current page's HTML to it.
def save_file(self):
# NOTE(review): spelled getSaveFilename here, while the answer's code on this
# page calls getSaveFileName (capital N) -- looks like a typo; confirm.
filename, _ = QFileDialog.getSaveFilename(self, "Save Page As", "",
"Hypertext Markup Language (*.htm *.html);;"
"All files(*.*)")
if filename:
# NOTE(review): treats toHtml() as returning the HTML directly; the answer on
# this page states that QtWebEngine's toHtml() is asynchronous and needs a callback.
html = self.browser.page().mainFrame().toHtml()
with open(filename, 'w') as f:
f.write(html)
Thank you.
The toHtml() function of QtWebEngine is asynchronous: it does not return the HTML directly. Instead, you pass it a callback, and the HTML is delivered to that callback. To turn this asynchronous process into a synchronous one, we use a QEventLoop together with a signal:
import sys
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngineWidgets import *
class Browser(QMainWindow):
    """Main window hosting a QWebEngineView with a "Save Page As..." action."""

    # Emitted by callback() once toHtml() has delivered the page source.
    htmlFinished = pyqtSignal()

    def __init__(self, *args, **kwargs):
        QMainWindow.__init__(self, *args, **kwargs)
        # Most recent HTML handed to callback().
        self.mHtml = ""

        self.view = QWebEngineView()
        self.setCentralWidget(self.view)
        self.view.setUrl(QUrl("http://www.google.com/"))

        file_menu = QMenu(self.menuBar())
        file_menu.setTitle("File")
        save_file_action = QAction(QIcon("disk--pencil.png"), "Save Page As...", self)
        file_menu.addAction(save_file_action)
        self.menuBar().addAction(file_menu.menuAction())
        save_file_action.triggered.connect(self.save_file)

    def callback(self, html):
        """Receive the HTML from toHtml() and signal that it is available."""
        self.mHtml = html
        self.htmlFinished.emit()

    def save_file(self):
        """Ask for a file name and write the current page's HTML to it.

        toHtml() is asynchronous, so a local QEventLoop is run until
        ``htmlFinished`` fires; only then is ``self.mHtml`` written out.
        """
        filename, _ = QFileDialog.getSaveFileName(
            self,
            "Save Page As",
            "",
            "Hypertext Markup Language (*.htm *.html);;" "All files(*.*)",
        )
        if not filename:
            return
        loop = QEventLoop()
        # Connect before issuing the request so the quit slot is already in
        # place whenever the callback fires.
        self.htmlFinished.connect(loop.quit)
        self.view.page().toHtml(self.callback)
        loop.exec_()
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent characters found in arbitrary web pages.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(self.mHtml)
# Script entry point: create the application, show the browser window and
# run the event loop until the application exits.
if __name__ == '__main__':
    app = QApplication(sys.argv)
    w = Browser()
    w.show()
    sys.exit(app.exec_())

Using QWebEngine to login to a SAML authorization page, wait for a cookie, and then cleanup / exit

I'm trying to write a PyQT QWebEngineView that opens a website, does a SAML login to AAD, returns, and once it sees a specific cookie (openconnect webvpn cookie), grabs the value and returns it to the "console" script which can continue processing and/or return to the command prompt.
I've glued together enough code that I can pop a browser window, step through my SAML authorization and see the cookie and cookie value. I don't know how to auto-close / exit the WebView window and "return" that cookie value and/or just the array to Python itself so I can keep processing it and/or exit. Not quite sure how to "clean up" my objects either.
I did probably fudge up my classes, initiators, and object variables. It's a kludge.
Thoughts? Ideas?
This is Arch Linux with latest Python and pyqt via package repo.
The code:
#!/usr/bin/python
#core python
import sys
#PyQT libraries
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtNetwork import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngineWidgets import *
#functions / classes
# Main window that hosts a QWebEngineView and prints every cookie added to
# the profile's cookie store.
class OpenconnectSamlAuth(QMainWindow):
# Build the web view, a dedicated profile and the cookie hook.
def __init__(self):
# run the QMainWindow initialiser
super(OpenconnectSamlAuth, self).__init__()
# create webview object
self.webview = QWebEngineView()
# profile named "storage", parented to the web view
self.profile = QWebEngineProfile("storage", self.webview)
self.cookie_store = self.profile.cookieStore()
# onCookieAdded is called for each cookie the store reports as added
self.cookie_store.cookieAdded.connect(self.onCookieAdded)
# empty list of cookies
# NOTE(review): nothing in this class appends to samlcookies.
self.samlcookies = []
# set some window options
# window width x height
self.resize(1024, 768);
# default settings
# NOTE(review): this changes the default settings object, while samlLogin
# creates its page from self.profile -- confirm the setting reaches that page.
self.mySettings = QWebEngineSettings.defaultSettings()
self.mySettings.setAttribute(QWebEngineSettings.JavascriptEnabled, True)
# Load `url` in a new page bound to self.profile and make the view the
# central widget.
def samlLogin(self,url):
# create page and load URL
webpage = QWebEnginePage(self.profile, self.webview)
self.webview.setPage(webpage)
self.webview.load(QUrl(url))
# window options
self.setCentralWidget(self.webview)
# window title
self.webview.setWindowTitle('Loading...')
self.webview.titleChanged.connect(self.updateTitle)
# Copy the page title into the web view's window title.
def updateTitle(self):
self.webview.setWindowTitle(self.webview.title())
# Slot for cookieAdded: print the cookie's name and value.
def onCookieAdded(self, cookie):
# Disabled duplicate check, kept from the original:
#for c in self.cookies:
# if c.hasSameIdentifier(cookie):
# return
#self.cookies.append(QNetworkCookie(cookie)) return;
#bytearray(c.name()).decode()
print(bytearray( QNetworkCookie(cookie).name() ).decode() )
print(bytearray( QNetworkCookie(cookie).value() ).decode() )
return
#main loop
# Create the application, show the login window and run the event loop.
def main():
# initialize Qt application object
App = QApplication(sys.argv)
# set up the browser window / session
OpenconnectWebObj = OpenconnectSamlAuth()
# load URL
OpenconnectWebObj.samlLogin("https://vpnserverurl/groupname")
# show connection window
OpenconnectWebObj.show()
# run the event loop
# NOTE(review): the answer on this page passes exec_()'s result to sys.exit(),
# so this value is presumably an exit status rather than a cookie -- confirm.
cookie = App.exec_()
print(cookie)
# exit
sys.exit()
# if called via command line; run this
if __name__ == '__main__':
main()
If you want to close the window then you must call the close() method, but in this case it seems that it requires terminating the Qt eventloop so the QCoreApplication.quit() method should be used. On the other hand, the cookie can be stored as an attribute and then used:
import sys
from PyQt5.QtCore import QCoreApplication, QUrl
from PyQt5.QtNetwork import QNetworkCookie
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtWebEngineWidgets import (
QWebEnginePage,
QWebEngineProfile,
QWebEngineSettings,
QWebEngineView,
)
class OpenconnectSamlAuth(QMainWindow):
    """Browser window that quits the application once a given cookie appears.

    Every cookie added to the profile's cookie store is printed.  When one
    named ``name_of_cookie`` is seen, a copy is stored and exposed through
    the read-only ``cookie`` property, and the Qt event loop is asked to quit.
    """

    def __init__(self, parent=None):
        super(OpenconnectSamlAuth, self).__init__(parent)
        # Copy of the cookie being waited for; stays None until it is seen.
        self._cookie = None

        self.webview = QWebEngineView()
        self.profile = QWebEngineProfile("storage", self.webview)
        self.cookie_store = self.profile.cookieStore()
        self.cookie_store.cookieAdded.connect(self.handle_cookie_added)
        self.profile.settings().setAttribute(QWebEngineSettings.JavascriptEnabled, True)

        webpage = QWebEnginePage(self.profile, self)
        self.webview.setPage(webpage)
        self.webview.titleChanged.connect(self.update_title)

        self.setCentralWidget(self.webview)
        self.resize(1024, 768)

    # Must be a real decorator: callers read ``obj.cookie`` as an attribute
    # and compare it with None.
    @property
    def cookie(self):
        """The captured QNetworkCookie, or None if it has not been seen."""
        return self._cookie

    def login(self, url):
        """Start loading ``url`` in the web view."""
        self.webview.load(QUrl.fromUserInput(url))
        self.webview.setWindowTitle("Loading...")

    def update_title(self):
        """Copy the page title into the web view's window title."""
        self.webview.setWindowTitle(self.webview.title())

    def handle_cookie_added(self, cookie):
        """Slot for ``cookieAdded``: print the cookie and capture the target."""
        print("added {name} : {value}".format(name=cookie.name(), value=cookie.value()))
        if cookie.name() == b"name_of_cookie":
            # Keep a copy of the cookie that was delivered with the signal.
            self._cookie = QNetworkCookie(cookie)
            QCoreApplication.quit()
# main loop
def main():
    """Show the login window, run the event loop and print the captured cookie."""
    app = QApplication(sys.argv)
    openconnect_webobj = OpenconnectSamlAuth()
    openconnect_webobj.login("https://vpnserverurl/groupname")
    openconnect_webobj.show()
    # Blocks until the event loop is quit.
    ret = app.exec_()
    cookie = openconnect_webobj.cookie
    if cookie is not None:
        print("results:", cookie.name(), cookie.value(), cookie.toRawForm())
    sys.exit(ret)


if __name__ == "__main__":
    main()

save html files in QWebEngineView browser

I am trying to create my own browser using Python's QWebEngineView. I followed a tutorial that worked with a previous version of PyQt5 (around 2015), but due to recent updates, some parts of that code no longer work.
I have fixed most errors but I am unable to perform html file opening/saving. I always receive a system error when I click on the save button. The following is my code for file saving:
(QMainWindow class)
# Fragment from inside the QMainWindow subclass: creates the "Save Page As..."
# action and adds it to file_menu (file_menu is defined outside this fragment).
save_file_action = QAction(QIcon("disk--pencil.png"), "Save Page As...", self)
save_file_action.setStatusTip("Save current page to file")
file_menu.addAction(save_file_action)
(save_file function)
# Asks for a target path and writes the current page's HTML to it.
def save_file(self):
# NOTE(review): spelled getSaveFilename here, while the answer's code on this
# page calls getSaveFileName (capital N) -- looks like a typo; confirm.
filename, _ = QFileDialog.getSaveFilename(self, "Save Page As", "",
"Hypertext Markup Language (*.htm *.html);;"
"All files(*.*)")
if filename:
# NOTE(review): treats toHtml() as returning the HTML directly; the answer on
# this page states that QtWebEngine's toHtml() is asynchronous and needs a callback.
html = self.browser.page().mainFrame().toHtml()
with open(filename, 'w') as f:
f.write(html)
Thank you.
The toHtml() function of QtWebEngine is asynchronous: it does not return the HTML directly. Instead, you pass it a callback, and the HTML is delivered to that callback. To turn this asynchronous process into a synchronous one, we use a QEventLoop together with a signal:
import sys
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngineWidgets import *
class Browser(QMainWindow):
    """Main window hosting a QWebEngineView with a "Save Page As..." action."""

    # Emitted by callback() once toHtml() has delivered the page source.
    htmlFinished = pyqtSignal()

    def __init__(self, *args, **kwargs):
        QMainWindow.__init__(self, *args, **kwargs)
        # Most recent HTML handed to callback().
        self.mHtml = ""

        self.view = QWebEngineView()
        self.setCentralWidget(self.view)
        self.view.setUrl(QUrl("http://www.google.com/"))

        file_menu = QMenu(self.menuBar())
        file_menu.setTitle("File")
        save_file_action = QAction(QIcon("disk--pencil.png"), "Save Page As...", self)
        file_menu.addAction(save_file_action)
        self.menuBar().addAction(file_menu.menuAction())
        save_file_action.triggered.connect(self.save_file)

    def callback(self, html):
        """Receive the HTML from toHtml() and signal that it is available."""
        self.mHtml = html
        self.htmlFinished.emit()

    def save_file(self):
        """Ask for a file name and write the current page's HTML to it.

        toHtml() is asynchronous, so a local QEventLoop is run until
        ``htmlFinished`` fires; only then is ``self.mHtml`` written out.
        """
        filename, _ = QFileDialog.getSaveFileName(
            self,
            "Save Page As",
            "",
            "Hypertext Markup Language (*.htm *.html);;" "All files(*.*)",
        )
        if not filename:
            return
        loop = QEventLoop()
        # Connect before issuing the request so the quit slot is already in
        # place whenever the callback fires.
        self.htmlFinished.connect(loop.quit)
        self.view.page().toHtml(self.callback)
        loop.exec_()
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent characters found in arbitrary web pages.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(self.mHtml)
# Script entry point: create the application, show the browser window and
# run the event loop until the application exits.
if __name__ == '__main__':
    app = QApplication(sys.argv)
    w = Browser()
    w.show()
    sys.exit(app.exec_())

QWebpage only fetches HTML once, and cannot be invoked again

I have a code:
from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
# Loads one URL in an off-screen QWebPage and stores its HTML as UTF-8 bytes.
class TextBrowser(QtCore.QObject):
# Start loading `url`; get_html runs when the page reports loadFinished.
def __init__(self, url):
self.some_url = url
# Stays None until get_html has run.
self.html_source = None
QtCore.QObject.__init__(self)
self.page = QWebPage()
self.page.loadFinished.connect(self.get_html)
self.page.mainFrame().load(self.some_url)
# Slot for loadFinished: capture the main frame's HTML, then quit the
# application's event loop.
def get_html(self):
frame = self.page.mainFrame()
# unicode() is the Python 2 text type; the result is encoded to UTF-8 bytes.
self.html_source = unicode(frame.toHtml()).encode('utf-8')
QtCore.QCoreApplication.quit()
# Fetch `some_url` with a TextBrowser and return the html_source it captured.
def get_html_source(some_url):
# NOTE(review): a new QApplication is constructed on every call; the question
# reports the process dying (exit code 139) on the second call -- presumably
# related; confirm.
app = QApplication([])
browser = TextBrowser(QtCore.QUrl(some_url))
# Runs until TextBrowser.get_html calls QCoreApplication.quit().
app.exec_()
return browser.html_source
So now, if i run:
print get_html_source('http://www.google.com')
It's okay, and returns a html source from the page http://www.google.com. But if I run another next one like this:
print get_html_source('http://www.google.com')
print get_html_source('http://www.yahoo.com/')
This executes only once: it outputs Google's HTML source, but then PyCharm reports "Process finished with exit code 139" and the second call to get_html_source() is never executed.
I need to iterate through some url list and get source code from them using by Qwebpage, but my implementation doesn't work.
Where can i find some info about my needs or what am i doing wrong?
Consider the following. exec_ starts the event loops (once), and two separate pages are running:
from PyQt4 import QtCore, QtGui
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
class TextBrowser(QtGui.QDialog):
    """Dialog that loads one URL in a QWebPage and keeps the resulting HTML.

    Intended to be run with ``exec_()``: the dialog closes itself as soon as
    the page reports ``loadFinished``, after which ``html`` holds the source.
    """

    def __init__(self, url):
        # Initialise the actual base class before touching the instance.
        QtGui.QDialog.__init__(self)
        self.some_url = url
        # Stays None if the dialog is closed before the load finishes.
        self.html = None
        self.page = QWebPage()
        self.page.loadFinished.connect(self.get_html)
        self.page.mainFrame().load(self.some_url)

    def get_html(self):
        """Slot for ``loadFinished``: store the HTML and close the dialog."""
        frame = self.page.mainFrame()
        self.html = frame.toHtml()
        self.close()
def get_html_source():
    """Fetch a fixed list of URLs one at a time and print their HTML.

    A single QApplication is created; each URL is loaded by its own
    TextBrowser dialog, whose exec_() returns once that page has loaded.
    """
    # Not used directly, but must stay referenced while the dialogs run.
    app = QApplication([])
    urls = ['http://www.google.com', 'http://www.yahoo.com/']
    out = []
    for u in urls:
        t = TextBrowser(QtCore.QUrl(u))
        t.exec_()
        out.append(t.html)
    print(out)


if __name__ == "__main__":
    get_html_source()
This program has no means to exit as it stands - I suppose you wanted to do more with the HTML than print it anyway.

login.live.com with python and mechanize?

I need to automatically login with python and mechanize on login.live.com.
The problem is,that I can't find any browser.forms(), but there should be some, since I checked the HTML code:
My code:
import urllib2
import lxml
from mechanize import Browser
# Open login.live.com with mechanize and print every form it finds.
br=Browser()
# Simulate a user: do not honour robots.txt and send a custom User-agent.
br.set_handle_robots( False )
br.addheaders = [('User-agent', 'GoogleChrome')]
# Open the site.
url = "https://login.live.com/"
rep = br.open(url)
# Python 2 print statement; the question reports that no forms are listed.
for frm in br.forms():
print frm
There should be a form named 'f1' on 'login.live.com'. Is it possible, that this part is generated dynamically?
Nero
As sbarzowski pointed out you need to execute the javascript on the site.
But you don't need to leave python for that. In fact you could automate Qt webkit.
Example (python3, tested on linux):
#!/usr/bin/env python3
import sys
from urllib.request import urlopen
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
# When True the login is shown in a visible window (main()); when False the
# page runs without showing a window (headless()).
SHOWBROWSER = True
# Placeholder credentials typed into the login form -- replace with real ones.
LOGIN = 'name@example.com'
PASS = 'foo'
class WebPage(QWebPage):
    """QWebPage that opens login.live.com and submits the login form.

    The page starts loading from the constructor.  Each ``loadFinished``
    prints the requested URL; on the login page jQuery is injected and used
    to fill in ``LOGIN`` / ``PASS`` and click the submit button.
    """

    def __init__(self, parent=None):
        super(WebPage, self).__init__(parent)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl('http://login.live.com'))

    def javaScriptConsoleMessage(self, msg, lineNumber, sourceID):
        """Print JavaScript console messages together with their origin."""
        print("JsConsole(%s:%d): %s" % (sourceID, lineNumber, msg))

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: drive the login depending on the URL."""
        import json

        frame = self.mainFrame()
        url = frame.requestedUrl().toString()
        print(url)
        if url == 'http://login.live.com/':
            frame.evaluateJavaScript(self.get_jquery())
            # json.dumps emits quoted, escaped string literals, so quotes or
            # backslashes in the credentials cannot break the script.
            frame.evaluateJavaScript(
                '''
                $('input[name="login"]').val({login})
                $('input[name="passwd"]').val({password})
                $('input[type="submit"]').click()
                '''.format(login=json.dumps(LOGIN), password=json.dumps(PASS))
            )
        if 'auth/complete-signin' in url:
            print('finished login')
            if not SHOWBROWSER:
                QApplication.quit()

    def get_jquery(self):
        """Download the jQuery source and return it as text."""
        response = urlopen('http://code.jquery.com/jquery-2.1.3.js')
        return response.read().decode('utf-8')
class Window(QWidget):
    """Top-level widget showing a QWebView that displays a WebPage."""

    def __init__(self):
        super(Window, self).__init__()
        self.view = QWebView(self)
        self.view.setPage(WebPage())
        # Let the view fill the whole window with no surrounding margin.
        layout = QVBoxLayout(self)
        layout.setMargin(0)
        layout.addWidget(self.view)
def headless():
    """Run the login without showing a window.

    The view is created but never shown; the event loop runs until something
    quits the application.
    """
    app = QApplication(sys.argv)
    view = QWebView()
    view.setPage(WebPage())
    app.exec_()
def main():
    """Run the login in a visible Window and block in the event loop."""
    app = QApplication(sys.argv)
    window = Window()
    window.show()
    app.exec_()


if __name__ == "__main__":
    # SHOWBROWSER selects between the visible and the windowless run.
    if SHOWBROWSER:
        main()
    else:
        headless()
The answer from https://login.live.com has empty body. Everything is done through javascript onload.
To see yourself you can (on Mac and Linux at least):
wget https://login.live.com/
Or in your code:
import urllib2
from mechanize import Browser
# Fetch login.live.com with mechanize and dump the raw response body.
br = Browser()

# Simulate a user: do not honour robots.txt and send a custom User-agent.
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'GoogleChrome')]

# Open the site and print what came back.
url = "https://login.live.com/"
rep = br.open(url)
print(rep.read())
It may be hard/impossible to get these forms without executing javascript, but to do so I think you will have to leave python. EDIT: Or maybe you don't have to (see other answers).
If you have no need to actually analyze the site's responses and just want to do some simple things there, you can make your requests without caring too much about the responses (you still have the HTTP status codes, which may be enough to see whether your requests succeeded).
I guess there is also actual API. I'm not familiar with MS products and don't know exactly what you are trying to do, so I cannot point to anything specific.

Categories

Resources