I would like to use PyQt/QWebview to 1) load a specific url, 2) enter information into a form, 3) click buttons/links. Mechanize does not work because I need an actual browser.
Here's my code:
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from PyQt4 import QtCore
app = QApplication(sys.argv)
web = QWebView()
web.load(QUrl("https://www.lendingclub.com/account/gotoLogin.action"))
def fillForm():
doc = web.page().mainFrame().documentElement()
user = doc.findFirst("input[id=master_username]")
passwd = doc.findFirst("input[id=master_password]")
user.setAttribute("value", "email#email.com")
passwd.setAttribute("value", "password")
button = doc.findFirst("input[id=master_sign-in-submit]")
button.evaluateJavaScript("click()")
QtCore.QObject.connect(web, QtCore.SIGNAL("loadFinished"), fillForm)
web.show()
sys.exit(app.exec_())
The page loads correctly, but no input is entered and the form is not submitted. Any ideas?
This helped me to make it work:
user.setAttribute("value", "email#email.com")
-->
user.evaluateJavaScript("this.value = 'email#email.com'")
Attribute and property are different things.
One more fix:
click() --> this.click()
For anyone looking to do this with PyQt5, this example may help as several things have changed. Obviously the javascript needs to be adjusted based on the contents of the website.
import os
import sys
from PyQt5.QtWidgets import QApplication, QVBoxLayout, QWidget
from PyQt5.QtCore import QUrl, QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEngineView
class WebPage(QWebEngineView):
def __init__(self):
QWebEngineView.__init__(self)
self.load(QUrl("https://www.url.com"))
self.loadFinished.connect(self._on_load_finished)
def _on_load_finished(self):
print("Finished Loading")
self.page().toHtml(self.Callable)
def Callable(self, html_str):
self.html = html_str
self.page().runJavaScript("document.getElementsByName('loginid')[0].value = 'email#email.com'")
self.page().runJavaScript("document.getElementsByName('password')[0].value = 'test'")
self.page().runJavaScript ("document.getElementById('signin').click()")
if __name__ == "__main__":
app = QApplication(sys.argv)
web = WebPage()
web.show()
sys.exit(app.exec_()) # only need one app, one running event loop
You might be able to do it with Webkit/QWebView but what about using selenium: http://code.google.com/p/selenium/ ? It is designed for exactly this kind of browser automation and has nice python bindings.
Related
I am working with QWebEngineView and have found that when trying to click on a pdf file link it won't open the file. I have found that QWebEngineView has not way to display pdf files on it's own. With some research I can now download pdf files and display them on their own, however I need to be able to get the link of the pdf file from QWebEngineView to know which one to download. The problem is that the .url() function only returns the url of the current webpage and doesn't seem to be affected by me clicking the link of the pdf file and I can't find any other way to get the link of the pdf file. Any ideas on how to get the link to the pdf file? Any help is appreciated.
You can use javascript to get all the links and then filter by the extension:
import sys
from PyQt5.QtCore import QCoreApplication, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
def main():
app = QApplication(sys.argv)
url = QUrl("https://www.princexml.com/samples/")
view = QWebEngineView()
def callback(links):
for link in links:
if link.endswith(".pdf"):
print(link)
QCoreApplication.quit()
def handle_load_finished(ok):
if ok:
view.page().runJavaScript(
"""
(function() {
// https://stackoverflow.com/a/3824292/6622587
var urls = [];
for(var i = document.links.length; i --> 0;)
if(document.links[i].hostname === location.hostname)
urls.push(document.links[i].href);
return urls;
})();""",
callback,
)
view.loadFinished.connect(handle_load_finished)
view.load(url)
view.resize(640, 480)
view.show()
sys.exit(app.exec_())
if __name__ == "__main__":
main()
Output:
http://www.princexml.com/howcome/2016/samples/magic6/magic.pdf
http://www.princexml.com/howcome/2016/samples/magic6/magic.pdf
https://www.princexml.com/samples/flyer/flyer.pdf
https://www.princexml.com/samples/flyer/flyer.pdf
https://www.princexml.com/samples/catalog/PrinceCatalogue.pdf
https://www.princexml.com/samples/catalog/PrinceCatalogue.pdf
http://www.princexml.com/howcome/2016/samples//malthus/essay.pdf
http://www.princexml.com/howcome/2016/samples//malthus/essay.pdf
http://www.princexml.com/howcome/2016/samples/magic8/index.pdf
http://www.princexml.com/howcome/2016/samples/magic8/index.pdf
http://www.princexml.com/howcome/2016/samples/invoice/index.pdf
https://www.princexml.com/samples/invoice/invoicesample.pdf
http://www.princexml.com/howcome/2016/samples/invoice/index.pdf
https://www.princexml.com/samples/invoice/invoicesample.pdf
Update:
If you want to download the PDF then it is not necessary to implement the above since QWebEngineView does allow downloads.
import sys
from PyQt5.QtCore import QCoreApplication, QFileInfo, QUrl
from PyQt5.QtWidgets import QApplication, QFileDialog
from PyQt5.QtWebEngineWidgets import QWebEngineView
def handle_download_erequested(download):
download.downloadProgress.connect(print)
download.stateChanged.connect(print)
download.finished.connect(lambda: print("download finished"))
old_path = download.url().path() # download.path()
suffix = QFileInfo(old_path).suffix()
path, _ = QFileDialog.getSaveFileName(None, "Save File", old_path, "*." + suffix)
if path:
download.setPath(path)
download.accept()
def main():
app = QApplication(sys.argv)
url = QUrl("https://www.princexml.com/samples/")
view = QWebEngineView()
view.page().profile().downloadRequested.connect(handle_download_erequested)
view.load(url)
view.resize(640, 480)
view.show()
sys.exit(app.exec_())
if __name__ == "__main__":
main()
Also QWebEngineView has a PDF viewer
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
def main():
print(
f"PyQt5 version: {QtCore.PYQT_VERSION_STR}, Qt version: {QtCore.QT_VERSION_STR}"
)
app = QtWidgets.QApplication(sys.argv)
view = QtWebEngineWidgets.QWebEngineView()
settings = view.settings()
settings.setAttribute(QtWebEngineWidgets.QWebEngineSettings.PluginsEnabled, True)
url = QtCore.QUrl("https://www.princexml.com/samples/invoice/invoicesample.pdf")
view.load(url)
view.resize(640, 480)
view.show()
sys.exit(app.exec_())
if __name__ == "__main__":
main()
i use python and pyqt4 for web view.
here my simple code :
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
import os
app = QApplication(sys.argv)
web_view= QWebView()
google='https://www.google.com'
web_view.load(QUrl(google))
web_view.show()
sys.exit(app.exec_())
i want to create a new button where that button any time to execute i want to take current url text link and store it in some variable.
how to do that ?how to take current url from web view ?
QWebView has the url() method that returns the current url, you must call it when you want to get the url.
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
app = QApplication(sys.argv)
w = QWidget()
lay = QVBoxLayout(w)
button = QPushButton("Click Me")
web_view= QWebView()
lay.addWidget(button)
lay.addWidget(web_view)
def foo():
print(web_view.url().toString())
button.clicked.connect(foo)
google='https://www.google.com'
web_view.load(QUrl(google))
w.show()
sys.exit(app.exec_())
I have a code:
from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
class TextBrowser(QtCore.QObject):
def __init__(self, url):
self.some_url = url
self.html_source = None
QtCore.QObject.__init__(self)
self.page = QWebPage()
self.page.loadFinished.connect(self.get_html)
self.page.mainFrame().load(self.some_url)
def get_html(self):
frame = self.page.mainFrame()
self.html_source = unicode(frame.toHtml()).encode('utf-8')
QtCore.QCoreApplication.quit()
def get_html_source(some_url):
app = QApplication([])
browser = TextBrowser(QtCore.QUrl(some_url))
app.exec_()
return browser.html_source
So now, if i run:
print get_html_source('http://www.google.com')
It's okay, and returns a html source from the page http://www.google.com. But if I run another next one like this:
print get_html_source('http://www.google.com')
print get_html_source('http://www.yahoo.com/')
This executes only once, outputs google's html source but after that the PyCharm returns "Process finished with exit code 139" and second call of get_html_source() doesn't executing.
I need to iterate through some url list and get source code from them using by Qwebpage, but my implementation doesn't work.
Where can i find some info about my needs or what am i doing wrong?
Consider the following. exec_ starts the event loops (once), and two separate pages are running:
from PyQt4 import QtCore, QtGui
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
class TextBrowser(QtGui.QDialog):
def __init__(self, url):
self.some_url = url
QtCore.QObject.__init__(self)
self.page = QWebPage()
self.page.loadFinished.connect(self.get_html)
self.page.mainFrame().load(self.some_url)
def get_html(self):
frame = self.page.mainFrame()
self.html = frame.toHtml()
self.close()
def get_html_source():
app = QApplication([])
urls = ['http://www.google.com', 'http://www.yahoo.com/']
out = []
for u in urls:
t = TextBrowser(QtCore.QUrl(u))
t.exec_()
out.append(t.html)
print(out)
if __name__ == "__main__":
get_html_source()
This program has no means to exit as it stands - I suppose you wanted to do more with the HTML than print it anyway.
i'm trying to show a webpage in a frame, but i can't figure out how to do it, also because i can't find the right documentation and/or tutorial for the QtWebkit.
Thanks.
import sys
from PyQt4 import QtGui, QtCore, QtWebKit
class MainWindow(QtGui.QMainWindow):
def __init__(self):
QtGui.QMainWindow.__init__(self)
self.resize(350, 250)
self.setWindowTitle('MainWindow')
self.statusBar().showMessage('Loading...')
self.web = QtWebKit.QWebView()
self.web.load(QtCore.QUrl('google.com'))
self.web.show()
app = QtGui.QApplication(sys.argv)
main = MainWindow()
main.show()
sys.exit(app.exec_())
for some doc, you can try the riverbank documentation (though code examples are still in C…)
It seems that your code is fine (maybe add http://?. But did you try to do that without classes? This should work:
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
app = QApplication(sys.argv)
web = QWebView()
web.load(QUrl("http://google.com"))
web.show()
sys.exit(app.exec_())
Otherwise, there is a problem somewhere else (a proxy maybe?)
The following code does not reach searchResults. I have printed out documentElement.findFirst('input[name="btnG"]') and found it to be <input name="btnG" type="submit" value="Google Search" class="lsb"> so we are good up to that point. Note that my goal is not to scrape Google but it's simpler to learn via the well known and public Google.
#!/usr/bin/python
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage, QWebView
class Scrape(QApplication):
def __init__(self):
super(Scrape, self).__init__(None)
self.webView = QWebView()
self.webView.loadFinished.connect(self.searchForm)
def load(self, url):
self.webView.load(QUrl(url))
def searchForm(self):
documentElement = self.webView.page().currentFrame().documentElement()
inputSearch = documentElement.findFirst('input[title="Google Search"]')
inputSearch.setAttribute('value', 'test')
self.webView.loadFinished.disconnect(self.searchForm)
self.webView.loadFinished.connect(self.searchResults)
documentElement.findFirst('input[name="btnG"]').evaluateJavaScript('click()')
def searchResults(self):
for element in documentElement.find('li[class="g"]'):
print unicode(element.toOuterXml())
self.exit()
my_scrape = Scrape()
my_scrape.load('http://google.com/ncr')
my_scrape.exec_()
I have finally figured it out! and submitted to http://drupal4hu.com/node/266