Submitting Google with PyQT QWebElement - python

The following code does not reach searchResults. I have printed out documentElement.findFirst('input[name="btnG"]') and found it to be <input name="btnG" type="submit" value="Google Search" class="lsb"> so we are good up to that point. Note that my goal is not to scrape Google but it's simpler to learn via the well known and public Google.
#!/usr/bin/python
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage, QWebView
class Scrape(QApplication):
def __init__(self):
super(Scrape, self).__init__(None)
self.webView = QWebView()
self.webView.loadFinished.connect(self.searchForm)
def load(self, url):
self.webView.load(QUrl(url))
def searchForm(self):
documentElement = self.webView.page().currentFrame().documentElement()
inputSearch = documentElement.findFirst('input[title="Google Search"]')
inputSearch.setAttribute('value', 'test')
self.webView.loadFinished.disconnect(self.searchForm)
self.webView.loadFinished.connect(self.searchResults)
documentElement.findFirst('input[name="btnG"]').evaluateJavaScript('click()')
def searchResults(self):
for element in documentElement.find('li[class="g"]'):
print unicode(element.toOuterXml())
self.exit()
# Create the application, queue the first page load and enter the event loop.
app = Scrape()
app.load('http://google.com/ncr')
app.exec_()

I have finally figured it out! and submitted to http://drupal4hu.com/node/266

Related

How to do Web Scraping with BeautifulSoup without affecting my PyQt5 application?

I am making an application that involves taking websites and getting the links they have (and also images), but these operations take some time and cause the application to freeze. I've tried using QThread and QRunnable to try and separate the execution of the application from the execution of the functions I use.
I programmed this small example of how it works in my app:
from PyQt5 import QtWidgets,QtCore
from PyQt5.QtWidgets import QApplication, QMainWindow,QLineEdit,QVBoxLayout
from PyQt5.QtCore import QThread
from bs4 import BeautifulSoup
import requests
class mainwindow(QtWidgets.QMainWindow):
    """Fixed-size window with a line edit and a button that starts the link search."""

    def __init__(self):
        super().__init__()
        self.setFixedSize(600, 64)

        # Text field the user should be able to type in during the search.
        line_edit = QtWidgets.QLineEdit()
        line_edit.setPlaceholderText('You should be able to type here while the links are searched')
        line_edit.setFixedSize(600, 32)
        self.Line = line_edit
        self.layout().addWidget(line_edit)

        # Button placed directly below the text field.
        button = QtWidgets.QPushButton()
        button.setText('Seach links')
        button.move(0, 32)
        button.setFixedSize(600, 32)
        self.Button = button
        self.layout().addWidget(button)

        # A click on the button runs search_start.
        button.clicked.connect(lambda: self.search_start())

    def search_start(self):
        """Create a search_class object and call its func_seach method directly."""
        self.sclass = search_class()
        self.sclass.func_seach()
## Class that searches a page for links
class search_class(QThread):
    """Collect the href of every <a> element of a fixed page and print the list."""

    def func_seach(self):
        """Download the page, gather the link targets and print them."""
        url = 'https://youtu.be/dQw4w9WgXcQ'
        page_html = requests.get(url).text
        anchors = BeautifulSoup(page_html, 'html.parser').find_all('a')
        links_list = [anchor.get('href') for anchor in anchors]
        print(links_list)
if __name__ == '__main__':
    # Build the Qt application, show the window and run the event loop.
    app = QtWidgets.QApplication([])
    window = mainwindow()
    window.show()
    app.exec_()
How can I prevent the app from freezing while executing that function?
I think you may have already noticed that I'm pretty new to this. I need to know what's wrong, what I'm doing wrong and how to fix it.
Thank you very much in advance.
The solution was to move the body of the second class into a method of the first class, and to start that method in a separate thread from another method:
from PyQt5 import QtWidgets,QtCore
from PyQt5.QtWidgets import QApplication, QMainWindow,QLineEdit,QVBoxLayout
from PyQt5.QtCore import QThread
from bs4 import BeautifulSoup
import requests
class mainwindow(QtWidgets.QMainWindow):
    """Fixed-size window whose button runs the link search in a background thread."""

    def __init__(self):
        super().__init__()
        self.setFixedSize(600, 64)
        ## Set a QLineEdit
        self.Line = QtWidgets.QLineEdit()
        self.Line.setPlaceholderText('You should be able to type here while the links are searched')
        self.Line.setFixedSize(600, 32)
        self.layout().addWidget(self.Line)
        ## Set a QPushButton
        self.Button = QtWidgets.QPushButton()
        self.Button.setText('Seach links')
        self.Button.move(0, 32)
        self.Button.setFixedSize(600, 32)
        self.layout().addWidget(self.Button)
        ## Connect button with function
        self.Button.clicked.connect(lambda: self.search_start())

    ## Function calls func_seach function as a thread
    def search_start(self):
        """Run func_seach in a worker thread so the GUI thread is not blocked."""
        # Thread is not among the imports of this snippet, so import it here;
        # without this the call below raises NameError.
        from threading import Thread

        # Daemon thread: a request still in flight does not keep the process
        # alive after the window has been closed.
        thread = Thread(target=self.func_seach, daemon=True)
        thread.start()

    def func_seach(self):
        """Download the page, collect the href of every <a> element and print the list."""
        url = 'https://youtu.be/dQw4w9WgXcQ'
        links_list = []
        for link in BeautifulSoup(requests.get(url).text, 'html.parser').find_all('a'):
            links_list.append(link.get('href'))
        print(links_list)
if __name__ == '__main__':
    # Build the Qt application, show the window and run the event loop.
    app = QtWidgets.QApplication([])
    window = mainwindow()
    window.show()
    app.exec_()

copy from clipboard

I'm still quite a newbie with Python and PyQt5, so I have a really basic question. My idea is to build an application to download a URL. Here is a picture of my design:
When I right-click a URL on any website, copy it, go to my application and press the toolbar icon named "Add URL", the URL should be pasted immediately into the QLineEdit.
Here is my code:
from PyQt5.QtWidgets import*
from PyQt5.QtCore import*
from PyQt5.QtGui import*
from PyQt5.uic import loadUiType
from PyQt5.QtWidgets import QApplication ,QMainWindow,QAction
from os import path
import sys
# Load the form file that sits next to this module; only the form class is kept.
_UI_PATH = path.join(path.dirname(__file__), "main.ui")
FORM_CLASS, _ = loadUiType(_UI_PATH)
class MainApp(QMainWindow, FORM_CLASS):
    """Main window of the "Download URL" application, combined with the loaded form class."""

    def __init__(self, parent=None):
        # A single super() call initialises the window with *parent*. The
        # extra QMainWindow.__init__(self) that used to follow ran the base
        # initialiser a second time, and without the parent.
        super(MainApp, self).__init__(parent)
        self.setupUi(self)
        self.idm_UI()
        # NOTE(review): idm_Buttons is not defined in the visible part of
        # this class - confirm it exists before relying on this call.
        self.idm_Buttons()

    def idm_UI(self):
        """Set the window title, size and icon and build the toolbar."""
        self.setWindowTitle("Download URL")
        self.setFixedSize(631, 400)
        self.setWindowIcon(QIcon("download.jpg"))
        # Toolbar actions: one closes the window, one adds the copied URL.
        exitAct = QAction(QIcon('exit.png'), 'Exit', self)
        exitAct.triggered.connect(self.idm_exit)
        pasteAction = QAction(QIcon("paste.png"), "Add URL", self)
        pasteAction.triggered.connect(self.idm_add)
        self.toolbar = self.addToolBar('Toolbar')
        self.toolbar.addAction(exitAct)
        self.toolbar.addAction(pasteAction)

    def idm_exit(self):
        """Close the window."""
        self.close()

    def idm_add(self):  # What is the right method that I can use to paste the URL inside lineEdit_4?
        pass
The method is defined as:
def idm_add(self):
So, what function or method do I need to use to paste the URL inside the LineEditor box?
What you want is to paste the text that is stored in the clipboard; for this you must use QClipboard.
def idm_add(self):
    """Put the current clipboard text into ``lineEdit_4``."""
    copied_text = QApplication.clipboard().text()
    self.lineEdit_4.setText(copied_text)

QWebEngineView - how to open links in system browser

I have the following code snippet working in PySide and need to translate it to work in PySide2.
The purpose is to force all links to open in the system browser when clicked (rather than the widget trying to load them):
from PySide.QtWebKit import QWebView, QWebPage
class HtmlView(QWebView):
    """Web view that delegates all links and connects linkClicked to openWebsite."""

    def __init__(self, parent=None):
        QWebView.__init__(self, parent)
        page = self.page()
        page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)  # not working in PySide2
        # openWebsite is not part of this snippet.
        self.linkClicked.connect(self.openWebsite)  # not working in PySide2
This was my attempt of a translation:
from PySide2.QtWebEngineWidgets import QWebEngineView, QWebEnginePage
class HtmlView(QWebEngineView):
    """Attempted PySide2 port of the PySide view; the two marked calls do not work."""

    def __init__(self, parent=None):
        QWebEngineView.__init__(self, parent)
        page = self.page()
        page.setLinkDelegationPolicy(QWebEnginePage.DelegateAllLinks)  # not working in PySide2
        # openWebsite is not part of this snippet.
        self.linkClicked.connect(self.openWebsite)  # not working in PySide2
However, QWebEngineView.linkClicked does not exist and neither does QWebEngineView.setLinkDelegationPolicy or
QWebEnginePage.DelegateAllLinks.
What is the best way to achieve this in PySide2 without the above?
Edit: I checked the QEvents that are triggered but no event seems to be fired off when a link is clicked, so without the linkClicked event from PySide/Qt4.8 I have no idea how to hook into this.
Thanks,
frank
You have to use acceptNavigationRequest:
This function is called upon receiving a request to navigate to the
specified url by means of the specified navigation type type.
isMainFrame indicates whether the request corresponds to the main
frame or a child frame. If the function returns true, the navigation
request is accepted and url is loaded. The default implementation
accepts all navigation requests.
In your case you must reject and open the url when the type is QWebEnginePage::NavigationTypeLinkClicked.
from PySide2.QtCore import QUrl
from PySide2.QtGui import QDesktopServices
from PySide2.QtWidgets import QApplication
from PySide2.QtWebEngineWidgets import QWebEngineView, QWebEnginePage
class WebEnginePage(QWebEnginePage):
    """Page that passes clicked links to QDesktopServices instead of loading them."""

    def acceptNavigationRequest(self, url, _type, isMainFrame):
        """Return False for clicked links (after opening them externally), True otherwise."""
        if _type != QWebEnginePage.NavigationTypeLinkClicked:
            # Any other kind of navigation is accepted and loaded in the page.
            return True
        QDesktopServices.openUrl(url)
        return False
class HtmlView(QWebEngineView):
    """Web engine view that uses a WebEnginePage as its page."""

    def __init__(self, *args, **kwargs):
        super(HtmlView, self).__init__(*args, **kwargs)
        # Replace the default page; the view is passed as the page's parent.
        custom_page = WebEnginePage(self)
        self.setPage(custom_page)
if __name__ == '__main__':
    import sys

    # Show one view on the question page and run the event loop until exit.
    app = QApplication(sys.argv)
    view = HtmlView()
    view.load(QUrl("https://stackoverflow.com/questions/47736408/pyside2-qwebview-how-to-open-links-in-system-browser"))
    view.show()
    exit_code = app.exec_()
    sys.exit(exit_code)

QWebpage only fetches HTML once, and cannot be invoked again

I have a code:
from PyQt4 import QtCore
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
class TextBrowser(QtCore.QObject):
    """Load one URL in a QWebPage and keep its HTML as a UTF-8 encoded string."""

    def __init__(self, url):
        QtCore.QObject.__init__(self)
        self.some_url = url
        self.html_source = None  # filled in by get_html
        page = QWebPage()
        page.loadFinished.connect(self.get_html)
        self.page = page
        page.mainFrame().load(url)

    def get_html(self):
        """Store the frame's HTML and ask the application to quit."""
        markup = self.page.mainFrame().toHtml()
        self.html_source = unicode(markup).encode('utf-8')
        QtCore.QCoreApplication.quit()
def get_html_source(some_url):
    """Return the HTML that a TextBrowser collected for *some_url*."""
    app = QApplication([])
    target = QtCore.QUrl(some_url)
    browser = TextBrowser(target)
    # Returns once TextBrowser.get_html has called QCoreApplication.quit().
    app.exec_()
    return browser.html_source
So now, if i run:
print get_html_source('http://www.google.com')
It's okay, and returns a html source from the page http://www.google.com. But if I run another next one like this:
print get_html_source('http://www.google.com')
print get_html_source('http://www.yahoo.com/')
This executes only once: it outputs Google's HTML source, but then PyCharm reports "Process finished with exit code 139" and the second call to get_html_source() is never executed.
I need to iterate through a list of URLs and get the source code of each one using QWebPage, but my implementation doesn't work.
Where can I find some information about this, or what am I doing wrong?
Consider the following. exec_ starts the event loops (once), and two separate pages are running:
from PyQt4 import QtCore, QtGui
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtGui import QApplication
class TextBrowser(QtGui.QDialog):
    """Dialog that loads one URL in a QWebPage and stores its HTML in ``self.html``."""

    def __init__(self, url):
        # Initialise the class this type actually derives from. Calling
        # QObject.__init__ here would skip QDialog's own initialiser.
        QtGui.QDialog.__init__(self)
        self.some_url = url
        # Defined up front so the attribute exists even if loading never finishes.
        self.html = None
        self.page = QWebPage()
        self.page.loadFinished.connect(self.get_html)
        self.page.mainFrame().load(self.some_url)

    def get_html(self):
        """Store the main frame's HTML and close the dialog."""
        frame = self.page.mainFrame()
        self.html = frame.toHtml()
        self.close()
def get_html_source():
    """Fetch two fixed pages one after another and print the collected HTML."""
    app = QApplication([])
    out = []
    for address in ('http://www.google.com', 'http://www.yahoo.com/'):
        browser = TextBrowser(QtCore.QUrl(address))
        # exec_() returns after get_html has closed the dialog.
        browser.exec_()
        out.append(browser.html)
    print(out)
if "__main__" == __name__:
    # Run the demonstration only when executed as a script.
    get_html_source()
This program has no means to exit as it stands - I suppose you wanted to do more with the HTML than print it anyway.

Filling out a form using PyQt and QWebview

I would like to use PyQt/QWebview to 1) load a specific url, 2) enter information into a form, 3) click buttons/links. Mechanize does not work because I need an actual browser.
Here's my code:
import sys
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from PyQt4 import QtCore
# Create the application and a web view, then start loading the login page.
app = QApplication(sys.argv)
web = QWebView()
login_url = QUrl("https://www.lendingclub.com/account/gotoLogin.action")
web.load(login_url)
def fillForm():
    """Enter the credentials into the loaded login form and submit it."""
    doc = web.page().mainFrame().documentElement()
    user = doc.findFirst("input[id=master_username]")
    passwd = doc.findFirst("input[id=master_password]")
    # Assign the live DOM `value` property. setAttribute("value", ...) only
    # changes the HTML attribute, which is a different thing.
    user.evaluateJavaScript("this.value = 'email#email.com'")
    passwd.evaluateJavaScript("this.value = 'password'")
    button = doc.findFirst("input[id=master_sign-in-submit]")
    # The script runs with the element as `this`; a bare click() would be
    # looked up as a global function instead of the button's method.
    button.evaluateJavaScript("this.click()")
# Connect with the new-style signal attribute. The old-style
# SIGNAL("loadFinished") carried no argument signature, so it did not name
# the view's loadFinished(bool) signal and fillForm was never called.
web.loadFinished.connect(fillForm)
web.show()
sys.exit(app.exec_())
The page loads correctly, but no input is entered and the form is not submitted. Any ideas?
This helped me to make it work:
user.setAttribute("value", "email#email.com")
-->
user.evaluateJavaScript("this.value = 'email#email.com'")
Attribute and property are different things.
One more fix:
click() --> this.click()
For anyone looking to do this with PyQt5, this example may help as several things have changed. Obviously the javascript needs to be adjusted based on the contents of the website.
import os
import sys
from PyQt5.QtWidgets import QApplication, QVBoxLayout, QWidget
from PyQt5.QtCore import QUrl, QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEngineView
class WebPage(QWebEngineView):
    """View that loads a page and, once loaded, fills in and submits its login form."""

    def __init__(self):
        super().__init__()
        self.load(QUrl("https://www.url.com"))
        self.loadFinished.connect(self._on_load_finished)

    def _on_load_finished(self):
        """Report the finished load and request the page HTML."""
        print("Finished Loading")
        # toHtml delivers its result through the callback.
        self.page().toHtml(self.Callable)

    def Callable(self, html_str):
        """Keep the HTML, then run the scripts that fill in and submit the form."""
        self.html = html_str
        page = self.page()
        scripts = (
            "document.getElementsByName('loginid')[0].value = 'email#email.com'",
            "document.getElementsByName('password')[0].value = 'test'",
            "document.getElementById('signin').click()",
        )
        for script in scripts:
            page.runJavaScript(script)
if __name__ == "__main__":
    app = QApplication(sys.argv)
    view = WebPage()
    view.show()
    # only need one app, one running event loop
    exit_code = app.exec_()
    sys.exit(exit_code)
You might be able to do it with Webkit/QWebView but what about using selenium: http://code.google.com/p/selenium/ ? It is designed for exactly this kind of browser automation and has nice python bindings.

Categories

Resources