This question already has an answer here:
Scrape multiple urls using QWebPage
(1 answer)
Closed 4 years ago.
So I am still confused about why the iteration always stops at the third turn.
Here is my code :
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
import sys
import numpy as np
from bs4 import BeautifulSoup as soup
class Client(QWebEnginePage):
    def __init__(self, url):
        global app
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ""
        self.loadFinished.connect(self.on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print("Load Finished")

    def Callable(self, data):
        self.html = data
        self.app.quit()
linkgroup = []
linkgroup.append("https://docs.python.org/3/whatsnew/3.7.html")
linkgroup.append("https://docs.python.org/3/tutorial/index.html")
linkgroup.append("https://docs.python.org/3/installing/index.html")
linkgroup.append("https://docs.python.org/3/reference/index.html")
linkgroup.append("https://docs.python.org/3/using/index.html")
for h in range(0, len(linkgroup)):
    # Setting Url
    url = linkgroup[h]
    print(url)
    print("Loop Index : " + str(h))
    client_response = Client(url)
The output is this:
https://docs.python.org/3/whatsnew/3.7.html
Loop Index : 0
Load Finished
https://docs.python.org/3/tutorial/index.html
Loop Index : 1
Load Finished
https://docs.python.org/3/installing/index.html
Loop Index : 2
As you can see, the remaining iterations of the loop don't get executed; after the third URL the output never shows the response from the Client class.
As commented by pschill above, you should have only one QApplication. How about passing it as a parameter to the constructor? Something like:
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
import sys
import numpy as np
from bs4 import BeautifulSoup as soup
class Client(QWebEnginePage):
    def __init__(self, url, app):
        self.app = app
        QWebEnginePage.__init__(self)
        self.html = ""
        self.loadFinished.connect(self.on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print("Load Finished")

    def Callable(self, data):
        self.html = data
        self.app.quit()
linkgroup = []
linkgroup.append("https://docs.python.org/3/whatsnew/3.7.html")
linkgroup.append("https://docs.python.org/3/tutorial/index.html")
linkgroup.append("https://docs.python.org/3/installing/index.html")
linkgroup.append("https://docs.python.org/3/reference/index.html")
linkgroup.append("https://docs.python.org/3/using/index.html")
app = QApplication(sys.argv)
for h in range(0, len(linkgroup)):
    # Setting Url
    url = linkgroup[h]
    print(url)
    print("Loop Index : " + str(h))
    client_response = Client(url, app)
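For example (a minimal sketch, not part of the answer above), the loop body could hand each fetched page to the BeautifulSoup alias that is already imported:

    # Hypothetical extension of the loop body: parse each fetched page
    # with the already-imported BeautifulSoup alias "soup".
    page = soup(client_response.html, "html.parser")
    print(page.title.string if page.title else "(no <title> found)")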
Related
I wrote a program to extract image links from webcomics. However, when I run it, it only extracts the image links from the last chapter, not from all chapters. What is the issue with my program?
I have tried several approaches, but nothing has worked.
from PyQt5 import QtNetwork, QtCore
from requests_html import HTML
from functools import cached_property
from PyQt5.QtCore import QCoreApplication, QUrl

url1 = "https://saytruyen.net/truyen-su-tro-lai-cua-phap-su-hac-am-sau-66666-nam.html"

class Manager:
    def __init__(self):
        self.manager.finished.connect(self.handle_response)

    @cached_property
    def manager(self):
        return QtNetwork.QNetworkAccessManager()

    def start(self):
        self.start_request(QtCore.QUrl(url1))

    def start_request(self, url):
        request = QtNetwork.QNetworkRequest(url)
        self.manager.get(request)

    def handle_response(self, reply):
        err = reply.error()
        if err == QtNetwork.QNetworkReply.NoError:
            self.process(str(reply.readAll(), 'utf-8'))
        else:
            print("Error occurred: ", err)
            print(reply.errorString())

    def process(self, data):
        html = HTML(html=data)
        rs = html.find("#list-chapter a", first=False)
        for i in reversed(rs):
            url2 = "https://saytruyen.net/" + i.attrs["href"]
            # print(url2)
            # self.start_request(QtCore.QUrl(url2))
            req = QtNetwork.QNetworkRequest(QUrl(url2))
            self.nam = QtNetwork.QNetworkAccessManager()
            self.nam.finished.connect(self.handleResponse)
            self.nam.get(req)

    def handleResponse(self, reply):
        er = reply.error()
        if er == QtNetwork.QNetworkReply.NoError:
            bytes_string = reply.readAll()
            html2 = HTML(html=str(bytes_string, 'utf-8'))
            rs_c = html2.find("#lst_content img")
            for x in rs_c:
                img = "https://saytruyen.net/" + x.attrs['src']
                print(img)
        else:
            print("Error occurred: ", er)
            print(reply.errorString())
        QCoreApplication.quit()
There are two problems:
- the QNetworkAccessManager used for the downloads is continuously recreated; since a network request is asynchronous, it isn't processed instantly, and because the manager is overwritten on every cycle of the for loop, each pending request is destroyed along with it; the result is that every previous request is destroyed and only the last one "survives";
- the application quits as soon as the first reply is received, preventing all other requests from being processed.
The solution is to create a single manager for the download process in __init__, and quit only once all requests have been received.
class Manager:
    def __init__(self):
        self.manager.finished.connect(self.handle_response)
        self.nam = QtNetwork.QNetworkAccessManager()
        self.nam.finished.connect(self.handleResponse)
        self.urls = set()

    # ...

    def process(self, data):
        html = HTML(html=data)
        rs = html.find("#list-chapter a", first=False)
        for i in reversed(rs):
            url2 = QUrl("https://saytruyen.net/" + i.attrs["href"])
            if url2 in self.urls:
                continue
            self.urls.add(url2)
            req = QtNetwork.QNetworkRequest(url2)
            self.nam.get(req)

    def handleResponse(self, reply):
        self.urls.discard(reply.url())
        er = reply.error()
        if er == QtNetwork.QNetworkReply.NoError:
            bytes_string = reply.readAll()
            html2 = HTML(html=str(bytes_string, 'utf-8'))
            rs_c = html2.find("#lst_content img")
            for x in rs_c:
                img = "https://saytruyen.net/" + x.attrs['src']
                print(img)
        else:
            print("Error occurred: ", er)
            print(reply.errorString())
        if not self.urls:
            QCoreApplication.quit()
Note that it's usually enough (and better) to have a single network manager and properly handle responses based on queued requests, but for simple situations like this one, having two managers isn't a huge problem.
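For reference, here is a minimal sketch of that single-manager pattern (the attribute slot, class and method names are hypothetical, not taken from the code above): tag each request so that one finished handler can tell index pages from chapter pages.

from PyQt5 import QtCore, QtNetwork

KIND = QtNetwork.QNetworkRequest.User  # custom request-attribute slot

class SingleManager:
    def __init__(self):
        self.nam = QtNetwork.QNetworkAccessManager()
        self.nam.finished.connect(self.dispatch)
        self.pending = 0

    def get(self, url, kind):
        req = QtNetwork.QNetworkRequest(QtCore.QUrl(url))
        req.setAttribute(KIND, kind)  # remember what this request is for
        self.pending += 1
        self.nam.get(req)

    def dispatch(self, reply):
        self.pending -= 1
        kind = reply.request().attribute(KIND)
        if kind == "index":
            pass  # parse the chapter list here, then self.get(chapter_url, "chapter")
        elif kind == "chapter":
            pass  # extract the image urls from the chapter page here
        reply.deleteLater()
        if not self.pending:
            QtCore.QCoreApplication.quit()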
I followed this solution and it worked fine at first:
Connect QWebEngine to proxy
But I can't change the proxy again after changing it once; QWebEngineView just ignores the new setting.
The original code contains more than this, so I reduced it to a working example that demonstrates my problem.
Assume our IP is "x.x.x.x", proxy1's IP is "y.y.y.y", and proxy2's IP is "z.z.z.z".
When you run the sample code you should see:
x.x.x.x
y.y.y.y
z.z.z.z
but I got:
x.x.x.x
y.y.y.y
y.y.y.y
So, any ideas how to solve that?
Sample running code (just change the proxy info in the test function; you can try any working proxies from here: http://free-proxy.cz/en/):
""" NODOC """
__author__ = 'ozgur'
__creation_date__ = '7.10.2020 14:04'
import sys
import time
from PyQt5 import QtNetwork
from PyQt5.QtCore import QMetaObject, QTimer, QUrl
from PyQt5.QtTest import QTest
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication, QMainWindow, QVBoxLayout
class RosaRafined(QMainWindow):
# noinspection PyUnusedLocal
def __init__(self, qapp: QApplication):
super(self.__class__, self).__init__()
self.qapp = qapp
self._setup_ui()
self.counter = 0
def _setup_ui(self):
self.setObjectName("Form")
self.resize(1024, 768)
self.verticalLayout = QVBoxLayout(self)
self.verticalLayout.setObjectName("verticalLayout")
self.webengine = QWebEngineView(self)
self.webengine.setObjectName("webengine")
self.webengine.resize(1024, 768)
# self.webengine.page().profile().setPersistentStoragePath(self.temppath)
# self.webengine.page().profile().setCachePath(self.temppath)
self.verticalLayout.addWidget(self.webengine)
QMetaObject.connectSlotsByName(self)
QTimer().singleShot(1000, self.test)
#staticmethod
def _change_proxy(ip, port, user, passwd):
proxy = QtNetwork.QNetworkProxy()
proxy.setType(QtNetwork.QNetworkProxy.HttpProxy)
proxy.setHostName(ip)
proxy.setPort(port)
proxy.setUser(user)
proxy.setPassword(passwd)
QtNetwork.QNetworkProxy.setApplicationProxy(proxy)
def test(self):
testurl = QUrl("https://whatismyipaddress.com")
if self.counter == 0:
print("Will show your real ip, visiting url")
self.webengine.setUrl(testurl)
# your ip x.x.x.x shown, normally
elif self.counter == 1:
self._change_proxy("y.y.y.y", 80, "user", "pass")
QTest.qWait(2)
print("Will show your proxy1 ip")
self.webengine.setUrl(testurl)
# your ip y.y.y.y shown, normally
elif self.counter == 2:
self._change_proxy("x.x.x.x", 80, "user", "pass")
QTest.qWait(2)
print("Will show your proxy2 ip, unfortunately not...")
self.webengine.setUrl(testurl)
# still seeing y.y.y.y , it is not normal
else:
self.qapp.quit()
self.counter += 1
QTimer().singleShot(5000, self.test)
if __name__ == '__main__':
app = QApplication(sys.argv)
form = RosaRafined(app)
form.show()
app.exec_()
sys.exit()
I'm making a simple web scraper that downloads the item images of some champions from a site. I put a for loop over a few champions, but it only executes two of them and then closes without giving any error!
import bs4 as bs
import sys, os
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        print("#1 __init__")
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('#2 On Load finished')

    def Callable(self, html_str):
        print("#3 Callable\n")
        self.html = html_str
        self.app.quit()

def already_exist(image_name):
    for _, _, folder in os.walk('Images'):
        if image_name in folder:
            return False
        else:
            return True

def ImageDownload(url):
    image_name = url.split("/")
    try:
        if already_exist(image_name[-1]):
            full_path = "Images/" + image_name[-1]
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % image_name)
        else:
            print("Image already Downloaded >: %s" % image_name[-1])
    except:
        print("Error Download")

def main():
    champions = ['Amumu', 'Akali', 'Zed', 'Nunu']  # champions
    for champ in champions:
        try:
            print("\nDownloading Images >: %s" % champ)
            data = Page('https://www.probuilds.net/champions/details/%s' % champ.strip())
            soup = bs.BeautifulSoup(data.html, 'html.parser')
            items = soup.find_all('div', {'class': 'items'})
            for photos in items:
                images = photos.find_all('img')
                for image in images:
                    ImageDownload(image['src'])
        except:
            print("Shi...")

main()
I'm getting no error, but the program only executes two iterations; this is the problem. Can someone help me?
It seems that the QWebEnginePage does not close correctly; it is also advisable to reuse it instead of creating another QWebEnginePage each time. So, using an old answer as a basis, I have implemented the following solution:
import os
import sys
import bs4 as bs
import urllib.request
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self, html):
        self.process(self.url(), html)
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def process(self, url, html):
        print('loaded: [%d chars] %s' % (len(html), url.toString()))

class ScrapePage(WebPage):
    def __init__(self):
        super(ScrapePage, self).__init__()
        self.results = set()

    def process(self, url, html):
        soup = bs.BeautifulSoup(html, 'html.parser')
        items = soup.find_all('div', {'class': 'items'})
        for photos in items:
            images = photos.find_all('img')
            for image in images:
                self.results.add(image['src'])

def already_exist(image_name):
    for _, _, folder in os.walk('Images'):
        if image_name in folder:
            return False
        else:
            return True

def ImageDownload(url):
    image_name = url.split("/")
    try:
        if already_exist(image_name[-1]):
            full_path = "Images/" + image_name[-1]
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % image_name)
        else:
            print("Image already Downloaded >: %s" % image_name[-1])
    except:
        print("Error Download")

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    webpage = ScrapePage()
    champions = ['Amumu', 'Akali', 'Zed', 'Nunu']
    base_url = 'https://www.probuilds.net/champions/details/'
    urls = []
    for champ in champions:
        url = QtCore.QUrl(base_url).resolved(QtCore.QUrl(champ))
        urls.append(url)
    webpage.start(urls)
    app.exec_()
    for url in webpage.results:
        ImageDownload(url)
I have this code that receives all of the networking resources of a web page.
I took this code from this site, so I don't know exactly how it works, but I know that it receives all of the networking resources of a web page, which is what I need.
This is my code:
import sys, time
from PySide.QtCore import QUrl, SIGNAL
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest

# reload(sys)
# sys.setdefaultencoding('utf-8')

fn_log = 'url_dd.txt'
fp_log = open(fn_log, 'ab+')

class WebPage(QWebPage):
    def __init__(self, logger=None, parent=None):
        super(WebPage, self).__init__(parent)

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        sys.stderr.write('JavaScript error at line number %d\n' % (lineNumber))
        sys.stderr.write('%s\n' % (message, ))
        sys.stderr.write('Source ID: %s\n' % (sourceID, ))

class Crawler(QApplication):
    def __init__(self, url):
        super(Crawler, self).__init__(sys.argv)
        self.url = url
        self.web_view = QWebView()
        self.web_page = WebPage()
        self.web_view.setPage(self.web_page)
        self.web_frame = self.web_page.mainFrame()
        self.network = NetworkAccessManager()
        self.web_page.setNetworkAccessManager(self.network)
        self.settings = self.web_page.settings().globalSettings()
        self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
        QWebSettings.clearMemoryCaches()
        self.web_view.resize(1024, 9000)
        self.connect(self.web_page, SIGNAL('loadFinished(bool)'), self.loadFinished)
        print('Before loading')
        self.web_view.load(QUrl(self.url))
        print('After loading')

    def loadFinished(self, ok):
        print('Start loadFinished()')
        print('Start writing')
        # with open('content_dd.txt', 'ab+') as fp:
        #     fp.write(self.web_frame.toHtml().toUtf8())
        print('End writing')
        print('End loadFinished()')
        try:
            self.quit()
        except Exception as e:
            print('FATAL ERROR: %s' % (str(e)))

class NetworkAccessManager(QNetworkAccessManager):
    def __init__(self):
        super(NetworkAccessManager, self).__init__()
        # QNetworkAccessManager.__init__(self)
        self.connect(self, SIGNAL('finished(QNetworkReply*)'), self.finishd)

    def createRequest(self, operation, request, data):
        # url = request.url().toString()
        self.setNetworkAccessible(self.Accessible)
        return QNetworkAccessManager.createRequest(self, operation, request, data)

    def finishd(self, reply):
        print('In NetworkAccessManager finishd')
        url = str(reply.url().toString())
        log = '%s: %s\n' % (time.ctime(), url)
        # fp_log.write(log)
        print(reply)
        print(reply.request())
        print(log)
        print(url)

if __name__ == '__main__':
    url = 'http://need4bit.com'
    crawler = Crawler(url)
    sys.exit(crawler.exec_())
How should I modify this code so that it saves all of the resources into a directory?
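Nothing in the code above saves the reply bodies yet. As a minimal, untested sketch of one direction (names and approach are my assumptions, not from the original code): in finishd(), peek the reply's buffer instead of readAll(), so the page itself can still consume the data, and write it under a filename derived from the URL. Note that peek() only sees the data currently buffered, so streamed resources may be saved incompletely.

import os, re

def finishd(self, reply):
    url = str(reply.url().toString())
    data = reply.peek(reply.bytesAvailable())  # non-destructive read of the buffer
    # build a filesystem-safe name from the url (hypothetical scheme)
    name = re.sub(r'[^A-Za-z0-9._-]', '_', url)[:200] or 'index'
    os.makedirs('resources', exist_ok=True)
    with open(os.path.join('resources', name), 'wb') as fp:
        fp.write(data.data())  # QByteArray -> bytes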
Is it possible to perform web requests asynchronously (like with asyncio) under PyQt4 (QWebPage)?
For example, how can I call multiple URLs in parallel with this code:
#!/usr/bin/env python3.4
import sys
import signal
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import QWebPage

class Crawler(QWebPage):
    def __init__(self, url):
        QWebPage.__init__(self)
        self._url = url
        self.content = ''

    def crawl(self):
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
        self.mainFrame().load(QUrl(self._url))

    def _finished_loading(self, result):
        self.content = self.mainFrame().toHtml()
        print(self.content)
        sys.exit(0)

def main():
    app = QApplication(sys.argv)
    crawler = Crawler('http://www.example.com')
    crawler.crawl()
    sys.exit(app.exec_())

if __name__ == '__main__':
    main()
Thanks
You cannot make self.mainFrame().load(QUrl(self._url)) work through asyncio, sorry -- the method is implemented in Qt itself.
But you can install the quamash event loop and asynchronously call the aiohttp.request coroutine to get web pages.
That way doesn't work with QWebPage though.
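A hedged sketch of that quamash + aiohttp combination (assuming both packages are installed; modern async/await syntax is used here, while Python 3.4 itself would need the older yield-from style):

import sys
import asyncio
import aiohttp
from quamash import QEventLoop
from PyQt4.QtGui import QApplication

urls = ['http://www.example.com', 'http://www.example.org']

async def fetch(session, url):
    # plain HTTP fetch; no QWebPage involved, so no JavaScript is executed
    async with session.get(url) as resp:
        return await resp.text()

async def crawl():
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
    for url, html in zip(urls, pages):
        print(url, len(html))

app = QApplication(sys.argv)
loop = QEventLoop(app)          # quamash drives asyncio from the Qt event loop
asyncio.set_event_loop(loop)
with loop:
    loop.run_until_complete(crawl())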
Requests are already done asynchronously, so all you need to do is create multiple instances of QWebPage.
Here's a simple demo based on your example script:
import sys, signal
from PyQt4 import QtCore, QtGui, QtWebKit

urls = [
    'http://qt-project.org/doc/qt-4.8/qwebelement.html',
    'http://qt-project.org/doc/qt-4.8/qwebframe.html',
    'http://qt-project.org/doc/qt-4.8/qwebinspector.html',
    'http://qt-project.org/doc/qt-4.8/qwebpage.html',
    'http://qt-project.org/doc/qt-4.8/qwebsettings.html',
    'http://qt-project.org/doc/qt-4.8/qwebview.html',
]

class Crawler(QtWebKit.QWebPage):
    def __init__(self, url, identifier):
        super(Crawler, self).__init__()
        self.loadFinished.connect(self._finished_loading)
        self._id = identifier
        self._url = url
        self.content = ''

    def crawl(self):
        self.mainFrame().load(QtCore.QUrl(self._url))

    def _finished_loading(self, result):
        self.content = self.mainFrame().toHtml()
        print('[%d] %s' % (self._id, self._url))
        print(self.content[:250].rstrip(), '...')
        print()
        self.deleteLater()

if __name__ == '__main__':
    app = QtGui.QApplication(sys.argv)
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    crawlers = []
    for index, url in enumerate(urls):
        crawlers.append(Crawler(url, index))
        crawlers[-1].crawl()
    sys.exit(app.exec_())