I'm making a simple WebScraping that download the image of the items of some champions of a site, I put a "for" with 5 characters and it only executes 2 of them and then closes without giving any error!
import bs4 as bs
import sys,os
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.html = ''
print("#1 __init__")
self.loadFinished.connect(self._on_load_finished)
self.load(QUrl(url))
self.app.exec_()
def _on_load_finished(self):
self.html = self.toHtml(self.Callable)
print('#2 On Load finished')
def Callable(self, html_str):
print("#3 Callable\n")
self.html = html_str
self.app.quit()
def already_exist(image_name):
for _, _, folder in os.walk('Images'):
if image_name in folder:
return False
else:
return True
def ImageDownload(url):
image_name = url.split("/")
try:
if already_exist(image_name[-1]):
full_path = "Images/" + image_name[-1]
urllib.request.urlretrieve(url, full_path)
print("Download %s" % image_name)
else:
print("Image already Downloaded >: %s" % image_name[-1])
except:
print("Error Download")
def main():
champions = ['Amumu','Akali','Zed','Nunu'] #champions
for champ in champions:
try:
print("\nDownloading Images >: %s"% champ)
data = Page('https://www.probuilds.net/champions/details/%s' % champ.strip())
soup = bs.BeautifulSoup(data.html, 'html.parser')
items = soup.find_all('div',{'class':'items'})
for photos in items:
images = photos.find_all('img')
for image in images:
ImageDownload(image['src'])
except:
print("Shi...")
main()
i'm getting no error but the program only executes 2 times this is the problem, someone help me !!!
What it seems is that the QWebEnginePage does not close correctly, it is also advisable to reuse instead of creating another QWebEnginePage, so using an old answer as a basis I have implemented the following solution:
import os
import sys
import bs4 as bs
import urllib.request
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
def __init__(self):
super(WebPage, self).__init__()
self.loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QtCore.QUrl(url))
return True
def processCurrentPage(self, html):
self.process(self.url(), html)
if not self.fetchNext():
QtWidgets.qApp.quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def process(self, url, html):
print('loaded: [%d chars] %s' % (len(html), url.toString()))
class ScrapePage(WebPage):
def __init__(self):
super(ScrapePage, self).__init__()
self.results = set()
def process(self, url, html):
soup = bs.BeautifulSoup(html, 'html.parser')
items = soup.find_all('div',{'class':'items'})
for photos in items:
images = photos.find_all('img')
for image in images:
self.results.add(image['src'])
def already_exist(image_name):
for _, _, folder in os.walk('Images'):
if image_name in folder:
return False
else:
return True
def ImageDownload(url):
image_name = url.split("/")
try:
if already_exist(image_name[-1]):
full_path = "Images/" + image_name[-1]
urllib.request.urlretrieve(url, full_path)
print("Download %s" % image_name)
else:
print("Image already Downloaded >: %s" % image_name[-1])
except:
print("Error Download")
if __name__ == '__main__':
app = QtWidgets.QApplication(sys.argv)
webpage = ScrapePage()
champions = ['Amumu','Akali','Zed','Nunu']
base_url = 'https://www.probuilds.net/champions/details/'
urls = []
for champ in champions:
url = QtCore.QUrl(base_url).resolved(QtCore.QUrl(champ))
urls.append(url)
webpage.start(urls)
app.exec_()
for url in webpage.results:
ImageDownload(url)
Related
I have made a Desktop Application using Python and used PyQt5 and Pytube which could download video from youtube. When download is in Progress, I want to show user an animation. In Fact I did it, but when the file is getting downloaded the PyQt window seems like freezing and everything just gets paused until the download is complete. So, Does anyone know why is this happening? How do I fix it?
Here's the code snippet:
def download_created(self, qual): # Used in 'selection' method
selected_stream = yt.streams.get_by_resolution(qual)
self.progress_func()
try:
self.download_btn.setCurrentIndex(-1)
selected_stream.download(self.askLocation() + "/")
except:
pass
# This gets the quality that the user chooses
def selection(self):
global quality
quality = self.download_btn.currentText()
try:
self.download_created(quality) # Calls a method called 'download'
except:
self.start_anime()
# Fetching the details about the Link from Youtube
def download_youtube(self):
global check
if check != self.get_input():
check = self.get_input()
self.download_btn.clear()
enter_url = self.get_input()
try:
global yt
yt = pytube.YouTube(
enter_url,
on_progress_callback = on_progress,
on_complete_callback = self.complete_func)
self.start_anime()
except:
self.input_error()
VIDEO_TITLE = (yt.title)
global VIDEO_ID
VIDEO_ID = (yt.video_id)
videos = yt.streams.filter(mime_type="video/mp4", progressive="True")
# Display all the available qualities
for i in videos:
self.download_btn.addItem(i.resolution)
self.download_btn.currentIndexChanged.connect(self.selection)
You have to execute the time consuming tasks in another thread, for example in your case the task of getting the streams and downloading.
import sys
import threading
from functools import cached_property
from PyQt5 import QtCore, QtWidgets
import pytube
class QPyTube(QtCore.QObject):
initialized = QtCore.pyqtSignal(bool, str)
download_started = QtCore.pyqtSignal()
download_progress_changed = QtCore.pyqtSignal(int)
download_finished = QtCore.pyqtSignal()
def __init__(self, url):
super().__init__()
self._url = url
self._yt = None
self._mutex = threading.Lock()
threading.Thread(target=self._init, daemon=True).start()
#property
def url(self):
return self._url
#cached_property
def resolutions(self):
return list()
def _init(self):
with self._mutex:
self.resolutions.clear()
try:
self._yt = pytube.YouTube(
self.url,
on_progress_callback=self._on_progress,
on_complete_callback=self._on_complete,
)
streams = self._yt.streams.filter(mime_type="video/mp4", progressive="True")
except Exception as e:
self.initialized.emit(False, str(e))
return
with self._mutex:
self.resolutions = [stream.resolution for stream in streams]
self.initialized.emit(True, "")
def download(self, resolution, directory):
threading.Thread(
target=self._download, args=(resolution, directory), daemon=True
).start()
def _download(self, resolution, directory):
stream = self._yt.streams.get_by_resolution(resolution)
self.download_started.emit()
stream.download(directory)
def _on_progress(self, stream, chunk, bytes_remaining):
self.download_progress_changed.emit(
100 * (stream.filesize - bytes_remaining) // stream.filesize
)
def _on_complete(self, stream, filepath):
self.download_finished.emit()
class MainWindow(QtWidgets.QMainWindow):
def __init__(self, parent=None):
super().__init__(parent)
self.le_url = QtWidgets.QLineEdit("http://youtube.com/watch?v=2lAe1cqCOXo")
self.lbl_error = QtWidgets.QLabel()
self.btn_search = QtWidgets.QPushButton("Search")
self.cmb_resolutions = QtWidgets.QComboBox()
self.le_directory = QtWidgets.QLineEdit("")
self.btn_download = QtWidgets.QPushButton("Download")
self.pgb_download = QtWidgets.QProgressBar()
central_widget = QtWidgets.QWidget()
self.setCentralWidget(central_widget)
lay = QtWidgets.QGridLayout(central_widget)
lay.addWidget(self.le_url, 0, 0)
lay.addWidget(self.btn_search, 0, 1)
lay.addWidget(self.cmb_resolutions, 1, 0)
lay.addWidget(self.le_directory, 1, 1)
lay.addWidget(self.btn_download, 1, 2)
lay.addWidget(self.pgb_download, 2, 0, 1, 3)
self.btn_download.setEnabled(False)
self._qpytube = None
self.btn_search.clicked.connect(self.handle_search_clicked)
self.btn_download.clicked.connect(self.handle_download_clicked)
def handle_search_clicked(self):
self.cmb_resolutions.clear()
self.btn_search.setEnabled(False)
self.btn_download.setEnabled(False)
self.lbl_error.clear()
self._qpytube = QPyTube(self.le_url.text())
self._qpytube.initialized.connect(self.handle_initialized)
self._qpytube.download_progress_changed.connect(self.pgb_download.setValue)
self._qpytube.download_started.connect(self.handle_download_started)
self._qpytube.download_finished.connect(self.handle_download_finished)
#QtCore.pyqtSlot(bool, str)
def handle_initialized(self, status, error=""):
if status:
self.cmb_resolutions.addItems(self._qpytube.resolutions)
self.btn_download.setEnabled(True)
else:
self.lbl_error.setText(error)
self.btn_search.setEnabled(True)
def handle_download_clicked(self):
self._qpytube.download(
self.cmb_resolutions.currentText(), self.le_directory.text()
)
self.btn_search.setEnabled(False)
self.btn_download.setEnabled(False)
self.le_directory.setEnabled(False)
def handle_download_started(self):
self.lbl_error.clear()
print("started")
def handle_download_finished(self):
self.pgb_download.setValue(100)
self.btn_search.setEnabled(True)
self.btn_download.setEnabled(True)
self.le_directory.setEnabled(True)
print("finished")
def main(args):
app = QtWidgets.QApplication(args)
w = MainWindow()
w.show()
app.exec_()
if __name__ == "__main__":
main(sys.argv)
I have made a Desktop Application using Python and used PyQt5 and Pytube which could download video from youtube. When download is in Progress, I want to show user an animation. In Fact I did it, but when the file is getting downloaded the PyQt window seems like freezing and everything just gets paused until the download is complete. So, Does anyone know why is this happening? How do I fix it?
Here's the code snippet:
def download_created(self, qual): # Used in 'selection' method
selected_stream = yt.streams.get_by_resolution(qual)
self.progress_func()
try:
self.download_btn.setCurrentIndex(-1)
selected_stream.download(self.askLocation() + "/")
except:
pass
# This gets the quality that the user chooses
def selection(self):
global quality
quality = self.download_btn.currentText()
try:
self.download_created(quality) # Calls a method called 'download'
except:
self.start_anime()
# Fetching the details about the Link from Youtube
def download_youtube(self):
global check
if check != self.get_input():
check = self.get_input()
self.download_btn.clear()
enter_url = self.get_input()
try:
global yt
yt = pytube.YouTube(
enter_url,
on_progress_callback = on_progress,
on_complete_callback = self.complete_func)
self.start_anime()
except:
self.input_error()
VIDEO_TITLE = (yt.title)
global VIDEO_ID
VIDEO_ID = (yt.video_id)
videos = yt.streams.filter(mime_type="video/mp4", progressive="True")
# Display all the available qualities
for i in videos:
self.download_btn.addItem(i.resolution)
self.download_btn.currentIndexChanged.connect(self.selection)
You have to execute the time consuming tasks in another thread, for example in your case the task of getting the streams and downloading.
import sys
import threading
from functools import cached_property
from PyQt5 import QtCore, QtWidgets
import pytube
class QPyTube(QtCore.QObject):
initialized = QtCore.pyqtSignal(bool, str)
download_started = QtCore.pyqtSignal()
download_progress_changed = QtCore.pyqtSignal(int)
download_finished = QtCore.pyqtSignal()
def __init__(self, url):
super().__init__()
self._url = url
self._yt = None
self._mutex = threading.Lock()
threading.Thread(target=self._init, daemon=True).start()
#property
def url(self):
return self._url
#cached_property
def resolutions(self):
return list()
def _init(self):
with self._mutex:
self.resolutions.clear()
try:
self._yt = pytube.YouTube(
self.url,
on_progress_callback=self._on_progress,
on_complete_callback=self._on_complete,
)
streams = self._yt.streams.filter(mime_type="video/mp4", progressive="True")
except Exception as e:
self.initialized.emit(False, str(e))
return
with self._mutex:
self.resolutions = [stream.resolution for stream in streams]
self.initialized.emit(True, "")
def download(self, resolution, directory):
threading.Thread(
target=self._download, args=(resolution, directory), daemon=True
).start()
def _download(self, resolution, directory):
stream = self._yt.streams.get_by_resolution(resolution)
self.download_started.emit()
stream.download(directory)
def _on_progress(self, stream, chunk, bytes_remaining):
self.download_progress_changed.emit(
100 * (stream.filesize - bytes_remaining) // stream.filesize
)
def _on_complete(self, stream, filepath):
self.download_finished.emit()
class MainWindow(QtWidgets.QMainWindow):
def __init__(self, parent=None):
super().__init__(parent)
self.le_url = QtWidgets.QLineEdit("http://youtube.com/watch?v=2lAe1cqCOXo")
self.lbl_error = QtWidgets.QLabel()
self.btn_search = QtWidgets.QPushButton("Search")
self.cmb_resolutions = QtWidgets.QComboBox()
self.le_directory = QtWidgets.QLineEdit("")
self.btn_download = QtWidgets.QPushButton("Download")
self.pgb_download = QtWidgets.QProgressBar()
central_widget = QtWidgets.QWidget()
self.setCentralWidget(central_widget)
lay = QtWidgets.QGridLayout(central_widget)
lay.addWidget(self.le_url, 0, 0)
lay.addWidget(self.btn_search, 0, 1)
lay.addWidget(self.cmb_resolutions, 1, 0)
lay.addWidget(self.le_directory, 1, 1)
lay.addWidget(self.btn_download, 1, 2)
lay.addWidget(self.pgb_download, 2, 0, 1, 3)
self.btn_download.setEnabled(False)
self._qpytube = None
self.btn_search.clicked.connect(self.handle_search_clicked)
self.btn_download.clicked.connect(self.handle_download_clicked)
def handle_search_clicked(self):
self.cmb_resolutions.clear()
self.btn_search.setEnabled(False)
self.btn_download.setEnabled(False)
self.lbl_error.clear()
self._qpytube = QPyTube(self.le_url.text())
self._qpytube.initialized.connect(self.handle_initialized)
self._qpytube.download_progress_changed.connect(self.pgb_download.setValue)
self._qpytube.download_started.connect(self.handle_download_started)
self._qpytube.download_finished.connect(self.handle_download_finished)
#QtCore.pyqtSlot(bool, str)
def handle_initialized(self, status, error=""):
if status:
self.cmb_resolutions.addItems(self._qpytube.resolutions)
self.btn_download.setEnabled(True)
else:
self.lbl_error.setText(error)
self.btn_search.setEnabled(True)
def handle_download_clicked(self):
self._qpytube.download(
self.cmb_resolutions.currentText(), self.le_directory.text()
)
self.btn_search.setEnabled(False)
self.btn_download.setEnabled(False)
self.le_directory.setEnabled(False)
def handle_download_started(self):
self.lbl_error.clear()
print("started")
def handle_download_finished(self):
self.pgb_download.setValue(100)
self.btn_search.setEnabled(True)
self.btn_download.setEnabled(True)
self.le_directory.setEnabled(True)
print("finished")
def main(args):
app = QtWidgets.QApplication(args)
w = MainWindow()
w.show()
app.exec_()
if __name__ == "__main__":
main(sys.argv)
I wrote a program to extract link pictures from webcomics, however, when I run it, it just extracts image links from the last link chapter, not all image links from all chapters. What is the issue with my program?
I have tried several ways but not things useful.
from PyQt5 import QtNetwork, QtCore
from requests_html import HTML
from functools import cached_property
from PyQt5.QtCore import QCoreApplication, QUrl
url1 = "https://saytruyen.net/truyen-su-tro-lai-cua-phap-su-hac-am-sau-66666-nam.html"
class Manager:
def __init__(self):
self.manager.finished.connect(self.handle_response)
#cached_property
def manager(self):
return QtNetwork.QNetworkAccessManager()
def start(self):
self.start_request(QtCore.QUrl(url1))
def start_request(self, url):
request = QtNetwork.QNetworkRequest(url)
self.manager.get(request)
def handle_response(self, reply):
err = reply.error()
if err == QtNetwork.QNetworkReply.NoError:
self.process(str(reply.readAll(), 'utf-8'))
else:
print("Error occured: ", err)
print(reply.errorString())
def process(self, data):
html = HTML(html=data)
rs = html.find("#list-chapter a", first=False)
for i in reversed(rs):
url2 = "https://saytruyen.net/" + i.attrs["href"]
#print(url2)
#self.start_request(QtCore.QUrl(url2))
req = QtNetwork.QNetworkRequest(QUrl(url2))
self.nam = QtNetwork.QNetworkAccessManager()
self.nam.finished.connect(self.handleResponse)
self.nam.get(req)
def handleResponse(self, reply):
er = reply.error()
if er == QtNetwork.QNetworkReply.NoError:
bytes_string = reply.readAll()
html2 = HTML(html = str(bytes_string, 'utf-8'))
rs_c = html2.find("#lst_content img")
for x in rs_c:
img ="https://saytruyen.net/" + x.attrs['src']
print(img)
else:
print("Error occured: ", er)
print(reply.errorString())
QCoreApplication.quit()
There are two problems:
the QNetworkAccessManager used for the download is being continuously recreated; since a network request is asynchronous, it isn't processed instantly, and it will be destroyed along with the network manager in the next cycle of the for loop since it's being overwritten; the result is that the previous request will be destroyed and only the last one will "survive";
the application is quit as soon as the first reply is received, preventing all other requests to be processed;
The solution is to create a single manager for the download process in the __init__, and quit as soon as all requests have been received.
class Manager:
def __init__(self):
self.manager.finished.connect(self.handle_response)
self.nam = QtNetwork.QNetworkAccessManager()
self.nam.finished.connect(self.handleResponse)
self.urls = set()
# ...
def process(self, data):
html = HTML(html=data)
rs = html.find("#list-chapter a", first=False)
for i in reversed(rs):
url2 = QUrl("https://saytruyen.net/" + i.attrs["href"])
if url2 in self.urls:
continue
self.urls.add(url2)
req = QtNetwork.QNetworkRequest(url2)
self.nam.get(req)
def handleResponse(self, reply):
self.urls.discard(reply.url())
er = reply.error()
if er == QtNetwork.QNetworkReply.NoError:
bytes_string = reply.readAll()
html2 = HTML(html = str(bytes_string, 'utf-8'))
rs_c = html2.find("#lst_content img")
for x in rs_c:
img ="https://saytruyen.net/" + x.attrs['src']
print(img)
else:
print("Error occured: ", er)
print(reply.errorString())
if not self.urls:
QCoreApplication.quit()
Note that it's usually enough (and better) to have a single network manager and properly handle responses based on queued requests, but for simple situations like this one having two managers doesn't represent a huge problem.
This question already has an answer here:
Scrape multiple urls using QWebPage
(1 answer)
Closed 4 years ago.
So, guys, i am still confuse why the iteration always stop at the third turn
Here is my code :
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
import sys
import numpy as np
from bs4 import BeautifulSoup as soup
class Client(QWebEnginePage):
def __init__(self,url):
global app
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.html = ""
self.loadFinished.connect(self.on_load_finished)
self.load(QUrl(url))
self.app.exec_()
def on_load_finished(self):
self.html = self.toHtml(self.Callable)
print("Load Finished")
def Callable(self,data):
self.html = data
self.app.quit()
linkgroup = []
linkgroup.append("https://docs.python.org/3/whatsnew/3.7.html")
linkgroup.append("https://docs.python.org/3/tutorial/index.html")
linkgroup.append("https://docs.python.org/3/installing/index.html")
linkgroup.append("https://docs.python.org/3/reference/index.html")
linkgroup.append("https://docs.python.org/3/using/index.html")
for h in range(0,len(linkgroup)):
#Setting Url
url = linkgroup[h]
print(url)
print("Loop Index : " + str(h))
client_response = Client(url)
The output is this
https://docs.python.org/3/whatsnew/3.7.html
Loop Index : 0
Load Finished
https://docs.python.org/3/tutorial/index.html
Loop Index : 1
Load Finished
https://docs.python.org/3/installing/index.html
Loop Index : 2
As you can see, the rest iterations of the loop doesn't get executed as it doesn't show the respond from the Client class
As commented by pschill above, you should have only one QApplication. How about passing it as a parameter to the constructor? Something like:
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
import sys
import numpy as np
from bs4 import BeautifulSoup as soup
class Client(QWebEnginePage):
def __init__(self,url,app):
self.app = app
QWebEnginePage.__init__(self)
self.html = ""
self.loadFinished.connect(self.on_load_finished)
self.load(QUrl(url))
self.app.exec_()
def on_load_finished(self):
self.html = self.toHtml(self.Callable)
print("Load Finished")
def Callable(self,data):
self.html = data
self.app.quit()
linkgroup = []
linkgroup.append("https://docs.python.org/3/whatsnew/3.7.html")
linkgroup.append("https://docs.python.org/3/tutorial/index.html")
linkgroup.append("https://docs.python.org/3/installing/index.html")
linkgroup.append("https://docs.python.org/3/reference/index.html")
linkgroup.append("https://docs.python.org/3/using/index.html")
app = QApplication(sys.argv)
for h in range(0,len(linkgroup)):
#Setting Url
url = linkgroup[h]
print(url)
print("Loop Index : " + str(h))
client_response = Client(url, app)
I have this code that receives all of the networking resources of a web page.
I took this code from this site so I don't know how it works but I know that it receives all of the networking resources of a web page, which is what I need.
This is my code:
import sys, time
from PySide.QtCore import QUrl, SIGNAL
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest
#reload(sys)
#sys.setdefaultencoding('utf-8')
fn_log = 'url_dd.txt'
fp_log = open(fn_log, 'ab+')
class WebPage(QWebPage):
def __init__(self, logger=None, parent=None):
super(WebPage, self).__init__(parent)
def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
sys.stderr.write('Javascritp error at line number %d\n' % (lineNumber))
sys.stderr.write('%s\n' % (message, ))
sys.stderr.write('Source ID: %s\n' % (sourceID, ))
class Crawler(QApplication):
def __init__(self, url):
super(Crawler, self).__init__(sys.argv)
self.url = url
self.web_view = QWebView()
self.web_page = WebPage()
self.web_view.setPage(self.web_page)
self.web_frame = self.web_page.mainFrame()
self.network = NetworkAccessManager()
self.web_page.setNetworkAccessManager(self.network)
self.settings = self.web_page.settings().globalSettings()
self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
QWebSettings.clearMemoryCaches()
self.web_view.resize(1024, 9000)
self.connect(self.web_page, SIGNAL('loadFinished(bool)'), self.loadFinished)
print('Before loading')
self.web_view.load(QUrl(self.url))
print('After loading')
def loadFinished(self, ok):
print('Start loadFinished()')
print('Start writing')
#with open('content_dd.txt', 'ab+') as fp:
#fp.write(self.web_frame.toHtml().toUtf8())
print('End writing')
print('End loadFinished()')
try:
self.quit()
except Exception as e:
print('FATAL ERROR: %s' % (str(e)))
class NetworkAccessManager(QNetworkAccessManager):
def __init__(self):
super(NetworkAccessManager, self).__init__()
# QNetworkAccessManager.__init__(self)
self.connect(self, SIGNAL('finished (QNetworkReply *)'), self.finishd)
def createRequest(self, operation, request, data):
# url = request.url().toString()
self.setNetworkAccessible(self.Accessible)
return QNetworkAccessManager.createRequest(self, operation, request, data)
def finishd(self, reply):
print('In NetworkAccessManager finishd')
url = str(reply.url().toString())
log = '%s: %s\n' % (time.ctime(), url)
#fp_log.write(log)
print(reply)
print(reply.request())
print(log)
print(url)
if __name__ == '__main__':
url = 'http://need4bit.com'
crawler = Crawler(url)
sys.exit(crawler.exec_())
How should I modify this code so it could save all the resources into a directory.