How to download a file using PySide/PyQt from a website - python

I have this code that receives all of the networking resources of a web page.
I took this code from this site so I don't know how it works but I know that it receives all of the networking resources of a web page, which is what I need.
This is my code:
import sys, time
from PySide.QtCore import QUrl, SIGNAL
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest
#reload(sys)
#sys.setdefaultencoding('utf-8')
# Name of the log file meant to receive one line per finished network reply.
fn_log = 'url_dd.txt'
# Opened once at import time in binary append mode and never closed here;
# the only write to it (in NetworkAccessManager.finishd) is commented out.
fp_log = open(fn_log, 'ab+')
class WebPage(QWebPage):
    """QWebPage that reports JavaScript console messages on stderr."""

    def __init__(self, logger=None, parent=None):
        """Create the page.

        ``logger`` is accepted but not used; ``parent`` is forwarded to
        ``QWebPage``.
        """
        super(WebPage, self).__init__(parent)

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        """Write the console message, its line number and source to stderr."""
        sys.stderr.write('Javascritp error at line number %d\n' % (lineNumber))
        sys.stderr.write('%s\n' % (message, ))
        sys.stderr.write('Source ID: %s\n' % (sourceID, ))
class Crawler(QApplication):
    """Application that loads a single URL in a QWebView and then quits."""

    def __init__(self, url):
        """Set up the view, page and network manager and start loading ``url``."""
        super(Crawler, self).__init__(sys.argv)
        self.url = url
        self.web_view = QWebView()
        self.web_page = WebPage()
        self.web_view.setPage(self.web_page)
        self.web_frame = self.web_page.mainFrame()
        # Route every request of the page through our own manager so that
        # each finished reply can be observed.
        self.network = NetworkAccessManager()
        self.web_page.setNetworkAccessManager(self.network)
        self.settings = self.web_page.settings().globalSettings()
        self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
        QWebSettings.clearMemoryCaches()
        self.web_view.resize(1024, 9000)
        # New-style connection: no hand-written signature string to get wrong.
        self.web_page.loadFinished.connect(self.loadFinished)
        print('Before loading')
        self.web_view.load(QUrl(self.url))
        print('After loading')

    def loadFinished(self, ok):
        """Print progress markers and quit the application after the load."""
        print('Start loadFinished()')
        print('Start writing')
        #with open('content_dd.txt', 'ab+') as fp:
            #fp.write(self.web_frame.toHtml().toUtf8())
        print('End writing')
        print('End loadFinished()')
        try:
            self.quit()
        except Exception as e:
            print('FATAL ERROR: %s' % (str(e)))
class NetworkAccessManager(QNetworkAccessManager):
    """Network manager that prints every finished reply and its URL."""

    def __init__(self):
        super(NetworkAccessManager, self).__init__()
        # QNetworkAccessManager.__init__(self)
        # New-style connection replaces the string signature
        # 'finished (QNetworkReply *)', which contained stray spaces.
        self.finished.connect(self.finishd)

    def createRequest(self, operation, request, data):
        """Mark the network as accessible, then defer to the base class."""
        # url = request.url().toString()
        self.setNetworkAccessible(self.Accessible)
        return QNetworkAccessManager.createRequest(self, operation, request, data)

    def finishd(self, reply):
        """Print the reply, its request and a timestamped line with its URL."""
        print('In NetworkAccessManager finishd')
        url = str(reply.url().toString())
        log = '%s: %s\n' % (time.ctime(), url)
        #fp_log.write(log)
        print(reply)
        print(reply.request())
        print(log)
        print(url)
if __name__ == '__main__':
    # Load one page; Crawler quits the event loop itself when loading ends.
    url = 'http://need4bit.com'
    crawler = Crawler(url)
    sys.exit(crawler.exec_())
How should I modify this code so that it saves all of the resources into a directory?

Related

GUI not responding while downloading video PyQt5 , PyTube [duplicate]

I have made a desktop application with Python, PyQt5 and pytube that downloads videos from YouTube. While a download is in progress, I want to show the user an animation. I did implement it, but while the file is being downloaded the PyQt window freezes and everything is paused until the download is complete. Does anyone know why this is happening, and how do I fix it?
Here's the code snippet:
def download_created(self, qual):  # Used in 'selection' method
    """Download the stream whose resolution is ``qual`` to a chosen folder.

    NOTE: the download call runs synchronously in the caller, so the GUI
    cannot process events until it returns.
    """
    selected_stream = yt.streams.get_by_resolution(qual)
    self.progress_func()
    try:
        self.download_btn.setCurrentIndex(-1)
        selected_stream.download(self.askLocation() + "/")
    except Exception:
        # Best-effort: a failed download is deliberately ignored here.
        pass
# This gets the quality that the user chooses
def selection(self):
    """Store the chosen quality globally and start the download for it."""
    global quality
    quality = self.download_btn.currentText()
    try:
        self.download_created(quality)  # Calls a method called 'download'
    except Exception:
        self.start_anime()
# Fetching the details about the Link from Youtube
def download_youtube(self):
    """Look up the entered URL and list its progressive MP4 resolutions.

    Nothing happens when the input equals the previously checked one.
    """
    global check
    if check != self.get_input():
        check = self.get_input()
        self.download_btn.clear()
        enter_url = self.get_input()
        try:
            global yt
            yt = pytube.YouTube(
                enter_url,
                on_progress_callback = on_progress,
                on_complete_callback = self.complete_func)
            self.start_anime()
        except Exception:
            self.input_error()
            # Without a valid ``yt`` the lookups below cannot succeed.
            return
        VIDEO_TITLE = (yt.title)
        global VIDEO_ID
        VIDEO_ID = (yt.video_id)
        videos = yt.streams.filter(mime_type="video/mp4", progressive="True")
        # Display all the available qualities
        for i in videos:
            self.download_btn.addItem(i.resolution)
        # NOTE(review): this connects again on every successful lookup, so
        # ``selection`` may end up connected several times; confirm intent.
        self.download_btn.currentIndexChanged.connect(self.selection)
You have to execute the time consuming tasks in another thread, for example in your case the task of getting the streams and downloading.
import sys
import threading
from functools import cached_property
from PyQt5 import QtCore, QtWidgets
import pytube
class QPyTube(QtCore.QObject):
    """Run pytube lookups and downloads in worker threads.

    Results are reported through Qt signals so that the GUI code never has
    to wait on pytube directly.
    """

    # Emitted after the lookup: (success, error message).
    initialized = QtCore.pyqtSignal(bool, str)
    download_started = QtCore.pyqtSignal()
    # Download progress as an integer percentage.
    download_progress_changed = QtCore.pyqtSignal(int)
    download_finished = QtCore.pyqtSignal()

    def __init__(self, url):
        """Remember ``url`` and start the stream lookup in a daemon thread."""
        super().__init__()
        self._url = url
        self._yt = None
        self._mutex = threading.Lock()
        threading.Thread(target=self._init, daemon=True).start()

    @property
    def url(self):
        """The URL this object was created for."""
        return self._url

    @cached_property
    def resolutions(self):
        """Resolutions found by the lookup; an empty list until it finishes."""
        return list()

    def _init(self):
        """Worker: create the pytube object and collect the resolutions."""
        with self._mutex:
            self.resolutions.clear()
        try:
            self._yt = pytube.YouTube(
                self.url,
                on_progress_callback=self._on_progress,
                on_complete_callback=self._on_complete,
            )
            streams = self._yt.streams.filter(mime_type="video/mp4", progressive="True")
        except Exception as e:
            self.initialized.emit(False, str(e))
            return
        with self._mutex:
            # Assigning stores the list on the instance, which takes
            # precedence over the cached_property default.
            self.resolutions = [stream.resolution for stream in streams]
        self.initialized.emit(True, "")

    def download(self, resolution, directory):
        """Start downloading ``resolution`` into ``directory`` in a thread."""
        threading.Thread(
            target=self._download, args=(resolution, directory), daemon=True
        ).start()

    def _download(self, resolution, directory):
        """Worker: pick the stream, announce the start and download it."""
        stream = self._yt.streams.get_by_resolution(resolution)
        self.download_started.emit()
        stream.download(directory)

    def _on_progress(self, stream, chunk, bytes_remaining):
        """pytube progress callback: emit the completed percentage."""
        self.download_progress_changed.emit(
            100 * (stream.filesize - bytes_remaining) // stream.filesize
        )

    def _on_complete(self, stream, filepath):
        """pytube completion callback: emit ``download_finished``."""
        self.download_finished.emit()
class MainWindow(QtWidgets.QMainWindow):
    """Window with URL search, resolution choice, download and progress."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self.le_url = QtWidgets.QLineEdit("http://youtube.com/watch?v=2lAe1cqCOXo")
        self.lbl_error = QtWidgets.QLabel()
        self.btn_search = QtWidgets.QPushButton("Search")
        self.cmb_resolutions = QtWidgets.QComboBox()
        self.le_directory = QtWidgets.QLineEdit("")
        self.btn_download = QtWidgets.QPushButton("Download")
        self.pgb_download = QtWidgets.QProgressBar()
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)
        lay = QtWidgets.QGridLayout(central_widget)
        lay.addWidget(self.le_url, 0, 0)
        lay.addWidget(self.btn_search, 0, 1)
        lay.addWidget(self.cmb_resolutions, 1, 0)
        lay.addWidget(self.le_directory, 1, 1)
        lay.addWidget(self.btn_download, 1, 2)
        lay.addWidget(self.pgb_download, 2, 0, 1, 3)
        # The error label has to be in the layout, otherwise the text set in
        # handle_initialized() is never shown.
        lay.addWidget(self.lbl_error, 3, 0, 1, 3)
        self.btn_download.setEnabled(False)
        self._qpytube = None
        self.btn_search.clicked.connect(self.handle_search_clicked)
        self.btn_download.clicked.connect(self.handle_download_clicked)

    def handle_search_clicked(self):
        """Reset the UI and start a new lookup for the entered URL."""
        self.cmb_resolutions.clear()
        self.btn_search.setEnabled(False)
        self.btn_download.setEnabled(False)
        self.lbl_error.clear()
        self._qpytube = QPyTube(self.le_url.text())
        self._qpytube.initialized.connect(self.handle_initialized)
        self._qpytube.download_progress_changed.connect(self.pgb_download.setValue)
        self._qpytube.download_started.connect(self.handle_download_started)
        self._qpytube.download_finished.connect(self.handle_download_finished)

    @QtCore.pyqtSlot(bool, str)
    def handle_initialized(self, status, error=""):
        """Show the resolutions on success, the error text on failure."""
        if status:
            self.cmb_resolutions.addItems(self._qpytube.resolutions)
            self.btn_download.setEnabled(True)
        else:
            self.lbl_error.setText(error)
        # Searching is possible again whether or not the lookup succeeded.
        self.btn_search.setEnabled(True)

    def handle_download_clicked(self):
        """Start the download and lock the controls while it runs."""
        self._qpytube.download(
            self.cmb_resolutions.currentText(), self.le_directory.text()
        )
        self.btn_search.setEnabled(False)
        self.btn_download.setEnabled(False)
        self.le_directory.setEnabled(False)

    def handle_download_started(self):
        """Clear any old error text when a download begins."""
        self.lbl_error.clear()
        print("started")

    def handle_download_finished(self):
        """Fill the progress bar and unlock the controls."""
        self.pgb_download.setValue(100)
        self.btn_search.setEnabled(True)
        self.btn_download.setEnabled(True)
        self.le_directory.setEnabled(True)
        print("finished")
def main(args):
    """Create the application, show the main window and run the event loop."""
    app = QtWidgets.QApplication(args)
    w = MainWindow()
    w.show()
    app.exec_()


if __name__ == "__main__":
    main(sys.argv)

show the download percentage progressbar in python [duplicate]

I have made a desktop application with Python, PyQt5 and pytube that downloads videos from YouTube. While a download is in progress, I want to show the user an animation. I did implement it, but while the file is being downloaded the PyQt window freezes and everything is paused until the download is complete. Does anyone know why this is happening, and how do I fix it?
Here's the code snippet:
def download_created(self, qual):  # Used in 'selection' method
    """Download the stream whose resolution is ``qual`` to a chosen folder.

    NOTE: the download call runs synchronously in the caller, so the GUI
    cannot process events until it returns.
    """
    selected_stream = yt.streams.get_by_resolution(qual)
    self.progress_func()
    try:
        self.download_btn.setCurrentIndex(-1)
        selected_stream.download(self.askLocation() + "/")
    except Exception:
        # Best-effort: a failed download is deliberately ignored here.
        pass
# This gets the quality that the user chooses
def selection(self):
    """Store the chosen quality globally and start the download for it."""
    global quality
    quality = self.download_btn.currentText()
    try:
        self.download_created(quality)  # Calls a method called 'download'
    except Exception:
        self.start_anime()
# Fetching the details about the Link from Youtube
def download_youtube(self):
    """Look up the entered URL and list its progressive MP4 resolutions.

    Nothing happens when the input equals the previously checked one.
    """
    global check
    if check != self.get_input():
        check = self.get_input()
        self.download_btn.clear()
        enter_url = self.get_input()
        try:
            global yt
            yt = pytube.YouTube(
                enter_url,
                on_progress_callback = on_progress,
                on_complete_callback = self.complete_func)
            self.start_anime()
        except Exception:
            self.input_error()
            # Without a valid ``yt`` the lookups below cannot succeed.
            return
        VIDEO_TITLE = (yt.title)
        global VIDEO_ID
        VIDEO_ID = (yt.video_id)
        videos = yt.streams.filter(mime_type="video/mp4", progressive="True")
        # Display all the available qualities
        for i in videos:
            self.download_btn.addItem(i.resolution)
        # NOTE(review): this connects again on every successful lookup, so
        # ``selection`` may end up connected several times; confirm intent.
        self.download_btn.currentIndexChanged.connect(self.selection)
You have to execute the time consuming tasks in another thread, for example in your case the task of getting the streams and downloading.
import sys
import threading
from functools import cached_property
from PyQt5 import QtCore, QtWidgets
import pytube
class QPyTube(QtCore.QObject):
    """Run pytube lookups and downloads in worker threads.

    Results are reported through Qt signals so that the GUI code never has
    to wait on pytube directly.
    """

    # Emitted after the lookup: (success, error message).
    initialized = QtCore.pyqtSignal(bool, str)
    download_started = QtCore.pyqtSignal()
    # Download progress as an integer percentage.
    download_progress_changed = QtCore.pyqtSignal(int)
    download_finished = QtCore.pyqtSignal()

    def __init__(self, url):
        """Remember ``url`` and start the stream lookup in a daemon thread."""
        super().__init__()
        self._url = url
        self._yt = None
        self._mutex = threading.Lock()
        threading.Thread(target=self._init, daemon=True).start()

    @property
    def url(self):
        """The URL this object was created for."""
        return self._url

    @cached_property
    def resolutions(self):
        """Resolutions found by the lookup; an empty list until it finishes."""
        return list()

    def _init(self):
        """Worker: create the pytube object and collect the resolutions."""
        with self._mutex:
            self.resolutions.clear()
        try:
            self._yt = pytube.YouTube(
                self.url,
                on_progress_callback=self._on_progress,
                on_complete_callback=self._on_complete,
            )
            streams = self._yt.streams.filter(mime_type="video/mp4", progressive="True")
        except Exception as e:
            self.initialized.emit(False, str(e))
            return
        with self._mutex:
            # Assigning stores the list on the instance, which takes
            # precedence over the cached_property default.
            self.resolutions = [stream.resolution for stream in streams]
        self.initialized.emit(True, "")

    def download(self, resolution, directory):
        """Start downloading ``resolution`` into ``directory`` in a thread."""
        threading.Thread(
            target=self._download, args=(resolution, directory), daemon=True
        ).start()

    def _download(self, resolution, directory):
        """Worker: pick the stream, announce the start and download it."""
        stream = self._yt.streams.get_by_resolution(resolution)
        self.download_started.emit()
        stream.download(directory)

    def _on_progress(self, stream, chunk, bytes_remaining):
        """pytube progress callback: emit the completed percentage."""
        self.download_progress_changed.emit(
            100 * (stream.filesize - bytes_remaining) // stream.filesize
        )

    def _on_complete(self, stream, filepath):
        """pytube completion callback: emit ``download_finished``."""
        self.download_finished.emit()
class MainWindow(QtWidgets.QMainWindow):
    """Window with URL search, resolution choice, download and progress."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self.le_url = QtWidgets.QLineEdit("http://youtube.com/watch?v=2lAe1cqCOXo")
        self.lbl_error = QtWidgets.QLabel()
        self.btn_search = QtWidgets.QPushButton("Search")
        self.cmb_resolutions = QtWidgets.QComboBox()
        self.le_directory = QtWidgets.QLineEdit("")
        self.btn_download = QtWidgets.QPushButton("Download")
        self.pgb_download = QtWidgets.QProgressBar()
        central_widget = QtWidgets.QWidget()
        self.setCentralWidget(central_widget)
        lay = QtWidgets.QGridLayout(central_widget)
        lay.addWidget(self.le_url, 0, 0)
        lay.addWidget(self.btn_search, 0, 1)
        lay.addWidget(self.cmb_resolutions, 1, 0)
        lay.addWidget(self.le_directory, 1, 1)
        lay.addWidget(self.btn_download, 1, 2)
        lay.addWidget(self.pgb_download, 2, 0, 1, 3)
        # The error label has to be in the layout, otherwise the text set in
        # handle_initialized() is never shown.
        lay.addWidget(self.lbl_error, 3, 0, 1, 3)
        self.btn_download.setEnabled(False)
        self._qpytube = None
        self.btn_search.clicked.connect(self.handle_search_clicked)
        self.btn_download.clicked.connect(self.handle_download_clicked)

    def handle_search_clicked(self):
        """Reset the UI and start a new lookup for the entered URL."""
        self.cmb_resolutions.clear()
        self.btn_search.setEnabled(False)
        self.btn_download.setEnabled(False)
        self.lbl_error.clear()
        self._qpytube = QPyTube(self.le_url.text())
        self._qpytube.initialized.connect(self.handle_initialized)
        self._qpytube.download_progress_changed.connect(self.pgb_download.setValue)
        self._qpytube.download_started.connect(self.handle_download_started)
        self._qpytube.download_finished.connect(self.handle_download_finished)

    @QtCore.pyqtSlot(bool, str)
    def handle_initialized(self, status, error=""):
        """Show the resolutions on success, the error text on failure."""
        if status:
            self.cmb_resolutions.addItems(self._qpytube.resolutions)
            self.btn_download.setEnabled(True)
        else:
            self.lbl_error.setText(error)
        # Searching is possible again whether or not the lookup succeeded.
        self.btn_search.setEnabled(True)

    def handle_download_clicked(self):
        """Start the download and lock the controls while it runs."""
        self._qpytube.download(
            self.cmb_resolutions.currentText(), self.le_directory.text()
        )
        self.btn_search.setEnabled(False)
        self.btn_download.setEnabled(False)
        self.le_directory.setEnabled(False)

    def handle_download_started(self):
        """Clear any old error text when a download begins."""
        self.lbl_error.clear()
        print("started")

    def handle_download_finished(self):
        """Fill the progress bar and unlock the controls."""
        self.pgb_download.setValue(100)
        self.btn_search.setEnabled(True)
        self.btn_download.setEnabled(True)
        self.le_directory.setEnabled(True)
        print("finished")
def main(args):
    """Create the application, show the main window and run the event loop."""
    app = QtWidgets.QApplication(args)
    w = MainWindow()
    w.show()
    app.exec_()


if __name__ == "__main__":
    main(sys.argv)

Why am I just taking the picture url from the last url?

I wrote a program to extract picture links from webcomics. However, when I run it, it only extracts the image links from the last chapter link, not the image links from all chapters. What is the issue with my program?
I have tried several approaches, but none of them helped.
from PyQt5 import QtNetwork, QtCore
from requests_html import HTML
from functools import cached_property
from PyQt5.QtCore import QCoreApplication, QUrl
url1 = "https://saytruyen.net/truyen-su-tro-lai-cua-phap-su-hac-am-sau-66666-nam.html"
class Manager:
    """Fetch the chapter list page, then each chapter, and print image URLs."""

    def __init__(self):
        self.manager.finished.connect(self.handle_response)

    @cached_property
    def manager(self):
        """Network manager used for the chapter list request."""
        return QtNetwork.QNetworkAccessManager()

    def start(self):
        """Request the page that lists the chapters."""
        self.start_request(QtCore.QUrl(url1))

    def start_request(self, url):
        """Issue a GET for ``url`` on the list-page manager."""
        request = QtNetwork.QNetworkRequest(url)
        self.manager.get(request)

    def handle_response(self, reply):
        """Process the chapter list on success, print the error otherwise."""
        err = reply.error()
        if err == QtNetwork.QNetworkReply.NoError:
            self.process(str(reply.readAll(), 'utf-8'))
        else:
            print("Error occured: ", err)
            print(reply.errorString())

    def process(self, data):
        """Request every chapter linked from the list page."""
        html = HTML(html=data)
        rs = html.find("#list-chapter a", first=False)
        for i in reversed(rs):
            url2 = "https://saytruyen.net/" + i.attrs["href"]
            #print(url2)
            #self.start_request(QtCore.QUrl(url2))
            req = QtNetwork.QNetworkRequest(QUrl(url2))
            # NOTE: a new manager is bound to self.nam on every iteration,
            # replacing the one that issued the previous request.
            self.nam = QtNetwork.QNetworkAccessManager()
            self.nam.finished.connect(self.handleResponse)
            self.nam.get(req)

    def handleResponse(self, reply):
        """Print the image URLs of one chapter, then quit the application."""
        er = reply.error()
        if er == QtNetwork.QNetworkReply.NoError:
            bytes_string = reply.readAll()
            html2 = HTML(html = str(bytes_string, 'utf-8'))
            rs_c = html2.find("#lst_content img")
            for x in rs_c:
                img ="https://saytruyen.net/" + x.attrs['src']
                print(img)
        else:
            print("Error occured: ", er)
            print(reply.errorString())
        # NOTE: runs after every handled reply, including the first one.
        QCoreApplication.quit()
There are two problems:
the QNetworkAccessManager used for the download is being continuously recreated; since a network request is asynchronous, it isn't processed instantly, and it will be destroyed along with the network manager in the next cycle of the for loop since it's being overwritten; the result is that the previous request will be destroyed and only the last one will "survive";
the application is quit as soon as the first reply is received, preventing all other requests to be processed;
The solution is to create a single manager for the download process in the __init__, and quit as soon as all requests have been received.
class Manager:
    def __init__(self):
        self.manager.finished.connect(self.handle_response)
        # A single long-lived manager serves all chapter requests.
        self.nam = QtNetwork.QNetworkAccessManager()
        self.nam.finished.connect(self.handleResponse)
        # Chapter URLs that were requested and have not been answered yet.
        self.urls = set()

    # ...

    def process(self, data):
        """Request every chapter linked from the list page exactly once."""
        html = HTML(html=data)
        rs = html.find("#list-chapter a", first=False)
        for i in reversed(rs):
            url2 = QUrl("https://saytruyen.net/" + i.attrs["href"])
            if url2 in self.urls:
                continue
            self.urls.add(url2)
            req = QtNetwork.QNetworkRequest(url2)
            self.nam.get(req)
        if not self.urls:
            # No chapter links were found, so no reply will ever trigger
            # the quit in handleResponse().
            QCoreApplication.quit()

    def handleResponse(self, reply):
        """Print the image URLs of one chapter; quit after the last reply."""
        self.urls.discard(reply.url())
        er = reply.error()
        if er == QtNetwork.QNetworkReply.NoError:
            bytes_string = reply.readAll()
            html2 = HTML(html = str(bytes_string, 'utf-8'))
            rs_c = html2.find("#lst_content img")
            for x in rs_c:
                img ="https://saytruyen.net/" + x.attrs['src']
                print(img)
        else:
            print("Error occured: ", er)
            print(reply.errorString())
        # Finished replies are not deleted by the manager; schedule it here.
        reply.deleteLater()
        if not self.urls:
            QCoreApplication.quit()
Note that it's usually enough (and better) to have a single network manager and properly handle responses based on queued requests, but for simple situations like this one having two managers doesn't represent a huge problem.

Python WebScraping closes without finishing and without giving error

I'm writing a simple web scraper that downloads the item images for some champions from a site. I loop over 5 characters with a "for", but it only processes 2 of them and then closes without giving any error!
import bs4 as bs
import sys,os
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
    """Load ``url`` and keep the resulting HTML in ``self.html``.

    The constructor runs an event loop and returns only after the HTML has
    been delivered to ``Callable``.
    """

    def __init__(self, url):
        # NOTE: every instance creates its own QApplication.
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        print("#1 __init__")
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        """Ask for the page HTML; it is delivered later to ``Callable``."""
        # toHtml() returns nothing useful, so its result is not stored in
        # self.html; the HTML arrives through the callback.
        self.toHtml(self.Callable)
        print('#2 On Load finished')

    def Callable(self, html_str):
        """Store the HTML and leave the event loop."""
        print("#3 Callable\n")
        self.html = html_str
        self.app.quit()
def already_exist(image_name):
    """Return True when ``image_name`` still has to be downloaded.

    Despite its name, the function returns ``False`` when a file called
    ``image_name`` is found under the ``Images`` folder and ``True``
    otherwise, including when the folder is missing or empty. The inverted
    result is kept because the caller downloads only on a true value.
    """
    return not any(image_name in files for _, _, files in os.walk('Images'))
def ImageDownload(url):
    """Download ``url`` into the ``Images`` folder unless it is already there.

    The file name is the last ``/``-separated segment of ``url``. Failures
    are printed instead of being raised.
    """
    file_name = url.split("/")[-1]
    try:
        # NOTE: already_exist() returns a true value when the file is missing.
        if already_exist(file_name):
            # urlretrieve() does not create the target folder itself.
            os.makedirs("Images", exist_ok=True)
            full_path = "Images/" + file_name
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % file_name)
        else:
            print("Image already Downloaded >: %s" % file_name)
    except Exception as exc:
        print("Error Download: %s" % exc)
def main():
    """Load each champion page and download the images found in it."""
    champions = ['Amumu','Akali','Zed','Nunu'] #champions
    for champ in champions:
        try:
            print("\nDownloading Images >: %s"% champ)
            data = Page('https://www.probuilds.net/champions/details/%s' % champ.strip())
            soup = bs.BeautifulSoup(data.html, 'html.parser')
            items = soup.find_all('div',{'class':'items'})
            for photos in items:
                images = photos.find_all('img')
                for image in images:
                    ImageDownload(image['src'])
        except Exception:
            # Keep going with the next champion when one page fails.
            print("Shi...")
main()
I'm getting no error, but the program only runs the loop twice; that is the problem. Can someone help me?
It seems that the QWebEnginePage is not closed correctly. It is also advisable to reuse a single QWebEnginePage instead of creating a new one each time, so, using an older answer as a basis, I have implemented the following solution:
import os
import sys
import bs4 as bs
import urllib.request
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Page that loads a sequence of URLs one after another.

    After each load the HTML is passed to ``process``; the application is
    quit once the sequence is exhausted.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Begin loading ``urls`` in order."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next URL; return False when none is left."""
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self, html):
        """Handle the HTML of the current page, then continue or quit."""
        self.process(self.url(), html)
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Request the HTML of the loaded page via a callback."""
        self.toHtml(self.processCurrentPage)

    def process(self, url, html):
        """Default handler: print the size and URL of the loaded page."""
        print('loaded: [%d chars] %s' % (len(html), url.toString()))
class ScrapePage(WebPage):
    """WebPage that collects image URLs from ``div.items`` blocks."""

    def __init__(self):
        super(ScrapePage, self).__init__()
        # Distinct image URLs gathered from all processed pages.
        self.results = set()

    def process(self, url, html):
        """Add the ``src`` of every image inside a ``div.items`` to results."""
        soup = bs.BeautifulSoup(html, 'html.parser')
        items = soup.find_all('div',{'class':'items'})
        for photos in items:
            for image in photos.find_all('img'):
                src = image.get('src')
                # An <img> without a src attribute has nothing to download.
                if src:
                    self.results.add(src)
def already_exist(image_name):
    """Return True when ``image_name`` still has to be downloaded.

    Despite its name, the function returns ``False`` when a file called
    ``image_name`` is found under the ``Images`` folder and ``True``
    otherwise, including when the folder is missing or empty. The inverted
    result is kept because the caller downloads only on a true value.
    """
    return not any(image_name in files for _, _, files in os.walk('Images'))
def ImageDownload(url):
    """Download ``url`` into the ``Images`` folder unless it is already there.

    The file name is the last ``/``-separated segment of ``url``. Failures
    are printed instead of being raised.
    """
    file_name = url.split("/")[-1]
    try:
        # NOTE: already_exist() returns a true value when the file is missing.
        if already_exist(file_name):
            # urlretrieve() does not create the target folder itself.
            os.makedirs("Images", exist_ok=True)
            full_path = "Images/" + file_name
            urllib.request.urlretrieve(url, full_path)
            print("Download %s" % file_name)
        else:
            print("Image already Downloaded >: %s" % file_name)
    except Exception as exc:
        print("Error Download: %s" % exc)
if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    webpage = ScrapePage()
    champions = ['Amumu','Akali','Zed','Nunu']
    base_url = 'https://www.probuilds.net/champions/details/'
    # Resolve each champion name against the base URL.
    urls = [
        QtCore.QUrl(base_url).resolved(QtCore.QUrl(champ))
        for champ in champions
    ]
    webpage.start(urls)
    # Returns once the page has processed the last URL and quit the loop.
    app.exec_()
    for url in webpage.results:
        ImageDownload(url)

python3.4 Pyqt4 web request asyncio

Is it possible to perform web requests asynchronously (as with asyncio) under PyQt4 (QWebPage)?
For example, how can I call multiple urls in parallel with this code:
#!/usr/bin/env python3.4
import sys
import signal
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import QWebPage
class Crawler(QWebPage):
    """Load one URL, print its HTML and exit the process."""

    def __init__(self, url):
        QWebPage.__init__(self)
        self._url = url
        self.content = ''

    def crawl(self):
        """Start loading the URL; the result arrives in ``_finished_loading``."""
        # Restore the default handler so that Ctrl-C ends the program.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.loadFinished.connect(self._finished_loading)
        self.mainFrame().load(QUrl(self._url))

    def _finished_loading(self, result):
        """Store and print the page HTML, then exit."""
        self.content = self.mainFrame().toHtml()
        print(self.content)
        sys.exit(0)


def main():
    """Create the application first, then the crawler, and run the loop."""
    app = QApplication(sys.argv)
    crawler = Crawler('http://www.example.com')
    crawler.crawl()
    sys.exit(app.exec_())


if __name__ == '__main__':
    main()
Thanks
You cannot make self.mainFrame().load(QUrl(self._url)) work through asyncio, sorry -- that method is implemented in Qt itself.
But you can install the quamash event loop and asynchronously call the aiohttp.request coroutine to get web pages.
That approach doesn't work with QWebPage, though.
Requests are already done asynchronously, so all you need to do is create multiple instances of QWebPage.
Here's a simple demo based on your example script:
import sys, signal
from PyQt4 import QtCore, QtGui, QtWebKit
# Qt 4.8 documentation pages fetched by the demo, built from a shared prefix.
_DOC_ROOT = 'http://qt-project.org/doc/qt-4.8/'
urls = list(
    _DOC_ROOT + page_name
    for page_name in (
        'qwebelement.html',
        'qwebframe.html',
        'qwebinspector.html',
        'qwebpage.html',
        'qwebsettings.html',
        'qwebview.html',
    )
)
class Crawler(QtWebKit.QWebPage):
    """Load one URL and print the start of its HTML when it finishes."""

    def __init__(self, url, identifier):
        super(Crawler, self).__init__()
        self.loadFinished.connect(self._finished_loading)
        self._id = identifier
        self._url = url
        self.content = ''

    def crawl(self):
        """Start loading the URL in the page's main frame."""
        self.mainFrame().load(QtCore.QUrl(self._url))

    def _finished_loading(self, result):
        """Print the identifier, the URL and the first 250 characters."""
        self.content = self.mainFrame().toHtml()
        print('[%d] %s' % (self._id, self._url))
        print(self.content[:250].rstrip(), '...')
        print()
        # The page is no longer needed once its content has been printed.
        self.deleteLater()
if __name__ == '__main__':
    app = QtGui.QApplication( sys.argv )
    # Restore the default handler so that Ctrl-C ends the program.
    signal.signal( signal.SIGINT, signal.SIG_DFL)
    # Keep a reference to every crawler so none is garbage-collected
    # while its page is still loading.
    crawlers = []
    for index, url in enumerate(urls):
        crawlers.append(Crawler(url, index))
        crawlers[-1].crawl()
    sys.exit( app.exec_() )

Categories

Resources