Update tqdm bar - python

I'm creating a small script to download the audio from YouTube videos.
Every time a sound is downloaded, I use a tqdm bar to display download info.
The first download works fine, but the second time the bar is completely broken :(. I really don't know what's happening with it...
(I think the bar doesn't update correctly.)
Here's the code that handles the bar and downloads the sound.
Thanks for your time :)
def DownloadAudioFromUrl(url):
    print("Getting the URL...")
    vid = pafy.new(url)
    print("Done")
    print("Getting best quality...")
    stream = vid.getbestaudio()
    fileSize = stream.get_filesize()
    print("Done")
    print("Downloading: " + vid.title + " ...")
    with tqdm.tqdm(total=fileSize, unit_scale=True, unit='B', initial=0) as pbar:
        stream.download("Download/", quiet=True, callback=lambda _, received, *args: UpdateBar(pbar, received))
    print("Done")
    ResetProgressBar(pbar)
    WebmToMp3()

def ResetProgressBar(bar):
    bar.reset()
    bar.clear()
    bar.close()
    # I used these the last time I tried; I don't understand how they work :/

def UpdateBar(pbar, current_received):
    global previous_received
    diff = current_received - previous_received
    pbar.update(diff)
    previous_received = current_received
So I tried to update the bar with "reset", "clear" and "close", but it changed nothing.
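For illustration only, here is a minimal sketch of one way to avoid the stale global counter: keep the byte count on a per-download callback object, so nothing carries over into the next run. It assumes pafy passes (total, received, ...) to the callback, as the lambda above already does; names like DownloadProgress are made up for the example.

import pafy
import tqdm

class DownloadProgress:
    """Per-download callback: keeps its own byte counter, so state never leaks into the next download."""
    def __init__(self, pbar):
        self.pbar = pbar
        self.previous_received = 0

    def __call__(self, total, received, *args):
        # tqdm.update() takes the increment, not the absolute value
        self.pbar.update(received - self.previous_received)
        self.previous_received = received

def DownloadAudioFromUrl(url):
    vid = pafy.new(url)
    stream = vid.getbestaudio()
    with tqdm.tqdm(total=stream.get_filesize(), unit='B', unit_scale=True) as pbar:
        stream.download("Download/", quiet=True, callback=DownloadProgress(pbar))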

Related

Python tqdm download progress with playwright

I have the following code to download something (in this example, a video).
from playwright.sync_api import sync_playwright
import func
import os, time, shutil
import requests
from tqdm.auto import tqdm

def download():
    with page.expect_download() as download_info:
        page.click("text=720p (MP4")
    download = download_info.value
    with requests.get(download.url, stream=True) as r:
        # check header to get content length, in bytes
        total_length = int(r.headers.get("Content-Length"))
        # implement progress bar via tqdm
        with tqdm.wrapattr(r.raw, "read", total=total_length, desc="") as raw:
            # save the output to a file
            with open(f"{os.path.basename(r.url)}", 'wb') as output:
                shutil.copyfileobj(raw, output)
    download.save_as(os.path.join(func.report_folder_path, download.suggested_filename))

# initialize navigation
with sync_playwright() as p:
    browser = p.chromium.launch(channel="msedge", headless=False)
    context = browser.new_context(accept_downloads=True)
    page = context.new_page()
    # go to canon s21 login
    print("Entering on download page")
    page.goto('https://www.jw.org/en/library/videos/#en/mediaitems/StudioFeatured/docid-702023003_1_VIDEO')
    page.wait_for_selector("text=Download")
    page.locator("text=Download").first.click()
    download()
    time.sleep(3)
    print('end')
I'm trying to implement a file download progress bar that runs while Playwright performs the download. This code downloads the file into the directory correctly, and although it does not raise a timeout error, the download function only returns after the default 30-second timeout.
Another thing I noticed is that the progress bar does not reflect the real download status: it shows 100% even though the actual file is still being downloaded.
Any idea how to fix these problems?
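One observation, plus a reference sketch (not a confirmed fix): the code above transfers the file twice, once through requests (which the bar tracks) and once through Playwright's own download, which may be why the bar hits 100% while the browser download keeps running until its timeout. For reference, here is the requests + tqdm.wrapattr pattern from the question as a standalone, runnable sketch; the function name and the fallback filename are illustrative.

import os
import shutil
import requests
from tqdm.auto import tqdm

def download_with_progress(url, dest_dir="."):
    """Stream `url` to disk while tqdm tracks the bytes actually read."""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total = int(r.headers.get("Content-Length", 0))
        path = os.path.join(dest_dir, os.path.basename(r.url) or "download.bin")
        # wrapattr hooks the raw stream's read() so the bar advances as bytes are consumed
        with tqdm.wrapattr(r.raw, "read", total=total, desc=os.path.basename(path)) as raw:
            with open(path, "wb") as output:
                shutil.copyfileobj(raw, output)
    return path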

Resume download with pySmartDL

I would like to know if, when the program stops while downloading a file with pySmartDL, it is possible to resume the download where it stopped.
Is pause/unpause what you are looking for?
https://github.com/iTaybb/pySmartDL/blob/master/test/test_pySmartDL.py#L76
def test_pause_unpause(self, testfile=None):
    obj = pySmartDL.SmartDL(testfile if testfile else self.res_7za920_mirrors, dest=self.dl_dir, progress_bar=False, connect_default_logger=self.enable_logging)
    obj.start(blocking=False)

    while not obj.get_dl_size():
        time.sleep(0.1)

    # pause
    obj.pause()
    time.sleep(0.5)
    if obj.get_status() == "finished":
        # too bad, the file was too small and was downloaded completely before we stopped it.
        # We should download a bigger file
        if self.res_testfile_100mb == testfile:
            self.fail("The download got completed before we could stop it, even though we've used a big file. Are we on a 100GB/s internet connection or somethin'?")
        return self.test_pause_unpause(testfile=self.res_testfile_100mb)

    dl_size = obj.get_dl_size()

    # verify download has really stopped
    time.sleep(2.5)
    self.assertEqual(dl_size, obj.get_dl_size())

    # continue
    obj.unpause()
    time.sleep(2.5)
    self.assertNotEqual(dl_size, obj.get_dl_size())

    obj.wait()
    self.assertTrue(obj.isSuccessful())
More likely you want to restart a partially downloaded file, for which there is an open issue: https://github.com/iTaybb/pySmartDL/issues/14
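For illustration, a condensed sketch of the pause/unpause calls exercised by the test above; the URL, destination folder, and sleep durations are placeholders, not from the thread.

import time
import pySmartDL

obj = pySmartDL.SmartDL("http://example.com/bigfile.zip", dest="downloads/", progress_bar=False)
obj.start(blocking=False)   # non-blocking so we can pause from this thread

time.sleep(2)
obj.pause()                 # transfer stops here
print("paused at", obj.get_dl_size(), "bytes")

obj.unpause()               # transfer continues from where it stopped
obj.wait()                  # block until the download finishes
print("finished:", obj.isSuccessful())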

Why is os.path.getmtime() always running twice? It does not make any sense

Here is the code:
import os
import asyncio

async def func_placing_sell_orders():
    prev_final_stocks_list_state = os.path.getmtime('stock_data//final_stocks_list.json')
    print('i run once')
    while True:
        if (prev_final_stocks_list_state != os.path.getmtime('stock_data//final_stocks_list.json')):
            prev_final_stocks_list_state = os.path.getmtime('stock_data//final_stocks_list.json')
            print('here')

asyncio.get_event_loop().run_until_complete(func_placing_sell_orders())
Simplified version:
import os

def simple():
    state = os.path.getmtime('file.json')
    print('i run once')
    while True:
        if (state != os.path.getmtime('file.json')):
            state = os.path.getmtime('file.json')
            print('here')

simple()
This is the print out:
i run once
here
here
"here" gets printed twice every time I save the file. I checked the gap between the previous and the current modified time and it is always different, which implies it should only run once per save.
This is so basic that I don't understand why I'm getting this result. Please send help.
If the file is large enough, maybe the first "here" fires while the file is still being written and the last "here" after the save is done. Also, if you're writing the edits with something like open("file", "w"), the file is first truncated (first "here") and then filled with the new data (second "here").
You can ignore reports that come too quickly (< 1 s) with a simple timer:
import os
import time

state = os.path.getmtime('file.json')
lastEdit = time.time()
while True:
    if (state != os.path.getmtime('file.json')):
        state = os.path.getmtime('file.json')
        # ignore a change reported less than a second after the previous one
        if time.time() - lastEdit > 1:
            print('here')
        lastEdit = time.time()
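As a variation on the same debounce idea, here is a hedged sketch of a helper that simply waits until the file's mtime has stopped changing; the settle and poll intervals are arbitrary choices.

import os
import time

def wait_for_stable_mtime(path, settle=1.0, poll=0.2):
    """Return the file's mtime once it has stopped changing for `settle` seconds."""
    last = os.path.getmtime(path)
    last_change = time.time()
    while time.time() - last_change < settle:
        time.sleep(poll)
        current = os.path.getmtime(path)
        if current != last:
            # the file changed again; restart the settle window
            last = current
            last_change = time.time()
    return last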

request.urlretrieve in multiprocessing Python gets stuck

I am trying to download images from a list of URLs using Python. To make the process faster, I used the multiprocessing library.
The problem I am facing is that the script often hangs/freezes on its own, and I don't know why.
Here is the code that I am using:
...
import multiprocessing as mp

def getImages(val):
    # Download images
    try:
        url =    # preprocess the url from the input val
        local =  # Filename Generation From Global Variables And Rand Stuffs...
        urllib.request.urlretrieve(url, local)
        print("DONE - " + url)
        return 1
    except Exception as e:
        print("CAN'T DOWNLOAD - " + url)
        return 0

if __name__ == '__main__':
    files = "urls.txt"
    lst = list(open(files))
    lst = [l.replace("\n", "") for l in lst]
    pool = mp.Pool(processes=4)
    res = pool.map(getImages, lst)
    print("tempw")
It often gets stuck halfway through the list (it prints DONE or CAN'T DOWNLOAD for about half of the list it has processed, but I don't know what is happening with the rest). Has anyone faced this problem? I have searched for similar problems (e.g. this link) but found no answer.
Thanks in advance
OK, I have found an answer.
A possible culprit was that the script got stuck connecting to/downloading from a URL, so I added a socket timeout to limit the time spent connecting and downloading an image.
And now, the issue no longer bothers me.
Here is my complete code:
...
import multiprocessing as mp
import socket

# Set the default timeout in seconds
timeout = 20
socket.setdefaulttimeout(timeout)

def getImages(val):
    # Download images
    try:
        url =    # preprocess the url from the input val
        local =  # Filename Generation From Global Variables And Rand Stuffs...
        urllib.request.urlretrieve(url, local)
        print("DONE - " + url)
        return 1
    except Exception as e:
        print("CAN'T DOWNLOAD - " + url)
        return 0

if __name__ == '__main__':
    files = "urls.txt"
    lst = list(open(files))
    lst = [l.replace("\n", "") for l in lst]
    pool = mp.Pool(processes=4)
    res = pool.map(getImages, lst)
    print("tempw")
Hope this solution helps others who are facing the same issue.
It looks like you're facing a GIL issue: the Python Global Interpreter Lock basically forbids Python from doing more than one task at the same time.
The multiprocessing module really does launch separate instances of Python to get the work done in parallel.
But in your case, urllib is called in all of these instances: each of them tries to lock the IO process; the one that succeeds (i.e. comes first) gets you the result, while the others (trying to lock an already locked process) fail.
This is a very simplified explanation, but here are some additional resources:
You can find another way to parallelize requests here: Multiprocessing useless with urllib2?
And more info about the GIL here: What is a global interpreter lock (GIL)?
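To illustrate the "another way to parallelize requests" suggestion, here is a hedged sketch using a thread pool with an explicit per-request timeout instead of a process pool; the worker shape, output filenames, and urls.txt mirror the question's setup, while everything else is an assumption.

import os
import urllib.request
from concurrent.futures import ThreadPoolExecutor

def fetch(url, timeout=20):
    """Download one URL into the current directory; return (url, success)."""
    local = os.path.basename(url) or "index.html"
    try:
        # the timeout bounds how long a single stuck connection can block a worker
        with urllib.request.urlopen(url, timeout=timeout) as resp, open(local, "wb") as out:
            out.write(resp.read())
        return url, True
    except Exception:
        return url, False

if __name__ == "__main__":
    with open("urls.txt") as f:
        urls = [line.strip() for line in f if line.strip()]
    with ThreadPoolExecutor(max_workers=4) as pool:
        for url, ok in pool.map(fetch, urls):
            print(("DONE - " if ok else "CAN'T DOWNLOAD - ") + url)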

How to use progressbar module with urlretrieve

My Python 3 script downloads a number of images over the internet using urlretrieve, and I'd like to add a progress bar with a completion percentage and download speed for each download.
The progressbar module seems like a good solution; I've looked through their examples, and example4 seems like the right thing, but I still can't understand how to wrap it around urlretrieve.
I guess I should add a third parameter:
urllib.request.urlretrieve('img_url', 'img_filename', some_progressbar_based_reporthook)
But how do I properly define it?
The suggestion in the other answer did not progress for me past 1%. Here is a complete implementation that works for me on Python 3:
import progressbar
import urllib.request

pbar = None

def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

urllib.request.urlretrieve(model_url, model_file, show_progress)
I think a better solution is to create a class that holds all the needed state:

import progressbar

class MyProgressBar():
    def __init__(self):
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        if not self.pbar:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            self.pbar.finish()

and call:
urllib.request.urlretrieve('img_url', 'img_filename', MyProgressBar())
The hook is defined as:
urlretrieve(url[, filename[, reporthook[, data]]])
"The third argument, if present, is a hook function that will be called once on establishment of the network connection and once after each block read thereafter. The hook will be passed three arguments; a count of blocks transferred so far, a block size in bytes, and the total size of the file. The third argument may be -1 on older FTP servers which do not return a file size in response to a retrieval request."
So, you can write a hook as follows:
from progressbar import ProgressBar

# Global variables
pbar = None
downloaded = 0

def show_progress(count, block_size, total_size):
    global pbar, downloaded  # the hook mutates the module-level state above
    if pbar is None:
        pbar = ProgressBar(maxval=total_size)

    downloaded += block_size
    pbar.update(block_size)
    if downloaded == total_size:
        pbar.finish()
        pbar = None
        downloaded = 0
As a side note, I strongly recommend the requests library, which is much easier to use; you can iterate over the response with its iter_content() method (see the sketch below).
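As a hedged sketch of that recommendation (not part of the original answer): requests with iter_content() driving the same progressbar module, assuming the server sends a Content-Length header.

import requests
import progressbar

def download(url, filename, chunk_size=8192):
    """Stream `url` to `filename`, updating a progress bar as chunks arrive."""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total = int(r.headers.get("Content-Length", 0))
        bar = progressbar.ProgressBar(maxval=total).start()
        done = 0
        with open(filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                done += len(chunk)
                bar.update(min(done, total))  # clamp in case Content-Length was off
        bar.finish()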
In Python 3 you can achieve the same result without the progressbar module:

import urllib.request

# prepare progressbar
def show_progress(block_num, block_size, total_size):
    print(round(block_num * block_size / total_size * 100, 2), end="\r")

# use urlretrieve
urllib.request.urlretrieve(url, fileName, show_progress)
