I need to download a sizable (~200MB) file. I figured out how to download and save the file. It would be nice to have a progress bar to know how much has been downloaded. I found ProgressBar but I'm not sure how to incorporate the two together.
Here's the code I tried, but it didn't work.
# NOTE(review): the bar is driven by a dummy range(20) loop, so it is not tied
# to actual download progress; download_file() and closing() are defined
# elsewhere. The original paste had lost its indentation (SyntaxError).
bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
with closing(download_file()) as r:
    for i in range(20):
        bar.update(i)
I suggest you try tqdm, it's very easy to use.
Example code for downloading with requests library:
from tqdm import tqdm
import requests

url = "http://www.ovh.net/files/10Mb.dat"  # big file test
# Streaming, so we can iterate over the response.
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get('content-length', 0))
block_size = 1024  # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open('test.dat', 'wb') as file:
    for data in response.iter_content(block_size):
        progress_bar.update(len(data))
        file.write(data)
progress_bar.close()
# If the server advertised a size and we received fewer bytes, flag it.
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    print("ERROR, something went wrong")
The tqdm package now includes a function designed more specifically for this type of situation: wrapattr. You just wrap an object's read (or write) attribute, and tqdm handles the rest; there's no messing with block sizes or anything like that. Here's a simple download function that puts it all together with requests:
def download(url, filename):
    """Stream *url* into *filename* with a tqdm progress bar.

    Uses ``tqdm.wrapattr`` around the raw response's ``read`` so no manual
    chunking is needed. Parent directories are created as required.

    Returns the resolved :class:`pathlib.Path` of the written file.
    Raises ``requests.HTTPError`` (or ``RuntimeError`` for other non-200
    statuses) when the request fails.
    """
    import functools
    import pathlib
    import shutil
    import requests
    from tqdm.auto import tqdm

    r = requests.get(url, stream=True, allow_redirects=True)
    if r.status_code != 200:
        r.raise_for_status()  # Will only raise for 4xx codes, so...
        raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
    file_size = int(r.headers.get('Content-Length', 0))

    path = pathlib.Path(filename).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)

    desc = "(Unknown total file size)" if file_size == 0 else ""
    r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
    with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
        with path.open("wb") as f:
            shutil.copyfileobj(r_raw, f)

    return path
It seems that there is a disconnect between the examples on the Progress Bar Usage page and what the code actually requires.
In the following example, note the use of maxval instead of max_value. Also note the use of .start() to initialize the bar. This has been noted in an Issue.
The n_chunk parameter denotes how many 1024-byte blocks to stream at once while looping through the request iterator.
import requests
import time
import numpy as np
import progressbar

url = "http://wikipedia.com/"


def download_file(url, n_chunk=1):
    """Download *url* into test.html, updating a progressbar per chunk.

    *n_chunk* is the number of 1024-byte blocks read per iteration.
    """
    r = requests.get(url, stream=True)
    # Estimates the number of bar updates
    block_size = 1024
    # Default to 0, not None: int(None) raises TypeError when the server
    # omits the Content-Length header.
    file_size = int(r.headers.get('Content-Length', 0))
    num_bars = np.ceil(file_size / (n_chunk * block_size))
    bar = progressbar.ProgressBar(maxval=num_bars).start()
    with open('test.html', 'wb') as f:
        for i, chunk in enumerate(r.iter_content(chunk_size=n_chunk * block_size)):
            f.write(chunk)
            bar.update(i + 1)
            # Add a little sleep so you can see the bar progress
            time.sleep(0.05)
    return


download_file(url)
EDIT: Addressed comment about code clarity.
EDIT2: Fixed logic so bar reports 100% at completion. Credit to leovp's answer for using the 1024 kb block size.
Also python library enlighten can be used, it is powerful, provides colorful progress bars and correctly works in Linux, Windows.
Below is code + live screen-cast. This code can be run here on repl.it.
import math
import requests, enlighten

url = 'https://upload.wikimedia.org/wikipedia/commons/a/ae/Arthur_Streeton_-_Fire%27s_on_-_Google_Art_Project.jpg?download'
fname = 'image.jpg'

# Should be one global variable
MANAGER = enlighten.get_manager()

r = requests.get(url, stream = True)
assert r.status_code == 200, r.status_code
# None (not 0) when the header is missing, so the counter shows no total.
dlen = int(r.headers.get('Content-Length', '0')) or None

with MANAGER.counter(color = 'green', total = dlen and math.ceil(dlen / 2 ** 20), unit = 'MiB', leave = False) as ctr, \
        open(fname, 'wb', buffering = 2 ** 24) as f:
    for chunk in r.iter_content(chunk_size = 2 ** 20):
        print(chunk[-16:].hex().upper())
        f.write(chunk)
        ctr.update()
Output (+ ascii-video)
It seems like you're going to need to get the remote file size (answered here) to calculate how far along you are.
You could then update your progress bar while processing each chunk... if you know the total size and the size of the chunk, you can figure out when to update the progress bar.
There is an answer with tqdm.
def download(url, fname):
    """Stream *url* into *fname*, showing a byte-scaled tqdm progress bar.

    The bar advances by the number of bytes actually written per chunk.
    """
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with open(fname, 'wb') as file, tqdm(
        desc=fname,
        total=total,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)
Gits: https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
Comparing your already-downloaded size with the total file size would tell you how far along you are. Or you could use tqdm.
For some reason I couldn't get file size with requests when working with zip files, so I used urllib to get it
# A simple downloader with progress bar
import requests
from tqdm import tqdm
import zipfile
from urllib.request import urlopen

url = "https://web.cs.dal.ca/~juanr/downloads/malnis_dataset.zip"
block_size = 1024  # 1 Kibibyte
filename = url.split("/")[-1]

print(f"Downloading {filename}...")
# requests did not expose the size for this zip, so ask urllib for the headers.
site = urlopen(url)
meta = site.info()
# Streaming, so we can iterate over the response.
response = requests.get(url, stream=True)
total_size_in_bytes = int(meta["Content-Length"])
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
# BUG FIX: write to *filename* (the zip extracted below) -- the original
# saved to 'test.dat' and then tried to extract a file that never existed.
with open(filename, 'wb') as file:
    for data in response.iter_content(block_size):
        progress_bar.update(len(data))
        file.write(data)
progress_bar.close()
print("Download complete")

print(f"Extracting {filename}...")
with zipfile.ZipFile(filename, "r") as zip:
    zip.extractall()
print("Extracting complete")
Related
I want to save the files in order of the list. (like bbb.jpg->001.jpg, aaa.jpg -> 002.jpg...)
Because of alphabetical order, files are not saved as I want. (like aaa.jpg, bbb.jpg, ccc.jpg...)
There is also a way to sort files chronologically, but it is also impossible to use multiprocessing.
So my question is how can I save the files in the order I want, or in the name I want.
Here is my code.
from urllib.request import Request, urlopen
import urllib.request
import os
import os.path
import re
import time
from multiprocessing import Pool
import multiprocessing
from functools import partial
# URLs to fetch; note they are deliberately NOT in alphabetical order.
mylist = [
    'https://examsite.com/bbb.jpg',
    'https://examsite.com/aaa.jpg',
    'https://examsite.com/ddd.jpg',
    'https://examsite.com/eee.jpg',
    'https://examsite.com/ccc.jpg',
]
def image_URL_download(path, html):
    """Download the image at URL *html* into directory *path*.

    The saved file name is the final path component of the URL.
    """
    originNames = f"{html}".split('/')[-1]
    # Join properly: './down' + 'x.jpg' used to produce './downx.jpg'.
    os.makedirs(path, exist_ok=True)
    PathandNames = os.path.join(path, str(originNames))
    req = urllib.request.Request(html, headers={'User-Agent': 'Mozilla/5.0'})
    # BUG FIX: 'request.urlopen' was a NameError (the module is
    # urllib.request), and the result shadowed the imported name 'urlopen'.
    body = urllib.request.urlopen(req).read()
    with open(PathandNames, 'wb') as savefile2:
        savefile2.write(body)
    print(f"download {originNames}")
if __name__ == "__main__":
    # Fan the downloads out over 4 worker processes and time the whole run.
    start = time.time()
    path = './down'
    pool = multiprocessing.Pool(processes=4)
    img_down = partial(image_URL_download, path)
    pool.map(img_down, mylist)
    pool.close()
    pool.join()
    print("DONE! time :", time.time() - start)
Here is a full example that takes a bunch of images (thumbnails, here) from Wikimedia commons images. It saves them numbered 000.jpg, 001.jpg, etc. (in /tmp, but of course adjust as needed). Bonus: it displays an animated progress bar during download, courtesy tqdm:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
tld = 'https://commons.wikimedia.org'
url = '/wiki/Category:Images'
# Specify the parser explicitly to avoid bs4's GuessedAtParserWarning and
# parser-dependent behavior across machines.
soup = BeautifulSoup(requests.get(urljoin(tld, url)).content, 'html.parser')
# Collect absolute URLs of all .jpg thumbnails on the category page.
imglist = [x.get('src') for x in soup.find_all('img', src=True)]
imglist = [urljoin(tld, x) for x in imglist if x.endswith('.jpg')]
def load_img(i_url):
    """Fetch one (index, url) pair and save it as /tmp/<index, zero-padded>.jpg."""
    i, url = i_url
    img = requests.get(url).content
    with open(f'/tmp/{i:03d}.jpg', 'wb') as f:
        f.write(img)
    return True
def load_all(imglist):
    """Download every image concurrently, wrapping the executor's iterator in
    tqdm so the bar advances as each download completes."""
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(
            executor.map(load_img, enumerate(imglist)),
            total=len(imglist), unit=' images'))
    return results


results = load_all(imglist)
I like to generate a in-memory (temp file) data stream in Python. One thread is filling the stream with data, and a other one consumes it.
After checking the io - Core tools for working with streams , it seems to me that the io module is the best choice for it.
So I put a simple example for me:
#!/usr/local/bin/python3
# encoding: utf-8
import io

if __name__ == '__main__':
    a = io.BytesIO()
    a.write("hello".encode())
    # BytesIO keeps a single cursor: after write() it sits at the END of the
    # buffer, so read() returned b''. Rewind before reading.
    a.seek(0)
    txt = a.read(100)
    txt = txt.decode("utf-8")
    print(txt)
My example does not work. "hello" is not written to a and cannot be read afterwards. So where is my error? How do I have to alter my code to get a file-like object in memory?
Actually it's written; but reading is the problem. You should be referring to class io.BytesIO. You can get the value using getvalue(). Like,
import io

# Write into an in-memory binary buffer, then pull the whole contents back
# out with getvalue() -- no explicit seek() required.
a = io.BytesIO()
a.write("hello".encode())
txt = a.getvalue().decode("utf-8")
print(txt)
@dhilmathy and @ShadowRanger mentioned that io.BytesIO() does not have separate read and write pointers.
I overcame this problem by creating a simple class that implements a read pointer and remembers the number of bytes written. When the number of bytes read equals the number of bytes written, the file is shrunk to save memory.
My solution so far:
#!/usr/local/bin/python3
# encoding: utf-8
import io
import threading
class memoryStreamIO(io.BytesIO):
    """
    memoryStreamIO

    An in-memory, file-like stream with independent read and write
    positions, guarded by a lock so one thread may produce while another
    consumes. Once every written byte has been read, the underlying buffer
    is truncated to reclaim memory.
    """

    def __init__(self):
        super().__init__()
        self._wIndex = 0  # total bytes written (write position)
        self._rIndex = 0  # bytes consumed so far (read position)
        self._mutex = threading.Lock()

    def write(self, d: bytearray) -> int:
        """Append *d* at the write position; return the byte count written."""
        with self._mutex:
            # BUG FIX: a preceding read() moves the shared BytesIO cursor,
            # so appending must re-position to the write index first.
            super().seek(self._wIndex)
            r = super().write(d)
            self._wIndex += len(d)
            return r

    def read(self, n: int) -> bytes:
        """Read up to *n* bytes starting at the read position."""
        with self._mutex:
            super().seek(self._rIndex)
            r = super().read(n)
            self._rIndex += len(r)
            # Everything written has been consumed: shrink the buffer and
            # reset both positions to keep memory usage bounded.
            if self._rIndex == self._wIndex:
                super().truncate(0)
                super().seek(0)
                self._rIndex = 0
                self._wIndex = 0
            return r

    def seek(self, n: int) -> int:
        """Move the read position to absolute offset *n*."""
        with self._mutex:
            self._rIndex = n
            return super().seek(n)
if __name__ == '__main__':
    # BUG FIX: the class defined above is memoryStreamIO, not streamIO
    # (the original name raised NameError).
    a = memoryStreamIO()
    a.write("hello".encode())
    txt = (a.read(100)).decode()
    print(txt)
    a.write("abc".encode())
    txt = (a.read(100)).decode()
    print(txt)
Clarification: I don't want to add pages to a PDF file. I want to add content to a very big PDF page. The page changes sometimes and the content is different every time.
I'm using pypdf2 and reportlab to make small additions to big PDF pages (~10MB). This takes 30 seconds and more, and the majority of that time is spent parsing the original.
Usually the page also needs to be turned using mergeRotatedTranslatedPage.
My idea was to generate the content array of the original once and then copy it every time I want to add something. So I modified PageObject._merge to do just that. It worked... kind of. I'm now down to 18 sec.
Is there a better way to speed up this process? 18 sec for one page is still pretty slow.
If you want to use 100% capacity of all the cores of your processor, you can do it with "multiprocessing", as follows:
We count the number of pages in the PDF and the number of cores your processor has, in order to calculate how many pages each core has to work on.
The page ranges are sent to the cores, and at the end all the partial PDFs are joined.
# -*- coding: utf-8 -*-
from io import BytesIO
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.colors import Color
from webcolors import hex_to_rgb
import base64
import multiprocessing
import math
class SkyMark:
    """Watermarks every page of a base64-encoded PDF, splitting the pages
    across all processor cores and merging the partial results back into a
    single base64-encoded PDF."""

    def __init__(self):
        # BUG FIX: the original assigned to self.* directly in the class
        # body, which raises NameError at class-definition time.
        self.mainPdf = ''   # PdfFileReader of the source document
        self.mask = ''      # one-page PdfFileReader used as the watermark

    # beginning, call from your api rest y pass request
    def begin(self, request):
        """REST entry point: watermark the 'pdfBase4' PDF from the JSON body
        and return it base64-encoded as a str."""
        stringPdfBase64 = str(request.json['pdfBase4'])
        blob = self.workMultiprocess(stringPdfBase64, 'MyWaterMark text')
        pdfbase64 = blob.decode('utf-8')
        return pdfbase64

    def workMultiprocess(self, stringPdfBase64, message='SkyMark'):
        """Split the pages over the CPU cores, watermark them in parallel and
        return the merged result as base64 bytes (None on error)."""
        try:
            # get pdf PdfFileReader objeto with your message
            self.mask = self.getMaskWaterMark(message)
            # Convert main pdfB64 to PdfFileReader object; tolerate a
            # 'data:...;base64,' prefix by keeping only the last segment.
            sanitizedPdf = stringPdfBase64.rsplit(',', 1)[-1]
            data = base64.b64decode(sanitizedPdf)
            stream = BytesIO(data)
            self.mainPdf = PdfFileReader(stream, strict=False)
            numPaginas = self.mainPdf.getNumPages()

            # count the cores of your processor (fallback if it reports 0)
            coresProcessor = int(multiprocessing.cpu_count()) or 22
            manager = multiprocessing.Manager()
            return_dict = manager.dict()
            jobs = []

            # calculate how many pages each core has to process
            byPage = int(math.ceil(numPaginas / coresProcessor))
            pagesFrom = 0
            pagesTo = 0

            # Send a task to every core; stop once all pages are assigned.
            for i in range(coresProcessor):
                pagesFrom = pagesTo
                pagesTo = pagesFrom + byPage
                if pagesTo > numPaginas:
                    pagesTo = numPaginas
                p = multiprocessing.Process(
                    target=self.doByPage,
                    args=(pagesFrom, pagesTo, i, return_dict))
                jobs.append(p)
                p.start()
                if pagesTo >= numPaginas:
                    break

            for proc in jobs:
                proc.join()

            # Order the partial PDFs by worker index before merging.
            randData = return_dict.values()
            ascArray = sorted(randData, key=lambda k: k['procnum'])
            singlePdfsArray = []
            for pdfs in ascArray:
                singlePdfsArray.append(pdfs['dataB64'])

            # merge task
            return self.mergePdfsArray(singlePdfsArray)
        except Exception as e:
            print(f'Error {e}')

    # Worker: watermark the half-open page range [fromPage, toPage) on one core
    def doByPage(self, fromPage, toPage, procnum, return_dict):
        output = PdfFileWriter()
        waterMark = self.mask.getPage(0)
        for i in range(fromPage, toPage):
            # print(f'WaterMark page: {i}, Core: {procnum}')
            page = self.mainPdf.getPage(i)
            page.mergePage(waterMark)
            page.compressContentStreams()
            output.addPage(page)
        letter_data = BytesIO()
        output.write(letter_data)
        letter_data.seek(0)
        dataB64 = base64.b64encode(letter_data.read())
        return_dict[procnum] = {'procnum': procnum, 'dataB64': dataB64}

    # Single Pdf with your watermark
    def getMaskWaterMark(self, texto):
        """Build a one-page A4 PDF covered with faint repetitions of *texto*."""
        font_name = 'Helvetica'
        font_size = 22
        color = '#000000'
        opacity = 0.08
        x = 1
        y = 840
        filename = ''
        # Repeat the text horizontally so each drawn row spans the page.
        bgTexto = ''
        for i in range(1, 6):
            bgTexto += ' ' + texto
        cantidadRenglones = 100  # number of text rows drawn down the page
        mask_stream = BytesIO()
        watermark_canvas = canvas.Canvas(mask_stream, pagesize=A4)
        watermark_canvas.setFont(font_name, font_size)
        r, g, b = hex_to_rgb(color)
        c = Color(r, g, b, alpha=opacity)
        watermark_canvas.setFillColor(c)
        print(watermark_canvas)
        for i in range(1, cantidadRenglones):
            watermark_canvas.drawString(x, y - (i * 25), bgTexto)
        watermark_canvas.save()
        mask_stream.seek(0)
        mask = PdfFileReader(mask_stream, strict=False)
        return mask

    # Merge all pdf in only one pdf
    def mergePdfsArray(self, arrayPdfsBase64):
        """Concatenate a list of base64 PDFs; return the result base64-encoded."""
        merge = PdfFileMerger()
        for f in arrayPdfsBase64:
            nada = base64.b64decode(f)
            stre = BytesIO(nada)
            src = PdfFileReader(stre, strict=False)
            merge.append(src)
        letter_data = BytesIO()
        merge.write(letter_data)
        letter_data.seek(0)
        data = base64.b64encode(letter_data.read())
        return data
I have my bot working by now, but the thing is it can only send text. I have seen in the Bot API there are functions to send photos, videos... but I can't get it to work. Has anyone achieved it? I'm using Python source code from yukuku/telebot.
# Handler branch from a Python 2 bot (PIL + StringIO era): builds a random
# 512x512 sample image in memory and replies with the encoded JPEG bytes.
# NOTE(review): fragment of a larger if/elif chain -- 'text', 'reply',
# 'Image', 'random' and 'StringIO' are defined elsewhere in the bot.
elif text == '/image':
img = Image.new('RGB', (512, 512))
base = random.randint(0, 16777216)
pixels = [base+i*j for i in range(512) for j in range(512)] # generate sample image
img.putdata(pixels)
output = StringIO.StringIO()
img.save(output, 'JPEG')
reply(img=output.getvalue())
When I change the code, nothing happened.
# Opens and displays the image in a local viewer; show() alone sends nothing
# to Telegram, which is why "nothing happened" from the bot's point of view.
img = Image.open('image.png')
img.show()
Please help me. I need the correct code. Sorry for my bad English.
I have included two functions, one is good for sending local images, the other one is good for sending remote images.
def sendImage():
    """Send a local image file to a Telegram chat via the Bot API sendPhoto."""
    url = "https://api.telegram.org/bot<Token>/sendPhoto"
    data = {'chat_id' : "YOUR_CHAT_ID"}
    # BUG FIX: close the file handle when done instead of leaking it.
    with open('/path/to/img.jpg', 'rb') as photo:
        files = {'photo': photo}
        r = requests.post(url, files=files, data=data)
    print(r.status_code, r.reason, r.content)
def sendImageRemoteFile(img_url):
    """Fetch *img_url* and forward its bytes to a Telegram chat via sendPhoto."""
    url = "https://api.telegram.org/bot<Token>/sendPhoto"
    remote_image = requests.get(img_url)
    # Wrap the downloaded bytes in a file-like object; Telegram's multipart
    # upload wants a filename, so give the pseudo-file one.
    photo = io.BytesIO(remote_image.content)
    photo.name = 'img.png'
    files = {'photo': photo}
    data = {'chat_id' : "YOUR_CHAT_ID"}
    r = requests.post(url, files=files, data=data)
    print(r.status_code, r.reason, r.content)
The solution is
# Python 2 urllib2: fetch the raw image bytes and hand them to reply().
# Fragment of a larger if/elif chain; 'text', 'reply', 'urllib2' defined elsewhere.
elif 'Hi' in text:
reply(img=urllib2.urlopen('img url').read())
or
# Same idea with an exact-match trigger instead of a substring check.
if text == 'help':
reply(img=urllib2.urlopen('img url').read())
Before sending the photo, you have to do output.seek(0) to put the cursor back to the beginning of the file, else it will be read as zero
I understand the question. Here's the answer:
def sendImageFromUrl(url):
    """Download an image, re-encode it as JPEG and post it to Telegram via a
    multipart sendPhoto request. Relies on module-level requests, Image,
    StringIO, multipart, BASE_URL and chat_id."""
    # this tweak added if request image failed
    headers = {'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
    response = requests.get(url, headers=headers)
    # response = requests.get(url)
    img = Image.open(StringIO(response.content))
    # BUG FIX: re-encode into a FRESH buffer; the original saved the JPEG back
    # into the buffer still holding the download, appending it after the
    # original bytes and producing a corrupt payload.
    output = StringIO()
    img.save(output, 'JPEG')
    resp = multipart.post_multipart(BASE_URL + 'sendPhoto', [
        ('chat_id', str(chat_id)),
        ('caption', 'Your Caption'),
    ], [
        ('photo', 'image.jpg', output.getvalue()),
    ])
Make sure your server does have python module: requests.
You can download here: https://pypi.python.org/pypi/requests#downloads
And put in your application like this
/myapp/app.yaml
/myapp/main.py
/myapp/requests/packages/
/myapp/requests/__init__.py
/myapp/requests/adapters.py
etc...
Credit: https://stackoverflow.com/a/17128168/1097372
Put in main.py after line 10
import requests
from StringIO import StringIO
I've seen this question and I followed every step, changing the code to satisfy my requirements, which are Python 3, Pillow, and ctypes. The fewer libraries, the better.
import ctypes
from PIL import ImageGrab, Image
from io import BytesIO

user32 = ctypes.windll.user32
kernel32 = ctypes.windll.kernel32
# GlobalLock returns a pointer; without this, ctypes assumes a 32-bit int
# result and truncates it on 64-bit Windows.
kernel32.GlobalLock.restype = ctypes.c_void_p

# CF_DIB is a Win32 clipboard-format constant (value 8), not an attribute of
# user32 -- 'user32.CF_DIB' was one of the failures.
CF_DIB = 8
GMEM_MOVEABLE = 0x0002

img = ImageGrab.grab()
output = BytesIO()
img.convert("RGB").save(output, "BMP")
data = output.getvalue()[14:]  # strip the 14-byte BITMAPFILEHEADER -> bare DIB
output.close()

# SetClipboardData needs a global-memory HANDLE the clipboard takes ownership
# of -- passing raw Python bytes does not work.
hGlobal = kernel32.GlobalAlloc(GMEM_MOVEABLE, len(data))
pGlobal = kernel32.GlobalLock(hGlobal)
ctypes.memmove(pGlobal, data, len(data))
kernel32.GlobalUnlock(hGlobal)

# BUG FIX: OpenClipboard takes an owner HWND argument (None is fine);
# calling it with no argument caused the "4 bytes missing" ValueError.
user32.OpenClipboard(None)
user32.EmptyClipboard()
user32.SetClipboardData(CF_DIB, hGlobal)
user32.CloseClipboard()
That is the stripped code from my script that, i think, is the same code in the question ported to my requirements. When executed, it should copy the current desktop to the clipboard. I get this instead:
File "C:\Users\Gcq\Documents\python\Screen\Screen.py", line 132, in shot
user32.OpenClipboard()
ValueError: Procedure probably called with not enough arguments (4 bytes missing)
I'm sorry I'm asking such a (probably) easy question here, but I really don't know what is failing, and ctypes is not my thing.
The example uses pywin32, which is Python wrapper around Win32 API that hides some low level details you need to take care yourself of if you want to use ctypes.
Here is how you do it using ctypes; it adds the functionality of creating a globally allocated buffer and copying the data into that buffer:
#!python
from PIL import Image
# from cStringIO import StringIO
from io import BytesIO
from ctypes import *
from ctypes.wintypes import *

# --- Win32 prototypes: declaring restype/argtypes keeps 64-bit pointers
# --- intact and catches bad calls early.
HGLOBAL = HANDLE
SIZE_T = c_size_t
GHND = 0x0042
GMEM_SHARE = 0x2000

GlobalAlloc = windll.kernel32.GlobalAlloc
GlobalAlloc.restype = HGLOBAL
GlobalAlloc.argtypes = [UINT, SIZE_T]

GlobalLock = windll.kernel32.GlobalLock
GlobalLock.restype = LPVOID
GlobalLock.argtypes = [HGLOBAL]

GlobalUnlock = windll.kernel32.GlobalUnlock
GlobalUnlock.restype = BOOL
GlobalUnlock.argtypes = [HGLOBAL]

CF_DIB = 8

OpenClipboard = windll.user32.OpenClipboard
OpenClipboard.restype = BOOL
OpenClipboard.argtypes = [HWND]

EmptyClipboard = windll.user32.EmptyClipboard
EmptyClipboard.restype = BOOL
EmptyClipboard.argtypes = None

SetClipboardData = windll.user32.SetClipboardData
SetClipboardData.restype = HANDLE
SetClipboardData.argtypes = [UINT, HANDLE]

CloseClipboard = windll.user32.CloseClipboard
CloseClipboard.restype = BOOL
CloseClipboard.argtypes = None

#################################################

# Render a 200x200 red image and encode it as BMP in memory; dropping the
# 14-byte BITMAPFILEHEADER leaves a bare DIB, which is what CF_DIB expects.
image = Image.new("RGB", (200, 200), (255, 0, 0))
# output = StringIO()
output = BytesIO()
image.convert("RGB").save(output, "BMP")
data = output.getvalue()[14:]
output.close()

# Copy the DIB into global memory that the clipboard will take ownership of.
hData = GlobalAlloc(GHND | GMEM_SHARE, len(data))
pData = GlobalLock(hData)
memmove(pData, data, len(data))
GlobalUnlock(hData)

OpenClipboard(None)
EmptyClipboard()
# BUG FIX: SetClipboardData expects the HGLOBAL handle (hData), not the
# locked pointer (pData) that GlobalLock returned.
SetClipboardData(CF_DIB, hData)
CloseClipboard()
Whew. Apparently the win32clipboard library does simplify some things when compared to ctypes. Your attempt to simply replace one with the other is far from correct.
So I booted up my Windows virtual machine, installed Pillow and rewrote your program, learning from two other answers:
import io
import ctypes

kernel32 = ctypes.windll.kernel32
user32 = ctypes.windll.user32
# BUG FIX: without this, ctypes assumes GlobalLock returns a 32-bit int and
# truncates the pointer on 64-bit Windows.
kernel32.GlobalLock.restype = ctypes.c_void_p

from PIL import ImageGrab

# Grab the screen and encode it as an in-memory BMP; slicing off the 14-byte
# BITMAPFILEHEADER leaves a CF_DIB-compatible payload.
img = ImageGrab.grab()
output = io.BytesIO()
img.convert('RGB').save(output, 'BMP')
data = output.getvalue()[14:]
output.close()

CF_DIB = 8
GMEM_MOVEABLE = 0x0002

# Copy the pixels into global memory the clipboard will own.
global_mem = kernel32.GlobalAlloc(GMEM_MOVEABLE, len(data))
global_data = kernel32.GlobalLock(global_mem)
# ctypes.memmove avoids relying on msvcrt's memcpy signature.
ctypes.memmove(ctypes.c_void_p(global_data), data, len(data))
kernel32.GlobalUnlock(global_mem)

user32.OpenClipboard(None)
user32.EmptyClipboard()
user32.SetClipboardData(CF_DIB, global_mem)
user32.CloseClipboard()