How to download images while numbering (with multiprocessing) - python

I want to save the files in the order of the list (e.g. bbb.jpg -> 001.jpg, aaa.jpg -> 002.jpg, ...).
Because they keep their original names, the files end up in alphabetical order instead (aaa.jpg, bbb.jpg, ccc.jpg, ...).
Sorting the files chronologically would be another option, but that does not work with multiprocessing either.
So my question is: how can I save the files in the order, or with the names, that I want?
Here is my code.
from urllib.request import Request, urlopen
import urllib.request
import os
import os.path
import re
import time
from multiprocessing import Pool
import multiprocessing
from functools import partial

mylist = ['https://examsite.com/bbb.jpg',
          'https://examsite.com/aaa.jpg',
          'https://examsite.com/ddd.jpg',
          'https://examsite.com/eee.jpg',
          'https://examsite.com/ccc.jpg']

def image_URL_download(path, html):
    originNames = f"{html}".split('/')[-1]
    PathandNames = os.path.join(path, originNames)
    req = urllib.request.Request(html, headers={'User-Agent': 'Mozilla/5.0'})
    data = urllib.request.urlopen(req).read()
    with open(PathandNames, 'wb') as savefile2:
        savefile2.write(data)
    print(f"download {originNames}")

if __name__ == "__main__":
    start = time.time()
    path = './down'
    pool = multiprocessing.Pool(processes=4)
    img_down = partial(image_URL_download, path)
    pool.map(img_down, mylist)
    pool.close()
    pool.join()
    print("DONE! time :", time.time() - start)

Here is a full example that takes a bunch of images (thumbnails, here) from Wikimedia Commons. It saves them numbered 000.jpg, 001.jpg, etc. (in /tmp, but adjust as needed). Bonus: it displays an animated progress bar during the download, courtesy of tqdm:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

tld = 'https://commons.wikimedia.org'
url = '/wiki/Category:Images'
soup = BeautifulSoup(requests.get(urljoin(tld, url)).content)
imglist = [x.get('src') for x in soup.find_all('img', src=True)]
imglist = [urljoin(tld, x) for x in imglist if x.endswith('.jpg')]

def load_img(i_url):
    i, url = i_url
    img = requests.get(url).content
    with open(f'/tmp/{i:03d}.jpg', 'wb') as f:
        f.write(img)
    return True

def load_all(imglist):
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(
            executor.map(load_img, enumerate(imglist)),
            total=len(imglist), unit=' images'))
    return results

results = load_all(imglist)

Related

Logic put in a Python class runs indefinitely

I have logic for bulk-calculating image hashes.
Script 1
import dhash
import glob
from PIL import Image
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager

PATH = '*.jpg'

def makehash(t):
    filename, d = t
    with Image.open(filename) as image:
        image.draft('L', (32, 32))
        row, col = dhash.dhash_row_col(image)
        d[filename] = dhash.format_hex(row, col)

def main():
    with Manager() as manager:
        d = manager.dict()
        with ProcessPoolExecutor() as executor:
            executor.map(makehash, [(jpg, d) for jpg in glob.glob(PATH)])
        print(d)

if __name__ == '__main__':
    main()
For around 10,000 JPEGs, it runs for less than a minute. However, if I put the logic into a class, it runs indefinitely:
import numpy
import cv2
import glob
import os
import dhash
from timeit import default_timer as timer
from datetime import timedelta
from wand.image import Image
from itertools import chain
from alive_progress import alive_bar
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager

FOLDER_DUPLICATE = 'duplicate'

def listdir_nohidden(path):
    return glob.glob(os.path.join(path, '*'))

def create_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def print_elapsed(sec):
    print("Elapsed Time: ", timedelta(seconds=sec))

class ImgToolkit:
    file_list = {}

    def __init__(self):
        # initialize something
        with Manager() as manager:
            self.file_list = manager.dict()
            print("loaded")

    def find_duplicates(self):
        if os.path.exists(FOLDER_DUPLICATE) and listdir_nohidden(FOLDER_DUPLICATE):
            print("ERROR: Duplicate folder exists and not empty. Halting")
        else:
            start = timer()
            print("Phase 1 - Hashing")
            imgs = glob.glob('*.jpg')

            def get_photo_hashes_pillow(t):
                filename, self.file_list = t
                with Image.open(filename) as image:
                    image.draft('L', (32, 32))
                    row, col = dhash.dhash_row_col(image)
                    self.file_list[filename] = dhash.format_hex(row, col)

            with ProcessPoolExecutor() as executor:
                executor.map(get_photo_hashes_pillow, [(jpg, self.file_list) for jpg in imgs])
            print(self.file_list)

            end = timer()
            print_elapsed(end - start)
And I use the class as follows:
from imgtoolkit import imgtoolkit

if __name__ == '__main__':
    kit = imgtoolkit.ImgToolkit()
    kit.find_duplicates()
What did I miss? I am quite new to Python.
UPDATE
I found that the function get_photo_hashes_pillow never gets called; I put a print() call on the first line of the function and it never prints. But why?
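A likely culprit, though not confirmed in the question, is that get_photo_hashes_pillow is defined inside a method: ProcessPoolExecutor has to pickle the callable it sends to worker processes, a nested function cannot be pickled, and since the results of executor.map are never consumed the error is swallowed silently. The Manager created in __init__ also shuts down as soon as its with block exits. A sketch of a restructured version under those assumptions, with the worker at module level and the Manager kept alive (it mirrors Script 1, so it uses PIL rather than wand):

import glob
import dhash
from PIL import Image
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager

def hash_one(t):
    # Module-level worker: picklable, so it can be shipped to child processes
    filename, d = t
    with Image.open(filename) as image:
        image.draft('L', (32, 32))
        row, col = dhash.dhash_row_col(image)
        d[filename] = dhash.format_hex(row, col)

class ImgToolkit:
    def __init__(self):
        # Keep the Manager alive for the lifetime of the object;
        # a "with Manager() as m:" block in __init__ would shut it down on exit
        self.manager = Manager()
        self.file_list = self.manager.dict()

    def find_duplicates(self):
        imgs = glob.glob('*.jpg')
        with ProcessPoolExecutor() as executor:
            # Wrapping the lazy map in list() forces it to run and surfaces worker exceptions
            list(executor.map(hash_one, [(jpg, self.file_list) for jpg in imgs]))
        print(dict(self.file_list))

if __name__ == '__main__':
    kit = ImgToolkit()
    kit.find_duplicates()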

Image path problem when executing Python multiprocessing

The code works fine without multiprocessing; the path problem only appears once multiprocessing is used.
No matter how hard I search, I can't find an answer, so I'm asking for help.
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.preprocessing.image import array_to_img, img_to_array, load_img
import time
from multiprocessing import Pool

def b(path):
    for f in path:
        print(f)
        new_img = load_img(f, target_size=(256, 256))
        arr_img = img_to_array(new_img)
        return arr_img

def main():
    start = int(time.time())
    num_cores = 4
    pool = Pool(num_cores)
    pool.map(b, 'C:\\Users\\003.png')
    print("***run time(sec) :", int(time.time()) - start)

if __name__ == "__main__":
    main()
Error message
load_img
with open(path, 'rb') as f:
FileNotFoundError: [Errno 2] No such file or directory: 'C'
The error message is the same even if the path is put in a variable, as follows:
def main():
    start = int(time.time())
    num_cores = 4
    pool = Pool(num_cores)
    bb = 'C:\\Users\\003.png'
    pool.map(b, bb)
    print("***run time(sec) :", int(time.time()) - start)
This piece of code has the problem:
pool.map(b, 'C:\\Users\\003.png')
You are using map: the first parameter is a function (which is fine in your case), but the second needs to be an iterable of items, for example a list like ['C:\\Users\\003.png'].
Because you passed the bare string 'C:\\Users\\003.png', map iterates over it character by character (first 'C', then ':', and so on), which is what throws the error. So change your code to pass a list:
pool.map(b, ['C:\\Users\\003.png'])
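The same idea extends to several images: pass a list of paths and let each worker call handle a single file. A minimal sketch along those lines (the paths are placeholders, and the worker is simplified to take one path per call instead of looping like the original b):

from multiprocessing import Pool
from keras.preprocessing.image import img_to_array, load_img

def load_one(path):
    # One image path per call; returns the image as an array
    img = load_img(path, target_size=(256, 256))
    return img_to_array(img)

if __name__ == "__main__":
    # Placeholder paths for illustration
    paths = ['C:\\Users\\001.png', 'C:\\Users\\002.png', 'C:\\Users\\003.png']
    with Pool(4) as pool:
        arrays = pool.map(load_one, paths)  # results come back in input order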

Is there a way to speed up PDF page merging (basically watermarking one with the other), when the base page is used repeatedly?

Clarification: I don't want to add pages to a PDF file. I want to add content to a very big PDF page. The page changes sometimes and the content is different every time.
I'm using PyPDF2 and reportlab to make small additions to big PDF pages (~10 MB). This takes 30 seconds or more, and the majority of that time is spent parsing the original.
Usually the page also needs to be rotated using mergeRotatedTranslatedPage.
My idea was to generate the content array of the original once and then copy it every time I want to add something. So I modified PageObject._merge to do just that. It worked... kind of. I'm now down to 18 sec.
Is there a better way to speed up this process? 18 sec for one page is still pretty slow.
If you want to use 100% of all your processor's cores, you can do it with multiprocessing, as follows:
We count the number of pages in the PDF and the number of cores your processor has, in order to calculate how many pages each core has to work on.
The pages to be processed are sent to each core, and at the end all the partial PDFs are merged.
# -*- coding: utf-8 -*-
from io import BytesIO
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.colors import Color
from webcolors import hex_to_rgb
import base64
import multiprocessing
import math

class SkyMark:
    mainPdf = ''
    mask = ''

    # Entry point: call this from your REST API and pass the request
    def begin(self, request):
        stringPdfBase64 = str(request.json['pdfBase4'])
        blob = self.workMultiprocess(stringPdfBase64, 'MyWaterMark text')
        pdfbase64 = blob.decode('utf-8')
        return pdfbase64

    def workMultiprocess(self, stringPdfBase64, message='SkyMark'):
        try:
            # Get a PdfFileReader object containing your watermark message
            self.mask = self.getMaskWaterMark(message)

            # Convert the main base64 PDF to a PdfFileReader object
            sanitizedPdf = stringPdfBase64.rsplit(',', 1)[-1]
            data = base64.b64decode(sanitizedPdf)
            stream = BytesIO(data)
            self.mainPdf = PdfFileReader(stream, strict=False)
            numPaginas = self.mainPdf.getNumPages()

            # Count the cores of your processor
            coresProcessor = int(multiprocessing.cpu_count()) or 22
            manager = multiprocessing.Manager()
            return_dict = manager.dict()
            jobs = []

            # Calculate how many pages each core has to handle
            byPage = int(math.ceil(numPaginas / coresProcessor))
            pagesFrom = 0
            pagesTo = 0

            # Send a task to every core
            for i in range(coresProcessor):
                pagesFrom = pagesTo
                pagesTo = pagesFrom + byPage
                if pagesTo > numPaginas:
                    pagesTo = numPaginas
                p = multiprocessing.Process(target=self.doByPage, args=(pagesFrom, pagesTo, i, return_dict))
                jobs.append(p)
                p.start()
                if pagesTo >= numPaginas:
                    break

            for proc in jobs:
                proc.join()

            # Order the partial PDFs before merging
            randData = return_dict.values()
            ascArray = sorted(randData, key=lambda k: k['procnum'])
            singlePdfsArray = []
            for pdfs in ascArray:
                singlePdfsArray.append(pdfs['dataB64'])

            # Merge task
            return self.mergePdfsArray(singlePdfsArray)
        except Exception as e:
            print(f'Error {e}')

    # Put every core of the processor to work
    def doByPage(self, fromPage, toPage, procnum, return_dict):
        output = PdfFileWriter()
        waterMark = self.mask.getPage(0)
        for i in range(fromPage, toPage):
            # print(f'WaterMark page: {i}, Core: {procnum}')
            page = self.mainPdf.getPage(i)
            page.mergePage(waterMark)
            page.compressContentStreams()
            output.addPage(page)
        letter_data = BytesIO()
        output.write(letter_data)
        letter_data.seek(0)
        dataB64 = base64.b64encode(letter_data.read())
        return_dict[procnum] = {'procnum': procnum, 'dataB64': dataB64}

    # Single-page PDF with your watermark text
    def getMaskWaterMark(self, texto):
        font_name = 'Helvetica'
        font_size = 22
        color = '#000000'
        opacity = 0.08
        x = 1
        y = 840
        filename = ''
        bgTexto = ''
        for i in range(1, 6):
            bgTexto += ' ' + texto
        cantidadRenglones = 100
        mask_stream = BytesIO()
        watermark_canvas = canvas.Canvas(mask_stream, pagesize=A4)
        watermark_canvas.setFont(font_name, font_size)
        r, g, b = hex_to_rgb(color)
        c = Color(r, g, b, alpha=opacity)
        watermark_canvas.setFillColor(c)
        print(watermark_canvas)
        for i in range(1, cantidadRenglones):
            watermark_canvas.drawString(x, y - (i * 25), bgTexto)
        watermark_canvas.save()
        mask_stream.seek(0)
        mask = PdfFileReader(mask_stream, strict=False)
        return mask

    # Merge all the partial PDFs into one PDF
    def mergePdfsArray(self, arrayPdfsBase64):
        merge = PdfFileMerger()
        for f in arrayPdfsBase64:
            nada = base64.b64decode(f)
            stre = BytesIO(nada)
            src = PdfFileReader(stre, strict=False)
            merge.append(src)
        letter_data = BytesIO()
        merge.write(letter_data)
        letter_data.seek(0)
        data = base64.b64encode(letter_data.read())
        return data
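For completeness, a rough sketch of driving the class directly with a base64-encoded PDF, bypassing the REST entry point (the file names are placeholders, and whether it runs depends on having the older PyPDF2 1.x API that PdfFileReader/PdfFileWriter come from):

import base64

if __name__ == '__main__':
    with open('input.pdf', 'rb') as f:  # placeholder input file
        pdf_b64 = base64.b64encode(f.read()).decode('utf-8')
    marker = SkyMark()
    watermarked_b64 = marker.workMultiprocess(pdf_b64, 'CONFIDENTIAL')
    with open('output.pdf', 'wb') as f:  # placeholder output file
        f.write(base64.b64decode(watermarked_b64))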

Progress Bar while download file over http with Requests

I need to download a sizable (~200 MB) file. I figured out how to download and save the file here. It would be nice to have a progress bar to know how much has been downloaded. I found ProgressBar but I'm not sure how to incorporate the two together.
Here's the code I tried, but it didn't work.
bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
with closing(download_file()) as r:
    for i in range(20):
        bar.update(i)
I suggest you try tqdm; it's very easy to use.
Example code for downloading with the requests library:
from tqdm import tqdm
import requests

url = "http://www.ovh.net/files/10Mb.dat"  # big file test
# Streaming, so we can iterate over the response.
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get('content-length', 0))
block_size = 1024  # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open('test.dat', 'wb') as file:
    for data in response.iter_content(block_size):
        progress_bar.update(len(data))
        file.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    print("ERROR, something went wrong")
The tqdm package now includes a function designed more specifically for this type of situation: wrapattr. You just wrap an object's read (or write) attribute, and tqdm handles the rest; there's no messing with block sizes or anything like that. Here's a simple download function that puts it all together with requests:
def download(url, filename):
    import functools
    import pathlib
    import shutil
    import requests
    from tqdm.auto import tqdm

    r = requests.get(url, stream=True, allow_redirects=True)
    if r.status_code != 200:
        r.raise_for_status()  # Will only raise for 4xx codes, so...
        raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
    file_size = int(r.headers.get('Content-Length', 0))

    path = pathlib.Path(filename).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)

    desc = "(Unknown total file size)" if file_size == 0 else ""
    r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
    with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
        with path.open("wb") as f:
            shutil.copyfileobj(r_raw, f)

    return path
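A quick usage example (the URL and target path are placeholders):

saved_path = download("https://example.com/archive.zip", "~/Downloads/archive.zip")
print(f"Saved to {saved_path}")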
It seems that there is a disconnect between the examples on the Progress Bar Usage page and what the code actually requires.
In the following example, note the use of maxval instead of max_value. Also note the use of .start() to initialize the bar. This has been noted in an Issue.
The n_chunk parameter denotes how many 1024-byte blocks to stream at once while looping through the request iterator.
import requests
import time
import numpy as np
import progressbar

url = "http://wikipedia.com/"

def download_file(url, n_chunk=1):
    r = requests.get(url, stream=True)
    # Estimates the number of bar updates
    block_size = 1024
    file_size = int(r.headers.get('Content-Length', None))
    num_bars = np.ceil(file_size / (n_chunk * block_size))
    bar = progressbar.ProgressBar(maxval=num_bars).start()
    with open('test.html', 'wb') as f:
        for i, chunk in enumerate(r.iter_content(chunk_size=n_chunk * block_size)):
            f.write(chunk)
            bar.update(i + 1)
            # Add a little sleep so you can see the bar progress
            time.sleep(0.05)
    return

download_file(url)
EDIT: Addressed comment about code clarity.
EDIT2: Fixed logic so the bar reports 100% at completion. Credit to leovp's answer for using the 1024-byte block size.
The Python library enlighten can also be used; it is powerful, provides colorful progress bars, and works correctly on Linux and Windows.
Below is the code plus a live screencast. This code can be run here on repl.it.
import math
import requests, enlighten

url = 'https://upload.wikimedia.org/wikipedia/commons/a/ae/Arthur_Streeton_-_Fire%27s_on_-_Google_Art_Project.jpg?download'
fname = 'image.jpg'

# Should be one global variable
MANAGER = enlighten.get_manager()

r = requests.get(url, stream=True)
assert r.status_code == 200, r.status_code
dlen = int(r.headers.get('Content-Length', '0')) or None

with MANAGER.counter(color='green', total=dlen and math.ceil(dlen / 2 ** 20), unit='MiB', leave=False) as ctr, \
     open(fname, 'wb', buffering=2 ** 24) as f:
    for chunk in r.iter_content(chunk_size=2 ** 20):
        print(chunk[-16:].hex().upper())
        f.write(chunk)
        ctr.update()
Output (+ ascii-video)
It seems like you're going to need to get the remote file size (answered here) to calculate how far along you are.
You could then update your progress bar while processing each chunk... if you know the total size and the size of the chunk, you can figure out when to update the progress bar.
Here is an answer with tqdm:
import requests
from tqdm import tqdm

def download(url, fname):
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with open(fname, 'wb') as file, tqdm(
        desc=fname,
        total=total,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)
Gist: https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51
Comparing the total file size with what you have already downloaded tells you how far along you are. Or you could use tqdm.
For some reason I couldn't get the file size with requests when working with zip files, so I used urllib to get it:
# A simple downloader with progress bar
import requests
from tqdm import tqdm
import zipfile
from urllib.request import urlopen

url = "https://web.cs.dal.ca/~juanr/downloads/malnis_dataset.zip"
block_size = 1024  # 1 Kibibyte
filename = url.split("/")[-1]

print(f"Downloading {filename}...")
site = urlopen(url)
meta = site.info()
# Streaming, so we can iterate over the response.
response = requests.get(url, stream=True)
total_size_in_bytes = int(meta["Content-Length"])
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(filename, 'wb') as file:  # save under the zip's own name so it can be extracted below
    for data in response.iter_content(block_size):
        progress_bar.update(len(data))
        file.write(data)
progress_bar.close()
print("Download complete")

print(f"Extracting {filename}...")
zip = zipfile.ZipFile(filename, "r")
zip.extractall()
zip.close()
print("Extracting complete")

convert linux python multiprocessing to windows

I would like to use this Linux Python script on Windows.
How do I rewrite it? The part that needs rewriting is the multiprocessing part.
from __future__ import print_function
from collections import Counter
import glob
import multiprocessing
import os
import re
import sys
import time

import pandas as pd  # needed for pd.DataFrame below

def create_data(filepath):
    ...
    return values

filepaths = glob.glob('*/*.txt')
num_tasks = len(filepaths)

p = multiprocessing.Pool()
results = p.imap(create_data, filepaths)

while True:
    completed = results._index
    print("\r--- Completed {:,} out of {:,}".format(completed, num_tasks), end='')
    sys.stdout.flush()
    time.sleep(1)
    if completed == num_tasks:
        break

p.close()
p.join()

df_full = pd.DataFrame(list(results))
print()
thanks for your help.
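On Windows, multiprocessing starts child processes with spawn and re-imports the main module, so the usual change is to move the pool setup behind an if __name__ == '__main__': guard instead of leaving it at module level. A hedged sketch of that rewrite (create_data is reduced to a labeled placeholder, since its body is elided in the question):

from __future__ import print_function
import glob
import multiprocessing
import sys
import time

import pandas as pd

def create_data(filepath):
    # Placeholder: the real per-file logic is elided in the question
    return {'filepath': filepath}

def main():
    filepaths = glob.glob('*/*.txt')
    num_tasks = len(filepaths)

    p = multiprocessing.Pool()
    results = p.imap(create_data, filepaths)

    while True:
        completed = results._index  # same progress trick as the original
        print("\r--- Completed {:,} out of {:,}".format(completed, num_tasks), end='')
        sys.stdout.flush()
        time.sleep(1)
        if completed == num_tasks:
            break

    p.close()
    p.join()

    df_full = pd.DataFrame(list(results))
    print()

if __name__ == '__main__':
    main()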
