I have logic for bulk calculating image hash.
Script 1
import dhash
import glob
from PIL import Image
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager
PATH = '*.jpg'
def makehash(t):
filename, d = t
with Image.open(filename) as image:
image.draft('L', (32,32))
row, col = dhash.dhash_row_col(image)
d[filename] = dhash.format_hex(row, col)
def main():
with Manager() as manager:
d = manager.dict()
with ProcessPoolExecutor() as executor:
executor.map(makehash, [(jpg, d) for jpg in glob.glob(PATH)])
print(d)
if __name__ == '__main__':
main()
For around 10,000 JPEGs, it runs for less than a minute. However, if I put the logic into a class, it runs indefinitely:
import numpy
import cv2
import glob
import os
import dhash
from timeit import default_timer as timer
from datetime import timedelta
from wand.image import Image
from itertools import chain
from alive_progress import alive_bar
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager
FOLDER_DUPLICATE = 'duplicate'
def listdir_nohidden(path):
return glob.glob(os.path.join(path, '*'))
def create_dir(directory):
if not os.path.exists(directory):
os.makedirs(directory)
def print_elapsed(sec):
print("Elapsed Time: ", timedelta(seconds=sec))
class ImgToolkit:
file_list = {}
def __init__(self):
# initialize something
with Manager() as manager:
self.file_list = manager.dict()
print("loaded")
def find_duplicates(self):
if os.path.exists(FOLDER_DUPLICATE) and listdir_nohidden(FOLDER_DUPLICATE):
print("ERROR: Duplicate folder exists and not empty. Halting")
else:
start = timer()
print("Phase 1 - Hashing")
imgs = glob.glob('*.jpg')
def get_photo_hashes_pillow(t):
filename, self.file_list = t
with Image.open(filename) as image:
image.draft('L', (32, 32))
row, col = dhash.dhash_row_col(image)
self.file_list[filename] = dhash.format_hex(row, col)
with ProcessPoolExecutor() as executor:
executor.map(get_photo_hashes_pillow, [(jpg, self.file_list) for jpg in imgs])
print(self.file_list)
end = timer()
print_elapsed(end-start)
And I use the class as follow:
from imgtoolkit import imgtoolkit
if __name__ == '__main__':
kit = imgtoolkit.ImgToolkit()
kit.find_duplicates()
What did I miss? I am quite new in Python.
UPDATE
I found that the function get_photo_hashes_pillow never get called, as I put a print() line in the 1st line of function. But why?
Related
Running my multiprocessing script while using the getpass function in python (import getpass): and
I keep getting this error message. I am also running this code in a .py file on my command prompt terminal, using windows 10 as well.
error message
The following is my code:
import time
import multiprocessing
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np
from multiprocessing import Process, freeze_support, set_start_method
class multiprocess:
def __init__(self):
pass
def test(self, Batch_test):
pw_2 = getpass.getpass(prompt="Password", stream=False)
PML = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver', 'jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2',
['f408195', pw_2], 'C:/JDBC/db2jcc.jar')
PML = PML.cursor()
Batch_query = "select id_hstrcl_data_bch_load_frst_evnt as btch_strt, id_hstrcl_data_bch_load_last_evnt as btch_end from UDBADM.hstrcl_data_bch_load WHERE ID_HSTRCL_DATA_BCH_LOAD BETWEEN 1 and 2"
PML.execute(Batch_query)
Batch_records = PML.fetchall()
Batch_records = pd.DataFrame(Batch_records)
for ind in Batch_test:
print(ind)
first_evnt = Batch_records.iloc[ind, 0]
last_evnt = Batch_records.iloc[ind, 1]
PML_loan_Query = "select CAST(b.id_lpa_alt_loan AS INT) AS id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt between ? and ?"
PML.execute(PML_loan_Query, (first_evnt, last_evnt))
loan_records = PML.fetchall()
return loan_records
def run(self):
processes = []
for i in range(2):
p = multiprocessing.Process(target=self.test, args=(i,))
processes.append(p)
for p in processes:
p.start()
if __name__ == '__main__':
a = multiprocess()
a.run()
There is no problem without multiprocessing.
Using multiprocessing alone causes path problems.
No matter how hard I search, I can't find the answer to the content, so I ask for help
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.preprocessing.image import array_to_img, img_to_array, load_img
import time
from multiprocessing import Pool
def b(path):
for f in path:
print (f)
new_img = load_img(f,target_size=(256,256))
arr_img = img_to_array(new_img)
return arr_img
def main():
start = int(time.time())
num_cores = 4
pool = Pool(num_cores)
pool.map(b, 'C:\\Users\\003.png')
print("***run time(sec) :", int(time.time()) - start)
if __name__ == "__main__":
main()
Error message
load_img
with open(path, 'rb') as f:
FileNotFoundError: [Errno 2] No such file or directory: 'C'
The error message is the same even if it is put as a variable as follows.
def main():
start = int(time.time())
num_cores = 4
pool = Pool(num_cores)
bb = 'C:\\Users\\003.png'
pool.map(b, bb)
print("***run time(sec) :", int(time.time()) - start)
this piece of code has the problem.
pool.map(b, 'C:\\Users\\003.png')
you are using map, the first parameter is a function(which is okay for you), and the second need to be iterable, so it need to be like(a list for example) ['C:\\Users\\003.png']
because you gave it like 'C:\\Users\\003.png' map is trying to iterate it like C , then :\ and so on. which is throwing the error. so please change your code to (i.e a list)
pool.map(b, ['C:\\Users\\003.png'])
I want to save the files in order of the list. (like bbb.jpg->001.jpg, aaa.jpg -> 002.jpg...)
Because of alphabetical order, files are not saved as I want. (like aaa.jpg, bbb.jpg, ccc.jpg...)
There is also a way to sort files chronologically, but it is also impossible to use multiprocessing.
So my question is how can I save the files in the order I want, or in the name I want.
Here is my code.
from urllib.request import Request, urlopen
import urllib.request
import os
import os.path
import re
import time
from multiprocessing import Pool
import multiprocessing
from functools import partial
mylist = ['https://examsite.com/bbb.jpg',
'https://examsite.com/aaa.jpg',
'https://examsite.com/ddd.jpg',
'https://examsite.com/eee.jpg',
'https://examsite.com/ccc.jpg']
def image_URL_download (path, html):
originNames = (f"{html}".split)('/')[-1]
PathandNames = (path + str(originNames))
req = urllib.request.Request(html, headers={'User-Agent': 'Mozilla/5.0'})
urlopen = request.urlopen(req).read()
with open(PathandNames,'wb') as savefile2:
savefile2.write(urlopen)
print (f"download {originNames}")
if __name__ == "__main__":
start = time.time()
path = './down'
pool = multiprocessing.Pool(processes=4)
img_down = partial(image_URL_download, path)
pool.map(img_down, mylist)
pool.close()
pool.join()
print("DONE! time :", time.time() - start)
Here is a full example that takes a bunch of images (thumbnails, here) from Wikimedia commons images. It saves them numbered 000.jpg, 001.jpg, etc. (in /tmp, but of course adjust as needed). Bonus: it displays an animated progress bar during download, courtesy tqdm:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
tld = 'https://commons.wikimedia.org'
url = '/wiki/Category:Images'
soup = BeautifulSoup(requests.get(urljoin(tld, url)).content)
imglist = [x.get('src') for x in soup.find_all('img', src=True)]
imglist = [urljoin(tld, x) for x in imglist if x.endswith('.jpg')]
def load_img(i_url):
i, url = i_url
img = requests.get(url).content
with open(f'/tmp/{i:03d}.jpg', 'wb') as f:
f.write(img)
return True
def load_all(imglist):
with ThreadPoolExecutor() as executor:
results = list(tqdm(
executor.map(load_img, enumerate(imglist)),
total=len(imglist), unit=' images'))
return results
results = load_all(imglist)
I'm implementing logging function in a static period, and getting id to identify where the function is called is necessary for this.
Currently the id is computed with serialization of 'inspect.stack()[1][1:]' by 'pickle' and I have created below python script for testing this feature.
I'm currently using 'inspect' module but it returns only filename, linenumber, source at that line.
import inspect
import hashlib
def loginfo_throttle():
frame = inspect.stack()[1][0]
print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
print(hashlib.md5(pickle.dumps(inspect.stack()[1][1:])).hexdigest())
print(inspect.getframeinfo(frame))
print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def main():
loginfo_throttle();loginfo_throttle()
loginfo_throttle()
if __name__ == '__main__':
main()
The output is as below, and the first and second output is exactly same and this is the problem.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
9ab15e3e3d106afca16e4155ac571152
Traceback(filename='spam.py', lineno=20, function='main', code_context=[' loginfo_throttle();loginfo_throttle()\n'], index=0)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
9ab15e3e3d106afca16e4155ac571152
Traceback(filename='spam.py', lineno=20, function='main', code_context=[' loginfo_throttle();loginfo_throttle()\n'], index=0)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2d63d9b94b68ba14989ffebed0b70ab2
Traceback(filename='spam.py', lineno=21, function='main', code_context=[' loginfo_throttle()\n'], index=0)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
Update
What I'd like to do in actual is as below,
so distinguishing functions in same line using the called time is difficult.
import inspect
import pickle
import time
class LoggingThrottle(object):
time_table = {}
def __call__(self, period, msg):
id = pickle.dumps(inspect.stack()[1][1:])
now = time.time()
last_time = self.time_table.get(id)
if (last_time is None) or ((now - last_time) > period):
print(msg)
self.time_table[id] = now
logging_throttle = LoggingThrottle()
def main():
for i in xrange(1000):
logging_throttle(3, 'foo'); logging_throttle(3, 'spam')
logging_throttle(3, 'bar')
time.sleep(0.1)
if __name__ == '__main__':
main()
This outputs only 'foo' and 'bar'.
foo
bar
foo
bar
You can add time() to distinguish functions called within one line.
import inspect
import hashlib
import pickle
from time import time
def loginfo_throttle():
frame = inspect.stack()[1][0]
print('>'*30)
t = time()
print(hashlib.md5(pickle.dumps((t, inspect.stack()[1][1:]))).hexdigest())
print(t, inspect.getframeinfo(frame))
print('<'*30)
def main():
loginfo_throttle();loginfo_throttle()
loginfo_throttle()
if __name__ == '__main__':
main()
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
5576395790524ffafbc8e3c257580697
(1464942290.227, Traceback(filename='<ipython-input-70-f8a8af94e830>', lineno=16, function='main', code_context=[u' loginfo_throttle();loginfo_throttle()\n'], index=0))
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
a56f921a23caef7f5d40713141a53b55
(1464942290.354, Traceback(filename='<ipython-input-70-f8a8af94e830>', lineno=16, function='main', code_context=[u' loginfo_throttle();loginfo_throttle()\n'], index=0))
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
16e9e10e253c42a4eeb5b5c42911de19
(1464942290.482, Traceback(filename='<ipython-input-70-f8a8af94e830>', lineno=17, function='main', code_context=[u' loginfo_throttle()\n'], index=0))
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
I found the solution, by getting column in source code as below, seeing this post. How to find column number in Python code
import inspect
import pickle
import time
class LoggingThrottle(object):
time_table = {}
def __call__(self, period, msg):
caller = inspect.stack()[1]
frame, _, lineno, _, code, _ = caller
caller_id = (
inspect.getabsfile(frame),
code,
lineno,
frame.f_lasti,
)
# print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
# print(caller_id)
# print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
caller_id = pickle.dumps(caller_id)
now = time.time()
last_time = self.time_table.get(caller_id)
if (last_time is None) or ((now - last_time) > period):
print(msg)
self.time_table[caller_id] = now
logging_throttle = LoggingThrottle()
def main():
for i in xrange(1000):
logging_throttle(3, 'foo'); logging_throttle(3, 'spam')
logging_throttle(3, 'bar')
time.sleep(0.1)
if __name__ == '__main__':
main()
I would like to use this Linux Python script in Windows Python.
how to rewrite it ? The part to be rewritten in multiprocessing part.
from __future__ import print_function
from collections import Counter
import glob
import multiprocessing
import os
import re
import sys
import time
def create_data(filepath):
...
return values
filepaths = glob.glob('*/*.txt')
num_tasks = len(filepaths)
p = multiprocessing.Pool()
results = p.imap(create_data, filepaths)
while (True):
completed = results._index
print("\r--- Completed {:,} out of {:,}".format(completed, num_tasks), end='')
sys.stdout.flush()
time.sleep(1)
if (completed == num_tasks): break
p.close()
p.join()
df_full = pd.DataFrame(list(results))
print()
thanks for your help.