Python loop over batch of files - python

I want to loop over a batch of files in order to get 32 images of each sub-directory at a time (I can't load all images at once due to memory), e.g. load imgs 1-32 of every dir, use them, then load imgs 33-64, then 65-96, etc.
My directory:
Rootdir
- dir1
- img 1
- img 2
- img...
- dir2
- img 5000001
- img 5000002
- img...
- dir3
- img 10000001
- img 10000002
- img...
So I would need to load img 1,2,...,32, 5000001,...,5000032, 10000001,...,10000032 on the first loop, then img 33,34,...,64, 5000033,...,5000064, 10000033,...,10000064 on the second loop
Is there a way to do this properly?
I am trying using os.walk and it allows me to loop over my directory but I don't see how I can adapt this loop to my required 32 batches?
# Walk the tree rooted at `rootdir`; os.walk yields a (dirpath, dirnames,
# filenames) triple for every directory it visits.
for dirName, subdirList, fileList in os.walk(rootdir):
print('Found directory: %s' % dirName)
# Sort so the images are visited in deterministic (lexicographic) order.
for fname in sorted(fileList):
img_path = os.path.join(dirName, fname)
try:
# NOTE(review): `load_img` and `imgs` are defined elsewhere in the
# author's script (a Keras-style loader, presumably) — not shown here.
img = load_img(img_path, target_size=None)
imgs.append(img)
except Exception as e:
# NOTE(review): `i` is not defined in this snippet — likely a leftover
# counter from the author's full script.
print(str(e), fname, i)
#do something on imgs
EDIT
All of your comments get me results like this:
dir1/img1.jpg to dir1/img32.jpg then dir1/img33.jpg to dir1/img64.jpg then
...
then dir2/img1.jpg to dir2/img32.jpg then dir2/img33.jpg to dir2/img64.jpg then ...
then dir3/img1.jpg to dir3/img32.jpg then dir3/img33.jpg to dir3/img64.jpg :(
What I'm trying to achieve is:
Files of dir1 numero 1 to 32 + files of dir2 numero 1 to 32 + files of dir3 numero 1 to 32 then
Files of dir1 numero 33 to 64 + files of dir2 numero 33 to 64 + files of dir3 numero 33 to 64 in the same loop

os.walk already returns a generator which will yield a 3-tuple (dirpath, dirnames, filenames) values on fly, so you just need to yield the slice of the filenames array in batches of 32.
This is an example:
import os

# Root directory to scan.
rootdir = r"Root"
# Number of file paths yielded per batch.
batch_size = 32


def walk_dirs(directory, batch_size):
    """Walk *directory* recursively and yield full file paths in lists of
    at most *batch_size* entries, one directory at a time."""
    for current_dir, _subdirs, names in os.walk(directory):
        batch = []
        for name in names:
            batch.append(os.path.join(current_dir, name))
            if len(batch) == batch_size:
                yield batch
                batch = []
        # Flush the final, possibly shorter, batch for this directory.
        if batch:
            yield batch


# Finally iterate over the walk_dirs function, which returns a generator.
for file_name_batch in walk_dirs(rootdir, batch_size):
    for file_name in file_name_batch:
        # Do some processing on the batch now
        print(file_name)

You could take a look at os.walk()
EDIT: simple counter example
# Sketch: process items in chunks of 32 using a running counter.
counter = 0
for x in mylist:
# do something with x
todo_list.append(x)
counter += 1
# Every 32nd item, flush the accumulated batch.
if counter % 32 == 0:
# do something with todo list
todo_list = [] # empty todo list for next batch
# NOTE(review): `todo_list` must be initialised to [] before the loop, and
# any trailing items (fewer than 32) are never processed after the loop.

What about always using the same img list and process it as soon as you have 32 images?
# Accumulate images into `imgs` and process the list whenever it reaches
# 32 entries, then reset it for the next batch.
imgs = []
for dirName, subdirList, fileList in os.walk('c:\\Java\\'):
    print('Found directory: %s' % dirName)
    for fname in sorted(fileList):
        img_path = os.path.join(dirName, fname)
        try:
            # NOTE(review): `load_img` comes from the asker's own code
            # (Keras-style loader) — not defined in this snippet.
            img = load_img(img_path, target_size=None)
            imgs.append(img)
            if len(imgs) == 32:
                print("Doing what I have to with current imgs list (add your function here)")
                # BUG FIX: reset the *batch* list `imgs`, not the single
                # image variable `img` — otherwise the batch never empties
                # and `len(imgs) == 32` is true only once.
                imgs = []  # cleaning imgs list for the next batch
        except Exception as e:
            print(str(e))
#do something on imgs
if you need to keep track of all the previous lists you can simply copy the list content over.
Let me know if you want that implementation too.

Okay I found a way, not the most beautiful but here it is:
I use a set to track which files I have already seen, and I skip (continue) a file if it is in the set so it doesn't count toward the current batch.
# Number of sub-directories under rootdir (hard-coded).
number_of_directory = 17
batch_size = 32
# Filenames consumed in earlier passes. NOTE(review): this assumes every
# filename is globally unique across directories (true here, since the
# images carry distinct numbers) — confirm before reusing.
seen = set()
# One outer iteration per global batch; pbar/data_number/rootdir and the
# cv2/np names come from the author's surrounding script.
for overall_count in pbar(range(data_number // (batch_size * number_of_directory))):
imgs = []
# Re-walk the whole tree each pass and take the next `batch_size`
# not-yet-seen files from every directory.
for dirName, subdirList, fileList in os.walk(rootdir):
count = 0
for fname in sorted(fileList):
if fname in seen:
continue
# Stop after batch_size new files from this directory.
if count == batch_size:
break
img_path = os.path.join(dirName, fname)
try:
img = cv2.imread(img_path, cv2.IMREAD_COLOR)
img = cv2.resize(img, (img_width, img_height))
imgs.append(np.array(img))
except Exception as e:
print(str(e), fname)
# Mark the file as consumed even if reading failed, so it is not
# retried forever.
seen.add(fname)
count +=1
#Do something with images

Related

Where is the bottleneck in my image manipulation code?

I wrote this script to do some image processing on a large number of PNG files (around 1500 in total). They are organized into subdirectories.
That's my code:
from PIL import Image
import os
path = "/Some/given/path"
# Phase 1: collect every .png under `path` (recursively).
file_list = []
counter = 1
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith(".png"):
temp_file = {"path": os.path.join(root, file), "name": file}
file_list.append(temp_file)
# Phase 2: blacken all non-transparent pixels of each collected file.
for curr_file in file_list:
img = Image.open(curr_file["path"])
img = img.convert("RGBA")
val = list(img.getdata())
new_data = []
for item in val:
# Keep fully transparent pixels (alpha == 0) unchanged...
if item[3] == 0:
new_data.append(item)
else:
# ...and replace everything else with opaque black.
new_data.append((0, 0, 0, 255))
# PERF NOTE(review): per the answer below, this putdata() call sits
# *inside* the pixel loop in the original code — i.e. it runs once per
# pixel, which is the bottleneck being asked about.
img.putdata(new_data)
# Save as transformN.png next to the original file.
file_name = "transform" + str(counter) + ".png"
replaced_text = curr_file["name"]
new_file_name = curr_file["path"].replace(replaced_text, file_name)
img.save(new_file_name)
counter += 1
The folder structure is as follows:
Source folder
-- folder__1
-- image_1.png
-- image_2.png
-- image_3.png
-- folder__2
-- image_3.png
-- image_5.png
-- folder__3
-- image_6.png
When testing on individual images, the image processing takes only a few seconds. However, when running the script, it takes around an hour to process 15 images. Any suggestions on where I'm messing up?
The main issue is located here:
# The bottleneck: in the original code putdata() is executed for every
# pixel, because the call is inside the loop.
new_data = []
for item in val:
if item[3] == 0:
new_data.append(item)
else:
new_data.append((0, 0, 0, 255))
img.putdata(new_data) # <-- inside the loop: called once per pixel
You don't need to update the content of img for each pixel, if you're collecting the complete new_data anyway. So, just move that line outside the loop:
new_data = []
for item in val:
if item[3] == 0:
new_data.append(item)
else:
new_data.append((0, 0, 0, 255))
img.putdata(new_data) # <-- moved AFTER the loop: called once per image
Now, get rid of iterating all pixels at all by using NumPy and its vectorization capabilities:
from PIL import Image
import os
import numpy as np # <--

path = "/Some/given/path"

# Phase 1: gather all PNG files under `path`.
file_list = []
for root, dirs, files in os.walk(path):
    for file in files:
        if not file.endswith(".png"):
            continue
        file_list.append({"path": os.path.join(root, file), "name": file})

# Phase 2: blacken every non-transparent pixel, vectorised with NumPy.
counter = 1
for curr_file in file_list:
    img = Image.open(curr_file["path"]).convert("RGBA")
    # Single NumPy operation: each pixel whose alpha channel is non-zero
    # becomes opaque black.
    pixels = np.array(img)
    pixels[pixels[..., 3] != 0] = (0, 0, 0, 255)
    img = Image.fromarray(pixels)
    # Save as transformN.png alongside the source image.
    file_name = "transform" + str(counter) + ".png"
    new_file_name = curr_file["path"].replace(curr_file["name"], file_name)
    img.save(new_file_name)
    counter += 1
Basically, you set all pixels with alpha channel not equal to 0 to (0, 0, 0, 255). That's the NumPy one-liner you see there. The line before and after are just for transformation from Pillow Image to NumPy array and vice versa.
EDIT: If you don't want to have NumPy in your code, you could also get rid of the loops by using Pillow's point function, cf. this tutorial:
from PIL import Image
import os
path = "/Some/given/path"
# Phase 1: gather all .png files under `path`.
file_list = []
counter = 1
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith(".png"):
temp_file = {"path": os.path.join(root, file), "name": file}
file_list.append(temp_file)
# Phase 2: blacken non-transparent pixels using Pillow's point/paste,
# avoiding any per-pixel Python loop.
for curr_file in file_list:
img = Image.open(curr_file["path"])
img = img.convert("RGBA")
# Split into the R, G, B, A bands; source[3] is the alpha channel.
source = img.split() # <--
# Mask is 255 where alpha > 0; `i > 0 and 255` yields False (i.e. 0)
# for fully transparent pixels.
mask = source[3].point(lambda i: i > 0 and 255) # <--
# Paste opaque black through the mask: only non-transparent pixels change.
img.paste(Image.new("RGBA", img.size, (0, 0, 0, 255)), None, mask) # <--
file_name = "transform" + str(counter) + ".png"
replaced_text = curr_file["name"]
new_file_name = curr_file["path"].replace(replaced_text, file_name)
img.save(new_file_name)
counter += 1
----------------------------------------
System information
----------------------------------------
Platform: Windows-10-10.0.16299-SP0
Python: 3.9.1
NumPy: 1.20.2
Pillow: 8.1.2
----------------------------------------
You can use snakeviz library to profile your code -
Snakeviz - https://jiffyclub.github.io/snakeviz/
python -m cProfile -o program.prof my_program.py
Once the profile is generated you can visualise and see which function/which line is taking more time.
snakeviz program.prof

Find the large_files but output unexpected results

I have such a program to find the large files
import os, time, shelve
start = time.time()
root = '/'
# errors= set()
# dirs = set()
# NOTE(review): this outer loop is the bug discussed in the answer below —
# the scan restarts from scratch forever, so the same large file is
# printed repeatedly.
while True:
try:
root = os.path.abspath(root) #ensure it's an absolute path
#set the baseline as 100M
#consider the shift
baseline = 100 * 2**20 # 2**20 is 1M
#setup to collect the large files
large_files = []
# Walk the whole tree rooted at `root`.
for foldername, subfolders, files in os.walk(root):
for f in files:
# print(f"{foldername}, {f}")
abspath = os.path.join(foldername, f)
size = os.path.getsize(abspath)
# Record any file of at least `baseline` bytes.
if size >= baseline:
large_files.append((os.path.basename(abspath), size))
print(abspath, size/(2**20))
#write the large files to shelf
shelf = shelve.open('/root/large_files.db')
shelf["large_files"] = large_files
shelf.close()
# NOTE(review): per the answer, this tests only the *last* value of
# `subfolders` after the walk finishes — not a reliable exit condition.
if subfolders == []:
end = time.time()
break
except (PermissionError,FileNotFoundError) as e:
# errors.add(e)
pass
It consistently outputs identical results:
[root#iz2ze9wve43n2nyuvmsfx5z ~]# python3 search_large_files.py
/dev/core 134217726.0078125
/dev/core 134217726.0078125
/dev/core 134217726.0078125
....
However, I see no reason why
print(abspath, size/(2**20))
would keep printing the same file over and over.
What might the problem be in my code?
You have an infinite outer loop with while True:, and apparently /dev/core is the only file in your filesystem that exceeds the file size specified by baseline, so it would keep outputting the same file over and over again.
Remove while True: and un-indent the block inside and your code would work.
Note that your if subfolders == []: condition is outside your for foldername, subfolders, files in os.walk(root): loop and would therefore not be useful. You should record the end time unconditionally anyway so you should simply remove the if condition and the break statement as well.

os.walk not saving images in sub directories

Images is a folder which has 10+ sub-folders, and every sub-folder has one image which I am resizing and saving in the same place, but os.walk is not working — can anyone check what I did wrong?
path='E:/Dataset_Final/Images/'
def count_em(path):
# Walk every sub-directory under `path` and resize each image to 32x32
# in place, overwriting the original file.
for root, dirs, files in sorted(os.walk(path)):
for file_ in files:
full_file_path = os.path.join(root, file_)
print (full_file_path)
img = Image.open(full_file_path)
new_width = 32
new_height = 32
img = img.resize((new_width, new_height), Image.ANTIALIAS)
img.save(os.path.join(root, file_+''),'png')
# BUG (see the answer below): in the original post this `return` sits
# inside the loop, so the function exits after the first directory.
return
count_em(path)
You return after the first directory.
Remove the return statement and your code should work as expected.

Python Read multiple images from multiple folders

I want to read multiple .jpg images, that are in 3 separate folders. The 3 folders are on the same path. I tried to do it like this:
path1 = os.path.abspath('Type_1')
path2 = os.path.abspath('Type_2')
path3 = os.path.abspath('Type_3')
# NOTE(review): os.path.join of several absolute paths keeps only the last
# one — this is why only one folder is ever read (see the answers below).
folder = os.path.join(path1, path2, path3)
def load_images_from_folder(folder):
# Collect every .jpg directly inside `folder`.
images = []
for filename in os.listdir(folder):
if filename.endswith(".jpg"):
img = cv2.imread(os.path.join(folder, filename))
# imread returns None on failure; keep only decoded images.
if img is not None:
images.append(img)
# BUG (see the first answer): in the original post this `return` was
# indented inside the loop, so it returned after the first match.
return images
print(load_images_from_folder(folder))
But it only returns the last path and not all of them. I also tried to use relative paths, such as:
path1 = os.path.relpath('Type_1')
path2 = os.path.relpath('Type_2')
path3 = os.path.relpath('Type_3')
# NOTE(review): joining three sibling paths still does not produce a path
# containing all of them — the same misuse of os.path.join as above.
folder = os.path.join(os.path.sep, path1, path2, path3)
but still the same problem. Can someone help with this?
Your return images line is inside your loop, so it will return as soon as it finds any matching result. You only want it to return after the whole loop has finished.
Reduce the indentation of the return statement so that it lies after the loop instead of inside it.
def load_images_from_folder(folder):
    """Read every *.jpg directly inside *folder* with OpenCV and return
    the successfully decoded images as a list."""
    loaded = []
    for entry in os.listdir(folder):
        if not entry.endswith(".jpg"):
            continue
        picture = cv2.imread(os.path.join(folder, entry))
        # cv2.imread returns None for unreadable files; skip those.
        if picture is not None:
            loaded.append(picture)
    return loaded
[Edit]
If you want to look in multiple adjacent folders, you want something like this:
# Build the three sibling folder paths, then load and concatenate the
# images from each of them into one flat list.
root_folder = '[whatever]/data/train'
folders = [os.path.join(root_folder, x) for x in ('Type_1', 'Type_2', 'Type_3')]
all_images = [img for folder in folders for img in load_images_from_folder(folder)]
That will call load_images on each folder, and put all the results into one list.
If I understand the problem correctly, your file structure looks as follows:
- Type1
- Image1.jpg
- Image2.jpg
- Type2
- Image1.jpg
- Image2.jpg
- Type3
- Image1.jpg
- Image2.jpg
If this is true, then the os.path.join call is errant (it'll result in a string that reads "Type1/Type2/Type3" which achieves nothing for you).
I think the code you're looking for is as follows:
def load_images_from_folder(folder):
# Collect every .jpeg/.jpg image directly inside `folder`.
images = []
for filename in os.listdir(folder):
if any([filename.endswith(x) for x in ['.jpeg', '.jpg']]):
img = cv2.imread(os.path.join(folder, filename))
# imread returns None on failure; keep only decoded images.
if img is not None:
images.append(img)
return images
folders = [
'Type1',
'Type2',
'Type3',
]
# Load each folder separately rather than joining the paths together.
for folder in folders:
images = load_images_from_folder(folder)
# your code that does something with the return images goes here
I know this is really old, but this worked for me recently.
def create_dataset(img_folder):
    """Load every .jpg/.jpeg from the immediate sub-directories of
    *img_folder*.

    Returns a pair (images, labels) where each label is the name of the
    sub-directory the corresponding image came from.
    """
    img_data_array = []
    class_name = []
    for dirl in os.listdir(img_folder):
        for file in os.listdir(os.path.join(img_folder, dirl)):
            if file.endswith(('.jpeg', '.jpg')):
                image_path = os.path.join(img_folder, dirl, file)
                # BUG FIX: cv2.COLOR_BGR2RGB is a cvtColor conversion
                # code, not an imread flag — passing it to imread is a
                # silent mistake. Read normally, then convert channels.
                image = cv2.imread(image_path)
                if image is None:
                    # Unreadable/corrupt file — skip instead of storing None.
                    continue
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                img_data_array.append(image)
                class_name.append(dirl)
    return img_data_array, class_name


img_data, class_name = create_dataset(train_folder)

Python Convert all JPGS in a folder to PDF

I have code that converts all .jpgs in a folder to one PDF, but it is not working. I believe it to be because of something with my directory being passed. The below is the code and my output. Now it states that my PDF was written, but it doesn't display the directory.
# NOTE(review): Python 2 code (print statements); requires reportlab's
# canvas and ImageReader, imported elsewhere in the asker's script.
root = "C:\\Users\\Matthew\\Desktop\\Comics\\"
try:
n = 0
for dirpath, dirnames, filenames in os.walk(root):
# One output PDF per directory, named after the directory itself.
PdfOutputFileName = os.path.basename(dirpath) + ".pdf"
c = canvas.Canvas(PdfOutputFileName)
# BUG (see the answer below): n is 0 on the first directory, so this
# guard skips all the drawing work on the first (often only) pass.
if n > 0 :
for filename in filenames:
LowerCaseFileName = filename.lower()
if LowerCaseFileName.endswith(".jpg"):
print(filename)
filepath = os.path.join(dirpath, filename)
print(filepath)
im = ImageReader(filepath)
imagesize = im.getSize()
# Match the PDF page size to the image; one image per page.
c.setPageSize(imagesize)
c.drawImage(filepath,0,0)
c.showPage()
c.save()
n = n + 1
print "PDF of Image directory created" + PdfOutputFileName
# NOTE(review): a bare except hides the real error — at minimum log it.
except:
print "Failed creating PDF"
The below is my output:
PDF of Image directory created.pdf
At start n is 0.
The os.walk loop only runs once in this case (since there's probably only one directory to scan, and you get only one print statement so that's the proof), providing filenames and dirnames as iterables, but you skip the iteration by testing n > 0, so it does nothing.
for dirpath, dirnames, filenames in os.walk(root):
PdfOutputFileName = os.path.basename(dirpath) + ".pdf"
c = canvas.Canvas(PdfOutputFileName)
# n is still 0 on the first iteration, so this guard skips all the work.
if n > 0 :
My advice: get rid of the if n > 0 test.
Have a look at img2pdf for lossless conversion in Python:
https://gitlab.mister-muffin.de/josch/img2pdf
Example CLI usage:
img2pdf img1.png img2.jpg -o out.pdf

Categories

Resources