Sorting a set of matrices - python

I have many images (about 10,000). My goal is to run a binary search over this set of two-dimensional matrices to find out whether there are duplicate images, and to delete the duplicates. But is there a notion of one matrix being "greater" than another, so that the set can be sorted? How can I solve this? The alternative is a sequential search, but that is very inefficient.

#Miki's suggestion seemed like a fun exercise, so I created an implementation that you can use.
More on hashing here
import hashlib, os, cv2
# location of images
path = '.'
# Hashes of every distinct image seen so far. A set gives constant-time
# membership tests, which matters when checking thousands of images.
all_hashes = set()
# get and iterate all image paths
all_files = os.listdir(path)
for f in all_files:
    # check image extension
    _, ext = os.path.splitext(f)
    if ext != '.jpg':
        continue
    # open image; join with `path` so this also works when `path` is not
    # the current working directory
    img = cv2.imread(os.path.join(path, f))
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be read
        print('Could not read: ' + f)
        continue
    # hash the raw pixel data and get hex representation
    digest = hashlib.md5(img.tobytes()).hexdigest()
    # check if hash already exists, if not then add it to the set
    if digest in all_hashes:
        print('Already exists: ' + f)
    else:
        all_hashes.add(digest)

Related

Opening files from directory in specific order

I have a folder that contains around 500 images that I am rotating at a random angle from 0 to 360. The files are named 00i.jpeg, where i = 0, then i = 1, and so on. For example, I have an image named 009.jpeg, one named 0052.jpeg, and another named 00333.jpeg. My code below works in that it does rotate the images, but the files are not being read in the correct numeric order.
I would think I would need some sort of stepping code chunk that starts at 0 and adds one each time, but I'm not sure where I would put that. os.listdir doesn't allow me to do that because (from my understanding) it just lists the files out. I tried using os.walk but I cannot use cv2.imread. I receive a SystemError: <built-in function imread> returned NULL without setting an error error.
Any suggestions?
import cv2
import imutils
from random import randrange
import os
# Work from inside the source folder so the bare names returned by
# os.listdir can be handed straight to cv2.imread.
os.chdir("C:\\Users\\name\\Desktop\\training\\JPEG")
j = 0
# Files are visited in whatever order os.listdir returns them.
for infile in os.listdir("C:\\Users\\name\\Desktop\\training\\JPEG"):
    angle = randrange(360)
    rotated = imutils.rotate_bound(cv2.imread(infile), angle=angle)
    # Output files are numbered by visit order, not by the source name.
    out_name = 'testing' + str(j) + '.jpeg'
    os.chdir("C:\\Users\\name\\Desktop\\rotate_test")
    cv2.imwrite("C:\\Users\\name\\Desktop\\rotate_test\\" + out_name, rotated)
    # Switch back so the next bare file name resolves again.
    os.chdir("C:\\Users\\name\\Desktop\\training\\JPEG")
    j += 1
print(infile)
000.jpeg
001.jpeg
0010.jpeg
00100.jpeg
...
Needs to be:
print(infile)
000.jpeg
001.jpeg
002.jpeg
003.jpeg
...
Get a list of files first, then use sort with key where the key is an integer version of the file name without extension.
def _numeric_stem(file_name):
    """Return the text before the first '.' of file_name as an integer."""
    return int(file_name.split('.')[0])


files = os.listdir("C:\\Users\\name\\Desktop\\training\\JPEG")
# Order by numeric value rather than by the raw string
files.sort(key=_numeric_stem)
for infile in files:
    ...
Practical example:
# Sample names in scrambled order, including multi-digit numbers
files = ['003.jpeg','000.jpeg','001.jpeg','0010.jpeg','00100.jpeg','002.jpeg']
# Sort on the integer value of the text before the first '.', so '0010' (10)
# lands after '003' (3) instead of before it as in a plain string sort
files.sort(key=lambda x:int(x.split('.')[0]))
print(files)
Output
['000.jpeg', '001.jpeg', '002.jpeg', '003.jpeg', '0010.jpeg', '00100.jpeg']

Why do I have to run this python script twice to format images correctly?

Goal:
I am trying to batch process images contained inside a folder to resize and optimize them for use online.
Problem:
The following script works, but I have to run it twice before I get the output I want. This is how I would expect it to work:
function 1: resize_aspect_fit()
Resizes each image in the target folder to a specific size, adds "_small.png" to the file name, and saves it as a new file in the subfolder "optimized_images", created in the same directory as the original group of images.
function2: png_conversion()
Takes the newly made images inside "optimized_images" ("_small.png") and applies a conversion that reduces the size of the original file, adding the "-opt.png" suffix to indicate it has been optimized.
function3: unoptimized_cleanup()
Takes the files built by function 1, which are no longer necessary (since they have been optimized) and deletes them, to reduce clutter.
When I run the script I get the expected response from function1, all files in the target file are resized appropriately and saved in the "optimized_images" folder. But I have to run the script a second time before function 2 and 3 take effect. It does work, but I have never encountered an issue like this before. Any idea why this is happening?
What I tried:
I thought this might be related to file open/close operations, but I think I am closing them all at the appropriate time. I swapped Image.open syntax to use "with Image.open(path) as image:" but that did not solve the problem.
I thought there might be some issue with os.listdir or os.path where it might have to be 'reset' in order to iterate through a directory of files twice, but I cannot find anything.
from PIL import Image
import os, sys
# Folder holding the original images; the trailing slash matters because file
# names are appended to it with plain string concatenation
path = "../path/to/images/"
# Sub-folder that receives the generated files
new_folder = '/optimized_images/'
# Drop the leading '/' of new_folder so the two pieces join with one slash
optimized_path = path + new_folder[1:]
# Both directory listings are taken once, when these lines run
dirs = os.listdir( path )
optimized_dirs = os.listdir( optimized_path )
def resize_aspect_fit(final_size=250, dirs=dirs, optimized_path=optimized_path, optimized_dirs=optimized_dirs):
    """Resize each image in ``dirs`` and save it on a square transparent canvas.

    Every entry of ``dirs`` that is a regular file under the module-level
    ``path`` is scaled by ``final_size / max(width, height)``, pasted centred
    onto a ``final_size`` x ``final_size`` RGBA canvas, and written as
    ``<name>_small.png`` into the module-level ``new_folder`` below the
    image's own directory.

    Args:
        final_size: Side length, in pixels, of the square output canvas.
        dirs: File names (relative to ``path``) to process.
        optimized_path: Accepted for signature compatibility; not used here.
        optimized_dirs: Accepted for signature compatibility; not used here.
    """
    for item in dirs:
        if item == '.DS_Store':
            continue
        if not os.path.isfile(path+item):
            continue
        with Image.open(path+item) as im:
            f = os.path.splitext(path+item)[0]
            size = im.size
            ratio = float(final_size) / max(size)
            new_image_size = tuple(int(x*ratio) for x in size)
            # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the
            # same filter under its current name.
            resized = im.resize(new_image_size, Image.LANCZOS)
            new_im = Image.new("RGBA", (final_size, final_size), color=(255,255,255,0))
            # Offset by half the leftover space so the image sits centred.
            new_im.paste(resized, ((final_size-new_image_size[0])//2, (final_size-new_image_size[1])//2))
            new_path, new_filename = f.rsplit('/', 1)
            new_im.save(new_path + new_folder + new_filename + '_small.png', 'PNG', quality=10, optimize=True)
            new_im.close()
def png_conversion(optimized_dirs=optimized_dirs, optimized_path=optimized_path):
    """Write a palette-mode ``-opt.png`` copy of every listed image.

    Each name in ``optimized_dirs`` (except '.DS_Store') is opened from
    ``optimized_path``, converted to a 255-colour adaptive palette, and saved
    next to the source with palette index 255 marked as transparent.
    """
    def _mask_value(a):
        # 255 where the band value is at most 128, otherwise 0
        return 255 if a <=128 else 0

    for entry in optimized_dirs:
        if entry == '.DS_Store':
            continue
        stem, suffix = os.path.splitext(optimized_path+entry)
        with Image.open(stem + suffix) as source:
            source.load()
            # The last band is taken as the alpha channel
            alpha = source.split()[-1]
            paletted = source.convert('RGB').convert('P', palette=Image.ADAPTIVE, colors=255)
            mask = Image.eval(alpha, _mask_value)
            # Paint palette index 255 wherever the mask is set
            paletted.paste(255, mask)
            # A '.png' suffix collapses to '' so it is not repeated in the name
            suffix = suffix.split('.png')[0]
            # The transparency index is 255
            paletted.save(stem + suffix + "-opt.png", transparency=255)
            paletted.close()
def unoptimized_cleanup(optimized_dirs=optimized_dirs, optimized_path=optimized_path):
    """Delete every listed file whose name ends in 'small.png'.

    Names come from ``optimized_dirs`` and are resolved against
    ``optimized_path``.
    """
    for leftover in optimized_dirs:
        if not leftover.endswith('small.png'):
            continue
        target = os.path.join(optimized_path, leftover)
        os.remove(target)
#functions called in order
# Step 1: resize everything listed in `dirs`
resize_aspect_fit(final_size=250, dirs=dirs)
# Steps 2 and 3 receive the module-level `optimized_dirs` list, which was
# built before step 1 ran
png_conversion(optimized_dirs=optimized_dirs, optimized_path=optimized_path)
unoptimized_cleanup(optimized_dirs=optimized_dirs, optimized_path=optimized_path)
I expect that for the following folder structure:
folder/image1.png
folder/image2.png
the output should look like this, with the appropriately sized and smaller files:
folder/optimized_images/image1_small-opt.png
folder/optimized_images/image2_small-opt.png
Relevant Sources that I pulled from:
Converting PNG32 to PNG8 with PIL while preserving transparency
Python/PIL Resize all images in a folder
Sorry for the long question/code, and thanks in advance for any help!!
The problem is that you create the variable optimized_dirs before you run step 1. So before step 1 is executed, you make a list of files in that directory, which is empty at that point. If you run it a second time, the files are in optimized_dirs, and hence then it works.
A solution would be to read the contents of optimized_dirs inside the function png_conversion, i.e. moving os.listdir( optimized_path ) in there.
By the way: I see that you do some magic to build paths where you use [1:] to prevent double slashes. It is more robust to build paths using os.path.join, which will ensure there is always a single slash between directories, regardless of whether you specify them at the start or end of each of them.

How to read the mask of an image using opencv in python

I am working on this challenge called Carvana Segmentation in kaggle. The dataset consists of 5088 images, for each image there is a mask. For eg, the below is a single image (.jpg file) and its corresponding mask (.gif file).
I was able to read .jpg files using cv2, but not the .gif files. The syntax i used to read .gif file is
>>> image = cv2.imread('filename.gif',cv2.IMREAD_GRAYSCALE)
When I try to print the image, returns None
>>> print(image) -> None
Can someone suggest any other method, please
imageio allows to read gifs like this:
import imageio
img = imageio.imread('filename.gif')
Following this repo:
https://github.com/asharma327/Read_Gif_OpenCV_Python/blob/master/gif_to_pic.py
you can do the following to read the image
import cv2
import os
def convert_gif_to_frames(gif):
    """Read frames from ``gif`` until it reports no more and return them.

    Args:
        gif: Object whose ``read()`` returns an ``(okay, frame)`` pair, such
            as a ``cv2.VideoCapture``.

    Returns:
        List of the frames that were read successfully, in order. Pressing
        ^C stops reading early and returns the frames collected so far.
    """
    frame_list = []
    try:
        while True:
            # okay is False once there are no frames left
            okay, frame = gif.read()
            if not okay:
                # Stop before appending: the frame of a failed read is not
                # real image data and must not end up in the result.
                break
            frame_list.append(frame)
    except KeyboardInterrupt:  # press ^C to quit
        pass
    return frame_list
def output_frames_as_pics(frame_list, folder_name='Picturebook_Pics_Kiss'):
    """Write every second frame as a numbered PNG under the working directory.

    Args:
        frame_list: Frames to write; only indices 0, 2, 4, ... are kept.
        folder_name: Name of the output folder, created inside the current
            working directory if it does not exist yet. Replace the default
            with whatever you want your folder to be called.

    Returns:
        None. Files are named ``1.png``, ``2.png``, ... in frame order.
    """
    # Reduce the list of frames by half to make the list more manageable
    frame_list_reduce = frame_list[0::2]
    # Build the output folder path below the current working directory
    out_dir = os.path.join(os.getcwd(), folder_name)
    # exist_ok makes this a no-op when the folder is already there
    os.makedirs(out_dir, exist_ok=True)
    for number, frame in enumerate(frame_list_reduce, start=1):
        cv2.imwrite(os.path.join(out_dir, str(number) + '.png'), frame)
gif = cv2.VideoCapture('/home/ahmedramzy/Documents/gif/giphy.gif')
# here you can get the frames and work on it
# (pass the capture object opened on the previous line)
xx = convert_gif_to_frames(gif)
# every frame is in memory now, so the capture can be released
gif.release()
# here if you want to write it on hard disk using imwrite
output_frames_as_pics(xx)
You can't use imread(): OpenCV has no built-in codec for GIF ([still a license problem](https://answers.opencv.org/question/185929/how-to-read-gif-in-python/)).
Since you are interested in python, you may use PIL library as mentioned here.
from PIL import Image
im = Image.open("animation.gif")
# To iterate through the entire gif
while True:
    try:
        # Advance to the next frame
        im.seek(im.tell()+1)
    except EOFError:
        break  # end of sequence
    # do something to im

Iteratively open images with an increasing ID number in the file name in Python

I've got an image database with a set of images named [frame01.png, frame02.png, ..., frameN.png].
My directory path is ./img, and iteratively I'd like to read one by one, do some image processing until reaching the last one. Since I'm not familiar with strings concatenation in python, what's the easiest way to do it?
file_names = os.listdir('path_to_folder/')
should give you a list of all you files.
To read them you can have:
for file_name in file_names:
read_and_process_image('path_to_folder/' + file_name)
Then inside read_and_process_image:
import matplotlib.image
def read_and_process_image(path):
read_img = matplotlib.image.imread(path) # or whatever you use to read the image
# process read_img
Alternatively, you could have:
import glob
for image_path in glob.glob("path_to_your_image*.png"):
image = matplotlib.image.imread(image_path) # or whatever you use to read the image
# process your image
If you are just looking for a quick way to create the list with this particular names:
[ 'frame' + "%02d" % (i,) + '.png' for i in range(1, MAX_NUM)]
If your last image is number 20, replace MAX_NUM with 20 + 1; in general, for a last image number x, use x + 1.
How/what you use to read the files depends on you. You can use matplotlib.image as in the examples or whatever works for you.

Reading images while maintaining folder structure

I have to write a matlab script in python as apparently what I want to achieve is done much more efficiently in Python.
So the first task is to read all images into python using opencv while maintaining folder structure. For example if the parent folder has 50 sub folders and each sub folder has 10 images then this is how the images variable should look like in python, very much like a cell in matlab. I read that python lists can perform this cell like behaviour without importing anything, so thats good I guess.
For example, below is how I coded it in Matlab:
path = '/home/university/Matlab/att_faces';
subjects = dir(path);
subjects = subjects(~strncmpi('.', {subjects.name}, 1)); %remove the '.' and '..' subfolders
img = cell(numel(subjects),1); %initialize the cell equal to number of subjects
for i = 1: numel(subjects)
path_now = fullfile(path, subjects(i).name);
contents = dir([path_now, '/*.pgm']);
for j = 1: numel(contents)
img{i}{j} = imread(fullfile(path_now,contents(j).name));
disp([i,j]);
end
end
The above img will have 50 cells and each cell will have stored 10 images. img{1} will be all images belonging to subject 1 and so on.
Im trying to replicate this in python but am failing, this is what I have I got so far:
import cv2
import os
import glob
path = '/home/university/Matlab/att_faces'
sub_f = os.listdir(path)
images = []
for n in sub_f:
path_now = os.path.join(path, sub_f[n], '*.pgm')
images[n] = [cv2.imread(file) for file in glob.glob(path_now)]
Its not exactly what I am looking for, some help would be appreciated. Please ignore silly mistakes as it is my first day writing in python.
Thanks
edit: directory structure:
The first problem is that n isn't a number or index, it is a string containing the path name. To get the index, you can use enumerate, which gives index, value pairs.
Second, unlike in MATLAB you can't assign to indexes that don't exist. You need to pre-allocate your image array or, better yet, append to it.
Third, it is better not to use the variable file since in python 2 it is a built-in data type so it can confuse people.
So with preallocating, this should work:
images = [None]*len(sub_f)
for n, cursub in enumerate(sub_f):
path_now = os.path.join(path, cursub, '*.pgm')
images[n] = [cv2.imread(fname) for fname in glob.glob(path_now)]
Using append, this should work:
# One inner list of images per sub-folder, appended in listing order
for cursub in sub_f:
    path_now = os.path.join(path, cursub, '*.pgm')
    images.append([cv2.imread(fname) for fname in glob.glob(path_now)])
That being said, there is an easier way to do this. You can use the pathlib module to simplify this.
So something like this should work:
from pathlib import Path
mypath = Path('/home/university/Matlab/att_faces')
images = []
for subdir in mypath.iterdir():
images.append([cv2.imread(str(curfile)) for curfile in subdir.glob('*.pgm')])
This loops over the subdirectories, then globs each one.
This can even be done in a nested list comprehension:
images = [[cv2.imread(str(curfile)) for curfile in subdir.glob('*.pgm')]
for subdir in mypath.iterdir()]
It should be the following:
import os

import cv2  # needed for cv2.imread below

path = '/home/university/Matlab/att_faces'
sub_f = os.listdir(path)
print(sub_f)   #--- this will print all the files present in this directory ---
#--- this a list to which you will append all the images ---
images = []
#--- iterate through every file in the directory and read those files that end with .pgm format ---
#--- after reading it append it to the list ---
for n in sub_f:
    if n.endswith('.pgm'):
        path_now = os.path.join(path, n)
        print(path_now)
        # flag 1 asks cv2.imread for a colour image
        images.append(cv2.imread(path_now, 1))
import cv2
import os
import glob
path = '/home/university/Matlab/att_faces'
sub_f = os.listdir(path)
# read the images: one inner list per entry of the parent folder
images = [
    [cv2.imread(pgm_file)
     for pgm_file in glob.glob(os.path.join(path, sub_name, '*.pgm'))]
    for sub_name in sub_f
]
# display the images one at a time; each waits for a key press
for subject_images in images:
    for picture in subject_images:
        cv2.imshow('image',picture)
        cv2.waitKey(0)
cv2.destroyAllWindows()

Categories

Resources