match subfolder name with file name and output folder python - python

I have the following folder-file structure:
mainfolder_segment_polygon
folder_poly5numSeg
subfolder_compactness40
subfolder_aoi1
file_aoi1_seg0.shp
file_aoi1_seg1.shp
subfolder_aoi2
file_aoi2_seg0.shp
file_aoi2_seg1.shp
folder_poly6numSeg
subfolder_compactness40
subfolder_aoi1
file_aoi1_seg0.shp
file_aoi1_seg1.shp
subfolder_aoi2
file_aoi2_seg0.shp
file_aoi2_seg1.shp
I want to be able to load all the files from the same folder (segment_polygon), apply a function to them, and export to another set of folders (segment_multipoly) with the same structure.
The files from r".\segmentation_aoi\segment_polygon\poly5numSeg\compactness40\aoi1" should be processed together and be exported to r".\segmentation_aoi\segment_multipoly\multi5numSeg\compactness40\aoi1"
The files from r".\segmentation_aoi\segment_polygon\poly6numSeg\compactness40\aoi2" should be processed together and be exported to r".\segmentation_aoi\segment_multipoly\multi6numSeg\compactness40\aoi2"
and so on...
The names "mainfolder", "folder", "subfolder", "file" are there just to indicate to which level the names belong, but they are not part of the folder's labels.
input_path = os.path.join(src, "segment_polygon\\")
output_path = os.path.join(src, "segment_multipoly\\")
root = Path(input_path)
for maindir, subdirs, shpfiles in os.walk(input_path):
for shp in shpfiles:
aoi_root, shp_ext = shp.split("_")
for file in root.glob("*/*/*/*.shp"):
part_path = Path(file).parts
folder_numSeg_name = part_path[9] #here I get the subfolder "poly5numSeg", "poly6numSeg", etc
folder_aoi_name = part_path[11] #here I get the subfolder "aoi1", "aoi2", etc...
aoiprep_seg = part_path[12] # here I get the name of the file "aoi1_seg0.shp", aoi1_seg1.shp", etc
if aoi_root == folder_aoi_name:
'''apply a function to shp'''
shp.to_file(os.path.join(output_path, folder_numSeg_name, "compactness40\\", folder_aoi_name, shp)
I am a bit lost.
Working in Windows 10, Python 3. Thank you for all help.
UPDATE OF THE SCRIPT
segment_polygon = os.path.join(output, "segment_polygon\\") # input path
segment_multipoly = os.path.join(output, "segment_multipoly\\") # output path
# 1. get aoi directories
aoi_dir = [path for path in glob.glob(os.path.join(segment_polygon, "*/*/*"))
if os.path.isdir(path)]
# list to store the shapefiles to be intersected
input_list = []
for path in aoi_dir:
# 2. get the files
shp_paths = glob.glob(path + os.sep + '*.shp')
for shp_path in shp_paths:
# 3. do things with shp_path
full_path, seg_shp = os.path.split(shp_path)
aoi_folder = full_path[-5:] # aoi01, aoi02, aoi03....aoi25
if seg_shp.startswith(aoi_folder):
input_list.append(shp_path) # creates the new list with shapefiles that start with the same aoiX value
auto_inter = gpd.GeoDataFrame.from_file(input_list[0]) #process shp
for i in range(len(input_list)-1):
mp = gpd.GeoDataFrame.from_file(input_list[i+1]) # process shp
auto_inter = gpd.overlay(auto_inter, mp, how='intersection') #process shp
print(f"shp included in the list:\n {input_list}")
# 4. create your output file path
print(full_path)
output_path = full_path.replace("poly", "multi")
N_output_path = output_path.replace("gon", "polygon")
print(f"output_path:\n {N_output_path}")
# make sure the directories exist
if not os.path.exists(os.path.dirname(N_output_path)):
os.makedirs(os.path.dirname(N_output_path), exist_ok=True)
# create output file name
multipoly_name = aoi_folder + ".shp"
# export
auto_inter.to_file(os.path.join(N_output_path, multipoly_name)) #export shp
Incorporated changes from ygorg. However, it takes ALL the shapefiles for intersection. I want only aoi1 files for intersection and save on aoi1 folder. Then, aoi2 shapefiles and save on aoi2 folder, and so on. This doesn't work yet.

Mixing os.walk and glob seems to be quite confusing. If you want to process each aoiX folder, try to first list all those directories, then list the .shp files in each directory, then apply the function, and finally create your output_path and write to it.
When working with files it's always good to decompose what you need to not get overwhelmed.
# 1. get aoi directories
aoi_dir = [path for path in glob.glob('segment_polygon/*/*/*')
if os.path.isdir(path)]
for path in aoi_dir:
# 2. get the files
shp_paths = glob.glob(path + os.sep + '*.shp')
for shp_path in shp_paths:
# 3. do things with shp_path
# 4. create your output file path
output_path = shp_path.replace('segment_polygon', 'segment_multipoly')
# make sure the directories exist
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# write in output file
And always do a dry run without processing or writing anything, and printing the paths so you are sure of what goes where !

I managed to solve the problem. Thank you ygorg for the input. It led me to the right path.
# Create a list of the aoi subfolders of segment_polygon
# (three levels down: <polyNnumSeg>/<compactnessNN>/<aoiN>)
poly_dir = [path for path in glob.glob(os.path.join(segment_polygon, "*/*/*"))
            if os.path.isdir(path)]
for aoi_poly in poly_dir:
    # define input folder: the aoi folder relative to segment_polygon,
    # e.g. poly5numSeg/compactness40/aoi1 (relpath works with either separator)
    input_subfolder = os.path.relpath(aoi_poly, segment_polygon)
    #print(f"input folder: {input_subfolder}")
    # define export folder
    export_subfolder = input_subfolder.replace("poly", "multi")
    export_folder = os.path.join(segment_multipoly, export_subfolder)
    # make sure the export folder exists before anything is written into it
    os.makedirs(export_folder, exist_ok=True)
    #print(f"output folder: {export_folder}")
    # define name output shapefile
    # Extract the integers from the relative part only: digits in a parent
    # folder (user name, drive, project name...) would otherwise break the
    # three-way unpacking below.
    numseg, compactness, aoi = [int(s) for s in re.findall(r'\d+', input_subfolder)]
    name_output = "aoi" + str(aoi) + "_" + "numSeg" + str(numseg) + "_c" + str(compactness) + ".shp"  # str() is used to concatenate integers as part of the string
    #print(f"shapefile label: {name_output}")
    full_outputpath = os.path.join(export_folder, name_output)
    #print(f"full output path: {full_outputpath}")
    # intersect and merge all single polygons
    # full paths of the shapefiles of this aoi folder only (os.listdir
    # returns bare names, which cannot be opened from another directory)
    input_list = [os.path.join(aoi_poly, name)
                  for name in os.listdir(aoi_poly)
                  if name.endswith('.shp')]
    ###### apply my function here ######
    # export
    filetoexport.to_file(full_outputpath)

Related

Renaming a single directory of files with a specific syntax

I'm trying to rename and pad the names of a few hundred files to the same length.
So far I've managed to correctly rename and pad the file names within my IDE but I'm not sure how I link that to actually rename the files themselves.
Atomic Samurai__________.png
BabyYodatheBased________.png
Baradum_________________.png
bcav____________________.png
This is the code that does the rename and the padding within my IDE:
import glob, os
pad_images = glob.glob(r"C:\Users\test\*.png")
split_images = []
for i in pad_images:
split = i.split("\\")[-1]
split_images.append(split)
longest_file_name = max(split_images, key=len)
longest_int = len(longest_file_name)
new_images = []
for i in split_images:
parts = i.split('.')
new_name = (parts[0]).ljust(longest_int, '_') + "." + parts[1])
I've been trying to get os.rename(old_name, new_name) to work but I'm not sure where I actually get the old name from as I've split things up into different for loops.
Try saving the old file names to a list and do all the modifications (split and rename) in a single loop thereafter:
def padded_name(file_name, width, fill="_"):
    """Return *file_name* with its stem right-padded to *width* characters.

    The extension (the text from the last dot on) is kept untouched, so
    names containing several dots, or no dot at all, are handled correctly.
    A stem that is already *width* characters or longer is left as is.
    """
    stem, ext = os.path.splitext(file_name)
    return stem.ljust(width, fill) + ext


if __name__ == "__main__":
    path = "C:/Users/test"
    # os.listdir returns bare file names, so each entry is already the
    # "old name" that os.rename needs (joined with the folder).
    images = [f for f in os.listdir(path) if f.endswith(".png")]
    # Pad to the length of the longest file name; default=0 keeps an empty
    # folder from raising ValueError.
    length = max(map(len, images), default=0)
    for file in images:
        new_name = padded_name(file, length)
        os.rename(os.path.join(path, file), os.path.join(path, new_name))

How to get the full file path including the directory?

I have a quite complex problem. I have multiple filenames in a list; the root directory of those files is the same: mother_directory. However, every file has a different subdirectory. Now I have a script which is processing some files, and I need to know the exact full path, including the subdirectories, of every file. I know that I could use os.walk, but that will make my function too nested, as inside this function I'm planning to use another function which uses those full paths.
This is the file structure:
mother_directory:
|_child1:
20211011.xml
20211001.xml
|_child2:
20211002.xml
This is my current code:
mother_path = r'c:\data\user1\Desktop\mother_directory'
blue_dates = ['20211011', '20211012', '20211013', '20211001', '20211002']
red_dates = ['20211011', '20211009', '20211008', '20211001', '20211002']
file_names = ['20211011.xml', '20211001.xml', '20211002.xml']
def process_files(x):
if x in red_dates:
match_file = [s for s in file_names if x in s]
file_path = os.path.join(mother_path, match_file [0])
print(file_path)
for x in blue_dates:
process_files(x)
My current output:
c:\data\user1\Desktop\mother_directory\20211011.xml
c:\data\user1\Desktop\mother_directory\20211001.xml
c:\data\user1\Desktop\mother_directory\20211002.xml
When I run my function I want my desired output to be like this:
c:\data\user1\Desktop\mother_directory\child1\20211011.xml
c:\data\user1\Desktop\mother_directory\child1\20211001.xml
c:\data\user1\Desktop\mother_directory\child2\20211002.xml
I added a condition, I believe it will work now.
def process_files(x):
    """Print the full path of every known file whose name contains date *x*.

    Nothing is printed when *x* is not in ``red_dates`` or when no entry of
    ``file_names`` contains *x*. The tree below ``mother_path`` is walked so
    the sub-directory holding each file becomes part of the printed path.
    """
    if x not in red_dates:
        return
    # A set of exact names: avoids the IndexError of match_file[0] when
    # nothing matches, and avoids substring hits on unrelated files.
    wanted = {name for name in file_names if x in name}
    if not wanted:
        return
    for root, _dirs, files in os.walk(mother_path):
        for file in files:
            if file in wanted:
                print(os.path.join(root, file))

Change only a part of filename

I have these images in my folder:
area11.tif
area12.tif
area14.tif
area21.tif
area22.tif
area25.tif
How can I change only the last digit so they become ordered and "more incremental"?
Instead of area14.tif it should be area13.tif, and the same thing for area22/area25.
I have some code, but it's a bit broken because it deletes some files (it's strange, I know...).
EDIT: added (maybe broken..) code
try:
path = (os.path.expanduser('~\\FOLDER\\'))
files = os.listdir(path)
idx = 0
for file in files:
idx =+ 1
i = 'ex_area'
if file.endswith('.tif'):
i = i + str(idx)
os.rename(os.path.join(path, file), os.path.join(path, str(i) + '.tif'))
except OSError as e:
if e.errno != errno.EEXIST:
raise
1) Read the files names in the directory into array (of strings).
2) Iterate over the array of filenames
3) For each filename, slice the string and insert the index
4) Rename
For example:
import glob
import os
from itertools import groupby


def renumbered(names):
    """Return ``(old, new)`` pairs that renumber the last digit per area.

    Names are grouped by their first five characters (``area1``, ``area2``,
    ...). Within each group the files are taken in sorted order and
    numbered from 1, so ``area14.tif`` becomes ``area13.tif`` when only
    ``area11`` and ``area12`` precede it.
    """
    pairs = []
    for prefix, group in groupby(sorted(names), key=lambda n: n[:5]):
        for i, old in enumerate(group, start=1):
            pairs.append((old, "{}{}.tif".format(prefix, i)))
    return pairs


for old, new in renumbered(glob.glob("area*.tif")):
    if old == new:
        continue
    # os.rename may silently overwrite an existing file; refuse instead of
    # losing data.
    if os.path.exists(new):
        raise FileExistsError("refusing to overwrite {}".format(new))
    os.rename(old, new)
First you get the list of the image paths with the glob module:
images = glob.glob("/sample/*.tif")
then you just rename all of them with the os module :
for i, image in enumerate(images):
    # Plain ASCII quotes are required, and the index must be converted with
    # str() because concatenating an int to a str raises TypeError.
    # The new name is joined to the image's own folder so the file is
    # renamed in place instead of being moved to the current directory.
    os.rename(image, os.path.join(os.path.dirname(image), 'area' + str(i) + '.tif'))
First rename all filename to temp name and then add whatever name you prefer
import glob
import os

# Sorted so that the final numbering follows the original file-name order
# (glob returns names in arbitrary order).
images = sorted(glob.glob("*.tif"))

# Pass 1: move every file to a temporary name, so that no final name of
# pass 2 can clash with a file that has not been renamed yet.
tempImages = []
for i, image in enumerate(images):
    temp_name = 'temp_' + str(i) + '.tif'
    os.rename(image, temp_name)
    tempImages.append(temp_name)

# Pass 2: give the final names, numbered from 1. The temporary names are
# remembered rather than re-globbed, which keeps the order of pass 1.
for i, temp_name in enumerate(tempImages, start=1):
    os.rename(temp_name, 'area' + str(i) + '.tif')
Found also this other solution. But there is a small difference in this one, and a better way of do the job in the end (at least for me): create a folder for each area. So simple I didn't think of it before...
BTW, here is the code, commented. I am using this one just because I achieved what I want. Thanks to all who answered, made me learn new things.
path = (os.path.expanduser('~\\FOLDER\\AREA1\\')) #select folder
files = os.listdir(path)
i = 1 #counter
name = 'area' #variable which the file will take as name
for file in files:
if file.endswith('.tif'): #search only for .tif. Can change with any supported format
os.rename(os.path.join(path, file), os.path.join(path, name + str(i)+'.tif')) #name + str(i)+'.tif' will take the name and concatenate to first number in counter. #If you put as name "area1" the str(i) will add another number near the name so, here is the second digit.
i += 1 #do this for every .tif file in the folder
It's a little bit simple, but because I put the files in two separate folders. If you keep the files in the same folder, this will not work properly.
EDIT: now that I see, it's the same as my code above....

Rename multiple files inside multiple folders

So I have a lot of folders with a certain name. In each folder I have 200+ items. The items inside the folders have names like:
CT.34562346.246.dcm
RD.34562346.dcm
RN.34562346.LAO.dcm
And some along that style.
I now wish to rename all files inside all folders so that the number (34562346) is replaced with the name of the folder. So for example in the folder named "1" the files inside should become:
CT.1.246.dcm
RD.1.dcm
RN.1.LAO.dcm
So only the large number is replaced. And yes, all files are similar like this. It would be the number after the first . that should be renamed.
So far I have:
import os
base_dir = "foo/bar/" #In this dir I have all my folders
dir_list = []
for dirname in os.walk(base_dir):
dir_list.append(dirname[0])
This one just lists the entire paths of all folders.
dir_list_split = []
for name in dir_list[1:]: #The 1 is because it lists the base_dir as well
x = name.split('/')[2]
dir_list_split.append(x)
This one extracts the name of each folder.
And then the next thing would be to enter the folders and rename them. And I'm kind of stuck here ?
The pathlib module, which was new in Python 3.4, is often overlooked. I find that it often makes code simpler than it would otherwise be with os.walk.
In this case, .glob('**/*.*') looks recursively through all of the folders and subfolders that I created in a sample folder called example. The *.* part means that it considers all files.
I put path.parts in the loop to show you that pathlib arranges to parse pathnames for you.
I check that the string constant '34562346' is in its correct position in each filename first. If it is then I simply replace it with the items from .parts that is the next level of folder 'up' the folders tree.
Then I can replace the rightmost element of .parts with the newly altered filename to create the new pathname and then do the rename. In each case I display the new pathname, if it was appropriate to create one.
>>> from pathlib import Path
>>> from os import rename
>>> for path in Path('example').glob('**/*.*'):
... path.parts
... if path.parts[-1][3:11]=='34562346':
... new_name = path.parts[-1].replace('34562346', path.parts[-2])
... new_path = '/'.join(list(path.parts[:-1])+[new_name])
... new_path
... ## rename(str(path), new_path)
... else:
... 'no change'
...
('example', 'folder_1', 'id.34562346.6.a.txt')
'example/folder_1/id.folder_1.6.a.txt'
('example', 'folder_1', 'id.34562346.wax.txt')
'example/folder_1/id.folder_1.wax.txt'
('example', 'folder_2', 'subfolder_1', 'ty.34562346.90.py')
'example/folder_2/subfolder_1/ty.subfolder_1.90.py'
('example', 'folder_2', 'subfolder_1', 'tz.34562346.98.py')
'example/folder_2/subfolder_1/tz.subfolder_1.98.py'
('example', 'folder_2', 'subfolder_2', 'doc.34.34562346.implication.rtf')
'no change'
This will rename files in subdirectories too:
import os
import re


def replace_first_number(filename, replacement):
    """Return *filename* with its first run of digits replaced.

    Only the first run is touched, so later numbers in the name (such as
    the ``246`` in ``CT.34562346.246.dcm``) are preserved. A name without
    any digit is returned unchanged.
    """
    match = re.search(r"\d+", filename)
    if match is None:
        return filename
    return filename[:match.start()] + replacement + filename[match.end():]


rootdir = os.path.join("foo", "bar")
for subdir, dirs, files in os.walk(rootdir):
    foldername = os.path.basename(subdir)
    for file in files:
        # Search the file name only. Scanning the whole path would find
        # digits of a directory first (e.g. the folder "1" itself), and
        # replacing in the whole path would rewrite directory components.
        newname = replace_first_number(file, foldername)
        if newname != file:
            os.rename(os.path.join(subdir, file), os.path.join(subdir, newname))
Split each file name on the . and replace the second item with the folder name, then join on .'s again for the new file name. Here's some sample code that demonstrates the concept.
def with_folder_id(file_name, folder):
    """Return *file_name* with its second dot-separated field set to *folder*.

    Names with fewer than three fields (no ``prefix.number.ext`` shape) are
    returned unchanged, so an extension is never overwritten.
    """
    fields = file_name.split('.')
    if len(fields) < 3:
        return file_name
    fields[1] = folder
    return '.'.join(fields)


folder_name = ['1', '2']
file_names = ['CT.2345.234.dcm', 'BG.234234.222.dcm', "RA.3342.221.dcm"]
for folder in folder_name:
    # one list of new names per folder
    new_names = [with_folder_id(x, folder) for x in file_names]
    print(new_names)
Output
['CT.1.234.dcm', 'BG.1.222.dcm', 'RA.1.221.dcm']
['CT.2.234.dcm', 'BG.2.222.dcm', 'RA.2.221.dcm']

Python: moving file to a newly created directory

I've got my script creating a bunch of files (size varies depending on inputs) and I want to be certain files in certain folders based on the filenames.
So far I've got the following but although directories are being created no files are being moved, I'm not sure if the logic in the final for loop makes any sense.
In the below code I'm trying to move all .png files ending in _01 into the sub_frame_0 folder.
Additionally, is there some way to increment both the file endings (_01 to _02, etc.) and the destination folder, i.e. from sub_frame_0 to sub_frame_1 to sub_frame_2 and so on?
for index, i in enumerate(range(num_sub_frames+10)):
path = os.makedirs('./sub_frame_{}'.format(index))
# Slice layers into sub-frames and add to appropriate directory
list_of_files = glob.glob('*.tif')
for fname in list_of_files:
image_slicer.slice(fname, num_sub_frames) # Slices the .tif frames into .png sub-frames
list_of_sub_frames = glob.glob('*.png')
for i in list_of_sub_frames:
if i == '*_01.png':
shutil.move(os.path.join(os.getcwd(), '*_01.png'), './sub_frame_0/')
As you said, the logic of the final loop does not make sense.
if i == '*_01.png':
It would evaluate something like 'image_01.png' == '*_01.png' and be always false.
Regexp should be the way to go, but for this simple case you just can slice the number from the file name.
for i in list_of_sub_frames:
    # the two characters just before '.png', e.g. '01' in 'image_01.png'
    suffix = i[-6:-4]
    if not suffix.isdecimal():
        # not a numbered sub-frame: int() would raise ValueError, so skip it
        continue
    # sub-frame numbers start at 1, folder numbers at 0
    frame = int(suffix) - 1
    shutil.move(os.path.join(os.getcwd(), i), './sub_frame_{}/'.format(frame))
If i = 'image_01.png' then i[-6:-4] would take '01', convert it to integer and then just subtract 1 to follow your schema.
A simple fix would be to check if '*_01.png' is in the file name i and change the shutil.move to include i, the filename. (It's also worth mentioning that i is not a good name for a filepath.)
list_of_sub_frames = glob.glob('*.png')
for i in list_of_sub_frames:
    # `in` is a plain substring test, not a wildcard match: the literal
    # text '*_01.png' never occurs in a file name, so that test would always
    # be False. Testing the ending of the name is what is meant here.
    if i.endswith('_01.png'):
        shutil.move(os.path.join(os.getcwd(), i), './sub_frame_0/')
Additionally is [there some way] to increment both the file endings _01 to _02 etc., and the destn folder ie. from sub_frame_0 to sub_frame_1 to sub_frame_2 and so on.
You could create file names doing something as simple as this:
for i in range(10):
#simple string parsing
file_name = 'sub_frame_'+str(i)
folder_name = 'folder_sub_frame_'+str(i)
Here is a complete example using regular expressions. This also implements the incrementing of file names/destination folders
import os
import glob
import shutil
import re

num_sub_frames = 3

# '<name>_<digits>.png': raw string so \d is a regex escape, the dot is
# escaped so it only matches a literal '.', and the pattern is anchored at
# the end of the name.
SUB_FRAME_RE = re.compile(r'(?P<fname>.+?)_(?P<num>\d+)\.png$')


def destination_for(name):
    """Return the new path for sub-frame *name*, or None if it is not one.

    The trailing number is incremented by one and zero-padded to two
    digits; it is used both in the new file name and in the name of the
    destination folder (``./sub_frame_NN/``).
    """
    m = SUB_FRAME_RE.match(name)
    if m is None:
        return None
    num = int(m.group('num')) + 1
    newname = '{0}_{1:02}.png'.format(m.group('fname'), num)
    return os.path.join('./sub_frame_{0:02}/'.format(num), newname)


def main():
    """Create the folders, slice every .tif and sort the resulting .png files."""
    # No need to enumerate range list without start or step
    for index in range(num_sub_frames + 10):
        # exist_ok so the script can be run more than once
        os.makedirs('./sub_frame_{0:02}'.format(index), exist_ok=True)

    # Slice layers into sub-frames and add to appropriate directory
    for fname in glob.glob('*.tif'):
        # NOTE: image_slicer is not imported in this snippet; it must be
        # imported by the surrounding script.
        image_slicer.slice(fname, num_sub_frames)  # Slices the .tif frames into .png sub-frames

    for name in glob.glob('*.png'):
        newpath = destination_for(name)
        if newpath is None:
            continue
        # the incremented number may exceed the folders created above
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        print(name + ' -> ' + newpath)
        shutil.move(os.path.join(os.getcwd(), name), newpath)


if __name__ == '__main__':
    main()

Categories

Resources