I have a quite complex problem. I have multiple filenames in a list; the root directory of those files is the same: mother_directory. However, every file has a different subdirectory. Now I have a script which is processing some files and I need to know the exact full path, including the subdirectories, of every file. I know that I could use os.walk, but that will make my function too nested, as inside this function I'm planning to use another function which uses those full paths.
This is the file structure:
mother_directory:
|_child1:
20211011.xml
20211001.xml
|_child2:
20211002.xml
This is my current code:
mother_path = r'c:\data\user1\Desktop\mother_directory'
blue_dates = ['20211011', '20211012', '20211013', '20211001', '20211002']
red_dates = ['20211011', '20211009', '20211008', '20211001', '20211002']
file_names = ['20211011.xml', '20211001.xml', '20211002.xml']
def process_files(x):
    """Print the path built for the first file whose name contains *x*.

    Nothing is printed unless *x* is listed in ``red_dates``.  The file name
    is joined directly onto ``mother_path``, so the printed path does not
    contain the sub-directory the file is stored in.
    """
    if x not in red_dates:
        return
    candidates = [name for name in file_names if x in name]
    print(os.path.join(mother_path, candidates[0]))
for x in blue_dates:
process_files(x)
My current output:
c:\data\user1\Desktop\mother_directory\20211011.xml
c:\data\user1\Desktop\mother_directory\20211001.xml
c:\data\user1\Desktop\mother_directory\20211002.xml
When I run my function I want my desired output to be like this:
c:\data\user1\Desktop\mother_directory\child1\20211011.xml
c:\data\user1\Desktop\mother_directory\child1\20211001.xml
c:\data\user1\Desktop\mother_directory\child2\20211002.xml
I added a condition, I believe it will work now.
def process_files(x):
    """Print the full path of the file that belongs to the date *x*.

    The date is only handled when it is listed in ``red_dates``.  The first
    entry of ``file_names`` containing *x* is then searched for in every
    directory below ``mother_path`` and each location found is printed,
    sub-directory included.

    Nothing is printed when *x* is not a red date or when no known file name
    contains it.
    """
    if x not in red_dates:
        return
    match_file = [s for s in file_names if x in s]
    if not match_file:
        # No known file name contains this date, so there is nothing to find.
        return
    wanted = match_file[0]
    for root, _dirs, files in os.walk(mother_path):
        # Compare whole names: a substring test would also accept a file such
        # as "old_20211011.xml" and print a path that does not exist.
        if wanted in files:
            print(os.path.join(root, wanted))
Related
In this code, I want to print the desired output, like x[0] should print the first path, and x[1] should print the second path. But I don't know how to do it? I used split but it didn't give me the expected result.
Given result
/home/runner/TestP1/folder1/
/home/runner/TestP1/folder2/
/home/runner/TestP1/folder1/sub
Required result
/home/runner/TestP1/folder2/
Code
import os
def filesystem(rootdir):
    """Print the path of every directory found below *rootdir*."""
    for parent, child_dirs, _files in os.walk(rootdir):
        for child in child_dirs:
            print(os.path.join(parent, child))
filesystem("/home/runner/TestP1")
If you want to access each directory as if it were a list, then you need to construct one. Below I have given an example:
import os
# Build one flat list holding the full path of every directory below the
# root, so that individual directories can be picked by index afterwards.
all_directories = []
for root, dirs, files in os.walk(r"/home/runner/TestP1"):
    # "dirs" only holds bare names; join them onto "root" to get usable paths.
    all_directories.extend(os.path.join(root, name) for name in dirs)
print(all_directories[1])
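Be mindful of the order, which follows os.walk's depth-first traversal: the immediate sub-directories of a directory are listed together, but everything below the first sub-directory is listed before anything below the second one. os.walk has no breadth-first mode; passing topdown=False only switches to a bottom-up walk, in which a directory is visited after all of its sub-directories.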
I have the following folder-file structure:
mainfolder_segment_polygon
folder_poly5numSeg
subfolder_compactness40
subfolder_aoi1
file_aoi1_seg0.shp
file_aoi1_seg1.shp
subfolder_aoi2
file_aoi2_seg0.shp
file_aoi2_seg1.shp
folder_poly6numSeg
subfolder_compactness40
subfolder_aoi1
file_aoi1_seg0.shp
file_aoi1_seg1.shp
subfolder_aoi2
file_aoi2_seg0.shp
file_aoi2_seg1.shp
I want to be able to load all the files from the same folder (segment_polygon), apply a function to them, and export to another set of folders (segment_multipoly) with the same structure.
The files from r".\segmentation_aoi\segment_polygon\poly5numSeg\compactness40\aoi1" should be processed together and be exported to r".\segmentation_aoi\segment_multipoly\multi5numSeg\compactness40\aoi1"
The files from r".\segmentation_aoi\segment_polygon\poly6numSeg\compactness40\aoi2" should be processed together and be exported to r".\segmentation_aoi\segment_multipoly\multi6numSeg\compactness40\aoi2"
and so on...
The names "mainfolder", "folder", "subfolder", "file" are there just to indicate to which level the names belong, but they are not part of the folder's labels.
input_path = os.path.join(src, "segment_polygon\\")
output_path = os.path.join(src, "segment_multipoly\\")
root = Path(input_path)
for maindir, subdirs, shpfiles in os.walk(input_path):
for shp in shpfiles:
aoi_root, shp_ext = shp.split("_")
for file in root.glob("*/*/*/*.shp"):
part_path = Path(file).parts
folder_numSeg_name = part_path[9] #here I get the subfolder "poly5numSeg", "poly6numSeg", etc
folder_aoi_name = part_path[11] #here I get the subfolder "aoi1", "aoi2", etc...
aoiprep_seg = part_path[12] # here I get the name of the file "aoi1_seg0.shp", aoi1_seg1.shp", etc
if aoi_root == folder_aoi_name:
'''apply a function to shp'''
shp.to_file(os.path.join(output_path, folder_numSeg_name, "compactness40\\", folder_aoi_name, shp)
I am a bit at a loss.
Working in Windows 10, Python 3. Thank you for all help.
UPDATE OF THE SCRIPT
segment_polygon = os.path.join(output, "segment_polygon\\") # input path
segment_multipoly = os.path.join(output, "segment_multipoly\\") # output path
# 1. get aoi directories
aoi_dir = [path for path in glob.glob(os.path.join(segment_polygon, "*/*/*"))
if os.path.isdir(path)]
# list to store the shapefiles to be intersected
input_list = []
for path in aoi_dir:
# 2. get the files
shp_paths = glob.glob(path + os.sep + '*.shp')
for shp_path in shp_paths:
# 3. do things with shp_path
full_path, seg_shp = os.path.split(shp_path)
aoi_folder = full_path[-5:] # aoi01, aoi02, aoi03....aoi25
if seg_shp.startswith(aoi_folder):
input_list.append(shp_path) # creates the new list with shapefiles that start with the same aoiX value
auto_inter = gpd.GeoDataFrame.from_file(input_list[0]) #process shp
for i in range(len(input_list)-1):
mp = gpd.GeoDataFrame.from_file(input_list[i+1]) # process shp
auto_inter = gpd.overlay(auto_inter, mp, how='intersection') #process shp
print(f"shp included in the list:\n {input_list}")
# 4. create your output file path
print(full_path)
output_path = full_path.replace("poly", "multi")
N_output_path = output_path.replace("gon", "polygon")
print(f"output_path:\n {N_output_path}")
# make sure the directories exist
if not os.path.exists(os.path.dirname(N_output_path)):
os.makedirs(os.path.dirname(N_output_path), exist_ok=True)
# create output file name
multipoly_name = aoi_folder + ".shp"
# export
auto_inter.to_file(os.path.join(N_output_path, multipoly_name)) #export shp
Incorporated changes from ygorg. However, it takes ALL the shapefiles for intersection. I want only aoi1 files for intersection and save on aoi1 folder. Then, aoi2 shapefiles and save on aoi2 folder, and so on. This doesn't work yet.
Mixing os.walk and glob seems to be quite confusing. If you want to process each aoiX folder, try to first list all those directories, then list the .shp files in each directory, then apply the function, and finally create your output_path and write to it.
When working with files it's always good to decompose what you need to not get overwhelmed.
# 1. get aoi directories
aoi_dir = [path for path in glob.glob('segment_polygon/*/*/*')
if os.path.isdir(path)]
for path in aoi_dir:
# 2. get the files
shp_paths = glob.glob(path + os.sep + '*.shp')
for shp_path in shp_paths:
# 3. do things with shp_path
# 4. create your output file path
output_path = shp_path.replace('segment_polygon', 'segment_multipoly')
# make sure the directories exist
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# write in output file
And always do a dry run without processing or writing anything, and printing the paths so you are sure of what goes where !
I managed to solve the problem. Thank you ygorg for the input. It led me to the right path.
# Create a list of the subfolders of segment_polygon
poly_dir = [path for path in glob.glob(os.path.join(segment_polygon, "*/*/*"))
if os.path.isdir(path)]
for aoi_poly in poly_dir:
# define input folder
input_subfolder = aoi_poly.split("segment_polygon\\")[1] # splits the path at "...\\" and keeps the tail (position:1)
#print(f"input folder: {input_subfolder}")
#define export folder
export_subfolder = input_subfolder.replace("poly", "multi")
export_folder = os.path.join(segment_multipoly, export_subfolder)
#print(f"output folder: {export_folder}")
# define name output shapefile
numseg, compactness, aoi = [int(s) for s in re.findall(r'\d+', aoi_poly)] #extract only the integers from the "poly" path
name_output = "aoi" + str(aoi)+ "_" + "numSeg"+ str(numseg) + "_c" + str(compactness) + ".shp" # str() is used to concatenate integers as part of the string
#print(f"shapefile label: {name_output}")
full_outputpath = os.path.join(export_folder, name_output)
#print(f"full output path: {full_outputpath}")
# intersect and merge all single polygons
input_list = list(filter(lambda mpoly: mpoly.endswith('.shp'), os.listdir(aoi_poly)))
###### apply my function here ######
# export
filetoexport.to_file(full_outputpath)
So I have a lot of folders with a certain name. In each folder I have +200 items. The items inside the folders have names like:
CT.34562346.246.dcm
RD.34562346.dcm
RN.34562346.LAO.dcm
And some along that style.
I now wish to rename all files inside all folders so that the number (34562346) is replaced with the name of the folder. So for example in the folder named "1" the files inside should become:
CT.1.246.dcm
RD.1.dcm
RN.1.LAO.dcm
So only the large number is replaced. And yes, all files are similar like this. It would be the number after the first . that should be renamed.
So far I have:
import os
base_dir = "foo/bar/" #In this dir I have all my folders
dir_list = []
for dirname in os.walk(base_dir):
dir_list.append(dirname[0])
This one just lists the entire paths of all folders.
dir_list_split = []
for name in dir_list[1:]: #The 1 is because it lists the base_dir as well
x = name.split('/')[2]
dir_list_split.append(x)
This one extracts the name of each folder.
And then the next thing would be to enter the folders and rename them. And I'm kind of stuck here ?
The pathlib module, which was new in Python 3.4, is often overlooked. I find that it often makes code simpler than it would otherwise be with os.walk.
In this case, .glob('**/*.*') looks recursively through all of the folders and subfolders that I created in a sample folder called example. The *.* part means that it considers all files.
I put path.parts in the loop to show you that pathlib arranges to parse pathnames for you.
I check that the string constant '34562346' is in its correct position in each filename first. If it is, then I simply replace it with the item from .parts that is the next level 'up' the folder tree.
Then I can replace the rightmost element of .parts with the newly altered filename to create the new pathname and then do the rename. In each case I display the new pathname, if it was appropriate to create one.
>>> from pathlib import Path
>>> from os import rename
>>> for path in Path('example').glob('**/*.*'):
... path.parts
... if path.parts[-1][3:11]=='34562346':
... new_name = path.parts[-1].replace('34562346', path.parts[-2])
... new_path = '/'.join(list(path.parts[:-1])+[new_name])
... new_path
... ## rename(str(path), new_path)
... else:
... 'no change'
...
('example', 'folder_1', 'id.34562346.6.a.txt')
'example/folder_1/id.folder_1.6.a.txt'
('example', 'folder_1', 'id.34562346.wax.txt')
'example/folder_1/id.folder_1.wax.txt'
('example', 'folder_2', 'subfolder_1', 'ty.34562346.90.py')
'example/folder_2/subfolder_1/ty.subfolder_1.90.py'
('example', 'folder_2', 'subfolder_1', 'tz.34562346.98.py')
'example/folder_2/subfolder_1/tz.subfolder_1.98.py'
('example', 'folder_2', 'subfolder_2', 'doc.34.34562346.implication.rtf')
'no change'
This will rename files in subdirectories too:
import os
def replace_first_number(file_name, replacement):
    """Return *file_name* with its first run of digits swapped for *replacement*.

    The name is returned unchanged when it does not contain any digit.
    """
    start = None
    for index, char in enumerate(file_name):
        if char.isdigit():
            if start is None:
                start = index
        elif start is not None:
            # First non-digit after the run: the number ends here.
            return file_name[:start] + replacement + file_name[index:]
    if start is None:
        return file_name
    # The run of digits extends to the end of the name.
    return file_name[:start] + replacement


rootdir = "foo" + os.sep + "bar"
for subdir, dirs, files in os.walk(rootdir):
    foldername = os.path.basename(subdir)
    for file in files:
        # Look at the file name only.  Scanning the whole path would pick up
        # digits from a directory name (e.g. the folder "1") before reaching
        # the number inside the file name, and replacing inside the whole
        # path could alter directory components as well.
        newname = replace_first_number(file, foldername)
        if newname != file:
            os.rename(os.path.join(subdir, file), os.path.join(subdir, newname))
Split each file name on the . and replace the second item with the folder name, then join on .'s again for the new file name. Here's some sample code that demonstrates the concept.
folder_name = ['1', '2']
file_names = ['CT.2345.234.dcm', 'BG.234234.222.dcm', "RA.3342.221.dcm"]
for folder in folder_name:
new_names = []
for x in file_names:
file_name = x.split('.')
file_name[1] = folder
back_together = '.'.join(file_name)
new_names.append(back_together)
print(new_names)
Output
['CT.1.234.dcm', 'BG.1.222.dcm', 'RA.1.221.dcm']
['CT.2.234.dcm', 'BG.2.222.dcm', 'RA.2.221.dcm']
I've got my script creating a bunch of files (size varies depending on inputs) and I want to put certain files in certain folders based on the filenames.
So far I've got the following but although directories are being created no files are being moved, I'm not sure if the logic in the final for loop makes any sense.
In the below code I'm trying to move all .png files ending in _01 into the sub_frame_0 folder.
Additionally, is there some way to increment both the file endings (_01 to _02 etc.) and the destination folder, i.e. from sub_frame_0 to sub_frame_1 to sub_frame_2 and so on?
for index, i in enumerate(range(num_sub_frames+10)):
path = os.makedirs('./sub_frame_{}'.format(index))
# Slice layers into sub-frames and add to appropriate directory
list_of_files = glob.glob('*.tif')
for fname in list_of_files:
image_slicer.slice(fname, num_sub_frames) # Slices the .tif frames into .png sub-frames
list_of_sub_frames = glob.glob('*.png')
for i in list_of_sub_frames:
if i == '*_01.png':
shutil.move(os.path.join(os.getcwd(), '*_01.png'), './sub_frame_0/')
As you said, the logic of the final loop does not make sense.
if i == '*_01.png'
It would evaluate something like 'image_01.png' == '*_01.png' and be always false.
Regexp should be the way to go, but for this simple case you just can slice the number from the file name.
for i in list_of_sub_frames:
frame = int(i[-6:-4]) - 1
shutil.move(os.path.join(os.getcwd(), i), './sub_frame_{}/'.format(frame))
If i = 'image_01.png' then i[-6:-4] would take '01', convert it to integer and then just subtract 1 to follow your schema.
A simple fix would be to check whether the file name i matches the '*_01.png' pattern, i.e. ends with '_01.png', and change the shutil.move to include i, the filename. (It's also worth mentioning that i is not a good name for a file path.)
# Move every sub-frame whose name ends in "_01.png" into ./sub_frame_0/.
list_of_sub_frames = glob.glob('*.png')
for i in list_of_sub_frames:
    # Wildcards are only expanded by glob; inside a plain string test the "*"
    # is a literal character, so check the file-name suffix instead.
    if i.endswith('_01.png'):
        shutil.move(os.path.join(os.getcwd(), i), './sub_frame_0/')
Additionally is [there some way] to increment both the file endings _01 to _02 etc., and the destn folder ie. from sub_frame_0 to sub_frame_1 to sub_frame_2 and so on.
You could create file names doing something as simple as this:
for i in range(10):
#simple string parsing
file_name = 'sub_frame_'+str(i)
folder_name = 'folder_sub_frame_'+str(i)
Here is a complete example using regular expressions. This also implements the incrementing of file names/destination folders
import os
import glob
import shutil
import re
num_sub_frames = 3

# Create the destination folders.  Folders left over from an earlier run are
# skipped so the script can be started again without failing.
for index in range(num_sub_frames + 10):
    sub_frame_dir = './sub_frame_{0:02}'.format(index)
    if not os.path.isdir(sub_frame_dir):
        os.makedirs(sub_frame_dir)

# Slice layers into sub-frames and add to appropriate directory
# NOTE(review): image_slicer is not among the imports listed with this
# script; it has to be imported for the call below to work.
list_of_files = glob.glob('*.tif')
for fname in list_of_files:
    image_slicer.slice(fname, num_sub_frames)  # Slices the .tif frames into .png sub-frames

# "<fname>_<num>.png": raw string so that \d reaches the regex engine intact;
# the dot is escaped and the pattern anchored so only the real extension fits.
sub_frame_pattern = re.compile(r'(?P<fname>.+?)_(?P<num>\d+)\.png$')

list_of_sub_frames = glob.glob('*.png')
for name in list_of_sub_frames:
    m = sub_frame_pattern.match(name)
    if m:
        # Both the number in the file name and the destination folder are
        # shifted up by one.
        num = int(m.group('num')) + 1
        newname = '{0}_{1:02}.png'.format(m.group('fname'), num)
        newpath = os.path.join('./sub_frame_{0:02}/'.format(num), newname)
        # print() with a single argument behaves the same on Python 2 and 3.
        print(name + ' -> ' + newpath)
        shutil.move(os.path.join(os.getcwd(), name), newpath)
I have 2 folders, each with the same number of files. I want to rename the files in folder 2 based on the names of the files in folder 1. So in folder 1 there might be three files titled:
Landsat_1,
Landsat_2,
Landsat_3
and in folder 2 these files are called:
1,
2,
3
and I want to rename them based on folder 1 names. I thought about turning the item names of each folder into a .txt file and then turning the .txt file into a list and then renaming, but I'm not sure if this is the best way to do it. Any suggestions?
Edit:
I have simplified the file names above, so just prepending Landsat_ will not work for me.
The real file names in folder 1 are more like LT503002011_band1, LT5040300201_band1, LT50402312_band4. In folder 2 they are extract1, extract2, extract3. There are 500 files in total and in folder 2 it is just a running count of extract and a number for each file.
As someone said, "sort each list and zip them together in order to rename".
Notes:
the key() function extracts all of the numbers so that sorted() can sort the lists numerically based on the embedded numbers.
we sort both lists: os.listdir() returns files in arbitrary order.
The for loop is a common way to use zip: for itemA, itemB in zip(listA, listB):
os.path.join() provides portability: no worries about / or \
A typical invocation on Windows: python doit.py c:\data\lt c:\data\extract, assuming those are directories you have described.
A typical invocation on *nix: python doit.py ./lt ./extract
import sys
import re
import os
# Validate the command line explicitly: an assert statement is removed by
# "python -O", which would leave only an obscure unpacking error below.
if len(sys.argv) != 3:
    sys.exit("Usage: %s LT-dir extract-dir" % sys.argv[0])
_, ltdir, exdir = sys.argv
def key(x):
    """Return every run of digits found in *x* as a list of integers.

    Intended as a sort key, so that names are ordered numerically by the
    numbers embedded in them instead of alphabetically.  A name without any
    digit yields an empty list.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    return [int(y) for y in re.findall(r'\d+', x)]
ltfiles = sorted(os.listdir(ltdir), key=key)
exfiles = sorted(os.listdir(exdir), key=key)
for exfile,ltfile in zip(exfiles, ltfiles):
os.rename(os.path.join(exdir,exfile), os.path.join(exdir,ltfile))
You might want to use the glob package which takes a filename pattern and outputs it into a list. For example, in that directory
glob.glob('*')
gives you
['Landsat_1', 'Landsat_2', 'Landsat_3']
Then you can loop over the filenames in the list and change the filenames accordingly:
import glob
import os
folderlist = glob.glob('*')
for folder in folderlist:
filelist = glob.glob(folder + '*')
for fil in filelist:
os.rename(fil, folder + fil)
Hope this helps
I went for more completeness :D.
# WARNING: BACKUP your data before running this code. I've checked to
# see that it mostly works, but I would want to test this very well
# against my actual data before I trusted it with that data! Especially
# if you're going to be modifying anything in the directories while this
# is running. Also, make sure you understand what this code is expecting
# to find in each directory.
import os
import re
main_dir_demo = 'main_dir_path'
extract_dir_demo = 'extract_dir_path'
def generate_paths(directory, filenames, target_names):
    """Yield ``(current_path, new_path)`` pairs for files in *directory*.

    *filenames* and *target_names* are walked in lockstep; both members of a
    pair are located inside *directory*.  Pairing stops as soon as the
    shorter of the two sequences is exhausted.
    """
    join = os.path.join
    for current, wanted in zip(filenames, target_names):
        source = join(directory, current)
        destination = join(directory, wanted)
        yield source, destination
def sync_filenames(main_dir, main_regex, other_dir, other_regex, key=None):
    """Rename the files in *other_dir* after the files in *main_dir*.

    Only entries whose names match *main_regex* (in *main_dir*) and
    *other_regex* (in *other_dir*) take part.  Both selections are sorted
    with *key* and paired in that order; every selected file in *other_dir*
    is then renamed, inside *other_dir*, to the name of its partner.

    Raises:
        AssertionError: if the two directories do not contain the same
            number of matching files.  Nothing is renamed in that case.
    """
    main_files = [f for f in os.listdir(main_dir) if main_regex.match(f)]
    other_files = [f for f in os.listdir(other_dir) if other_regex.match(f)]

    # Do not proceed if there aren't the same number of things in each
    # directory; better safe than sorry.  The error is raised explicitly
    # (rather than with an assert statement) so that the guard stays active
    # when Python runs with -O.
    if len(main_files) != len(other_files):
        raise AssertionError(
            "%d matching file(s) in %r but %d in %r"
            % (len(main_files), main_dir, len(other_files), other_dir))

    main_files.sort(key=key)
    other_files.sort(key=key)

    path_pairs = generate_paths(other_dir, other_files, main_files)
    for other_path, target_path in path_pairs:
        os.rename(other_path, target_path)
def demo_key(item):
    """Sort by the numbers in a string ONLY; not the letters.

    Returns the runs of digits found in *item*, in order, as a list of
    integers (an empty list when *item* contains no digit).
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    return [int(y) for y in re.findall(r'\d+', item)]
def main(main_dir, extract_dir, key=None):
    """Rename the ``extract<N>`` files after the ``LT<digits>_band<digit>`` files.

    Builds the two file-name patterns and hands them, together with both
    directories and the optional sort *key*, to ``sync_filenames``.
    """
    # Raw strings keep the "\d" escapes intact for the regex engine.
    main_regex = re.compile(r'LT\d+_band\d')
    other_regex = re.compile(r'extract\d+')
    sync_filenames(main_dir, main_regex, extract_dir, other_regex, key=key)
if __name__ == '__main__':
main(main_dir_demo, extract_dir_demo, key=demo_key)