How to Iterate over several directory levels and move files based on condition - python

I would like some help to loop through some directories and subdirectories and extracting data. I have a directory with three levels, with the third level containing several .csv.gz files. The structure is like this
I need to access level 2 (where subfolders are) of each folder and check the existence of a specific folder (in my example, this will be subfolder 3; I left the other folders empty for this example, but in real cases they will have data). If checking returns True, then I want to change the name of files within the target subfolder3 and transfer all files to another folder.
Bellow is my code. It is quite cumbersome and there is probably better ways of doing it. I tried using os.walk() and this is the closest I got to a solution but it won't move the files.
import os
import shutil
def organizer(parent_dir, target_dir, destination_dir):
for root, dirs, files in os.walk(parent_dir):
if root.endswith(target_dir):
target = root
for files in os.listdir(target):
if not files.startswith("."):
# this is to change the name of the file
fullname = files.split(".")
just_name = fullname[0]
csv_extension = fullname[1]
gz_extension = fullname[2]
subject_id = target
#make a new name
origin = subject_id + "/"+ just_name + "." + csv_extension + "." + gz_extension
#make a path based on this new name
new_name = os.path.join(destination_dir, origin)
#move file from origin folder to destination folder and rename the file
shutil.move(origin, new_name)
Any suggestions on how to make this work and / or more eficient?

simply enough, you can use the built-in os module, with os.walk(path) returns you root directories and files found
import os
for root, _, files in os.walk(path):
#your code here
for your problem, do this
import os
for root, dirs, files in os.walk(parent_directory);
for file in files:
#exctract the data from the "file"
check this for more information os.walk()
and if you want to get the name of the file, you can use os.path.basename(path)
you can even check for just the gzipped csv files you're looking for using built-in fnmatch module
import fnmathch, os
def find_csv_files(path):
result = []
for root, _, files in os.walk(path):
for name in files:
if fnmatch.fnmatch(name, "*.csv.gz"): # find csv.gz using regex paterns
result.append(os.path.join(root, name))
return list(set(results)) #to get the unique paths if for some reason duplicated

Ok, guys, I was finally able to find a solution. Here it is. Not the cleanest one, but it works in my case. Thanks for the help.
def organizer(parent_dir, target_dir, destination_dir):
for root, dirs, files in os.walk(parent_dir):
if root.endswith(target_dir):
target = root
for files in os.listdir(target):
#this one because I have several .DS store files in the folder which I don't want to extract
if not files.startswith("."):
fullname = files.split(".")
just_name = fullname[0]
csv_extension = fullname[1]
gz_extension = fullname[2]
origin = target + "/" + files
full_folder_name = origin.split("/")
#make a new name
new_name = full_folder_name[5] + "_"+ just_name + "." + csv_extension + "." + gz_extension
#make a path based on this new name
new_path = os.path.join(destination_dir, new_name)
#move file from origin folder to destination folder and rename the file
shutil.move(origin, new_path)
The guess the problem was that was passing a variable that was a renamed file (in my example, I wrongly called this variable origin) as the origin path to shutil.move(). Since this path does not exist, then the files weren't moved.

Related

python how to collect a specific file from a list of folders and save

I have many folders in a master folder as given below. Each folder contains a .JPG file. I would like to extract all these files and store them in this master folder.
Inside each folder
My present code:
import os
import glob
os.chdir('Master folder')
extension = 'JPG'
jpg_files= [i for i in glob.glob('*.{}'.format(extension))]
This did not work.
To find the images in your tree, I would use os.walk. Below you can find a complete example to 'find and move' function that move all the files to your given path, and create a new filename for duplicate filenames.
The simple 'find and replace' function will also check with function add_index_to_filepath whether or not the file already exists, add an index (n) to the path. For example: if image.jpg would exists, it turns the next one into turn into image (1).jpg and the following one into image (2).jpg and so on.
import os
import re
import shutil
def add_index_to_filepath(path):
'''
Check if a file exists, and append '(n)' if true.
'''
# If the past exists, go adjust it
if os.path.exists(path):
# pull apart your path and filenames
folder, file = os.path.split(path)
filename, extension = os.path.splitext(file)
# discover the current index, and correct filename
try:
regex = re.compile(r'\(([0-9]*)\)$')
findex = regex.findall(filename)[0]
filename = regex.sub('({})'.format(int(findex) + 1), filename)
except IndexError:
filename = filename + ' (1)'
# Glue your path back together.
new_path = os.path.join(folder, '{}{}'.format(filename, extension))
# Recursivly call your function, go keep verifying if it exists.
return add_index_to_filepath(new_path)
return path
def find_and_move_files(path, extension_list):
'''
Walk through a given path and move the files from the sub-dir to the path.
Upper-and lower-case are ignored. Duplicates get a new filename.
'''
files_moved = []
# First walk through the path, to list all files.
for root, dirs, files in os.walk(path, topdown=False):
for file in files:
# Is your extension wanted?
extension = os.path.splitext(file)[-1].lower()
if extension in extension_list:
# Perpare your old an new path, and move
old_path = os.path.join(root, file)
new_path = add_index_to_filepath(os.path.join(path, file))
if new_path in files_moved:
shutil.move(old_path, new_path)
# Lets keep track of what we moved to return it in the end
files_moved.append(new_path)
return files_moved
path = '.' # your filepath for the master-folder
extensions = ['.jpg', '.jpeg'] # There are some variations of a jpeg-file extension.
found_files = find_and_move_files(path, extensions)

Iterate over files over several directories to extract data

I have a series of files that are nested as shown in the attached image. For each "inner" folder (e.g. like the 001717528 one), I want to extract a row of data from each the FITS files and create a CSV file that contains all the rows, and name that CSV file after the name of the "inner" folder (e.g. 001717528.csv that has data from the 18 fits files). The data-extracting part is easy but I have trouble coding the iteration.
I don't really know how to iterate over both the outer folders such as the 0017 and inner folders, and name the csv files as I want.
My code is looking like this:
for subdir, dirs, files in os.walk('../kepler'):
for file in files:
filepath = subdir + os.sep + file
if filepath.endswith(".fits"):
extract data
write to csv file
Apparently this will iterate over all files in the kepler folder so it doesn't work.
If you need to keep track of how far you've walked into the directory structure, you can count the file path delimiter (os.sep). In your case it's / because you're on a Mac.
for path, dirs, _ in os.walk("../kepler"):
if path.count(os.sep) == 2:
# path should be ../kepler/0017
for dir in dirs:
filename = dir + ".csv"
data_files = os.listdir(path + os.sep + dir)
for file in data_files:
if file.endswith(".fits"):
# Extract data
# Write to CSV file
As far as I can tell this meets your requirements, but let me know if I've missed something.
Try this code it should print the file path of all your ".fits" files:
# !/usr/bin/python
import os
base_dir = './test'
for root, dirs, files in os.walk(base_dir, topdown=False):
for name in files:
if name.endswith(".fits"):
file_path = os.path.join(root, name) #path of files
print(file_path)
# do your treatment on file_path
All you have to do is add your specific treatment.

How to get the files with the biggest size in the folders, change their name and save to a different folder

I need to get files with the biggest size in different folders, change their name to folder name that they belong to and save to a new folder. I have something like this and I got stuck:
import os
# Core settings
rootdir = 'C:\\Users\\X\\Desktop\\humps'
to_save = 'C:\\Users\\X\\Desktop\\new'
for root, dirs, files in os.walk(rootdir):
new_list = []
for file in files:
if file.endswith(".jpg"):
try:
print(file)
os.chdir(to_save)
add_id = root.split("humps\\")[1]
add_id = add_id.split("\\")[0]
file_name = os.path.join(root,file)
new_list.append(file_name)
bigfile = max(new_list, key=lambda x: x.stat().st_size)
except:
pass
To make it more clear: Let's say the name of the sub-folder is "elephant" and there are different elephant photos and subfolders in in this elephant folder. I want to go through those photos and subfolders and find the elephant foto with the biggest size, name it as elephant and save it to my target folder. Also repaet it for other sub folders such as lion, puma etc.
How I could achieve what I want ?
To find biggest file and save to another location
import os
import shutil
f_list = []
root = "path/to/directory"
root = os.path.abspath(root)
for folder, subfolders, files in os.walk(root):
for file in files:
filePath = os.path.join(folder, file)
f_list.append(filePath)
bigest_file = max(f_list,key=os.path.getsize)
new_path = "path/where/you/want/to/save"
shutil.copy(biggest_file,new_path)
if you want only images then add one more condition in loop
for folder, subfolders, files in os.walk(root):
for file in files:
if file.endswith(".jpg"):
filePath = os.path.join(folder, file)
f_list.append(filePath)
To get all folders biggest file
root = "demo"
root = os.path.abspath(root)
def test(path):
big_files = []
all_paths = [x[0] for x in os.walk(path)]
for paths in all_paths:
f_list = filter(os.path.isfile, os.listdir(paths))
if len(f_list) > 0:
big_files.append((paths,max(f_list,key=os.path.getsize)))
return big_files
print test(root)
How to get the files with the biggest size in the folders, change their name and save to a different folder
Basically you already have a good description of what you need to do. You just need to follow it step by step:
get all files in some search directory
filter for relevant files ("*.jpg")
get their sizes
find the maximum
copy to new directory with name of search directory
IMO it's an important skill to be able to break down a task into smaller tasks. Then, you just need to implement the smaller tasks and combine:
def iterate_files_recursively(directory="."):
for entry in os.scandir(directory):
if entry.is_dir():
for file in iterate_files_recursively(entry.path):
yield file
else:
yield entry
files = iterate_files_recursively(subfolder_name)
I'd use os.scandir because it avoids building up a (potentially) huge list of files in memory and instead allows me (via a generator) to work one file at a time. Note that starting with 3.6 you can use the result of os.scandir as a context manager (with syntax).
images = itertools.filterfalse(lambda f: not f.path.endswith('.jpg'), files)
Filtering is relatively straightforward except for the IMO strange choice of ìtertools.filterfalse to only keep elements for which its predicate returns False.
biggest = max(images, key=(lambda img: img.stat().st_size))
This is two steps in one: Get the maximum with the builtin max function, and use the file size as "key" to establish an order. Note that this raises a ValueError if you don't have any images ... so you might want to supply default=None or handle that exception.
shutil.copy(biggest.path, os.path.join(target_directory, subfolder_name + '.jpg')
shutil.copy copies the file and some metadata. Instead of hardcoding path separators, please use os.path.join!
Now all of this assumes that you know the subfolder_name. You can scan for those easily, too:
def iterate_directories(directory='.'):
for entry in os.scandir(directory):
if entry.is_dir():
yield entry
Here's some code that does what you want. Instead of using the old os.walk function, it uses modern pathlib functions.
The heart of this code is the recursive biggest function. It scans all the files and directories in folder, saving the matching file names to the files list, and recursively searching any directories it finds. It then returns the path of the largest file that it finds, or None if no matching files are found.
from pathlib import Path
import shutil
def filesize(path):
return path.stat().st_size
def biggest(folder, pattern):
''' Find the biggest file in folder that matches pattern
Search recursively in all subdirectories
'''
files = []
for f in folder.iterdir():
if f.is_file():
if f.match(pattern):
files.append(f)
elif f.is_dir():
found = biggest(f, pattern)
if found:
files.append(found)
if files:
return max(files, key=filesize)
def copy_biggest(src, dest, pattern):
''' Find the biggest file in each folder in src that matches pattern
and copy it to dest, using the folder's name as the new file name
'''
for path in src.iterdir():
if path.is_dir():
found = biggest(path, pattern)
if found:
newname = dest / path
print(path, ':', found, '->', newname)
shutil.copyfile(found, newname)
You can call it like this:
rootdir = r'C:\Users\X\Desktop\humps'
to_save = r'C:\Users\X\Desktop\new'
copy_biggest(Path(rootdir), Path(to_save), '*.jpg')
Note that the copied files will have the same name as the top-level folder in rootdir that they were found in, with no file extension. If you want to give them a .jpg extension, you can change
newname = dest / path
to
newname = (dest / path).with_suffix('.jpg')
The shutil module on older versions of Python 3 doesn't understand pathlib paths. But that's easy enough to remedy. In the copy_biggest function, replace
shutil.copyfile(found, newname)
with
shutil.copyfile(str(found), str(newname))

How to rename all files and subfolders within a directory using Python

I am doing a small project using a book about Python. One of the examples they show is how to rename all of the files and directories right underneath the root folder, however I want to go a step further and change files within sub directories.
I figured using os.walk would be my best bet but I can only get it to change file names not subdirectories. I would like to also change subdirectory names as I go through os.walk.
I am using a book and online resources to accomplish this, any help would be awesome.
Here is what I have right now based on the book. I have tried to include os.listdir but the regex won't work like that.
import shutil, os, re
#regex that matches files with dates
wrongFormat = re.compile(r"""^(.*?)
((0|1)?\d).
((0|1|2|3)?\d).
((19|20)\d\d)
(.*?)$
""", re.VERBOSE)
#Loop over the files in the working directory
path = '.'
for path, dirs, files in os.walk(path, topdown=True):
for incorrectDt in files:
dt = wrongFormat.search(incorrectDt)
#Skip file without a date
if dt == None:
continue
#Split filename into parts
beforePart = dt.group(1)
monthPart = dt.group(2)
dayPart = dt.group(4)
yearPart = dt.group(6)
afterPart = dt.group(8)
#Form the new date format
correctFormat = beforePart + monthPart + "_" + dayPart + "_" + yearPart + afterPart
#Get full, absolute file paths.
absWorkingDir = os.path.abspath(path)
incorrectDt= os.path.join(absWorkingDir, incorrectDt)
correctFormat = os.path.join(absWorkingDir, correctFormat)
#rename files
print ('Renaming "%s" to "%s"...' % (wrongFormat, correctFormat))
shutil.move(incorrectDt, correctFormat)

How to move files based on their names to specific directories in python?

I have a directory called /user/local/ inside which i have several files of the form, jenjar.dat_1 and jenmis.dat_1. There is another directory /user/data inside which there are two subdirectories of the form, jenjar and jenmis. I need a Python code that would move the jenjar.dat_1 into the jenjar directory of /user/data and similarly, jenmis.dat_1 into jenmis directory of '/user/data.
I guess the os module would work for thus but I'm confused. Most of the questions here do not show a Pythonic way to do this.
EDIT: I have found the solution to this
destination = '/user/local'
target = '/user/data'
destination_list = os.listdir(destination)
data_dir_list = os.listdir(target)
for fileName in destination_list:
if not os.path.isdir(os.path.join(destination, fileName)):
for prefix in data_dir_list:
if fileName.startswith(prefix):
shutil.copy(os.path.join(destination, fileName), os.path.join(target, prefix, fileName))
This should do the trick
srcDir = '/user/local'
targetDir = '/user/data'
for fname in os.listdir(srcDir):
if not os.path.isdir(os.path.join(srcDir, fname)):
for prefix in ['jenjar.dat', 'jenmis.dat']:
if fname.startswith(prefix):
if not os.path.isdir(os.path.join(targetDir, prefix)):
os.mkdir(os.path.join(targetDir, prefix))
shutil.move(os.path.join(srcDir, fnmae), targetDir)

Categories

Resources