Organizing data by filetype

Organizing data by filetype - python

I am trying to sort a large number of files based off of their file extension. A lot of the files are .doc, .docx, .xls, etc.
This is what I was thinking in my head, but if there is a simpler way to do things, let me know! I do have multiple files with the same extension, so I don't want it to create a new folder for that extension every time and overwrite the previous file. I also have a much larger list, but for this example I don't believe all of them are needed. The OS is MacOS.
import os, shutil
extList = ['.doc', '.docx', '.xls']
for ext in extList:
os.mkdir(path + '/' + ext +'_folder')
for file in os.listdir(filepath):
if file.endswith(ext): #missing an indent
print(file)
shutil.copyfile(file + '/' + ext +'_folder' + file)
Also, if I run into a file that I do not have on my list, I would like it to go into a folder named 'noextlist'.

Here is what I was able to create quickly
import os, re, shutil
DocFolder = r'...'#Your doc folder path
DocxFolder = r'...'#Your docx folder path
XlsFolder = r'...'#Your xls folder path
MiscFolder = r'...'#Your misc folder path
for root, dirs, files in os.walk(r'...'): #Your folder path you want to sort
for file in files:
if file.endswith(".doc"):
sourceFolder = os.path.join(root,file)
print sourceFolder
shutil.copy2(sourceFolder,DocFolder)
elif file.endswith(".docx"):
sourceFolder = os.path.join(root,file)
print sourceFolder
shutil.copy2(sourceFolder,DocxFolder)
elif file.endswith(".xls"):
sourceFolder = os.path.join(root,file)
print sourceFolder
shutil.copy2(sourceFolder,XlsFolder)
else:
sourceFolder = os.path.join(root,file)
print sourceFolder
shutil.copy2(sourceFolder,MiscFolder)
Edit:The main function here is the for root,dirs,files in os.walk This allows the program to transverse through the provided path to search all files including the ones in the sub folder and sort it out accordingly.

import errno
import shutil
from os import listdir, mkdir
from os.path import splitext, join
# set for fast lookup
extList = set(['.doc', '.docx', '.xls'])
# source path
filepath = ...
# dest path
path = ...
for f in listdir(filepath):
# extract extension from file name
ext = splitext(f)[1]
if ext in extList:
dir_ = join(path, "{}_folder".format(ext))
try:
mkdir(dir_)
except OSError as e:
if ex.errno != errno.EEXIST:
raise # raise if any other error than "already exists"
dest = join(dir_, f)
else:
dest = join(path, "noextlist_folder", f)
shutil.copy2(join(filepath, f), dest)

If I understand correctly, you like your solution but you need a way to rename files with duplicate names so that the extras don't disappear. You can check if the destination file already exists and construct a variant name by adding _1, _2, etc. to the filename until you find something unused.
newpathname = path + '/' + ext +'_folder' + "/" + file
n = 0
while os.path.exists(newpathname):
n += 1
base, ext = os.path.splitext(newpathname)
newpathname = "%s_%d%s" % (base, n, ext)
shutil.copyfile(filepath+"/"+file, newpathname)
But your code has some other glitches, so here's a rewritten scanner. It uses os.walk() to descend into several levels of subdirectories (you don't say if that's needed or not), and it collects files of all extensions in one pass. And it constructs variant names as before.
import os, shutil
extList = ['.doc', '.docx', '.xls']
from os.path import join as joinpath
# Make sure the destination directories exist
for ext in extList:
extdir = joinpath(path, ext[1:]+"_folder")
if not os.path.exists(extdir):
os.mkdir(extdir)
for dirname, _dirs, files in os.walk(filepath):
for file in files:
base, ext = os.path.splitext(file)
if ext not in extList:
continue
destpath = joinpath(path, ext[1:]+"_folder")
n = 0
newpathname = joinpath(destpath, file)
# If the new name is in use, find an unused variant
while os.path.exists(newpathname):
n += 1
newfile = "%s_%d%s" % (base, n, ext)
newpathname = joinpath(path, newfile)
sh.copy(joinpath(dirname, file), newpathname) # or other copy method

Related

python file name change

I am trying to change file names like below:
the 000000 are the same number.
000000_ABC.png --->000000+1_ABC.png
000000_DEF.png --->000000+2_DEF.png
000000_GHI.png --->000000+3_GHI.png
000000_JKL.png --->000000+4_JKL.png
In order to do so, I wrote code like below.
img_files = os.listdir(PATH_TO_PNG_FILES)
for img_file, i in zip(img_files, range(len(img_files))):
new_img_file = img_file.replace("_", "+"+str(i)+"_")
os.rename(path + img_file, path + new_img_file)
There are more than just four files and more of similar lines.
The problem is that immediately after running pycharm, it successfully produces the desired results, but after I run another page related to the result directories, the results continue to be changed like below even after the process finished. I do not understand why.
000000+1+1_ABC.png
000000+2+2_DEF.png
000000+3+3_GHI.png
000000+4+4_JKL.png
or
otherwise "+unexpected number"

This is because the other directory may already contain file in the format of "000000+1_ABC.png" and your script is changing _ to "+1_" resulting in "000000+1+1_ABC.png". To solve this you can add a if statement to check it should not contain "+" symbol.
img_files = os.listdir(path inside of which the png files are saved)
for img_file, i in zip(img_files, range(len(img_files))):
if not ("+" in img_file):
new_img_file = img_file.replace("_", "+"+str(i)+"_")
os.rename(path + img_file, path + new_img_file)

A simple and naive way would be to add a verification to check whether there is a '+' in the filename. If you have other files which may contain a +, you may have to check for a stricter pattern.

I made a YouTube video https://youtu.be/K9jhAPZLZLc on how to rename multiple files like the one you have assuming all your files are in the same directory.
To answer your question. assuming all image files are in the same folder.
path = 'C:\\Users\\USER\\Desktop\\rename_images\\images\\' # path to your images
files = os.listdir(path)
for count, filename in enumerate(files):
# Get the file extension
file, file_extension = os.path.splitext(filename)
# check if the current file is a folder or not
full_path = f'{path}{filename}'
if os.path.isdir(full_path):
print('This is a directory')
elif os.path.isfile(full_path):
print('This is a normal file')
# Rename
if not '+' in file:
try:
file_split = file.split('_')
zeros = file_split[0]
alpha = file_split[-1]
current_file_name = os.path.join(path, filename)
new_file_name = os.path.join(path, ''.join([f'{zeros}+{count}_{alpha}', file_extension]))
os.rename(current_file_name, new_file_name)
except:
pass
else:
pass
else:
print('This is a special file')

I would imagine that the problem comes from modifying the name insted of overwriting.
import os
DIR_PATH = 'files'
def rename_files(dir_name):
img_files = os.listdir(dir_name)
for i in range(len(img_files)):
file_name = img_files[i].split('_')[-1]
file_name = '000000+{0}_{1}'.format(i, file_name)
os.rename(
os.path.join(dir_name, img_files[i]),
os.path.join(dir_name, file_name)
)
if __name__ == '__main__':
rename_files(DIR_PATH)

Shutil - Move and Rename

In my python script I am moving a bunch of files from different sub directories to one location the trouble with this is that there are multiple files the same.
The code I am currently running is:
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.log'):
shutil.move(os.path.join(root, file), FILE_LOCATION_PATH)
The trouble with this is it will throw the error eventually of:
File "downloader.py", line 78, in <module>
shutil.move(os.path.join(root, file), FILE_LOCATION_PATH)
File "/usr/lib/python3.6/shutil.py", line 548, in move
raise Error("Destination path '%s' already exists" % real_dst)
shutil.Error: Destination path '/home/a.log' already exists
I can negate this by changing my move line to copy such as the below
shutil.move(os.path.join(root, file), FILE_LOCATION_PATH)
This will however replace any files with this name with the latest copy. I am trying to figure out a way I can rename any files with the same name to follow a naming convention like this
a.log
a_1.log
a_2.log
Any suggestions on how best to approach this or sample code. I am new to Python and trying to complete my first practical script.

If you are fine with filename like filename.log, filename.log_1 ... then code is just 4 lines more to your original code:
import shutil
import os
FILE_LOCATION_PATH='/destination/directory'
dir_path='/source/directory'
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.log'):
count = 1
destination_file = os.path.join(FILE_LOCATION_PATH, file)
while os.path.exists(destination_file):
destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
count += 1
shutil.move(os.path.join(root, file), destination_file)

Probably this is what you need
import re
import os
import shutil
FILE_LOCATION_PATH = 'dst'
def increment_file_name(f_name):
"""
a.txt -> a_1.txt
a_1.txt -> a_2.txt
"""
def split_f_name(f_name):
m = re.match(r"^(.+?)(_(\d+))?(\.(.+))??$", f_name)
return m.group(1), int(m.group(3) or 0), m.group(5)
def join_f_name(name, suffix, ext):
if not ext:
return "{}_{}".format(name, suffix)
return "{}_{}.{}".format(name, suffix, ext)
dirname = os.path.dirname(f_name)
name, suffix, ext = split_f_name(os.path.basename(f_name))
if dirname:
return os.path.join(dirname, join_f_name(name, suffix + 1, ext))
else:
return join_f_name(name, suffix + 1, ext)
def safe_move(path):
dir, f_name = os.path.split(path)
while os.path.exists(os.path.join(FILE_LOCATION_PATH, f_name)):
f_name = increment_file_name(f_name)
shutil.move(path, os.path.join(FILE_LOCATION_PATH, f_name))
for root, dirs, files in os.walk('src'):
for file in files:
if file.endswith('.log'):
safe_move(os.path.join(root, file))

Change a filename?

I have some files in a folder named like this test_1999.0000_seconds.vtk. What I would like to do is to is to change the name of the file to test_1999.0000.vtk.

You can use os.rename
os.rename("test_1999.0000_seconds.vtk", "test_1999.0000.vtk")

import os
currentPath = os.getcwd() # get the current working directory
unWantedString = "_seconds"
matchingFiles =[]
for path, subdirs, files in os.walk(currentPath):
for f in files:
if f.endswith(".vtk"): # To group the vtk files
matchingFiles.append(path+"\\"+ f) #
print matchingFiles
for mf in matchingFiles:
if unWantedString in mf:
oldName = mf
newName = mf.replace(unWantedString, '') # remove the substring from the string
os.rename(oldName, newName) # rename the old files with new name without the string

Move file to a folder or make a renamed copy if it exists in the destination folder

I have a piece of code i wrote for school:
import os
source = "/home/pi/lab"
dest = os.environ["HOME"]
for file in os.listdir(source):
if file.endswith(".c")
shutil.move(file,dest+"/c")
elif file.endswith(".cpp")
shutil.move(file,dest+"/cpp")
elif file.endswith(".sh")
shutil.move(file,dest+"/sh")
what this code is doing is looking for files in a source directory and then if a certain extension is found the file is moved to that directory. This part works. If the file already exists in the destination folder of the same name add 1 at end of the file name, and before the extension and if they are multiples copies do "1++".
Like this: test1.c,test2.c, test3.c
I tried using os.isfile(filename) but this only looks at the source directory. and I get a true or false.

To test if the file exists in the destination folder you should os.path.join the dest folder with the file name
import os
import shutil
source = "/home/pi/lab"
dest = os.environ["HOME"]
# Avoid using the reserved word 'file' for a variable - renamed it to 'filename' instead
for filename in os.listdir(source):
# os.path.splitext does exactly what its name suggests - split the name and extension of the file including the '.'
name, extension = os.path.splitext(filename)
if extension == ".c":
dest_filename = os.path.join(dest, filename)
if not os.path.isfile(dest_filename):
# We copy the file as is
shutil.copy(os.path.join(source, filename) , dest)
else:
# We rename the file with a number in the name incrementing the number until we find one that is not used.
# This should be moved to a separate function to avoid code duplication when handling the different file extensions
i = 0
dest_filename = os.path.join(dest, "%s%d%s" % (name, i, extension))
while os.path.isfile(dest_filename):
i += 1
dest_filename = os.path.join(dest, "%s%d%s" % (name, i, extension))
shutil.copy(os.path.join(source, filename), dest_filename)
elif extension == ".cpp"
...
# Handle other extensions
If you want to have put the renaming logic in a separate function using glob and re this is one way:
import glob
import re
...
def rename_file(source_filename, source_ext):
filename_pattern = os.path.join(dest, "%s[0-9]*%s"
% (source_filename, source_ext))
# Contains file such as 'a1.c', 'a2.c', etc...
existing_files = glob.glob(filename_pattern)
regex = re.compile("%s([0-9]*)%s" % (source_filename, source_ext))
# Retrieve the max of the index used for this file using regex
max_index = max([int(match.group(1))
for match in map(regex.search, existing_files)
if match])
source_full_path = os.path.join(source, "%s%s"
% (source_filename, source_ext))
# Rebuild the destination filename with the max index + 1
dest_full_path = os.path.join(dest, "%s%d%s"
% (source_filename,
(max_index + 1),
source_ext))
shutil.copy(source_full_path, dest_full_path)
...
# If the file already exists i.e. replace the while loop in the else statement
rename_file(name, extension)

I din't test the code. But something like this should do the job:-
i = 0
filename = "a.txt"
while True:
if os.isfile(filename):
i+= 1
break
if i:
fname, ext = filename.split('.')
filename = fname + str(i) + '.' + ext

Flatten complex directory structure in Python

I want to move files from a complex directory structure to just one place. For example i have this deep hierarchy:
foo/
foo2/
1.jpg
2.jpg
...
I want it to be:
1.jpg
2.jpg
...
My current solution:
def move(destination):
for_removal = os.path.join(destination, '\\')
is_in_parent = lambda x: x.find(for_removal) > -1
with directory(destination):
files_to_move = filter(is_in_parent,
glob_recursive(path='.'))
for file in files_to_move:
shutil.move(file, destination)
Definitions: directory and glob_recursive. Note, that my code only moves files to their common parent directory, not an arbitrary destination.
How can i move all files from a complex hierarchy to a single place succinctly and elegantly?

I don't like testing the name of the file about to be moved to see if we're already in the destination directory. Instead, this solution only scans the subdirectories of the destination
import os
import itertools
import shutil
def move(destination):
all_files = []
for root, _dirs, files in itertools.islice(os.walk(destination), 1, None):
for filename in files:
all_files.append(os.path.join(root, filename))
for filename in all_files:
shutil.move(filename, destination)
Explanation: os.walk walks recursively the destination in a "top down" manner. whole filenames are constructed with the os.path.join(root, filename) call. Now, to prevent scanning files at the top of the destination, we just need to ignore the first element of the iteration of os.walk. To do that I use islice(iterator, 1, None). One other more explicit way would be to do this:
def move(destination):
all_files = []
first_loop_pass = True
for root, _dirs, files in os.walk(destination):
if first_loop_pass:
first_loop_pass = False
continue
for filename in files:
all_files.append(os.path.join(root, filename))
for filename in all_files:
shutil.move(filename, destination)

this would do, it also renames files if they collide (I commented out the actual move and replaced with a copy):
import os
import sys
import string
import shutil
#Generate the file paths to traverse, or a single path if a file name was given
def getfiles(path):
if os.path.isdir(path):
for root, dirs, files in os.walk(path):
for name in files:
yield os.path.join(root, name)
else:
yield path
destination = "./newdir/"
fromdir = "./test/"
for f in getfiles(fromdir):
filename = string.split(f, '/')[-1]
if os.path.isfile(destination+filename):
filename = f.replace(fromdir,"",1).replace("/","_")
#os.rename(f, destination+filename)
shutil.copy(f, destination+filename)

Run recursively through directory, move the files and launch move for directories:
import shutil
import os
def move(destination, depth=None):
if not depth:
depth = []
for file_or_dir in os.listdir(os.path.join([destination] + depth, os.sep)):
if os.path.isfile(file_or_dir):
shutil.move(file_or_dir, destination)
else:
move(destination, os.path.join(depth + [file_or_dir], os.sep))

import os.path, shutil
def move(src, dest):
not_in_dest = lambda x: os.path.samefile(x, dest)
files_to_move = filter(not_in_dest,
glob_recursive(path=src))
for f in files_to_move:
shutil.move(f, dest)
Source for glob_recursive. Does not change name of file, if they collide.
samefile is a safe way to compare paths. But it doesn't work on Windows, so check How to emulate os.path.samefile behaviour on Windows and Python 2.7?.

def splitPath(p):
a,b = os.path.split(p)
return (splitPath(a) if len(a) and len(b) else []) + [b]
def safeprint(s):
try:
print(s)
except UnicodeEncodeError:
if sys.version_info >= (3,):
print(s.encode('utf8').decode(sys.stdout.encoding))
else:
print(s.encode('utf8'))
def flatten(root, doit):
SEP = "¦"
REPL = "?"
folderCount = 0
fileCount = 0
if not doit:
print("Simulating:")
for path, dirs, files in os.walk(root, topdown=False):
if path != root:
for f in files:
sp = splitPath(path)
np = ""
for element in sp[1:]:
e2 = element.replace(SEP, REPL)
np += e2 + SEP
f2 = f.replace(SEP, REPL)
newName = np + f2
safeprint("Moved: "+ newName )
if doit:
shutil.move(os.path.join(path, f), os.path.join(root, f))
# Uncomment, if you want filenames to be based on folder hierarchy.
#shutil.move(os.path.join(path, f), os.path.join(root, newName))
fileCount += 1
safeprint("Removed: "+ path)
if doit:
os.rmdir(path)
folderCount += 1
if doit:
print("Done.")
else:
print("Simulation complete.")
print("Moved files:", fileCount)
print("Removed folders:", folderCount)
directory_path = r"C:\Users\jd\Documents\myFtpData"
flatten(directory_path, True)

Adding on to the answers, I believe my answer will satisfy all your needs, the other answers fail when there is a subdirectory and file with the same filename as the upper directory.
This was SOLVED here, Also look at my Github Repo for Structured File Copy and Flattened File Copy:
import os, fnmatch, shutil
PATTERN = '*.txt' # Regex Pattern to Match files
INPUT_FOLDER = "A" # os.getcwd()
INPUT_FOLDER = os.path.abspath(INPUT_FOLDER)
include_input_foldername = False
prepend = "_included" if include_input_foldername else ""
OUTPUT_FOLDER = f"Structured_Copy_{os.path.basename(INPUT_FOLDER)}{prepend}"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
def find(pattern, path):
"""Utility to find files wrt a regex search"""
result = []
for root, dirs, files in os.walk(path):
for name in files:
if fnmatch.fnmatch(name, pattern):
result.append(os.path.join(root, name))
return result
all_files = find(PATTERN, INPUT_FOLDER)
for each_path in all_files:
relative_path = os.path.relpath(each_path, os.path.dirname(INPUT_FOLDER)) if include_input_foldername else os.path.relpath(each_path, INPUT_FOLDER)
flattened_relative_fullpath = os.path.join(OUTPUT_FOLDER, relative_path)
os.makedirs(os.path.dirname(flattened_relative_fullpath), exist_ok=True)
shutil.copy(each_path, flattened_relative_fullpath)
print(f"Copied {each_path} to {flattened_relative_fullpath}")
print(f"Finished Copying {len(all_files)} Files from : {INPUT_FOLDER} to : {OUTPUT_FOLDER}")

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Organizing data by filetype - python

Related

python file name change

Shutil - Move and Rename

Change a filename?

Move file to a folder or make a renamed copy if it exists in the destination folder

Flatten complex directory structure in Python

Categories

Resources