Flatten complex directory structure in Python

I want to move files from a complex directory structure to a single place. For example, I have this deep hierarchy:
foo/
    foo2/
        1.jpg
        2.jpg
        ...
I want it to be:
1.jpg
2.jpg
...
My current solution:
import os
import shutil

def move(destination):
    for_removal = os.path.join(destination, '\\')
    is_in_parent = lambda x: x.find(for_removal) > -1
    with directory(destination):
        files_to_move = filter(is_in_parent,
                               glob_recursive(path='.'))
        for file in files_to_move:
            shutil.move(file, destination)
Definitions: directory and glob_recursive. Note that my code only moves files to their common parent directory, not to an arbitrary destination.
How can I move all files from a complex hierarchy to a single place succinctly and elegantly?
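For reference, here is a plausible sketch of the two helpers the question refers to (these are assumptions, not the linked definitions): directory as a context manager that temporarily changes the working directory, and glob_recursive as an os.walk-based recursive glob.
import os
import fnmatch
from contextlib import contextmanager

@contextmanager
def directory(path):
    # Assumed helper: temporarily switch the working directory.
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)

def glob_recursive(pattern='*', path='.'):
    # Assumed helper: collect all files under path matching a glob pattern.
    matches = []
    for root, _dirs, files in os.walk(path):
        for name in fnmatch.filter(files, pattern):
            matches.append(os.path.join(root, name))
    return matches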

I don't like testing the name of the file about to be moved to see if we're already in the destination directory. Instead, this solution only scans the subdirectories of the destination:
import os
import itertools
import shutil

def move(destination):
    all_files = []
    for root, _dirs, files in itertools.islice(os.walk(destination), 1, None):
        for filename in files:
            all_files.append(os.path.join(root, filename))
    for filename in all_files:
        shutil.move(filename, destination)
Explanation: os.walk walks the destination recursively in a "top down" manner. Whole filenames are constructed with the os.path.join(root, filename) call. Now, to avoid touching the files already at the top level of the destination, we just need to skip the first element yielded by os.walk. To do that I use islice(iterator, 1, None). Another, more explicit way would be to do this:
def move(destination):
    all_files = []
    first_loop_pass = True
    for root, _dirs, files in os.walk(destination):
        if first_loop_pass:
            first_loop_pass = False
            continue
        for filename in files:
            all_files.append(os.path.join(root, filename))
    for filename in all_files:
        shutil.move(filename, destination)
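A pathlib-based sketch of the same idea (not part of the answer above; it materializes the file list first so the tree is not modified while it is being walked):
import shutil
from pathlib import Path

def move(destination):
    dest = Path(destination)
    # Collect first, then move, so rglob is not iterating over a changing tree.
    for path in list(dest.rglob('*')):
        # Only files that live below the top level get moved.
        if path.is_file() and path.parent != dest:
            shutil.move(str(path), str(dest))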

This would do; it also renames files if they collide (I commented out the actual move and replaced it with a copy):
import os
import sys
import string
import shutil

# Generate the file paths to traverse, or a single path if a file name was given
def getfiles(path):
    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            for name in files:
                yield os.path.join(root, name)
    else:
        yield path

destination = "./newdir/"
fromdir = "./test/"
for f in getfiles(fromdir):
    filename = string.split(f, '/')[-1]
    if os.path.isfile(destination + filename):
        filename = f.replace(fromdir, "", 1).replace("/", "_")
    # os.rename(f, destination + filename)
    shutil.copy(f, destination + filename)

Run recursively through the directory, move the files, and call move again for the directories:
import shutil
import os

def move(destination, depth=None):
    if not depth:
        depth = []
    current = os.path.join(destination, *depth)
    for file_or_dir in os.listdir(current):
        full_path = os.path.join(current, file_or_dir)
        if os.path.isfile(full_path):
            if depth:  # files already at the top level stay where they are
                shutil.move(full_path, destination)
        else:
            move(destination, depth + [file_or_dir])

import os.path, shutil

def move(src, dest):
    not_in_dest = lambda x: not os.path.samefile(os.path.dirname(x), dest)
    files_to_move = filter(not_in_dest,
                           glob_recursive(path=src))
    for f in files_to_move:
        shutil.move(f, dest)
Source for glob_recursive. Does not change file names if they collide.
samefile is a safe way to compare paths, but it is not available on Windows before Python 3.2, so check How to emulate os.path.samefile behaviour on Windows and Python 2.7?.
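On older Python/Windows combinations, a rough fallback (an approximation that ignores links, not a full samefile emulation) is to compare normalized absolute paths:
import os

def same_path(a, b):
    # Hypothetical helper: approximate samefile by comparing normalized paths.
    return os.path.normcase(os.path.abspath(a)) == os.path.normcase(os.path.abspath(b))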

import os
import sys
import shutil

def splitPath(p):
    a, b = os.path.split(p)
    return (splitPath(a) if len(a) and len(b) else []) + [b]

def safeprint(s):
    try:
        print(s)
    except UnicodeEncodeError:
        if sys.version_info >= (3,):
            print(s.encode('utf8').decode(sys.stdout.encoding))
        else:
            print(s.encode('utf8'))

def flatten(root, doit):
    SEP = "¦"
    REPL = "?"
    folderCount = 0
    fileCount = 0
    if not doit:
        print("Simulating:")
    for path, dirs, files in os.walk(root, topdown=False):
        if path != root:
            for f in files:
                sp = splitPath(path)
                np = ""
                for element in sp[1:]:
                    e2 = element.replace(SEP, REPL)
                    np += e2 + SEP
                f2 = f.replace(SEP, REPL)
                newName = np + f2
                safeprint("Moved: " + newName)
                if doit:
                    shutil.move(os.path.join(path, f), os.path.join(root, f))
                    # Uncomment if you want filenames to be based on the folder hierarchy.
                    #shutil.move(os.path.join(path, f), os.path.join(root, newName))
                fileCount += 1
            safeprint("Removed: " + path)
            if doit:
                os.rmdir(path)
            folderCount += 1
    if doit:
        print("Done.")
    else:
        print("Simulation complete.")
    print("Moved files:", fileCount)
    print("Removed folders:", folderCount)

directory_path = r"C:\Users\jd\Documents\myFtpData"
flatten(directory_path, True)
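Since the doit flag toggles a dry run, it is natural to simulate first and only then do the real move, for example:
flatten(directory_path, False)  # dry run: only print what would be moved and removed
flatten(directory_path, True)   # actually move the files and remove emptied folders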

Adding on to the answers: I believe my answer will satisfy all your needs; the other answers fail when a subdirectory contains a file with the same name as a file in an upper directory.
This was solved here; also have a look at my GitHub repo for Structured File Copy and Flattened File Copy:
import os, fnmatch, shutil

PATTERN = '*.txt'  # Glob pattern to match files
INPUT_FOLDER = "A"  # os.getcwd()
INPUT_FOLDER = os.path.abspath(INPUT_FOLDER)
include_input_foldername = False
prepend = "_included" if include_input_foldername else ""
OUTPUT_FOLDER = f"Structured_Copy_{os.path.basename(INPUT_FOLDER)}{prepend}"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def find(pattern, path):
    """Utility to find files matching a glob pattern"""
    result = []
    for root, dirs, files in os.walk(path):
        for name in files:
            if fnmatch.fnmatch(name, pattern):
                result.append(os.path.join(root, name))
    return result

all_files = find(PATTERN, INPUT_FOLDER)
for each_path in all_files:
    relative_path = os.path.relpath(each_path, os.path.dirname(INPUT_FOLDER)) if include_input_foldername else os.path.relpath(each_path, INPUT_FOLDER)
    flattened_relative_fullpath = os.path.join(OUTPUT_FOLDER, relative_path)
    os.makedirs(os.path.dirname(flattened_relative_fullpath), exist_ok=True)
    shutil.copy(each_path, flattened_relative_fullpath)
    print(f"Copied {each_path} to {flattened_relative_fullpath}")

print(f"Finished Copying {len(all_files)} Files from : {INPUT_FOLDER} to : {OUTPUT_FOLDER}")

Related

Python: How to get the full path of a file in order to move it?

I had files that were in zips. I unzipped them with 7-Zip, so they are in folders with the zip file names.
Each of these folders has either a .otf or .ttf (some have both) that I want out of them and moved to another folder.
I have tried a few methods of getting the full path of the files but every one of them leaves out the folder that the file is actually in.
Here is my latest try:
import os
import shutil
from pathlib import Path

result = []
for root, dirs, files in os.walk("."):
    for d in dirs:
        continue
    for f in files:
        if f.endswith(".otf"):
            print(f)
            p = Path(f).absolute()
            parent_dir = p.parents[1]
            p.rename(parent_dir / p.name)
        elif f.endswith(".ttf"):
            print(f)
            p = Path(f).absolute()
            parent_dir = p.parents[1]
            p.rename(parent_dir / p.name)
        else:
            continue
Other attempts:
# parent_dir = Path(f).parents[1]
# shutil.move(f, parent_dir)
#print("OTF: " + f)
# fn = f
# f = f[:-4]
# f += '\\'
# f += fn
# result.append(os.path.realpath(f))
#os.path.relpath(os.path.join(root, f), "."))
I know this is something simple but I just can't figure it out. Thanks!
You should join the file name with the root path returned by os.walk:
for root, dirs, files in os.walk("."):
    for d in dirs:
        continue
    for f in files:
        if f.endswith(".otf"):
            p = Path(os.path.join(root, f)).absolute()
            parent_dir = p.parents[1]
            p.rename(parent_dir / p.name)
        elif f.endswith(".ttf"):
            p = Path(os.path.join(root, f)).absolute()
            parent_dir = p.parents[1]
            p.rename(parent_dir / p.name)
        else:
            continue
for root, dirs, files in os.walk("."):
    for d in dirs:
        continue
    for f in files:
        print(os.path.abspath(os.path.join(root, f)))
You can use os.path.abspath() to get the full path of a file, as long as you join the name with its root first.
You would also still need to filter for the file types you want.
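For example, a short sketch that combines both points (the full path via the walk root, plus a suffix filter):
import os

for root, dirs, files in os.walk("."):
    for f in files:
        if f.endswith((".otf", ".ttf")):
            print(os.path.abspath(os.path.join(root, f)))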

Python - match directories with pattern (regular expression)

I wrote a loop that ignores all sub-directories that contain .txt files.
src = raw_input("Enter source disk location: ")
src = os.path.abspath(src)
dst = raw_input("Enter first destination to copy: ")
dst = os.path.abspath(dst)
dest = raw_input("Enter second destination to move : ")
dest = os.path.abspath(dest)
path_patter = '(\S+)_(\d+)_(\d+)_(\d+)__(\d+)_(\d+)_(\d+)'
for dir, dirs, files in os.walk(src):
    if any(f.endswith('.txt') for f in files):
        dirs[:] = []  # do not recurse into subdirectories
        continue
    files = [os.path.join(dir, f) for f in files]
    for f in files:
        part1 = os.path.dirname(f)
        part2 = os.path.dirname(os.path.dirname(part1))
        part3 = os.path.split(part1)[1]
        path_miss1 = os.path.join(dst, "missing_txt")
        path_miss = os.path.join(path_miss1, part3)
        path_missing = os.path.join(dest, "missing_txt")
        searchFileName = re.search(path_patter, part3)  #### update
        if searchFileName:  ##### update
            try:
                if not os.path.exists(path_miss):
                    os.makedirs(path_miss)
                else:
                    pass
                if os.path.exists(path_miss):
                    distutils.dir_util.copy_tree(part1, path_miss)
                else:
                    debug_status += "missing_file\n"
                    pass
                if (get_size(path_miss)) == 0:
                    os.rmdir(path_miss)
                else:
                    pass
                if not os.path.exists(path_missing):
                    os.makedirs(path_missing)
                else:
                    pass
                if os.path.exists(path_missing):
                    shutil.move(part1, path_missing)
                else:
                    pass
                if (get_size(path_missing)) == 0:
                    os.rmdir(path_missing)
                else:
                    pass
            except Exception:
                pass
        else:
            continue
How do I modify this code to compare the directory name against a regular expression in this case? (It still has to ignore directories with .txt files.)
import os
import re

def createEscapedPattern(path, pattern):
    newPath = os.path.normpath(path)
    newPath = newPath.replace("\\", "\\\\\\\\")
    return newPath + "\\\\\\\\" + pattern

def createEscapedPath(path):
    newPath = os.path.normpath(path)
    return newPath.replace("\\", "\\\\")

src = 'C:\\Home\\test'
path_patter = '(\S+)_(\d+)_(\d+)_(\d+)__(\d+)_(\d+)_(\d+)$'
p = re.compile(createEscapedPattern(src, path_patter))
for dir, dirs, files in os.walk(src):
    if any(f.endswith('.txt') for f in files):
        dirs[:] = []
        continue
    if any(p.match(createEscapedPath(dir)) for f in files):
        for f in files:
            print createEscapedPath(dir + "/" + f)
    p = re.compile(createEscapedPattern(dir, path_patter))
There are a couple of things I did here, and I hope this example helps:
I wrote this for the Windows filesystem, so I used two path-conversion functions.
The script ignores dirs with .txt files, as you implemented it.
The script starts at the directory where you run it and only prints file names if the pattern matches. This is done for every subdirectory that is not ignored by the previous rule.
I used regex in Python and recompiled it for each directory, so you get: 'directory/(\S+)_(\d+)_(\d+)_(\d+)__(\d+)_(\d+)_(\d+)$'
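A simpler sketch of the same idea (assuming src is defined as above and the pattern only needs to be tested against each directory's base name) is to match os.path.basename(dir) directly:
import os
import re

path_pattern = re.compile(r'(\S+)_(\d+)_(\d+)_(\d+)__(\d+)_(\d+)_(\d+)$')

for dir, dirs, files in os.walk(src):
    if any(f.endswith('.txt') for f in files):
        dirs[:] = []  # still ignore directories that contain .txt files
        continue
    if path_pattern.match(os.path.basename(dir)):
        for f in files:
            print(os.path.join(dir, f))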

Organizing data by filetype

I am trying to sort a large number of files based off of their file extension. A lot of the files are .doc, .docx, .xls, etc.
This is what I was thinking in my head, but if there is a simpler way to do things, let me know! I do have multiple files with the same extension, so I don't want it to create a new folder for that extension every time and overwrite the previous file. I also have a much larger list, but for this example I don't believe all of them are needed. The OS is MacOS.
import os, shutil

extList = ['.doc', '.docx', '.xls']
for ext in extList:
    os.mkdir(path + '/' + ext + '_folder')
    for file in os.listdir(filepath):
        if file.endswith(ext):  # missing an indent
            print(file)
            shutil.copyfile(file + '/' + ext + '_folder' + file)
Also, if I run into a file that I do not have on my list, I would like it to go into a folder named 'noextlist'.
Here is what I was able to create quickly
import os, re, shutil

DocFolder = r'...'  # Your doc folder path
DocxFolder = r'...'  # Your docx folder path
XlsFolder = r'...'  # Your xls folder path
MiscFolder = r'...'  # Your misc folder path

for root, dirs, files in os.walk(r'...'):  # Your folder path you want to sort
    for file in files:
        if file.endswith(".doc"):
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, DocFolder)
        elif file.endswith(".docx"):
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, DocxFolder)
        elif file.endswith(".xls"):
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, XlsFolder)
        else:
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, MiscFolder)
Edit: The main piece here is the for root, dirs, files in os.walk(...) loop. It lets the program traverse the provided path, find all files including those in subfolders, and sort them accordingly.
import errno
import shutil
from os import listdir, mkdir
from os.path import splitext, join

# set for fast lookup
extList = set(['.doc', '.docx', '.xls'])
# source path
filepath = ...
# dest path
path = ...

for f in listdir(filepath):
    # extract extension from file name
    ext = splitext(f)[1]
    if ext in extList:
        dir_ = join(path, "{}_folder".format(ext))
        try:
            mkdir(dir_)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise  # re-raise any error other than "already exists"
        dest = join(dir_, f)
    else:
        dest = join(path, "noextlist_folder", f)
    shutil.copy2(join(filepath, f), dest)
If I understand correctly, you like your solution but you need a way to rename files with duplicate names so that the extras don't disappear. You can check if the destination file already exists and construct a variant name by adding _1, _2, etc. to the filename until you find something unused.
newpathname = path + '/' + ext + '_folder' + "/" + file
base, extension = os.path.splitext(newpathname)
n = 0
while os.path.exists(newpathname):
    n += 1
    newpathname = "%s_%d%s" % (base, n, extension)
shutil.copyfile(filepath + "/" + file, newpathname)
But your code has some other glitches, so here's a rewritten scanner. It uses os.walk() to descend into several levels of subdirectories (you don't say if that's needed or not), and it collects files of all extensions in one pass. And it constructs variant names as before.
import os, shutil
from os.path import join as joinpath

extList = ['.doc', '.docx', '.xls']

# Make sure the destination directories exist
for ext in extList:
    extdir = joinpath(path, ext[1:] + "_folder")
    if not os.path.exists(extdir):
        os.mkdir(extdir)

for dirname, _dirs, files in os.walk(filepath):
    for file in files:
        base, ext = os.path.splitext(file)
        if ext not in extList:
            continue
        destpath = joinpath(path, ext[1:] + "_folder")
        n = 0
        newpathname = joinpath(destpath, file)
        # If the new name is in use, find an unused variant
        while os.path.exists(newpathname):
            n += 1
            newfile = "%s_%d%s" % (base, n, ext)
            newpathname = joinpath(destpath, newfile)
        shutil.copy(joinpath(dirname, file), newpathname)  # or another copy method

return text file path

I want to return the path of a file if it is found by the program, but I want it to continue looping (or recursing) until all files are checked.
def findAll(fname, path):
    for item in os.listdir(path):
        n = os.path.join(path, item)
        try:
            findAll(n, fname)
        except:
            if item == fname:
                print(os.idontknow(item))
So I'm having trouble getting the path; right now I have
os.idontknow(item)
as a placeholder.
Input is:
findAll('fileA.txt', 'testpath')
The output is:
['testpath\\fileA.txt', 'testpath\\folder1\\folder11\\fileA.txt', 'testpath\\folder2\\fileA.txt']
Per my comment above, here is an example that will start at the current directory and search through all sub-directories, looking for files matching fname:
import os

# path is your starting point - everything under it will be searched
path = os.getcwd()
fname = 'file1.txt'

my_files = []
# Start iterating, and anytime we see a file that matches fname,
# add it to our list
for root, dirs, files in os.walk(path):
    for name in files:
        if name == fname:
            # root here is the path to the file
            my_files.append(os.path.join(root, name))

print my_files
Or as a function (more appropriate for your case :) ):
import os

def findAll(fname, start_dir=os.getcwd()):
    my_files = []
    for root, dirs, files in os.walk(start_dir):
        for name in files:
            if name == fname:
                my_files.append(os.path.join(root, name))
    return my_files

print findAll('file1.txt')
print findAll('file1.txt', '/some/other/starting/directory')
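One caveat worth noting about the version above: a default such as start_dir=os.getcwd() is evaluated once, when the function is defined, not each time it is called. A small sketch that defers the lookup to call time:
import os

def findAll(fname, start_dir=None):
    # Resolve the default at call time rather than at definition time
    if start_dir is None:
        start_dir = os.getcwd()
    my_files = []
    for root, dirs, files in os.walk(start_dir):
        for name in files:
            if name == fname:
                my_files.append(os.path.join(root, name))
    return my_files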
Something like this, maybe?
import os

path = "path/to/your/dir"
for (path, dirs, files) in os.walk(path):
    print files

Python: return position and file size

Where should the return be placed in this Python function?
def myPath():
    for root, dirs, files in os.walk(dir):
        for fn in files:
            path = os.path.join(root, fn)
            return path
        return path
    return path
ls /home/bb/C/
a.out main.c simple_write t.c
Having return in any of the three positions is not what I want.
I want the result to be all of the files in the C directory, which this generator gives me:
def filesize(path):
    for root, dirs, files in os.walk(PATH):
        for fn in files:
            path = os.path.join(root, fn)
            size = os.stat(path).st_size
            yield size, path

for size, path in filesize(PATH):
    print size, path
But how can I achieve the same thing with the following code? How do I modify it?
def find(path):
    return [os.path.join(root, fn)
            for root, dir, files in os.walk(dirs)
            for fn in files]
Return a list of the paths, not only a single path:
def find(path):
    return [os.path.join(root, fn)
            for root, dirs, files in os.walk(path)
            for fn in files]
You could also use yield inside the inner loop to get a generator function, see The Python yield keyword explained.
What you want is a generator:
def myPath(mydir):
for root,dirs,files in os.walk(mydir):
for fn in files:
path = os.path.join(root, fn)
yield path # <----- Instead of return
for path in myPath(some_dir):
print path
Here are 3 ways, if you just need the /home/bb/C directory listed:
import os, glob

def listWithoutDirectories(path):
    return [os.path.join(root, fn)
            for root, dirs, files in os.walk(path)
            if root == path
            for fn in files]

def listWithDirectories(path):
    return [os.path.join(path, fn)
            for fn in os.listdir(path)]

print listWithoutDirectories('/home/bb/C')
print listWithDirectories('/home/bb/C')
# Or with a glob
print glob.glob('/home/bb/C/*')
