Comparing two directories with subdirectories to find any changes?

Comparing two directories with subdirectories to find any changes? - python

For starters I've only been playing with python for about a 2 weeks now and im relatively new to its proccessess, I'm trying to create a script that compares two directories with subdirectories and prints out ANY changes. I've read articles on hear about using os.walk to walk the directories and I've managed to write the script that prints all the files in a directory and its subdirectories in a understandable manner. I've also read on here and learned how to compare two directories but it only compares 1 file deep.
import os
x = 'D:\\xfiles'
y = 'D:\\yfiles'
q= [ filename for filename in x if filename not in y ]
print q
Obviously that does not do what I want it to. This however is listing all files and all directories.
import os
x = 'D:\\xfiles'
x1 = os.walk(x)
for dirName, subdirList, fileList in x1:
print ('Directory: %s' % dirName)
for fname in fileList:
print ('\%s' % fname)
How do I combine them and get it to work?

Write a function to aggregate your listing.
import os
def listfiles(path):
files = []
for dirName, subdirList, fileList in os.walk(path):
dir = dirName.replace(path, '')
for fname in fileList:
files.append(os.path.join(dir, fname))
return files
x = listfiles('D:\\xfiles')
y = listfiles('D:\\yfiles')
You could use a list comprehension to extract the files that are not in both directories.
q = [filename for filename in x if filename not in y]
But using sets is much more efficient and flexible.
files_only_in_x = set(x) - set(y)
files_only_in_y = set(y) - set(x)
files_only_in_either = set(x) ^ set(y)
files_in_both = set(x) & set(y)
all_files = set(x) | set(y)

I guess that best way to go will be external programs, as #Robᵩ suggests in the comment.
Using Python I would recommend doing following:
import os
def fileIsSame(right, left, path):
return os.path.exists (os.path.join(left, path.replace(right, '')));
def compare(right, left):
difference = list();
for root, dirs, files in os.walk(right):
for name in files:
path = os.path.join(root, name);
# check if file is same
if fileIsSame(right, left, path):
if os.path.isdir(path):
# recursively check subdirs
difference.extend(compare(path, left));
else:
# count file as difference
difference.append(path);
return difference;
This approach lacks normal fileIsSame function that would make sure files are same by content or by date modified and be sure to handle paths correctly (as I'm not sure this variant will). This algorithm requres you to specify full paths.
Usage example:
print (compare(r'c:\test', r'd:\copy_of_test'));
If second folder is copy of first, all the differences in paths (different disk letter and foldername) is ignored. Output will be [].

import os
def ls(path):
all = []
walked = os.walk(path)
for base, sub_f, files in walked:
for sub in sub_f:
entry = os.path.join(base,sub)
entry = entry[len(path):].strip("\\")
all.append(entry)
for file in files:
entry = os.path.join(base,file)
entry = entry[len(path):].strip("\\")
all.append(entry)
all.sort()
return all
def folder_diff(folder1_path, folder2_path):
folder1_list = ls(folder1_path);
folder2_list = ls(folder2_path);
diff = [item for item in folder1_list if item not in folder2_list]
diff.extend([item for item in folder2_list if item not in folder1_list])
return diff

I have done a code that check two directory recursively, and if there is different, it would point out the line that different.
import os
FOLDER_A = os.path.join(os.path.dirname(__file__), 'folder_a')
FOLDER_B = os.path.join(os.path.dirname(__file__), 'folder_b')
def load_directory(directory):
files = set()
directories = set()
for file_or_directory in os.listdir(directory):
file_or_directory_path = f'{directory}/{file_or_directory}'
if os.path.isfile(file_or_directory_path):
files.add(file_or_directory)
else:
directories.add(file_or_directory)
return files, directories
def compare_files(a, b):
assert os.path.isfile(a)
assert os.path.isfile(b)
with open(a, 'r') as file:
file_a = file.read()
with open(b, 'r') as file:
file_b = file.read()
if file_a != file_b:
file_a_lines = file_a.split('\n')
file_b_lines = file_b.split('\n')
if len(file_a_lines) != len(file_b_lines):
print(f'Two file {a} and {b} have different length, of {len(file_a_lines)} and {len(file_b_lines)}')
return False
compare_lines = zip(file_a_lines, file_b_lines)
index = 0
for i in compare_lines:
index += 1
if i[0] != i[1]:
print(f'Different found in file {a} and {b}, at line number {index}')
return False
print('Some thing wrong')
return False
return True
def compare_directories(a, b):
assert not os.path.isfile(a)
assert not os.path.isfile(b)
a_files, a_directories = load_directory(a)
b_files, b_directories = load_directory(b)
if (a_files != b_files):
print(f'Different Found In {a} and {b} directories files')
print(f'A: {a_files}\nB: {b_files}')
return False
if (a_directories != b_directories):
print(f'Different Found In {a} and {b} directories subdirectories')
print(f'A: {a_directories}\nB: {b_directories}')
return False
for files in a_files:
if not compare_files(f'{a}/{files}', f'{b}/{files}'):
return False
for directories in a_directories:
if not compare_directories(f'{a}/{directories}', f'{b}/{directories}'):
return False
return True
def main():
print(compare_directories(FOLDER_A, FOLDER_B))
if __name__ == '__main__':
main()

Related

I have two files in different locations, one file contain .jpg and another file with .xml. I want to compare their names and print difference

How can I compare the files in those two folders?
Image files:
1.jpg
2.jpg
5.jpg
XML files:
1.xml
2.xml
3.xml
output:
5.jpg, 3.xml

Compare without extension and find the different using set.
import os
f1 = os.listdir("foler1/")
f2 = os.listdir("folder2/")
f1 = set(map(lambda x:os.path.splitext(x)[0], f1))
f2 = set(map(lambda x:os.path.splitext(x)[0], f2))
r1 = list(map(lambda x:x+'.jpg', f1-f2))
r2 = list(map(lambda x:x+'.xml', f2-f1))
result = r1 + r2
print(result)

This solution is reasonably fast. If you had huge directories, a hash table solution would be better. Say you want to compare directory XML/ and images/.
import os
def dir_diff(dir1, dir2):
contents1 = sorted( os.listdir(dir1))
contents2 = sorted( os.listdir(dir2))
heads1=[]
heads2=[]
for fname in contents1:
heads1.append(fname[:fname.rfind('.')] )
for fname in contents2:
heads2.append(fname[:fname.rfind('.')] )
i=j=0
out=[]
print(heads2)
print(heads1)
while True:
fname1=heads1[i]
fname2=heads1[j]
if fname1 < fname2:
out.append(contents1[i])
i+=1
elif fname2 < fname1:
out.append(contents2[j])
j+=1
else:
i+=1
j+=1
if i ==len(heads1) and j ==len(heads2):
return out
elif i==len(heads1):
return out+contents2[j:]
elif j==len(heads2):
return out+contents1[i:]
if __name__=='__main__':
folder1 = 'XML/' # NOTE: change these to the folder names you want
folder2 = 'images/' # NOTE: change these to the folder names you want
for fname in dir_diff(folder1, folder2):
os.remove(fname)

traverse directory structure in python recursively without os.walk

I am trying to write a python2 function that will recursively traverse through the whole directory structure of a given directory, and print out the results.
All without using os.walk
This is what I have got so far:
test_path = "/home/user/Developer/test"
def scanning(sPath):
output = os.path.join(sPath, 'output')
if os.path.exists(output):
with open(output) as file1:
for line in file1:
if line.startswith('Final value:'):
print line
else:
for name in os.listdir(sPath):
path = os.path.join(sPath, name)
if os.path.isdir(path):
print "'", name, "'"
print_directory_contents(path)
scanning(test_path)
This is what I currently get, the script doesn't enter the new folder:
' test2'
'new_folder'
The issue is that it does not go further down than one directory. I would also like to able to indicate visually what is a directory, and what is a file

Try this:
import os
test_path = "YOUR_DIRECTORY"
def print_directory_contents(dir_path):
for child in os.listdir(dir_path):
path = os.path.join(dir_path, child)
if os.path.isdir(path):
print("FOLDER: " + "\t" + path)
print_directory_contents(path)
else:
print("FILE: " + "\t" + path)
print_directory_contents(test_path)
I worked on windows, verify if still working on unix.
Adapted from:
http://codegists.com/snippet/python/print_directory_contentspy_skobnikoff_python

Try this out with recursion
it is much simple and less code
import os
def getFiles(path="/var/log", files=[]):
if os.path.isfile(path):
return files.append(path)
for item in os.listdir(path):
item = os.path.join(path, item)
if os.path.isfile(item):
files.append(item)
else:
files = getFiles(item, files)
return files
for f in getFiles("/home/afouda/test", []):
print(f)

Try using a recursive function,
def lastline(fil):
with open(fil) as f:
for li in f.readlines():
if li.startswith("Final Value:"):
print(li)
## If it still doesnt work try putting 'dirs=[]' here
def lookforfiles(basepath):
contents = os.listdir(basepath)
dirs = []
i = 0
while i <= len(contents):
i += 1
for n in contents:
f = os.path.join(basepath, n)
if os.path.isfile(f):
lastline(f)
print("\n\nfile %s" % n)
elif os.path.isdir(f):
print("Adding dir")
if f in dirs:
pass
else:
dirs.append(f)
else:
for x in dirs:
print("dir %s" % x)
lookforfiles(x)
sorry if this doesn't fit your example precisely but I had a hard time understanding what you were trying to do.

This question is a duplicate of Print out the whole directory tree.
TL;TR: Use os.listdir.

Python - match directories with pattern (regular expression)

I wrote a loop which ignores all sub-directories which contain .txt files within them.
src = raw_input("Enter source disk location: ")
src = os.path.abspath(src)
dst = raw_input("Enter first destination to copy: ")
dst = os.path.abspath(dst)
dest = raw_input("Enter second destination to move : ")
dest = os.path.abspath(dest)
path_patter = '(\S+)_(\d+)_(\d+)_(\d+)__(\d+)_(\d+)_(\d+)'
for dir, dirs, files in os.walk(src):
if any(f.endswith('.txt') for f in files):
dirs[:] = [] # do not recurse into subdirectories
continue
files = [os.path.join(dir, f) for f in files ]
for f in files:
part1 = os.path.dirname(f)
part2 = os.path.dirname(os.path.dirname(part1))
part3 = os.path.split(part1)[1]
path_miss1 = os.path.join(dst, "missing_txt")
path_miss = os.path.join(path_miss1, part3)
path_missing = os.path.join(dest, "missing_txt")
searchFileName = re.search(path_patter, part3)#### update
if searchFileName:#####update
try:
if not os.path.exists(path_miss):
os.makedirs(path_miss)
else:
pass
if os.path.exists(path_miss):
distutils.dir_util.copy_tree(part1, path_miss)
else:
debug_status += "missing_file\n"
pass
if (get_size(path_miss)) == 0:
os.rmdir(path_miss)
else:
pass
if not os.path.exists(path_missing):
os.makedirs(path_missing)
else:
pass
if os.path.exists(path_missing):
shutil.move(part1, path_missing)
else:
pass
if (get_size(path_missing)) == 0:
os.rmdir(path_missing)
else:
pass
except Exception:
pass
else:
continue
How to modify this code to compare directory name with regular expression in this case. (it has to ignore directories with .txt files)

import os
import re
def createEscapedPattern(path,pattern):
newPath = os.path.normpath(path)
newPath = newPath.replace("\\","\\\\\\\\")
return newPath + "\\\\\\\\" + pattern
def createEscapedPath(path):
newPath = os.path.normpath(path)
return newPath.replace("\\","\\\\")
src = 'C:\\Home\\test'
path_patter = '(\S+)_(\d+)_(\d+)_(\d+)__(\d+)_(\d+)_(\d+)$'
p = re.compile(createEscapedPattern(src,path_patter))
for dir, dirs, files in os.walk(src):
if any(f.endswith('.txt') for f in files):
dirs[:] = []
continue
if any(p.match(createEscapedPath(dir)) for f in files):
for f in files:
print createEscapedPath(dir + "/" + f)
p = re.compile(createEscapedPattern(dir,path_patter))
There are a couple of things i did here and hope this example helps
I wrote this for windows fs so used two path convert functions.
This script ignores dirs with .txt files like you implemented it
This script will start at the directory you start the script and will only print file names if the pattern matches. This is done for all subdirectory's that are not ignored by the previous rule.
Used regex in python and made it compile again for each directory so you get: 'directory/(\S+)(\d+)(\d+)_(\d+)__(\d+)(\d+)(\d+)$'

Flatten complex directory structure in Python

I want to move files from a complex directory structure to just one place. For example i have this deep hierarchy:
foo/
foo2/
1.jpg
2.jpg
...
I want it to be:
1.jpg
2.jpg
...
My current solution:
def move(destination):
for_removal = os.path.join(destination, '\\')
is_in_parent = lambda x: x.find(for_removal) > -1
with directory(destination):
files_to_move = filter(is_in_parent,
glob_recursive(path='.'))
for file in files_to_move:
shutil.move(file, destination)
Definitions: directory and glob_recursive. Note, that my code only moves files to their common parent directory, not an arbitrary destination.
How can i move all files from a complex hierarchy to a single place succinctly and elegantly?

I don't like testing the name of the file about to be moved to see if we're already in the destination directory. Instead, this solution only scans the subdirectories of the destination
import os
import itertools
import shutil
def move(destination):
all_files = []
for root, _dirs, files in itertools.islice(os.walk(destination), 1, None):
for filename in files:
all_files.append(os.path.join(root, filename))
for filename in all_files:
shutil.move(filename, destination)
Explanation: os.walk walks recursively the destination in a "top down" manner. whole filenames are constructed with the os.path.join(root, filename) call. Now, to prevent scanning files at the top of the destination, we just need to ignore the first element of the iteration of os.walk. To do that I use islice(iterator, 1, None). One other more explicit way would be to do this:
def move(destination):
all_files = []
first_loop_pass = True
for root, _dirs, files in os.walk(destination):
if first_loop_pass:
first_loop_pass = False
continue
for filename in files:
all_files.append(os.path.join(root, filename))
for filename in all_files:
shutil.move(filename, destination)

this would do, it also renames files if they collide (I commented out the actual move and replaced with a copy):
import os
import sys
import string
import shutil
#Generate the file paths to traverse, or a single path if a file name was given
def getfiles(path):
if os.path.isdir(path):
for root, dirs, files in os.walk(path):
for name in files:
yield os.path.join(root, name)
else:
yield path
destination = "./newdir/"
fromdir = "./test/"
for f in getfiles(fromdir):
filename = string.split(f, '/')[-1]
if os.path.isfile(destination+filename):
filename = f.replace(fromdir,"",1).replace("/","_")
#os.rename(f, destination+filename)
shutil.copy(f, destination+filename)

Run recursively through directory, move the files and launch move for directories:
import shutil
import os
def move(destination, depth=None):
if not depth:
depth = []
for file_or_dir in os.listdir(os.path.join([destination] + depth, os.sep)):
if os.path.isfile(file_or_dir):
shutil.move(file_or_dir, destination)
else:
move(destination, os.path.join(depth + [file_or_dir], os.sep))

import os.path, shutil
def move(src, dest):
not_in_dest = lambda x: os.path.samefile(x, dest)
files_to_move = filter(not_in_dest,
glob_recursive(path=src))
for f in files_to_move:
shutil.move(f, dest)
Source for glob_recursive. Does not change name of file, if they collide.
samefile is a safe way to compare paths. But it doesn't work on Windows, so check How to emulate os.path.samefile behaviour on Windows and Python 2.7?.

def splitPath(p):
a,b = os.path.split(p)
return (splitPath(a) if len(a) and len(b) else []) + [b]
def safeprint(s):
try:
print(s)
except UnicodeEncodeError:
if sys.version_info >= (3,):
print(s.encode('utf8').decode(sys.stdout.encoding))
else:
print(s.encode('utf8'))
def flatten(root, doit):
SEP = "¦"
REPL = "?"
folderCount = 0
fileCount = 0
if not doit:
print("Simulating:")
for path, dirs, files in os.walk(root, topdown=False):
if path != root:
for f in files:
sp = splitPath(path)
np = ""
for element in sp[1:]:
e2 = element.replace(SEP, REPL)
np += e2 + SEP
f2 = f.replace(SEP, REPL)
newName = np + f2
safeprint("Moved: "+ newName )
if doit:
shutil.move(os.path.join(path, f), os.path.join(root, f))
# Uncomment, if you want filenames to be based on folder hierarchy.
#shutil.move(os.path.join(path, f), os.path.join(root, newName))
fileCount += 1
safeprint("Removed: "+ path)
if doit:
os.rmdir(path)
folderCount += 1
if doit:
print("Done.")
else:
print("Simulation complete.")
print("Moved files:", fileCount)
print("Removed folders:", folderCount)
directory_path = r"C:\Users\jd\Documents\myFtpData"
flatten(directory_path, True)

Adding on to the answers, I believe my answer will satisfy all your needs, the other answers fail when there is a subdirectory and file with the same filename as the upper directory.
This was SOLVED here, Also look at my Github Repo for Structured File Copy and Flattened File Copy:
import os, fnmatch, shutil
PATTERN = '*.txt' # Regex Pattern to Match files
INPUT_FOLDER = "A" # os.getcwd()
INPUT_FOLDER = os.path.abspath(INPUT_FOLDER)
include_input_foldername = False
prepend = "_included" if include_input_foldername else ""
OUTPUT_FOLDER = f"Structured_Copy_{os.path.basename(INPUT_FOLDER)}{prepend}"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
def find(pattern, path):
"""Utility to find files wrt a regex search"""
result = []
for root, dirs, files in os.walk(path):
for name in files:
if fnmatch.fnmatch(name, pattern):
result.append(os.path.join(root, name))
return result
all_files = find(PATTERN, INPUT_FOLDER)
for each_path in all_files:
relative_path = os.path.relpath(each_path, os.path.dirname(INPUT_FOLDER)) if include_input_foldername else os.path.relpath(each_path, INPUT_FOLDER)
flattened_relative_fullpath = os.path.join(OUTPUT_FOLDER, relative_path)
os.makedirs(os.path.dirname(flattened_relative_fullpath), exist_ok=True)
shutil.copy(each_path, flattened_relative_fullpath)
print(f"Copied {each_path} to {flattened_relative_fullpath}")
print(f"Finished Copying {len(all_files)} Files from : {INPUT_FOLDER} to : {OUTPUT_FOLDER}")

Python case insensitive file name?

I need to load a file given it's name, but the name I get is case insensitive. "A.txt" could actually be "a.txt". How to do this the fast way (not generate all possible names and try each)?

You could list the directory the file's in (os.listdir), and see if there are matches for your filename. The matching can be done by lower-casing both filenames and comparing.

You can't do it without taking a directory listing and taking both the item you're looking for and each in the directory to a common case for comparison. The filesystem is case sensitive and that's all there is to it.
Here is a function (well, two) that I wrote to do it completely, matching a filename in an insensitive manner, recursively: http://portableapps.hg.sourceforge.net/hgweb/portableapps/development-toolkit/file/775197d56e86/utils.py#l78.
def path_insensitive(path):
"""
Get a case-insensitive path for use on a case sensitive system.
>>> path_insensitive('/Home')
'/home'
>>> path_insensitive('/Home/chris')
'/home/chris'
>>> path_insensitive('/HoME/CHris/')
'/home/chris/'
>>> path_insensitive('/home/CHRIS')
'/home/chris'
>>> path_insensitive('/Home/CHRIS/.gtk-bookmarks')
'/home/chris/.gtk-bookmarks'
>>> path_insensitive('/home/chris/.GTK-bookmarks')
'/home/chris/.gtk-bookmarks'
>>> path_insensitive('/HOME/Chris/.GTK-bookmarks')
'/home/chris/.gtk-bookmarks'
>>> path_insensitive("/HOME/Chris/I HOPE this doesn't exist")
"/HOME/Chris/I HOPE this doesn't exist"
"""
return _path_insensitive(path) or path
def _path_insensitive(path):
"""
Recursive part of path_insensitive to do the work.
"""
if path == '' or os.path.exists(path):
return path
base = os.path.basename(path) # may be a directory or a file
dirname = os.path.dirname(path)
suffix = ''
if not base: # dir ends with a slash?
if len(dirname) < len(path):
suffix = path[:len(path) - len(dirname)]
base = os.path.basename(dirname)
dirname = os.path.dirname(dirname)
if not os.path.exists(dirname):
dirname = _path_insensitive(dirname)
if not dirname:
return
# at this point, the directory exists but not the file
try: # we are expecting dirname to be a directory, but it could be a file
files = os.listdir(dirname)
except OSError:
return
baselow = base.lower()
try:
basefinal = next(fl for fl in files if fl.lower() == baselow)
except StopIteration:
return
if basefinal:
return os.path.join(dirname, basefinal) + suffix
else:
return

This is a simple recursive function to to the search Eli suggests above:
def find_sensitive_path(dir, insensitive_path):
insensitive_path = insensitive_path.strip(os.path.sep)
parts = insensitive_path.split(os.path.sep)
next_name = parts[0]
for name in os.listdir(dir):
if next_name.lower() == name.lower():
improved_path = os.path.join(dir, name)
if len(parts) == 1:
return improved_path
else:
return find_sensitive_path(improved_path, os.path.sep.join(parts[1:]))
return None

Make a directory listing; and create a dictionary containing a mapping of upper-case filenames to their actual-case filenames. Then, make your input upper-case, and look for it in the dictionary.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Comparing two directories with subdirectories to find any changes? - python

Related

I have two files in different locations, one file contain .jpg and another file with .xml. I want to compare their names and print difference

traverse directory structure in python recursively without os.walk

Python - match directories with pattern (regular expression)

Flatten complex directory structure in Python

Python case insensitive file name?

Categories

Resources