Get folder structure along with folder/file sizes in python - python

After doing my research for this specific task, I found that most of the solutions given for this kind of problem either return a list of all the files or the TOTAL size of the folder/file.
What I am trying to achieve is get an output in the CSV file stating the folder structure i.e. folders - sub folders - files (optional) along with the size information for EACH.
There is no specific format for the CSV. I just need to know the tree structure with the size of the folder/sub-folder.
The reason behind this is that we are moving from physical servers to the cloud. In order to verify whether all the data was retained correctly during conversion I need to make a similar list of all SHARED DRIVES which can later be validated.
Looking forward to meaningful insights. Thanks!

Edit:
Sooo, that should be what you are asking for:
import os
import csv
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 and is then
    printed with one decimal place and the matching prefix.

    Args:
        num: Size to format (int or float; negative values are allowed).
        suffix: Text appended after the unit prefix.

    Returns:
        A string such as ``"1.5KB"`` or ``"0.0B"``.
    """
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    # Anything that survived eight divisions is in "yotta" territory. Use the
    # same single-letter prefix style as the units above (not the binary 'Yi').
    return "%.1f%s%s" % (num, 'Y', suffix)
def get_size(start_path='.'):
    """Return the formatted total size of all files below *start_path*.

    The tree is traversed with ``os.walk`` and the size of every file is
    summed; the sum is then passed through ``sizeof_fmt``.

    Args:
        start_path: Directory whose contents are measured.

    Returns:
        The human-readable total as produced by ``sizeof_fmt``.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            try:
                total_size += os.path.getsize(fp)
            except OSError:
                # Broken symlinks or files that disappear while walking would
                # otherwise abort the whole measurement; leave them out.
                continue
    return sizeof_fmt(total_size)
# newline="" lets the csv module control line endings itself; without it
# every row is followed by a blank line on Windows.
with open("yourfilename.csv", mode="w", newline="") as dir_file:
    csv_writer = csv.writer(dir_file, delimiter=",")

    def files_and_sizes(start_path):
        """Write one CSV row (name, formatted size) per directory below *start_path*.

        Sub-directories are visited recursively, parent before children.
        """
        for file in os.listdir(start_path):
            # os.path.join picks the right separator for the current platform.
            path = os.path.join(start_path, file)
            if os.path.isdir(path):
                csv_writer.writerow([file, get_size(path)])
                files_and_sizes(path)

    files_and_sizes(r"C:\your\path\here")
Updated to better fit the question.
You can get all files with sizes like this:
import os
all_files_with_size = []
def files_and_sizes(start_path):
    """Return the contents of *start_path* as a nested list.

    Every non-directory entry becomes a ``(name, size_in_bytes)`` tuple and
    every sub-directory becomes a nested list built the same way. Entries
    appear in the order returned by ``os.listdir``.

    Args:
        start_path: Directory to describe.

    Returns:
        A list mixing ``(name, size)`` tuples and nested lists.
    """
    current_dir = []
    for file in os.listdir(start_path):
        # os.path.join instead of a hard-coded "\\" so the function also
        # works on platforms whose separator is "/".
        path = os.path.join(start_path, file)
        if os.path.isdir(path):
            current_dir.append(files_and_sizes(path))
        else:
            current_dir.append((file, os.lstat(path).st_size))
    return current_dir
It will return a list containing all files like (file, size) and a sublist for each directory.
I recommend appending the entries to a file, but the formatting is up to you.
Also, if you want the directory sizes as well:
if os.path.isdir(path) is True:
    # list.append takes exactly one argument, so the (name, size) pair must
    # be wrapped in a tuple.
    # NOTE(review): st_size of a directory describes the directory entry
    # itself, not the sum of its contents - confirm that is what you want.
    current_dir.append((file, os.lstat(path).st_size))
    current_dir.append(files_and_sizes(path))

I believe you will have to use a combination of the solutions that you have already found. Such as 'os.listdir(path)' to get the contents of a directory, 'os.lstat(path).st_size' to get file size, and 'os.path.isdir(path)' and 'os.path.isfile(path)' to determine the type.

Related

open files, keeping folder structure as nested list

I have a folder with the following structure:
data
|-folder1
|--subfolder1
|--file1
|--file2
|--subfolder2
|file1
|file2
|-folder2
|--subfolder1
|--file1
|--file2
|--subfolder2
|file1
|file2
with many folders, subfolder and files.
How can i create a list that is subdivided into smaller lists that contain my data?
For example, I'd end up with a list called data and I could retrieve file1 from folder1-subfolder1 by indexing data[0][0][0]?
As of now, I have created empty lists for each file but I'm not sure on how to append to a list of lists.
I have:
file1 = []
file2 = []
for folder in sorted(os.listdir(path)):
if folder != 'Documentation.txt':
for subfolder in sorted(os.listdir(path + '/' + folder)):
if subfolder != '.DS_Store':
for file in sorted(os.listdir(path+ '/' + folder + '/' + subfolder)):
if file.endswith(".x.dat"):
file1.append(pd.read_csv((path + '/' + folder + '/' + subfolder + '/' + file), header=None, sep=' '))
if file.endswith(".y.dat"):
file2.append(pd.read_csv((path + '/' + folder + '/' + subfolder + '/' + file), header=None, sep=' '))
data = [file1, file2]
This returns all the data files, but I'm struggling to figure out how to nest each file in a list of list according to the original folder structure... I feel like the solution will be pretty trivial, i'm just not great with python. Thanks
You could try the following with pathlib's Path.rglob() and groupby from itertools (all standard library):
from pathlib import Path
from itertools import groupby
from functools import partial
def key(i, file):
    """Return the i-th path component of the directory that contains *file*."""
    return file.parent.parts[i]


base = Path("data")
# Index of the first path component below `base` (1 for a plain "data").
depth = len(base.parts)
data = []
# groupby only merges *adjacent* items with equal keys and rglob gives no
# ordering guarantee, so the paths are sorted before grouping.
for _, group1 in groupby(sorted(base.rglob("*.dat")), key=partial(key, depth)):
    data.append([])
    for _, group2 in groupby(group1, key=partial(key, depth + 1)):
        data[-1].append([file.name for file in group2])
With a test structure created by
base = Path("data")
for i in range(1, 4):
for j in range(1, 3):
path = (base / f"folder{i}") / f"subfolder{j}"
path.mkdir(parents=True, exist_ok=True)
for k in range(1, 3):
with open(path / f"file{i}-{j}-{k}.dat", "w") as file:
file.write("A,B,C\n1,2,3\n4,5,6")
this delivers the following data:
[[['file1-1-1.dat', 'file1-1-2.dat'], ['file1-2-1.dat', 'file1-2-2.dat']],
[['file2-1-1.dat', 'file2-1-2.dat'], ['file2-2-1.dat', 'file2-2-2.dat']],
[['file3-1-1.dat', 'file3-1-2.dat'], ['file3-2-1.dat', 'file3-2-2.dat']]]
Your code implies that you actually don't want to collect the filenames but pd.read_csv() them and store the dataframes in data. To do that you have to replace
data[-1].append([file.name for file in group2])
with
data[-1].append([pd.read_csv(file) for file in group2])
And it might well be that you have to add more logic to the file selection: I just went with the .dat suffix.
You could do something similar with os.walk instead, as suggested in the other answer:
from pathlib import Path
from os import walk
base = "data"
data = []
for root, _, files in walk(base):
if ".DS_Store" in root:
continue
num_parts = len(Path(root).parts)
if num_parts == 2:
data.append([])
elif num_parts == 3:
data[-1].append([file for file in files if file.endswith(".dat")])
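or, respectively, to load the dataframes instead of collecting the file names, use this as the last line: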
data[-1].append([pd.read_csv(Path(root) / file) for file in files])
It's not clear to me what's the exact output you want, but I'm pretty sure os.walk is probably the best option for you to generate a tree of your files:
>>> import os
>>> import re
>>> data_path = '/Users/nilton/data'
>>> files_paths = []
>>> for dirpath, dirnames, filenames in os.walk(data_path):
... for filename in filenames:
... if re.match('\.dat', filename, re.I):
... files_paths.append(filename)
...
>>> files_paths
['/Users/nilton/data/folder2/subfolder2/file2.dat',
'/Users/nilton/data/folder2/subfolder2/file1.dat',
...]
Knowing this and reading the os.walk documentation, you can manage to get your desired output by playing with the 3-tuple (dirpath, dirnames, filenames) output from os.walk.

How can I confirm and remove the original files after sorting and copying them into several folders?

I'm a newbie and I'm trying to make office work a little less tedious. I currently have a little program that sorts and copies .pdf files from a folder into several folders, based on who these files need to be sent to later.
It works great. There's just the issue that I keep double-checking if it did its job. So then I added a bit where it counts the copied files to make checking easier.
Now I've been trying to figure out if I could make the program compare the list of files in the original folder with a list of files from all the other destination folders and then delete the originals if the files are indeed copied.
I've also resorted to having the program print the resulting file paths, but it's ugly and still requires me to manually compare.
Here's my code:
import os
import shutil
import pathlib
import pprint
dir = ('[path to original folder]')
files = os.listdir(dir)
user_data = [
('Karl H. Preusse', [Path to Karl]),
('Rom', [Path to Rom]),
('Hochschule', [Path to Hochschule]),
('Kiefer', [Path to Kiefer),
('Penny', [Path to Penny),
('Steigenberger', [Path to Steigenberger]),
('Penzkofer', [Path to Penzkofer]),
('Stoffel', [Path to Stoffel]),
('Cavertitzer', [Path to Cavertitzer])
]
for pattern, dest_dir in user_data:
matching_files = [f for f in files if pattern in f]
for filename in matching_files:
full_filename = os.path.join(dir, filename)
if os.path.isfile(full_filename):
if not os.path.exists(dest_dir):
os.makedirs(dest_dir)
shutil.copy(full_filename, dest_dir)
pprint.pprint(shutil.copy(full_filename, dest_dir))
stetje_datotek = sum(len(files) for _, _, files in os.walk([Path to directory that holds the copy folders])) #defines files to count
print('Stevilo datotek v mapi Posiljanje je: {}' .format(stetje_datotek)) #Prints out how many files are in the target folders.
Below are my attempts at getting things automated.
#I commented this function out as I couldn't figure out how to get the data out of it.
#def sub_files(folder):
# relpath = os.path.relpath
# join = os.path.join
# for path, _, files in os.walk([Path to directory that holds the copy folders]):
# relative = relpath(path, [Path to directory that holds the copy folders])
# for file in files:
# yield join(relative, file)
#print(sub_files)
Here I thought to use inputs to individually check each folder:
#print(os.listdir([Path to directory that holds the copy folders]))
#if input() == 'Penzkofer':
#pprint.pprint(os.listdir([Path to Penzkofer folder]))
And here I tried to compare lists, but I get a TypeError: unhashable type: 'list' error
prvotne_datoteke = set(os.listdir(dir))
kopirane_datoteke = set(os.walk([Path to directory that holds the copy folders])
set(prvotne_datoteke).intersection(kopirane_datoteke)
Any help is appreciated. Thank you.
One approach is to print the names of each copied file recipient and the number of recipients, then delete the original file if all intended recipients are included.
to_be_copied = set()  # names of the original files that were actually copied
for pattern, dest_dir in user_data:
    matching_files = [f for f in files if pattern in f]
    for filename in matching_files:
        full_filename = os.path.join(dir, filename)
        if os.path.isfile(full_filename):
            os.makedirs(dest_dir, exist_ok=True)
            # Copy exactly once; shutil.copy returns the destination path,
            # which is what gets printed.
            pprint.pprint(shutil.copy(full_filename, dest_dir))
            # Record the name only after the copy happened, so directories
            # and other non-files are never offered for deletion below.
            to_be_copied.add(filename)

# Iterates through copied files
for original_file in to_be_copied:
    recipients = []
    # Iterates through potential recipients
    for pattern, dest_dir in user_data:
        complete_name = os.path.join(dest_dir, original_file)
        if os.path.isfile(complete_name):
            recipients.append(pattern)
    print(original_file + ' sent to ' + str(len(recipients)) + ' people:')
    print(recipients)
    # Quick manual check, could be changed to checking if the recipients are correct
    print('Delete original file? (Y or N): ')
    delete = input()
    if delete == 'Y':
        os.remove(os.path.join(dir, original_file))

Python ValueError : too many values to unpack, solution?

I'm getting this error and I have no idea what it means. I can get the program to print the files from their values, but it's just one long, incoherent list. Now I'm trying to get it to print in an organized manner, and that's where the issues arise.
import os
def listfiles (path):
files = []
for dirName, subdirList, fileList in os.walk(path):
dir = dirName.replace(path, '')
for fname in fileList:
files.append(os.path.join(dir, fname))
return files
# Ask for the two directory trees to compare.
a = input('Enter a primary file path: ')
b = input('Enter a secondary file path: ')
# Collect the relative file paths found below each tree.
x = listfiles(a)
y = listfiles(b)  # was "llistfiles", which is not defined anywhere
# Paths that exist in only one of the two trees.
files_only_x = set(x) - set(y)
files_only_y = set(y) - set(x)
this next line of code is where python is saying the error is
for dirName, subdirList, fileList in files_only_x:
print ('Directory: %s' % dirName)
for fname in fileList:
print ('\%s' % fname)
Your files_only_x is a set of single values; your listfiles() function returns a list of strings, not of tuples with 3 values:
for fname in files_only_x:
print ('\\%s' % fname)
You built files as a list of strings, therefore the loop in your 2nd code block is wrong as it suggests files is list of 3-value tuples.
Look at the data flow:
You call listfiles() with a path. It collects all files below that path in a list.
(BTW, IMHO dir = dirName.replace(path, '') is dangerous. What happens if path is lib/ and you encounter a sub path lib/misc/collected/lib/whatever? While this path does not make much sense, it might have been created...)
You return this list from listfiles() and then convert them into sets.
If you try to iterate over these sets, you get one path per iteration step.

Iterate over 2 files in each folder and compare them

I compare two text files and print out the results to a 3rd file. I am trying to make it so the script i'm running would iterate over all of the folders that have two text files in them, in the CWD of the script.
What i have so far:
import os
import glob
path = './'
for infile in glob.glob( os.path.join(path, '*.*') ):
print('current file is: ' + infile)
with open (f1+'.txt', 'r') as fin1, open(f2+'.txt', 'r') as fin2:
Would this be a good way to start the iteration process?
It's not the most clear code but it gets the job done. However, i'm pretty sure i need to take the logic out of the read / write methods but i'm not sure where to start.
What i'm basically trying to do is have a script iterate over all of the folders in its CWD, open each folder, compare the two text files inside, write a 3rd text file to the same folder, then move on to the next.
Another method i have tried is as follows:
import os
rootDir = 'C:\\Python27\\test'
for dirName, subdirList, fileList in os.walk(rootDir):
print('Found directory: %s' % dirName)
for fname in fileList:
print('\t%s' % fname)
And this outputs the following (to give you a better example of the file structure:
Found directory: C:\Python27\test
test.py
Found directory: C:\Python27\test\asdd
asd1.txt
asd2.txt
Found directory: C:\Python27\test\chro
ch1.txt
ch2.txt
Found directory: C:\Python27\test\hway
hw1.txt
hw2.txt
Would it be wise to put the compare logic under the for fname in fileList? How do i make sure it compares the two text files inside the specific folder and not with other fnames in the fileList?
This is the full code that I am trying to add this functionality into. I apologize for the Frankenstein nature of it; I am still working on a refined version, but it does not work yet.
from collections import defaultdict
from operator import itemgetter
from itertools import groupby
from collections import deque
import os
# Three-stage pipeline: load() regroups two input files by frame, compare()
# writes the differences between the regrouped files, final() turns a result
# file into one .avs script per id.
# NOTE(review): this is Python 2 code - it relies on dict.iteritems(), a
# tuple-parameter lambda and on map() returning an indexable list.
class avs_auto:
# Runs the three stages in order.
# NOTE(review): compare() writes to fileOut1 + 'x' + fileOut2 + '.txt' while
# final() reads result_file + '.txt'; nothing here ties the two together, so
# the caller presumably has to pass a matching result_file - confirm.
def load_and_compare(self, input_file1, input_file2, output_file1, output_file2, result_file):
self.load(input_file1, input_file2, output_file1, output_file2)
self.compare(output_file1, output_file2)
self.final(result_file)
# Reads fileIn1 + '.txt' and fileIn2 + '.txt', regroups their rows by frame
# and writes the result to fileOut1 + '.txt' and fileOut2 + '.txt'.
def load(self, fileIn1, fileIn2, fileOut1, fileOut2):
with open(fileIn1+'.txt') as fin1, open(fileIn2+'.txt') as fin2:
frame_rects = defaultdict(list)
# Each line is split on whitespace: field 0 is the id, field 2 the frame and
# fields 3-6 the rect; field 1 is not used.
for row in (map(str, line.split()) for line in fin1):
id, frame, rect = row[0], row[2], [row[3],row[4],row[5],row[6]]
# Per frame the list alternates id, rect, id, rect, ...
frame_rects[frame].append(id)
frame_rects[frame].append(rect)
frame_rects2 = defaultdict(list)
for row in (map(str, line.split()) for line in fin2):
id, frame, rect = row[0], row[2], [row[3],row[4],row[5],row[6]]
frame_rects2[frame].append(id)
frame_rects2[frame].append(rect)
with open(fileOut1+'.txt', 'w') as fout1, open(fileOut2+'.txt', 'w') as fout2:
# One line per frame, written as "{frame:rects}" in sorted key order.
for frame, rects in sorted(frame_rects.iteritems()):
fout1.write('{{{}:{}}}\n'.format(frame, rects))
for frame, rects in sorted(frame_rects2.iteritems()):
fout2.write('{{{}:{}}}\n'.format(frame, rects))
# Finds the lines of fileOut1 + '.txt' that do not occur in fileOut2 + '.txt'
# and writes, per id, the first and last frame of each run to
# fileOut1 + 'x' + fileOut2 + '.txt'.
def compare(self, fileOut1, fileOut2):
with open(fileOut1+'.txt', 'r') as fin1:
with open(fileOut2+'.txt', 'r') as fin2:
lines1 = fin1.readlines()
lines2 = fin2.readlines()
diff_lines = [l.strip() for l in lines1 if l not in lines2]
diffs = defaultdict(list)
with open(fileOut1+'x'+fileOut2+'.txt', 'w') as result_file:
for line in diff_lines:
# NOTE(review): eval() executes the line as a Python expression; that is
# unsafe if the file can contain anything other than the output of load().
d = eval(line)
for k in d:
# list_ids is assigned but never read.
list_ids = d[k]
# Even positions of d[k] hold the ids (see load), so this records the key k
# under every id that appears in the differing line.
for i in range(0, len(d[k]), 2):
diffs[d[k][i]].append(k)
for id_ in diffs:
diffs[id_].sort()
# index - value stays constant while the sorted values increase by one per
# position, so each group is one such run.
for k, g in groupby(enumerate(diffs[id_]), lambda (i, x): i - x):
group = map(itemgetter(1), g)
result_file.write('{0} {1} {2}\n'.format(id_, group[0], group[-1]))
# Reads result_file + '.txt' and writes one 'final/<first field>.avs' script
# per group of consecutive lines that share the same first field.
def final(self, result_file):
with open(result_file+'.txt', 'r') as fin:
lines = (line.split() for line in fin)
for k, g in groupby(lines, itemgetter(0)):
# fst is the first row of the group; lst is the last remaining row (the
# deque keeps only one element) or fst when the group has a single row.
fst = next(g)
lst = next(iter(deque(g, 1)), fst)
# NOTE(review): presumably the 'final' directory has to exist already -
# nothing in this class creates it.
with open('final/{}.avs'.format(k), 'w') as fout:
fout.write('video0=ImageSource("old\%06d.jpeg", {}-3, {}+3, 15)\n'.format(fst[1], lst[2]))
fout.write('video1=ImageSource("new\%06d.jpeg", {}-3, {}+3, 15)\n'.format(fst[1], lst[2]))
fout.write('video0=BilinearResize(video0,640,480)\n')
fout.write('video1=BilinearResize(video1,640,480)\n')
fout.write('StackHorizontal(video0,video1)\n')
fout.write('Subtitle("ID: {}", font="arial", size=30, align=8)'.format(k))
using the load_and_compare() function, i define two input text files, two output text files, a file for the comparison results and a final phase that writes many files for all of the differences.
What i am trying to do is have this whole class run on the current working directory and go through every sub folder, compare the two text files, and write everything into the same folder, specifically the final() results.
You can indeed use os.walk(), since that already separates the directories from the files. You only need the directories it returns, because that's where you're looking for your 2 specific files.
You could also use os.listdir() but that returns directories as well files in the same list, so you would have to check for directories yourself.
Either way, once you have the directories, you iterate over them (for subdir in dirnames) and join the various path components you have: The dirpath, the subdir name that you got from iterating over the list and your filename.
Assuming there are also some directories that don't have the specific 2 files, it's a good idea to wrap the open() calls in a try..except block and thus ignore the directories where one of the files (or both of them) doesn't exist.
Finally, if you used os.walk(), you can easily choose if you only want to go into directories one level deep or walk the whole depth of the tree. In the former case, you just clear the dirnames list by dirnames[:] = []. Note that dirnames = [] wouldn't work, since that would just create a new empty list and put that reference into the variable instead of clearing the old list.
Replace the print("do something ...") with your program logic.
#!/usr/bin/env python
import errno
import os
# Base names (without ".txt") of the two files expected in every sub-directory.
f1 = "test1"
f2 = "test2"
path = "."
for dirpath, dirnames, _ in os.walk(path):
    for subdir in dirnames:
        # The tuple must be parenthesised: a bare "for f in f1, f2" inside a
        # comprehension is a syntax error on Python 3.
        filepath1, filepath2 = [os.path.join(dirpath, subdir, f + ".txt") for f in (f1, f2)]
        try:
            with open(filepath1, 'r') as fin1, open(filepath2, 'r') as fin2:
                print("do something with " + str(fin1) + " and " + str(fin2))
        except IOError as e:
            # ignore directories that don't contain the 2 files
            if e.errno != errno.ENOENT:
                # reraise exception if different from "file or directory doesn't exist"
                raise
    # comment the next line out if you want to traverse all subsubdirectories
    dirnames[:] = []
Edit:
Based on your comments, I hope I understand your question better now.
Try the following code snippet instead. The overall structure stays the same, only now I'm using the filenames returned by os.walk(). Unfortunately, that would also make it harder to do something like "go only into the subdirectories 1 level deep", so I hope walking the tree recursively is fine with you. If not, I'll have to add a little code later.
#!/usr/bin/env python
import fnmatch
import os
filter_pattern = "*.txt"
path = "."
for dirpath, dirnames, filenames in os.walk(path):
# comment this out if you don't want to filter
filenames = [fn for fn in filenames if fnmatch.fnmatch(fn, filter_pattern)]
if len(filenames) == 2:
# comment this out if you don't want the 2 filenames to be sorted
filenames.sort(key=str.lower)
filepath1, filepath2 = [os.path.join(dirpath, fn) for fn in filenames]
with open(filepath1, 'r') as fin1, open(filepath2, 'r') as fin2:
print("do something with " + str(fin1) + " and " + str(fin2))
I'm still not really sure what your program logic does, so you will have to interface the two yourself.
However, I noticed that you're adding the ".txt" extension to the file name explicitly all over your code, so depending on how you are going to use the snippet, you might or might not need to remove the ".txt" extension first before handing the filenames over. That would be achieved by inserting the following line after or before the sort:
filenames = [os.path.splitext(fn)[0] for fn in filenames]
Also, I still don't understand why you're using eval(). Do the text files contain python code? In any case, eval() should be avoided and be replaced by code that's more specific to the task at hand.
If it's a list of comma separated strings, use line.split(",") instead.
If there might be whitespace before or after the comma, use [word.strip() for word in line.split(",")] instead.
If it's a list of comma separated integers, use [int(num) for num in line.split(",")] instead - for floats it works analogously.
etc.

filter directory in python

I am trying to get filtered list of all Text and Python file, like below
from walkdir import filtered_walk, dir_paths, all_paths, file_paths
vdir=raw_input ("enter director :")
files = file_paths(filtered_walk(vdir, depth=0,included_files=['*.py', '*.txt']))
I want to:
know the total number of files found in given directory
I have tried options like : Number_of_files= len (files) or for n in files n=n+1 but all are failing as "files" is something called "generator" Object which I searched on python docs but couldn't make use of it
I also want to find a string e.g. "import sys" in the list of files found in above and store the file names having my search string in new file called "found.txt"
I believe this does what you want, if I misunderstood your specification, please let me know after you give this a test. I've hardcoded the directory searchdir, so you'll have to prompt for it.
import os
searchdir = r'C:\blabla'
searchstring = 'import sys'
def found_in_file(fname, searchstring):
    """Return True if any line of the file *fname* contains *searchstring*.

    The file is read line by line and reading stops at the first match.

    Args:
        fname: Path of the text file to scan.
        searchstring: Substring to look for.

    Returns:
        True when a matching line exists, otherwise False.
    """
    with open(fname) as infp:
        return any(searchstring in line for line in infp)
# Walk searchdir, count the .txt/.py files and log those containing
# searchstring to found.txt, one path per line.
with open('found.txt', 'w') as outfp:
    count = 0
    search_count = 0
    for root, dirs, files in os.walk(searchdir):
        for name in files:
            ext = os.path.splitext(name)[1]
            if ext in ('.txt', '.py'):
                count += 1
                full_name = os.path.join(root, name)
                if found_in_file(full_name, searchstring):
                    outfp.write(full_name + '\n')
                    search_count += 1
# Single-argument print(...) produces the same output on Python 2 and 3.
print('total number of files found %d' % count)
print('number of files with search string %d' % search_count)
Using with to open the file will also close the file automatically for you later.
A Python generator is a special kind of iterator. It yields one item after the other, without knowing in advance how many items there are. You can only know that at the end.
It should be ok, though, to do
# Count the items while consuming the generator once.
n = 0
for item in files:
    n += 1
    do_something_with(item)  # the loop variable is "item", not "items"
# %-formatting keeps the output identical on Python 2 and Python 3.
print("I had %d items." % n)
You can think of a generator (or generally, an iterator) as a list that gives you one item at a time. (NO, it is not a list). So, you cannot count how much items it will give you unless you go through them all, because you have to take them one by one. (This is just a basic idea, now you should be able to understand the docs, and I'm sure there are lots of questions here about them too).
Now, for your case, you used a not-so-wrong approach:
count = 0
for filename in files:
count += 1
What you were doing wrong was taking the loop variable n and incrementing it, but n there is the filename! Incrementing a filename makes no sense, and it raises an exception too.
Once you have these filenames, you have to open each individual file, read it, search for your string and return the filename.
def contains(filename, match):
    """Return True if any line of the file *filename* contains *match*.

    Args:
        filename: Path of the text file to scan.
        match: Substring to look for.

    Returns:
        True on the first line that contains *match*, False if none does.
    """
    with open(filename, 'r') as f:
        for line in f:
            # Test the line that was just read; the file object itself has
            # no find() method.
            if match in line:
                return True
    return False
# Use either the loop or the one-liner, not both: if files is a generator it
# can only be consumed once.
match_files = []
for filename in files:
    if contains(filename, "import sys"):
        match_files.append(filename)  # was "match_file", an undefined name
# or a one-liner:
match_files = [f for f in files if contains(f, "import sys")]
Now, as an example of a generator (don't read this before you read the docs):
def matching(filenames):
    """Yield each name from *filenames* whose file contains "import sys".

    Args:
        filenames: Iterable of file paths to check.

    Yields:
        The paths for which ``contains(path, "import sys")`` is true.
    """
    # Iterate the argument, not the global "files", so the generator works
    # for whatever iterable the caller passes in.
    for filename in filenames:
        if contains(filename, "import sys"):
            # feed the names one by one, you are not storing them in a list
            yield filename
# usage:
for f in matching(files):
do_something_with_the_files_that_match_without_storing_them_all_in_a_list()
You should try os.walk
import os
# raw_input is the Python 2 spelling; use input() on Python 3.
dir = raw_input("Enter Dir:")
# endswith accepts a tuple of suffixes. The old test file[-3:] compared three
# characters against ".txt" (four characters), so .txt files were never counted.
files = [file for path, dirname, filenames in os.walk(dir) for file in filenames if file.endswith((".py", ".txt"))]
nfiles = len(files)
print(nfiles)
For searching for a string in a file look at Search for string in txt file Python
Combining both these your code would be something like
import os
import mmap
# raw_input is the Python 2 spelling; use input() on Python 3.
dir = raw_input("Enter Dir:")
print("Directory %s" % (dir))
search_str = "import sys"
count = 0
search_count = 0
# "with" closes found.txt and every scanned file even if an error occurs.
with open("found.txt", "w") as write_file:
    for dirpath, dirnames, filenames in os.walk(dir):
        for file in filenames:
            # splitext looks at the real extension, so a file that is merely
            # named "py" or "txt" is no longer picked up.
            if os.path.splitext(file)[1] in (".py", ".txt"):
                count += 1
                print("%s %s" % (dirpath, file))
                full_path = os.path.join(dirpath, file)
                with open(full_path) as f:
                    if search_str in f.read():
                        search_count += 1
                        # One path per line; without the newline all matches
                        # run together in found.txt.
                        write_file.write(full_path + "\n")
print("Number of files: %s" % (count))
print("Number of files containing string: %s" % (search_count))

Categories

Resources