Related
I want to recursively walk through a directory, find the files that match any of the strings in a given list, and then copy these files to another folder. I thought the any() function would accomplish this, but I get a TypeError that it expected a string, not a list. Is there a more elegant way to do this?
string_to_match = ['apple.txt', 'pear.txt', 'banana.txt']
for root, subdirs, filename in os.walk(source_dir)
if any(s in filename for s in string_to_match):
shutil.copy(filename, destination_dir)
print(filename)
I know glob.glob can work well for finding files that match a specific string or pattern, but I haven't been able to find an answer that allows for multiple matches.
You can just use in
Example:
string_to_match = ['apple.txt', 'pear.txt', 'banana.txt']
for root, subdirs, filename in os.walk(source_dir)
if filename in string_to_match:
shutil.copy(filename, destination_dir)
print(filename)
Here also a glob version:
import glob
import itertools
root_dir = '/home/user'
files = ['apple.txt', 'pear.txt', 'banana.txt']
files_found = list(itertools.chain.from_iterable([glob.glob(f'{root_dir}/**/{f}', recursive=True) for f in files])
for f in files_found:
shutil.copy(f, destination_dir)
First, find an element in list takes O(n), so just convert it to a set which takes O(1).
Then you can do like this
string_to_match = {'apple.txt', 'pear.txt', 'banana.txt'}
for filename in os.listdir(source_dir):
if filename in string_to_match:
shutil.copy(filename, destination_dir)
print(filename)
I would use sets
def find_names(names,source_dir):
names = set(names)
# note os.walk will walk the subfolders too
# if you just want that source_dir use `strings_to_match.intersection(os.listdir(sourcedir))`
for root,subdirs,fnames in os.walk(sourcedir):
for matched_name in strings_to_match.intersection(fnames):
yield os.path.join(root,matched_name)
strings_to_match = ['apple.txt', 'pear.txt', 'banana.txt']
for match in find_names(strings_to_match,'/path/to/start'):
print("Match:", match)
[edited] typo intersection not intersect
(you could alternatively just pass in a set {'a','b','c'} instead of a list ['a','b','c'] and skip the conversion to a set)
here is an alternative that only looks in the source dir (not children)
def find_names_in_folder(names,source_dir):
return [os.path.join(source_dir,n) for n in set(names).intersection(os.listdir(source_dir))]
I compare two text files and print out the results to a 3rd file. I am trying to make it so the script i'm running would iterate over all of the folders that have two text files in them, in the CWD of the script.
What i have so far:
import os
import glob
path = './'
for infile in glob.glob( os.path.join(path, '*.*') ):
print('current file is: ' + infile)
with open (f1+'.txt', 'r') as fin1, open(f2+'.txt', 'r') as fin2:
Would this be a good way to start the iteration process?
It's not the most clear code but it gets the job done. However, i'm pretty sure i need to take the logic out of the read / write methods but i'm not sure where to start.
What i'm basically trying to do is have a script iterate over all of the folders in its CWD, open each folder, compare the two text files inside, write a 3rd text file to the same folder, then move on to the next.
Another method i have tried is as follows:
import os
rootDir = 'C:\\Python27\\test'
for dirName, subdirList, fileList in os.walk(rootDir):
print('Found directory: %s' % dirName)
for fname in fileList:
print('\t%s' % fname)
And this outputs the following (to give you a better example of the file structure:
Found directory: C:\Python27\test
test.py
Found directory: C:\Python27\test\asdd
asd1.txt
asd2.txt
Found directory: C:\Python27\test\chro
ch1.txt
ch2.txt
Found directory: C:\Python27\test\hway
hw1.txt
hw2.txt
Would it be wise to put the compare logic under the for fname in fileList? How do i make sure it compares the two text files inside the specific folder and not with other fnames in the fileList?
This is the full code that i am trying to add this functionality into. I appologize for the Frankenstein nature of it but i am still working on a refined version but it does not work yet.
from collections import defaultdict
from operator import itemgetter
from itertools import groupby
from collections import deque
import os
class avs_auto:
def load_and_compare(self, input_file1, input_file2, output_file1, output_file2, result_file):
self.load(input_file1, input_file2, output_file1, output_file2)
self.compare(output_file1, output_file2)
self.final(result_file)
def load(self, fileIn1, fileIn2, fileOut1, fileOut2):
with open(fileIn1+'.txt') as fin1, open(fileIn2+'.txt') as fin2:
frame_rects = defaultdict(list)
for row in (map(str, line.split()) for line in fin1):
id, frame, rect = row[0], row[2], [row[3],row[4],row[5],row[6]]
frame_rects[frame].append(id)
frame_rects[frame].append(rect)
frame_rects2 = defaultdict(list)
for row in (map(str, line.split()) for line in fin2):
id, frame, rect = row[0], row[2], [row[3],row[4],row[5],row[6]]
frame_rects2[frame].append(id)
frame_rects2[frame].append(rect)
with open(fileOut1+'.txt', 'w') as fout1, open(fileOut2+'.txt', 'w') as fout2:
for frame, rects in sorted(frame_rects.iteritems()):
fout1.write('{{{}:{}}}\n'.format(frame, rects))
for frame, rects in sorted(frame_rects2.iteritems()):
fout2.write('{{{}:{}}}\n'.format(frame, rects))
def compare(self, fileOut1, fileOut2):
with open(fileOut1+'.txt', 'r') as fin1:
with open(fileOut2+'.txt', 'r') as fin2:
lines1 = fin1.readlines()
lines2 = fin2.readlines()
diff_lines = [l.strip() for l in lines1 if l not in lines2]
diffs = defaultdict(list)
with open(fileOut1+'x'+fileOut2+'.txt', 'w') as result_file:
for line in diff_lines:
d = eval(line)
for k in d:
list_ids = d[k]
for i in range(0, len(d[k]), 2):
diffs[d[k][i]].append(k)
for id_ in diffs:
diffs[id_].sort()
for k, g in groupby(enumerate(diffs[id_]), lambda (i, x): i - x):
group = map(itemgetter(1), g)
result_file.write('{0} {1} {2}\n'.format(id_, group[0], group[-1]))
def final(self, result_file):
with open(result_file+'.txt', 'r') as fin:
lines = (line.split() for line in fin)
for k, g in groupby(lines, itemgetter(0)):
fst = next(g)
lst = next(iter(deque(g, 1)), fst)
with open('final/{}.avs'.format(k), 'w') as fout:
fout.write('video0=ImageSource("old\%06d.jpeg", {}-3, {}+3, 15)\n'.format(fst[1], lst[2]))
fout.write('video1=ImageSource("new\%06d.jpeg", {}-3, {}+3, 15)\n'.format(fst[1], lst[2]))
fout.write('video0=BilinearResize(video0,640,480)\n')
fout.write('video1=BilinearResize(video1,640,480)\n')
fout.write('StackHorizontal(video0,video1)\n')
fout.write('Subtitle("ID: {}", font="arial", size=30, align=8)'.format(k))
using the load_and_compare() function, i define two input text files, two output text files, a file for the comparison results and a final phase that writes many files for all of the differences.
What i am trying to do is have this whole class run on the current working directory and go through every sub folder, compare the two text files, and write everything into the same folder, specifically the final() results.
You can indeed use os.walk(), since that already separates the directories from the files. You only need the directories it returns, because that's where you're looking for your 2 specific files.
You could also use os.listdir() but that returns directories as well files in the same list, so you would have to check for directories yourself.
Either way, once you have the directories, you iterate over them (for subdir in dirnames) and join the various path components you have: The dirpath, the subdir name that you got from iterating over the list and your filename.
Assuming there are also some directories that don't have the specific 2 files, it's a good idea to wrap the open() calls in a try..except block and thus ignore the directories where one of the files (or both of them) doesn't exist.
Finally, if you used os.walk(), you can easily choose if you only want to go into directories one level deep or walk the whole depth of the tree. In the former case, you just clear the dirnames list by dirnames[:] = []. Note that dirnames = [] wouldn't work, since that would just create a new empty list and put that reference into the variable instead of clearing the old list.
Replace the print("do something ...") with your program logic.
#!/usr/bin/env python
import errno
import os
f1 = "test1"
f2 = "test2"
path = "."
for dirpath, dirnames, _ in os.walk(path):
for subdir in dirnames:
filepath1, filepath2 = [os.path.join(dirpath, subdir, f + ".txt") for f in f1, f2]
try:
with open(filepath1, 'r') as fin1, open(filepath2, 'r') as fin2:
print("do something with " + str(fin1) + " and " + str(fin2))
except IOError as e:
# ignore directiories that don't contain the 2 files
if e.errno != errno.ENOENT:
# reraise exception if different from "file or directory doesn't exist"
raise
# comment the next line out if you want to traverse all subsubdirectories
dirnames[:] = []
Edit:
Based on your comments, I hope I understand your question better now.
Try the following code snippet instead. The overall structure stays the same, only now I'm using the returned filenames of os.walk(). Unfortunately, that would also make it harder to do something like "go only into the subdirectories 1 level deep", so I hope walking the tree recursively is fine with you. If not, I'll have to add a little code to later.
#!/usr/bin/env python
import fnmatch
import os
filter_pattern = "*.txt"
path = "."
for dirpath, dirnames, filenames in os.walk(path):
# comment this out if you don't want to filter
filenames = [fn for fn in filenames if fnmatch.fnmatch(fn, filter_pattern)]
if len(filenames) == 2:
# comment this out if you don't want the 2 filenames to be sorted
filenames.sort(key=str.lower)
filepath1, filepath2 = [os.path.join(dirpath, fn) for fn in filenames]
with open(filepath1, 'r') as fin1, open(filepath2, 'r') as fin2:
print("do something with " + str(fin1) + " and " + str(fin2))
I'm still not really sure what your program logic does, so you will have to interface the two yourself.
However, I noticed that you're adding the ".txt" extension to the file name explicitly all over your code, so depending on how you are going to use the snippet, you might or might not need to remove the ".txt" extension first before handing the filenames over. That would be achieved by inserting the following line after or before the sort:
filenames = [os.path.splitext(fn)[0] for fn in filenames]
Also, I still don't understand why you're using eval(). Do the text files contain python code? In any case, eval() should be avoided and be replaced by code that's more specific to the task at hand.
If it's a list of comma separated strings, use line.split(",") instead.
If there might be whitespace before or after the comma, use [word.strip() for word in line.split(",")] instead.
If it's a list of comma separated integers, use [int(num) for num in line.split(",")] instead - for floats it works analogously.
etc.
Hi. I was wondering if there is a way to add items into a list recursively. The function is supposed to print the path names of the file that matches with fname. So fname is the name of the file and the path is the folder in which the file is located. If there are folders inside the path folder it will go inside and look for the fname file. so far I am able to find all the files. But I am not able to append the list recursively.
def findAll(fname, path):
lst= []
for item in os.listdir(path):
n = os.path.join(path, item)
try:
if item == fname:
lst.append(n)
except:
findAll(fname,n)
return lst
Normally, I wouldn't give a full solution because this smells like homework (which is also why I'm avoiding os.walk), but since you have posted your attempt, here's an explanation and a solution:
For one thing, every time you call findAll, you initialize lst. Sure, you return it at the end, but you don't do anything with the return value, so the effect lst.append is contained within the recursion and is therefore not visible outside. Let me try to draw diagram to explain this (with one level of recursion):
+--------------------------------------------------+
|Outer Level: |
| |
|`lst = []` |
|found file f1 with name fname |
|`lst.append(f1)` |
|+------------------------------------------------+|
||Inner Level ||
|| ||
||`lst=[]` ||
||found file f2 with name fname ||
||`lst.append(f2)` ||
||`return lst` ||
|+------------------------------------------------+|
|a list is returned from recursive call, |
|but not assigned to a variable. |
|Therefore, `lst` remains unchanged |
+--------------------------------------------------+
There are a couple of ways by which you can fix this:
move lst to a scope outside findAll (personally, this is what I would do)
use the return value from the recursive call to modify lst
move lst to a scope outside findAll
lst= []
def findAll(fname, path):
global lst
for item in os.listdir(path):
n = os.path.join(path, item)
try: # really though, you don't need to use try/except here
if item == fname:
lst.append(n)
else:
findAll(fname,n)
except:
pass
After findAll has terminated, lst will contain the values you want
use the return value from the recursive call to modify lst
def findAll(fname, path, answer=None):
if answer == None:
answer = []
for item in os.listdir(path):
n = os.path.join(path, item)
try:
if item == fname:
answer += [n]
except:
findAll(fname,n, answer)
return answer
Hope this helps
PS: of course, the non-homework way to do this would be to use os.walk:
answer = []
def findAll(fname, dirpath):
dirpath, dirnames, filenames = os.walk(dirpath)
for filename in filenames:
if filename == fname:
answer.append(os.path.join(dirpath, filename))
for dirname in dirnames:
findAll(fname, os.path.join(dirpath, dirname))
# now, answer contains all the required filepaths
EDIT: OP asked for a version that doesn't use global variables:
def findAll(fname, root, answer=None):
if answer == None:
answer = []
for entry in os.listdir(root):
if os.path.isdir(os.path.join(root, entry)):
answer += findAll(fname, os.path.join(root, entry))
else:
if entry == fname:
answer.append(os.path.join(root, entry))
return answer
you need to extend your list with your recursive call
list.extend(findAll(fname,n))
also you can check if something is a directory with os.path.isdir(n)
but I think you have more problems than that with your script
afaik listdir just returns names , not the path of the directory ....
so you will need to call findAll(fname,os.path.join(path,n))
Not related to the question per se but I believe that os.walk would help you out:
allFiles = []
for root, dirs, files in os.walk(basedir):
[allFiles.append(file) for file in files]
Check out help(os.walk), it comes with a great example on how to use this function.
try/except is used incorrectly in your code. except clause is executed only if there is an error. Also you don't use the returned value from findAll(). You could skip creating a list inside the function and just yield found items lazily instead:
import os
def findAll(filename, rootdir):
for item in os.listdir(rootdir):
path = os.path.join(rootdir, item)
if not os.path.isdir(path):
if item == filename: # don't select dirs
yield path
else: # path is a dir
try:
for found_path in findAll(filename, path):
yield found_path
except EnvironmentError:
pass # ignore errors
print(list(findAll('python', '/usr')))
Output
['/usr/bin/python']
if it is not homework you could use os.walk() to find the files:
import os
def find_all(filename, rootdir):
for dirpath, dirs, files in os.walk(rootdir):
for file in files:
if file == filename:
yield os.path.join(dirpath, file)
print(list(find_all('python', '/usr')))
Output
['/usr/bin/python']
It is the same output as expected.
If you're on a Unix based system you could use find with the subprocess module .. I would reckon this would be the fastest way to retrieve all paths matching a filename. You can then do a split() on the output to make it a list:
>>> import subprocess
>>> lst = subprocess.check_output('find . -name "*rst"', shell=True)
>>> print lst
./SphinxWorkspace/doc/chapter1.rst
./SphinxWorkspace/doc/index.rst
./SphinxWorkspace/doc/tables.rst
You can always split the command and avoid the shell=True
Checkout: http://docs.python.org/2/library/subprocess.html#using-the-subprocess-module
.. Hope this helps!
I need to iterate through a folder and find every instance where the filenames are identical (except for extension) and then zip (preferably using tarfile) each of these into one file.
So I have 5 files named: "example1" each with different file extensions. I need to zip them up together and output them as "example1.tar" or something similar.
This would be easy enough with a simple for loop such as:
tar = tarfile.open('example1.tar',"w")
for output in glob ('example1*'):
tar.add(output)
tar.close()
however, there are 300 "example" files and I need to iterate through each one and their associated 5 files in order to make this work. This is way over my head. Any advice greatly appreciated.
The pattern you're describing generalizes to MapReduce. I found a simple implementation of MapReduce online, from which an even-simpler version is:
def map_reduce(data, mapper, reducer):
d = {}
for elem in data:
key, value = mapper(elem)
d.setdefault(key, []).append(value)
for key, grp in d.items():
d[key] = reducer(key, grp)
return d
You want to group all files by their name without the extension, which you can get from os.path.splitext(fname)[0]. Then, you want to make a tarball out of each group by using the tarfile module. In code, that is:
import os
import tarfile
def make_tar(basename, files):
tar = tarfile.open(basename + '.tar', 'w')
for f in files:
tar.add(f)
tar.close()
map_reduce(os.listdir('.'),
lambda x: (os.path.splitext(x)[0], x),
make_tar)
Edit: If you want to group files in different ways, you just need to modify the second argument to map_reduce. The code above groups files that have the same value for the expression os.path.splitext(x)[0]. So to group by the base file name with all the extensions stripped off, you could replace that expression with strip_all_ext(x) and add:
def strip_all_ext(path):
head, tail = os.path.split(path)
basename = tail.split(os.extsep)[0]
return os.path.join(head, basename)
You could do this:
list all files in the directory
create a dictionary where the basename is the key and all the extensions are values
then tar all the files by dictionary key
Something like this:
import os
import tarfile
from collections import defaultdict
myfiles = os.listdir(".") # List of all files
totar = defaultdict(list)
# now fill the defaultdict with entries; basename as keys, extensions as values
for name in myfiles:
base, ext = os.path.splitext(name)
totar[base].append(ext)
# iterate through all the basenames
for base in totar:
files = [base+ext for ext in totar[base]]
# now tar all the files in the list "files"
tar = tarfile.open(base+".tar", "w")
for item in files:
tar.add(item)
tar.close()
You have to problems. Solve the separately.
Finding matching names. Use a collections.defaultict
Creating tar files after you find the matching names. You've got that pretty well covered.
So. Solve problem 1 first.
Use glob to get all the names. Use os.path.basename to split the path and basename. Use os.path.splitext to split the name and extension.
A dictionary of lists can be used to save all files that have the same name.
Is that what you're doing in part 1?
Part 2 is putting the files into tar archives. For that, you've got most of the code you need.
Try using the glob module: http://docs.python.org/library/glob.html
#! /usr/bin/env python
import os
import tarfile
tarfiles = {}
for f in os.listdir ('files'):
prefix = f [:f.rfind ('.') ]
if prefix in tarfiles: tarfiles [prefix] += [f]
else: tarfiles [prefix] = [f]
for k, v in tarfiles.items ():
tf = tarfile.open ('%s.tar.gz' % k, 'w:gz')
for f in v: tf.addfile (tarfile.TarInfo (f), file ('files/%s' % f) )
tf.close ()
import os
import tarfile
allfiles = {}
for filename in os.listdir("."):
basename = '.'.join (filename.split(".")[:-1] )
if not basename in all_files:
allfiles[basename] = [filename]
else:
allfiles[basename].append(filename)
for basename, filenames in allfiles.items():
if len(filenames) < 2:
continue
tardata = tarfile.open(basename+".tar", "w")
for filename in filenames:
tardata.add(filename)
tardata.close()
I'm looking for a way to do a non-recursive os.walk() walk, just like os.listdir() works. But I need to return in the same way the os.walk() returns. Any idea?
Thank you in advance.
Add a break after the filenames for loop:
for root, dirs, filenames in os.walk(workdir):
for fileName in filenames:
print (fileName)
break #prevent descending into subfolders
This works because (by default) os.walk first lists the files in the requested folder and then goes into subfolders.
next(os.walk(...))
My a bit more parametrised solution would be this:
for root, dirs, files in os.walk(path):
if not recursive:
while len(dirs) > 0:
dirs.pop()
//some fancy code here using generated list
Edit: fixes, if/while issue. Thanks, #Dirk van Oosterbosch :}
Well what Kamiccolo meant was more in line with this:
for str_dirname, lst_subdirs, lst_files in os.walk(str_path):
if not bol_recursive:
while len(lst_subdirs) > 0:
lst_subdirs.pop()
Empty the directories list
for r, dirs, f in os.walk('/tmp/d'):
del dirs[:]
print(f)
Flexible Function for counting files:
You can set recursive searching and what types you want to look for. The default argument: file_types=("", ) looks for any file. The argument file_types=(".csv",".txt") would search for csv and txt files.
from os import walk as os_walk
def count_files(path, recurse=True, file_types = ("",)):
file_count = 0
iterator = os_walk(path) if recurse else ((next(os_walk(path))), )
for _, _, file_names in iterator:
for file_name in file_names:
file_count += 1 if file_name.endswith(file_types) else 0
return file_count