Find a file in python - python

I have a file that may be in a different place on each user's machine. Is there a way to implement a search for the file? A way that I can pass the file's name and the directory tree to search in?

os.walk is the answer, this will find the first match:
import os
def find(name, path):
    """Return the path of the first file named *name* found under *path*.

    The tree is walked with os.walk; the first directory that directly
    contains a file called *name* wins.  Returns None when nothing matches.
    """
    for root, _dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)
    return None
And this will find all matches:
def find_all(name, path):
    """Return the paths of every file named *name* found under *path*.

    One entry is produced per directory that directly contains a file
    called *name*; the list is empty when there is no match.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(path)
        if name in files
    ]
And this will match a pattern:
import os, fnmatch
def find(pattern, path):
    """Return the paths of all files under *path* whose name matches *pattern*.

    *pattern* is a shell-style wildcard (fnmatch syntax, e.g. ``'*.txt'``)
    and is tested against the bare file name, not the full path.
    """
    result = []
    for root, _dirs, files in os.walk(path):
        # fnmatch.filter applies the same matching rules as fnmatch.fnmatch
        # to the whole list of names in one call.
        result.extend(os.path.join(root, name)
                      for name in fnmatch.filter(files, pattern))
    return result
find('*.txt', '/path/to/dir')

In Python 3.4 or newer you can use pathlib to do recursive globbing:
>>> import pathlib
>>> sorted(pathlib.Path('.').glob('**/*.py'))
[PosixPath('build/lib/pathlib.py'),
PosixPath('docs/conf.py'),
PosixPath('pathlib.py'),
PosixPath('setup.py'),
PosixPath('test_pathlib.py')]
Reference: https://docs.python.org/3/library/pathlib.html#pathlib.Path.glob
In Python 3.5 or newer you can also do recursive globbing like this:
>>> import glob
>>> glob.glob('**/*.txt', recursive=True)
['2.txt', 'sub/3.txt']
Reference: https://docs.python.org/3/library/glob.html#glob.glob

I used a version of os.walk and on a larger directory got times around 3.5 sec. I tried two random solutions with no great improvement, then just did:
paths = [line[2:] for line in subprocess.check_output("find . -iname '*.txt'", shell=True).splitlines()]
While it's POSIX-only, I got 0.25 sec.
From this, I believe it's entirely possible to optimise the whole search considerably in a platform-independent way, but this is where I stopped the research.

If you are using Python on Ubuntu and you only want it to work on Ubuntu, a substantially faster way is to use the terminal's locate program like this.
import subprocess
def find_files(file_name):
    """Return the paths reported by the ``locate`` program for *file_name*.

    Runs ``locate file_name`` and returns one list entry per output line.
    The exit status of ``locate`` is not checked, so "no match" simply
    yields an empty list.
    """
    command = ['locate', file_name]
    output = subprocess.Popen(command, stdout=subprocess.PIPE).communicate()[0]
    output = output.decode()
    # splitlines() avoids the spurious trailing '' that split('\n') produces
    # for newline-terminated output.
    search_results = output.splitlines()
    return search_results
search_results is a list of the absolute file paths. This is 10,000's of times faster than the methods above and for one search I've done it was ~72,000 times faster.

If you are working with Python 2 you have a problem with infinite recursion on windows caused by self-referring symlinks.
This script will avoid following those. Note that this is windows-specific!
import os
from scandir import scandir
import ctypes
def is_sym_link(path):
# http://stackoverflow.com/a/35915819
FILE_ATTRIBUTE_REPARSE_POINT = 0x0400
return os.path.isdir(path) and (ctypes.windll.kernel32.GetFileAttributesW(unicode(path)) & FILE_ATTRIBUTE_REPARSE_POINT)
def find(base, filenames):
hits = []
def find_in_dir_subdir(direc):
content = scandir(direc)
for entry in content:
if entry.name in filenames:
hits.append(os.path.join(direc, entry.name))
elif entry.is_dir() and not is_sym_link(os.path.join(direc, entry.name)):
try:
find_in_dir_subdir(os.path.join(direc, entry.name))
except UnicodeDecodeError:
print "Could not resolve " + os.path.join(direc, entry.name)
continue
if not os.path.exists(base):
return
else:
find_in_dir_subdir(base)
return hits
It returns a list with all paths that point to files in the filenames list.
Usage:
find("C:\\", ["file1.abc", "file2.abc", "file3.abc", "file4.abc", "file5.abc"])

Below we use a boolean "first" argument to switch between first match and all matches (a default which is equivalent to "find . -name file"):
import os
def find(root, file, first=False):
    """Print every directory under *root* that contains a file named *file*.

    Each hit is printed as ``"<file> : <directory>"``.  When *first* is
    true the walk stops after the first hit; otherwise all hits are shown.
    """
    for directory, _subdirs, filenames in os.walk(root):
        if file in filenames:
            print("{0} : {1}".format(file, directory))
            if first:
                break

The answer is very similar to existing ones, but slightly optimized.
So you can find any files or folders by pattern:
def iter_all(pattern, path):
return (
os.path.join(root, entry)
for root, dirs, files in os.walk(path)
for entry in dirs + files
if pattern.match(entry)
)
either by substring:
def iter_all(substring, path):
return (
os.path.join(root, entry)
for root, dirs, files in os.walk(path)
for entry in dirs + files
if substring in entry
)
or using a predicate:
def iter_all(predicate, path):
return (
os.path.join(root, entry)
for root, dirs, files in os.walk(path)
for entry in dirs + files
if predicate(entry)
)
to search only files or only folders - replace “dirs + files”, for example, with only “dirs” or only “files”, depending on what you need.
Regards.

SARose's answer worked for me until I updated from Ubuntu 20.04 LTS. The slight change I made to his code makes it work on the latest Ubuntu release.
import subprocess
def find_files(file_name):
    """Return the paths reported by ``locate`` for *file_name*, run via the shell.

    The command is executed with ``shell=True``, so *file_name* is quoted
    with shlex.quote: names containing spaces or shell metacharacters are
    passed to ``locate`` as a single literal argument instead of being
    interpreted by the shell.
    """
    import shlex

    command = 'locate ' + shlex.quote(file_name)
    output = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()[0]
    output = output.decode()
    # splitlines() avoids the spurious trailing '' that split('\n') produces
    # for newline-terminated output.
    search_results = output.splitlines()
    return search_results

#F.M.F's answers has a few problems in this version, so I made a few adjustments to make it work.
import os
from os import scandir
import ctypes
def is_sym_link(path):
# http://stackoverflow.com/a/35915819
FILE_ATTRIBUTE_REPARSE_POINT = 0x0400
return os.path.isdir(path) and (ctypes.windll.kernel32.GetFileAttributesW(str(path)) & FILE_ATTRIBUTE_REPARSE_POINT)
def find(base, filenames):
    """Collect the paths under *base* whose entry name is in *filenames*.

    *filenames* must be a collection of names (list, tuple, set).  The test
    is ``entry.name in filenames``, so passing a bare string degrades to a
    substring check against that string.

    Directories whose name matches are reported but not descended into;
    other directories are searched recursively unless is_sym_link() flags
    them.  Returns the list of hits, or None when *base* does not exist.
    """
    hits = []

    def find_in_dir_subdir(direc):
        # The context manager closes the scandir iterator as soon as the
        # directory has been processed instead of leaving it to the GC.
        with scandir(direc) as content:
            for entry in content:
                full_path = os.path.join(direc, entry.name)
                if entry.name in filenames:
                    hits.append(full_path)
                elif entry.is_dir() and not is_sym_link(full_path):
                    try:
                        find_in_dir_subdir(full_path)
                    except UnicodeDecodeError:
                        print("Could not resolve " + full_path)
                        continue
                    except PermissionError:
                        print("Skipped " + full_path + ". I lacked permission to navigate")
                        continue

    if not os.path.exists(base):
        return
    else:
        find_in_dir_subdir(base)
    return hits
unicode() was changed to str() in Python 3, so I made that adjustment (line 8)
I also added an exception handler for PermissionError (the final except clause in find_in_dir_subdir). This way, the program won't stop if it finds a folder it can't access.
Finally, I would like to give a little warning. When running the program, even if you are looking for a single file/directory, make sure you pass it as a list. Otherwise, you will get a lot of answers that not necessarily match your search.
example of use:
find("C:\\", ["Python", "Homework"])
or
find("C:\\", ["Homework"])
but, for example: find("C:\\", "Homework") will give un-wanted answers.
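This happens because the check is `entry.name in filenames`: when filenames is a string rather than a list, `in` performs a substring test, so every entry whose name occurs anywhere inside "Homework" (for example "Home" or "work") is reported as a match. Again, this is not my code and I just made the adjustments I needed to make it work. All credit should go to #F.M.F.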

Related

Opening files with a "wildcard" in python [duplicate]

This is what I have:
glob(os.path.join('src','*.c'))
but I want to search the subfolders of src. Something like this would work:
glob(os.path.join('src','*.c'))
glob(os.path.join('src','*','*.c'))
glob(os.path.join('src','*','*','*.c'))
glob(os.path.join('src','*','*','*','*.c'))
But this is obviously limited and clunky.
pathlib.Path.rglob
Use pathlib.Path.rglob from the pathlib module, which was introduced in Python 3.4.
from pathlib import Path
for path in Path('src').rglob('*.c'):
print(path.name)
If you don't want to use pathlib, you can use glob.glob('**/*.c', recursive=True); don't forget the recursive keyword parameter, and be aware that it can take an inordinate amount of time on large directories.
For cases where matching files beginning with a dot (.); like files in the current directory or hidden files on Unix based system, use the os.walk solution below.
os.walk
For older Python versions, use os.walk to recursively walk a directory and fnmatch.filter to match against a simple expression:
import fnmatch
import os
matches = []
for root, dirnames, filenames in os.walk('src'):
for filename in fnmatch.filter(filenames, '*.c'):
matches.append(os.path.join(root, filename))
For python >= 3.5 you can use **, recursive=True :
import glob
for f in glob.glob('/path/**/*.c', recursive=True):
print(f)
If recursive is True (default is False), the pattern ** will match any files and zero
or more directories and subdirectories. If the pattern is followed by
an os.sep, only directories and subdirectories match.
Python 3 Demo
Similar to other solutions, but using fnmatch.fnmatch instead of glob, since os.walk already listed the filenames:
import os, fnmatch
def find_files(directory, pattern):
    """Yield the path of every file under *directory* matching *pattern*.

    *pattern* is a shell-style wildcard (fnmatch syntax) tested against the
    bare file name.  Paths are produced lazily, one at a time, as the tree
    is walked.
    """
    for root, _dirs, files in os.walk(directory):
        for basename in fnmatch.filter(files, pattern):
            yield os.path.join(root, basename)


for filename in find_files('src', '*.c'):
    print('Found C source:', filename)
Also, using a generator allows you to process each file as it is found, instead of finding all the files and then processing them.
I've modified the glob module to support ** for recursive globbing, e.g:
>>> import glob2
>>> all_header_files = glob2.glob('src/**/*.c')
https://github.com/miracle2k/python-glob2/
Useful when you want to provide your users with the ability to use the ** syntax, and thus os.walk() alone is not good enough.
Starting with Python 3.4, one can use the glob() method of one of the Path classes in the new pathlib module, which supports ** wildcards. For example:
from pathlib import Path
for file_path in Path('src').glob('**/*.c'):
print(file_path) # do whatever you need with these files
Update:
Starting with Python 3.5, the same syntax is also supported by glob.glob().
import os
import fnmatch
def recursive_glob(treeroot, pattern):
    """Return the paths of all files under *treeroot* matching *pattern*.

    *pattern* is a shell-style wildcard (fnmatch syntax) applied to each
    file's bare name in every directory visited by os.walk.
    """
    results = []
    for base, _dirs, files in os.walk(treeroot):
        goodfiles = fnmatch.filter(files, pattern)
        results.extend(os.path.join(base, f) for f in goodfiles)
    return results
fnmatch gives you exactly the same patterns as glob, so this is really an excellent replacement for glob.glob with very close semantics. An iterative version (e.g. a generator), IOW a replacement for glob.iglob, is a trivial adaptation (just yield the intermediate results as you go, instead of extending a single results list to return at the end).
You'll want to use os.walk to collect filenames that match your criteria. For example:
import os
cfiles = []
for root, dirs, files in os.walk('src'):
for file in files:
if file.endswith('.c'):
cfiles.append(os.path.join(root, file))
Here's a solution with nested list comprehensions, os.walk and simple suffix matching instead of glob:
import os
cfiles = [os.path.join(root, filename)
for root, dirnames, filenames in os.walk('src')
for filename in filenames if filename.endswith('.c')]
It can be compressed to a one-liner:
import os;cfiles=[os.path.join(r,f) for r,d,fs in os.walk('src') for f in fs if f.endswith('.c')]
or generalized as a function:
import os
def recursive_glob(rootdir='.', suffix=''):
return [os.path.join(looproot, filename)
for looproot, _, filenames in os.walk(rootdir)
for filename in filenames if filename.endswith(suffix)]
cfiles = recursive_glob('src', '.c')
If you do need full glob style patterns, you can follow Alex's and
Bruno's example and use fnmatch:
import fnmatch
import os
def recursive_glob(rootdir='.', pattern='*'):
return [os.path.join(looproot, filename)
for looproot, _, filenames in os.walk(rootdir)
for filename in filenames
if fnmatch.fnmatch(filename, pattern)]
cfiles = recursive_glob('src', '*.c')
Consider pathlib.rglob().
This is like calling Path.glob() with "**/" added in front of the given relative pattern:
import pathlib
for p in pathlib.Path("src").rglob("*.c"):
print(p)
See also #taleinat's related post here and a similar post elsewhere.
import os, glob
for each in glob.glob('path/**/*.c', recursive=True):
print(f'Name with path: {each} \nName without path: {os.path.basename(each)}')
glob.glob('*.c') :matches all files ending in .c in current directory
glob.glob('*/*.c') :same as 3
glob.glob('**/*.c') :matches all files ending in .c in the immediate subdirectories only, but not in the current directory
glob.glob('*.c',recursive=True) :same as 1
glob.glob('*/*.c',recursive=True) :same as 3
glob.glob('**/*.c',recursive=True) :matches all files ending in .c in the current directory and in all subdirectories
In case this may interest anyone, I've profiled the top three proposed methods.
I have about ~500K files in the globbed folder (in total), and 2K files that match the desired pattern.
here's the (very basic) code
import glob
import json
import fnmatch
import os
from pathlib import Path
from time import time
def find_files_iglob():
return glob.iglob("./data/**/data.json", recursive=True)
def find_files_oswalk():
for root, dirnames, filenames in os.walk('data'):
for filename in fnmatch.filter(filenames, 'data.json'):
yield os.path.join(root, filename)
def find_files_rglob():
return Path('data').rglob('data.json')
t0 = time()
for f in find_files_oswalk(): pass
t1 = time()
for f in find_files_rglob(): pass
t2 = time()
for f in find_files_iglob(): pass
t3 = time()
print(t1-t0, t2-t1, t3-t2)
And the results I got were:
os_walk: ~3.6sec
rglob ~14.5sec
iglob: ~16.9sec
The platform: Ubuntu 16.04, x86_64 (core i7),
Recently I had to recover my pictures with the extension .jpg. I ran photorec and recovered 4579 directories containing 2.2 million files with a tremendous variety of extensions. With the script below I was able to select the 50133 files having a .jpg extension within minutes:
#!/usr/bin/env python2.7
# Copy every .jpg that sits exactly one directory level below src_dir
# into dst_dir.
import glob
import shutil
import os

src_dir = "/home/mustafa/Masaüstü/yedek"
dst_dir = "/home/mustafa/Genel/media"

# The "*" component matches each immediate subdirectory of src_dir.
for mediafile in glob.iglob(os.path.join(src_dir, "*", "*.jpg")):
    shutil.copy(mediafile, dst_dir)
based on other answers this is my current working implementation, which retrieves nested xml files in a root directory:
files = []
for root, dirnames, filenames in os.walk(myDir):
files.extend(glob.glob(root + "/*.xml"))
I'm really having fun with python :)
For python 3.5 and later
import glob
#file_names_array = glob.glob('path/*.c', recursive=True)
#above works for files directly at path/ as guided by NeStack
#updated version
file_names_array = glob.glob('path/**/*.c', recursive=True)
further you might need
for full_path_in_src in file_names_array:
print (full_path_in_src ) # be like 'abc/xyz.c'
#Full system path of this would be like => 'path till src/abc/xyz.c'
Johan and Bruno provide excellent solutions on the minimal requirement as stated. I have just released Formic which implements Ant FileSet and Globs which can handle this and more complicated scenarios. An implementation of your requirement is:
import formic
fileset = formic.FileSet(include="/src/**/*.c")
for file_name in fileset.qualified_files():
print file_name
Another way to do it using just the glob module. Just seed the rglob method with a starting base directory and a pattern to match and it will return a list of matching file names.
import glob
import os
def _getDirs(base):
    """Return the immediate subdirectories of *base*, each prefixed with *base*."""
    return [x for x in glob.iglob(os.path.join(base, '*')) if os.path.isdir(x)]


def rglob(base, pattern):
    """Return all paths matching *pattern* in *base* and its subdirectories.

    *pattern* is a glob pattern relative to each visited directory.
    """
    matches = glob.glob(os.path.join(base, pattern))
    for subdir in _getDirs(base):
        # _getDirs already returns paths that include *base*; joining base
        # onto them again would duplicate the prefix for relative bases
        # ('src' + 'src/sub' -> 'src/src/sub') and find nothing.
        matches.extend(rglob(subdir, pattern))
    return matches
Or with a list comprehension:
>>> base = r"c:\User\xtofl"
>>> binfiles = [ os.path.join(base,f)
for base, _, files in os.walk(root)
for f in files if f.endswith(".jpg") ]
If the files are on a remote file system or inside an archive, you can use an implementation of the fsspec AbstractFileSystem class. For example, to list all the files in a zipfile:
from fsspec.implementations.zip import ZipFileSystem
fs = ZipFileSystem("/tmp/test.zip")
fs.glob("/**") # equivalent: fs.find("/")
or to list all the files in a publicly available S3 bucket:
from s3fs import S3FileSystem
fs_s3 = S3FileSystem(anon=True)
fs_s3.glob("noaa-goes16/ABI-L1b-RadF/2020/045/**") # or use fs_s3.find
you can also use it for a local filesystem, which may be interesting if your implementation should be filesystem-agnostic:
from fsspec.implementations.local import LocalFileSystem
fs = LocalFileSystem()
fs.glob("/tmp/test/**")
Other implementations include Google Cloud, Github, SFTP/SSH, Dropbox, and Azure. For details, see the fsspec API documentation.
Just made this.. it will print files and directory in hierarchical way
But I didn't used fnmatch or walk
#!/usr/bin/python
import os,glob,sys
def dirlist(path, c = 1):
    """Print the files and directories below *path* as an indented tree.

    Each entry is printed as its base name prefixed by ``'----'`` repeated
    *c* times; *c* grows by one per nesting level.  Entries are visited in
    sorted order so the output is deterministic.  Names starting with a dot
    are not listed because the ``*`` glob does not match them.
    """
    for i in sorted(glob.glob(os.path.join(path, "*"))):
        if os.path.isfile(i):
            print('----' * c + os.path.basename(i))
        elif os.path.isdir(i):
            print('----' * c + os.path.basename(i))
            # Pass the deeper level explicitly rather than mutating and
            # restoring the counter around the recursive call.
            dirlist(i, c + 1)
path = os.path.normpath(sys.argv[1])
print(os.path.basename(path))
dirlist(path)
That one uses fnmatch or regular expression:
import fnmatch, os
def filepaths(directory, pattern):
    """Yield the paths of files under *directory* whose name matches *pattern*.

    *pattern* is either a compiled regular expression (anything exposing a
    ``match`` method) or a shell-style wildcard string handled by fnmatch.
    Matching is done against the bare file name.
    """
    # Decide once how to match instead of raising and catching an
    # AttributeError for every single file when a glob string is given.
    if hasattr(pattern, 'match'):
        matches = pattern.match
    else:
        def matches(basename):
            return fnmatch.fnmatch(basename, pattern)

    for root, _dirs, files in os.walk(directory):
        for basename in files:
            if matches(basename):
                yield os.path.join(root, basename)
# usage
if __name__ == '__main__':
from pprint import pprint as pp
import re
path = r'/Users/hipertracker/app/myapp'
pp([x for x in filepaths(path, re.compile(r'.*\.py$'))])
pp([x for x in filepaths(path, '*.py')])
In addition to the suggested answers, you can do this with some lazy generation and list comprehension magic:
import os, glob, itertools
results = itertools.chain.from_iterable(glob.iglob(os.path.join(root,'*.c'))
for root, dirs, files in os.walk('src'))
for f in results: print(f)
Besides fitting in one line and avoiding unnecessary lists in memory, this also has the nice side effect, that you can use it in a way similar to the ** operator, e.g., you could use os.path.join(root, 'some/path/*.c') in order to get all .c files in all sub directories of src that have this structure.
This is a working code on Python 2.7. As part of my devops work, I was required to write a script which would move the config files marked with live-appName.properties to appName.properties. There could be other extension files as well like live-appName.xml.
Below is a working code for this, which finds the files in the given directories (nested level) and then renames (moves) it to the required filename
def flipProperties(searchDir):
    """Rename every ``live-<name>.<ext>`` file under *searchDir* to ``<name>.<ext>``.

    The tree is walked recursively; each matching file is moved within its
    own directory, overwriting nothing that os/shutil would not overwrite.
    A line is printed for every file that is moved.
    """
    prefix = "live-"
    print("Flipping properties to point to live DB")
    for root, dirnames, filenames in os.walk(searchDir):
        for filename in fnmatch.filter(filenames, 'live-*.*'):
            source = os.path.join(root, filename)
            # Strip only the leading prefix; split("live-")[1] would cut the
            # name short if "live-" occurred again later in it.
            targetFileName = os.path.join(root, filename[len(prefix):])
            print("File " + source + " will be moved to " + targetFileName)
            shutil.move(source, targetFileName)
This function is called from a main script
flipProperties(searchDir)
Hope this helps someone struggling with similar issues.
Simplified version of Johan Dahlin's answer, without fnmatch.
import os
matches = []
for root, dirnames, filenames in os.walk('src'):
matches += [os.path.join(root, f) for f in filenames if f[-2:] == '.c']
Here is my solution using list comprehension to search for multiple file extensions recursively in a directory and all subdirectories:
import os, glob
def _globrec(path, *exts):
""" Glob recursively a directory and all subdirectories for multiple file extensions
Note: Glob is case-insensitive, i. e. for '\*.jpg' you will get files ending
with .jpg and .JPG
Parameters
----------
path : str
A directory name
exts : tuple
File extensions to glob for
Returns
-------
files : list
list of files matching extensions in exts in path and subfolders
"""
dirs = [a[0] for a in os.walk(path)]
f_filter = [d+e for d in dirs for e in exts]
return [f for files in [glob.iglob(files) for files in f_filter] for f in files]
my_pictures = _globrec(r'C:\Temp', '\*.jpg','\*.bmp','\*.png','\*.gif')
for f in my_pictures:
print f
import sys, os, glob
dir_list = ["c:\\books\\heap"]
while len(dir_list) > 0:
cur_dir = dir_list[0]
del dir_list[0]
list_of_files = glob.glob(cur_dir+'\\*')
for book in list_of_files:
if os.path.isfile(book):
print(book)
else:
dir_list.append(book)
I modified the top answer in this posting.. and recently created this script which will loop through all files in a given directory (searchdir) and the sub-directories under it... and prints filename, rootdir, modified/creation date, and size.
Hope this helps someone... and they can walk the directory and get fileinfo.
import time
import fnmatch
import os
def fileinfo(file):
filename = os.path.basename(file)
rootdir = os.path.dirname(file)
lastmod = time.ctime(os.path.getmtime(file))
creation = time.ctime(os.path.getctime(file))
filesize = os.path.getsize(file)
print "%s**\t%s\t%s\t%s\t%s" % (rootdir, filename, lastmod, creation, filesize)
searchdir = r'D:\Your\Directory\Root'
matches = []
for root, dirnames, filenames in os.walk(searchdir):
## for filename in fnmatch.filter(filenames, '*.c'):
for filename in filenames:
## matches.append(os.path.join(root, filename))
##print matches
fileinfo(os.path.join(root, filename))
Here is a solution that will match the pattern against the full path and not just the base filename.
It uses fnmatch.translate to convert a glob-style pattern into a regular expression, which is then matched against the full path of each file found while walking the directory.
re.IGNORECASE is optional, but desirable on Windows since the file system itself is not case-sensitive. (I didn't bother compiling the regex because docs indicate it should be cached internally.)
import fnmatch
import os
import re
def findfiles(dir, pattern):
    """Yield files under *dir* whose full path matches the glob *pattern*.

    The glob is converted to a regular expression with fnmatch.translate
    and tested, case-insensitively, against the joined path of each file
    (directory part included), not just its base name.
    """
    # Compile once up front; the pattern is the same for every file.
    patternregex = re.compile(fnmatch.translate(pattern), re.IGNORECASE)
    for root, _dirs, files in os.walk(dir):
        for basename in files:
            filename = os.path.join(root, basename)
            if patternregex.search(filename):
                yield filename
I needed a solution for python 2.x that works fast on large directories.
I ended up with this:
import subprocess
foundfiles= subprocess.check_output("ls src/*.c src/**/*.c", shell=True)
for foundfile in foundfiles.splitlines():
print foundfile
Note that you might need some exception handling in case ls doesn't find any matching file.

Matching MD5 Hashes from another script

Ok so i'm trying to create a script that does the following: Searches a directory for known hashes. Here is my first script:
Hash.py
import hashlib
from functools import partial
#call another python script
execfile("knownHashes.py")
def md5sum(filename):
with open(filename, mode='rb') as f:
d = hashlib.md5()
for buf in iter(partial(f.read, 128), b''):
d.update(buf)
return d.hexdigest()
print "Hash of is: "
print(md5sum('photo.jpg'))
if md5List == md5sum:
print "Match"
knownHashes.py
print ("Call worked\n")
md5List = "01071709f67193b295beb7eab6e66646" + "5d41402abc4b2a76b9719d911017c592"
The problem at the moment is that I manually have to type in the file I want to find out the hash of where it says photo.jpg. Also, The I haven't got the md5List to work yet.
I want the script to eventually work like this:
python hash.py <directory>
1 match
cookies.jpg matches hash
So how can I get the script to search a directory rather than manually type in what file to hash? Also, how can I fix the md5List because that is wrong?
You can get a list of files in the current working directory using the following. This is the directory that you run the script from.
import os
#Get list of files in working directory
files_list = os.listdir(os.getcwd())
You can iterate through the list using a for loop:
for file in files_list:
#do something
As equinoxel also mentioned below, you can use os.walk() as well.
Simple little gist should solve most of your problems. Understandable if you don't like using OOP for this problem, but I believe all of the important conceptual pieces are here in a pretty clean, concise representation. Let me know if you have any questions.
class PyGrep:
    """Find files by name ending inside a directory tree."""

    def __init__(self, directory):
        # Root of the tree that grab_all_files_with_ending() walks.
        self.directory = directory

    def grab_all_files_with_ending(self, file_ending):
        """Yield one list of matching paths per directory that has matches.

        Every directory under self.directory is visited with os.walk.  For
        each directory containing at least one file whose name ends with
        *file_ending*, a list of those files' paths is yielded.  Paths are
        built from self.directory, so they are absolute only if
        self.directory is.
        """
        for walk_entry in os.walk(self.directory):
            matches = self.grab_files_from_os_walk(walk_entry, file_ending)
            if matches:
                yield matches

    def grab_files_from_os_walk(self, os_walk_tuple, file_ending):
        """Return the paths of the files in one os.walk tuple ending with *file_ending*."""
        directory, _subfolders, file_names = os_walk_tuple
        # endswith, not "in": a substring test would also accept names such
        # as "main.c.bak" when asked for ".c".
        return [os.path.join(directory, file_name)
                for file_name in file_names
                if file_name.endswith(file_ending)]

recursive file copying into subdirectory

I need to copy all the files and folders in the current folder to a subdirectory. What would be the best way to do so? I tried the following snippet, but it fails if the destination directory already exists.
def copy(d=os.path.curdir):
dest = "t"
for i in os.listdir(d):
if os.path.isdir(i):
shutil.copytree(i, dest)
else:
shutil.copy(i, dest)
I have the feeling that the same task can be done in a better and easier manner. How do i do it?
I would never do it on python, but the following solution came to mind. It doesn't look simple, but it should work and can be simplified (haven't checked, sorry, no access to the computer now):
def copyDirectoryTree(directory, destination, preserveSymlinks=True):
    """Copy the contents of *directory* into *destination*, merging with existing directories.

    *destination* is created if it does not exist.  Subdirectories that
    already exist in the destination are merged recursively; missing ones
    are copied with shutil.copytree.  Files are copied with shutil.copy.
    When *preserveSymlinks* is true, symlinks that are not directories are
    recreated as links instead of having their target copied.

    Raises IOError if a source subdirectory collides with an existing
    destination entry that is not a directory.
    """
    if not os.path.isdir(destination):
        os.makedirs(destination)
    for entry in os.listdir(directory):
        entryPath = os.path.join(directory, entry)
        entryDest = os.path.join(destination, entry)
        if os.path.isdir(entryPath):
            if os.path.exists(entryDest):
                if not os.path.isdir(entryDest):
                    raise IOError("Failed to copy thee, the destination for the `" + entryPath + "' directory exists and is not a directory")
                copyDirectoryTree(entryPath, entryDest, preserveSymlinks)
            else:
                shutil.copytree(entryPath, entryDest, symlinks=preserveSymlinks)
        elif preserveSymlinks and os.path.islink(entryPath):
            # Recreate the link itself; replace whatever is already there,
            # mirroring the overwrite behaviour of shutil.copy for files.
            if os.path.lexists(entryDest):
                os.remove(entryDest)
            os.symlink(os.readlink(entryPath), entryDest)
        else:
            # Regular file, or a symlink whose target should be copied
            # (shutil.copy follows links).  Copy into the destination, not
            # back into the source directory.
            shutil.copy(entryPath, entryDest)
See the code in http://docs.python.org/library/shutil.html, then tweak it a little (e.g. try: around os.makedirs(dst)).
To extend mamnun's answer,
If you want to use the direct call to the os, I'd advise using cp -r since you seem to want a recursive copy for directories.
Do you really need to use python? Because shutil functions cannot copy all file metadata and group permissions. Why don't you try built-in OS commands like cp in linux and xcopy in windows?
You can even try to run these commands from python
import os
os.system("cp file1 file2")
Hope this helps.
Here is my version of a recursive copy method for python, seems to work :)
def copy_all(fr, to, overwrite=True):
    """Recursively copy the file, symlink or directory *fr* to *to*.

    A directory is copied into ``to/<basename of fr>`` (or into *to* itself
    when both share the same base name); its entries are copied one by one.
    A file is copied with shutil.copy2, placed inside *to* when *to* is an
    existing directory.  Anything else is treated as a symlink and
    recreated with the same target.

    When *overwrite* is true an existing destination entry is removed
    first; when false, existing destination entries are left untouched.
    """
    fr = os.path.normpath(fr)
    to = os.path.normpath(to)
    if os.path.isdir(fr):
        # NOTE(review): the first test concatenates without a separator
        # ("to" + basename), so it checks a sibling-looking path rather than
        # to/<basename>; kept as-is because changing it alters where
        # directories land -- confirm the intended behaviour.
        if (not os.path.exists(to + os.path.basename(fr)) and not
                os.path.basename(fr) == os.path.basename(to)):
            to += "/" + os.path.basename(fr)
        mkdirs(to)
        for file in os.listdir(fr):
            # Propagate the caller's overwrite choice to nested entries.
            copy_all(fr + "/" + file, to + "/", overwrite)
    else:  # symlink or file
        dest = to
        if os.path.isdir(to):
            dest += "/"
            dest += os.path.basename(fr)
        if os.path.exists(dest) or os.path.islink(dest):
            if not overwrite:
                return
            rm(dest)
        if os.path.isfile(fr):
            shutil.copy2(fr, dest)
        else:  # has to be a symlink
            os.symlink(os.readlink(fr), dest)


def mkdirs(path):
    """Create directory *path* (with parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def rm(path):
    """Remove *path*, whether it is a file, a symlink or a directory tree."""
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)
    elif os.path.isdir(path):
        # os.rmdir on each child fails for files and non-empty folders and
        # never removes *path* itself; remove the whole tree instead.
        shutil.rmtree(path)

How to find all files with a particular extension? [duplicate]

This question already has answers here:
Find all files in a directory with extension .txt in Python
(25 answers)
Closed 2 months ago.
I am trying to find all the .c files in a directory using Python.
I wrote this, but it is just returning me all files - not just .c files:
import os
import re
results = []
for folder in gamefolders:
for f in os.listdir(folder):
if re.search('.c', f):
results += [f]
print results
How can I just get the .c files?
try changing the inner loop to something like this
results += [each for each in os.listdir(folder) if each.endswith('.c')]
Try "glob":
>>> import glob
>>> glob.glob('./[0-9].*')
['./1.gif', './2.txt']
>>> glob.glob('*.gif')
['1.gif', 'card.gif']
>>> glob.glob('?.gif')
['1.gif']
KISS
# KISS
import os
results = []
for folder in gamefolders:
for f in os.listdir(folder):
if f.endswith('.c'):
results.append(f)
print results
There is a better solution than directly using regular expressions: the standard library's fnmatch module, which deals with file name patterns. (See also the glob module.)
Write a helper function:
import fnmatch
import os
def listdir(dirname, pattern="*"):
return fnmatch.filter(os.listdir(dirname), pattern)
and use it as follows:
result = listdir("./sources", "*.c")
for _,_,filenames in os.walk(folder):
for file in filenames:
fileExt=os.path.splitext(file)[-1]
if fileExt == '.c':
results.append(file)
For another alternative you could use fnmatch
import fnmatch
import os
# Collect the full path of every *.c file below `path`.
results = []
for root, dirs, files in os.walk(path):
    for _file in files:
        if fnmatch.fnmatch(_file, '*.c'):
            results.append(os.path.join(root, _file))
print(results)
or with a list comprehension:
for root, dirs, files in os.walk(path):
    # Build the list of matches for this directory and append it in one
    # step, instead of calling results.append() inside a throwaway list.
    results += [os.path.join(root, _file)
                for _file in files
                if fnmatch.fnmatch(_file, '*.c')]
or using filter:
for root, dirs, files in os.walk(path):
    # fnmatch.filter selects the matching names; extend() adds their paths
    # without building a throwaway list of None values.
    results.extend(os.path.join(root, _file)
                   for _file in fnmatch.filter(files, '*.c'))
Change the directory to the given path, so that you can search files within directory. If you don't change the directory then this code will search files in your present directory location:
import os #importing os library
import glob #importing glob library
path=raw_input() #input from the user
os.chdir(path)
filedata=glob.glob('*.c') #all files with .c extenstions stores in filedata.
print filedata
import os, re
# Raw string so that \. reaches the regex engine as an escaped dot instead
# of being an invalid string escape.
cfile = re.compile(r"^.*?\.c$")
results = []
for name in os.listdir(directory):
    if cfile.match(name):
        results.append(name)
The implementation of shutil.copytree is in the docs. I modified it to take a list of extensions to INCLUDE.
def my_copytree(src, dst, symlinks=False, *extentions):
    """Recursively copy *src* to *dst*, keeping only files with the given extensions.

    Adapted from the Python 2.7 implementation of shutil.copytree: instead
    of an ignore callback it takes the extensions to INCLUDE (with leading
    dot, e.g. '.bat') as extra positional arguments.  Because they follow
    *symlinks* positionally, *symlinks* must be passed explicitly whenever
    extensions are given.

    *dst* must not already exist.  Directories are always recreated, even
    if nothing inside them matches.  Per-file failures are collected and
    raised together as a single shutil Error at the end.
    """
    names = os.listdir(src)
    os.makedirs(dst)
    errors = []
    for name in names:
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if symlinks and os.path.islink(srcname):
                linkto = os.readlink(srcname)
                os.symlink(linkto, dstname)
            elif os.path.isdir(srcname):
                my_copytree(srcname, dstname, symlinks, *extentions)
            else:
                ext = os.path.splitext(srcname)[1]
                if ext not in extentions:
                    # skip the file
                    continue
                copy2(srcname, dstname)
            # XXX What about devices, sockets etc.?
        # catch the Error from the recursive copytree so that we can
        # continue with other files; it must come first because on
        # Python 3 Error is itself an OSError.
        except Error as err:
            errors.extend(err.args[0])
        except (IOError, os.error) as why:
            errors.append((srcname, dstname, str(why)))
    try:
        copystat(src, dst)
    # except WindowsError: # cant copy file access times on Windows
    #     pass
    except OSError as why:
        # append, not extend: each error is one (src, dst, reason) tuple.
        errors.append((src, dst, str(why)))
    if errors:
        raise Error(errors)
Usage: For example, to copy only .config and .bat files....
# symlinks must be given explicitly; otherwise the first extension would be
# bound to the symlinks parameter.
my_copytree(source, targ, False, '.config', '.bat')
this is pretty clean.
the commands come from the os library.
this code will search through the current working directory and list only the specified file type. You can change this by replacing 'os.getcwd()' with your target directory and choose the file type by replacing '(ext)'. os.fsdecode is so you don't get a bytewise error from .endswith(). this also sorts alphabetically, you can remove sorted() for the raw list.
import os
filenames = sorted([os.fsdecode(file) for file in os.listdir(os.getcwd()) if os.fsdecode(file).endswith(".(ext)")])
Here's yet another solution, using pathlib (and Python 3):
from pathlib import Path
gamefolder = "path/to/dir"
# "**" only recurses when it is a whole path component, so the recursive
# pattern is "**/*.c"; "**.c" is rejected or treated like "*.c" depending on
# the Python version.
result = sorted(Path(gamefolder).glob("**/*.c"))
Notice the double asterisk (**) in the glob() argument. This will search the gamefolder as well as its subdirectories. If you only want to search the gamefolder, use a single * in the pattern: "*.c". For more details, see the documentation.
If you replace '.c' with '[.]c$', you're searching for files that contain .c as the last two characters of the name, rather than all files that contain a c, with at least one character before it.
Edit: Alternatively, match f[-2:] with '.c', this MAY be computationally cheaper than pulling out a regexp match.
Just to be clear, if you wanted the dot character in your search term, you could've escaped it too:
'.*[backslash].c' would give you what you needed, plus you would need to use something like:
results.append(f), instead of what you had listed as results += [f]
This function returns a list of all file names with the specified extension that live in the specified directory:
import os
def listFiles(path, extension):
    """Return the entries directly inside ``path`` whose name ends with ``extension``.

    Args:
        path: Directory to list (not searched recursively).
        extension: Suffix to match, e.g. ``'.txt'``.

    Returns:
        A list of bare names (no directory part), in ``os.listdir`` order.
        Any entry whose name matches is included, directories too, because
        only the name is tested.
    """
    return [entry for entry in os.listdir(path) if entry.endswith(extension)]
# Call form of print works on both Python 2 and Python 3 for a single argument.
print(listFiles('/Path/to/directory/with/files', '.txt'))
If you want to list all files with the specified extension in a certain directory and its subdirectories you could do:
import os
def filterFiles(path, extension):
    """Return the names of all files under ``path`` ending with ``extension``.

    Args:
        path: Root directory; it and all of its sub-directories are walked.
        extension: Suffix to match, e.g. ``'.txt'``.

    Returns:
        A list of bare file names in ``os.walk`` order. The containing
        directory is not included, so equal names from different
        directories appear as duplicates.
    """
    return [
        name
        for _root, _dirs, names in os.walk(path)
        for name in names
        if name.endswith(extension)
    ]
# Call form of print works on both Python 2 and Python 3 for a single argument.
print(filterFiles('/Path/to/directory/with/files', '.txt'))
You can actually do this with just os.listdir
import os
# Keep only the directory entries whose name ends in '.c'.
results = list(filter(lambda entry: entry.endswith('.c'), os.listdir(gamefolders/folder)))

Directory Walker for Python

I am currently using the directory walker from Here
import os
class DirectoryWalker:
    """Forward iterator that traverses a directory tree.

    Iteration uses the legacy sequence protocol: a ``for`` loop calls
    ``__getitem__`` repeatedly until it raises ``IndexError``. Each call
    returns the full path of one more entry (files and directories alike).
    """

    def __init__(self, directory):
        self.stack = [directory]  # directories still waiting to be listed
        self.files = []  # entries of the directory currently being read
        self.index = 0  # position of the next unread entry in self.files

    def __getitem__(self, index):
        """Return the full path of the next entry; ``index`` is ignored."""
        while True:
            try:
                name = self.files[self.index]
                self.index = self.index + 1
            except IndexError:
                # Current listing is exhausted: pop the next directory from
                # the stack. pop() on an empty stack raises IndexError, which
                # is what ends the iteration.
                self.directory = self.stack.pop()
                self.files = os.listdir(self.directory)
                self.index = 0
            else:
                # got a filename
                fullname = os.path.join(self.directory, name)
                # Queue real sub-directories for later; symlinked directories
                # are returned but not descended into.
                if os.path.isdir(fullname) and not os.path.islink(fullname):
                    self.stack.append(fullname)
                return fullname
# Starting from the absolute path makes every yielded name a full path.
for file in DirectoryWalker(os.path.abspath('.')):
    print(file)
This minor change allows you to have the full path within the file.
Can anyone help me how to find just the filename as well using this? I need both the full path, and just the filename.
Why do you want to do such a boring thing yourself?
# os.walk yields, for each directory in the tree, its path together with the
# names of its sub-directories and files.
for path, directories, files in os.walk('.'):
    print('ls %r' % path)
    for directory in directories:
        print(' d%r' % directory)
    for filename in files:
        print(' -%r' % filename)
Output:
'.'
d'finction'
d'.hg'
-'setup.py'
-'.hgignore'
'./finction'
-'finction'
-'cdg.pyc'
-'util.pyc'
-'cdg.py'
-'util.py'
-'__init__.pyc'
-'__init__.py'
'./.hg'
d'store'
-'hgrc'
-'requires'
-'00changelog.i'
-'undo.branch'
-'dirstate'
-'undo.dirstate'
-'branch'
'./.hg/store'
d'data'
-'undo'
-'00changelog.i'
-'00manifest.i'
'./.hg/store/data'
d'finction'
-'.hgignore.i'
-'setup.py.i'
'./.hg/store/data/finction'
-'util.py.i'
-'cdg.py.i'
-'finction.i'
-'____init____.py.i'
But if you insist, there are path-related tools in os.path; os.path.basename is what you are looking for.
>>> import os.path
>>> os.path.basename('/hello/world.h')
'world.h'
Rather than using '.' as your directory, refer to its absolute path:
# Passing an absolute starting directory makes each yielded path absolute.
for file in DirectoryWalker(os.path.abspath('.')):
    print(file)
Also, I'd recommend using a word other than 'file', because it means something in the python language. Not a keyword, though so it still runs.
As an aside, when dealing with filenames, I find the os.path module to be incredibly useful - I'd recommend having a look through that, especially
os.path.normpath
Normalises paths (gets rid of redundant '.'s and 'theFolderYouWereJustIn/../'s)
os.path.join
Joins two paths
os.path.dirname()? os.path.normpath()? os.path.abspath()?
This would also be a lovely place to think recursion.
Just prepend the current directory path to the "./foo" path returned:
# Call form of print works on both Python 2 and Python 3 for a single argument.
print(os.path.join(os.getcwd(), file))

Categories

Resources