I am searching for all the .csv files located in the subfolders of a path, using glob like so:
def scan_for_files(path):
    file_list = []
    for path, dirs, files in os.walk(path):
        for d in dirs:
            for f in glob.iglob(os.path.join(path, d, '*.csv')):
                file_list.append(f)
    return file_list
If I call:
path = r'/data/realtimedata/trades/bitfinex/'
scan_for_files(path)
I get the correct recursive list of files:
['/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_12.csv',
'/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_13.csv',
'/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_15.csv',
'/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_11.csv',
'/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_09.csv',
'/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_10.csv',
'/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_08.csv',
'/data/realtimedata/trades/bitfinex/btcusd/bitfinex_btcusd_trades_2018_05_14.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_14.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_12.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_10.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_08.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_09.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_15.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_11.csv',
'/data/realtimedata/trades/bitfinex/ethusd/bitfinex_ethusd_trades_2018_05_13.csv']
However, when using the actual sub-directory containing the files I want, it returns an empty list. Any idea why this is happening? Thanks.
path = r'/data/realtimedata/trades/bitfinex/btcusd/'
scan_for_files(path)
returns: []
Looks like btcusd is a bottom-level directory. That means that when you call os.walk with the r'/data/realtimedata/trades/bitfinex/btcusd/' path, the dirs variable will be an empty list [], so the inner loop for d in dirs: does not execute at all.
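To see this, here is a quick sketch (assuming the same layout as in the question) of what os.walk yields for that leaf directory:

import os

# For a directory with no subdirectories, os.walk yields a single
# (dirpath, dirnames, filenames) tuple whose dirnames list is empty.
for dirpath, dirnames, filenames in os.walk(r'/data/realtimedata/trades/bitfinex/btcusd/'):
    print(dirnames)        # -> []   so the inner `for d in dirs:` loop never runs
    print(len(filenames))  # the .csv files are here, in filenames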
My advice would be to rewrite your function to iterate over the files directly rather than the directories. Don't worry, os.walk will still visit every subdirectory along the way; that's the nature of walking a directory tree.
def scan_for_files(path):
    file_list = []
    for root, _, files in os.walk(path):
        for f in files:
            if f.endswith('.csv'):
                file_list.append(os.path.join(root, f))
    return file_list
However, on more recent versions of Python (3.5+), you can use recursive glob:
def scan_for_files(path):
    return glob.glob(os.path.join(path, '**', '*.csv'), recursive=True)
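For example, with the directory layout from the question, both of these calls should now return the matching files:

scan_for_files(r'/data/realtimedata/trades/bitfinex/')         # all .csv files under btcusd/ and ethusd/
scan_for_files(r'/data/realtimedata/trades/bitfinex/btcusd/')  # just the btcusd .csv files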
I want to recursively walk through a directory, find the files that match any of the strings in a given list, and then copy these files to another folder. I thought the any() function would accomplish this, but I get a TypeError saying it expected a string, not a list. Is there a more elegant way to do this?
string_to_match = ['apple.txt', 'pear.txt', 'banana.txt']
for root, subdirs, filename in os.walk(source_dir):
    if any(s in filename for s in string_to_match):
        shutil.copy(filename, destination_dir)
        print(filename)
I know glob.glob can work well for finding files that match a specific string or pattern, but I haven't been able to find an answer that allows for multiple matches.
You can just use in
Example:
string_to_match = ['apple.txt', 'pear.txt', 'banana.txt']
for root, subdirs, filenames in os.walk(source_dir):
    for filename in filenames:
        if filename in string_to_match:
            shutil.copy(os.path.join(root, filename), destination_dir)
            print(filename)
Here is also a glob version:
import glob
import itertools
import shutil
root_dir = '/home/user'
files = ['apple.txt', 'pear.txt', 'banana.txt']
files_found = list(itertools.chain.from_iterable([glob.glob(f'{root_dir}/**/{f}', recursive=True) for f in files]))
for f in files_found:
    shutil.copy(f, destination_dir)
First, finding an element in a list takes O(n), so just convert it to a set, where membership checks take O(1).
Then you can do it like this:
string_to_match = {'apple.txt', 'pear.txt', 'banana.txt'}
for filename in os.listdir(source_dir):
    if filename in string_to_match:
        shutil.copy(os.path.join(source_dir, filename), destination_dir)
        print(filename)
I would use sets
def find_names(names, source_dir):
    names = set(names)
    # note: os.walk will walk the subfolders too
    # if you just want source_dir itself, use `names.intersection(os.listdir(source_dir))`
    for root, subdirs, fnames in os.walk(source_dir):
        for matched_name in names.intersection(fnames):
            yield os.path.join(root, matched_name)
strings_to_match = ['apple.txt', 'pear.txt', 'banana.txt']
for match in find_names(strings_to_match, '/path/to/start'):
    print("Match:", match)
(you could alternatively just pass in a set {'a','b','c'} instead of a list ['a','b','c'] and skip the conversion to a set)
Here is an alternative that only looks in the source dir (not its children):
def find_names_in_folder(names, source_dir):
    return [os.path.join(source_dir, n) for n in set(names).intersection(os.listdir(source_dir))]
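Usage of this variant looks the same as above (a quick sketch, reusing the example names and path from earlier):

for match in find_names_in_folder(['apple.txt', 'pear.txt', 'banana.txt'], '/path/to/start'):
    print("Match:", match)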
I would like to get the same list structure that I am getting with this approach, but with the full path of each file. Otherwise I get one flat list that I would have to break down manually, which kills the "automate the task" part.
For example, I have a folder called test with 4 subfolders called A, B, C, D, and inside each folder we can find the files file1, file2, file3.
import os
import openpyxl

path = r'C:\Users\Dell_G7_15\Desktop\test'
pathWalk = os.walk(path)
fileIndex = os.listdir(path)
wb = openpyxl.Workbook()

i = 0
filenames = []
filesPathLink = []
for foldernames in pathWalk:
    filenames.append(foldernames[-1])  # creating list of filenames
    i = i + 1

filenames.pop(0)  # delete first value of the list that causes error
print(filenames)
When I print filenames I get:
[['file1', 'file2', 'file3'],['file1', 'file2', 'file3'],['file1', 'file2', 'file3']]
I am looking for the same list structure, but with the full path of each file, which would look like this:
[['../A/file1', '../A/file2', '../A/file3'],[....],[....]]
Is this what you are looking for?
For the following folder and sub folders -
# root/
# -img0.jpg
# sub1/
# -img1.jpg
# -img1 copy.jpg
# sub2/
# -img2.jpg
# subsub1/
# -img3.jpg
path = '/Users/name/Desktop/root'
[[r+'/'+fname for fname in f] for r,d,f in os.walk(path)]
[['/Users/name/Desktop/root/img0.jpg'],
['/Users/name/Desktop/root/sub1/img1 copy.jpg',
'/Users/name/Desktop/root/sub1/img1.jpg'],
['/Users/name/Desktop/root/sub2/img2.jpg'],
['/Users/name/Desktop/root/sub2/subsub1/img3.jpg']]
For completeness' sake, if anyone is looking for a flat list of all files with paths inside a multi-level folder structure, then try this:
[r+'/'+fname for r,d,f in os.walk(path) for fname in f]
['/Users/name/Desktop/root/img0.jpg',
'/Users/name/Desktop/root/sub1/img1 copy.jpg',
'/Users/name/Desktop/root/sub1/img1.jpg',
'/Users/name/Desktop/root/sub2/img2.jpg',
'/Users/name/Desktop/root/sub2/subsub1/img3.jpg']
EDIT: Simple loop without a list comprehension
filepaths = []
for r,d,f in os.walk(path):
    l = []
    for fname in f:
        l.append(r+'/'+fname)
    filepaths.append(l)

print(filepaths)
[['/Users/name/Desktop/root/img0.jpg'],
['/Users/name/Desktop/root/sub1/img1 copy.jpg',
'/Users/name/Desktop/root/sub1/img1.jpg'],
['/Users/name/Desktop/root/sub2/img2.jpg'],
['/Users/name/Desktop/root/sub2/subsub1/img3.jpg']]
-Root
--A
---1,2
--B
---3
I am trying to get a list of lists of paths based on subdirs:
[['Root/A/1','Root/A/2'],['Root/B/3']]
I tried using os.walk, but I couldn't get it to work. I can get all the files in one giant list, but I can't split them up based on the subdirs:
fullList = []
for root, dirs, files in os.walk(dir):
    for name in files:
        fullList.append(os.path.join(root, name))
You want to have a list of lists, but you create a list of strings. You'll need to create each of the interior lists and put them all together into one master list.
This program might do what you want:
import os
from pprint import pprint

def return_list_of_paths(dir='.'):
    return [[os.path.join(root, file) for file in files]
            for root, dirs, files in os.walk(dir)
            if files]

pprint(return_list_of_paths("ROOT"))
Or, if you don't care for list comprehensions:
import os
from pprint import pprint

def return_list_of_paths(dir='.'):
    fullList = []
    for root, dirs, files in os.walk(dir):
        if files:
            oneList = []
            for file in files:
                oneList.append(os.path.join(root, file))
            fullList.append(oneList)
    return fullList

pprint(return_list_of_paths("ROOT"))
I am trying to set up a specific folder/file structure, which I will then copy into my test setup. I want a list of unique folders that I can then create.
How do I get root into a list?
If I do the following:
for root, dirs, filenames in os.walk(path):
    print root
I get:
/Users/Me/Folder
/Users/Me/Folder/SubFolder
But when I try to use it in a for-loop it gets messed up.
for root, dirs, filenames in os.walk(path):
    for x in root:
        print x
and I get this result:
/
U
s
e
r
s
/
M
e
/
F
o
l
.
.
. and so on
To get a variable you are iterating over into a list, simply append it to a list:
folder_list = []
for root, dirs, filenames in os.walk(path):
    folder_list.append(root)
To create the folders you can simply use os.mkdir(path):
for path in folder_list:
    os.mkdir(path)
If you want an additional print statement to see which folders you created, use:
for path in folder_list:
    os.mkdir(path)
    print("created: {}".format(path))
Finally I found the answer:
for root, dirs, filenames in os.walk(src_path):
    for x in root.splitlines():
        print x
I have a program where I need to loop through files and sub-directories. For each file, I need to extract the name of the subfolder it is located in.
I have a dictionary, d, that contains all the subfolder names that I need to work with. Then, while iterating through the files, I need to check whether their directory is in d or not.
Here is my code:
d = {'folder_1': 'a', 'folder_2': 'b', 'folder_3': 'c'}
dir_path = "/Users/user_1/Desktop/images_testing"

for root, directories, files in os.walk(dir_path):
    for filename in files:
        filepath = os.path.join(root, filename)
        temp_path = os.path.dirname(filepath)
        temp_sub_dir = temp_path.split("/")
        if temp_sub_dir[-1] in d:
            #do some work
This works, but it is SUPER slow. Is there any way to make this process faster?
My main problem is on these lines:
temp_path = os.path.dirname(filepath)
temp_sub_dir = temp_path.split("/")
I do not need the full path, I just need the folder name where this file came from.
How about doing it like this:
for root, directories, files in os.walk(dir_path):
    temp_sub_dir = os.path.basename(root)
    if temp_sub_dir in d:
        for filename in files:
            filepath = os.path.join(root, filename)
            #do some work
As you 'walk' through, check whether the current directory is one of those listed in d. If it is, and if the file stored under that dictionary key is in the current directory, then 'do something'. Seems simpler.
import os

d = {'folder_1': 'a', 'folder_2': 'b', 'folder_3': 'c'}
dir_path = "/Users/user_1/Desktop/images_testing"

for dirpath, dirnames, filenames in os.walk(dir_path):
    folder = os.path.basename(dirpath)
    if folder in d and d[folder] in filenames:
        #do some work
#do some work