I need help finding a file inside a folder by name. I can do this with one file name; how could I do it with two file names?
This is the code used
# Folder that holds the reference-price files.
path = r"Z:/Equities/ReferencePrice/"
files = []  # not used by the lines shown here
# The two string literals are concatenated, so the glob pattern is
# "*OptionOnEquitiesReferencePriceFile**.txt*": it only matches names that
# contain "OptionOnEquitiesReferencePriceFile" followed later by ".txt".
for file in glob.glob(os.path.join(path ,"*OptionOnEquitiesReferencePriceFile*"+"*.txt*")):
    # Parse the match as semicolon-delimited text. df is rebound on every
    # iteration, so after the loop it refers to the last file read only.
    df = pd.read_csv(file, delimiter = ';')
the first file contains the name
"OptionOnEquitiesReferencePriceFile"
the Second file contains the name
"BDRReferencePrice"
How do I add the second file name, so that the search matches one name, the other, or both?
I don't think you can do that in a straightforward way, so here's an alternative solution (using a function) that you can use:
import os
from fnmatch import fnmatch
# Folder to search. In this example it contains many files: some names start
# with 'other', some with 'test', and some are arbitrary. Only the 'test' and
# 'other' patterns are fetched in this example.
dir_path = './test_dir'
def find_by_patterns(patterns, path):
    """Return the paths of all files under *path* whose name matches any pattern.

    Args:
        patterns: Iterable of ``fnmatch``-style patterns, e.g. ``['test*.txt']``.
        path: Directory to walk recursively.

    Returns:
        A list of ``os.path.join(root, name)`` strings in ``os.walk`` order.
        The list is empty when nothing matches or when *patterns* is empty.
    """
    # Materialize once so a one-shot iterable can be reused for every file.
    patterns = list(patterns)
    results = []
    for root, dirs, files in os.walk(path):
        for name in files:
            # any() short-circuits on the first hit and, unlike max([]),
            # does not raise when there are no patterns at all.
            if any(fnmatch(name, pattern) for pattern in patterns):
                results.append(os.path.join(root, name))
    return results
# Print every file under dir_path whose name matches 'test*.txt' or 'other*.txt'.
print(find_by_patterns(['test*.txt', 'other*.txt'], dir_path))
output:
['./test_dir/other1.txt', './test_dir/other2.txt', './test_dir/test1.txt', './test_dir/test2.txt', './test_dir/test3.txt']
Related
I'm trying to read the folder name at the specific place from the file path. My current code:
import os
def get_files(source):
    """Collect the path of every file found under *source*, recursively.

    Each path is built with ``os.path.join(root, filename)`` and the list is
    returned in the order ``os.walk`` produces the entries.
    """
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(source)
        for entry in entries
    ]
def parse(files):
    """Print the folder path and customer name derived from each file path.

    Args:
        files: Iterable of file path strings.
    """
    for file in files:
        # A fresh dict and list are created for every file, so each printed
        # list holds exactly one record.
        xml_information = {}
        metadata = []
        # Get the file path
        filepath = os.path.dirname(file)
        xml_information['file_path'] = '%s' % filepath
        # Get customer name
        # Takes the component at index 5 after splitting the folder path on
        # backslashes; raises IndexError when there are fewer than six parts.
        customer = filepath.split("\\")[5]
        xml_information['customer_name'] = '%s' % customer
        metadata.append(xml_information)
        print(metadata)
# Root folder to scan; every file found beneath it is handed to parse().
path = 'C:\\Users\\quan.nguyen\\SAGE\\Lania Thompson - Searching Project Files'
parse(get_files(path))
My program searches through folders, finds the files, and reports back their folder paths. However, I would also like to read the folder name at the sixth position of the path, which is the customer name. When I run customer = filepath.split("\\")[5] it reports an error:
Traceback (most recent call last):
File "*hidden*", line 33, in <module>
parse(get_files(path))
File "*hidden*", line 26, in parse
customer = filepath.split("\\")[5]
~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range
However, when I run with customer = filepath.split("\\")[4], the program works and reads the last folder specified in path which is Lania Thompson - Searching Project Files. The result is as follows:
[{'file_path': 'C:\\Users\\quan.nguyen\\SAGE\\Lania Thompson - Searching Project Files\\Hazor Ltd\\PCS Mah\\Machine', 'customer_name': 'Lania Thompson - Searching Project Files'}]
My expecting result is Hazor Ltd:
[{'file_path': 'C:\\Users\\quan.nguyen\\SAGE\\Lania Thompson - Searching Project Files\\Hazor Ltd\\PCS Mah\\Machine', 'customer_name': 'Hazor Ltd'}]
The names are made up except from my name
So I have worked out the code using the pathlib library. The code is:
import os
from pathlib import Path
def get_files(source):
    """Return a list with the joined path of every file below *source*.

    The tree is traversed with ``os.walk``; directories themselves are not
    included, only the files they contain.
    """
    found = []
    for folder, _subdirs, entries in os.walk(source):
        found.extend(os.path.join(folder, entry) for entry in entries)
    return found
def parse(files):
    """Print and return one ``{'Customer': name}`` record per file.

    The customer name is the component at index 5 of the folder that
    contains each file (``Path(file).parent.parts[5]``).

    Args:
        files: Iterable of file paths (strings or ``Path`` objects).

    Returns:
        The list of records, in input order. Files whose folder path has
        fewer than six components are skipped instead of raising
        ``IndexError``.
    """
    metadata = []  # created once so records accumulate across all files
    for file in files:
        # The paths in *files* are already complete, so the folder is read
        # straight from the path instead of re-scanning the tree per file.
        f = Path(file).parent
        # Too shallow to contain a customer folder at index 5.
        if len(f.parts) <= 5:
            continue
        # A new dict per file: reusing one dict would make every list entry
        # alias the same, last-written record.
        xml_information = {}
        # Get customer name
        xml_information['Customer'] = f.parts[5]
        metadata.append(xml_information)
    print(metadata)
    return metadata
# Root folder to scan; every file found beneath it is handed to parse().
path = 'C:\\Users\\quan.nguyen\\SAGE\\Lania Thompson - Searching Project Files'
parse(get_files(path))
Change the index [5] in xml_information['Customer'] = f.parts[5] to the position of the folder whose name you want to extract.
I am trying to create a dataset using pd.DataFrame to store file name and file extension of all the files in my directory. I eventually want to have two variables named Name and Extension. The name variable will have a list of file names and the extension variable should have a file type such as xlsx, and png.
I am new to python and was only able to get to this. This gives me a list of file names but I don't know how to incorporate the file extension part. Could anyone please help?
List = pd.DataFrame()
path = 'C:/Users/documnets/'
filelist = []
filepath = []
# r=root, d=directories, f = files
for subdir, dirs, files in os.walk(path):
    for file in files:
        filelist.append(file)
        # splitext is called on a fixed literal rather than on `file`, so the
        # extension appended below is '.xlsx' for every file.
        filename, file_extension = os.path.splitext('/path/to/somefile.xlsx')
        filepath.append(file_extension)
# NOTE(review): `flielist` is not defined in the code shown (NameError);
# presumably `filelist` was intended. The second positional argument of
# pd.DataFrame is the index, so `filepath` would become the row labels.
List = pd.DataFrame(flielist, filepath)
Also, for this part: os.path.splitext('/path/to/somefile.xlsx'), can I leave what's in the parenthesis as it is or should I replace with my directory path?
Thank you
You can do this:
import os
import pandas as pd
path = 'C:/Users/documnets/'
filename = []
fileext = []
for file in os.listdir(path):
    # os.listdir also returns sub-directories; only files are wanted here.
    if not os.path.isfile(os.path.join(path, file)):
        continue
    # splitext splits at the last dot only, so names such as 'a.b.xlsx' or
    # 'README' no longer break the two-value unpacking as file.split('.') did.
    name, ext = os.path.splitext(file)
    filename.append(name)
    fileext.append(ext[1:])  # drop the leading dot: '.xlsx' -> 'xlsx'
columns = ["Name", "Extension"]
# Build the frame column-wise; no transpose is needed.
df = pd.DataFrame({"Name": filename, "Extension": fileext}, columns=columns)
I'm trying to use a For loop in the code below to go through a list of files and rename them with the file directory's name.
import re # add this to your other imports
import os
# os.walk yields (dirpath, dirnames, filenames) tuples, so each `files` here is
# that whole tuple and `f_new` iterates over its three elements.
for files in os.walk("."):
    for f_new in files:
        # NOTE(review): a tuple has no .split(); this line raises AttributeError.
        folder = files.split(os.sep)[-2]
        print(folder)
        # First "(Position)(digits)" match in f_new, as a tuple of two groups.
        name_elements = re.findall(r'(Position)(\d+)', f_new)[0]
        # int() drops the leading zeros of the number part.
        name = name_elements[0] + str(int(name_elements[1]))
        print(name) # just for demonstration
        dst = folder + '_' + name
        print(dst)
        # The source name is a fixed literal, not the file from the loop.
        os.rename('Position014 (RGB rendering) - 1024 x 1024 x 1 x 1 - 3 ch (8 bits).tif', dst)
Use pathlib
Path.rglob: This is like calling Path.glob() with '**/' added in front of the given relative pattern:
.parent or .parents[0]: An immutable sequence providing access to the logical ancestors of the path
If you want different parts of the path, index parents[] differently
file.parents[0].stem returns 'test1' or 'test2' depending on the file
file.parents[1].stem returns 'photos'
file.parents[2].stem returns 'stack_overflow'
.stem: The final path component, without its suffix
.suffix: The file extension of the final component
.rename: Rename this file or directory to the given target
The following code, finds only .tiff files. Use *.* to get all files.
If you only want the first 10 characters of file_name:
file_name = file_name[:10]
from pathlib import Path

# set path to files
p = Path('e:/PythonProjects/stack_overflow/photos/')

# get all files in subdirectories with a tiff extension
# (list() takes a snapshot first, so files renamed below are not picked up
# again while the tree is still being traversed)
files = list(p.rglob('*.tiff'))

# print files example
# [WindowsPath('e:/PythonProjects/stack_overflow/photos/test1/test.tiff'), WindowsPath('e:/PythonProjects/stack_overflow/photos/test2/test.tiff')]

# iterate through files
for file in files:
    file_path = file.parent  # get only path
    # get the directory name; .name keeps it whole, whereas .stem would cut
    # a directory such as 'v1.2' down to 'v1'
    dir_name = file.parent.name
    file_name = file.stem  # get the file name
    suffix = file.suffix  # get the file extension
    file_name_new = f'{dir_name}_{file_name}{suffix}'  # make the new file name
    file.rename(file_path / file_name_new)  # rename the file

# output files renamed
# [WindowsPath('e:/PythonProjects/stack_overflow/photos/test1/test1_test.tiff'), WindowsPath('e:/PythonProjects/stack_overflow/photos/test2/test2_test.tiff')]
I have a text-file full of filenames. Like:
C:\Folder\Subfolder_01\file_1001.csv
C:\Folder\Subfolder_02\file_3030.xls
...
I want to check whether the files still exists (which is easy) or if the name of the subfolder has changed. The name of some subfolders changed by adding some string in front of it (starting with a 4 digit number e.g C:\Folder\Subfolder_02\file_3030.xls has changed to C:\Folder\2019 - Subfolder_02\file_3030.xls).
I tried to solve this with pathlib.glob(). It's possible to do this for one specific file 'by hand' like
list(file.parent.parent.glob('* - Subfolder_02\file_3030.xls'))
which returns a list with the new file-name. But i failed to do this in a loop surrounding the glob with parameters.
This is what I have so far, but my attempt to concatenate the glob with other variables (using +) fails for obvious reasons:
import pathlib
# NOTE(review): file_names.txt is written without quotes, so it is evaluated
# as attribute access on a name `file_names` that the code shown never defines.
file = pathlib.Path(file_names.txt)
lines=[]
with open(file,'r') as f:
    # reading the txt-file line by line
    for line in f:
        # strip the line terminators before storing the path
        line = line.replace("\r", "").replace("\n", "")
        lines.append(line)
for file in lines:
    file = pathlib.Path(file)
    # check if file exists ...
    if file.exists():
        print('OK - ' + file.name)
    # ... if not, find new location
    else:
        # The '+' is applied to the return value of glob('* - '), not to the
        # pattern string, so file.name never becomes part of the pattern.
        new_files = list(file.parent.parent.glob('* - ') + file.name)
        # NOTE(review): `files_files` is not defined in the code shown;
        # presumably `new_files` was intended.
        print(files_files)
I would set your top directory as a path and use that to glob the files under the directory if you can't find the file in its original location. Using ** in the glob will search all folders.
from pathlib import Path

# Set top level directory as desired.
parent_dir = Path('.')
# you can use splitlines() to parse the file into a list
with Path('file_names.txt').open() as f:
    files = f.read().splitlines()
for f in files:
    # Skip blank lines: Path('') is the current directory, which always
    # exists and would be reported as "still in place".
    if not f.strip():
        continue
    orig = Path(f)
    # Still in location, no need to look further
    if orig.exists():
        print(f"{orig.absolute()} is still in place.")
        continue
    # See if we can find it under parent_dir
    matches = [*parent_dir.glob(f"**/{orig.name}")]
    if not matches:
        # Report the miss explicitly instead of printing nothing.
        print(f"{orig.absolute()} was not found under {parent_dir.absolute()}")
        continue
    if len(matches) > 1:
        print("Multiple Matches Found")
    for match in matches:
        print(f"{orig.absolute()} might be in {match.absolute()}")
Try watchdog
For example:
import os
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
# Raw string: "\F" is not a valid escape sequence in a normal string literal.
RESOURCES_PATH = r"C:\Folder"


class dirs_watcher(FileSystemEventHandler):
    """Watch RESOURCES_PATH and print which top-level entries changed.

    The handler keeps the last ``os.listdir(RESOURCES_PATH)`` result in
    ``cur_dirs`` and compares it with a fresh listing on every
    modification event.
    """

    def __init__(self):
        super().__init__()
        # Take the snapshot BEFORE starting the observer: once it is running,
        # on_modified can be called at any time and it reads self.cur_dirs.
        self.cur_dirs = os.listdir(RESOURCES_PATH)
        self.observe()

    def observe(self):
        """Create an Observer, register this handler on it and start it."""
        self.observer = Observer()
        self.my_watch = self.observer.schedule(self, path=RESOURCES_PATH, recursive=True)
        self.observer.start()

    def on_modified(self, event=None):
        """Print the entries that disappeared from / appeared in the listing."""
        # A folder was modified:
        self.new_dirs = os.listdir(RESOURCES_PATH)
        old = set(self.cur_dirs) - set(self.new_dirs)
        new = set(self.new_dirs) - set(self.cur_dirs)
        # The watch is recursive, so events also arrive for changes that do
        # not alter the top-level listing; stay quiet for those.
        if old or new:
            print("{} changed to {}".format(old, new))
        self.cur_dirs = self.new_dirs  # update cur_dirs
on_modified will be triggered when a sub directory changes and you can extract the changed folders names by keeping a sub directories list
The code that I have determines which Operating System is being used. Then it has to search the entire system for my csv file. When it's found I need to be able to read in the csv file (so that its not just inside the function, but useable throughout my code).
So far I am able to locate my file, but I am having trouble assigning the file path to a variable so that I can read that variable in with pd.read_csv().
The code that I have is as follows:
import pandas as pd
import os
import re
import win32api
# https://stackoverflow.com/questions/13067686/search-files-in-all-drives-using-python
def find_file(root_folder, rex):
    """Walk *root_folder* and stop at the first file name that *rex* matches.

    Prints the joined path of that file and returns the object produced by
    ``rex.search(f)`` (not the path). When no file name matches, the function
    falls off the end and returns None.
    """
    for root,dirs,files in os.walk(root_folder):
        for f in files:
            result = rex.search(f)
            if result:
                print(os.path.join(root, f))
                return result
                # unreachable: the return above already leaves the function
                break # if you want to find only one
def find_file_in_all_drives(file_name):
    """Run find_file on every entry of win32api.GetLogicalDriveStrings().

    *file_name* is compiled as a regular expression. The value returned by
    find_file is discarded and the function ends with a bare ``return``, so
    the caller always receives None.
    """
    #create a regular expression for the file
    rex = re.compile(file_name)
    # The drive string is split on NUL characters; the last piece is dropped.
    for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
        find_file( drive, rex )
    return
#file_name = "AB_NYC_2019.csv"
#find_file_in_all_drives(file_name)
# find_file_in_all_drives ends with a bare `return`, so df_location is None
# here and that None is what gets passed to pd.read_csv.
df_location = find_file_in_all_drives( "AB_NYC_2019.csv" )
df = pd.read_csv(df_location)
I think that something is not right with the return.
Thank you for your time.
Right now it returns "None"
You haven't returned anything from anywhere.
I'm considering your code to be working and I've placed the necessary return calls but haven't tested it:
def find_file(root_folder, rex):
    """Return the path of the first file under *root_folder* matched by *rex*.

    The tree is walked with ``os.walk`` and ``rex.search`` is applied to each
    bare file name. Returns None when no file name matches.
    """
    for folder, _subdirs, entries in os.walk(root_folder):
        hit = next((entry for entry in entries if rex.search(entry)), None)
        if hit is not None:
            return os.path.join(folder, hit)
    return None
def find_file_in_all_drives(file_name):
    """Search every logical drive and return the hits as a list.

    *file_name* is compiled as a regular expression and passed to find_file
    once per drive; each truthy result is collected. The list is empty when
    no drive produced a hit.
    """
    pattern = re.compile(file_name)
    # GetLogicalDriveStrings() is split on NUL; the final piece is dropped.
    drives = win32api.GetLogicalDriveStrings().split('\000')[:-1]
    hits = (find_file(drive, pattern) for drive in drives)
    return [hit for hit in hits if hit]
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
# find_file_in_all_drives returns a list that is empty when nothing was found;
# fail with a clear message instead of an IndexError on df_location[0].
if not df_location:
    raise FileNotFoundError("AB_NYC_2019.csv was not found on any drive")
first_file_df = pd.read_csv(df_location[0])