Python's re.findall() function won't work as expected - python

I'm trying to create a python script that will find all the files from a working directory with a certain name pattern.
I stored all the files in a list and then tried applying the re.findall method to the list, to obtain only the files with that name pattern.
I have written this code:
# Create the regex object that we will use to find our files
fileRegex = re.compile(r'A[0-9]*[a-z]*[0-9]*.*')
all_files = []
# Recursively read the contents of the working_dir/Main folder:
for folderName, subfolders, filenames in os.walk(working_directory + "/Main"):
    for filename in filenames:
        all_files.append(filename)
found_files = fileRegex.findall(all_files)
I get this error at the last line of the code:
TypeError: expected string or bytes-like object
I have also tried re.findall(all_files) instead of using the fileRegex created prior to that line. Same error. Please tell me what I'm doing wrong. Thank you so much for reading my post!
Edit (second question):
I have followed the answers and it's now working fine. I'm now trying to create an archive with the files that match the pattern, after I've found them. The archive gets created, but the way I wrote the code, the whole path to each file gets included in the archive (all the folders from / down to the file). I just want the file itself in the final .zip, not the whole chain of directories and subdirectories that make up its path.
Here is the code; the generation of the .zip file is at the bottom. Please give me a tip on how I could solve this. I've tried many things but none worked. Thanks:
# Project properties:
# Recursively read the contents of the 'Main' folder which contains files with different names.
# Select only the files whose names begin with the letter A and contain digits. Use regexes for this.
# Archive these files in a folder named 'Created_Archive' in the project directory. Give the archive a name of your choosing.
# Files that you should find are:
# Aerials3.txt, Albert0512.txt, Alberto1341.txt
########################################################################################################################################
import os
import re
import zipfile
from pathlib import Path

# Get to the proper working directory
working_directory = os.getcwd()
if working_directory != "/home/paul/Desktop/Python_Tutorials/Projects/Files_And_Archive":
    working_directory = "/home/paul/Desktop/Python_Tutorials/Projects/Files_And_Archive"
    os.chdir(working_directory)
check_archive = Path(os.getcwd() + "/" + "files.zip")
if check_archive.is_file():
    print("Yes. Deleting it and creating it.")
    os.unlink(os.getcwd() + "/" + "files.zip")
else:
    print("No. Creating it.")
# Create the regex object that we will use to find our files
fileRegex = re.compile(r'A[0-9]*[a-z]*[0-9]+.*')
found_files = []
# Create the zipfile object that we will use to create our archive
fileArchive = zipfile.ZipFile('files.zip', 'a')
# Recursively read the contents of the working_dir/Main folder:
for folderName, subfolders, filenames in os.walk(working_directory + "/Main"):
    for filename in filenames:
        if fileRegex.match(filename):
            found_files.append(folderName + "/" + filename)
# Check all files have been found and create the archive. If the archive already exists
# delete it.
for file in found_files:
    print(file)
    fileArchive.write(file, compress_type=zipfile.ZIP_DEFLATED)
fileArchive.close()
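For the second question: ZipFile.write accepts an arcname argument that sets the name recorded inside the archive, independent of the file's path on disk. A minimal sketch of the final loop using it, so only base names end up in the .zip:

for file in found_files:
    print(file)
    # arcname controls the stored path; the base name alone drops the directories
    fileArchive.write(file, arcname=os.path.basename(file),
                      compress_type=zipfile.ZIP_DEFLATED)
fileArchive.close()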

re.findall works on strings, not on lists, so it's better to use fileRegex.match over the list to filter the names that actually match:
found_files = [s for s in all_files if fileRegex.match(s)]

A regex works on strings, not lists. The following works:
import re
import os

# Create the regex object that we will use to find our files
# fileRegex = re.compile(r'A[0-9]*[a-z]*[0-9]*.*')
fileRegex = re.compile(r'.*\.py')
all_files = []
found_files = []
working_directory = r"C:\Users\michael\PycharmProjects\work"
# Recursively read the contents of the working_dir/Main folder:
for folderName, subfolders, filenames in os.walk(working_directory):
    for filename in filenames:
        all_files.append(filename)
        if fileRegex.search(filename):
            found_files.append(filename)
print('all files\n', all_files)
print('\nfound files\n', found_files)

re.findall doesn't take a list of strings. You need re.match.
# Create the regex object that we will use to find our files
fileRegex = re.compile(r'A[0-9]*[a-z]*[0-9]*.*')
all_files = []
# Recursively read the contents of the working_dir/Main folder:
for folderName, subfolders, filenames in os.walk(working_directory + "/Main"):
    for filename in filenames:
        all_files.append(filename)
found_files = [file_name for file_name in all_files if fileRegex.match(file_name)]

Related

Copy and rename pictures based on xml nodes

I'm trying to copy all pictures from one directory (including subdirectories) to another target directory. Whenever the exact picture name is found in one of the XML files, the tool should grab all the information (attributes in the parent and child nodes), create subdirectories based on that node information, and rename the picture file.
The part that extracts all the information from the nodes is already done.
from bs4 import BeautifulSoup as bs

path_xml = r"path\file.xml"
content = []
with open(path_xml, "r") as file:
    content = file.readlines()
content = "".join(content)

def get_filename(_content):
    bs_content = bs(_content, "html.parser")
    # some code
    picture_path = f'{pm_1}{pm_2}\{pm_3}\{pm_4}\{pm_5}_{pm_6}_{pm_7}\{pm_8}\{pm_9}.jpg'

get_filename(content)
So in the end I get a string value with the directory path and the file name I want.
Now I struggle with opening all the XML files in one directory instead of just opening one file. I tried this:
import os

dir_xml = r"path"
res = []
for path in os.listdir(dir_xml):
    if os.path.isfile(os.path.join(dir_xml, path)):
        res.append(path)
with open(res, "r") as file:
    content = file.readlines()
but it gives me this error: TypeError: expected str, bytes or os.PathLike object, not list
How can I read through all the XML files instead of just one? I have hundreds of XML files, so that will take a while :D
And another question: how can I create directories based on a string?
Let's say the value of picture_path is AB\C\D\E_F_G\H\I.jpg
I would need another directory path for the destination of the created folders, and a function that somehow creates folders based on that string. How can I do that?
To read all XML files in a directory, you can modify your code as follows:
import os

dir_xml = r"path"
for path in os.listdir(dir_xml):
    if path.endswith(".xml"):
        with open(os.path.join(dir_xml, path), "r") as file:
            content = file.readlines()
            content = "".join(content)
            get_filename(content)
This code uses the os.listdir() function to get a list of all files in the directory specified by dir_xml. It then uses a for loop to iterate over the list of files, checking if each file ends with the .xml extension. If it does, it opens the file, reads its content, and passes it to the get_filename function.
To create directories based on a string, you can use the os.makedirs function. For example:
import os
picture_path = r'AB\C\D\E_F_G\H\I.jpg'
dest_path = r'path_to_destination'
os.makedirs(os.path.join(dest_path, os.path.dirname(picture_path)), exist_ok=True)
In this code, os.path.join is used to combine the dest_path and the directory portion of picture_path into a full path. os.path.dirname is used to extract the directory portion of picture_path. The os.makedirs function is then used to create the directories specified by the path, and the exist_ok argument is set to True to allow the function to succeed even if the directories already exist.
Finally, you can use the shutil library to copy the picture file to the destination and rename it, like this:
import shutil
src_file = os.path.join(src_path, picture_path)
dst_file = os.path.join(dest_path, picture_path)
shutil.copy(src_file, dst_file)
Here, src_file is the full path to the source picture file and dst_file is the full path to the destination. The shutil.copy function is then used to copy the file from the source to the destination.
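Putting the three pieces together, a rough end-to-end sketch (it assumes get_filename is changed to return the picture_path string; the folder names are placeholders):

import os
import shutil

dir_xml = r"path"                  # folder with the XML files
src_path = r"path_to_pictures"     # where the source pictures live (assumed)
dest_path = r"path_to_destination"

for name in os.listdir(dir_xml):
    if name.endswith(".xml"):
        with open(os.path.join(dir_xml, name), "r") as file:
            content = file.read()
        picture_path = get_filename(content)  # assumed to return the path string
        # create the target sub-folders, then copy the picture into them
        os.makedirs(os.path.join(dest_path, os.path.dirname(picture_path)), exist_ok=True)
        shutil.copy(os.path.join(src_path, picture_path),
                    os.path.join(dest_path, picture_path))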
You can use os.walk() for a recursive search of the files:

import os

dir_xml = r"path"
for root, dirs, files in os.walk(dir_xml):  # topdown=False
    for name in files:
        if name.endswith(".xml"):
            print(f"file path: {root}\n XML file: {name}")
            # join with root, otherwise open() looks in the current directory
            with open(os.path.join(root, name), 'r') as file:
                content = file.readlines()

python how to collect a specific file from a list of folders and save

I have many folders in a master folder as given below. Each folder contains a .JPG file. I would like to extract all these files and store them in this master folder.
My present code:
import os
import glob

os.chdir('Master folder')
extension = 'JPG'
jpg_files = [i for i in glob.glob('*.{}'.format(extension))]
This did not work.
To find the images in your tree, I would use os.walk. Below is a complete example of a 'find and move' function that moves all the files to the given path and creates a new filename for duplicate filenames.
The function checks, via add_index_to_filepath, whether a file already exists, and appends an index (n) to the path if so. For example: if image.jpg exists, the next one becomes image (1).jpg, the one after that image (2).jpg, and so on.
import os
import re
import shutil

def add_index_to_filepath(path):
    '''
    Check if a file exists, and append '(n)' if true.
    '''
    # If the path exists, go adjust it
    if os.path.exists(path):
        # pull apart your path and filenames
        folder, file = os.path.split(path)
        filename, extension = os.path.splitext(file)
        # discover the current index, and correct the filename
        try:
            regex = re.compile(r'\(([0-9]*)\)$')
            findex = regex.findall(filename)[0]
            filename = regex.sub('({})'.format(int(findex) + 1), filename)
        except IndexError:
            filename = filename + ' (1)'
        # Glue your path back together.
        new_path = os.path.join(folder, '{}{}'.format(filename, extension))
        # Recursively call your function, to keep verifying if it exists.
        return add_index_to_filepath(new_path)
    return path

def find_and_move_files(path, extension_list):
    '''
    Walk through a given path and move the files from the sub-dirs to the path.
    Upper- and lower-case are ignored. Duplicates get a new filename.
    '''
    files_moved = []
    # First walk through the path, to list all files.
    for root, dirs, files in os.walk(path, topdown=False):
        for file in files:
            # Is your extension wanted?
            extension = os.path.splitext(file)[-1].lower()
            if extension in extension_list:
                # Prepare your old and new path, and move
                old_path = os.path.join(root, file)
                new_path = add_index_to_filepath(os.path.join(path, file))
                if old_path != new_path:
                    shutil.move(old_path, new_path)
                # Let's keep track of what we moved, to return it in the end
                files_moved.append(new_path)
    return files_moved

path = '.'  # your filepath for the master-folder
extensions = ['.jpg', '.jpeg']  # There are some variations of a jpeg-file extension.
found_files = find_and_move_files(path, extensions)

How to get the files with the biggest size in the folders, change their name and save to a different folder

I need to get the file with the biggest size in each of several folders, change its name to the name of the folder it belongs to, and save it to a new folder. I have something like this and I got stuck:
import os

# Core settings
rootdir = 'C:\\Users\\X\\Desktop\\humps'
to_save = 'C:\\Users\\X\\Desktop\\new'

for root, dirs, files in os.walk(rootdir):
    new_list = []
    for file in files:
        if file.endswith(".jpg"):
            try:
                print(file)
                os.chdir(to_save)
                add_id = root.split("humps\\")[1]
                add_id = add_id.split("\\")[0]
                file_name = os.path.join(root, file)
                new_list.append(file_name)
                bigfile = max(new_list, key=lambda x: x.stat().st_size)
            except:
                pass
To make it clearer: let's say the name of a sub-folder is "elephant", and there are different elephant photos and subfolders in this elephant folder. I want to go through those photos and subfolders, find the elephant photo with the biggest size, name it "elephant", and save it to my target folder. Then repeat this for the other sub-folders such as lion, puma, etc.
How could I achieve what I want?
To find the biggest file and save it to another location:

import os
import shutil

f_list = []
root = "path/to/directory"
root = os.path.abspath(root)
for folder, subfolders, files in os.walk(root):
    for file in files:
        filePath = os.path.join(folder, file)
        f_list.append(filePath)
biggest_file = max(f_list, key=os.path.getsize)
new_path = "path/where/you/want/to/save"
shutil.copy(biggest_file, new_path)
If you want only images, then add one more condition in the loop:

for folder, subfolders, files in os.walk(root):
    for file in files:
        if file.endswith(".jpg"):
            filePath = os.path.join(folder, file)
            f_list.append(filePath)
To get the biggest file in every folder:

root = "demo"
root = os.path.abspath(root)

def test(path):
    big_files = []
    all_paths = [x[0] for x in os.walk(path)]
    for paths in all_paths:
        # os.listdir returns bare names; join them with the folder so the paths resolve
        f_list = [os.path.join(paths, f) for f in os.listdir(paths)]
        f_list = [f for f in f_list if os.path.isfile(f)]
        if len(f_list) > 0:
            big_files.append((paths, max(f_list, key=os.path.getsize)))
    return big_files

print(test(root))
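From there, copying each folder's biggest file to a target folder under the folder's own name could look like the following sketch (to_save is a placeholder):

import shutil

to_save = "path/to/target/folder"
for folder, big_file in test(root):
    # keep the original extension, but rename to the containing folder
    extension = os.path.splitext(big_file)[1]
    new_name = os.path.basename(folder) + extension
    shutil.copy(big_file, os.path.join(to_save, new_name))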
How to get the files with the biggest size in the folders, change their name and save to a different folder
Basically you already have a good description of what you need to do. You just need to follow it step by step:
get all files in some search directory
filter for relevant files ("*.jpg")
get their sizes
find the maximum
copy to new directory with name of search directory
IMO it's an important skill to be able to break down a task into smaller tasks. Then, you just need to implement the smaller tasks and combine:
def iterate_files_recursively(directory="."):
    for entry in os.scandir(directory):
        if entry.is_dir():
            for file in iterate_files_recursively(entry.path):
                yield file
        else:
            yield entry

files = iterate_files_recursively(subfolder_name)
I'd use os.scandir because it avoids building up a (potentially) huge list of files in memory and instead allows me (via a generator) to work one file at a time. Note that starting with 3.6 you can use the result of os.scandir as a context manager (with syntax).
images = itertools.filterfalse(lambda f: not f.path.endswith('.jpg'), files)
Filtering is relatively straightforward, except for the (IMO strange) choice of itertools.filterfalse, which only keeps elements for which its predicate returns False.
biggest = max(images, key=(lambda img: img.stat().st_size))
This is two steps in one: Get the maximum with the builtin max function, and use the file size as "key" to establish an order. Note that this raises a ValueError if you don't have any images ... so you might want to supply default=None or handle that exception.
shutil.copy(biggest.path, os.path.join(target_directory, subfolder_name + '.jpg'))
shutil.copy copies the file and some metadata. Instead of hardcoding path separators, please use os.path.join!
Now all of this assumes that you know the subfolder_name. You can scan for those easily, too:
def iterate_directories(directory='.'):
    for entry in os.scandir(directory):
        if entry.is_dir():
            yield entry
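Combined, the whole pipeline might look like the sketch below (search_directory and target_directory are placeholder names; imports assumed from the snippets above):

import itertools, os, shutil

search_directory = '.'    # folder containing the animal sub-folders (assumed)
target_directory = 'new'  # folder that receives the renamed copies (assumed)

for subfolder in iterate_directories(search_directory):
    files = iterate_files_recursively(subfolder.path)
    images = itertools.filterfalse(lambda f: not f.path.endswith('.jpg'), files)
    # default=None avoids the ValueError mentioned above for folders with no images
    biggest = max(images, key=lambda img: img.stat().st_size, default=None)
    if biggest is not None:
        shutil.copy(biggest.path, os.path.join(target_directory, subfolder.name + '.jpg'))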
Here's some code that does what you want. Instead of using the old os.walk function, it uses modern pathlib functions.
The heart of this code is the recursive biggest function. It scans all the files and directories in folder, saving the matching file names to the files list, and recursively searching any directories it finds. It then returns the path of the largest file that it finds, or None if no matching files are found.
from pathlib import Path
import shutil

def filesize(path):
    return path.stat().st_size

def biggest(folder, pattern):
    ''' Find the biggest file in folder that matches pattern.
        Search recursively in all subdirectories.
    '''
    files = []
    for f in folder.iterdir():
        if f.is_file():
            if f.match(pattern):
                files.append(f)
        elif f.is_dir():
            found = biggest(f, pattern)
            if found:
                files.append(found)
    if files:
        return max(files, key=filesize)

def copy_biggest(src, dest, pattern):
    ''' Find the biggest file in each folder in src that matches pattern
        and copy it to dest, using the folder's name as the new file name.
    '''
    for path in src.iterdir():
        if path.is_dir():
            found = biggest(path, pattern)
            if found:
                newname = dest / path.name
                print(path, ':', found, '->', newname)
                shutil.copyfile(found, newname)
You can call it like this:
rootdir = r'C:\Users\X\Desktop\humps'
to_save = r'C:\Users\X\Desktop\new'
copy_biggest(Path(rootdir), Path(to_save), '*.jpg')
Note that the copied files will have the same name as the top-level folder in rootdir that they were found in, with no file extension. If you want to give them a .jpg extension, you can change
newname = dest / path.name
to
newname = (dest / path.name).with_suffix('.jpg')
The shutil module on older versions of Python 3 doesn't understand pathlib paths. But that's easy enough to remedy. In the copy_biggest function, replace
shutil.copyfile(found, newname)
with
shutil.copyfile(str(found), str(newname))

How can get a list of files in a specific directory ignoring the symbolic links using python?

I need to process the filenames from a directory by creating a list of the filenames.
But my resulting list contains entries for symbolic links too. How can I get only the plain filenames in a particular directory using Python?
I have tried: os.walk, os.listdir, os.path.isfile
But all of them include the symbolic links of the form 'filename~' in the list :(
glob.glob adds the path to each entry, which I don't need.
I need to use it in a code like this:
files = os.listdir(folder)
for f in files:
    dosomething(f)  # e.g. find a similar file f in another folder
Any help? Or please redirect me to the right answer. Thanks
Edit: the tilde sign is at the end.
To get regular files in a directory:
import os
from stat import S_ISREG

for filename in os.listdir(folder):
    path = os.path.join(folder, filename)
    try:
        st = os.lstat(path)  # get info about the file (don't follow symlinks)
    except EnvironmentError:
        continue  # file vanished or permission error
    else:
        if S_ISREG(st.st_mode):  # is it a regular file?
            do_something(filename)
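On Python 3.6+, os.scandir offers the same check without manual lstat calls; a short sketch, reusing folder and do_something from above:

import os

with os.scandir(folder) as it:
    for entry in it:
        # follow_symlinks=False rejects symlinks even when they point at regular files
        if entry.is_file(follow_symlinks=False):
            do_something(entry.name)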
If you still see 'filename~' entries, it means they are not actually symlinks (a trailing ~ usually marks an editor backup file). Just filter them by name:
filenames = [f for f in os.listdir(folder) if not f.endswith('~')]
Or using fnmatch:
import fnmatch
filenames = fnmatch.filter(os.listdir(folder), '*[!~]')
You can use os.path.islink(yourfile) to check if yourfile is symlinked, and exclude it.
Something like this works for me:
folder = 'absolute_path_of_yourfolder'  # without a trailing /
res = []
for f in os.listdir(folder):
    absolute_f = os.path.join(folder, f)
    if not os.path.islink(absolute_f) and not os.path.isdir(absolute_f):
        res.append(f)
# res now holds the files that are neither symlinks nor directories

Extract files from zip without keep the top-level folder with python zipfile

I'm using the following code to extract the files from a zip file while keeping the directory structure:
zip_file = zipfile.ZipFile('archive.zip', 'r')
zip_file.extractall('/dir/to/extract/files/')
zip_file.close()
Here is a structure for an example zip file:
/dir1/file.jpg
/dir1/file1.jpg
/dir1/file2.jpg
At the end I want this:
/dir/to/extract/file.jpg
/dir/to/extract/file1.jpg
/dir/to/extract/file2.jpg
But it should ignore only if the zip file has a top-level folder with all files inside it, so when I extract a zip with this structure:
/dir1/file.jpg
/dir1/file1.jpg
/dir1/file2.jpg
/dir2/file.txt
/file.mp3
It should stay like this:
/dir/to/extract/dir1/file.jpg
/dir/to/extract/dir1/file1.jpg
/dir/to/extract/dir1/file2.jpg
/dir/to/extract/dir2/file.txt
/dir/to/extract/file.mp3
Any ideas?
If I understand your question correctly, you want to strip any common prefix directories from the items in the zip before extracting them.
If so, then the following script should do what you want:
import sys, os
from zipfile import ZipFile

def get_members(zip):
    parts = []
    # get all the path prefixes
    for name in zip.namelist():
        # only check files (not directories)
        if not name.endswith('/'):
            # keep list of path elements (minus filename)
            parts.append(name.split('/')[:-1])
    # now find the common path prefix (if any)
    prefix = os.path.commonprefix(parts)
    if prefix:
        # re-join the path elements
        prefix = '/'.join(prefix) + '/'
    # get the length of the common prefix
    offset = len(prefix)
    # now re-set the filenames
    for zipinfo in zip.infolist():
        name = zipinfo.filename
        # only keep entries longer than the prefix
        if len(name) > offset:
            # remove the common prefix
            zipinfo.filename = name[offset:]
            yield zipinfo

args = sys.argv[1:]
if len(args):
    zip = ZipFile(args[0])
    path = args[1] if len(args) > 1 else '.'
    zip.extractall(path, get_members(zip))
Read the entries returned by ZipFile.namelist() to see if they're in the same directory, and then open/read each entry and write it to a file opened with open().
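A sketch of that manual approach, using the question's paths and the same strip-the-common-prefix idea as above:

import os
import zipfile

with zipfile.ZipFile('archive.zip') as zf:
    names = [n for n in zf.namelist() if not n.endswith('/')]
    # commonprefix is character-wise, so cut it back to a whole directory
    prefix = os.path.commonprefix(names)
    prefix = prefix[:prefix.rfind('/') + 1]
    for name in names:
        target = os.path.join('/dir/to/extract', name[len(prefix):])
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
        with zf.open(name) as src, open(target, 'wb') as dst:
            dst.write(src.read())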
This might be a problem with the zip archive itself. In a Python prompt, try this to see if the files are in the correct directories in the zip file itself:
import zipfile

zf = zipfile.ZipFile("my_file.zip", 'r')
first_file = zf.filelist[0]
print(first_file.filename)
This should print something like "dir1".
Repeat the steps above, substituting an index of 1 into filelist, like so: first_file = zf.filelist[1]. This time the output should look like 'dir1/file1.jpg'. If this is not the case, then the zip file does not contain directories and will be unzipped into one single directory.
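To inspect every entry at once instead of indexing them one at a time, you can loop over infolist():

for info in zf.infolist():
    print(info.filename)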
Based on @ekhumoro's answer, I came up with a simpler function to extract everything at the same level; it is not exactly what you are asking, but I think it can help someone.
import os
from zipfile import ZipFile

def _basename_members(zip_file: ZipFile):
    for zipinfo in zip_file.infolist():
        zipinfo.filename = os.path.basename(zipinfo.filename)
        yield zipinfo

from_zip = "some.zip"
to_folder = "some_destination/"

with ZipFile(file=from_zip, mode="r") as zip_file:
    os.makedirs(to_folder, exist_ok=True)
    zip_infos = _basename_members(zip_file)
    zip_file.extractall(path=to_folder, members=zip_infos)
Basically you need to do two things:
Identify the root directory in the zip.
Remove the root directory from the paths of other items in the zip.
The following should retain the overall structure of the zip while removing the root directory:
import typing, zipfile

def _is_root(info: zipfile.ZipInfo) -> bool:
    if info.is_dir():
        parts = info.filename.split("/")
        # Handle directory names with and without trailing slashes.
        if len(parts) == 1 or (len(parts) == 2 and parts[1] == ""):
            return True
    return False

def _members_without_root(archive: zipfile.ZipFile, root_filename: str) -> typing.Generator:
    for info in archive.infolist():
        parts = info.filename.split(root_filename)
        if len(parts) > 1 and parts[1]:
            # We join using the root filename, because there might be a subdirectory with the same name.
            info.filename = root_filename.join(parts[1:])
            yield info

with zipfile.ZipFile("archive.zip", mode="r") as archive:
    # We will use the first directory with no more than one path segment as the root.
    root = next((info for info in archive.infolist() if _is_root(info)), None)
    if root:
        archive.extractall(path="/dir/to/extract/", members=_members_without_root(archive, root.filename))
    else:
        print("No root directory found in zip.")
