Filename string comparison in list search fails [Python]

I am trying to associate filepaths from two lists in Python. The files have part of their name in common, while the extension and some extra characters differ, and the position of those extra characters can vary. The files are also in different folders, so the full filepaths differ. What is exactly equal is their numbering index: 0033 or 0061, for example.
Example code:
original_files = ['C:/0001.jpg',
                  'C:/0033.jpg',
                  'C:/0061.jpg',
                  'C:/0080.jpg',
                  'C:/0204.jpg',
                  'C:/0241.jpg']
related_files = ['C:/0001_PM.png',
                 'C:/0033_PMA.png',
                 'C:/0033_NM.png',
                 'C:/0061_PMLTS.png',
                 'C:/0080_PM.png',
                 'C:/0080_RS.png',
                 'C:/0204_PM.png']

for idx, filename in enumerate(original_files):
    related_filename = [s for s in related_files if filename.rsplit('/', 1)[1][:-4] in s]
    print(related_filename)
At filename = 'C:/0241.jpg' it should return [], but instead it returns all the filenames from related_files.
For privacy reasons I didn't post the entire filepaths, just the names of the files. With this example the comparison works, but with the full filepaths it fails.
I suppose my comparison condition is not correct, but I don't know how to write it.
Note: I am looking for something with as few code lines as possible to do this.

I suggest something along the lines of:
from collections import defaultdict

original_files = ['C:/0001.jpg',
                  'C:/0033.jpg',
                  'C:/0061.jpg',
                  'C:/0080.jpg',
                  'C:/0204.jpg',
                  'C:/0241.jpg']
related_files = ['C:/0001_PM.png',
                 'C:/0033_PMA.png',
                 'C:/0033_NM.png',
                 'C:/0061_PMLTS.png',
                 'C:/0080_PM.png',
                 'C:/0080_RS.png',
                 'C:/0204_PM.png']

def key1(filename):
    return filename.rsplit('/', 1)[-1].rsplit('.', 1)[0]

def key2(filename):
    return key1(filename).split('_', 1)[0]

d = defaultdict(list)
for x in related_files:
    d[key2(x)].append(x)

for x in original_files:
    related = d.get(key1(x), [])
    print(x, '->', related)
In key1() and key2() you could alternately use os.path functions or pathlib.Path methods.
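For instance, a rough pathlib version of the same two key functions could look like this (Path.stem is the filename without its final extension):
from pathlib import Path

def key1(filename):
    # 'C:/0033_PMA.png' -> '0033_PMA'
    return Path(filename).stem

def key2(filename):
    # '0033_PMA' -> '0033'
    return key1(filename).split('_', 1)[0]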

Here's a solution that returns only the matched related_files.
import os, re

def get_index(filename):
    # Leading digits of the base filename, e.g. '0033' from '0033_PMA.png'
    m = re.match('([0-9]+)', os.path.split(filename)[1])
    return m.group(1) if m else False

# Wrap in set() so membership tests can run repeatedly; a bare filter object
# would be exhausted after its first use in Python 3.
indexes = set(filter(bool, map(get_index, original_files)))
[f for f in related_files if get_index(f) in indexes]
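For the sample lists above, every entry in related_files shares its leading index with some original file, so this comprehension returns all of them:
# ['C:/0001_PM.png', 'C:/0033_PMA.png', 'C:/0033_NM.png',
#  'C:/0061_PMLTS.png', 'C:/0080_PM.png', 'C:/0080_RS.png', 'C:/0204_PM.png']
With the real paths, any related file whose index has no counterpart in original_files would be dropped.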

Make use of defaultdict.
import os, re
from collections import defaultdict

stragglers = []
grouped_files = defaultdict(list)
file_index = re.compile('([0-9]+)')

for f in original_files + related_files:
    m = file_index.match(os.path.split(f)[1])
    if m:
        grouped_files[m.group(1)].append(f)
    else:
        stragglers.append(f)
You now have grouped_files, a dict (or dictionary-like object) of key-value pairs where the key is the regex matched part of the filename and the value is a list of matching filenames.
for x in grouped_files.items():
    print(x)
# ('0001', ['C:/0001.jpg', 'C:/0001_PM.png'])
# ('0033', ['C:/0033.jpg', 'C:/0033_PMA.png', 'C:/0033_NM.png'])
# ('0061', ['C:/0061.jpg', 'C:/0061_PMLTS.png'])
# ('0080', ['C:/0080.jpg', 'C:/0080_PM.png', 'C:/0080_RS.png'])
# ('0204', ['C:/0204.jpg', 'C:/0204_PM.png'])
# ('0241', ['C:/0241.jpg'])
In stragglers you have any filenames that didn't match your regex.
print(stragglers)
# []

For Python 3.x you can try this:
for origfiles in original_files:
    for relfiles in related_files:
        # Positions 3:7 hold the four-digit index right after the 'C:/' prefix
        if origfiles[3:7] == relfiles[3:7]:
            print(origfiles)
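If you want the matches collected instead of just printed, a minimal variation of the same idea (the matches dict name is just illustrative):
matches = {}
for origfiles in original_files:
    matches[origfiles] = [relfiles for relfiles in related_files
                          if origfiles[3:7] == relfiles[3:7]]
print(matches['C:/0241.jpg'])
# []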

Related

Find file in directory with the max number given a set of different file names

Problem Description
I have a list of files ["FA_1","FA_2","FB_1","FB_2","FC_1","FC_2"]. The list contains 3 different file name prefixes: FA, FB and FC. For each of FA, FB and FC, I am trying to retrieve the file with the max number. The following script does that, but it's complicated and ugly.
Is there a way to make it simpler?
A similar question was asked in Find file in directory with the highest number in the filename, but that question only deals with a single file name.
#!/usr/bin/env python
import sys
import os
from collections import defaultdict

def load_newest_files():
    # Retrieve all files for the component in question
    list_of_files = defaultdict(list)
    new_list_of_files = []
    files = ["FA_1", "FA_2", "FB_1", "FB_2", "FC_1", "FC_2"]
    # Split files and take the filename without the id.
    # The files are not separated in buckets of FA, FB and FC.
    # I can now retrieve the file with the max number and put
    # it in a list.
    for file in files:
        list_of_files[file.split("_")[0]].append(file)
    for key, value in list_of_files.items():
        new_list_of_files.append(max(value))
    print(new_list_of_files)

def main():
    load_newest_files()

if __name__ == "__main__":
    main()
You can use itertools.groupby and create custom grouping and maximum functions for the key arguments. Example is shown below.
from itertools import groupby

# files and new_list_of_files as defined in the question
files = ["FA_1", "FA_2", "FB_1", "FB_2", "FC_1", "FC_2"]
new_list_of_files = []

def custom_group(item):
    x, _ = item.split("_")
    return x

def custom_max(item):
    _, y = item.split("_")
    return int(y)

for _, v in groupby(files, key=custom_group):
    val = max(v, key=custom_max)
    new_list_of_files.append(val)

print(new_list_of_files)
> ['FA_2', 'FB_2', 'FC_2']
Please make sure to read the caveats surrounding itertools.groupby regarding the sort order of your input data.
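For instance, groupby only groups consecutive items, so input that is not sorted (or at least grouped) by the grouping key produces repeated groups:
from itertools import groupby

unsorted_files = ["FA_1", "FB_1", "FA_2", "FB_2"]
print([k for k, _ in groupby(unsorted_files, key=lambda f: f.split("_")[0])])
# ['FA', 'FB', 'FA', 'FB']  -- sort by the same key first to avoid this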
You can use the re module and sort(). An example is shown below.
import re

def load_newest_files():
    files = ["FA_1", "FA_2", "FB_1", "FB_2", "FC_1", "FC_2"]
    # Sort the list
    files.sort()
    concat_files = " ".join(files)
    a = dict(re.findall('(.*?)_([0-9])[ ]?', concat_files))
    new_list_of_files = ["%s_%s" % (i, j) for i, j in a.items()]
    return new_list_of_files

def main():
    newest_files = load_newest_files()
    print(newest_files)

if __name__ == "__main__":
    main()
Why do you think it is complicated and ugly?
You could use a list comprehension instead of these 3 lines:
new_list_of_files = []
# [...]
for key, value in list_of_files.items():
    new_list_of_files.append(max(value))
Like so:
new_list_of_files = [max(value) for value in list_of_files.values()]
Alternatively, you can sort the list of files in reverse and then iterate over it, adding only the first instance of each filename prefix (which will be the highest) to a new list. A set keeps track of which prefixes have already been added.
files = ["FA_1", "FA_2", "FB_1", "FB_2", "FC_1", "FC_2"]
files.sort(reverse=True)
already_seen = set()
new_filenames = []
for file in files:
prefix = file.split("_")[0]
if prefix not in already_seen:
already_seen.add(prefix)
new_filenames.append(file)
print(new_filenames)
Output: ['FC_2', 'FB_2', 'FA_2']
You can get it down to 2 lines with a complicated and ugly list comprehension (the walrus operator requires Python 3.8+):
files = ["FA_1", "FA_2", "FB_1", "FB_2", "FC_1", "FC_2"]
already_seen = set()
new_filenames = [(file, already_seen.add(prefix))[0] for file in files[::-1] if (prefix := file.split("_")[0]) not in already_seen]
print(new_filenames)
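For the sample list this prints the same files as the loop above, found in the reversed iteration order:
Output: ['FC_2', 'FB_2', 'FA_2']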

how to sort a list containing filenames?

I have a list with ~1000+ values in it. The values are the names of the files in a folder, as given by os.listdir(folder_path)
code looks like this:
import os
folder_path = "some path here"
filelist = os.listdir(folder_path)
print(filelist)
Now when I look at the printed list, I see that the list isn't sorted by name. The filenames are something like ["text-1-1.txt","txt-1-23.txt","txt-1-32.txt","txt-1-10.txt","txt-2-1.txt","txt-2-32.txt"...]
Also, I know that there are filenames that increment by one, like: text-1-1.txt, text-1-2.txt, text-1-3.txt,.... text-2-1.txt, text-2-2.txt,...
I have tried these two methods to sort the list: new_list = sorted(filelist) and filelist.sort()
Neither worked; the list came out the same as the original. How can I sort this list? Do I have to manually write a sorting algorithm (like bubble sort or selection sort)?
You can run it this way:
import os
folder_path = "some path here"
filelist = os.listdir(folder_path)
filelist.sort() #Added this line
print(filelist)
By default, Python already sorts strings in lexicographical order, but uppercase letters all sort before lowercase letters. If you want to sort strings and ignore case, you can do:
new_filelist = sorted(filelist, key=str.lower)
You can create a custom function for this, that creates a tuple of ints from the filenames:
>>> def sl_no(s):
...     return tuple(map(int, s.split('.')[0].rsplit('-', 2)[-2:]))
>>> sl_no("text-1-1.txt")
(1, 1)
>>> sorted(filelist, key=sl_no)
['text-1-1.txt',
'txt-1-10.txt',
'txt-1-23.txt',
'txt-1-32.txt',
'txt-2-1.txt',
'txt-2-32.txt']
Or, you can use re:
>>> import re
>>> sorted(filelist, key=lambda x: tuple(map(int, re.findall(r'\d+', x))))
['text-1-1.txt',
'txt-1-10.txt',
'txt-1-23.txt',
'txt-1-32.txt',
'txt-2-1.txt',
'txt-2-32.txt']
In order to support all kinds of file names that contain numbers, you can define a sortKey function that isolates the numeric parts of the names and right-justifies them (with leading zeros) for sorting purposes:
import re

def sortKey(n):
    return "".join([s, f"{s:>010}"][s.isdigit()] for s in re.split(r"(\d+)", n))
output:
names = ["text-1-1.txt","txt-1-23.txt","txt-1-32.txt","txt-1-10.txt",
"txt-2-1.txt","txt-2-32.txt"]
print(sorted(names,key=sortKey))
# ['text-1-1.txt', 'txt-1-10.txt', 'txt-1-23.txt', 'txt-1-32.txt',
# 'txt-2-1.txt', 'txt-2-32.txt']
names = ["log2020/12/23.txt","log2021/1/3.txt","log2021/02/1.txt",
"log2021/1/1.txt","log2021/1/13.txt"]
print(sorted(names,key=sortKey))
# ['log2020/12/23.txt', 'log2021/1/1.txt', 'log2021/1/3.txt',
# 'log2021/1/13.txt', 'log2021/02/1.txt']

Python: how to search for specific "string" in directory name (not individual file names)

I want to create a list of all the filepath names that match a specific string, e.g. "04_DEM", so I can do further processing on the files inside those directories.
e.g.
INPUT
C:\directory\NewZealand\04DEM\DEM_CD23_1232.tif
C:\directory\Australia\04DEM\DEM_CD23_1233.tif
C:\directory\NewZealand\05DSM\DSM_CD23_1232.tif
C:\directory\Australia\05DSM\DSM_CD23_1232.tif
WANTED OUTPUT
C:\directory\NewZealand\04DEM\
C:\directory\Australia\04DEM\
This makes sure that only those files are processed, as some other files in the directories also have the same string "DEM" included in their filename, which I do not want to modify.
This is my bad attempt due to being a rookie with Py code
import os
for dirnames in os.walk('D:\Canterbury_2017Copy'):
print dirnames
if dirnames=='04_DEM' > listofdirectoriestoprocess.txt
print "DONE CHECK TEXT FILE"
You can use os.path for this:
import os

# Note: splitting on os.sep assumes the code runs on Windows, where os.sep is '\\'.
lst = [r'C:\directory\NewZealand\04DEM\DEM_CD23_1232.tif',
       r'C:\directory\Australia\04DEM\DEM_CD23_1233.tif',
       r'C:\directory\NewZealand\05DSM\DSM_CD23_1232.tif',
       r'C:\directory\Australia\05DSM\DSM_CD23_1232.tif']

def filter_paths(lst, x):
    return [os.path.split(i)[0] for i in lst
            if os.path.normpath(i).split(os.sep)[3] == x]

res = list(filter_paths(lst, '04DEM'))
# ['C:\\directory\\NewZealand\\04DEM',
#  'C:\\directory\\Australia\\04DEM']
Use in to check if a required string is in another string.
This is one quick way:
new_list = []
for path in path_list:
    if '04DEM' in path:
        new_list.append(path)
Demo:
s = 'C:/directory/NewZealand/04DEM/DEM_CD23_1232.tif'
if '04DEM' in s:
    print(True)
# True
Make sure you use / or \\ as directory separator instead of \ because the latter escapes characters.
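For example, raw string literals are another common way to keep backslashes intact:
p1 = 'C:/directory/NewZealand/04DEM'     # forward slashes
p2 = 'C:\\directory\\NewZealand\\04DEM'  # escaped backslashes
p3 = r'C:\directory\NewZealand\04DEM'    # raw string literal
print(p1, p2, p3, sep='\n')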
First, you select via regex using re, and then use pathlib:
import os
import re
import pathlib

# list_of_files is assumed to hold the paths from the question.
pattern = re.compile('04DEM')
# Use pattern.search() if the pattern appears somewhere IN the string.
# Use pattern.match() if the pattern must match at the START of the string.
# Apply the correct function to your use case.
files = [s for s in list_of_files if pattern.search(s)]

all_pruned_paths = set()
for p in files:
    total = ""
    for d in pathlib.Path(p).parts:
        total = os.path.join(total, d)
        if pattern.search(d):
            break
    all_pruned_paths.add(total)

result = list(all_pruned_paths)
This is more robust than using in because you might need to form more complicated queries in the future.
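For instance, a single compiled pattern could already cover several directory codes at once (the pattern below is just an illustration):
import re

pattern = re.compile(r'0[45](DEM|DSM)')
print(bool(pattern.search(r'C:\directory\NewZealand\04DEM\DEM_CD23_1232.tif')))
# True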

List all files in a directory in order of date as found inside the file names?

I have various tar files in a Desktop folder (Ubuntu).
The filename is like this:
esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-**05222017**-4.tar
The boldfaced part is the date. I want to sort the files in date order, most recent first.
Is there a simple python solution for this?
import glob
import datetime
import re

timeformat = "%m%d%Y"
regex = re.compile(r"^esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-(\d+)")

def gettimestamp(thestring):
    m = regex.search(thestring)
    return datetime.datetime.strptime(m.groups()[0], timeformat)

list_of_filenames = ['esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05212017-4',
                     'esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05202017-4']

for fn in sorted(list_of_filenames, key=gettimestamp):
    print(fn)
No, there is not a simple Python function for this. However, there are reasonably simple building blocks from which you can make a readable solution.
Write a function to extract the date and rearrange it to be useful as a sort key: find the last two hyphens in the file name, grab the string between them, and then rearrange the digits in the format yyyymmdd (year-month-day). Return that string or integer (either will work) as the function's value.
For your main routine, collect all the file names in a list (or make a generator) and sort them, using the value of that function as the sort key.
See the sorting wiki for some implementation details.
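A minimal sketch of that recipe, assuming the date is always the second-to-last hyphen-separated field in MMDDYYYY form (the folder path is hypothetical):
import glob

def date_key(filename):
    # '...-05222017-4.tar' -> '05222017' -> '20170522' (yyyymmdd sorts chronologically)
    mmddyyyy = filename.rsplit('-', 2)[-2]
    return mmddyyyy[4:] + mmddyyyy[:4]

tar_files = glob.glob('/home/you/Desktop/*.tar')  # hypothetical folder
for name in sorted(tar_files, key=date_key, reverse=True):  # most recent first
    print(name)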
As Adam Smith has pointed out, you need the list of files to work with.
import glob, os
import datetime
import re

timeformat = "%m%d%Y"
# Eight digits for the date, then '-<counter>.tar' at the end of the name
regex = re.compile(r"(\d{8})-\d+\.tar$")

def gettimestamp(thestring):
    m = regex.search(thestring)
    if m:
        return datetime.datetime.strptime(m.groups()[0], timeformat)
    else:
        # Files without a recognisable date sort to the front
        return datetime.datetime.min

list_of_filenames = os.listdir('/home/james/Desktop/tarfolder')
for fn in sorted(list_of_filenames, key=gettimestamp):
    print(fn)
Edit: As Martineu has noticed, the hash might be different from the one you indicated, so it would be easier to discard the beginning of the name in advance.
You don't need to parse the date, or even use regex for that matter. If the file names are structured as you say, it's sufficient to do just:
filenames = ['esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05212017-4',
'esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05202017-4',
'esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-bad_date-4',]
def parse_date(name, offset=-10):
    try:
        date_str = name[offset:offset+8]
        return int(date_str[-4:] + date_str[:2] + date_str[2:4])
    except (IndexError, TypeError, ValueError):  # invalid file name
        return -1

sorted_list = [x[1] for x in sorted((parse_date(l), l) for l in filenames) if x[0] != -1]
# ['esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05202017-4',
#  'esarchive--James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05212017-4']
UPDATE - I've added the offset argument to specify where in the file name your date begins. In the list you've posted it begins 10 characters from the back (the default), but if you had a .tar extension after the name, as in your initial example, you'd account for those 4 extra characters as well and use an offset of -14:
names = ['James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05212017-4.tar',
'James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05202017-4.tar',
'James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-bad_date-4.tar']
sorted_list = [x[1] for x in sorted((parse_date(l, -14), l) for l in names) if x[0] != -1]
# ['James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05202017-4.tar',
# 'James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05212017-4.tar']
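Since the question asks for the most recent file first, the same key tuples can simply be sorted in reverse:
newest_first = [x[1] for x in sorted(((parse_date(l, -14), l) for l in names), reverse=True)
                if x[0] != -1]
# ['James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05212017-4.tar',
#  'James-AB-Test226-8037affd-06d1-4c61-a91f-816ec9cb825f-05202017-4.tar']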

Python: How to find most common elements of a list of files

First of all, sorry for the easy question, but I cannot figure out the easiest way to code my problem.
I have a directory with several different files that share common elements (the values _25, _26, _28, etc.), such as:
xxxxx_25.txt
xxxxx_26.txt
xxxxx_27.txt
xxxxx_28.txt
yyyyy_25.txt
yyyyy_26.txt
yyyyy_27.txt
yyyyy_29.txt
mmmmm_25.txt
mmmmm_26.txt
mmmmm_27.txt
mmmmm_30.txt
I wish to get lists as
xxxxx_25.txt
yyyyy_25.txt
mmmmm_25.txt
xxxxx_26.txt
yyyyy_26.txt
mmmmm_26.txt
xxxxx_27.txt
yyyyy_27.txt
mmmmm_27.txt
xxxxx_28.txt
yyyyy_29.txt
mmmmm_30.txt
import re

list_with_file_names = 'xxxxx_25.txt xxxxx_26.txt xxxxx_27.txt xxxxx_28.txt yyyyy_25.txt yyyyy_26.txt yyyyy_27.txt yyyyy_29.txt mmmmm_25.txt mmmmm_26.txt mmmmm_27.txt mmmmm_30.txt'.split()

def get_number_and_prefix(text):
    # 'xxxxx_25.txt' -> (25, 'xxxxx'); the number comes first so it dominates the sort
    g = re.match(r'(.*)_(\d+)', text)
    return (int(g.group(2)), g.group(1))

nice_list = sorted(list_with_file_names, key=get_number_and_prefix)
Tuples returned from get_number_and_prefix will be sorted first by the number, and then by the prefix.
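With the key above, the sample list sorts to:
print(nice_list)
# ['mmmmm_25.txt', 'xxxxx_25.txt', 'yyyyy_25.txt',
#  'mmmmm_26.txt', 'xxxxx_26.txt', 'yyyyy_26.txt',
#  'mmmmm_27.txt', 'xxxxx_27.txt', 'yyyyy_27.txt',
#  'xxxxx_28.txt', 'yyyyy_29.txt', 'mmmmm_30.txt']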
If, instead, you want to group based on the number in filename, you can use something like this:
def update_dict_with_file(dict_, filename):
    # Key on the full number after the underscore, e.g. '25' from 'xxxxx_25.txt'
    g = re.match(r'.*_(\d+)', filename)
    key = g.group(1)
    t = dict_.setdefault(key, [])
    t.append(filename)

mydict = {}
[update_dict_with_file(mydict, filename)
 for filename in list_with_file_names]
mydict now contains numbers from file names as keys, and lists with file names as values
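For example (dicts preserve insertion order on Python 3.7+):
for number, names in mydict.items():
    print(number, names)
# 25 ['xxxxx_25.txt', 'yyyyy_25.txt', 'mmmmm_25.txt']
# 26 ['xxxxx_26.txt', 'yyyyy_26.txt', 'mmmmm_26.txt']
# 27 ['xxxxx_27.txt', 'yyyyy_27.txt', 'mmmmm_27.txt']
# 28 ['xxxxx_28.txt']
# 29 ['yyyyy_29.txt']
# 30 ['mmmmm_30.txt']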
Edit
To summarise all the answers so far: all you need is to build a sorted list out of your list, using a key-getter function that extracts whatever you want from your filenames. You can do it with either a fancy one-liner using itertools + a list comprehension, or a lengthier for loop (no yield anywhere?). But basically they are all the same. No rocket science.
This will do it:
list_of_files = [
'xxxxx_25.txt',
'xxxxx_26.txt',
'xxxxx_27.txt',
'xxxxx_28.txt',
'yyyyy_25.txt',
'yyyyy_26.txt',
'yyyyy_27.txt',
'yyyyy_29.txt',
'mmmmm_25.txt',
'mmmmm_26.txt',
'mmmmm_27.txt',
'mmmmm_30.txt',
]
import re
regex = re.compile(r'_([0-9]+)\.txt$')

def keyfn(name):
    match = regex.search(name)
    if match is None:
        return None
    else:
        return match.group(1)

import itertools
for (key, group) in itertools.groupby(sorted(list_of_files, key=keyfn), keyfn):
    print([x for x in group])
or if you want a list of lists as a result, replace the for loop with:
[list(group) for key, group in itertools.groupby(sorted(list_of_files, key=keyfn), keyfn)]
#Considering your list of files is as follows
ur_file_list = """xxxxx_25.txt
xxxxx_26.txt
xxxxx_27.txt
xxxxx_28.txt
yyyyy_25.txt
yyyyy_26.txt
yyyyy_27.txt
yyyyy_29.txt
mmmmm_25.txt
mmmmm_26.txt
mmmmm_27.txt
mmmmm_30.txt"""
# Based on the pattern, you can get the key, assuming you need the part of the
# filename (without extension) after the underscore. This gives you that part without regex.
import os
key = lambda e: os.path.splitext(e)[0].split("_")[-1]
from itertools import groupby
#On a sorted list, group on the above key function
#And generate a list of these groups
[list(group) for _, group in groupby(sorted(ur_file_list.splitlines(), key = key), key = key)]
[['xxxxx_25.txt', 'yyyyy_25.txt', 'mmmmm_25.txt'], ['xxxxx_26.txt', 'yyyyy_26.txt', 'mmmmm_26.txt'], ['xxxxx_27.txt', 'yyyyy_27.txt', 'mmmmm_27.txt'], ['xxxxx_28.txt'], ['yyyyy_29.txt'], ['mmmmm_30.txt']]
The use of collections.defaultdict comes in very handy for this task.
In [1]: import re; from collections import defaultdict
In [2]: filenames
Out[2]:
['xxxxx_25.txt',
'xxxxx_26.txt',
'xxxxx_27.txt',
'xxxxx_28.txt',
'yyyyy_25.txt',
'yyyyy_26.txt',
'yyyyy_27.txt',
'yyyyy_29.txt',
'mmmmm_25.txt',
'mmmmm_26.txt',
'mmmmm_27.txt',
'mmmmm_30.txt']
In [3]: d = defaultdict(list)
In [4]: for filename in filenames:
   ....:     m = re.search(r'_(\d+)\.txt$', filename)
   ....:     if m:
   ....:         d[m.group(1)].append(filename)
In [5]: [sorted(filename_list) for filename_list in d.values()]
Out[5]:
[['mmmmm_25.txt', 'xxxxx_25.txt', 'yyyyy_25.txt'],
 ['mmmmm_26.txt', 'xxxxx_26.txt', 'yyyyy_26.txt'],
 ['mmmmm_27.txt', 'xxxxx_27.txt', 'yyyyy_27.txt'],
 ['xxxxx_28.txt'],
 ['yyyyy_29.txt'],
 ['mmmmm_30.txt']]
