searching specific string in list - python

How to search for every string in a list that starts with a specific string like:
path = (r"C:\Users\Example\Desktop")
desktop = os.listdir(path)
print(desktop)
#['faf.docx', 'faf.txt', 'faad.txt', 'gas.docx']
So my question is: how do i filter from every file that starts with "fa"?

For this specific cases, involving filenames in one directory, you can use globbing:
import glob
import os
path = (r"C:\Users\Example\Desktop")
pattern = os.path.join(path, 'fa*')
files = glob.glob(pattern)

This code filters all items out that start with "fa" and stores them in a separate list
filtered = [item for item in path if item.startswith("fa")]

All strings have a .startswith() method!
results = []
for value in os.listdir(path):
if value.startswith("fa"):
results.append(value)

Related

Python grab substring between two specific characters

I have a folder with hundreds of files named like:
"2017_05_S2B_7VEG_20170528_0_L2A_B01.tif"
Convention:
year_month_ID_zone_date_0_L2A_B01.tif ("_0_L2A_B01.tif", and "zone" never change)
What I need is to iterate through every file and build a path based on their name in order to download them.
For example:
name = "2017_05_S2B_7VEG_20170528_0_L2A_B01.tif"
path = "2017/5/S2B_7VEG_20170528_0_L2A/B01.tif"
The path convention needs to be: path = year/month/ID_zone_date_0_L2A/B01.tif
I thought of making a loop which would "cut" my string into several parts every time it encounters a "_" character, then stitch the different parts in the right order to create my path name.
I tried this but it didn't work:
import re
filename =
"2017_05_S2B_7VEG_20170528_0_L2A_B01.tif"
try:
found = re.search('_(.+?)_', filename).group(1)
except AttributeError:
# _ not found in the original string
found = '' # apply your error handling
How could I achieve that on Python ?
Since you only have one separator character, you may as well simply use Python's built in split function:
import os
items = filename.split('_')
year, month = items[:2]
new_filename = '_'.join(items[2:])
path = os.path.join(year, month, new_filename)
Try the following code snippet
filename = "2017_05_S2B_7VEG_20170528_0_L2A_B01.tif"
found = re.sub('(\d+)_(\d+)_(.*)_(.*)\.tif', r'\1/\2/\3/\4.tif', filename)
print(found) # prints 2017/05/S2B_7VEG_20170528_0_L2A/B01.tif
No need for a regex -- you can just use split().
filename = "2017_05_S2B_7VEG_20170528_0_L2A_B01.tif"
parts = filename.split("_")
year = parts[0]
month = parts[1]
Maybe you can do like this:
from os import listdir, mkdir
from os.path import isfile, join, isdir
my_path = 'your_soure_dir'
files_name = [f for f in listdir(my_path) if isfile(join(my_path, f))]
def create_dir(files_name):
for file in files_name:
month = file.split('_', '1')[0]
week = file.split('_', '2')[1]
if not isdir(my_path):
mkdir(month)
mkdir(week)
### your download code
filename = "2017_05_S2B_7VEG_20170528_0_L2A_B01.tif"
temp = filename.split('_')
result = "/".join(temp)
print(result)
result is
2017/05/S2B/7VEG/20170528/0/L2A/B01.tif

How to create a list of images and their names in a folder

I am using the following code to create a list of image files with tif extension. The current result of the code is a list with the address of the .tif files.
raster_list=[]
def getFiles(path):
for file in os.listdir(path):
if file.endswith(".tif"):
raster_list.append(os.path.join(path, file))
getFiles(fpath)
print(raster_list)
len(raster_list)
result:
['C:/dataset/test\\ras1.tif', 'C:dataset/test\\ras2.tif', 'C:/dataset/test\\ras3.tif', 'C:/dataset/test\\ras4.tif', 'C:/dataset/test\\ras5.tif']
How can I revise the code to create two lists a) name of the file with .tif b) name of the file. here is the example:
raster_list = ['ras1.tif','ras2.tif','ras3.tif','ras4.tif', 'ras5.tif' ]
raster_name = ['ras1','ras2','ras3','ras4', 'ras5']
edit to the code from solutions:
from os import path
raster_list=[]
def getFiles(path):
for file in os.listdir(path):
if file.endswith(".tif"):
raster_list.append(file) #remove the os.join, just take the file name
getFiles(fpath)
raster_names = [path.splitext(x)[0] for x in raster_list]
print(raster_list)
print(raster_names)
result:
['ras1.tif', 'ras2.tif', 'ras3.tif', 'ras4.tif', 'ras5.tif']
['ras1', 'ras2', 'ras3', 'ras4', 'ras5']
Personally, I would use pathlib because it's OS independent and reliable:
>>> from pathlib import Path
>>> files = ['/tmp/ras1.tif', '/tmp/ras2.tif', '/tmp/ras3.tif']
>>> [Path(file).name for file in files]
['ras1.tif', 'ras2.tif', 'ras3.tif']
>>> [Path(file).stem for file in files]
['ras1', 'ras2', 'ras3']
For the first list you want you can just use the function split("") on each one of the term of your list and use the last element of the list created by split.
And for the second one you can just take the first list you've created and split on ".", you will just have to save the first element of the list.
L = ['C:/dataset/test\\ras1.tif', 'C:dataset/test\\ras2.tif', 'C:/dataset/test\\ras3.tif', 'C:/dataset/test\\ras4.tif', 'C:/dataset/test\\ras5.tif']
raster_list = []
raster_name = []
for i in L:
a = i.split("\\")[1]
raster_list.append(a)
raster_name.append(a.split(".")[0])
print(raster_list, raster_name)
But you can also try to not include the path in rather_list like that:
raster_list=[]
def getFiles(path):
for file in os.listdir(path):
if file.endswith(".tif"):
raster_list.append(file) #remove the os.join, just take the file name
getFiles(fpath)
print(raster_list)
len(raster_list)
You can merge the two to have this:
raster_list=[]
raster_names=[]
def getFiles(path):
for file in os.listdir(path):
if file.endswith(".tif"):
raster_list.append(file)
raster_names.append(file.split(".")[0])
getFiles(fpath)
print(raster_list, raster_names)
split by '\', take the last element and append it to first list,
then split that by '.', take first element and append it to second list?
Alternatively, you can use these in-built functions: https://docs.python.org/3/library/os.path.html
You can use the os.path() methods to achieve what you want.
import os
raster_list = []
raster_name = []
def getFiles(path):
for file in os.listdir(path):
if file.endswith(".tif"):
raster_list.append(os.path.basename(file))
raster_name.append(os.path.splitext(file)[0])
getFiles(fpath)
print(raster_list)
print(raster_name)
len(raster_list)

Verify the format of a filename in Python

Every week I get two files with following pattern.
EMEA_{sample}_Tracker_{year}_KW{week}
E.g.
EMEA_G_Tracker_2019_KW52.xlsx
EMEA_BC_Tracker_2019_KW52.xlsx
Next files would look like these
EMEA_G_Tracker_2020_KW1.xlsx
EMEA_BC_Tracker_2020_KW1.xlsx
Placeholders:
sample = G or BC
year = current year [YYYY]
week = calendar week [0 - ~52]
The only changes are made in the placeholders, everything else will stay the same.
How can I extract these values from the filename and check if the filename has this format?
Right now I only read all files using os.walk():
path_files = "Files/"
files = []
for (_, _, filenames) in walk(path_files):
files.extend(filenames)
break
If filename is the name of the file you've got:
import re
result = re.match(r'EMEA_(.*?)_Tracker_(\d+)_KW(\d+)', filename)
sample, year, week = result.groups()
Here is an example of how to collect all files matching your pattern into a list using regex and list comprehension. Then you can use the list as you wish in later code.
import os
import re
# Compile the regular expression pattern.
re_emea = re.compile('^EMEA_(G|BC)_Tracker_20\d{2}_KW\d{1,2}.xlsx$')
# Set path to be searched.
path = '/home/username/Desktop/so/emea_files'
# Collect all filenames matching the pattern into a list.
files = [f for f in os.listdir(path) if re_emea.match(f)]
# View the results.
print(files)
All files in the directory:
['EMEA_G_Tracker_2020_KW2.xlsx',
'other_file_3.txt',
'EMEA_G_Tracker_2020_KW1.xlsx',
'other_file_2.txt',
'other_file_5.txt',
'other_file_4.txt',
'EMEA_BC_Tracker_2019_KW52.xlsx',
'other_file_1.txt',
'EMEA_G_Tracker_2019_KW52.xlsx',
'EMEA_BC_Tracker_2020_KW2.xlsx',
'EMEA_BC_Tracker_2020_KW1.xlsx']
The results from pattern matching:
['EMEA_G_Tracker_2020_KW2.xlsx',
'EMEA_G_Tracker_2020_KW1.xlsx',
'EMEA_BC_Tracker_2019_KW52.xlsx',
'EMEA_G_Tracker_2019_KW52.xlsx',
'EMEA_BC_Tracker_2020_KW2.xlsx',
'EMEA_BC_Tracker_2020_KW1.xlsx']
Hope this helps! If not, just give me a shout.

Extracting numbers from a filename string in python

I have a number of html files in a directory. I am trying to store the filenames in a list so that I can use it later to compare with another list.
Eg: Prod224_0055_00007464_20170930.html is one of the filenames. From the filename, I want to extract '00007464' and store this value in a list and repeat the same for all the other files in the directory. How do I go about doing this? I am new to Python and any help would be greatly appreciated!
Please let me know if you need more information to answer the question.
Split the filename on underscores and select the third element (index 2).
>>> 'Prod224_0055_00007464_20170930.html'.split('_')[2]
'00007464'
In context that might look like this:
nums = [f.split('_')[2] for f in os.listdir(dir) if f.endswith('.html')]
you may try this (assuming you are in the folder with the files:
import os
num_list = []
r, d, files = os.walk( '.' ).next()
for f in files :
parts = f.split('_') # now `parts` contains ['Prod224', '0055', '00007464', '20170930.html']
print parts[2] # this outputs '00007464'
num_list.append( parts[2] )
Assuming you have a certain pattern for your files, you can use a regex:
>>> import re
>>> s = 'Prod224_0055_00007464_20170930.html'
>>> desired_number = re.findall("\d+", s)[2]
>>> desired_number
'00007464'
Using a regex will help you getting not only that specific number you want, but also other numbers in the file name.
This will work if the name of your files follow the pattern "[some text][number]_[number]_[desired_number]_[a date].html". After getting the number, I think it will be very simple to use the append method to add that number to any list you want.

Python: how to search for specific "string" in directory name (not individual file names)

I want to create a list of all the filepath names that match a specific string e.g. "04_DEM" so I can do further processing on the files inside those directories?
e.g.
INPUT
C:\directory\NewZealand\04DEM\DEM_CD23_1232.tif
C:\directory\Australia\04DEM\DEM_CD23_1233.tif
C:\directory\NewZealand\05DSM\DSM_CD23_1232.tif
C:\directory\Australia\05DSM\DSM_CD23_1232.tif
WANTED OUTPUT
C:\directory\NewZealand\04DEM\
C:\directory\Australia\04DEM\
This makes sure that only those files are processed, as some other files in the directories also have the same string "DEM" included in their filename, which I do not want to modify.
This is my bad attempt due to being a rookie with Py code
import os
for dirnames in os.walk('D:\Canterbury_2017Copy'):
print dirnames
if dirnames=='04_DEM' > listofdirectoriestoprocess.txt
print "DONE CHECK TEXT FILE"
You can use os.path for this:
import os
lst = [r'C:\directory\NewZealand\04DEM\DEM_CD23_1232.tif',
r'C:\directory\Australia\04DEM\DEM_CD23_1233.tif',
r'C:\directory\NewZealand\05DSM\DSM_CD23_1232.tif',
r'C:\directory\Australia\05DSM\DSM_CD23_1232.tif']
def filter_paths(lst, x):
return [os.path.split(i)[0] for i in lst if os.path.normpath(i).split(os.sep)[3] == x]
res = list(filter_paths(lst, '04DEM'))
# ['C:\\directory\\NewZealand\\04DEM',
# 'C:\\directory\\Australia\\04DEM']
Use in to check if a required string is in another string.
This is one quick way:
new_list = []
for path in path_list:
if '04DEM' in path:
new_list.append(path)
Demo:
s = 'C:/directory/NewZealand/04DEM/DEM_CD23_1232.tif'
if '04DEM' in s:
print(True)
# True
Make sure you use / or \\ as directory separator instead of \ because the latter escapes characters.
First, you select via regex using re, and then use pathlib:
import re
import pathlib
pattern = re.compile('04DEM')
# You use pattern.search() if s is IN the string
# You use pattern.match() if s COMPLETELY matches the string.
# Apply the correct function to your use case.
files = [s in list_of_files if pattern.search(s)]
all_pruned_paths = set()
for p in files:
total = ""
for d in pathlib.Path(p):
total = os.path.join(total, d)
if pattern.search(s):
break
all_pruned_paths.add(total)
result = list(all_pruned_paths)
This is more robust than using in because you might need to form more complicated queries in the future.

Categories

Resources