Write new file with digit before extension - python

I have several files in a directory with the following names:
example1.txt
example2.txt
...
example10.txt
and a bunch of other files.
I'm trying to write a script that can get all the files with a file name like <name><digit>.txt and then get the one with higher digit (in this case example10.txt) and then write a new file where we add +1 to the digit, that is example11.txt
Right now I'm stuck at the part of selecting the files .txt and getting the last one.
Here is the code
import glob
from natsort import natsorted
# Collect the "<anything><digit>.txt" files in the current directory. natsorted
# is used so that numeric parts compare as numbers (example10 after example2).
files = natsorted(glob.glob('*[0-9].txt'))
# The last entry in natural sort order; raises IndexError if nothing matched.
last_file = files[-1]
print(files)
print(last_file)

You can use a regular expression to split the file name in the text and number part, increment the number and join everything else together to have your new file name:
import re
import glob
from natsort import natsorted
# Naturally sorted list of every "<anything><digit>.txt" in the current directory.
files = natsorted(glob.glob('*[0-9].txt'))
last_file = files[-1]

# Split the last name into its alphabetic stem and its trailing number.
name_match = re.match(r'([a-zA-Z]+)([0-9]+)\.txt', last_file)
base_name, digits = name_match.group(1), name_match.group(2)

# Bump the number and rebuild the file name around it.
next_number = int(digits) + 1
next_file_name = f'{base_name}{next_number}.txt'

for value in (files, last_file, next_file_name):
    print(value)
Note that the regex assumes that the base name of the file has only alpha characters, with no spaces or _, etc. The regex can be extended if needed.

You can use this script; I think it will work well for your purpose.
import os
def get_last_file():
    """Return the highest number embedded in the file names under ``./files``.

    For every directory entry, the part of the name before the first ``.`` is
    taken and all of its decimal digits are joined into one integer
    (``example10.txt`` -> ``10``). Entries whose stem contains no digits
    (``notes.txt``, ``.hidden``, ``README``) are ignored instead of raising
    ``ValueError``.

    Returns:
        The largest number found, or ``0`` when no entry contains a digit, so
        that the first file created by a caller adding 1 is numbered 1.

    Raises:
        OSError: if ``./files`` cannot be listed.
    """
    numbers = []
    for name in os.listdir('./files'):
        # partition() keeps the whole name when there is no dot, whereas
        # slicing with find('.') == -1 would silently drop the last character.
        stem = name.partition('.')[0]
        # isdecimal() only accepts characters that int() can parse.
        digits = ''.join(char for char in stem if char.isdecimal())
        if digits:
            numbers.append(int(digits))
    return max(numbers, default=0)
def add_file(file_name, extension):
    """Create ``./files/<file_name><N>.<extension>`` holding the text ``0``.

    ``N`` is one more than the number reported by ``get_last_file()``.
    """
    next_number = get_last_file() + 1
    target = './files/' + file_name + str(next_number) + '.' + extension
    with open(target, 'w') as handle:
        handle.write('0')
# Call this to create the next incrementally numbered file: the new file is
# ./files/example<N>.txt, where N is get_last_file() + 1.
add_file('example', 'txt')

Here's a simple solution.
# Known inputs: every name is "example" + number + ".txt".
files = ["example1.txt", "example2.txt", "example3.txt", "example10.txt"]

# Slice off the 7-character "example" prefix and the 4-character ".txt"
# suffix, then keep the largest remaining number.
numbers = [int(name[7:-4]) for name in files]
highestFileNumber = max(numbers)

fileToBeCreated = f"example{highestFileNumber+1}.txt"
print(fileToBeCreated)
output:
example11.txt
`.txt` and `example` are constants, so there's no sense in looking for patterns; just trim off `example` and `.txt`.

Related

str.replace and os.rename doesn't work with file names containing "/"

I want to replace a single character on a bunch of file names with another character. However, when I want to replace files with / to another character, it doesn't work. It does work with other characters I want to replace, such as -.
The python file is on the directory path, and I 'cd' to the directory path to run the program.
I expect that every / on a file name would be replaced with a _. Like this: file/1.txt to file_1.txt.
However, the files stay the same. Like this: file/1.txt to file/1.txt.
I use this code:
# Replace a character (or substring) in file names with another one.
# NOTE(review): this import is immediately shadowed by `files = []` below and
# is never used as imported.
from importlib.metadata import files
import os
# Incremented in the loop below; printed at the end.
counter = 0
# Directory whose entries are scanned.
path = r"/Users/user/test"
# Text to search for and its replacement, both read interactively.
char = input('Enter character or string you want to replace:')
repl = input('Enter character or string you want this to be replaced with:')
# Collects the new names produced by the loop.
files = []
# Loop over the directory entries; reported not to work when char is "/".
# NOTE(review): "/" is the path separator, so a name returned by os.listdir()
# presumably never contains it - confirm how the file system reports such names.
for file_name in os.listdir(path):
if char in file_name :
old_name = file_name
new_name = file_name.replace(char, repl)
counter += 1
files.append(new_name)
# NOTE(review): os.listdir() returns bare names, so this rename is resolved
# against the current working directory rather than against `path`.
os.rename(old_name, new_name)
print(counter)
print(files)
print("Done! Check your files")
As an alternative, I deleted the variables char and repl and instead used this in the for loop, but it still doesn't work:
for file_name in os.listdir(path):
if "/" in file_name :
old_name = file_name
new_name = file_name.replace("/", "_")
counter += 1
I'd use pathlib, which is much cleaner and more convenient than using the os module.
from pathlib import Path
# Folder whose files get their parent folder name prepended.
base_folder = Path(r"/Users/user/test")
# Snapshot the regular files first so that renaming does not disturb the
# directory iteration.
old_names = [f for f in base_folder.glob('*') if f.is_file()]
# New name = "<parent folder name>_<file name>", e.g. test/1.txt -> test_1.txt.
new_names = [f'{f.parts[-2]}_{f.name}' for f in old_names]
for o_name, n_name in zip(old_names, new_names):
    # with_name() only builds a new Path object; rename() is what actually
    # changes the file on disk.
    o_name.rename(o_name.with_name(n_name))
You first create a list of file paths for all the files in the folder.
Then the second list comprehension goes through a list of files and for each file grabs its parent folder name and the file name and combines them in an f-string with _ in between.
Now, finally you go through the old and new file names and assign a new name to each of the original files.
EDIT:
If you want to exclude hidden files you need to amend the first comprehension to exclude files that start with a .:
# Keep only regular *.txt files, skipping any path that has a component
# (folder or file name) starting with ".".
old_names = [f for f in base_folder.glob('*.txt')
if not any(part.startswith('.') for part in f.parts) and f.is_file()]
I managed to solve it with this.
# Replace a character of a file with another charater
from importlib.metadata import files
import os
# Incremented in the rename loop below; printed at the end.
counter = 0
# Directory to scan, the text to look for, and its replacement - all read
# interactively.
path=input("Enter path: ")
char=input('Enter character or string you want to replace:')
repl=input('Enter character or string you want this to be replaced with:')
# Collects the new file names.
# NOTE(review): this rebinds the name `files` imported from importlib.metadata.
files=[]
def listdir_nohidden(path):
    """Yield the entries of *path* whose names do not start with a dot.

    Dot-prefixed names are the hidden files this script wants to ignore.
    """
    yield from (entry for entry in os.listdir(path) if not entry.startswith('.'))
# Rename every visible file in `path` whose name contains `char`.
# Instead of "/" use ":" as the character to replace: the directory listing
# showed ":" where "/" was expected.
for file_name in listdir_nohidden(path):
    if char in file_name:
        new_name = file_name.replace(char, repl)
        # os.listdir() yields bare names, so anchor both ends of the rename to
        # `path`; otherwise this only works when run from inside that folder.
        os.rename(os.path.join(path, file_name), os.path.join(path, new_name))
        counter += 1
        files.append(new_name)
print(counter)
print(files)
print("Done! Check your files")
Files change like this: test/1.txt --> test_1.txt
The problem was that, for some reason, when I debug with print(file_name), files with "/" show ":" instead. The other problem was the hidden files, which I solved above.

remove the unwanted columns in data

I have 500 txt files in a folder
data example:
1-1,0-7,10.1023,
1-2,7-8,/,
1-3,8-9,A,
1-4,9-10,:,
1-5,10-23,1020940830716,
I would like to delete the last "," in each line. to :
1-1,0-7,10.1023
1-2,7-8,/
1-3,8-9,A
1-4,9-10,:
1-5,10-23,1020940830716
How do I do that with a for loop to delete them from all of 500 files?
Try using this code:
# Strip one trailing comma from every line of each file, rewriting it in place.
# `filenames` is the list of paths to process.
for fname in filenames:
    with open(fname, 'r') as f:
        lines = f.read().split('\n')
    # Working line by line also fixes the last line when the file does not end
    # with a newline, which a plain replace(',\n', '\n') would miss.
    cleaned = [line[:-1] if line.endswith(',') else line for line in lines]
    with open(fname, 'w') as w:
        w.write('\n'.join(cleaned))
I usually do something like this.
Change the folder_path variable
Change the filename_pattern variable. This is just extra in case you have specific file patterns in your folder that you want to consider. You can simply set this variable to (blank) if irrelevant.
Also, the * takes anything that matches the pattern i.e. Book1, Book2, etc. Before running the code print(files) to make sure you have all of the correct files. I am not sure if :
import glob
import os
import pandas as pd  # the code below refers to the library as `pd`

# Read files in: every "<filename_pattern>*.txt" file inside folder_path.
folder_path = 'Documents'
filename_pattern = 'Book'
files = glob.glob(f'{folder_path}//{filename_pattern}*.txt')
# One frame for all files; the added `filename` column remembers which file
# each row came from so the rows can be split apart again below.
df = pd.concat([
    pd.read_csv(f, header=None).assign(filename=os.path.basename(f))
    for f in files
])

# Write files out: one CSV per original file, overwriting it in folder_path.
for file, data in df.groupby('filename'):
    # Drop the last two columns: the helper `filename` column and, presumably,
    # the empty column produced by the trailing comma on each input line.
    data.iloc[:, :-2].to_csv(f'{folder_path}/{file}', index=False, header=False)

Concatenating files with matching string in middle of filename

My goal is to concatenate files in a folder based on a string in the middle of the filename, ideally using python or bash. To simplify the question, here is an example:
P16C-X128-22MB-LL_merged_trimmed.fastq
P16C-X128-27MB-LR_merged_trimmed.fastq
P16C-X1324-14DL-UL_merged_trimmed.fastq
P16C-X1324-21DL-LL_merged_trimmed.fastq
I would like to concatenate based on the value after the first dash but before the second (e.g. X128 or X1324), so that I am left with (in this example), two additional files that contain the concatenated contents of the individual files:
P16C-X128-Concat.fastq (concat of 2 files with X128)
P16C-X1324-Concat.fastq (concat of 2 files with X1324)
Any help would be appreciated.
For simple string manipulations, I prefer to avoid the use of regular expressions. I think that str.split() is enough in this case. Besides, for simple file name matching, the library fnmatch provides enough functionality.
import fnmatch
import os
from itertools import groupby
# Directory that holds the input files (placeholder value).
path = '/full/path/to/files/'
# Extension shared by the inputs and by the generated outputs.
ext = ".fastq"
# Names (not full paths) of the entries in `path` that end with `ext`.
files = fnmatch.filter(os.listdir(path), '*' + ext)
def by(fname):
    """Return the second dash-separated field of *fname*, e.g. ``X128``."""
    fields = fname.split('-')
    return fields[1]
# You said:
# I would like to concatenate based on the value after the first dash
# but before the second (e.g. X128 or X1324)
# If you want to keep both parts together, uncomment the following:
# def by(fname): return '-'.join(fname.split('-')[:2]) # Ej. P16C-X128
# groupby() only merges adjacent items, hence the sort on the same key first.
for key, members in groupby(sorted(files, key=by), key=by):
    target = os.path.join(path, str(key) + '-Concat' + ext)
    with open(target, 'w') as out:
        # Append every file of the group, in sorted order, to the output.
        for member in members:
            with open(os.path.join(path, member), 'r') as src:
                out.write(src.read())
Instead of the read, write in Python, you could also delegate the concatenation to the OS. You would normally use a bash command like this:
cat *-X128-*.fastq > X128.fastq
Using the subprocess library:
import subprocess
# groupby() only merges adjacent items, hence the sort on the same key first.
for key, members in groupby(sorted(files, key=by), key=by):
    target = os.path.join(path, str(key) + '-Concat' + ext)
    # Let `cat` do the copying: it receives the full path of every file in the
    # group and its stdout is redirected into the destination file.
    command = ['cat'] + [os.path.join(path, member) for member in members]
    with open(target, 'w') as out:
        subprocess.run(command, stdout=out)
Also, for a batch job like this one, you should consider placing the concatenated files in a separate directory, but that is easily done by changing the dst filename.
You can use open to read and write (create) files, os.listdir to get all files (and directories) in a certain directory and re to match file name as needed.
Use a dictionary to store contents by filename prefix (the file's name up to the second hyphen `-`, i.e. its first two hyphen-separated fields) and concatenate the contents together.
import os
import re
contents = {}
file_extension = "fastq"
# Get all files and directories that are in current working directory
for file_name in os.listdir('./'):
# Use '.' so it doesn't match directories
if file_name.endswith('.' + file_extension):
# Match the first 2 hyphen-separated values from file name
prefix_match = re.match("^([^-]+\-[^-]+)", file_name)
file_prefix = prefix_match.group(1)
# Read the file and concatenate contents with previous contents
contents[file_prefix] = contents.get(file_prefix, '')
with open(file_name, 'r') as the_file:
contents[file_prefix] += the_file.read() + '\n'
# Create new file for each file id and write contents to it
for file_prefix in contents:
file_contents = contents[file_prefix]
with open(file_prefix + '-Concat.' + file_extension, 'w') as the_file:
the_file.write(file_contents)

python, find and print specific cells in csv files that are in different directories

I have different csv files in different directories. so i want to find specific cells in different columns that correspond to a specific date in my input.txt file.
here is what i have until now:
import glob, os, csv, numpy
import re, csv
# Script entry point: read Input.txt, then walk the directory tree and print
# the paths of files whose name contains 'csv_file'.
if __name__ == '__main__':
Input=open('Input.txt','r');
output = []
# NOTE(review): enumerate(Input) and Input.readline() both consume lines from
# the same file object, so the loop body does not see consecutive lines.
for i, line in enumerate(Input):
if i==0:
header_Input = Input.readline().replace('\n','').split(',');
else:
date_input = Input.readline().replace('\n','').split(',');
# os.walk() returns a generator, so print(a) shows the generator object rather
# than any paths.
a=os.walk("path to the directory")
# NOTE(review): the result of this list comprehension is discarded.
[x[0] for x in os.walk("path to the directory")]
print(a)
b=next(os.walk('.'))[1] # immediate child directories.
for dirname, dirnames, filenames in os.walk('.'):
# print path to all subdirectories first.
for subdirname in dirnames:
print(os.path.join(dirname, subdirname))
# print path to all filenames.
for filename in filenames:
#print(os.path.join(dirname, filename))
# Substring that a file name must contain to be reported.
csvfile = 'csv_file'
if csvfile in filename:
print(os.path.join(dirname, filename))
Now I have the csv files, so i need to find the date_input in every file, and print the line that contains all the information. Or if possible, to get only the cells that are in the columns with header == header_input.
This is not intended to be a full answer to your question. But you may want to consider replacing
for i, line in enumerate(Input):
if i==0:
header_Input = Input.readline().replace('\n','').split(',');
else:
date_input = Input.readline().replace('\n','').split(',');
with
# First line of the file: comma-separated header names.
header_Input = Input.readline().strip().split(',')
# Second line of the file: comma-separated date fields.
date_input = Input.readline().strip().split(',')
The enumerate(Input) expression reads lines from the file, and so do calls to readline() in the loop body. This will most likely result in some unfortunate results like reading alternating lines from the file.
The strip() method removes whitespace from the start and end of the line. Alternatively you may want to know that s[:-1] strips off the last character of s.

Saving Filenames with Condition

I'm trying to save the names of files that fulfill a certain condition.
I think the easiest way to do this would make a short Python program that imports and reads the files, checks if the condition is met, and (assuming it is met) then saves the names of the files.
I have data files with just two columns and four rows, something like this:
a: 5
b: 5
c: 6
de: 7
I want to save the names of the files (or part of the name of the files, if that's a simple fix, otherwise I can just sed the file afterwards) of the data files that have the 4th number ([3:1]) greater than 8. I tried importing the files with numpy, but it said it couldn't import the letters in the first column.
Another way I was considering trying to do it was from the command line, something along the lines of cat *.dat >> something.txt, but I couldn't figure out how to do that.
The code I've tried to write up to get this to work is:
import fileinput
import glob
import numpy as np
# Goal: write to list.txt the names of the data files whose value is > 8.
# Glob the data files.
file_list = glob.glob("/path/to/*.dat")
# Create the output file that receives the matching names.
f = open('list.txt', 'w')
# Loop over the files.
for file in file_list:
# For each file in the directory, isolate the file name.
filename = file.split('/')[-1]
# Open the file and check whether the value is greater than 8.
# NOTE(review): "file" is a string literal here, not the loop variable `file`.
a = np.loadtxt("file", delimiter=' ', usecols=1)
# NOTE(review): a[3:0] is a slice from index 3 up to index 0, not the element
# at row 3, column 1.
if a[3:0] > 8:
# NOTE(review): `print >> f, ...` is Python 2 print-to-file syntax.
print >> f, filename
f.close()
When I do this, I get an error that says TypeError: 'int' object is not iterable, but I don't know what that's referring to.
I ended up using
import fileinput
import glob
import numpy as np
# Goal: write to list.txt the names of the data files whose fourth-row value
# is greater than 8.
# Glob the data files.
file_list = glob.glob("/path/to/*.dat")
# The with-block closes list.txt even if parsing one of the files raises.
with open('list.txt', 'w') as f:
    for file in file_list:
        # Isolate the file name from its directory part.
        filename = file.split('/')[-1]
        # Parse the whole file into a 2-D array.
        a = np.genfromtxt(file)
        # Row index 3, column index 1: the number on the fourth line.
        if a[3, 1] > 8:
            f.write(filename + "\n")
it is hard to tell exactly what you want but maybe something like this
from glob import glob
from re import findall
fpattern = "/path/to/*.dat"
def test(fname):
    """Return True when the fourth integer found in *fname* is greater than 8.

    The file is read as text and every run of digits counts as one number, in
    order of appearance.

    Returns:
        True if the fourth number exceeds 8, False otherwise - including when
        the file contains fewer than four numbers.
    """
    with open(fname) as f:
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        numbers = findall(r"\d+", f.read())
    try:
        return int(numbers[3]) > 8
    except IndexError:
        # Fewer than four numbers in the file: treat it as not matching.
        return False
# Keep the files that pass the check. print() called with a single argument
# behaves the same on Python 2 and Python 3, unlike the print statement.
matches = [fname for fname in glob(fpattern) if test(fname)]
print(matches)

Categories

Resources