Concatenating files with matching string in middle of filename - python

My goal is to concatenate files in a folder based on a string in the middle of the filename, ideally using python or bash. To simplify the question, here is an example:
P16C-X128-22MB-LL_merged_trimmed.fastq
P16C-X128-27MB-LR_merged_trimmed.fastq
P16C-X1324-14DL-UL_merged_trimmed.fastq
P16C-X1324-21DL-LL_merged_trimmed.fastq
I would like to concatenate based on the value after the first dash but before the second (e.g. X128 or X1324), so that I am left with (in this example), two additional files that contain the concatenated contents of the individual files:
P16C-X128-Concat.fastq (concat of 2 files with X128)
P16C-X1324-Concat.fastq (concat of 2 files with X1324)
Any help would be appreciated.

For simple string manipulations, I prefer to avoid the use of regular expressions. I think that str.split() is enough in this case. Besides, for simple file name matching, the library fnmatch provides enough functionality.
import fnmatch
import os
from itertools import groupby
path = '/full/path/to/files/'
ext = ".fastq"
files = fnmatch.filter(os.listdir(path), '*' + ext)
def by(fname): return fname.split('-')[1] # Ej. X128
# You said:
# I would like to concatenate based on the value after the first dash
# but before the second (e.g. X128 or X1324)
# If you want to keep both parts together, uncomment the following:
# def by(fname): return '-'.join(fname.split('-')[:2]) # Ej. P16C-X128
for k, g in groupby(sorted(files, key=by), key=by):
dst = str(k) + '-Concat' + ext
with open(os.path.join(path, dst), 'w') as dstf:
for fname in g:
with open(os.path.join(path, fname), 'r') as srcf:
dstf.write(srcf.read())
Instead of the read, write in Python, you could also delegate the concatenation to the OS. You would normally use a bash command like this:
cat *-X128-*.fastq > X128.fastq
Using the subprocess library:
import subprocess
for k, g in groupby(sorted(files, key=by), key=by):
dst = str(k) + '-Concat' + ext
with open(os.path.join(path, dst), 'w') as dstf:
command = ['cat'] # +++
for fname in g:
command.append(os.path.join(path, fname)) # +++
subprocess.run(command, stdout=dstf) # +++
Also, for a batch job like this one, you should consider placing the concatenated files in a separate directory, but that is easily done by changing the dst filename.

You can use open to read and write (create) files, os.listdir to get all files (and directories) in a certain directory and re to match file name as needed.
Use a dictionary to store contents by filename prefix (the file's name up until 3rd hyphen -) and concatenate the contents together.
import os
import re
contents = {}
file_extension = "fastq"
# Get all files and directories that are in current working directory
for file_name in os.listdir('./'):
# Use '.' so it doesn't match directories
if file_name.endswith('.' + file_extension):
# Match the first 2 hyphen-separated values from file name
prefix_match = re.match("^([^-]+\-[^-]+)", file_name)
file_prefix = prefix_match.group(1)
# Read the file and concatenate contents with previous contents
contents[file_prefix] = contents.get(file_prefix, '')
with open(file_name, 'r') as the_file:
contents[file_prefix] += the_file.read() + '\n'
# Create new file for each file id and write contents to it
for file_prefix in contents:
file_contents = contents[file_prefix]
with open(file_prefix + '-Concat.' + file_extension, 'w') as the_file:
the_file.write(file_contents)

Related

str.replace and os.rename doesn't work with file names containing "/"

I want to replace a single character on a bunch of file names with another character. However, when I want to replace files with / to another character, it doesn't work. It does work with other characters I want to replace, such as -.
The python file is on the directory path, and I 'cd' to the directory path to run the program.
I expect that every / on a file name would be replaced with a _. Like this: file/1.txt to file_1.txt.
However, the files stay the same. Like this: file/1.txt to file/1.txt.
I use this code:
# Replace a character of a name file with another charater
from importlib.metadata import files
import os
counter = 0
path = r"/Users/user/test"
char = input('Enter character or string you want to replace:')
repl = input('Enter character or string you want this to be replaced with:')
files = []
# Loop, doesn't work with "/" for some reason
for file_name in os.listdir(path):
if char in file_name :
old_name = file_name
new_name = file_name.replace(char, repl)
counter += 1
files.append(new_name)
os.rename(old_name, new_name)
print(counter)
print(files)
print("Done! Check your files")
As an alternative, I deleted the variables char and repl and instead used this in the for loop, but it still doesn't work:
for file_name in os.listdir(path):
if "/" in file_name :
old_name = file_name
new_name = file_name.replace("/", "_")
counter += 1
I'd use pathlib which is much more clean and convenient that using os module.
from pathlib import Path
base_folder = Path(r"/Users/user/test")
old_names = [f for f in base_folder.glob('*') if f.is_file()]
new_names = [f'{f.parts[-2]}_{f.name}' for f in old_names]
for o_name, n_name in zip(old_names, new_names):
o_name.with_name(n_name)
You first create a list of file paths for all the files in the folder.
Then the second list comprehension goes through a list of files and for each file grabs its parent folder name and the file name and combines them in an f-string with _ in between.
Now, finally you go through the old and new file names and assign a new name to each of the original files.
EDIT:
If you want to exclude hidden files you need to amend the first comprehension to exclude files that start with a .:
old_names = [f for f in base_folder.glob('*.txt')
if not any(part.startswith('.') for part in f.parts) and f.is_file()]
I managed to solve it with this.
# Replace a character of a file with another charater
from importlib.metadata import files
import os
counter = 0
path=input("Enter path: ")
char=input('Enter character or string you want to replace:')
repl=input('Enter character or string you want this to be replaced with:')
files=[]
# Ignore hidden files
def listdir_nohidden(path):
for f in os.listdir(path):
if not f.startswith('.'):
yield f
# Loop, instead of "/" use ":"
for file_name in listdir_nohidden(path):
if char in file_name :
old_name=file_name
new_name=file_name.replace(char,repl)
counter+=1
files.append(new_name)
os.rename(old_name,new_name)
print(counter)
print(files)
print("Done! Check your files")
Files change like this: test/1.txt --> test_1.txt
The problem was that for some reason, when I debug with print(file_name), files with "/" show ":" instead. The other problem, were the hidden files, which I solved above.

Write new file with digit before extension

I have a several files in a directory with the the following names
example1.txt
example2.txt
...
example10.txt
and a bunch of other files.
I'm trying to write a script that can get all the files with a file name like <name><digit>.txt and then get the one with higher digit (in this case example10.txt) and then write a new file where we add +1 to the digit, that is example11.txt
Right now I'm stuck at the part of selecting the files .txt and getting the last one.
Here is the code
import glob
from natsort import natsorted
files = natsorted(glob.glob('*[0-9].txt'))
last_file = files[-1]
print(files)
print(last_file)
You can use a regular expression to split the file name in the text and number part, increment the number and join everything else together to have your new file name:
import re
import glob
from natsort import natsorted
files = natsorted(glob.glob('*[0-9].txt'))
last_file = files[-1]
base_name, digits = re.match(r'([a-zA-Z]+)([0-9]+)\.txt', last_file).groups()
next_number = int(digits) + 1
next_file_name = f'{base_name}{next_number}.txt'
print(files)
print(last_file)
print(next_file_name)
Note that the regex assumes that the base name of the file has only alpha characters, with no spaces or _, etc. The regex can be extended if needed.
you can use this script, it will work well for your purpose i think.
import os
def get_last_file():
files = os.listdir('./files')
for index, file in enumerate(files):
filename = str(file)[0:str(file).find('.')]
digit = int(''.join([char for char in filename if char.isdigit()]))
files[index] = digit
files.sort()
return files[-1]
def add_file(file_name, extension):
last_digit = get_last_file() +1
with open('./files/' + file_name + str(last_digit) + '.' + extension, 'w') as f:
f.write('0')
# call this to create a new incremental file.
add_file('example', 'txt')
Here's a simple solution.
files = ["example1.txt", "example2.txt", "example3.txt", "example10.txt"]
highestFileNumber = max(int(file[7:-4]) for file in files)
fileToBeCreated = f"example{highestFileNumber+1}.txt"
print(fileToBeCreated)
output:
example11.txt
.txt and example are constants, so theres no sense in looking for patterns. just trim the example and .txt

import filenames iteratively from a different file

I have a large number of entries in a file. Let me call it file A.
File A:
('aaa.dat', 'aaa.dat', 'aaa.dat')
('aaa.dat', 'aaa.dat', 'bbb.dat')
('aaa.dat', 'aaa.dat', 'ccc.dat')
I want to use these entries, line by line, in a program that would iteratively pick an entry from file A, concatenate the files in this way:
filenames = ['aaa.dat', 'aaa.dat', 'ccc.dat'] ###entry number 3
with open('out.dat', 'w') as outfile: ###the name has to be aaa-aaa-ccc.dat
for fname in filenames:
with open(fname) as infile:
outfile.write(infile.read().strip())
All I need to do is to substitute the filenames iteratively and create an output in a "aaa-aaa-aaa.dat" format. I would appreciate any help-- feeling a bit lost!
Many thanks!!!
You can retrieve and modify the file names in the following way:
import re
pattern = re.compile('\W')
with open('fnames.txt', 'r') as infile:
for line in infile:
line = (re.sub(pattern, ' ', line)).split()
# Old filenames - to concatenate contents
content = [x + '.dat' for x in line[::2]];
# New filename
new_name = ('-').join(line[::2]) + '.dat'
# Write the concatenated content to the new
# file (first read the content all at once)
with open(new_name, 'w') as outfile:
for con in content:
with open(con, 'r') as old:
new_content = old.read()
outfile.write(new_content)
This program reads your input file, here named fnames.txt with the exact structure from your post, line by line. For each line it splits the entries using a precompiled regex (precompiling regex is suitable here and should make things faster). This assumes that your filenames are only alphanumeric characters, since the regex substitutes all non-alphanumeric characters with a space.
It retrieves only 'aaa' and dat entries as a list of strings for each line and forms a new name by joining every second entry starting from 0 and adding a .dat extension to it. It joins using a - as in the post.
It then retrieves the individual file names from which it will extract the content into a list content by selecting every second entry from line.
Finally, it reads each of the files in content and writes them to the common file new_name. It reads each of them all at ones which may be a problem if these files are big and in general there may be more efficient ways of doing all this. Also, if you are planning to do more things with the content from old files before writing, consider moving the old file-specific operations to a separate function for readability and any potential debugging.
Something like this:
with open(fname) as infile, open('out.dat', 'w') as outfile:
for line in infile:
line = line.strip()
if line: # not empty
filenames = eval(line.strip()) # read tuple
filenames = [f[:-4] for f in filenames] # remove extension
filename = '-'.join(filenames) + '.dat' # make filename
outfile.write(filename + '\n') # write
If your problem is just calculating the new filenames, how about using os.path.splitext?
'-'.join([
f[0] for f in [os.path.splitext(path) for path in filenames]
]) + '.dat'
Which can be probably better understood if you see it like this:
import os
clean_fnames = []
filenames = ['aaa.dat', 'aaa.dat', 'ccc.dat']
for fname in filenames:
name, extension = os.path.splitext(fname)
clean_fnames.append(name)
name_without_ext = '-'.join(clean_fnames)
name_with_ext = name_without_ext + '.dat'
print(name_with_ext)
HOWEVER: If your issue is that you can not get the filenames in a list by reading the file line by line, you must keep in mind that when you read files, you get text (strings) NOT Python structures. You need to rebuild a list from a text like: "('aaa.dat', 'aaa.dat', 'aaa.dat')\n".
You could take a look to ast.literal_eval or try to rebuild it yourself. The code below outputs a lot of messages to show what's happening:
import pprint
collected_fnames = []
with open('./fileA.txt') as f:
for line in f:
print("Read this (literal) line: %s" % repr(line))
line_without_whitespaces_on_the_sides = line.strip()
if not line_without_whitespaces_on_the_sides:
print("line is empty... skipping")
continue
else:
line_without_parenthesis = (
line_without_whitespaces_on_the_sides
.lstrip('(')
.rstrip(')')
)
print("Cleaned parenthesis: %s" % line_without_parenthesis)
chunks = line_without_parenthesis.split(', ')
print("Collected %s chunks in a %s: %s" % (len(chunks), type(chunks), chunks))
chunks_without_quotations = [chunk.replace("'", "") for chunk in chunks]
print("Now we don't have quotations: %s" % chunks_without_quotations)
collected_fnames.append(chunks_without_quotations)
print("collected %s lines with filenames:\n%s" %
(len(collected_fnames), pprint.pformat(collected_fnames)))

Iterate over 2 files in each folder and compare them

I compare two text files and print out the results to a 3rd file. I am trying to make it so the script i'm running would iterate over all of the folders that have two text files in them, in the CWD of the script.
What i have so far:
import os
import glob
path = './'
for infile in glob.glob( os.path.join(path, '*.*') ):
print('current file is: ' + infile)
with open (f1+'.txt', 'r') as fin1, open(f2+'.txt', 'r') as fin2:
Would this be a good way to start the iteration process?
It's not the most clear code but it gets the job done. However, i'm pretty sure i need to take the logic out of the read / write methods but i'm not sure where to start.
What i'm basically trying to do is have a script iterate over all of the folders in its CWD, open each folder, compare the two text files inside, write a 3rd text file to the same folder, then move on to the next.
Another method i have tried is as follows:
import os
rootDir = 'C:\\Python27\\test'
for dirName, subdirList, fileList in os.walk(rootDir):
print('Found directory: %s' % dirName)
for fname in fileList:
print('\t%s' % fname)
And this outputs the following (to give you a better example of the file structure:
Found directory: C:\Python27\test
test.py
Found directory: C:\Python27\test\asdd
asd1.txt
asd2.txt
Found directory: C:\Python27\test\chro
ch1.txt
ch2.txt
Found directory: C:\Python27\test\hway
hw1.txt
hw2.txt
Would it be wise to put the compare logic under the for fname in fileList? How do i make sure it compares the two text files inside the specific folder and not with other fnames in the fileList?
This is the full code that i am trying to add this functionality into. I appologize for the Frankenstein nature of it but i am still working on a refined version but it does not work yet.
from collections import defaultdict
from operator import itemgetter
from itertools import groupby
from collections import deque
import os
class avs_auto:
def load_and_compare(self, input_file1, input_file2, output_file1, output_file2, result_file):
self.load(input_file1, input_file2, output_file1, output_file2)
self.compare(output_file1, output_file2)
self.final(result_file)
def load(self, fileIn1, fileIn2, fileOut1, fileOut2):
with open(fileIn1+'.txt') as fin1, open(fileIn2+'.txt') as fin2:
frame_rects = defaultdict(list)
for row in (map(str, line.split()) for line in fin1):
id, frame, rect = row[0], row[2], [row[3],row[4],row[5],row[6]]
frame_rects[frame].append(id)
frame_rects[frame].append(rect)
frame_rects2 = defaultdict(list)
for row in (map(str, line.split()) for line in fin2):
id, frame, rect = row[0], row[2], [row[3],row[4],row[5],row[6]]
frame_rects2[frame].append(id)
frame_rects2[frame].append(rect)
with open(fileOut1+'.txt', 'w') as fout1, open(fileOut2+'.txt', 'w') as fout2:
for frame, rects in sorted(frame_rects.iteritems()):
fout1.write('{{{}:{}}}\n'.format(frame, rects))
for frame, rects in sorted(frame_rects2.iteritems()):
fout2.write('{{{}:{}}}\n'.format(frame, rects))
def compare(self, fileOut1, fileOut2):
with open(fileOut1+'.txt', 'r') as fin1:
with open(fileOut2+'.txt', 'r') as fin2:
lines1 = fin1.readlines()
lines2 = fin2.readlines()
diff_lines = [l.strip() for l in lines1 if l not in lines2]
diffs = defaultdict(list)
with open(fileOut1+'x'+fileOut2+'.txt', 'w') as result_file:
for line in diff_lines:
d = eval(line)
for k in d:
list_ids = d[k]
for i in range(0, len(d[k]), 2):
diffs[d[k][i]].append(k)
for id_ in diffs:
diffs[id_].sort()
for k, g in groupby(enumerate(diffs[id_]), lambda (i, x): i - x):
group = map(itemgetter(1), g)
result_file.write('{0} {1} {2}\n'.format(id_, group[0], group[-1]))
def final(self, result_file):
with open(result_file+'.txt', 'r') as fin:
lines = (line.split() for line in fin)
for k, g in groupby(lines, itemgetter(0)):
fst = next(g)
lst = next(iter(deque(g, 1)), fst)
with open('final/{}.avs'.format(k), 'w') as fout:
fout.write('video0=ImageSource("old\%06d.jpeg", {}-3, {}+3, 15)\n'.format(fst[1], lst[2]))
fout.write('video1=ImageSource("new\%06d.jpeg", {}-3, {}+3, 15)\n'.format(fst[1], lst[2]))
fout.write('video0=BilinearResize(video0,640,480)\n')
fout.write('video1=BilinearResize(video1,640,480)\n')
fout.write('StackHorizontal(video0,video1)\n')
fout.write('Subtitle("ID: {}", font="arial", size=30, align=8)'.format(k))
using the load_and_compare() function, i define two input text files, two output text files, a file for the comparison results and a final phase that writes many files for all of the differences.
What i am trying to do is have this whole class run on the current working directory and go through every sub folder, compare the two text files, and write everything into the same folder, specifically the final() results.
You can indeed use os.walk(), since that already separates the directories from the files. You only need the directories it returns, because that's where you're looking for your 2 specific files.
You could also use os.listdir() but that returns directories as well files in the same list, so you would have to check for directories yourself.
Either way, once you have the directories, you iterate over them (for subdir in dirnames) and join the various path components you have: The dirpath, the subdir name that you got from iterating over the list and your filename.
Assuming there are also some directories that don't have the specific 2 files, it's a good idea to wrap the open() calls in a try..except block and thus ignore the directories where one of the files (or both of them) doesn't exist.
Finally, if you used os.walk(), you can easily choose if you only want to go into directories one level deep or walk the whole depth of the tree. In the former case, you just clear the dirnames list by dirnames[:] = []. Note that dirnames = [] wouldn't work, since that would just create a new empty list and put that reference into the variable instead of clearing the old list.
Replace the print("do something ...") with your program logic.
#!/usr/bin/env python
import errno
import os
f1 = "test1"
f2 = "test2"
path = "."
for dirpath, dirnames, _ in os.walk(path):
for subdir in dirnames:
filepath1, filepath2 = [os.path.join(dirpath, subdir, f + ".txt") for f in f1, f2]
try:
with open(filepath1, 'r') as fin1, open(filepath2, 'r') as fin2:
print("do something with " + str(fin1) + " and " + str(fin2))
except IOError as e:
# ignore directiories that don't contain the 2 files
if e.errno != errno.ENOENT:
# reraise exception if different from "file or directory doesn't exist"
raise
# comment the next line out if you want to traverse all subsubdirectories
dirnames[:] = []
Edit:
Based on your comments, I hope I understand your question better now.
Try the following code snippet instead. The overall structure stays the same, only now I'm using the returned filenames of os.walk(). Unfortunately, that would also make it harder to do something like "go only into the subdirectories 1 level deep", so I hope walking the tree recursively is fine with you. If not, I'll have to add a little code to later.
#!/usr/bin/env python
import fnmatch
import os
filter_pattern = "*.txt"
path = "."
for dirpath, dirnames, filenames in os.walk(path):
# comment this out if you don't want to filter
filenames = [fn for fn in filenames if fnmatch.fnmatch(fn, filter_pattern)]
if len(filenames) == 2:
# comment this out if you don't want the 2 filenames to be sorted
filenames.sort(key=str.lower)
filepath1, filepath2 = [os.path.join(dirpath, fn) for fn in filenames]
with open(filepath1, 'r') as fin1, open(filepath2, 'r') as fin2:
print("do something with " + str(fin1) + " and " + str(fin2))
I'm still not really sure what your program logic does, so you will have to interface the two yourself.
However, I noticed that you're adding the ".txt" extension to the file name explicitly all over your code, so depending on how you are going to use the snippet, you might or might not need to remove the ".txt" extension first before handing the filenames over. That would be achieved by inserting the following line after or before the sort:
filenames = [os.path.splitext(fn)[0] for fn in filenames]
Also, I still don't understand why you're using eval(). Do the text files contain python code? In any case, eval() should be avoided and be replaced by code that's more specific to the task at hand.
If it's a list of comma separated strings, use line.split(",") instead.
If there might be whitespace before or after the comma, use [word.strip() for word in line.split(",")] instead.
If it's a list of comma separated integers, use [int(num) for num in line.split(",")] instead - for floats it works analogously.
etc.

Iterate within directory to zip files with python

I need to iterate through a folder and find every instance where the filenames are identical (except for extension) and then zip (preferably using tarfile) each of these into one file.
So I have 5 files named: "example1" each with different file extensions. I need to zip them up together and output them as "example1.tar" or something similar.
This would be easy enough with a simple for loop such as:
tar = tarfile.open('example1.tar',"w")
for output in glob ('example1*'):
tar.add(output)
tar.close()
however, there are 300 "example" files and I need to iterate through each one and their associated 5 files in order to make this work. This is way over my head. Any advice greatly appreciated.
The pattern you're describing generalizes to MapReduce. I found a simple implementation of MapReduce online, from which an even-simpler version is:
def map_reduce(data, mapper, reducer):
d = {}
for elem in data:
key, value = mapper(elem)
d.setdefault(key, []).append(value)
for key, grp in d.items():
d[key] = reducer(key, grp)
return d
You want to group all files by their name without the extension, which you can get from os.path.splitext(fname)[0]. Then, you want to make a tarball out of each group by using the tarfile module. In code, that is:
import os
import tarfile
def make_tar(basename, files):
tar = tarfile.open(basename + '.tar', 'w')
for f in files:
tar.add(f)
tar.close()
map_reduce(os.listdir('.'),
lambda x: (os.path.splitext(x)[0], x),
make_tar)
Edit: If you want to group files in different ways, you just need to modify the second argument to map_reduce. The code above groups files that have the same value for the expression os.path.splitext(x)[0]. So to group by the base file name with all the extensions stripped off, you could replace that expression with strip_all_ext(x) and add:
def strip_all_ext(path):
head, tail = os.path.split(path)
basename = tail.split(os.extsep)[0]
return os.path.join(head, basename)
You could do this:
list all files in the directory
create a dictionary where the basename is the key and all the extensions are values
then tar all the files by dictionary key
Something like this:
import os
import tarfile
from collections import defaultdict
myfiles = os.listdir(".") # List of all files
totar = defaultdict(list)
# now fill the defaultdict with entries; basename as keys, extensions as values
for name in myfiles:
base, ext = os.path.splitext(name)
totar[base].append(ext)
# iterate through all the basenames
for base in totar:
files = [base+ext for ext in totar[base]]
# now tar all the files in the list "files"
tar = tarfile.open(base+".tar", "w")
for item in files:
tar.add(item)
tar.close()
You have to problems. Solve the separately.
Finding matching names. Use a collections.defaultict
Creating tar files after you find the matching names. You've got that pretty well covered.
So. Solve problem 1 first.
Use glob to get all the names. Use os.path.basename to split the path and basename. Use os.path.splitext to split the name and extension.
A dictionary of lists can be used to save all files that have the same name.
Is that what you're doing in part 1?
Part 2 is putting the files into tar archives. For that, you've got most of the code you need.
Try using the glob module: http://docs.python.org/library/glob.html
#! /usr/bin/env python
import os
import tarfile
tarfiles = {}
for f in os.listdir ('files'):
prefix = f [:f.rfind ('.') ]
if prefix in tarfiles: tarfiles [prefix] += [f]
else: tarfiles [prefix] = [f]
for k, v in tarfiles.items ():
tf = tarfile.open ('%s.tar.gz' % k, 'w:gz')
for f in v: tf.addfile (tarfile.TarInfo (f), file ('files/%s' % f) )
tf.close ()
import os
import tarfile
allfiles = {}
for filename in os.listdir("."):
basename = '.'.join (filename.split(".")[:-1] )
if not basename in all_files:
allfiles[basename] = [filename]
else:
allfiles[basename].append(filename)
for basename, filenames in allfiles.items():
if len(filenames) < 2:
continue
tardata = tarfile.open(basename+".tar", "w")
for filename in filenames:
tardata.add(filename)
tardata.close()

Categories

Resources