combine multiple text files into one text file? - python

I'm trying to combine multiple files into one, where each input file contains a single column, and I need to get one file with two columns so I can plot the result as (x, y). For example:
x    y    result
1    4    1 4
2    5    2 5
3    6    3 6
and the code should run for n text files.
How can I do this?

A simple solution, assuming that all files have the same number of elements and contain floats:
import numpy as np

filename_list = ['file0.txt', 'file1.txt']  # and so on

columns = []
for filename in filename_list:
    # read one column of floats per file
    with open(filename) as f:
        x = np.array([float(line) for line in f.readlines()])
    columns.append(x)

# stack the columns side by side and write them out
columns = np.vstack(columns).T
np.savetxt('filename_out.txt', columns)
See also the documentation of savetxt to customize the output (for example the fmt, delimiter, and header arguments).
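For instance, a minimal sketch of a customized call (the format string and header text here are just illustrative):
# write 4-decimal values, comma-separated, with a header line and no '#' prefix
np.savetxt('filename_out.csv', columns, fmt='%.4f', delimiter=',', header='x,y', comments='')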
EDIT:
If you have 100 files in a certain directory (let's call it files_dir), you can build the list with the listdir method in the os library, but be careful, since listdir returns both directories and files, and only bare names rather than full paths:
import os

# keep only regular files and prepend the directory so the paths can be opened later
filename_list = [os.path.join(files_dir, f) for f in os.listdir(files_dir)
                 if os.path.isfile(os.path.join(files_dir, f))]
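Alternatively, if the inputs share an extension (an assumption here: .txt), glob does the filtering and the path joining in one step:
import glob
import os

# full paths of all .txt files in files_dir, in a stable order
filename_list = sorted(glob.glob(os.path.join(files_dir, '*.txt')))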

Here's a quick-and-dirty solution. I assumed that all files have exactly the same number of rows. The function write_files takes an argument files, which is a list of file paths (strings).
def write_files(files):
    # open every input file for reading
    opened_files = []
    for f in files:
        opened_files.append(open(f, "r"))
    output_file = open("output.txt", "w")
    # count the lines of the first file, then rewind it
    num_lines = sum(1 for line in opened_files[0])
    opened_files[0].seek(0, 0)
    for i in range(num_lines):
        # take the next line from every file and join them into one output row
        line = [of.readline().rstrip() for of in opened_files]
        line = " ".join(line)
        line += "\n"
        output_file.write(line)
    for of in opened_files:
        of.close()
    output_file.close()

write_files(["1.txt", "2.txt"])
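The original question also asks about plotting the combined (x, y) data; a minimal matplotlib sketch, assuming the two-column file produced by either answer above:
import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt('filename_out.txt')  # or 'output.txt' from the second answer
plt.plot(data[:, 0], data[:, 1], 'o-')
plt.xlabel('x')
plt.ylabel('y')
plt.show()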

Related

concatenating files in python

I have files in a directory and I want to concatenate these files vertically to make a single file.
input
file1.txt file2.txt
1 8
2 8
3 9
I need the output:
1
2
3
8
8
9
My script is:
import glob
import numpy as np

for files in glob.glob("*.txt"):
    print(files)
    np.concatenate([files])
but it does not concatenate vertically; instead it just produces the last file of the for loop. Can anybody help? Thanks.
There are a few things wrong with your code. NumPy appears a bit overkill for such a mundane task, in my opinion. You can use a much simpler approach, for instance:
import glob

result = ""
for file_name in glob.glob("*.txt"):
    with open(file_name, "r") as f:
        for line in f.readlines():
            result += line
print(result)
In order to save the result in a .txt-file, you could do something like:
with open("result.txt", "w") as f:
f.write(result)
This should work.
import glob

for file_name in glob.glob('*.txt'):
    with open(file_name, "r") as fileopen:
        file_contents = fileopen.read()
    # append each file's contents to the combined output
    with open("output.txt", "a") as output:
        output.write(file_contents)
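If you would rather stay with NumPy, as in the original script, note that np.concatenate needs arrays rather than file names; a minimal sketch, assuming each file holds a single numeric column:
import glob
import numpy as np

# load each file into an array, then stack the arrays end to end
arrays = [np.loadtxt(file_name) for file_name in glob.glob("*.txt")]
combined = np.concatenate(arrays)
np.savetxt("combined.txt", combined, fmt="%g")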

Wrong output when trying to create a manifest-generating script (python) for Rail-RNA. Files do not match

What I want: To create a "manifest" for running rail-RNA (http://rail.bio/) on a number of fastq files that I have in a directory, in the following form:
FASTQ URL 1 (tab) optional MD5 1 (tab) FASTQ URL 2 (tab) optional MD5 2 (tab) sample label
like:
/home/data/10080-17_r1.fastq 0 /home/data/10080-17_r2.fastq 0 10080-17_r1
/home/data/10300-25_r1.fastq 0 /home/data/10300-25_r2.fastq 0 10300-25_r1
/home/data/40500-72_r1.fastq 0 /home/data/40500-72_r2.fastq 0 10300-25_r2
.. and so on
What I have done: created a python script to generate a manifest from fastq files in a specific directory:
#!/usr/bin/python
import os
import csv

thisdir = os.getcwd()

# Create empty lists
forward = []
reverse = []
zero = []
names = []

# Extract file-paths for files ending with .fastq and append them to "forward" and "reverse"
for r, d, f in os.walk(thisdir):  # r=root, d=directories, f=files
    for file in f:
        if "_1.fastq" in file:
            forward.append(os.path.join(r, file))
        if "_2.fastq" in file:
            reverse.append(os.path.join(r, file))

# make a list containing 0 with the length of the forward list
for i in range(len(forward)):
    zero.append('0')

# extract filenames without extensions:
l = os.listdir(thisdir)
li = [x.split('.')[0] for x in l]
for name in li:
    if "_1" in name:
        names.append(name)
names = [s.strip('_1') for s in names]

# write the output to a file
with open('manifest.txt', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for path in zip(forward, zero, reverse, zero, names):
        writer.writerow(list(path))
What is wrong?: I get a manifest.txt in the right format, BUT it does not match the right *_r1.fastq and *_r2.fastq files. It does something like this (the r1's in the first column do not match the r2's in the third column):
/home/data/10080-17_r1.fastq 0 /home/data/40500-72_r2.fastq 0 10080-17_r1
/home/data/10300-25_r1.fastq 0 /home/data/10080-17_r2.fastq 0 10300-25_r1
/home/data/40500-72_r1.fastq 0 /home/data/10300-25_r2.fastq 0 10300-25_r2
Do some of you, more experienced Python'ers have a solution to fix this?
That would be greatly appreciated!
Best wishes, Birgitte
In the provided solution this error occurs if the number of *_r1.fastq files doesn't correspond to the number of *_r2.fastq files, because that code builds the new CSV rows only by array index and never compares file names.
I updated that solution. Check the file names; they should look like:
/home/data/10080-17_r1.fastq
/home/data/10080-17_r2.fastq
Now we collect all forward files (*_r1.fastq) and try to find the matching reverse file (*_r2.fastq) in the same directory. If we don't find it, we put '-' instead of the reverse file's name.
Please check the code and read the comments:
#!/usr/bin/python
import os
import csv

this_dir = os.getcwd()

forward_arr = []
reverse_arr = []
for r, d, f in os.walk(this_dir):  # r=root, d=directories, f=files
    for file in f:
        if "_r1.fastq" in file:
            forward_arr.append(os.path.join(r, file))
        if "_r2.fastq" in file:
            reverse_arr.append(os.path.join(r, file))

# collect result rows in this array
csv_arr = []
# for each file in forward_arr
for forward_file in forward_arr:
    # get sample label from full file path
    # 1. split by '/' and select the last element:
    #    /home/data/10080-17_r1.fastq -> 10080-17_r1.fastq
    # 2. split by '_r' and select the first element: 10080-17_r1.fastq -> 10080-17
    sample_label = forward_file.split('/')[-1].split('_r')[0]
    # we will search for the reverse file that matches this forward file in reverse_arr,
    # but if we don't find it, we'll put '-'
    # instead of the path to the reverse file
    reverse_file_result = "-"
    # we are looking for a file with the same name and in the same location,
    # but it should be a reverse file with '_r2' instead of '_r1' in its name
    reverse_file_for_search = forward_file.replace("_r1", "_r2")
    # search for that reverse file in reverse_arr
    for reverse_file in reverse_arr:
        # if we found that file
        if reverse_file_for_search == reverse_file:
            # assign the reverse file name
            # to reverse_file_result instead of '-'
            reverse_file_result = reverse_file
            # go to the next forward_file
            break
    # at this point we could compute the md5 for the FORWARD file
    md5_url_1 = 0
    # at this point we could compute the md5 for the REVERSE file
    md5_url_2 = 0
    # append the result row to csv_arr
    csv_arr.append((forward_file, md5_url_1, reverse_file_result,
                    md5_url_2, sample_label))

# write all collected rows to the csv file in one go
with open('manifest.txt', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(csv_arr)
I think this should work for what you need. It's difficult to make out, because this:
names = [s.strip('_1') for s in names]
doesn't look like it is doing what you expect (I suspect it's supposed to be "_r1", as in the first loop, which is what I've used in the modification below).
import os
import csv

thisdir = os.getcwd()

# Create empty lists
forward = []
reverse = []
names = []

for r, d, f in os.walk(thisdir):  # r=root, d=directories, f=files
    for file in f:
        if file.endswith("_r1.fastq"):
            forward.append(os.path.join(r, file))
            # cut the suffix off explicitly to get the sample name
            names.append(file[:-len("_r1.fastq")])
        elif file.endswith("_r2.fastq"):
            reverse.append(os.path.join(r, file))

# write the output to a file
with open('manifest.txt', 'w') as out:
    writer = csv.writer(out, delimiter='\t')
    for fwd, rev, nam in zip(forward, reverse, names):
        path = [fwd, 0, rev, 0, nam]
        writer.writerow(path)
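One note on str.strip, since it comes up in both the question and the answer above: strip takes a set of characters, not a suffix, so it can remove more than intended. A small illustration (the file name is just an example):
name = "10080-17_r1.fastq"
# strip removes any leading/trailing characters from the set {_, r, 1, ., f, a, s, t, q},
# so the leading '1' disappears as well:
print(name.strip("_r1.fastq"))   # 0080-17
# slicing off the known suffix keeps the name intact:
print(name[:-len("_r1.fastq")])  # 10080-17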

Multiple editing of CSV files

I have a small problem operating on CSV files in Python (3.5). Previously I was working with single files and there was no problem, but right now I have >100 files in one folder.
So, my goal is:
To parse all *.csv files in the directory
From each file, delete the first 6 rows. The files consist of the following data:
"nu(Ep), 2.6.8"
"Date: 2/10/16, 11:18:21 AM"
19
Ep,nu
0.0952645,0.123776,
0.119036,0.157720,
...
0.992060,0.374300,
Save each file separately (for example by adding "_edited" to the name), so that only the numbers remain.
As an option: I have the data subdivided into two parts for one material. For example Ag(0-1_s).csv and Ag(1-4)_s.csv (after steps 1-3 they should look like Ag(*)_edited.csv). How can I merge these two files by appending the data from (1-4) to the end of (0-1) and saving it in a third file?
My code so far is the following:
import os, sys
import csv
import re
import glob
import fileinput

def get_all_files(directory, extension='.csv'):
    dir_list = os.listdir(directory)
    csv_files = []
    for i in dir_list:
        if i.endswith(extension):
            csv_files.append(os.path.realpath(i))
    return csv_files

csv_files = get_all_files('/Directory/Path/Here')

# Here is the problem with csv's, I don't know how to scan files
# which are in the list "csv_files".
for n in csv_files:
    #print(n)
    lines = []  # empty, because I don't know how to write it properly per each file
    input = open(n, 'r')
    reader = csv.reader(n)
    temp = []
    for i in range(5):
        next(reader)
    # a for loop for here regarding rows?
    # for row in n: ???
    # ???
    input.close()
    #newfilename = "".join(n.split(".csv")) + "edited.csv"
    #newfilename can be used within open() below:
    with open(n + '_edited.csv', 'w') as nf:
        writer = csv.writer(nf)
        writer.writerows(lines)
This is the fastest way I can think of. If you have a solid-state drive, you could throw multiprocessing at this for more of a performance boost:
import glob
import os

for fpath in glob.glob('path/to/directory/*.csv'):
    # file name without directory or extension
    fname = os.path.basename(fpath).rsplit(os.path.extsep, 1)[0]
    outpath = os.path.join('path/to/dir', fname + "_edited" + os.path.extsep + 'csv')
    with open(fpath) as infile, open(outpath, 'w') as outfile:
        # skip the first 6 header rows
        for _ in range(6):
            infile.readline()
        # copy the remaining data rows verbatim
        for line in infile:
            outfile.write(line)
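The last point in the question, merging the two parts for one material into a third file, isn't covered above; a minimal sketch, assuming the edited parts end up named like Ag(0-1)_s_edited.csv and Ag(1-4)_s_edited.csv (hypothetical names, adjust them to your actual pattern):
# hypothetical file names for the two edited parts of one material
part1 = 'Ag(0-1)_s_edited.csv'
part2 = 'Ag(1-4)_s_edited.csv'

# append part2 after part1 and save the result as a third file
with open('Ag_merged.csv', 'w') as out:
    for part in (part1, part2):
        with open(part) as f:
            out.write(f.read())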

Saving Filenames with Condition

I'm trying to save the names of files that fulfill a certain condition.
I think the easiest way to do this would be to make a short Python program that imports and reads the files, checks whether the condition is met, and (assuming it is met) then saves the names of the files.
I have data files with just two columns and four rows, something like this:
a: 5
b: 5
c: 6
de: 7
I want to save the names of the files (or part of the name of the files, if that's a simple fix, otherwise I can just sed the file afterwards) of the data files that have the 4th number ([3:1]) greater than 8. I tried importing the files with numpy, but it said it couldn't import the letters in the first column.
Another way I was considering trying to do it was from the command line, something along the lines of cat *.dat >> something.txt, but I couldn't figure out how to do that.
The code I've tried to write up to get this to work is:
import fileinput
import glob
import numpy as np

#Filter to find value > 8
#Globbing value datafiles
file_list = glob.glob("/path/to/*.dat")
#Creating output file containing
f = open('list.txt', 'w')
#Looping over files
for file in file_list:
    #For each file in the directory, isolating the filename
    filename = file.split('/')[-1]
    #Opening the files, checking if value is greater than 8
    a = np.loadtxt("file", delimiter=' ', usecols=1)
    if a[3:0] > 8:
        print >> f, filename
f.close()
When I do this, I get an error that says TypeError: 'int' object is not iterable, but I don't know what that's referring to.
I ended up using:
import fileinput
import glob
import numpy as np

#Filter to find value > 8
#Globbing datafiles
file_list = glob.glob("/path/to/*.dat")
#Creating output file containing
f = open('list.txt', 'w')
#Looping over files
for file in file_list:
    #For each file in the directory, isolating the filename
    filename = file.split('/')[-1]
    #Opening the files, checking if value is greater than 8
    a = np.genfromtxt(file)
    if a[3, 1] > 8:
        f.write(filename + "\n")
f.close()
It is hard to tell exactly what you want, but maybe something like this:
from glob import glob
from re import findall

fpattern = "/path/to/*.dat"

def test(fname):
    # True if the 4th number in the file is greater than 8
    with open(fname) as f:
        try:
            return int(findall(r"\d+", f.read())[3]) > 8
        except IndexError:
            pass

matches = [fname for fname in glob(fpattern) if test(fname)]
print(matches)
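A quick sanity check of that test against the sample values from the question (the string here just stands in for a file's contents):
from re import findall

sample = "a: 5\nb: 5\nc: 6\nde: 7"
print(findall(r"\d+", sample))          # ['5', '5', '6', '7']
print(int(findall(r"\d+", sample)[3]))  # 7, and 7 > 8 is False, so this file would not be listed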

Copy columns from multiple text files in Python

I have a large number of text files containing data arranged into a fixed number of rows and columns, the columns being separated by spaces (like a .csv but using spaces as the delimiter). I want to extract a given column from each of these files and write it into a new text file.
So far I have tried:
results_combined = open('ResultsCombined.txt', 'wb')

def combine_results():
    for num in range(2, 10):
        f = open("result_0." + str(num) + "_.txt", 'rb')  # all the text files have similar filename styles
        lines = f.readlines()   # read in the data
        no_lines = len(lines)   # get the number of lines
        for i in range(0, no_lines):
            column = lines[i].strip().split(" ")
            results_combined.write(column[5] + " " + '\r\n')
        f.close()

if __name__ == "__main__":
    combine_results()
This produces a text file containing the data I want from the separate files, but as a single column. (i.e. I've managed to 'stack' the columns on top of each other, rather than have them all side by side as separate columns). I feel I've missed something obvious.
In another attempt, I manage to write all the separate files to a single file, but without picking out the columns that I want.
import glob

files = [open(f) for f in glob.glob("result_*.txt")]
fout = open("ResultsCombined.txt", 'wb')
for row in range(0, 488):
    for f in files:
        fout.write(f.readline().strip())
        fout.write(' ')
    fout.write('\n')
fout.close()
What I basically want is to copy column 5 from each file (it is always the same column) and write them all to a single file.
If you don't know the maximum number of rows in the files and if the files can fit into memory, then the following solution would work:
import glob

files = [open(f) for f in glob.glob("*.txt")]

# Given a file, read the 6th column (index 5) of each line
def readcol5(f):
    return [line.split(' ')[5] for line in f]

filecols = [readcol5(f) for f in files]
maxrows = len(max(filecols, key=len))

# Given a column list, make sure it has maxrows elements, padding with empty strings
def extendmin(arr):
    diff = maxrows - len(arr)
    arr.extend([''] * diff)
    return arr

filecols = map(extendmin, filecols)
lines = zip(*filecols)
lines = map(lambda x: ','.join(x), lines)
lines = '\n'.join(lines)
fout = open('output.csv', 'w')
fout.write(lines)
fout.close()
Or this option (following your second approach):
import glob

files = [open(f) for f in glob.glob("result_*.txt")]
fout = open("ResultsCombined.txt", 'w')
for row in range(0, 488):
    for f in files:
        fout.write(f.readline().strip().split(' ')[5])
        fout.write(' ')
    fout.write('\n')
fout.close()
... which uses a fixed number of rows per file but will work for very large numbers of rows because it is not storing the intermediate values in memory. For moderate numbers of rows, I'd expect the first answer's solution to run more quickly.
Why not read all the entries from each file's 5th column into a list and, after reading in all the files, write them all to the output file?
data = [
    [],  # entries from first file
    [],  # entries from second file
    ...
]

for i in range(number_of_rows):
    outputline = []
    for vals in data:
        outputline.append(vals[i])
    outfile.write(" ".join(outputline) + "\n")  # newline after each row
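A fleshed-out version of that sketch (a minimal example; the glob pattern and output file name are placeholders, and split() is used so repeated spaces don't matter):
import glob

# collect column 5 (the 6th field) from every matching file
data = []
for path in sorted(glob.glob("result_*.txt")):
    with open(path) as f:
        data.append([line.split()[5] for line in f])

number_of_rows = len(data[0])  # assumes all files have the same number of rows
with open("ResultsCombined.txt", "w") as outfile:
    for i in range(number_of_rows):
        outputline = [vals[i] for vals in data]
        outfile.write(" ".join(outputline) + "\n")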
