Loop through pairs of files in Python

I have a script that receives two files as input and creates a dictionary based on their lines. Finally, it overwrites the first file.
I am looking for a way to run this script on all file pairs in a folder, choosing sys.argv[1] and sys.argv[2] based on a pattern in the file names.
import re
import sys

datafile = sys.argv[1]
schemaseqs = sys.argv[2]

datafile_lines = []
d = {}
prev = None
with open(datafile, 'r') as f:
    i = 0
    for line in f:
        if i % 2 == 0:
            d[line.strip()] = 0
            prev = line.strip()
        else:
            d[prev] = line.strip()
        i += 1

new_d = {}
with open(schemaseqs, 'r') as f:
    i = 0
    prev = None
    for line in f:
        if i % 2 == 0:
            new_d[line.strip()] = 0
            prev = line.strip()
        else:
            new_d[prev] = line.strip()
        i += 1

for key, value in d.items():
    if value in new_d:
        d[key] = new_d[value]

print(d)

with open(datafile, 'w') as filee:
    for k, v in d.items():
        filee.writelines(k)
        filee.writelines('\n')
        filee.writelines(v)
        filee.writelines('\n')
I have hundreds of file pairs, all sharing the same pattern proteinXXXX (where XXXX is a number). This number can have up to four digits (e.g. 9, 99, 999 or 9999). So I have protein555.txt and protein555.fasta.
I've seen I can use glob or os.listdir to read files from a directory. However, I cannot work out how to assign each pair to variables and process the lines one pair at a time for every pair in the directory.
Any help is appreciated.

Just the concept.
Import required libraries.
import glob
import os.path
Define function that extracts only the basename (the part without extension) from filename.
def basename(fn):
    return os.path.splitext(os.path.basename(fn))[0]
Create two sets, one with .txt files, another with .fasta files.
t = {basename(fn) for fn in glob.glob("protein*.txt")}
f = {basename(fn) for fn in glob.glob("protein*.fasta")}
Calculate the intersection of these two sets to be sure that both the .txt and the .fasta file exist for the same basename. Then add the suffixes back and let the existing code process each pair, as sketched below.
for bn in t.intersection(f):
    process(bn + ".txt", bn + ".fasta")
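As a minimal sketch of that last step, the body of the original script can be wrapped in a process() function that takes the two paths as arguments instead of reading sys.argv (the read_pairs helper below is only an illustration, not part of the original code):

import glob
import os.path


def basename(fn):
    return os.path.splitext(os.path.basename(fn))[0]


def read_pairs(path):
    # Read a file of alternating key/value lines into a dict,
    # exactly as the original script does for both of its inputs.
    pairs = {}
    prev = None
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if i % 2 == 0:
                prev = line.strip()
                pairs[prev] = 0
            else:
                pairs[prev] = line.strip()
    return pairs


def process(datafile, schemaseqs):
    # Same logic as the original script, with the two file paths
    # passed in as arguments instead of sys.argv[1] and sys.argv[2].
    d = read_pairs(datafile)
    new_d = read_pairs(schemaseqs)
    for key, value in d.items():
        if value in new_d:
            d[key] = new_d[value]
    with open(datafile, 'w') as out:
        for k, v in d.items():
            out.write(k + '\n')
            out.write(v + '\n')


t = {basename(fn) for fn in glob.glob("protein*.txt")}
f = {basename(fn) for fn in glob.glob("protein*.fasta")}
for bn in t.intersection(f):
    process(bn + ".txt", bn + ".fasta")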

Related

Python: Reading multiple files and storing the output for a particular file

I have a thousand .xvg files in a directory which I need to read and store an output for each of them.
Currently I have a Python script which works for only one file. Could you please suggest how I can read all the files, get an output, and store it for every file separately?
f = open('/home/abc/xyz/coord/coord_1.xvg')
dat = f.readlines()
dat1 = dat[22:len(dat)]
dat2 = []
for k in dat1:
    dat2.append(k.split())
for k in dat2:
    if float(k[1]) >= 9.5:
        print('P')
        break
    elif float(k[1]) <= 5.9:
        print('R')
        break
    else:
        print('N')
Here's a version that reuses as much of your code as possible to make it easier to follow.
import os

def process_files():
    " Will process all files in the folder using your code "
    for file in os.listdir("."):  # '.' corresponds to the current directory
        # You can specify whatever directory,
        # such as /usr/accidental_coder/working
        if file.endswith(".xvg"):
            # File found
            # Output will have the same name but with a .txt suffix
            with open(os.path.join(".", file), 'r') as infile, \
                 open(os.path.join(".", file.replace('.xvg', '.txt')), 'w') as ofile:
                # Using your original code,
                # left alone so you know what to change if desired
                # (note: yours can be shortened)
                dat = infile.readlines()
                dat1 = dat[22:len(dat)]
                dat2 = []
                for k in dat1:
                    dat2.append(k.split())
                for k in dat2:
                    if float(k[1]) >= 9.5:
                        ofile.write('P\n')
                        break
                    elif float(k[1]) <= 5.9:
                        ofile.write('R\n')
                        break
                    else:
                        ofile.write('N\n')

process_files()
Refactoring Your Code for Better Performance
It seems you just process the 23rd line in each file:
import os

def process_files():
    for file in os.listdir("."):
        # Examples of getting files from directories:
        # https://stackoverflow.com/questions/3964681/find-all-files-in-a-directory-with-extension-txt-in-python
        if file.endswith(".xvg"):
            with open(os.path.join(".", file), 'r') as infile, \
                 open(os.path.join(".", file.replace('.xvg', '.txt')), 'w') as ofile:
                # Skip the first 22 lines
                for _ in range(22):
                    next(infile)
                # Use the data on the 23rd line
                data = next(infile)
                k = data.split()
                if float(k[1]) >= 9.5:
                    ofile.write('P\n')
                elif float(k[1]) <= 5.9:
                    ofile.write('R\n')
                else:
                    ofile.write('N\n')

process_files()

Wrong output when trying to create a manifest-generating script (python) for Rail-RNA. Files do not match

What I want: To create a "manifest" for running rail-RNA (http://rail.bio/) on a number of fastq files, that I have in a directory, in the following form:
FASTQ URL 1 (tab) optional MD5 1 (tab) FASTQ URL 2 (tab) optional MD5 2 (tab) sample label
like:
/home/data/10080-17_r1.fastq 0 /home/data/10080-17_r2.fastq 0 10080-17_r1
/home/data/10300-25_r1.fastq 0 /home/data/10300-25_r2.fastq 0 10300-25_r1
/home/data/40500-72_r1.fastq 0 /home/data/40500-72_r2.fastq 0 10300-25_r2
.. and so on
What I have done: created a python script to generate a manifest from fastq files in a specific directory:
#!/usr/bin/python
import os
import csv

thisdir = os.getcwd()

# Create empty lists
forward = []
reverse = []
zero = []
names = []

# Extract file-paths for files ending with .fastq and append them to "forward" and "reverse"
for r, d, f in os.walk(thisdir):  # r=root, d=directories, f=files
    for file in f:
        if "_1.fastq" in file:
            forward.append(os.path.join(r, file))
        if "_2.fastq" in file:
            reverse.append(os.path.join(r, file))

# make a list containing 0 with the length of the forward list
for i in range(len(forward)):
    zero.append('0')

# extract filenames without extensions:
l = os.listdir(thisdir)
li = [x.split('.')[0] for x in l]
for name in li:
    if "_1" in name:
        names.append(name)
names = [s.strip('_1') for s in names]

# write the output to a file
with open('manifest.txt', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for path in zip(forward, zero, reverse, zero, names):
        writer.writerow(list(path))
What is wrong?: I get a manifest.txt in the right format, BUT it does not match the right *_r1.fastq and *_r2.fastq files. It does something like this (the r1's in the first column do not match the r2's in the third column):
/home/data/10080-17_r1.fastq 0 /home/data/40500-72_r2.fastq 0 10080-17_r1
/home/data/10300-25_r1.fastq 0 /home/data/10080-17_r2.fastq 0 10300-25_r1
/home/data/40500-72_r1.fastq 0 /home/data/10300-25_r2.fastq 0 10300-25_r2
Do some of you, more experienced Python'ers have a solution to fix this?
That would be greatly appreciated!
Best wishes, Birgitte
In the provided solution this error occurs if the number of *_r1.fastq files doesn't correspond to the number of *_r2.fastq files, as that code builds the new CSV rows only by array index and doesn't compare file names.
I updated that solution. Check the file names; they should look like:
/home/data/10080-17_r1.fastq
/home/data/10080-17_r2.fastq
At the moment we get all forward files (*_r1.fastq) and try to find the matching reverse file (*_r2.fastq) in the same directory. If we don't find it, we put '-' instead of the reverse file's name.
Please check the code and read comments:
#!/usr/bin/python
import os
import csv

this_dir = os.getcwd()

forward_arr = []
reverse_arr = []
for r, d, f in os.walk(this_dir):  # r=root, d=directories, f=files
    for file in f:
        if "_r1.fastq" in file:
            forward_arr.append(os.path.join(r, file))
        if "_r2.fastq" in file:
            reverse_arr.append(os.path.join(r, file))

# collect result rows in this array
csv_arr = []

# for each file in forward_arr
for forward_file in forward_arr:
    # get the sample label from the full file path
    # 1. split by '/' and select the last element:
    #    /home/data/10080-17_r1.fastq -> 10080-17_r1.fastq
    # 2. split by '_r' and select the first element: 10080-17_r1.fastq -> 10080-17
    sample_label = forward_file.split('/')[-1].split('_r')[0]

    # we will search for the reverse file matching this forward file in reverse_arr,
    # but if we don't find it we'll put '-' instead of the path to the reverse file
    reverse_file_result = "-"
    # we are looking for a file with the same name in the same location,
    # but it should be a reverse file with '_r2' instead of '_r1' in its name
    reverse_file_for_search = forward_file.replace("_r1", "_r2")

    # search for that reverse file in reverse_arr
    for reverse_file in reverse_arr:
        # if we found it
        if reverse_file_for_search == reverse_file:
            # assign the reverse file name
            # to reverse_file_result instead of '-'
            reverse_file_result = reverse_file
            # go to the next forward_file
            break

    # at this point we could compute the md5 for the FORWARD file
    md5_url_1 = 0
    # at this point we could compute the md5 for the REVERSE file
    md5_url_2 = 0

    # append the result row to csv_arr
    csv_arr.append((forward_file, md5_url_1, reverse_file_result,
                    md5_url_2, sample_label))

# write all collected rows to the csv file in one go
with open('manifest.txt', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(csv_arr)
I think this should work for what you need. It's difficult to make out, because this line:
names = [s.strip('_1') for s in names]
doesn't do what it looks like it should: str.strip removes a set of characters from both ends of the string, not a suffix (and I suspect the intent was "_r1", as in the first loop, which I've handled differently below).
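To see why, str.strip treats its argument as a set of characters to trim from both ends rather than as a suffix (the sample name here is taken from the question):

>>> '10080-17_r1'.strip('_1')
'0080-17_r'
>>> '10080-17_r1'.strip('_r1')
'0080-17'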
import os
import csv

thisdir = os.getcwd()

# Create empty lists
forward = []
reverse = []
names = []

for r, d, f in os.walk(thisdir):  # r=root, d=directories, f=files
    for file in f:
        if file.endswith("_r1.fastq"):
            forward.append(os.path.join(r, file))
            # sample label: the file name without the "_r1.fastq" suffix
            names.append(file[:-len("_r1.fastq")])
        elif file.endswith("_r2.fastq"):
            reverse.append(os.path.join(r, file))

# write the output to a file
with open('manifest.txt', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for fwd, rev, nam in zip(forward, reverse, names):
        writer.writerow([fwd, 0, rev, 0, nam])

Unable to strip and store content of some files in CSV format

I have a number of files, all with the same layout. They are placed in:
~/ansible-environments/aws/random_name_1/inventory/group_vars/all
~/ansible-environments/aws/random_name_2/inventory/group_vars/all
~/ansible-environments/aws/random_name_3/inventory/group_vars/all
I wrote:
import os
import sys

rootdir = '/home/USER/ansible-environments/aws'
#print "aa"
for root, subdirs, files in os.walk(rootdir):
    for subdir in subdirs:
        all_path = os.path.join(rootdir, subdir, "inventory", "group_vars", "all")
        if not os.path.isfile(all_path):
            continue
        try:
            with open(all_path, "r") as f:
                all_content = f.readlines()
        except (OSError, IOError):
            continue  # ignore errors
        csv_line = [""] * 3
        for line in all_content:
            if line[:9] == "isv_alias:":
                csv_line[0] = line[7:].strip()
            elif line[:21] == "LMID:":
                csv_line[1] = line[6:].strip()
            elif line[:17] == "products:":
                csv_line[2] = line[10:].strip()
        if all(value != "" for value in csv_line):
            with open(os.path.join("/home/nsingh/nishlist.csv"), "a") as csv:
                csv.write(",".join(csv_line))
                csv.write("\n")
I just need the LMIT, isv_alias and products in the following format:
alias,LMIT,product
bloodyhell,80,rms_scl
something_else,434,some_other_prod
There are three problems here:
Finding all key-value files
Extracting keys and values from each file
Turning the keys and values from each file into rows in a CSV
First use os.listdir() to find the contents of ~/ansible-environments/aws, then build the expected path of the inventory/group_vars directory inside each using os.path.join(), and see which ones actually exist. Then list the contents of those directories that do exist, and assume all files inside (such as all) are key-value files. The example code at the end of this answer assumes that all files can be found this way; if they cannot, you may have to adapt the example code to find the files using os.walk() or another method.
Each key-value file is a sequence of lines, where each line is a key and value separated by a colon (":"). Your approach using a search for a substring (operator in) will fail if, say, the secret key contains the string "LMIT". Instead, split the line at the colon. The expression line.split(":", 1) splits the line at the first colon, but not at subsequent colons in case the value itself has a colon. Then strip off excess whitespace from the key and value, and build a dictionary of keys and values.
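For example, a line whose value itself contains a colon still splits only at the first one (the value here is made up purely for illustration):

>>> [x.strip() for x in "products: rms_scl:v2".split(":", 1)]
['products', 'rms_scl:v2']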
Now choose which keys you want to keep. Once you've parsed each file, look up the associated values in the dictionary from that file, and build a list out of them. Then add the list of values from this file to a list of lists of values from all files, and use csv.writer to write out the list of lists as a CSV file.
It might look something like this:
#!/usr/bin/env python2
from __future__ import with_statement, print_function, division
import os
import csv

def read_kv_file(filename):
    items = {}
    with open(filename, "rU") as infp:
        for line in infp:
            # Split at a colon and strip leading and trailing space
            line = [x.strip() for x in line.split(":", 1)]
            # Add the key and value to the dictionary
            if len(line) > 1:
                items[line[0]] = line[1]
    return items

# First find all random names
outer_dir = os.path.expanduser("~/ansible-environments/aws")
random_names = os.listdir(outer_dir)
inner_dirs = [
    os.path.join(outer_dir, name, "inventory/group_vars")
    for name in random_names
]
# Now filter it to those directories that actually exist
inner_dirs = [name for name in inner_dirs if os.path.isdir(name)]

wanted_keys = ["alias", "LMIT", "products"]
out_columns = ["alias", "LMIT", "product"]

# Collect key-value pairs from all files in these folders
rows = []
for dirname in inner_dirs:
    for filename in os.listdir(dirname):
        path = os.path.join(dirname, filename)
        # Skip non-files in this directory
        if not os.path.isfile(path):
            continue
        # If the file has a non-blank value for any of the keys of
        # interest, add a row
        items = read_kv_file(path)
        this_file_values = [items.get(key) for key in wanted_keys]
        if any(this_file_values):
            rows.append(this_file_values)

# And write them out
with open("out.csv", "wb") as outfp:
    writer = csv.writer(outfp, "excel")
    writer.writerow(out_columns)
    writer.writerows(rows)
You didn't specify how you are obtaining the files (the f in the first line), but under the assumption that you've sorted out the file traversal and that the files are exactly as you present them (no extra spaces or anything like that), you can modify your code to:
csv_line = [""] * 3
for line in f:
if line[:6] == "alias:":
csv_line[0] = line[7:].strip()
elif line[:5] == "LMIT:":
csv_line[1] = line[6:].strip()
elif line[:9] == "products:":
csv_line[2] = line[10:].strip()
with open(rootdir + '/' + 'list.csv', "a") as csv:
csv.write(",".join(csv_line))
csv.write("\n")
This will add a new line with the proper vars to your CSV for each file that was loaded as f; however, keep in mind that it doesn't check data validity, so it will happily write empty lines if the opened file didn't contain the proper data.
You can prevent that by checking for all(value != "" for value in csv_line) before opening the csv file for writing. You can use any instead of all if you want to write entries that have at least one variable populated.
UPDATE: The code you just pasted has serious indentation and structural issues, but it at least makes it clearer what you want to do. Assuming everything else is ok, this should do it:
for root, subdirs, files in os.walk(rootdir):
    for subdir in subdirs:
        all_path = os.path.join(rootdir, subdir, "inventory", "group_vars", "all")
        if not os.path.isfile(all_path):
            continue
        try:
            with open(all_path, "r") as f:
                all_content = f.readlines()
        except (OSError, IOError):
            continue  # ignore errors
        csv_line = [""] * 3
        for line in all_content:
            if line[:6] == "alias:":
                csv_line[0] = line[7:].strip()
            elif line[:5] == "LMIT:":
                csv_line[1] = line[6:].strip()
            elif line[:9] == "products:":
                csv_line[2] = line[10:].strip()
        if all(value != "" for value in csv_line):
            with open(os.path.join(rootdir, "list.csv"), "a") as csv:
                csv.write(",".join(csv_line))
                csv.write("\n")

Python script to count num lines in all files in directory

So I'm new to Python and I'm trying to write a script that iterates through all .txt files in a directory, counts the number of lines in each one (with the exception of lines that are blank or commented out), and writes the final output to a csv. The final output should look something like this:
agprices, avi, adp
132, 5, 8
I'm having trouble with the syntax to save each count as the value of the dictionary. Here is my code below:
#!/usr/bin/env python
import csv
import copy
import os
import sys

#get current working dir, set count, and select file delimiter
d = os.getcwd()
count = 0
ext = '.txt'

#parses through files and saves to a dict
series_dict = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
    with open(os.path.join(d,f)) as file_obj:
        series_dict[f] = file_obj.read()
        if line.strip(): #Exclude blank lines
            continue
        else if line.startswith("#"): #Exclude commented lines
            continue
        else
            count +=1
        #Need to save count as val in dict here

#save the dictionary with key/val pairs to a csv
with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, series_dict.keys())
    w.writeheader()
    w.writerow(series_dict)
So here's the edit:
#!/usr/bin/env python
import csv
import copy
import os
import sys
import glob

#get current working dir, set count, and select file delimiter
os.chdir('/Users/Briana/Documents/Misc./PythonTest')

#parses through files and saves to a dict
series = {}
for fn in glob.glob('*.txt'):
    with open(fn) as f:
        series[fn] = (1 for line in f if line.strip() and not line.startswith('#'))
print series

#save the dictionary with key/val pairs to a csv
with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, series.keys())
    sum(names.values())
I'm getting an indentation error on the 2nd to last line and am not quite sure why? Also, I'm not positive that I'm writing the syntax correctly on the last part. Again, I'm simply trying to return a dictionary with names of files and number of lines in files like {a: 132, b:245, c:13}
You can try something along these lines:
import os
import glob

os.chdir(ur_directory)
names = {}
for fn in glob.glob('*.txt'):
    with open(fn) as f:
        names[fn] = sum(1 for line in f if line.strip() and not line.startswith('#'))
print names
That will print a dictionary similar to:
{'test_text.txt': 20, 'f1.txt': 3, 'lines.txt': 101, 'foo.txt': 6, 'dat.txt': 6, 'hello.txt': 1, 'f2.txt': 4, 'neglob.txt': 8, 'bar.txt': 6, 'test_reg.txt': 6, 'mission_sp.txt': 71, 'test_nums.txt': 8, 'test.txt': 7, '2591.txt': 8303}
And you can use that Python dict in csv.DictWriter.
If you want the sum of those, just do:
sum(names.values())
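And if you then want the one-header-row, one-count-row CSV layout from your question, a small sketch using that dict with csv.DictWriter could look like this (reusing the output file name from your script):

import csv

with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, names.keys())
    w.writeheader()    # header row: the file names
    w.writerow(names)  # data row: the line counts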
I think you should make two changes to your script:
Use glob.glob() to get the list of files matching your desired suffix
Use for line in file_obj to iterate through the lines
Other problem:
The indentation is wrong on your last few lines
You could count your lines in your files with this 1-liner:
line_nums = sum(1 for line in open(f) if line.strip() and line[0] != '#')
that would shorten your code segment to
for f in txt_files:
    count += sum(1 for line in open(os.path.join(d, f))
                 if line[0] != '#' and line.strip())
It looks like you want to use a dictionary to keep track of the counts. You could create one at the top like this: counts = {}
Then (once you fix your tests) you can update it for each non-comment line:
counts = {}
series_dict = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
    counts[f] = 0  # create an entry in the dictionary to keep track of one file's lines
    with open(os.path.join(d, f)) as file_obj:
        series_dict[f] = file_obj.read()
    for line in series_dict[f].splitlines():
        if line.startswith("#"):  #Exclude commented lines
            continue
        elif line.strip():  #Exclude blank lines
            counts[f] += 1

How to match file names in a file using Python

How can I find out whether two files with the same pattern exist inside a file listing? If every filename has a two-file set (csv.new and csv), go ahead to the next step; otherwise exit with an error message.
The prefix "abc_package" will have two files, one with the extension "csv.new" and a second with the extension "csv". There could be many filenames inside "list_of_files.txt".
Ex: List_of_files.txt
abc_package.1406728501.csv.new
abc_package.1406728501.csv
abc_package.1406724901.csv.new
abc_package.1406724901.csv
For matching file names in Python you can use the fnmatch module. Here is a sample based on the documentation.
import fnmatch
import os

for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*.txt'):
        print file
The syntax would be fnmatch.fnmatchcase(filename, pattern)
Please have a look here for more examples
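Applied to this question's listing, a case-sensitive check with fnmatchcase might look like this (the pattern is only an illustration, not taken from the question):

import fnmatch

with open('List_of_files.txt') as f:
    for line in f:
        name = line.strip()
        if fnmatch.fnmatchcase(name, 'abc_package.*.csv.new'):
            print name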
with open("in.txt","r") as fo:
f = fo.readlines()
cs_new = set()
cs = set()
for ele in f:
ele = ele.rstrip()
if not ele.endswith(".new"):
cs.add(ele)
else:
cs_new.add(ele.split(".new")[0])
diff = cs ^ cs_new
for fi in diff:
print fi
As you need either filename, you will need to check for existence against both lists:
with open("in.txt","r") as f:
f = [x.rstrip() for x in f]
cs, cs_new, diff = [],[],[]
for ind, ele in enumerate(f):
if ele.endswith(".csv"):
cs.append(ele)
else:
cs_new.append([ele.split(".new")[0],ind]) # keep track of original element in with the ind/index
for ele in cs:
if not any(ele in x for x in cs_new):
diff.append(ele)
for ele in cs_new:
if not any(ele[0] in x for x in cs):
diff.append(f[ele[1]]) # append original element with full extension
Assuming the file isn't so ridiculously huge that you can't fit it into memory, just create a set of all .csv.new files and a set of all .csv files and verify that they're identical. For example:
csvfiles = set()
newfiles = set()
with open('List_of_files.txt') as f:
    for line in f:
        line = line.rstrip()
        if line.endswith('.csv.new'):
            newfiles.add(line[:-4])
        elif line.endswith('.csv'):
            csvfiles.add(line)
if csvfiles != newfiles:
    raise ValueError('Mismatched files!')
If you want to know which files were mismatched, csvfiles - newfiles gives you the .csv files without corresponding .csv.new, and newfiles - csvfiles gives you the opposite.
(There are ways to make this cleaner and more readable, from using os.path.splitext to using a general partition-an-iterable-by-filter function, but I think this should be the easiest for a novice to immediately understand.)
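A small sketch of that reporting, reusing the two sets built above:

# names present as .csv but missing a .csv.new partner, and vice versa
missing_new = csvfiles - newfiles
missing_csv = newfiles - csvfiles
for name in sorted(missing_new):
    print name + ' has no matching .csv.new'
for name in sorted(missing_csv):
    print name + ' exists only as .csv.new'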
