Python: Separating txt file to multiple files using a reoccuring symbol - python

I have a .txt file of amino acids separated by ">node" like this:
Filename.txt :
>NODE_1
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
>NODE_2
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
I want to separate this file into two (or as many as there are nodes) files;
Filename1.txt :
>NODE
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
Filename2.txt :
>NODE
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
With a number after the filename
This code works, however it deletes the ">NODE" line and does not create a file for the last node (the one without a '>' afterwards).
with open('FilePathway') as fo:
op = ''
start = 0
cntr = 1
for x in fo.read().split("\n"):
if x.startswith('>'):
if start == 1:
with open (str(cntr) + '.fasta','w') as opf:
opf.write(op)
opf.close()
op = ''
cntr += 1
else:
start = 1
else:
if op == '':
op = x
else:
op = op + '\n' + x
fo.close()
I canĀ“t seem to find the mistake. Would be thankful if you could point it out to me.
Thank you for your help!
Hi again! Thank you for all the comments. With your help, I managed to get it to work perfectly. For anyone with similar problems, this is my final code:
import os
import glob
folder_path = 'FilePathway'
for filename in glob.glob(os.path.join(folder_path, '*.fasta')):
with open(filename) as fo:
for line in fo.readlines():
if line.startswith('>'):
original = line
content = [original]
fileno = 1
filename = filename
y = filename.replace(".fasta","_")
def writefasta():
global content, fileno
if len(content) > 1:
with open(f'{y}{fileno}.fasta', 'w') as fout:
fout.write(''.join(content))
content = [line]
fileno += 1
with open('FilePathway') as fin:
for line in fin:
if line.startswith('>NODE'):
writefasta()
else:
content.append(line)
writefasta()

You could do it like this:
def writefasta(d):
if len(d['content']) > 1:
with open(f'Filename{d["fileno"]}.fasta', 'w') as fout:
fout.write(''.join(d['content']))
d['content'] = ['>NODE\n']
d['fileno'] += 1
with open('test.fasta') as fin:
D = {'content': ['>NODE\n'], 'fileno': 1}
for line in fin:
if line.startswith('>NODE'):
writefasta(D)
else:
D['content'].append(line)
writefasta(D)

This would be better way. It is going to write only on odd iterations. So that, ">NODE" will be skipped and files will be created only for the real content.
with open('filename.txt') as fo:
cntr=1
for i,content in enumerate(fo.read().split("\n")):
if i%2 == 1:
with open (str(cntr) + '.txt','w') as opf:
opf.write(content)
cntr += 1
By the way, since you are using context manager, you dont need to close the file.
Context managers allow you to allocate and release resources precisely
when you want to. It opens the file, writes some data to it and then
closes it.
Please check: https://book.pythontips.com/en/latest/context_managers.html

with open('FileName') as fo:
cntr = 1
for line in fo.readlines():
with open (f'{str(cntr)}.fasta','w') as opf:
opf.write(line)
opf.close()
op = ''
cntr += 1
fo.close()

Related

Open two files. Read from the first file and do lookup on second before writing a line

I have two text files. I can open both with Python successfully.
I open the first file and read a data element into a variable using the for l in file construct.
I open the second file and read a data element into a variable using the for l in file construct.
If both variables match I write data to a text file. For the first line read it works perfectly but subsequent lines do not. The FIN variable never changes even though it finds a new line that starts with D further along. Is there a way to loop through two files like this? Am I missing something obvious?
File2Split = 'c:\\temp\\datafile\\comparionIP.txt'
GetResident = 'c:\\temp\\datafile\\NPINumbers.txt'
writefile = open('c:\\temp\\datafile\\comparionIPmod.txt','w')
openfile = open(File2Split,'r')
openfileNPI = open(GetResident,'r')
FIN = ''
FirstChar = ''
FIN2 = ''
for l in openfile:
FirstChar = (l[0:1])
if FirstChar =='D':
FIN = (l[21:31])
#print (FIN)
if FIN.startswith('1'):
writefile.write(l)
elif FirstChar in ['F','G','C','R']:
writefile.write(l)
elif FirstChar =='N':
for l2 in openfileNPI:
FIN2 = (l2[0:10])
NPI = ('N' + (l2[11:21]))
if FIN2 == FIN:
writefile.write(NPI + '\n')
openfileNPI.close()
openfile.close()
writefile.close()

Python write blank

I'm trying to make a program which would replace tags in a markdown file (.md) as follow :
If it's an opening $ tag, replace it by \(, if it's a closing $ tag, replace it by \), copy every other characters.
Unfortunately, when I try it, the file written is really strange. Some lines are copied but others aren't. First, the first and last line of every of my test files weren't copied. Other lines in the middle weren't as well. Same text on different line are not both copied.
Here is my program :
import os
def conv1(path):
"""convert $$ tags to \( \)"""
file = open(path, mode ='r') # open lesson with $ (.md)
new = open(path + '.tmp', mode = 'w') # open blank file
test = 0
for lines in file:
line = file.readline()
i = 0
length = len(line)
while i < length:
if line[i] == '$':
if test % 2 == 0: # replace opening tag
line = line[:i] + '\(' + line [i + 1:]
elif test % 2 == 1: # replace closing tag
line = line[:i] + '\)' + line [i + 1:]
test +=1
i += 2
length += 1
else :
i += 1
new.write(line + '\n')
file.close()
new.close()
os.rename(str(path) + '.tmp', str(path))
print('Done!')
Do you have any idea how to fix my issue?
Thanks in advance
EloiLmr
These line are causing every other line to be skipped:
for lines in file:
line = file.readline()
Calling file.readline() unnecessarily advances the file pointer by one line. It's enough to iterate over the file:
for line in file:
...

How to search for a string in part of text?

I am trying to search multiple text files for the text "1-2","2-3","3-H" which occur in the last field of the lines of text that start with "play".
An example of the text file is show below
id,ARI201803290
version,2
info,visteam,COL
info,hometeam,ARI
info,site,PHO01
play,1,0,lemad001,22,CFBBX,HR/78/F
play,1,0,arenn001,20,BBX,S7/L+
play,1,0,stort001,12,SBCFC,K
play,1,0,gonzc001,02,SS>S,K
play,1,1,perad001,32,BTBBCX,S9/G
play,1,1,polla001,02,CSX,S7/L+.1-2
play,1,1,goldp001,32,SBFBBB,W.2-3;1-2
play,1,1,lambj001,00,X,D9/F+.3-H;2-H;1-3
play,1,1,avila001,31,BC*BBX,31/G.3-H;2-3
play,2,0,grayj003,12,CC*BS,K
play,2,1,dysoj001,31,BBCBX,43/G
play,2,1,corbp001,31,CBBBX,43/G
play,4,1,avila001,02,SC1>X,S8/L.1-2
For the text file above, I would like the output to be '4' since there are 4 occurrences of "1-2","2-3" and "3-H" in total.
The code I have got so far is below, however I'm not sure where to start with writing a line of code to do this function.
import os
input_folder = 'files' # path of folder containing the multiple text files
# create a list with file names
data_files = [os.path.join(input_folder, file) for file in
os.listdir(input_folder)]
# open csv file for writing
csv = open('myoutput.csv', 'w')
def write_to_csv(line):
print(line)
csv.write(line)
j=0 # initialise as 0
count_of_plate_appearances=0 # initialise as 0
for file in data_files:
with open(file, 'r') as f: # use context manager to open files
for line in f:
lines = f.readlines()
i=0
while i < len(lines):
temp_array = lines[i].rstrip().split(",")
if temp_array[0] == "id":
j=0
count_of_plate_appearances=0
game_id = temp_array[1]
awayteam = lines[i+2].rstrip().split(",")[2]
hometeam = lines[i+3].rstrip().split(",")[2]
date = lines[i+5].rstrip().split(",")[2]
for j in range(i+46,i+120,1): #only check for plate appearances this when temp_array[0] == "id"
temp_array2 = lines[j].rstrip().split(",") #create new array to check for plate apperances
if temp_array2[0] == "play" and temp_array2[2] == "1": # plate apperance occurs when these are true
count_of_plate_appearances=count_of_plate_appearances+1
#print(count_of_plate_appearances)
output_for_csv2=(game_id,date,hometeam, awayteam,str(count_of_plate_appearances))
print(output_for_csv2)
csv.write(','.join(output_for_csv2) + '\n')
i=i+1
else:
i=i+1
j=0
count_of_plate_appearances=0
#quit()
csv.close()
Any suggestions on how I can do this? Thanks in advance!
You can use regex, I put your text in a file called file.txt.
import re
a = ['1-2', '2-3', '3-H'] # What you want to count
find_this = re.compile('|'.join(a)) # Make search string
count = 0
with open('file.txt', 'r') as f:
for line in f.readlines():
count += len(find_this.findall(line)) # Each findall returns the list of things found
print(count) # 7
or a shorter solution: (Credit to wjandrea for hinting the use of a generator)
import re
a = ['1-2', '2-3', '3-H'] # What you want to count
find_this = re.compile('|'.join(a)) # Make search string
with open('file.txt', 'r') as f:
count = sum(len(find_this.findall(line)) for line in f)
print(count) # 7

How can I change for loop while in it?

for x in file.readlines():
something()
I think this code caching all the lines when loop is started. I deleting some of the lines from the file but it still repeating deleted lines. How can I change loop while in it?
def wanted(s,d):
print("deneme = " + str(s))
count = 0
total = 0
TG_count = TC_count = TA_count = GC_count = CC_count = CG_count = GG_count = AA_count = AT_count = TT_count = CT_count = AG_count = AC_count = GT_count = 0
for x in range(d,fileCount):
print(str(x+1) + 'st file processing...')
searchFile = open(str(x) + '.txt',encoding = 'utf-8',mode = "r+")
l = searchFile.readlines()
searchFile.seek(0)
for line in l:
if s in line[:12]:
blabla()
else:
searchFile.write(line)
searchFile.truncate()
searchFile.close()
for p in range(fileCount):
searchFile = open(str(p) + '.txt',encoding = 'utf-8',mode = "r+")
for z in searchFile.readlines():
wanted(z[:12],p)
print("Progressing file " + str(p) + " complete")
I guess it's Python. Yes, readlines() reads the whole file at once. In order to avoid this you can use:
for x in file:
something()
Maybe you can find the appropriate information in the Python tutorial. It says
If you want to read all the lines of a file in a list you can also use list(f) or f.readlines().
So yes, all lines are read and stored in memory.
Also the manual says:
f.readline() reads a single line from the file;
More details can be found in the manual.

Segmentation Fault

I am using python 2.4.4 (old machine, can't do anything about it) on a UNIX machine. I am extremely new to python/programming and have never used a UNIX machine before. This is what I am trying to do:
extract a single sequence from a FASTA file (proteins + nucleotides) to a temporary text file.
Give this temporary file to a program called 'threader'
Append the output from threader (called tempresult.out) to a file called results.out
Remove the temporary file.
Remove the tempresult.out file.
Repeat using the next FASTA sequence.
Here is my code so far:
import os
from itertools import groupby
input_file = open('controls.txt', 'r')
output_file = open('results.out', 'a')
def fasta_parser(fasta_name):
input = fasta_name
parse = (x[1] for x in groupby(input, lambda line: line[0] == ">"))
for header in parse:
header = header.next()[0:].strip()
seq = "\n".join(s.strip() for s in parse.next())
yield (header, '\n', seq)
parsedfile = fasta_parser(input_file)
mylist = list(parsedfile)
index = 0
while index < len(mylist):
temp_file = open('temp.txt', 'a+')
temp_file.write(' '.join(mylist[index]))
os.system('threader' + ' temp.txt' + ' tempresult.out' + ' structures.txt')
os.remove('temp.txt')
f = open('tempresult.out', 'r')
data = str(f.read())
output_file.write(data)
os.remove('tempresult.out')
index +=1
output_file.close()
temp_file.close()
input_file.close()
When I run this script I get the error 'Segmentation Fault'. From what I gather this is to do with me messing with memory I shouldn't be messing with (???). I assume it is something to do with the temporary files but I have no idea how I would get around this.
Any help would be much appreciated!
Thanks!
Update 1:
Threader works fine when I give it the same sequence multiple times like this:
import os
input_file = open('control.txt', 'r')
output_file = open('results.out', 'a')
x=0
while x<3:
os.system('threader' + ' control.txt' + ' tempresult.out' + ' structures.txt')
f = open('tempresult.out', 'r')
data = str(f.read())
output_file.write(data)
os.remove('result.out')
x += 1
output_file.close()
input_file.close()
Update 2: In the event that someone else gets this error. I forgot to close temp.txt before invoking the threader program.

Categories

Resources