I have been trying to debug my code for searching strings across two files, but I can't understand why the strings are not found every time. I have been stuck here for half a day; perhaps you could help me understand the error?
The logic is as follows (after filtering out lines in "try_ID.txt" with len(re.findall("Ca", row)) == 0 or len(re.findall("Co", row)) == 0): if neither the Ca nor the Co ID from "try_ID.txt" appears in "try.txt" and "try_C.txt", we take the first if branch; if only one of them is found in "try.txt" or "try_C.txt", we take one of the elif branches; if both are found in their respective files, we take the else branch.
The problem is that, with my code, every item goes into the first if branch (both not found). I don't know why.
My code:
import re

with open("try_ID.txt", 'r') as fin, \
     open("try_C.txt", 'r') as co_splice, \
     open("try.txt", 'r') as ca_splice:
    for row in fin:
        if len(re.findall("Ca", row)) == 0 or len(re.findall("Co", row)) == 0:
            pass
        else:  # problem starts from here
            name = str(row.split()[1]) + "_blast"
            if not row.split()[1] in ca_splice.read() and not row.split()[2] in co_splice.read():
                print(row.split()[0:2])
            elif row.split()[1] in ca_splice.read() and not row.split()[2] in co_splice.read():
                print(row.split()[1] + "Ca")
            elif not row.split()[1] in ca_splice.read() and row.split()[2] in co_splice.read():
                print(row.split()[2] + "Co")
            else:
                ne_name = name + "recip"
                print(ne_name)
"try_ID.txt"
H21911 Ca29092.1t A05340.1
H21912 Ca19588.1t Co27353.1t A05270.1
H21913 Ca19590.1t Co14899.1t A05260.1
H21914 Ca19592.1t Co14897.1t A05240.1
H21915 Co14877.1t A05091.1
S25338 Ca12595.1t Co27352.1t A53970.1
S20778 Ca29091.1t Co24326.1t A61120.1
S26552 Ca20916.1t Co14730.1t A16155.1
"try_C.txt"
Co14730.1t;Co14730.2t
Co27352.1t;Co27352.2t;Co27352.3t;Co27352.4t;Co27352.5t
Co14732.1t;Co14732.2t
Co4217.1t;Co4217.2t
Co27353.1t;Co27353.2t
Co14733.1t;Co14733.2t
"try.txt"
Ca12595.1t;Ca12595.2t
Ca29091.1t;Ca29091.2t
Ca1440.1t;Ca1440.2t
Ca29092.1t;Ca29092.2t
Ca20916.1t;Ca20916.2t
The weird thing is that when I try a small piece of code like the one below, it does find the string.
row = "H20118 Ca12595.1t Co18779.1t A01010.1"
text_file = "try.txt"
with open(text_file, 'r') as fin:
    if row.split()[1] in fin.read():
        print(True)
    else:
        print(False)
I really don't understand.
Try to read, split, and search only once wherever possible, and try to keep it simple. The root problem in your version is that a file object can be read() only once: the first call consumes the whole stream, so every later ca_splice.read() or co_splice.read() returns an empty string, the membership tests always fail, and everything falls into the first branch. Read each file into a string up front and search that:
with open("try_ID.txt", 'r') as fin, \
     open("try_C.txt", 'r') as co_splice, \
     open("try.txt", 'r') as ca_splice:
    co_splice = co_splice.read()   # read each lookup file exactly once
    ca_splice = ca_splice.read()
    for row in fin:
        if 'Ca' in row and 'Co' in row:   # process only rows that mention both
            zero, one, two, *_ = row.split()
            name = one + "_blast"
            one_in_ca = one in ca_splice  # search once, reuse the result
            two_in_co = two in co_splice
            if not one_in_ca and not two_in_co:
                print(zero, one, two)
            elif one_in_ca and not two_in_co:
                print(one + "Ca")
            elif not one_in_ca and two_in_co:
                print(two + "Co")
            else:
                ne_name = name + "recip"
                print(ne_name)
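To see the failure mode in the original version concretely: a file object is a one-shot stream, so a second read() returns an empty string, and "anything" in "" is always False. A minimal sketch reusing try.txt from the question:
with open("try.txt") as f:
    print(len(f.read()))  # the whole file
    print(len(f.read()))  # 0 -- the stream is exhausted, nothing left to match against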
I have a .txt file of amino acids separated by ">node" like this:
Filename.txt :
>NODE_1
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
>NODE_2
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
I want to separate this file into two (or as many as there are nodes) files:
Filename1.txt :
>NODE
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
Filename2.txt :
>NODE
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
Each with a number after the filename.
This code works; however, it deletes the '>NODE' line and does not create a file for the last node (the one without a '>' after it).
with open('FilePathway') as fo:
    op = ''
    start = 0
    cntr = 1
    for x in fo.read().split("\n"):
        if x.startswith('>'):
            if start == 1:
                with open(str(cntr) + '.fasta', 'w') as opf:
                    opf.write(op)
                opf.close()
                op = ''
                cntr += 1
            else:
                start = 1
        else:
            if op == '':
                op = x
            else:
                op = op + '\n' + x
fo.close()
I can't seem to find the mistake. I would be thankful if you could point it out to me.
Thank you for your help!
Hi again! Thank you for all the comments. With your help, I managed to get it to work perfectly. For anyone with similar problems, this is my final code:
import os
import glob

folder_path = 'FilePathway'
for filename in glob.glob(os.path.join(folder_path, '*.fasta')):
    with open(filename) as fo:
        for line in fo.readlines():
            if line.startswith('>'):
                original = line
    content = [original]
    fileno = 1
    y = filename.replace(".fasta", "_")

    def writefasta():
        global content, fileno
        if len(content) > 1:
            with open(f'{y}{fileno}.fasta', 'w') as fout:
                fout.write(''.join(content))
            content = [line]
            fileno += 1

    with open('FilePathway') as fin:
        for line in fin:
            if line.startswith('>NODE'):
                writefasta()
            else:
                content.append(line)
    writefasta()
You could do it like this:
def writefasta(d):
    if len(d['content']) > 1:
        with open(f'Filename{d["fileno"]}.fasta', 'w') as fout:
            fout.write(''.join(d['content']))
        d['content'] = ['>NODE\n']
        d['fileno'] += 1

with open('test.fasta') as fin:
    D = {'content': ['>NODE\n'], 'fileno': 1}
    for line in fin:
        if line.startswith('>NODE'):
            writefasta(D)
        else:
            D['content'].append(line)
    writefasta(D)  # flush the final record -- the node the original loop never wrote
This would be a better way: it writes only on odd iterations, so the '>NODE' lines are skipped and files are created only for the real content. (Note this assumes each node's sequence fits on a single line.)
with open('filename.txt') as fo:
    cntr = 1
    for i, content in enumerate(fo.read().split("\n")):
        if i % 2 == 1:
            with open(str(cntr) + '.txt', 'w') as opf:
                opf.write(content)
            cntr += 1
By the way, since you are using a context manager, you don't need to close the file:
Context managers allow you to allocate and release resources precisely
when you want to. It opens the file, writes some data to it and then
closes it.
Please check: https://book.pythontips.com/en/latest/context_managers.html
with open('FileName') as fo:
    cntr = 1
    for line in fo.readlines():
        with open(f'{cntr}.fasta', 'w') as opf:
            opf.write(line)
        cntr += 1
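As the quote above says, the with statement closes the file for you, even if an error occurs. A rough sketch of what it is shorthand for (simplified; the real mechanism is the __enter__/__exit__ protocol):
fo = open('FileName')   # same placeholder name as above
try:
    data = fo.read()    # work with the file
finally:
    fo.close()          # always runs, which is why no explicit close() is needed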
I am trying to extract IPv4 addresses from a text file and save them as a list to a new file. However, I cannot use regex to parse the file; instead, I have to check the characters individually. I'm not really sure where to start with that, since everything I find seems to have import re as its first line.
So far this is what I have:
# Opens and prints the wireShark txt file
fileObject = open("wireShark.txt", "r")
data = fileObject.read()
print(data)

# Save IP addresses to a new file
with open('wireShark.txt') as fin, open('IPAdressess.txt', 'wt') as fout:
    list(fout.write(line) for line in fin if line.rstrip())

# Opens and prints the IPAdressess txt file
fileObject = open("IPAdressess.txt", "r")
data = fileObject.read()
print(data)

# Close files
fin.close()
fout.close()
So I open the file and create the file that I will put the extracted IPs in; I just don't know how to pull them without using regex.
Thanks for the help.
Here is a possible solution.
The function find_first_digit positions the index at the next digit in the text, if any, and returns True; otherwise it returns False.
The functions get_num and get_dot read a number/dot, leave the index just after it, and return the number/dot as a str. If either function fails to read a number/dot, it raises a MissMatch exception.
The main loop finds a digit, saves the index, and then tries to read an IP.
On success, the IP is written to the output file.
If any of the called functions raises MissMatch, the current index is set to the saved index plus one, and the search starts over.
class MissMatch(Exception): pass

INPUT_FILE_NAME = 'text'
OUTPUT_FILE_NAME = 'ip_list'

def find_first_digit():
    while True:
        c = input_file.read(1)
        if not c:  # EOF found!
            return False
        elif c.isdigit():
            input_file.seek(input_file.tell() - 1)
            return True

def get_num():
    num = input_file.read(1)  # 1st digit
    if not num.isdigit():
        raise MissMatch
    if num != '0':
        for i in range(2):  # 2nd and 3rd digits
            c = input_file.read(1)
            if c.isdigit():
                num += c
            else:
                input_file.seek(input_file.tell() - 1)
                break
    return num

def get_dot():
    if input_file.read(1) == '.':
        return '.'
    else:
        raise MissMatch

with open(INPUT_FILE_NAME) as input_file, open(OUTPUT_FILE_NAME, 'w') as output_file:
    while True:
        ip = ''
        if not find_first_digit():
            break
        saved_position = input_file.tell()
        try:
            ip = get_num() + get_dot() \
                 + get_num() + get_dot() \
                 + get_num() + get_dot() \
                 + get_num()
        except MissMatch:
            input_file.seek(saved_position + 1)
        else:
            output_file.write(ip + '\n')
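As a quick way to exercise the sketch above, the snippet below writes a small made-up sample under the assumed input name 'text'; after running the main loop, 'ip_list' should contain the two addresses, one per line. (One caveat: the seek(tell() - 1) arithmetic is only reliable for plain ASCII input, since offsets in text mode are byte-based.)
# made-up sample input for the parser above
with open('text', 'w') as f:
    f.write("request from 10.0.0.1 to host 192.168.1.23 at 12:30\n")

# after running the main loop, 'ip_list' should read:
# 10.0.0.1
# 192.168.1.23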
Trouble with a really annoying homework assignment. I have a csv file with lots of comma-delimited fields per row. I need to take the last two fields from every row and write them into a new txt file. The problem is that some of the latter fields contain sentences; those with commas are in double quotes, those without are not. For example:
180,easy
240min,"Quite easy, but number 3, wtf?"
300,much easier than the last assignment
I did this and it worked just fine, except that the double quotes disappear. The assignment is to copy the fields to the txt file, use a semicolon as the delimiter, and remove possible line breaks; the text must remain exactly the same. We have an automatic checking system, so it's no use arguing whether this makes sense.
import csv

file = open('myfile.csv', 'r')
output = open('mytxt.txt', 'w')
csvr = csv.reader(file)
headline = next(csvr)
for line in csvr:
    lgt = len(line)
    time = line[lgt - 2].replace('\n', '')
    feedb = line[lgt - 1].replace('\n', '')
    if time != '' and feedb != '':
        output.write(time + ';' + feedb + '\n')
output.close()
file.close()
Is there an easy solution for this? Can I use the csv module at all? No one seems to have exactly this problem.
Thank you all beforehand.
Try this:
import csv

file = open('myfile.csv', 'r')
output = open('mytxt.txt', 'w')
csvr = csv.reader(file)
headline = next(csvr)
for line in csvr:
    lgt = len(line)
    time = line[lgt - 2].replace('\n', '')
    feedb = line[lgt - 1].replace('\n', '')
    if time != '' and feedb != '':
        if ',' in feedb:
            output.write(time + ';"' + feedb + '"\n')
        else:
            output.write(time + ';' + feedb + '\n')
output.close()
file.close()
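For what it's worth, the quotes disappear in the first place because csv.reader treats them as quoting syntax rather than data: it parses the quoted sentence into a single field and drops the quotes. A minimal demonstration:
import csv

# csv.reader accepts any iterable of lines, so a one-element list works
row = next(csv.reader(['240min,"Quite easy, but number 3, wtf?"']))
print(row)  # ['240min', 'Quite easy, but number 3, wtf?'] -- the quotes are gone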
Had to do it the ugly way; the file was too irregular. I talked with some colleagues on the same course, and apparently the idea was NOT to use the csv module here, but to practice basic file handling in Python.
file = open('myfile.csv', 'r')
output = open('mytxt.txt', 'w')
headline = file.readline()
feedb_lst = []
count = 0
for line in file:
    if line.startswith('1'):  # found out all lines should start with an ID number,
        data_lst = line.split(',', 16)  # that always starts with '1'
        lgt = len(data_lst)
        time = data_lst[lgt - 2]
        feedb = data_lst[lgt - 1].rstrip()
        feedback = [time, feedb]
        feedb_lst.append(feedback)
        count += 1
    else:
        feedb_lst[count - 1][1] = feedb_lst[count - 1][1] + line.rstrip()
i = 1
for item in feedb_lst:
    if item[0] != '' and item[1] != '':
        if i == len(feedb_lst):
            output.write(item[0] + ';' + item[1])
        else:
            output.write(item[0] + ';' + item[1] + '\n')
    i += 1
output.close()
file.close()
Thank you for your help!
You may think of this as yet another redundant question, but I have tried to go through all the similar questions asked, with no luck so far. In my specific use case, I can't use pandas or any other similar library for this operation.
This is what my input looks like:
AttributeName,Value
Name,John
Gender,M
PlaceofBirth,Texas
Name,Alexa
Gender,F
SurName,Garden
This is my expected output
Name,Gender,Surname,PlaceofBirth
John,M,,Texas
Alexa,F,Garden,
So far, I have tried to store my input in a dictionary and then write it out as a csv string. But it is failing, as I am not sure how to handle missing column values. Here is my code so far:
import csv

reader = csv.reader(csvstring.split('\n'), delimiter=',')
csvdata = {}
csvfile = ''
for row in reader:
    if row[0] != '' and row[0] in csvdata and row[1] != '':
        csvdata[row[0]].append(row[1])
    elif row[0] != '' and row[0] in csvdata and row[1] == '':
        csvdata[row[0]].append(' ')
    elif row[0] != '' and row[1] != '':
        csvdata[row[0]] = [row[1]]
    elif row[0] != '' and row[1] == '':
        csvdata[row[0]] = [' ']
for key, value in csvdata.items():
    if value == ' ':
        csvdata[key] = []
csvfile += ','.join(csvdata.keys()) + '\n'
for row in zip(*csvdata.values()):
    csvfile += ','.join(row) + '\n'
For the above code as well, I took some help here. Thanks in advance for any suggestions/advice.
Edit #1: Updated the code to clarify that I am processing a csv string rather than a csv file.
What you need is something like this:
import csv

with open("in.csv") as infile:
    buffer = []
    item = {}
    lines = csv.reader(infile)
    for line in lines:
        if line[0] == 'Name':
            buffer.append(item.copy())
            item = {'Name': line[1]}
        else:
            item[line[0]] = line[1]
    buffer.append(item.copy())  # don't forget the last record
for item in buffer[1:]:
    print(item)
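To go from that buffer of dicts to the requested CSV, a possible follow-up sketch using csv.DictWriter; the records list below is hard-coded to what the loop above produces (note the SurName spelling from the input), and restval fills attributes missing from a record with empty strings:
import csv, io

records = [{'Name': 'John', 'Gender': 'M', 'PlaceofBirth': 'Texas'},
           {'Name': 'Alexa', 'Gender': 'F', 'SurName': 'Garden'}]

out = io.StringIO()
writer = csv.DictWriter(out, fieldnames=['Name', 'Gender', 'SurName', 'PlaceofBirth'],
                        restval='')  # missing attributes become empty fields
writer.writeheader()
writer.writerows(records)
print(out.getvalue())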
If none of the attributes is mandatory, I think @framontb's solution needs to be rearranged in order to work also when the Name field is not given.
This is an import-free solution, and it's not super elegant.
I assume you already have the lines in this form, with these columns:
lines = [
    "Name,John",
    "Gender,M",
    "PlaceofBirth,Texas",
    "Gender,F",
    "Name,Alexa",
    "Surname,Garden"  # modified typo here: SurName -> Surname
]
cols = ["Name", "Gender", "Surname", "PlaceofBirth"]
We need to distinguish one record from another, and without mandatory fields the best I can do is start a new record whenever an attribute has already been seen.
To do this, I use a temporary list of attributes, tempcols, from which I remove elements until an error is raised, i.e., a new record.
Code:
csvdata = {k: [] for k in cols}
tempcols = list(cols)
for line in lines:
    attr, value = line.split(",")
    try:
        csvdata[attr].append(value)
        tempcols.remove(attr)
    except ValueError:
        for c in tempcols:  # now tempcols has only "missing" attributes
            csvdata[c].append("")
        tempcols = [c for c in cols if c != attr]
for c in tempcols:
    csvdata[c].append("")

# write csv string with the code you provided
csvfile = ""
csvfile += ",".join(csvdata.keys()) + "\n"
for row in zip(*csvdata.values()):
    csvfile += ",".join(row) + "\n"
>>> print(csvfile)
Name,PlaceofBirth,Surname,Gender
John,Texas,,M
Alexa,,Garden,F
If instead you want to sort the columns according to your desired output:
csvfile = ""
csvfile += ",".join(cols) + "\n"
for row in zip(*[csvdata[k] for k in cols]):
    csvfile += ",".join(row) + "\n"
>>> print(csvfile)
Name,Gender,Surname,PlaceofBirth
John,M,,Texas
Alexa,F,Garden,
This works for me:
with open("in.csv") as infile, open("out.csv", "w") as outfile:
incsv, outcsv = csv.reader(infile), csv.writer(outfile)
incsv.__next__() # Skip 1st row
outcsv.writerows(zip(*incsv))
Update: For input and output as strings:
import csv, io

with io.StringIO(indata) as infile, io.StringIO() as outfile:
    incsv, outcsv = csv.reader(infile), csv.writer(outfile)
    next(incsv)  # skip the header row
    outcsv.writerows(zip(*incsv))
    print(outfile.getvalue())
I'm using Python 2.7.6 and learning to use the csv module. When I parse my input file to CSV output, my script somehow adds unexpected double quotes and several spaces after the last element of each line. I could not remove those double quotes with a regular substitution.
The only way I could remove the extra double quote was to use:
tmp[1] = tmp[1][:-3]
I don't understand how the extra double quotes got added when I parsed my input. Please let me know why or how those double quotes were added to partno when the input file does not contain them.
My code:
import re
import csv

fname = "inputfile"
try:
    fhand = open(fname)
except:
    print 'File cannot be opened:', fname
    exit()

domain = []
server = []
model = []
serial = []
dn = []
memsize = []
vid = []
partno = []

csv_out = open('/tmp/out.csv', 'wb')
writer = csv.writer(csv_out)

for line in fhand:
    words = line.split("; ")
    tmp_row_list = []
    for number in [0, 1, 2, 3, 4, 5, 6, 7]:
        tmp = words[number].split("=")
        if "}" in tmp[1]:
            tmp[1] = tmp[1][:-3]
            #tmp[1] = re.sub('}','', tmp[1])
        if number == 0: domain.append(tmp[1])
        if number == 1: server.append(tmp[1])
        if number == 2: model.append(tmp[1])
        if number == 3: serial.append(tmp[1])
        if number == 4: dn.append(tmp[1])
        if number == 5: memsize.append(tmp[1])
        if number == 6: vid.append(tmp[1])
        if number == 7: partno.append(tmp[1])

rows = zip(domain, server, model, serial, dn, memsize, vid, partno)
writer.writerows(rows)
csv_out.close()
Input file:
ffile:#{Ucs=uname; ServerId=4/6; Model=UCSB-B200-M3; Serial=FCH; AssignedToDn=; TotalMemory=98304; Vid=V06; PartNumber=73-14689-04}
ffile:#{Ucs=uname; ServerId=4/7; Model=UCSB-B200-M3; Serial=FCH; AssignedToDn=; TotalMemory=98304; Vid=V06; PartNumber=73-14689-04}
My bad output with the strange double quotes, before I removed them (if I uncomment the re.sub line instead of the slicing, the bad output with double quotes and extra whitespace shows up in the last field/element):
uname,4/6,UCSB-B200-M3,FCH,,98304,V06,"73-14689-04
"
uname,4/7,UCSB-B200-M3,FCH,,98304,V06,"73-14689-04
"
Looks like you can simplify that a lot. (As for the mystery quotes: removing only the '}' with re.sub leaves the line's trailing newline inside the last field, and csv.writer quotes any field that contains a line break, which is exactly what your bad output shows.)
Given input.txt as:
ffile:#{Ucs=uname; ServerId=4/6; Model=UCSB-B200-M3; Serial=FCH; AssignedToDn=; TotalMemory=98304; Vid=V06; PartNumber=73-14689-04}
ffile:#{Ucs=uname; ServerId=4/7; Model=UCSB-B200-M3; Serial=FCH; AssignedToDn=; TotalMemory=98304; Vid=V06; PartNumber=73-14689-04}
Then using the following:
import re, csv

get_col_vals = re.compile(r'(?:\w+)=(.*?)[;}]').findall

with open('input.txt') as fin, open('output.csv', 'wb') as fout:
    csvout = csv.writer(fout, quoting=csv.QUOTE_NONE)
    csvout.writerows(get_col_vals(row) for row in fin)
The resulting output.csv is:
uname,4/6,UCSB-B200-M3,FCH,,98304,V06,73-14689-04
uname,4/7,UCSB-B200-M3,FCH,,98304,V06,73-14689-04
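If the pattern looks opaque: \w+= matches each key, (.*?) lazily captures the value, and [;}] stops at the field terminator, so neither the closing '}' nor the trailing newline ever lands in a captured value, which is what caused the stray quoting in the first place. A standalone check:
import re

get_col_vals = re.compile(r'(?:\w+)=(.*?)[;}]').findall
sample = 'ffile:#{Ucs=uname; ServerId=4/6; Model=UCSB-B200-M3; Serial=FCH; AssignedToDn=; TotalMemory=98304; Vid=V06; PartNumber=73-14689-04}'
print(get_col_vals(sample))
# ['uname', '4/6', 'UCSB-B200-M3', 'FCH', '', '98304', 'V06', '73-14689-04']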