I have a text file containing an 1825 x 51 table. I am trying to read the file and write the table to a new text file while removing certain columns. I couldn't figure out how to delete a whole column, because each row is read as a string, so I am attempting to go through each row and use an if/else statement to determine whether the element is in the desired range: if it is, write it to the output; otherwise, delete it.
if i in range(1,3)+[14]+range(20,27)+range(33,38)+[43]+[45]:
    newNum = data[i]
    data[i] = newNum
else:
    delete.data
This is my first time using Python, so any help would be greatly appreciated!
Code from the comments:
with open(inputfilepath, 'r') as f:
    outputfile = open(outputfilepath, 'w+')
    inSection = False
    for line in f:
        if begin in line: inSection = True
        if inSection:
            keep = range(1,3)+[14]+range(20,27)+range(33,38)+[43]+[45]
            tmp = []
            spl = line.split(' ')
            for idx in keep:
                tmp.extend(spl[idx])
            outputfile.write('%s\n' % ' '.join(tmp))
        if end in line: inSection = False
I would probably go with something like this:
from __future__ import print_function

def process(infile, keep):
    for line in infile:
        fields = line.split()
        yield ' '.join([_ for i, _ in enumerate(fields) if i in keep])

def main(infile, outfile):
    # The following line (taken from your example) will not work in Python 3, as
    # you cannot "add" ranges to lists. In Python 3 you would need to write:
    # >>> [14] + list(range(20, 27))
    keep = range(1, 3) + [14] + range(20, 27) + range(33, 38) + [43] + [45]
    for newline in process(infile, keep):
        print(newline, file=outfile)

if __name__ == '__main__':
    with open('so.txt') as infile, open('output.txt', 'w') as outfile:
        main(infile, outfile)
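A side note on the comment in main() above: in Python 3, where ranges cannot be concatenated with +, the keep list would need explicit list() conversions, for example:

# Python 3 version of the keep list (each range converted to a list first)
keep = list(range(1, 3)) + [14] + list(range(20, 27)) + list(range(33, 38)) + [43, 45]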
keep = [1,2,3,14,20,21,22,23,24,25,26,27,33,34,35,36,37,38,43,45]
with open('textfile') as fin, open('textout', 'w') as fout:  # the output file needs write mode
    for ln in fin:
        tmp = []
        spl = ln.split(' ')
        for idx in keep:
            tmp.append(spl[idx])
        fout.write('%s\n' % ' '.join(tmp))
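If the fields are consistently separated by single spaces, the csv module can also handle the splitting and joining; a small sketch under that assumption, reusing the same keep list:

import csv

keep = [1, 2, 3, 14, 20, 21, 22, 23, 24, 25, 26, 27, 33, 34, 35, 36, 37, 38, 43, 45]

with open('textfile') as fin, open('textout', 'w') as fout:
    reader = csv.reader(fin, delimiter=' ')
    writer = csv.writer(fout, delimiter=' ')
    for row in reader:
        # keep only the wanted column indices in each row
        writer.writerow([row[i] for i in keep])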
My code looks like this:
from __future__ import print_function
import linecache
from random import choice
from string import digits, ascii_lowercase
import random, string
import fileinput

L = ''.join([random.choice(string.ascii_letters + string.digits) for n in xrange(10)])
print(L)

with open("input.txt") as f:
    for line in f:
        ok = (line)
        ok = line.strip()
        if line > 6:
            f = open('output.txt', 'a')
            f.write(str(ok) + " " + str(L) + '\n')
            f.close()

total_count = 0
for line in fileinput.input('output.txt', inplace=True):
    count = line.count(ok)
    if count > 0:
        print(line, end='')
        total_count += count
print(total_count)
My input file is:
55
99
42
65
49
49
My problem is that it's supposed to be saving all of the numbers, but instead it's only saving the last two. Any help would be appreciated.
It seems the fileinput module may be causing issues with correctly writing the file output, but I am not sure how that module works, so I cannot suggest how to use it.
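For reference, fileinput's inplace mode redirects standard output into the file being iterated over, so only the lines you print are kept; a minimal sketch of that behaviour (not a fix for the script above):

import fileinput

# Rewrite output.txt in place: printed lines are written back, skipped lines are dropped.
for line in fileinput.input('output.txt', inplace=True):
    line = line.rstrip('\n')
    if line:              # keep non-empty lines, for example
        print(line)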
The main correction is to use different file handles for reading the input file and writing the output file (don't reuse f, as mentioned in the comments).
Here's some refactoring of your original code:
from __future__ import print_function
import linecache
from random import choice
from string import digits, ascii_lowercase
import random, string
from collections import defaultdict

L = ''.join([random.choice(string.ascii_letters + string.digits) for n in xrange(10)])
print(L)

total_count = defaultdict(lambda: 0)
with open("input.txt", 'r') as f:
    for line in f:
        ok = line.strip()
        if int(line) > 6:
            total_count[int(line)] += 1
            with open('output.txt', 'a') as ff:
                ff.write(str(ok) + " " + str(L) + '\n')

infile_contents = []
outfile_contents = []

# get list of input.txt contents
with open('input.txt', 'r') as infile:
    for line in infile.readlines():
        infile_contents.append(line.strip())

# get list of output.txt first column contents
with open('output.txt', 'r') as outfile:
    for line in outfile.readlines():
        outfile_contents.append(line.split()[0])

# dictionary to hold count of occurrences
totals = {}

# count occurrences
for num in infile_contents:
    totals[num] = outfile_contents.count(num)

# print counts
for item in totals.items():
    print('{num} appears {count} times.'.format(num=item[0], count=item[1]))
Example output after running script twice:
vlVHVi3t9L
55 appears 2 times.
99 appears 2 times.
42 appears 2 times.
65 appears 2 times.
49 appears 4 times.
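One more note on the refactored script: the total_count defaultdict built in the first loop is filled but never reported. If that per-value count is wanted as well, a couple of lines at the end would print it (a sketch):

# Report how many times each value greater than 6 was seen in input.txt on this run.
for value, count in total_count.items():
    print('{0} was seen {1} times in input.txt'.format(value, count))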
I recently started scripting and am having difficulties with nested loops: I'm not getting the iterator object from the first loop passed into the second loop properly.
The problem itself is quite simple. I would like to change the second item ('20') on row 1 of my data to a number from the range and create a file for each value.
So if the first number from the range is 14, then that row becomes (L,14,0,0,0,0) and the file gets the name data1.txt.
Data:
L,1,5.827,20,-4.705,0
L,20,0,0,0,0
L,12,15,0,-6,0
Original Script:
import re
from itertools import islice
import numpy as np

x = np.arange(14,30.5,0.5)
size = x.size

with open('data.txt', 'r') as line:
    for line in islice(line, 1, 2):
        re.sub(r'\s', '', line).split(',')

nline = line[:2] + line[3:]

x = iter(x)
y = next(x)

for i in x:
    nline = nline[:2] + str(y) + nline[3:]
    with open('data.txt', 'r') as file:
        data = file.readlines()
    data[1] = nline
    for i in range(1,size):
        with open('data%i.txt' %i, 'w') as file:
            file.writelines(data)
EDITED:
I've made some progress with my script and I'm almost there.
After the first loop I have the output that I need (33 cases). All I would like to do now is write them to 33 unique files, named data1 to data33. What seems to happen, however, is that the second loop iterates through the first loop another 33 times and creates 1089 cases. So what ends up in the files is only the last line of the first loop.
Any suggestions on how to allow the second loop for file creation but disable it for the data?
Updated Script:
import re
from itertools import islice
import numpy as np

x = np.arange(14,30.5,0.5)
size = x.size

with open('data.txt', 'r') as line:
    for line in islice(line, 1, 2):
        re.sub(r'\s', '', line).split(',')

for i in x:
    y = str(i)
    nline = line[:2] + y + line[4:]
    with open('data.txt', 'r') as file:
        data = file.readlines()
    data[1] = nline
    for i in range(1,size+1):
        with open('data%i.txt' %i, 'w') as file:
            file.writelines(data)
            print data
It seems you are trying to concatenate a string to a list: nline = nline[:2] + str(y) + nline[3:]. This will result in a TypeError.
Also, nline[:2] gets the first 2 parts of the list, so you probably want to split it as nline[:1] and nline[2:].
Try something along the lines of:
import re
from itertools import islice
import numpy as np

x = np.arange(14,30.5,0.5)
size = x.size

with open('data.txt', 'r') as line:
    for line in islice(line, 1, 2):
        re.sub(r'\s', '', line).split(',')

nline = line[:1] + line[2:]  # not sure what this does, but this might be wrong, change accordingly

x = iter(x)
y = next(x)
temp = []
temp.append(y)

for i in x:
    nline = nline[:1] + temp + nline[2:]
    with open('data.txt', 'r') as file:
        data = file.readlines()
    data[1] = nline
    for i in range(1,size):
        with open('data%i.txt' %i, 'w') as file:
            file.writelines(data)
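Alternatively, a compact sketch that writes one file per value of x, assuming the comma-separated layout from the question (the '20' is field index 1 of row index 1):

import numpy as np

values = np.arange(14, 30.5, 0.5)

with open('data.txt', 'r') as f:
    data = f.readlines()

for n, val in enumerate(values, start=1):
    fields = data[1].rstrip('\n').split(',')
    fields[1] = str(val)                       # replace the '20' with the current value
    new_data = data[:1] + [','.join(fields) + '\n'] + data[2:]
    with open('data%i.txt' % n, 'w') as out:   # data1.txt ... data33.txt
        out.writelines(new_data)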
I have to read a text file that contains comma-separated and line-separated data in the following format:
A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,003169391,001097585,05-21-11,04:00:00,REGULAR,003169415,001097588,05-21-11,08:00:00,REGULAR,003169431,001097607
Multiple sets of such data are present in the text file.
I need to print all of this on new lines with the following condition:
the first 3 elements of every set, followed by groups of 5 fields, each on a new line. So the solution for the above set would be:
A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,003169391,001097585
A002,R051,02-00-00,05-21-11,04:00:00,REGULAR,003169415,001097588
A002,R051,02-00-00,05-21-11,08:00:00,REGULAR,003169431,001097607
My function to achieve it is given below:
def fix_turnstile_data(filenames):
    for name in filenames:
        f_in = open(name, 'r')
        reader_in = csv.reader(f_in, delimiter=',')
        f_out = open('updated_' + name, 'w')
        writer_out = csv.writer(f_out, delimiter=',')
        array = []
        for line in reader_in:
            i = 0
            j = -1
            while i < len(line):
                if i % 8 == 0:
                    i += 2
                    j += 1
                    del array[:]
                    array.append(line[0])
                    array.append(line[1])
                    array.append(line[2])
                elif (i+1) % 8 == 0:
                    array.append(line[i-3*j])
                    writer_out.writerow(array)
                else:
                    array.append(line[i-3*j])
                i += 1
        f_in.close()
        f_out.close()
The output is wrong, and there is a gap of 3 blank lines at the end of those lines whose length is 8. I suspect it might be the writer_out.writerow(array) call that is to blame.
Can anyone please help me out?
Hmm, the logic you use ends up being fairly confusing. I'd do it more along these lines (this replaces your for loop), and this is more Pythonic:
for line in reader_in:
    header = line[:3]
    for i in xrange(3, len(line), 5):
        writer_out.writerow(header + line[i:i+5])
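Wrapped back into the original function, that could look roughly like this (a sketch; opening the output in 'wb' mode is one way to avoid the extra blank rows csv.writer can produce on Windows under Python 2):

import csv

def fix_turnstile_data(filenames):
    for name in filenames:
        # 'wb' avoids the blank rows csv.writer inserts on Windows in Python 2;
        # in Python 3 use open('updated_' + name, 'w', newline='') instead.
        with open(name, 'r') as f_in, open('updated_' + name, 'wb') as f_out:
            reader_in = csv.reader(f_in)
            writer_out = csv.writer(f_out)
            for line in reader_in:
                header = line[:3]
                for i in xrange(3, len(line), 5):
                    writer_out.writerow(header + line[i:i+5])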
I have data that is set up as follows:
//Name_1 * *
>a xyzxyzyxyzyxzzxy
>b xyxyxyzxyyxzyxyz
>c xyzyxzyxyzyxyzxy
//Name_2
>a xyzxyzyxyzxzyxyx
>b zxyzxyzxyyzxyxzx
>c zxyzxyzxyxyzyzxy
//Name_3 * *
>a xyzxyzyxyzxzyxyz
>b zxyzxyzxzyyzyxyx
>c zxyzxyzxyxyzyzxy
...
The //-line refers to an ID for the following group of sequences until the next //-line is reached.
I have been working on writing a program that reads the positions of the asterisks and prints the characters at those positions for the sequences.
To simplify things for myself, I have been working on a subset of my data containing only one group of sequences, e.g.:
//Name_1 * *
>a xyzxyzyxyzyxzzxy
>b xyxyxyzxyyxzyxyz
>c xyzyxzyxyzyxyzxy
My program does what I want on this subset.
import sys
import csv

datafile = open(sys.argv[1], 'r')
outfile = open(sys.argv[1]+"_FGT_Data", 'w')
csv_out = csv.writer(outfile, delimiter=',')
csv_out.writerow(['Locus', 'Individual', 'Nucleotide', 'Position'])

with (datafile) as searchfile:
    var_line = [line for line in searchfile if '*' in line]
    LocusID = [line[2:13].strip() for line in var_line]
    poslist = [i for line in var_line for i, x in enumerate(line) if x == '*']

datafile = open(sys.argv[1], 'r')

with (datafile) as getsnps:
    lines = [line for line in getsnps.readlines() if line.startswith('>')]
    for pos in poslist:
        for line in lines:
            snp = line[pos]
            individual = line[0:7]
            indistr = individual.strip()
            csv_out.writerow((LocusID[0], indistr, line[pos], str(pos)))

datafile.close()
outfile.close()
However, now I am trying to modify it to work on the full dataset. I am having trouble finding a way to iterate over the data in the correct way.
I need to search through the file, and when a line containing '*' is reached, I need to do as in the above code for the sequences corresponding to that line, and then continue to the next line containing a '*'. Do I need to split up my data with regard to the //-lines, or what is the best approach?
I have uploaded a sample of my data to dropbox:
Data_Sample.txt contains several groups, and is the kind of data, I am trying to get the program to work on.
Data_One_Group.txt contains only one group, and is the data I have gotten the program to work on so far.
https://www.dropbox.com/sh/3j4i04s2rg6b63h/AADkWG3OcsutTiSsyTl8L2Vda?dl=0
--------EDIT---------
I am trying to implement the suggestion by @Julien Spronck below.
However, I am having trouble processing the produced block. How would I be able to search through the block line by line? E.g., why does the code below not work as intended? It just prints the asterisks and not the line itself.
block = ''
with open('onelocus.txt', 'r') as searchfile:
    for line in searchfile:
        if line.startswith('//'):
            #print line
            if block:
                for line in block:
                    if '*' in line:
                        print line
            block = line
        else:
            block += line
---------EDIT 2----------
I am getting closer. I understand now that I need to split the string into lines to be able to search through them. The code below works on one group, but when I try to iterate over several, it prints the information for the first group only, though it does so as many times as there are groups. I have tried clearing LocusID and poslist before the next iteration, but this does not seem to be the solution.
block = ''
with (datafile) as searchfile:
    for line in searchfile:
        if line.startswith('//'):
            if block:
                var_line = [line for line in block.splitlines() if '*' in line]
                LocusID = [line[2:13].strip() for line in var_line]
                print LocusID
                poslist = [i for line in var_line for i, x in enumerate(line) if x == '*']
                print poslist
            block = line
        else:
            block += line
Can't you do something like:
block = ''
with open(filename, 'r') as fil:
    for line in fil:
        if line.startswith('//'):
            if block:
                do_something_with(block)
            block = line
        else:
            block += line

if block:
    do_something_with(block)
In this code, I just append the lines of the file to a variable block. Once I find a line that starts with //, I process the previous block and reinitialize the block for the next iteration.
The last two lines will take care of processing the last block, which would not be processed otherwise.
do_something_with(block) could be something like this:
def do_something_with(block):
    lines = block.splitlines()
    j = 0
    first_line = lines[j]
    while first_line.strip() == '':
        j += 1
        first_line = lines[j]
    pos = []
    position = first_line.find('*')
    while position != -1:
        pos.append(position)
        position = first_line.find('*', position+1)
    for k, line in enumerate(lines):
        if k > j:
            for p in pos:
                print line[p],
            print

## prints
## z y
## x z
## z y
I have created a way to make this work with the data you provided.
You should run it with 2 file locations: argument 1 should be your input.txt and argument 2 should be your output.csv.
Explanation
First, we create a dictionary with the locus as key and the sequences as values.
We iterate over this dictionary and collect the '*' locations in the locus line into a list, indexes.
We iterate over the values belonging to this key and extract the sequence.
Per iteration we iterate over indexes so that we gather the SNPs.
Per iteration we append a row to our csv file.
We empty the indexes list so we can go on to the next key.
Keep in mind
This method is highly dependent on the number of spaces inside your input.txt.
You should know that this will not be the fastest way to get it done, but it does get it done.
I hope this helped. If you have any questions, feel free to ask them, and if I have time, I will happily try to answer them.
Script
import sys
import csv

sequences = []
dic = {}
indexes = []

datafile = sys.argv[1]
outfile = sys.argv[2]

with open(datafile, 'r') as snp_file:
    lines = snp_file.readlines()
    for i in range(0, len(lines)):
        if lines[i].startswith("//"):
            # start a fresh list for each locus, so the groups do not all share one list
            sequences = []
            dic[lines[i].rstrip()] = sequences
        if lines[i].startswith(">"):
            sequences.append(lines[i].rstrip())

for key in dic:
    locus = key.split(" ")[0].replace("//", "")
    for i, x in enumerate(key):
        if x == '*':
            indexes.append(i-11)
    for sequence in dic[key]:
        seq = sequence.split(" ")[1]
        seq_id = sequence.split(" ")[0].replace(">", "")
        for z in indexes:
            position = z+1
            nucleotide = seq[z]
            with open(outfile, 'a') as handle:
                csv_out = csv.writer(handle, delimiter=',')
                csv_out.writerow([locus, seq_id, position, nucleotide])
    del indexes[:]
input.txt
//Locus_1 * *
>Safr01 AATCCGTTTTAAACCAGNTCYAT
>Safr02 TTAATCCGTTTTAAACCAGNTCY
//Locus_2 * *
>Safr01 AATCCGTTTTAAACCAGNTCYAT
>Safr02 TTAATCCGTTTTAAACCAGNTCY
output.csv
Locus_1,Safr01,1,A
Locus_1,Safr01,22,A
Locus_1,Safr02,1,T
Locus_1,Safr02,22,C
Locus_2,Safr01,5,C
Locus_2,Safr01,19,T
Locus_2,Safr02,5,T
Locus_2,Safr02,19,G
This is how I ended up solving the problem:
def do_something_with(block):
    lines = block.splitlines()
    for line in lines:
        if '*' in line:
            hit = line
            LocusID = hit[2:13].strip()
            for i, x in enumerate(hit):
                if x == '*':
                    poslist.append(i)
    for pos in poslist:
        for line in lines:
            if line.startswith('>'):
                individual = line[0:7].strip()
                snp = line[pos]
                print LocusID, individual, snp, pos,
                csv_out.writerow((LocusID, individual, snp, pos))

# datafile and csv_out are opened as in the earlier snippets;
# block and poslist start out empty
block = ''
poslist = list()

with (datafile) as searchfile:
    for line in searchfile:
        if line.startswith('//'):
            if block:
                do_something_with(block)
            poslist = list()
            block = line
        else:
            block += line

if block:
    do_something_with(block)
I have data that looks like this:
Name Nm1 * *
Ind1 AACTCAGCTCACG
Ind2 GTCATCGCTACGA
Ind3 CTTCAAACTGACT
I need to grab the letter at each position marked by an asterisk in the "Name" line and print it, along with the index of the asterisk.
So the result would be:
Ind1, 12, T
Ind2, 12, A
Ind3, 12, C
Ind1, 17, T
Ind2, 17, T
Ind3, 17, T
I'm trying to use enumerate() to retrieve the positions of the asterisks, and my thought was that I could then use these indexes to grab the letters.
import sys
import csv

input = open(sys.argv[1], 'r')
Output = open(sys.argv[1]+"_processed", 'w')

indlist = (["Individual_1,", "Individual_2,", "Individual_3,"])

with (input) as searchfile:
    for line in searchfile:
        if '*' in line:
            LocusID = line[2:13]
            LocusIDstr = LocusID.strip()
            hit = line
            for i, x in enumerate(hit):
                if x == '*':
                    position = i
                    print position

for item in indlist:
    Output.write("%s%s%s\n" % (item, LocusIDstr, position))

Output.close()
If enumerate() outputs, e.g.:
12
17
How do I access each index separately?
Also, when I print the position, I get the numbers I want. When I write to the file, however, only the last position is written. Why is this?
----------------EDIT-----------------
Following the advice below, I have split up my code to make it a bit simpler (for me) to understand.
import sys
import csv

input = open(sys.argv[1], 'r')
Output = open(sys.argv[1]+"_FGT_Data", 'w')

indlist = (["Individual_1,", "Individual_2,", "Individual_3,"])

with (input) as searchfile:
    for line in searchfile:
        if '*' in line:
            LocusID = line[2:13]
            LocusIDstr = LocusID.strip()
            print LocusIDstr
            hit = line
            for i, x in enumerate(hit):
                if x == '*':
                    position = i
                    #print position

input = open(sys.argv[1], 'r')
with (input) as searchfile:
    for line in searchfile:
        if line[0] == ">":
            print line[position], position

with (Output) as writefile:
    for item in indlist:
        writefile.write("%s%s%s\n" % (item, LocusIDstr, position))

Output.close()
I still do not have a solution for how to access each of the indexes, though.
Edit
Changed to work with the file you gave me in your comment. If you have made this file yourself, consider working with columns next time.
import sys

read_file = sys.argv[1]
write_file = "%s_processed.%s" % (sys.argv[1].split('.')[0], sys.argv[1].split('.')[1])

indexes = []
lines_to_write = []

with open(read_file, 'r') as getindex:
    first_line = getindex.readline()
    for i, x in enumerate(first_line):
        if x == '*':
            indexes.append(i-11)

with open(read_file, 'r') as getsnps:
    for line in getsnps:
        if line.startswith(">"):
            sequence = line.split(" ")[1]
            for z in indexes:
                string_to_append = "%s\t%s\t%s" % (line.split(" ")[0], z+1, sequence[z])
                lines_to_write.append(string_to_append)

with open(write_file, "w") as write_file:
    write_file.write("\n".join(lines_to_write))
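As for accessing each asterisk index separately (the remaining question above): collecting the positions in a list, as indexes does here, and looping over that list in the write step keeps every position instead of only the last one. A minimal sketch using the variable names from the question (hit, indlist, LocusIDstr and Output are assumed to be set up as before):

# Every asterisk position in the header line, not just the last one
positions = [i for i, x in enumerate(hit) if x == '*']

for item in indlist:
    for position in positions:
        Output.write("%s%s%s\n" % (item, LocusIDstr, position))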