Reconciling an array slicer - python

I've built a function to cut the extraneous garbage out of text entries. It uses an array slicer. I now need to reconcile the lines that have been removed by my cleanup function, so that lines_lost + lines_kept == total_lines. Source code below:
def header_cleanup(entry_chunk):
    # Removes duplicate headers due to page-continuations
    entry_chunk = entry_chunk.replace("\r\n\r\n", "\r\n")
    lines = entry_chunk.split("\r\n")  # split the chunk into lines
    headers = lines[1:5]
    lines[:] = [x for x in lines if not any(header == x for header in headers)]
    lines = headers + lines
    return "\n".join(lines)
How could I count the lines that do not show up in lines after the slice/mutation, i.e.:
original_length = len(lines)
lines = lines.remove_garbage
garbage = lines.garbage_only_plz
if len(lines) + len(garbage) == original_length:
    print("Good!")
else:
    print("Bad! ;(")
Final answer ended up looking like this:
import sys

def header_cleanup(entry_chunk):
    lines = entry_chunk.replace("\r\n\r\n", "\r\n").split("\r\n")
    line_length = len(lines)
    headers = lines[1:5]
    saved_lines = [x for x in lines if not any(header == x for header in headers)]
    bad_lines = [x for x in lines if any(header == x for header in headers)]
    total_lines = len(saved_lines) + len(bad_lines)
    if total_lines == line_length:
        print("Yay!")
    else:
        print("Boo.")
        print(f"{rando_trace_info}")  # placeholder trace info from the original post
        sys.exit()
    final_lines = headers + saved_lines
    return "\n".join(final_lines)
Ok - I know what you're thinking: that's redundant, but it's required. Open to edits for anything more pythonic once the solution works. Thanks for the consideration.

Don't reuse the lines variable; use a different variable, so you can get the garbage out of the original lines.
clean_lines = remove_garbage(lines)
garbage = garbage_only(lines)
if len(clean_lines) + len(garbage) == len(lines):
    print("Good!")
else:
    print("Bad!")
You might want to have a single function that returns both:
clean_lines, garbage = filter_garbage(lines)
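A one-pass filter_garbage might look like this (a sketch; filter_garbage is the hypothetical name from above, and the header test mirrors the question's code):
def filter_garbage(lines):
    # Split lines into (clean_lines, garbage) in a single pass, so the
    # two lists always add up to the original length.
    headers = lines[1:5]  # same header slice as in the question
    clean_lines, garbage = [], []
    for line in lines:
        (garbage if line in headers else clean_lines).append(line)
    return clean_lines, garbage

clean_lines, garbage = filter_garbage(lines)
assert len(clean_lines) + len(garbage) == len(lines)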

Related

Python: Rosalind Consensus and Profile

I am trying to solve the "Consensus and Profile" challenge on Rosalind.
The challenge instructions are as follows:
Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)
My code is as follows (I got most of it from another user on this website). My only issue is that some of the DNA strands are broken down into multiple separate lines, so they are being appended to the "allstrings" list as separate strings. I am trying to figure out how to write each consecutive line that does not contain ">" as a single string.
import numpy as np

seq = []
allstrings = []
temp_seq = []
matrix = []
C = []
G = []
T = []
A = []
P = []
consensus = []
position = 1
file = open("C:/Users/knigh/Documents/rosalind_cons (3).txt", "r")
conout = open("C:/Users/knigh/Documents/consensus.txt", "w")
# Right now, this is reading and writing each as an individual line. Thus, it
# is splitting each sequence into multiple small sequences. You need to figure
# out how to read this in FASTA format to prevent this from occurring
desc = file.readlines()
for line in desc:
    allstrings.append(line)
for string in range(1, len(allstrings)):
    if ">" not in allstrings[string]:
        temp_seq.append(allstrings[string])
    else:
        seq.insert(position, temp_seq[0])
        temp_seq = []
        position += 1
# This last insertion into the sequence must be performed after the loop to empty
# out the last remaining string from temp_seq
seq.insert(position, temp_seq[0])
for base in seq:
    matrix.append([pos for pos in base])
M = np.array(matrix).reshape(len(seq), len(seq[0]))
for base in range(len(seq[0])):
    A_count = 0
    C_count = 0
    G_count = 0
    T_count = 0
    for pos in M[:, base]:
        if pos == "A":
            A_count += 1
        elif pos == "C":
            C_count += 1
        elif pos == "G":
            G_count += 1
        elif pos == "T":
            T_count += 1
    A.append(A_count)
    C.append(C_count)
    G.append(G_count)
    T.append(T_count)
profile_matrix = {"A": A, "C": C, "G": G, "T": T}
P.append(A)
P.append(C)
P.append(G)
P.append(T)
profile = np.array(P).reshape(4, len(A))
for pos in range(len(A)):
    if max(profile[:, pos]) == profile[0, pos]:
        consensus.append("A")
    elif max(profile[:, pos]) == profile[1, pos]:
        consensus.append("C")
    elif max(profile[:, pos]) == profile[2, pos]:
        consensus.append("G")
    elif max(profile[:, pos]) == profile[3, pos]:
        consensus.append("T")
conout.write("".join(consensus) + "\n")
for k, v in profile_matrix.items():
    conout.write(k + ": " + " ".join(str(x) for x in v) + "\n")
conout.close()
There are a couple of ways that you can iterate a FASTA file as records. You can use a prebuilt library or write your own.
A widely used library for working with sequence data is biopython. This code snippet will collect the sequences into a list.
from Bio import SeqIO

file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
    for record in SeqIO.parse(file_handle, "fasta"):
        sequences.append(record.seq)
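Note that record.seq is a Seq object rather than a plain string; if the rest of your code expects strings, append str(record.seq) instead.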
Alternatively, you can write your own FASTA parser. Something like this should work:
def read_fasta(fh):
    # Iterate to get the first FASTA header
    for line in fh:
        if line.startswith(">"):
            name = line[1:].strip()
            break
    # This list will hold the sequence lines
    fa_lines = []
    # Now iterate over the rest of the multiline FASTA
    for line in fh:
        if line.startswith(">"):
            # Reaching the next header means the previous record is
            # complete: yield its name and sequence as a tuple that
            # we can unpack
            yield name, "".join(fa_lines)
            # Reset the sequence lines and save the name of the
            # next record
            fa_lines = []
            name = line[1:].strip()
            # skip to next line
            continue
        fa_lines.append(line.strip())
    # Yield the final record after the loop
    yield name, "".join(fa_lines)
You can use this function like so:
file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
for name, seq in read_fasta(file_handle):
sequences.append(seq)
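Once the sequences are in a list, the consensus and profile can also be computed without numpy. Here is a minimal sketch using collections.Counter (the function name is my own, and it assumes equal-length strings, which the problem guarantees):
from collections import Counter

def consensus_and_profile(sequences):
    # sequences: equal-length DNA strings
    length = len(sequences[0])
    profile = {base: [0] * length for base in "ACGT"}
    consensus = []
    for pos in range(length):
        counts = Counter(seq[pos] for seq in sequences)
        for base in "ACGT":
            profile[base][pos] = counts[base]
        # any most-frequent base is an acceptable consensus symbol
        consensus.append(max("ACGT", key=lambda b: counts[b]))
    return "".join(consensus), profile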

How to skip a special character in a txt file in python

I have some issues while reading txt files. What I have to do is read files (about 360 of them) and make a plot. Everything works except when there is a special character in my file, such as: "". When my reading function finds that character, it crashes. Is there any way to skip it? My code:
import os
import matplotlib.pyplot as plt
import numpy as np

i = 10
j = 0
X = []
Y = []
Z = []
k = 0
A = np.zeros([360, 719])
for i in range(10, 360, 10):
    X = []
    Y = []
    if len(str(i)) == 2:
        data = open(dir + '\\150317_ScPONd_0%s_radio.txt' % i, 'r')
    else:
        data = open(dir + '\\150317_ScPONd_%s_radio.txt' % i, 'r')
    z = data.readlines()
    data.close()
    for line in z:
        if not line.startswith('$'):
            data_2 = line.split('\t')
            X.append(data_2[0])
            Y.append(data_2[1])
    A[j, :] = X
    A[(j + 1), :] = Y
And here is how my file looks:
Is there any way to skip those "$" lines? Sorry for the picture, I have no idea how to attach it better.
Thanks to @user1753919 I have found an answer. If someone is still interested in this, here is the working code:
for i in range(10, 360, 10):
    X = []
    Y = []
    if len(str(i)) == 2:
        data = np.genfromtxt(dir + '\\150317_ScPONd_0%s_radio.txt' % i, skip_header=12)
    else:
        data = np.genfromtxt(dir + '\\150317_ScPONd_%s_radio.txt' % i, skip_header=12)
    for line in data:
        X.append(line[0])
        Y.append(line[1])
    A[j, :] = X
    A[(j + 1), :] = Y
    plt.plot(A[j, :], A[(j + 1), :], label='{} K'.format(i))
    plt.hold
    j = j + 2
genfromtxt is overkill.
np.loadtxt(file, comments='$')
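Applied to the loop from the question it could look like this (a sketch; folder stands in for the question's dir variable, and %03d zero-pads the index so the two-branch if is unnecessary):
import numpy as np

folder = r'C:\data'  # placeholder for wherever the files live
for i in range(10, 360, 10):
    # comments='$' tells loadtxt to ignore the '$' header lines
    data = np.loadtxt(folder + '\\150317_ScPONd_%03d_radio.txt' % i, comments='$')
    X, Y = data[:, 0], data[:, 1]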

Python: Search in file, replace preceding entry

I am trying to alter an existing ASCII data file in a specific way.
The way I would like to go is to find one of several strings from an array, which I define beforehand.
If one of these strings is found in the file, I would like to change the preceding entry; the string to put in there depends on which of the strings was found.
I have a file where the entries are separated by spaces, with trailing spaces at the end to fill up 30 columns. The respective strings would not be in the first line, and there would never be more than one per line. An example could look like this:
test01out.txt:
a0997 b0998 c0999
a1000 b1001 c1002
a1003 b1004 c1005
a1006 a1000 c1007
a1008 b1009 c1010
b1001 b1011 c1012
a1013 b1014 b1001
a1015 b1016 c1017
The file does not necessarily have three columns in every row. It is possible that a row has only two, but it can also have four or five columns.
My current attempt was the following:
from numpy import *

findlines = open("test01.txt").read().split("\n")
searcharray = array(["a1000", "b1001"])
alterarray = array(["this1", "this2"])
tempstring_current = ""
fileout = open("test01out.txt", "w")
for i, line in enumerate(findlines):
    tempstring_last = tempstring_current
    tempstring_current = line.rstrip().split(" ")
    if any(x in tempstring_current for x in searcharray): # check if one of the elements is in the current line -> unfortunately this seems to be true for any line checked...
        print(i)
        print(tempstring_current)
        for j, element in enumerate(tempstring_current):
            if any(searcharray == tempstring_current):
                currentsearchindex = argmax(searcharray == tempstring_current)
                currentalterstring = alterarray[currentsearchindex]
                if currentsearchindex == 0:
                    tempstring_last.split(" ")[-1] = currentalterstring
                else:
                    tempstring_current.split(" ")[currentsearchindex - 1] = currentalterstring
                    tempstring_current.split(" ")[currentsearchindex - 1] = "XPRZeugs_towrite" + repr(currentdesignatedspeed)
    tempstring_last = tempstring_last.ljust(30)
    try:
        fileout.write(str(tempstring_last))
        fileout.write("\r")
    finally:
        None
try:
    fileout.close()
finally:
    None
searcharray and alterarray would have more elements than just two.
I have tested the script up to the any condition; unfortunately the any condition seems to be met for every line, for some reason I do not quite understand:
from numpy import *

findlines = open("test01.txt").read().split("\n")
searcharray = array(["a1000", "b1001"])
alterarray = array(["this1", "this2"])
tempstring_current = ""
fileout = open("test01out.txt", "w")
for i, line in enumerate(findlines):
    tempstring_last = tempstring_current
    tempstring_current = line.rstrip().split(" ")
    if any(x in tempstring_current for x in searcharray): # check if one of the elements is in the current line -> unfortunately this seems to be true for any line checked...
        print(i)
        print(tempstring_current)
I get the lines printed for every line in the file, which I did not expect.
Edit/Solution:
I realized I made a mistake in the input testfile:
It should look like this:
a0997 b0998 c0999
a1000 b1001 c1001
a1003 b1004 c1005
a1006 a1000 c1007
a1008 b1009 c1010
c1002 b1011 c1012
a1013 b1014 c1002
a1015 b1016 c1017
The full code doing the job is the following:
from numpy import *

findlines = open("test01.txt").read().split("\n")
searcharray = array(["a1000", "c1002"])
alterarray = array(["this1", "this2"])
tempstring_current = ""
fileout = open("test01out.txt", "w")
for i, line in enumerate(findlines):
    tempstring_last = tempstring_current
    tempstring_current = line.rstrip().split(" ")
    if any([x in tempstring_current for x in searcharray]): # check if one of the elements is in the current line
        # print(i)
        # print(tempstring_current)
        # print(searcharray)
        # print([x in tempstring_current for x in searcharray])
        # print(argmax([x in tempstring_current for x in searcharray]))
        currentsearchposindex = argmax([x in tempstring_current for x in searcharray]) # which index does the matching element have in searcharray?
        currentalterstring = alterarray[currentsearchposindex] # the corresponding entry in alterarray
        for j, currentXPRelement in enumerate(tempstring_current):
            if currentXPRelement == searcharray[currentsearchposindex]:
                currentsearchindex_intemparray = j
        # print(len(tempstring_current))
        # print(searcharray[currentsearchposindex])
        # print(tempstring_current == searcharray[currentsearchposindex])
        # print(searcharray[currentsearchposindex] == tempstring_current)
        # print(argmax(tempstring_current == searcharray[currentsearchposindex]))
        # currentsearchindex_intemparray = argmax(tempstring_current == searcharray[currentsearchposindex])
        if currentsearchindex_intemparray == 0:
            tempstring_last[-1] = currentalterstring
        else:
            tempstring_current[currentsearchindex_intemparray - 1] = currentalterstring
            # tempstring_current[currentsearchindex_intemparray - 1] = "XPRZeugs_towrite" + repr(currentalterstring)
    tempstring_last = str(" ".join(tempstring_last)).ljust(30)
    if not i == 0:
        try:
            fileout.write(str(tempstring_last))
            fileout.write("\r")
        finally:
            None
try:
    fileout.write(" ".join(tempstring_current))
    fileout.write("\r")
    fileout.close()
finally:
    None
To fix your code so that the condition can actually fail to match, change
if any(x in tempstring_current for x in searcharray):
to
if any([x in tempstring_current for x in searcharray]):
I think the reason is that "from numpy import *" has replaced the built-in any() with numpy.any(). The built-in any() consumes a generator expression element by element, but numpy.any() converts its argument to an array first; a generator object becomes a zero-dimensional object array, and since a generator object is truthy the result is always True. The changed syntax builds an actual list from the generator, and numpy.any() then evaluates the list's elements as you intended, i.e. it returns True only if some element in the list is true.
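A quick way to see the difference (a minimal demonstration; any list of strings works):
import numpy as np

data = ['a', 'b']
print(any(x == 'z' for x in data))       # built-in any: False
print(np.any(x == 'z' for x in data))    # np.any: True, the generator object itself is truthy
print(np.any([x == 'z' for x in data]))  # np.any on a list: False, as intended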

Can I use bisect to print the content of a line?

I have a file where each line is ordered alphabetically. The file is 12Gb, which means I can't simply read it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os

class Query(object):
    def __init__(self, query, index=5):
        self.query = query
        self.index = index

    def __lt__(self, comparable):
        return self.query < comparable[self.index:]

class FileSearcher(object):
    def __init__(self, file_pointer, record_size=35):
        self.file_pointer = file_pointer
        self.file_pointer.seek(0, os.SEEK_END)
        self.record_size = record_size + len(os.linesep)
        self.num_bytes = self.file_pointer.tell()
        self.file_size = (self.num_bytes // self.record_size)

    def __len__(self):
        return self.file_size

    def __getitem__(self, item):
        self.file_pointer.seek(item * self.record_size)
        return self.file_pointer.read(self.record_size)

with open('myfile') as file_to_search:
    query = 'fox\t'  # token to query
    wrapped_query = Query(query)
    searchable_file = FileSearcher(file_to_search)
    linepos = bisect.bisect(searchable_file, wrapped_query)
    print "Located # line: ", linepos
    # print content of line?
Is it possible to print the content of the line with the bisect module?
If you want to go with a Python solution, you can do the following:
Read file by small chunks of MAX_LINE bytes, each time moving forward by fixed offset
That offset determines block size
For each such read, determine the key (first word in a line)
These keys serve as delimiters of blocks
Construct the list of such keys. The list would be sorted as keys are ordered
You may persist such list somewhere via pickle/json.dumps/...
When querying, find via bisect the index of the block where your key is located
Read that block entirely and find the key with its data
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect

MAX_LINE = 7
BLOCK_SIZE = 10

def parse_chunks(filename):
    size = os.path.getsize(filename)
    chunks = []
    with open(filename, 'rb') as file:
        # decode() so we search for a real newline, not the escaped
        # one inside a bytes repr
        block = file.read(MAX_LINE * 2).decode()
        first_line = block[:block.find('\n') + 1]
        chunks.append(first_line.split()[0])
        pos = BLOCK_SIZE
        while pos < size:
            file.seek(pos)
            block = file.read(MAX_LINE * 2).decode()
            first_eol = block.find('\n')
            second_eol = block.find('\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            key = line.split()[0]
            chunks.append(key)
            pos += BLOCK_SIZE
    return chunks

if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    pos_before = bisect(chunks, query) - 1
    with open(filename, 'rb') as file:
        file.seek(pos_before * BLOCK_SIZE)
        block = file.read(BLOCK_SIZE + MAX_LINE).decode()
    line_start = block.find(query)
    line_end = block.find('\n', line_start + 1)
    line = block[line_start:line_end]
    print(line)
In this toy example I use a block size of 10 bytes; in your case of a 12 GB file I'd suggest you start with 1 MB.
The following recursive function should be able to narrow the search interval. I'm not sure about the no-match case; you may want to modify it so that it returns a match, or None for no match.
def bisearch(f, word, i, j):
    if (j - i) < 1E6:
        return i, j
    k = (i + j) // 2
    f.seek(k)
    # advance to the start of the next line
    while k < j:
        c = f.read(1)
        k = k + 1
        if c == '\n':
            break
    else:
        # ??? no match ??? I'm not sure
        pass
    # read the first word of this line, up to the tab
    w = []
    while 1:
        c = f.read(1)
        if c == '\t':
            break
        w.append(c)
    w = "".join(w)
    if w == word:
        return k, k
    if w < word:
        return bisearch(f, word, k, j)
    else:
        return bisearch(f, word, i, k)
and here is an example of usage:
word = ...
f = open(...)
len_f = ...  # total size of the file in bytes
i, j = bisearch(f, word, 0, len_f)
f.seek(i)
if i == j:
    line = f.readline()
else:
    #################### EDIT ################
    # OLD
    # buffer = f.read(1E6)
    # NEW
    buffer = f.read(j - i)
    lenw = len(word)
    for line in buffer.split('\n'):
        if line[:lenw] == word:
            break
    else:
        # no matches, SOS
        pass
result = process(line)
Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()

Extract values from string

I want to extract certain values from a string in python.
snp_1_881627 AA=G;ALLELE=A;DAF_GLOBAL=0.473901;GENE_TRCOUNT_AFFECTED=1;GENE_TRCOUNT_TOTAL=1;SEVERE_GENE=ENSG00000188976;SEVERE_IMPACT=SYNONYMOUS_CODON;TR_AFFECTED=FULL;ANNOTATION_CLASS=REG_FEATURE,SYNONYMOUS_CODON,ACTIVE_CHROM,NC_TRANSCRIPT_VARIANT,NC_TRANSCRIPT_VARIANT;A_A_CHANGE=.,L,.,.,.;A_A_LENGTH=.,750,.,.,.;A_A_POS=.,615,.,.,.;CELL=GM12878,.,GM12878,.,.;CHROM_STATE=.,.,11,.,.;EXON_NUMBER=.,16/19,.,.,.;GENE_ID=.,ENSG00000188976,.,ENSG00000188976,ENSG00000188976;GENE_NAME=.,NOC2L,.,NOC2L,NOC2L;HGVS=.,c.1843N>T,.,n.3290N>T,n.699N>T;REG_ANNOTATION=H3K36me3,.,.,.,.;TR_BIOTYPE=.,PROTEIN_CODING,.,PROCESSED_TRANSCRIPT,PROCESSED_TRANSCRIPT;TR_ID=.,ENST00000327044,.,ENST00000477976,ENST00000483767;TR_LENGTH=.,2790,.,4201,1611;TR_POS=.,1893,.,3290,699;TR_STRAND=.,-1,.,-1,-1
Output:
GENE_ID GENE_NAME EXON_NUMBER SEVERE_IMPACT
snp_1_881627 ENSG00000188976 NOC2L 16/19 SYNONYMOUS_CODON
If the string has a value for each of those variables (GENE_ID, GENE_NAME, EXON_NUMBER), output it; otherwise output "NA" (either the variable doesn't exist or its value doesn't). In some cases these variables don't exist in the string at all.
Which string method should I use to accomplish this? Should I split my string before extracting any values? I have 10k rows to extract values from, one for each snp_*:
string=string.split(';')
P.S. I am a newbie in python
There are two general strategies for this - split and regex.
To use split, first split off the row label (snp_1_881627):
rowname, data = row.split()
Then, you can split data into the individual entries using the ; separator:
data = data.split(';')
Since you need to get the value of certain keys, we can turn it into a dictionary:
dataDictionary = {}
for entry in data:
    entry = entry.split('=')
    dataDictionary[entry[0]] = entry[1] if len(entry) > 1 else None
Then you can simply check if the keys are in dataDictionary, and if so grab their values.
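For example, to build the NA-defaulted output row the question asks for (continuing with rowname and dataDictionary from the snippets above):
keywords = ['GENE_ID', 'GENE_NAME', 'EXON_NUMBER', 'SEVERE_IMPACT']
# dict.get returns None for a missing key; the "or" fallback also
# covers keys that are present but have no value
values = [dataDictionary.get(k) or "NA" for k in keywords]
print(rowname + "\t" + "\t".join(values))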
Using split is nice in that it will index everything in the data string, making it easy to grab whichever ones you need.
If the ones you need will not change, then regex might be a better option:
>>> import re
>>> re.search('(?<=GENE_ID=)[^;]*', 'onevalue;GENE_ID=SOMETHING;othervalue').group()
'SOMETHING'
Here I'm using a "lookbehind" to match one of the keywords, then grabbing the value from the match using group(). Putting your keywords into a list, you could find all the values like this:
import re
...
keywords = ['GENE_ID', 'GENE_NAME', 'EXON_NUMBER', 'SEVERE_IMPACT']
desiredValues = {}
for keyword in keywords:
    match = re.search('(?<={}=)[^;]*'.format(keyword), string_to_search)
    desiredValues[keyword] = match.group() if match else DEFAULT_VALUE
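With DEFAULT_VALUE set to "NA" as the question requires, the output row is then one join away (row_label is a hypothetical name for the leading snp_* token):
DEFAULT_VALUE = "NA"
row_label = 'snp_1_881627'  # however you split it off the input row
print(row_label + "\t" + "\t".join(desiredValues[k] for k in keywords))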
I think this is going to be the solution you are looking for.
# input
user_in = 'snp_1_881627 AA=G;ALLELE=A;DAF_GLOBAL=0.473901;GENE_TRCOUNT_AFFECTED=1;GENE_TRCOUNT_TOTAL=1;SEVERE_GENE=ENSG00000188976;SEVERE_IMPACT=SYNONYMOUS_CODON;TR_AFFECTED=FULL;ANNOTATION_CLASS=REG_FEATURE,SYNONYMOUS_CODON,ACTIVE_CHROM,NC_TRANSCRIPT_VARIANT,NC_TRANSCRIPT_VARIANT;A_A_CHANGE=.,L,.,.,.;A_A_LENGTH=.,750,.,.,.;A_A_POS=.,615,.,.,.;CELL=GM12878,.,GM12878,.,.;CHROM_STATE=.,.,11,.,.;EXON_NUMBER=.,16/19,.,.,.;GENE_ID=.,ENSG00000188976,.,ENSG00000188976,ENSG00000188976;GENE_NAME=.,NOC2L,.,NOC2L,NOC2L;HGVS=.,c.1843N>T,.,n.3290N>T,n.699N>T;REG_ANNOTATION=H3K36me3,.,.,.,.;TR_BIOTYPE=.,PROTEIN_CODING,.,PROCESSED_TRANSCRIPT,PROCESSED_TRANSCRIPT;TR_ID=.,ENST00000327044,.,ENST00000477976,ENST00000483767;TR_LENGTH=.,2790,.,4201,1611;TR_POS=.,1893,.,3290,699;TR_STRAND=.,-1,.,-1,-1'
# set some empty vars
user_in = user_in.split(';')
final_output = ""
GENE_ID_FOUND = False
GENE_NAME_FOUND = False
EXON_NUMBER_FOUND = False
GENE_ID_OUTPUT = ''
GENE_NAME_OUTPUT = ''
EXON_NUMBER_OUTPUT = ''
SEVERE_IMPACT_OUTPUT = ''
for x in range(0, len(user_in)):
    if x == 0:
        first_line_count = 0
        first_line_print = ''
        while user_in[0][first_line_count] != " ":
            first_line_print += user_in[0][first_line_count]
            first_line_count += 1
        final_output += first_line_print + "\t"
    else:
        if user_in[x][0:11] == "SEVERE_GENE":
            GENE_ID_OUTPUT += user_in[x][12:] + "\t"
            GENE_ID_FOUND = True
        if user_in[x][0:9] == "GENE_NAME":
            GENE_NAME_OUTPUT += user_in[x][10:] + "\t"
            GENE_NAME_FOUND = True
        if user_in[x][0:11] == "EXON_NUMBER":
            EXON_NUMBER_OUTPUT += user_in[x][12:] + "\t"
            EXON_NUMBER_FOUND = True
        if user_in[x][0:13] == "SEVERE_IMPACT":
            SEVERE_IMPACT_OUTPUT += user_in[x][14:] + "\t"
if GENE_ID_FOUND == True:
    final_output += GENE_ID_OUTPUT
else:
    final_output += "NA"
if GENE_NAME_FOUND == True:
    final_output += GENE_NAME_OUTPUT
else:
    final_output += "NA"
if EXON_NUMBER_FOUND == True:
    final_output += EXON_NUMBER_OUTPUT
else:
    final_output += "NA"
final_output += SEVERE_IMPACT_OUTPUT
print(final_output)
