Python loop if statement exit

I want to find two elements in the first page of a PDF file: the element that starts with P and the element that starts with N.
Even if one element has been found, the search for the other still needs to continue:
If P is found, continue searching for N.
If N is found, continue searching for P.
When both P and N are found, stop searching.
I am having trouble with my loops: N is only found if there is no P.
I gather there is an issue with the if statements, but I can't seem to correct it.
if text is not None:
    p_new = "NoPFound"
    n_new = "NoNFound"
    for line in text.split('\n'):
        #get p
        if line.startswith('P '):
            pieces = line.split()
            p_old = pieces[1]
            p_new = (p_old[7:11]) + (p_old[0:6])
            break
        #get n
        if line.startswith('N '):
            pieces = line.split()
            n_old = pieces[1]
            n_new = (n_old[0:15]) + "/" + (n_old[18:20])
            break
    list_parsed_items.append([p_new, n_new])

Use flags. You have a small state machine.
if text is not None:
    p_new = None
    n_new = None
    for line in text.splitlines():
        #get p
        if not p_new and line.startswith('P '):
            pieces = line.split()
            p_old = pieces[1]
            p_new = (p_old[7:11]) + (p_old[0:6])
        #get n
        if not n_new and line.startswith('N '):
            pieces = line.split()
            n_old = pieces[1]
            n_new = (n_old[0:15]) + "/" + (n_old[18:20])
        if p_new and n_new:
            break
    list_parsed_items.append([p_new, n_new])
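A minimal sketch of the effect, with made-up input lines and the field slicing dropped for brevity:

def parse_page(text):
    # Same flag logic as above, condensed for a quick test.
    p_new = n_new = None
    for line in text.splitlines():
        if not p_new and line.startswith('P '):
            p_new = line.split()[1]
        if not n_new and line.startswith('N '):
            n_new = line.split()[1]
        if p_new and n_new:
            break
    return p_new, n_new

print(parse_page("N 123\nP 456"))  # ('456', '123') -- found in either order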


Python: Rosalind Consensus and Profile

I am trying to solve the "Consensus and Profile" challenge on Rosalind.
The challenge instructions are as follows:
Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)
My code is as follows (I got most of it from another user on this website). My only issue is that some of the DNA strands are broken across multiple lines in the file, so they are being appended to the "allstrings" list as separate strings. I am trying to figure out how to join each run of consecutive lines that do not contain ">" into a single string.
import numpy as np

seq = []
allstrings = []
temp_seq = []
matrix = []
C = []
G = []
T = []
A = []
P = []
consensus = []
position = 1

file = open("C:/Users/knigh/Documents/rosalind_cons (3).txt", "r")
conout = open("C:/Users/knigh/Documents/consensus.txt", "w")

# Right now, this is reading and writing each as an individual line. Thus, it
# is splitting each sequence into multiple small sequences. You need to figure
# out how to read this in FASTA format to prevent this from occurring
desc = file.readlines()
for line in desc:
    allstrings.append(line)
for string in range(1, len(allstrings)):
    if ">" not in allstrings[string]:
        temp_seq.append(allstrings[string])
    else:
        seq.insert(position, temp_seq[0])
        temp_seq = []
        position += 1
# This last insertion into the sequence must be performed after the loop to empty
# out the last remaining string from temp_seq
seq.insert(position, temp_seq[0])

for base in seq:
    matrix.append([pos for pos in base])
M = np.array(matrix).reshape(len(seq), len(seq[0]))

for base in range(len(seq[0])):
    A_count = 0
    C_count = 0
    G_count = 0
    T_count = 0
    for pos in M[:, base]:
        if pos == "A":
            A_count += 1
        elif pos == "C":
            C_count += 1
        elif pos == "G":
            G_count += 1
        elif pos == "T":
            T_count += 1
    A.append(A_count)
    C.append(C_count)
    G.append(G_count)
    T.append(T_count)

profile_matrix = {"A": A, "C": C, "G": G, "T": T}
P.append(A)
P.append(C)
P.append(G)
P.append(T)
profile = np.array(P).reshape(4, len(A))

for pos in range(len(A)):
    if max(profile[:, pos]) == profile[0, pos]:
        consensus.append("A")
    elif max(profile[:, pos]) == profile[1, pos]:
        consensus.append("C")
    elif max(profile[:, pos]) == profile[2, pos]:
        consensus.append("G")
    elif max(profile[:, pos]) == profile[3, pos]:
        consensus.append("T")

conout.write("".join(consensus) + "\n")
for k, v in profile_matrix.items():
    conout.write(k + ": " + " ".join(str(x) for x in v) + "\n")
conout.close()
There are a couple of ways that you can iterate a FASTA file as records. You can use a prebuilt library or write your own.
A widely used library for working with sequence data is biopython. This snippet collects each record's sequence into a list.
from Bio import SeqIO

file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
    for record in SeqIO.parse(file_handle, "fasta"):
        sequences.append(record.seq)
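One caveat: SeqIO.parse yields SeqRecord objects, so record.seq is a Seq object rather than a plain string; append str(record.seq) instead if the rest of your code expects ordinary strings.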
Alternatively, you can write your own FASTA parser. Something like this should work:
def read_fasta(fh):
    # Iterate to get the first FASTA header
    for line in fh:
        if line.startswith(">"):
            name = line[1:].strip()
            break
    # This list will hold the sequence lines
    fa_lines = []
    # Now iterate to collect the (possibly multiline) records
    for line in fh:
        if line.startswith(">"):
            # Reaching this block means we have hit
            # the next FASTA record:
            # yield the previous record's name and
            # sequence as a tuple that we can unpack
            yield name, "".join(fa_lines)
            # Reset the sequence lines and save the
            # name of the next record
            fa_lines = []
            name = line[1:].strip()
            # skip to next line
            continue
        fa_lines.append(line.strip())
    yield name, "".join(fa_lines)
You can use this function like so:
file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
    for name, seq in read_fasta(file_handle):
        sequences.append(seq)

Python to Quickly Change 300K lines in an 8KK-line file

I am a chip test engineer, and I have one big text file of about 8KK lines, most of which contain '='. I also have a log file of about 300K lines, each of which records a test failure. I need to change the corresponding 300K lines of the original file.
Currently it takes about 15 hours to finish the job.
I have an existing solution, but it is too slow.
In the code, parse_log processes the log file to determine each modification to be made, and stil_parse does the following:
reads the file into a list in memory;
iterates over the list, modifying each line that is referenced in the log file;
writes the list back to disk.
import re  # needed by parse_log below

from tqdm import tqdm

class MaskStil:
    def __init__(self):
        self.log_signal_file = ''
        self.pattern = r"^([^:]+)(:)(\d+)(\s+)(\d+)(\s+)(\d+)(\s+)(\d+)(\s)([.LH]+)$"
        self.log_signal = {}
        self.log_lines = []
        self.mask_dict = {}
        self.stil_name_new = ''
        self.stil_name = ''
        self.signal_all = {}
        self.signal_group = []
        self.offset = 0
        self.mask_mode = -1  # mask_mode 0: revert between L/H; mask_mode 1: mask L/H to Z
        self.convert_value = [{"L": "H", "H": "L"}, {"L": "Z", "H": "Z"}]
        for i in range(100):
            self.log_signal[i] = ''

    def digest(self, log_signal, stil_file, signal_group, offset, mask_mode=1):
        self.log_signal_file = log_signal
        self.stil_name = stil_file
        self.stil_name_new = stil_file[:-5] + '_mask.stil'
        self.signal_group = signal_group.replace('=', '+').strip().split('+')
        self.offset = offset
        self.mask_mode = mask_mode
        for i in range(1, len(self.signal_group)):
            self.signal_all[self.signal_group[i]] = (i - 1) // 10 + i  # '//' keeps the index an int on Python 3
        print(self.signal_all)
        self.parse_log()
        self.stil_parse()

    def parse_log(self):
        with open(self.log_signal_file) as infile:
            line_num = 0
            blank_line = 0
            for line in infile:
                line_num += 1
                if line_num == 1:
                    blank_line = line.count(' ')
                if "------------------" in line:
                    break
                for i in range(blank_line, len(line)):
                    self.log_signal[i - blank_line] += line[i]
        for (key, value) in self.log_signal.items():
            self.log_signal[key] = value.rstrip()
        print(self.log_signal)
        with open(self.log_signal_file) as log_in:
            self.log_lines = log_in.read().splitlines()
            for line in self.log_lines:
                if re.match(self.pattern, line):
                    match = re.match(self.pattern, line)
                    cycle = int(match.group(9))
                    signals = match.group(11)
                    # print cycle,signals
                    self.mask_dict[cycle] = {}
                    for i in range(len(signals)):
                        if signals[i] != '.':
                            self.mask_dict[cycle][i] = signals[i]

    def stil_parse(self):
        cycle_keys = []
        vector_num = 0
        for i in self.mask_dict.keys():
            cycle_keys.append(i)
        with open(self.stil_name, 'r') as stil_in:
            stil_in_list = stil_in.read().splitlines()
        total_len = len(stil_in_list)
        vector_cycle_dict = {}
        with tqdm(total=total_len, ncols=100, desc=" Stil Scanning in RAM Progress") as pbar:
            for i_iter in range(total_len):
                line = stil_in_list[i_iter]
                pbar.update(1)
                if "=" in line:
                    vector_num += 1
                    if vector_num in cycle_keys:
                        vector_cycle_dict[vector_num] = i_iter
                    status = line[line.find("=") + 1:line.find(";")]
                    # if cycle + self.offset in cycle_keys:
                    if vector_num in cycle_keys:
                        match = 1
                        for (i, j) in self.mask_dict[vector_num].items():  # .iteritems() is Python 2 only
                            mask_point = i
                            mask_signal = self.log_signal[i]
                            mask_value = j
                            test_point = self.signal_all[mask_signal]
                            test_value = status[test_point]
                            if test_value != mask_value:
                                print("data did not match for cycle: ", test_value, " VS ", line, j, vector_num, mask_point, mask_signal, test_point, test_value)
                                match = 0
                                raise NameError
                            else:
                                status = status[:test_point] + self.convert_value[self.mask_mode][test_value] + status[test_point + 1:]
                        if match == 1:
                            replace_line = line[:line.find("=") + 1] + status + line[line.find(";"):]
                            print("data change from :", line)
                            print(" to:", replace_line)
                            stil_in_list[i_iter] = replace_line
                        else:
                            print("No matching for %d with %s" % (vector_num, line))
                            raise NameError
        with tqdm(total=len(stil_in_list), ncols=100, desc=" Masked-stil to disk Progress") as pbar:
            with open(self.stil_name_new, 'w') as stil_out:
                for new_line in range(len(stil_in_list)):
                    pbar.update(1)
                    stil_out.write(stil_in_list[new_line] + "\n")  # write the line itself, not the bare index
I was expecting a solution that could finish in about 1 or 2 hours.
As I mentioned in the comments, you can get some speedup by refactoring your code to be multithreaded or multiprocess.
I imagine you're also running into memory swapping issues here. If that's the case, this should help:
with open(self.log_signal_file) as log_in:
    line = log_in.readline()  # First line. Need logic to handle empty logs
    while line:  # Will return false at EOF
        if re.match(self.pattern, line):
            match = re.match(self.pattern, line)
            cycle = int(match.group(9))
            signals = match.group(11)
            # print cycle,signals
            self.mask_dict[cycle] = {}
            for i in range(len(signals)):
                if signals[i] != '.':
                    self.mask_dict[cycle][i] = signals[i]
        line = log_in.readline()
Here we only read in one line at a time, so you don't have to try to hold 8KK lines in memory.
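The same lazy behaviour falls out of iterating the file object directly, which avoids the manual readline() bookkeeping:

with open(self.log_signal_file) as log_in:
    for line in log_in:  # the file object yields one line at a time
        if not re.match(self.pattern, line):
            continue
        # ... same matching body as above ...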
*In case anyone else didn't know, KK means million apparently.
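As for the multithreading/multiprocessing idea, here is a minimal multiprocessing sketch. The caveat is that it only applies if each line can be transformed independently; in this particular script the masking decision depends on a running vector counter, so the input would have to be partitioned at vector boundaries first. mask_line and the file names below are hypothetical:

import multiprocessing as mp

def mask_line(line):
    # Hypothetical per-line transform; stands in for the real masking logic.
    return line

if __name__ == "__main__":
    with open("input.stil") as fin:
        lines = fin.readlines()
    with mp.Pool() as pool:
        # chunksize batches lines to cut inter-process overhead on millions of lines
        masked = pool.map(mask_line, lines, chunksize=10000)
    with open("input_mask.stil", "w") as fout:
        fout.writelines(masked)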
I managed to optimize the solution, and the time consumed dropped tremendously, to about 1 minute.
The optimization is mainly in the following areas (see the sketch just below):
instead of repeatedly checking if (vector_num in cycle_keys):, I keep the cycle keys in an ordered list and only test for equality against the entry at index_to_mask;
I store line.find("=") and line.find(";") in the variables line_find_equal and line_find_coma for reuse.
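The first point is a merge-style scan: because the vectors are encountered in increasing order and cycle_keys is ascending, a single integer index replaces the repeated O(n) membership test. A stripped-down sketch under assumed names (mask_dict and stil_lines stand in for the real data):

cycle_keys = sorted(mask_dict)      # cycles that need masking, ascending
index_to_mask = 0
vector_num = 0
for line in stil_lines:
    if "=" not in line:
        continue
    vector_num += 1
    # One equality check instead of scanning the whole key list:
    if index_to_mask < len(cycle_keys) and vector_num == cycle_keys[index_to_mask]:
        # ... mask this line ...
        index_to_mask += 1          # advance to the next cycle of interest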
import math
import re

from tqdm import tqdm

class MaskStil:
    def __init__(self):
        self.log_signal_file = ''
        self.pattern = r"^([^:]+)(:)(\d+)(\s+)(\d+)(\s+)(\d+)(\s+)(\d+)(\s)([.LH]+)$"
        self.log_signal = {}
        self.log_lines = []
        self.mask_dict = {}
        self.stil_name_new = ''
        self.stil_name = ''
        self.signal_all = {}
        self.signal_group = []
        self.offset = 0
        self.mask_mode = -1  # mask_mode 0: revert between L/H; mask_mode 1: mask L/H to Z
        self.convert_value = [{"L": "H", "H": "L"}, {"L": "Z", "H": "Z"}]
        for i in range(100):
            self.log_signal[i] = ''

    def digest(self, log_signal, stil_file, signal_group, offset, mask_mode=1):
        self.log_signal_file = log_signal
        self.stil_name = stil_file
        self.stil_name_new = stil_file[:-5] + '_mask.stil'
        self.signal_group = signal_group.replace('=', '+').strip().split('+')
        self.offset = offset
        self.mask_mode = mask_mode
        for i in range(1, len(self.signal_group)):
            self.signal_all[self.signal_group[i]] = int(math.floor((i - 1) / 10) + i)
        print(self.signal_all)
        self.parse_log()
        self.stil_parse()

    def parse_log(self):
        with open(self.log_signal_file) as infile:
            line_num = 0
            blank_line = 0
            for line in infile:
                line_num += 1
                if line_num == 1:
                    blank_line = line.count(' ')
                if "------------------" in line:
                    break
                for i in range(blank_line, len(line)):
                    self.log_signal[i - blank_line] += line[i]
        for (key, value) in self.log_signal.items():
            self.log_signal[key] = value.rstrip()
        print(self.log_signal)
        with open(self.log_signal_file) as log_in:
            self.log_lines = log_in.read().splitlines()
            for line in self.log_lines:
                if re.match(self.pattern, line):
                    match = re.match(self.pattern, line)
                    cycle = int(match.group(9))
                    signals = match.group(11)
                    # print cycle,signals
                    self.mask_dict[cycle] = {}
                    for i in range(len(signals)):
                        if signals[i] != '.':
                            self.mask_dict[cycle][i] = signals[i]

    def stil_parse(self):
        cycle_keys = []
        vector_num = 0
        for i in self.mask_dict.keys():
            cycle_keys.append(i)
        with open(self.stil_name, 'r') as stil_in:
            stil_in_list = stil_in.read().splitlines()
        total_len = len(stil_in_list)
        index_to_mask = 0
        with tqdm(total=total_len, ncols=100, desc=" Stil Scanning in RAM Progress") as pbar:
            for i_iter in range(total_len):
                line = stil_in_list[i_iter]
                pbar.update(1)
                if "=" in line:
                    vector_num += 1
                    if vector_num <= cycle_keys[-1]:
                        if vector_num == cycle_keys[index_to_mask]:
                            line_find_equal = line.find("=")
                            line_find_coma = line.find(";")
                            status = line[line_find_equal + 1:line_find_coma]
                            # if cycle + self.offset in cycle_keys:
                            try:
                                match = 1
                                for (i, j) in self.mask_dict[vector_num].items():
                                    mask_point = i
                                    mask_signal = self.log_signal[i]
                                    mask_value = j
                                    test_point = self.signal_all[mask_signal]
                                    test_value = status[test_point]
                                    if test_value != mask_value:
                                        print("data did not match for cycle: ", test_value, " VS ", line, j, vector_num, mask_point, mask_signal, test_point, test_value)
                                        match = 0
                                        raise NameError
                                    else:
                                        status = status[:test_point] + self.convert_value[self.mask_mode][test_value] + status[test_point + 1:]
                                stil_in_list[i_iter] = line[:line_find_equal + 1] + status + line[line_find_coma:]
                                # print("data change from :", line)
                                # print(" to:", stil_in_list[i_iter])
                                index_to_mask = index_to_mask + 1
                            except Exception as e:
                                print("No matching for %d with %s" % (vector_num, line))
                                raise NameError
        with tqdm(total=len(stil_in_list), ncols=100, desc=" Masked-stil to disk Progress") as pbar:
            with open(self.stil_name_new, 'w') as stil_out:
                for i_iter in range(len(stil_in_list)):
                    pbar.update(1)
                    stil_out.write(stil_in_list[i_iter] + "\n")

While reading a text file sequentially how to go back to a particular line and start again from there

Newbie alert!
Path = "C:/Users/Kailash/Downloads/Results_For_Stride-Table.csv"
counter_start = 0
counter_end = 0
num_lines = len(open(Path).read().splitlines())
print("num_lines = ", num_lines)
with open(Path, "r") as f:
    for lines in f:
        print("lines = ", lines)
        counter_end += 1
        Stride_Length = lines.split(", ")
        Previous_Sum = distances
        Previous_Sum_GCT = Total_GCT
        Previous_Heading = Sum_Heading
        GCT = float(Stride_Length[2])
        Total_GCT += GCT
        print("Total_GCT = ", Total_GCT)
        distances += float(Stride_Length[3])
        print("distances = ", distances)
        Sum_Heading += float(Stride_Length[7])
        print("Sum_Heading = ", Sum_Heading)
        print("counter_end = ", counter_end)
        if GCT == 0.00:
            distances = 0
            counter_end = 0
        if distances > 762:
            print("counter_end = ", counter_end)
            counter_start = counter_end
            lines_test = f.readlines()
            print("counter start = ", counter_start)
            print("move = ", lines_test[counter_start-counter_end-1])
            print("Distance is above 762")
            distances = 0
I want to know how to go back to a particular line in a file and start reading from there again in Python. When I try to use f.readlines() (five lines from the end of my code), the processing stops right there.
You can build a list of line start positions (file offsets), and then use file.seek(line_offsets[n]) to go back to the nth line (counting from zero). After that you can read the line (and those following it sequentially) once again.
Here's example code showing building such a list incrementally:
filepath = "somefile"
line_offsets = []
with open(filepath, "r") as file:
    while True:
        posn = file.tell()
        line = file.readline()
        if not line:  # end-of-file?
            break
        line_offsets.append(posn)  # Remember where line started.
        """ Process line """
Alternatively, iterate over the data again and skip lines until you reach the one you want:
... code above ...
for line in data:
    if counter_end <= <line to stop at>:
        continue
    counter_end += 1
...
Or use linecache, which can fetch an arbitrary line directly:
import linecache
a = linecache.getlines('D:/aaa.txt')[5]  # the sixth line; the list is zero-indexed

compare an exact word with the txt file

I am trying to get exact word matches from my file, along with their line numbers.
Currently, when I search for abc10 it also gives me all the near matches, e.g. abc102, abc103, etc.
How can I restrict my code to print only exactly what I searched for?
Here is my code:
lineNo = 0
linesFound = []
inFile = open('rxmop.txt', 'r')
sKeyword = input("enter word ")
done = False
while not done:
    pos = inFile.tell()
    sLine = inFile.readline()
    if sLine == "":
        done = True
        break
    if sLine.find(sKeyword) != -1:
        print("Found at line: " + str(lineNo))
        tTuple = lineNo, pos
        linesFound.append(tTuple)
    lineNo = lineNo + 1
done = False
while not done:
    command = int(input("Enter the line you want to view: "))
    if command == -1:
        done = True
        break
    for tT in linesFound:
        if command == tT[0]:
            inFile.seek(tT[1])
            lLine = inFile.readline()
            print("The line at position " + str(tT[1]) + " is: " + lLine)
"like when i search for abc10 it gives me all the possible answers e.g abc102 abc103 etc"
You split each record and compare whole "words" only.
to_find = "RXOTG-10"
list_of_possibles = ["RXOTG-10 QTA5777 HYB SY G12",
                     "RXOTG-100 QTA9278 HYB SY G12"]
for rec in list_of_possibles:
    words_list = rec.strip().split()
    if to_find in words_list:
        print("found", rec)
    else:
        print(" NOT found", rec)

python is inexplicably shortening the step size with each iteration of a sliding window analysis

I am working on a program that estimates the statistic Tajima's D in a series of sliding windows across a chromosome. The chromosome itself is also divided into a number of different regions with (hopefully) functional significance. The sliding window analysis is performed by my script on each region.
At the start of the program, I define the size of the sliding windows and the size of the steps that move from one window to the next. I import a file which contains the coordinates for each chromosomal region, and another file which contains all the SNP data I am working with (this is read line by line, as it is a large file). The program loops through the list of chromosomal locations. For each location, it generates an index of steps and windows for the analysis, partitions the SNP data into output files (corresponding with the steps), calculates key statistics for each step file, and combines these statistics to estimate Tajima's D for each window.
The program works well for small files of SNP data. It also works well for the first iteration over the first chromosomal break point. However, for large files of SNP data, the step size in the analysis inexplicably decreases as the program iterates over each chromosomal region. For the first chromosomal region, the step size is 2500 nucleotides (as it is supposed to be). For the second chromosome segment, however, the step size is 1966, and for the third it is 732.
If anyone has any suggestions as to why this might be the case, please let me know. I am especially stumped because this program seems to work fine for small files but not for larger ones.
My code is below:
import sys
import math
import fileinput
import shlex
import string

windowSize = int(500)
stepSize = int(250)
n = int(50)  # number of individuals in the analysis

SNP_file = open("SNPs-1.txt", 'r')
SNP_file.readline()
breakpoints = open("C:/Users/gwilymh/Desktop/Python/Breakpoint coordinates.txt", 'r')
breakpoints = list(breakpoints)
numSegments = len(breakpoints)

# Open a file to store the Tajima's D results:
outputFile = open("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/Tajima's D estimates.txt", 'a')
outputFile.write(str("segmentNumber\tchrSegmentName\tsegmentStart\tsegmentStop\twindowNumber\twindowStart\twindowStop\tWindowSize\tnSNPs\tS\tD\n"))

# Calculating parameters a1, a2, b1, b2, c1 and c2
numPairwiseComparisons = n*((n-1)/2)
b1 = (n+1)/(3*(n-1))
b2 = (2*(n**2+n+3))/(9*n*(n-1))
num = list(range(1, n))  # n-1 values as a list
i = 0
a1 = 0
for i in num:
    a1 = a1+(1/i)
    i = i+1
j = 0
a2 = 0
for j in num:
    a2 = a2+(1/j**2)
    j = j+1
c1 = (b1/a1)-(1/a1**2)
c2 = (1/(a1**2+a2))*(b2 - ((n+2)/(a1*n)) + (a2/a1**2))

counter6 = 0
# For each segment, assign a number and identify the start and stop coordinates and the segment name
for counter6 in range(counter6, numSegments):
    segment = shlex.shlex(breakpoints[counter6], posix=True)
    segment.whitespace += '\t'
    segment.whitespace_split = True
    segment = list(segment)
    segmentName = segment[0]
    segmentNumber = int(counter6+1)
    segmentStartPos = int(segment[1])
    segmentStopPos = int(segment[2])
    outputFile1 = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt") % (str(segmentNumber), str(segmentName))), 'a')
    # Make output files to index the locations of each window within each segment
    windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt") % (str(segmentNumber), str(segmentName))), 'a')
    k = segmentStartPos - 1
    windowNumber = 0
    while (k+1) <= segmentStopPos:
        windowStart = k+1
        windowNumber = windowNumber+1
        windowStop = k + windowSize
        if windowStop > segmentStopPos:
            windowStop = segmentStopPos
        windowFileIndex.write(("%s\t%s\t%s\n") % (str(windowNumber), str(windowStart), str(windowStop)))
        k = k+stepSize
    windowFileIndex.close()
    # Make output files for each step to export the corresponding SNP data into + an index of these output files
    stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt") % (str(segmentNumber), str(segmentName))), 'a')
    i = segmentStartPos-1
    stepNumber = 0
    while (i+1) <= segmentStopPos:
        stepStart = i+1
        stepNumber = stepNumber+1
        stepStop = i+stepSize
        if stepStop > segmentStopPos:
            stepStop = segmentStopPos
        stepFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt") % (str(segmentNumber), str(segmentName), str(stepNumber))), 'a')
        stepFileIndex.write(("%s\t%s\t%s\n") % (str(stepNumber), str(stepStart), str(stepStop)))
        i = i+stepSize
        stepFile.close()
    stepFileIndex.close()
    # Open the index file for each step in the current chromosomal segment
    stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt") % (str(segmentNumber), str(segmentName))), 'r')
    stepFileIndex = list(stepFileIndex)
    numSteps = len(stepFileIndex)
    while 1:
        currentSNP = SNP_file.readline()
        if not currentSNP:
            break
        currentSNP = shlex.shlex(currentSNP, posix=True)
        currentSNP.whitespace += '\t'
        currentSNP.whitespace_split = True
        currentSNP = list(currentSNP)
        SNPlocation = int(currentSNP[0])
        if SNPlocation > segmentStopPos:
            break
        stepIndexBin = int(((SNPlocation-segmentStartPos-1)/stepSize)+1)
        # print(SNPlocation, stepIndexBin)
        writeFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt") % (str(segmentNumber), str(segmentName), str(stepIndexBin))), 'a')
        writeFile.write((("%s\n") % (str(currentSNP[:]))))
        writeFile.close()
    counter3 = 0
    for counter3 in range(counter3, numSteps):
        # open up each step in the list of steps across the chromosomal segment:
        L = shlex.shlex(stepFileIndex[counter3], posix=True)
        L.whitespace += '\t'
        L.whitespace_split = True
        L = list(L)
        # print(L)
        stepNumber = int(L[0])
        stepStart = int(L[1])
        stepStop = int(L[2])
        stepSize = int(stepStop-(stepStart-1))
        # Now open the file of SNPs corresponding with the window in question and convert it into a list:
        currentStepFile = open(("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt") % (str(segmentNumber), str(segmentName), str(counter3+1)), 'r')
        currentStepFile = list(currentStepFile)
        nSNPsInCurrentStepFile = len(currentStepFile)
        print("number of SNPs in this step is:", nSNPsInCurrentStepFile)
        # print(currentStepFile)
        if nSNPsInCurrentStepFile == 0:
            mismatchesPerSiteList = [0]
        else:
            # For each line of the file, estimate the per site parameters relevant to Tajima's D
            mismatchesPerSiteList = list()
            counter4 = 0
            for counter4 in range(counter4, nSNPsInCurrentStepFile):
                CountA = 0
                CountG = 0
                CountC = 0
                CountT = 0
                x = counter4
                lineOfData = currentStepFile[x]
                counter5 = 0
                for counter5 in range(0, len(lineOfData)):
                    if lineOfData[counter5] == ("A" or "a"):
                        CountA = CountA+1
                    elif lineOfData[counter5] == ("G" or "g"):
                        CountG = CountG+1
                    elif lineOfData[counter5] == ("C" or "c"):
                        CountC = CountC+1
                    elif lineOfData[counter5] == ("T" or "t"):
                        CountT = CountT+1
                    else:
                        continue
                AxG = CountA*CountG
                AxC = CountA*CountC
                AxT = CountA*CountT
                GxC = CountG*CountC
                GxT = CountG*CountT
                CxT = CountC*CountT
                NumberMismatches = AxG+AxC+AxT+GxC+GxT+CxT
                mismatchesPerSiteList = mismatchesPerSiteList+[NumberMismatches]
        outputFile1.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n") % (segmentNumber, segmentName, stepNumber, stepStart, stepStop, stepSize, nSNPsInCurrentStepFile, sum(mismatchesPerSiteList))))
    outputFile1.close()
    windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt") % (str(segmentNumber), str(segmentName))), 'r')
    windowFileIndex = list(windowFileIndex)
    numberOfWindows = len(windowFileIndex)
    stepData = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt") % (str(segmentNumber), str(segmentName))), 'r')
    stepData = list(stepData)
    numberOfSteps = len(stepData)
    counter = 0
    for counter in range(counter, numberOfWindows):
        window = shlex.shlex(windowFileIndex[counter], posix=True)
        window.whitespace += "\t"
        window.whitespace_split = True
        window = list(window)
        windowNumber = int(window[0])
        firstCoordinateInCurrentWindow = int(window[1])
        lastCoordinateInCurrentWindow = int(window[2])
        currentWindowSize = lastCoordinateInCurrentWindow - firstCoordinateInCurrentWindow + 1
        nSNPsInThisWindow = 0
        nMismatchesInThisWindow = 0
        counter2 = 0
        for counter2 in range(counter2, numberOfSteps):
            step = shlex.shlex(stepData[counter2], posix=True)
            step.whitespace += "\t"
            step.whitespace_split = True
            step = list(step)
            lastCoordinateInCurrentStep = int(step[4])
            if lastCoordinateInCurrentStep < firstCoordinateInCurrentWindow:
                continue
            elif lastCoordinateInCurrentStep <= lastCoordinateInCurrentWindow:
                nSNPsInThisStep = int(step[6])
                nMismatchesInThisStep = int(step[7])
                nSNPsInThisWindow = nSNPsInThisWindow + nSNPsInThisStep
                nMismatchesInThisWindow = nMismatchesInThisWindow + nMismatchesInThisStep
            elif lastCoordinateInCurrentStep > lastCoordinateInCurrentWindow:
                break
        if nSNPsInThisWindow == 0:
            S = 0
            D = 0
        else:
            S = nSNPsInThisWindow/currentWindowSize
            pi = nMismatchesInThisWindow/(currentWindowSize*numPairwiseComparisons)
            print(nSNPsInThisWindow, nMismatchesInThisWindow, currentWindowSize, S, pi)
            D = (pi-(S/a1))/math.sqrt(c1*S + c2*S*(S-1/currentWindowSize))
        outputFile.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n") % (segmentNumber, segmentName, segmentStartPos, segmentStopPos, windowNumber, firstCoordinateInCurrentWindow, lastCoordinateInCurrentWindow, currentWindowSize, nSNPsInThisWindow, S, D)))
A quick search shows that you do change your stepSize on line 110:
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
stepStop and stepStart appear to depend on your files' contents, so we can't debug it further.
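A plausible fix (a sketch, using a new hypothetical name): compute the per-step width into its own variable so the module-level stepSize that drives window and step generation is never clobbered:

stepStart = int(L[1])
stepStop = int(L[2])
currentStepWidth = stepStop - (stepStart - 1)  # renamed; stepSize is left untouched

and pass currentStepWidth, rather than stepSize, to the outputFile1.write(...) call that follows.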
