Python Pandas How to save output to csv - python

Hello now im working on my project. I want to get candidate of text block by using algorithm below.
My input is a csv document which contain :
HTML column : the html code in a line
TAG column : the tag of html code in a line
Words : the text inside the tag in aline
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tag in a line
P : the number of tag p and br in a line
CTTD : TC + (0.2*LTC) + TG - P
CTTDs : the smoothed CTTD
This is my algorithm to find candidate of text block. I make the csv file into dataframe using pandas. I am using CTTDs,TC and TG column to find the candidate.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv
filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
file_html=open(str(f),"r")
df = pd.read_csv(file_html)
#df = pd.read_csv('smoothing/Smoothing001.csv')
news = np.array(df['CTTDs'])
new = np.array(df['TG'])
minval = np.min(news[np.nonzero(news)])
maxval = np.max(news[np.nonzero(news)])
j = 0.2
thetaCTTD = minval + j * (maxval-minval)
#maxGap = np.max(new[np.nonzero(new)])
#minGap = np.min(new[np.nonzero(new)])
thetaGap = np.min(new[np.nonzero(new)])
#print thetaCTTD
#print maxval
#print minval
#print thetaGap
def create_candidates(df, thetaCTTD, thetaGAP):
k = 0
TB = {}
TC = 0
for index in range(0, len(df) - 1):
start = index
if df.ix[index]['CTTDs'] > thetaCTTD:
start = index
gap = 0
TC = df.ix[index]['TC']
for index in range(index + 1, len(df) - 1):
if df.ix[index]['TG'] == 0:
continue
elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
break
elif df.ix[index]['CTTDs'] <= thetaCTTD:
gap += 1
TC += df.ix[index]['TC']
if (TC < 1) or (start == index):
continue
TB.update({
k: {
'start': start,
'end': index - 1
}
})
k += 1
return TB
def get_unique_candidate(TB):
TB = tb.copy()
for key, value in tb.iteritems():
if key == len(tb) - 1:
break
if value['end'] == tb[key+1]['end']:
del TB[key+1]
elif value['start'] < tb[key+1]['start'] < value['end']:
TB[key]['end'] = tb[key+1]['start'] - 1
else:
continue
return TB
index += 1
stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
tb = create_candidates(df, thetaCTTD, thetaGap)
TB = get_unique_candidate(tb)
filewrite = open(stored_file, "wb")
df_list = []
for (k, d) in TB.iteritems():
candidate_df = df.loc[d['start']:d['end']]
candidate_df['candidate'] = k
df_list.append(candidate_df)
output_df = pd.concat(df_list)
output_df.to_csv(stored_file)
writer = csv.writer(filewrite, lineterminator='\n')
filewrite.close
ThetaCTTD is 10.36 and thethaGap is 1.
The output is
The output means there are 2 candidates of text block . First the candiate of text block start from line number 215 and end line number 225 (like the pict bellow). And the other candidate of text block start from line number 500 and end line number 501.
My question is how to save the output into csv and not only the number of line but the range of the text block and the others column will appear as the output too?
My expected output is like the screenshot of candidate text block is like this one

Assuming your output is a list of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
candidate_df = df.loc[d['start']:d['end']]
candidate_df['candidate'] = k
df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.

Related

PDF splitting with Bookmarks in python through PyPDF4 - bookmarks are losing in the output

I am trying to create a script to split the pdf pages for the given page numbers/labels from the pdf, the script are producing the split pdf correctly, but few information are losing, and need to be correct
book mark is losing in the separated pdf, if original pdf contains bookmark
if pdf contains the page labels with Roman and arabic page numbers,
like prelims part start with i, ii, iii, iv ... then again main matter part start with Arabic number 1, 2 ,3... and so on, when passing the value of arabic number for split, it is splitting the prelim part (Roman page number), ie., (start = 5, end = 10 ), but it is splitting from (start = V, End = X)
how to correct the issue in the below script
MWE
import re
import regex
import sys
import os
from iPython.ErrorLog import *
from iPython.LaTeX_QC_validation import *
#from pdfrw import PdfReader, PdfWriter
from PyPDF4 import PdfFileWriter,PdfFileReader
from pdfrw import PdfReader, PdfWriter
from pagelabels import PageLabels, PageLabelScheme
pg_info = open('pageinfo.txt','r')
pgcnt=pg_info.read()
pg_info.close()
print(pgcnt)
pgcnt= re.sub(r'<Misc="([0-9]+)" StartPage="([^">].*)">\s*<Misc="(?:[0-9]+)" EndPage="([^">].*)"/>',r'<Misc="\1" StartPage="\2" EndPage="\3"/>',pgcnt,re.I | re.S| re.M)
print(pgcnt)
pno = []
def value(rno):
r = rno.upper()
if (r == 'I'):
return 1
if (r == 'V'):
return 5
if (r == 'X'):
return 10
if (r == 'L'):
return 50
if (r == 'C'):
return 100
if (r == 'D'):
return 500
if (r == 'M'):
return 1000
return -1
def romanToDecimal(str):
res = 0
i = 0
while (i < len(str)):
# Getting value of symbol s[i]
s1 = value(str[i])
if (i + 1 < len(str)):
# Getting value of symbol s[i + 1]
s2 = value(str[i + 1])
# Comparing both values
if (s1 >= s2):
# Value of current symbol is greater
# or equal to the next symbol
res = res + s1
i = i + 1
else:
# Value of current symbol is greater
# or equal to the next symbol
res = res + s2 - s1
i = i + 2
else:
res = res + s1
i = i + 1
return res
def get_pageInfo(pginfo):
global pno
for m in re.finditer(r'<Misc="([0-9]+)" StartPage="([^">].*)" EndPage="([^">].*)"/>',pginfo,re.I):
Start_page = m.group(2)
End_page = m.group(3)
x = Start_page
y = End_page
numeric_test = x.isnumeric()
if not numeric_test:
Start_page = romanToDecimal(Start_page)
else:
Start_page = int(Start_page)
numeric_test = y.isnumeric()
if not numeric_test:
End_page = romanToDecimal(End_page)
else:
End_page = int(End_page)
print(x, Start_page, y, End_page)
pno.append((Start_page,End_page))
return pno
pgdetails = get_pageInfo(pgcnt)
print(pgdetails)
def pdf_splitter(file,start,end,fcount):
fix_start = start
#we will save new splited pdf as "nameofpdf splitted.pdf"
#example if pdf name is "abc.pdf" then it will be saved as "abc splitted.pdf"
new_file_name = str(fcount)+".pdf"
read_file = PdfFileReader(open(file,"rb")) #read pdf
new_pdf = PdfFileWriter() #create write object
start-=1
try:
with open(new_file_name,"wb") as f:
for i in range(start, end):
new_pdf.addPage(read_file.getPage(i))
new_pdf.write(f)
i+=1
f.close()
print("PDF splitted Successfully")
reader = PdfReader(new_file_name)
labels = PageLabels.from_pdf(reader)
newlabel = PageLabelScheme(startpage=0, # the index of the page of the PDF where the labels will start
style="roman lowercase", # See options in PageLabelScheme.styles()
prefix="",
firstpagenum=fix_start) # number to attribute to the first page of this index
labels.append(newlabel) # Adding our page labels to the existing ones
labels.write(reader)
writer = PdfWriter()
writer.trailer = reader
writer.write(new_file_name)
except Exception as e:
print(e)
x = 0
for i in pgdetails:
x += 1
#pvalaue = i
Start,End = i
pdf_splitter('input.pdf',Start,End,x)
sys.exit()
and the page information file (txt) will contain the below information
<Misc="1" StartPage="i">
<Misc="1" EndPage="ii"/>
<Misc="2" StartPage="ii">
<Misc="2" EndPage="ii"/>
<Misc="3" StartPage="iv">
<Misc="3" EndPage="iv"/>
<Misc="4" StartPage="v">
<Misc="4" EndPage="vi"/>
<Misc="5" StartPage="vii">
<Misc="5" EndPage="xiv"/>
<Misc="6" StartPage="xv">
<Misc="6" EndPage="xv"/>
<Misc="7" StartPage="xvi">
<Misc="7" EndPage="xviii"/>
<Misc="8" StartPage="xix">
<Misc="8" EndPage="xx"/>
Thanks in Advance

Load a apreadsheet and copy a row and pasted it in a different location

How can I copy a row for example from D51 to F51 and paste these values in the row T20 to AF20.
I know how to load a spreadsheet
workbook = load_workbook(output)
sheet = workbook.active
But I dont know how to itenarate in a loop to get this
sheet["T2"] = "=D6"
sheet["U2"] = "=E6"
sheet["V2"] = "=F6"
sheet["W2"] = "=G6"
sheet["X2"] = "=H6"
sheet["Y2"] = "=I6"
sheet["Z2"] = "=J6"
sheet["AA2"] = "=K6"
sheet["AB2"] = "=L6"
sheet["AC2"] = "=M6"
sheet["AD2"] = "=N6"
sheet["AE2"] = "=O6"
sheet["AF2"] = "=P6"
You can achieve this by using code below...
Note that the file output.xlsx is opened, updated and saved. The function num_to_excel_col is borrowed from here.
This will update columns 20 (T) onwards for the next 15 columns (all row 2) with the text as "=D6", "=E6", etc. The num_to_col function will convert the col number to equivalent excel string (for eg. 27 will be converted to AA, etc.)
import pandas as pd
import numpy as np
import openpyxl
workbook = openpyxl.load_workbook('output.xlsx')
ws = workbook.active
def num_to_excel_col(n):
if n < 1:
raise ValueError("Number must be positive")
result = ""
while True:
if n > 26:
n, r = divmod(n - 1, 26)
result = chr(r + ord('A')) + result
else:
return chr(n + ord('A') - 1) + result
outcol = 4 #Paste in col 'D'
for col in range(20,35): #col 20 is T and doing this for next 15 columns
txt = "="+num_to_excel_col(outcol)+"6"
print(txt)
ws.cell(row=2, column=col).value = txt
outcol += 1
workbook.save("output.xlsx")

Problems with calculating gcskews using python

everyone.
I have some problems with calculating gcskews in python.
My 2 major inputs are fasta file and bed file.
Bed file has columns of gn(0), gene_type(1), gene name(2), chromosome(3), strand(4), num(5), start(6).(These numbers are index numbers in python.) Then I am trying to use some functions which can calculate gcskews of sense and antisense strand from the start site of each gene. The window is 100bp and these are the functions.
import re
import sys
import os
# opening bed file
content= []
with open("gene_info.full.tsv") as new :
for line in new :
content.append(line.strip().split())
content = content[1:]
def fasta2dict(fil):
dic = {}
scaf = ''
seq = []
for line in open(fil):
if line.startswith(">") and scaf == '':
scaf = line.split(' ')[0].lstrip(">").replace("\n", "")
elif line.startswith(">") and scaf != '':
dic[scaf] = ''.join(seq)
scaf = line.split(' ')[0].lstrip(">").replace("\n", "")
seq = []
else:
seq.append(line.rstrip())
dic[scaf] = ''.join(seq)
return dic
dic_file = fasta2dict("full.fa")
# functions for gc skew
def GC_skew_up(strand, loc, seq, window = 100) : # need -1 for index
values_up = []
loc = loc - 1
if strand == "+" :
sp_up = seq[loc - window : loc]
g_up = sp_up.count('G') + sp_up.count('g')
c_up = sp_up.count('C') + sp_up.count('c')
try :
skew_up = (g_up - c_up) / float(g_up + c_up)
except ZeroDivisionError:
skew_up = 0.0
values_up.append(skew_up)
elif strand == "-" :
sp_up = seq[loc : loc + window]
g_up = sp_up.count('G') + sp_up.count('g')
c_up = sp_up.count('C') + sp_up.count('c')
try :
skew_up = (c_up - g_up) / float(g_up + c_up)
except ZeroDivisionError:
skew_up = 0.0
values_up.append(skew_up)
return values_up
def GC_skew_dw(strand, loc, seq, window = 100) :
values_dw = []
loc = loc - 1
if strand == "+" :
sp_dw = seq[loc : loc + window]
g_dw = sp_dw.count('G') + sp_dw.count('g')
c_dw = sp_dw.count('C') + sp_dw.count('c')
try :
skew_dw = (g_dw - c_dw) / float(g_dw + c_dw)
except ZeroDivisionError:
skew_dw = 0.0
values_dw.append(skew_dw)
elif strand == "-" :
sp_dw = seq[loc - window : loc]
g_dw = sp_dw.count('G') + sp_dw.count('g')
c_dw = sp_dw.count('C') + sp_dw.count('c')
try :
skew_dw = (c_dw - g_dw) / float(g_dw + c_dw)
except ZeroDivisionError:
skew_dw = 0.0
values_dw.append(skew_dw)
return values_dw
As I said, I want to calculate the gcskews for 100bp of strands from the start site of genes.
Therefore, I made codes that get the chromosome name from the bed file and get the sequence data from the Fasta file.
Then according to gene name and strand information, I expected that codes will find the correct start site and gcskew for 100bp window will be calculated.
However, when I run this code, gcskew of - strand is wrong but + strand is correct. (I got correct gcskew data and I used it.)
Gcskews are different from the correct data, but I don't know what is the problem.
Could anyone tell me what is the problem of this code?
Thanks in advance!
window = 100
gname = []
up = []
dw = []
for match in content :
seq_chr = dic_file[str(match[3])]
if match[4] == "+" :
strand = match[4]
new = int(match[6])
sen_up = GC_skew_up(strand, new, seq_chr, window = 100)
sen_dw = GC_skew_dw(strand, new, seq_chr, window = 100)
gname.append(match[2])
up.append(str(sen_up[0]))
dw.append(str(sen_dw[0]))
if match[4] == "-" :
strand = match[4]
new = int(match[6])
an_up = GC_skew_up(strand, new, seq_chr, window = 100)
an_dw = GC_skew_dw(strand, new, seq_chr, window = 100)
gname.append(match[2])
up.append(str(an_up[0]))
dw.append(str(an_dw[0]))
tot = zip(gname, up, dw)

Cant make a txt output at proper format

This is my code. The problem is that the output looks like this
2015-06-03 19:32:11.225085
{'2015-01-21-20:56:45.mp3': 1}{'negative': -2}{'2015-01-15-21:28:23.mp3': 1}
i want to be like a dictionary.... like this one below so i can read it back as a dictionary and remove the keys from the first subset sum and go on output a second on and so on until no other subset sum exists...
2015-06-03 19:32:11.225085
{'2015-01-21-20:56:45.mp3': 1, 'negative': -2, '2015-01-15-21:28:23.mp3': 1}
Any ideas?
thanx in advanced
import os, sys,re,gzip, pickle
from itertools import combinations
import json
from datetime import datetime
mp3folder = raw_input('Please copy paste the mp3s path:')
lowerin = input('Please enter your total playlist time in NEGATIVE seconds and hit ENTER:')
r = {}
drk = os.listdir(mp3folder)
drifiles = list(drk)
r = dict.fromkeys(drifiles, 0)
for key in r.keys():
print ('Please enter the duration of...')
print(key)
r[key] = input('in seconds and hit ENTER:')
r['negative'] = lowerin
d = {}
neg = 0
pos = 0
dates = datetime.now()
dates = str(dates)
f = open("dict.txt",'ab')
f.write('\n'+dates+'\n')
f.close()
for (w,v) in r.iteritems():
if v > 0: pos += v
else: neg += v
sums = [0] * (pos - neg + 1)
for (w,v) in r.iteritems():
s = sums[:]
if not s[v - neg]: s[v - neg] = (w,)
for (i, w2) in enumerate(sums):
if w2 and not s[i + v]:
s[i + v] = w2 + (w,)
sums = s
if s[-neg]:
for x in s[-neg]:
d = dict([(x, r[x])])
file('dict.txt','a'+'\n').write(repr(d))
break
f = open('dict.txt','r')
filedata = f.read()
f.close()
newdata = filedata.replace("}{",", ")
f = open('lexiko.txt','w')
f.write(newdata)
f.close()
di = eval(open("lexiko.txt").read())
print di
this will do it

python is inexplicably shortening the step size with each iteration of a sliding window analysis

I am working on a program that estimates the statistic Tajima's D in a series of sliding windows across a chromosome. The chromosome itself is also divided into a number of different regions with (hopefully) functional significance. The sliding window analysis is performed by my script on each region.
At the start of the program, I define the size of the sliding windows and the size of the steps that move from one window to the next. I import a file which contains the coordinates for each different chromosomal region, and import another file which contains all the SNP data I am working with (this is read line-by-line, as it is a large file). The program loops through the list of chromosomal locations. For each location, it generates an index of steps and windows for the analysis, partitions the SNP data into output files (corresponding with the steps), calculates key statistics for each step file, and combines these statistics to estimate Tajima's D for each window.
The program works well for small files of SNP data. It also works well for the first iteration over the first chromosomal break point. However, for large files of SNP data, the step size in the analysis is inexplicably decreased as the program iterates over each chromosomal regions. For the first chromosomal regions, the step size is 2500 nucleotides (this is what it is suppose to be). For the second chromosome segment, however, the step size is 1966, and for the third it is 732.
If anyone has any suggestions at to why this might be the case, please let me know. I am especially stumped as this program seems to work size for small files but not for larger ones.
My code is below:
import sys
import math
import fileinput
import shlex
import string
windowSize = int(500)
stepSize = int(250)
n = int(50) #number of individuals in the anaysis
SNP_file = open("SNPs-1.txt",'r')
SNP_file.readline()
breakpoints = open("C:/Users/gwilymh/Desktop/Python/Breakpoint coordinates.txt", 'r')
breakpoints = list(breakpoints)
numSegments = len(breakpoints)
# Open a file to store the Tajima's D results:
outputFile = open("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/Tajima's D estimates.txt", 'a')
outputFile.write(str("segmentNumber\tchrSegmentName\tsegmentStart\tsegmentStop\twindowNumber\twindowStart\twindowStop\tWindowSize\tnSNPs\tS\tD\n"))
#Calculating parameters a1, a2, b1, b2, c1 and c2
numPairwiseComparisons=n*((n-1)/2)
b1=(n+1)/(3*(n-1))
b2=(2*(n**2+n+3))/(9*n*(n-1))
num=list(range(1,n)) # n-1 values as a list
i=0
a1=0
for i in num:
a1=a1+(1/i)
i=i+1
j=0
a2=0
for j in num:
a2=a2+(1/j**2)
j=j+1
c1=(b1/a1)-(1/a1**2)
c2=(1/(a1**2+a2))*(b2 - ((n+2)/(a1*n))+ (a2/a1**2) )
counter6=0
#For each segment, assign a number and identify the start and stop coodrinates and the segment name
for counter6 in range(counter6,numSegments):
segment = shlex.shlex(breakpoints[counter6],posix = True)
segment.whitespace += '\t'
segment.whitespace_split = True
segment = list(segment)
segmentName = segment[0]
segmentNumber = int(counter6+1)
segmentStartPos = int(segment[1])
segmentStopPos = int(segment[2])
outputFile1 = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'a')
#Make output files to index the lcoations of each window within each segment
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
k = segmentStartPos - 1
windowNumber = 0
while (k+1) <=segmentStopPos:
windowStart = k+1
windowNumber = windowNumber+1
windowStop = k + windowSize
if windowStop > segmentStopPos:
windowStop = segmentStopPos
windowFileIndex.write(("%s\t%s\t%s\n")%(str(windowNumber),str(windowStart),str(windowStop)))
k=k+stepSize
windowFileIndex.close()
# Make output files for each step to export the corresponding SNP data into + an index of these output files
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
i = segmentStartPos-1
stepNumber = 0
while (i+1) <= segmentStopPos:
stepStart = i+1
stepNumber = stepNumber+1
stepStop = i+stepSize
if stepStop > segmentStopPos:
stepStop = segmentStopPos
stepFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepNumber))), 'a')
stepFileIndex.write(("%s\t%s\t%s\n")%(str(stepNumber),str(stepStart),str(stepStop)))
i=i+stepSize
stepFile.close()
stepFileIndex.close()
# Open the index file for each step in current chromosomal segment
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepFileIndex = list(stepFileIndex)
numSteps = len(stepFileIndex)
while 1:
currentSNP = SNP_file.readline()
if not currentSNP: break
currentSNP = shlex.shlex(currentSNP,posix=True)
currentSNP.whitespace += '\t'
currentSNP.whitespace_split = True
currentSNP = list(currentSNP)
SNPlocation = int(currentSNP[0])
if SNPlocation > segmentStopPos:break
stepIndexBin = int(((SNPlocation-segmentStartPos-1)/stepSize)+1)
#print(SNPlocation, stepIndexBin)
writeFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepIndexBin))), 'a')
writeFile.write((("%s\n")%(str(currentSNP[:]))))
writeFile.close()
counter3=0
for counter3 in range(counter3,numSteps):
# open up each step in the list of steps across the chromosomal segment:
L=shlex.shlex(stepFileIndex[counter3],posix=True)
L.whitespace += '\t'
L.whitespace_split = True
L=list(L)
#print(L)
stepNumber = int(L[0])
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
#Now open the file of SNPs corresponding with the window in question and convert it into a list:
currentStepFile = open(("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(counter3+1)),'r')
currentStepFile = list(currentStepFile)
nSNPsInCurrentStepFile = len(currentStepFile)
print("number of SNPs in this step is:", nSNPsInCurrentStepFile)
#print(currentStepFile)
if nSNPsInCurrentStepFile == 0:
mismatchesPerSiteList = [0]
else:
# For each line of the file, estimate the per site parameters relevent to Tajima's D
mismatchesPerSiteList = list()
counter4=0
for counter4 in range(counter4,nSNPsInCurrentStepFile):
CountA=0
CountG=0
CountC=0
CountT=0
x = counter4
lineOfData = currentStepFile[x]
counter5=0
for counter5 in range(0,len(lineOfData)):
if lineOfData[counter5]==("A" or "a"): CountA=CountA+1
elif lineOfData[counter5]==("G" or "g"): CountG=CountG+1
elif lineOfData[counter5]==("C" or "c"): CountC=CountC+1
elif lineOfData[counter5]==("T" or "t"): CountT=CountT+1
else: continue
AxG=CountA*CountG
AxC=CountA*CountC
AxT=CountA*CountT
GxC=CountG*CountC
GxT=CountG*CountT
CxT=CountC*CountT
NumberMismatches = AxG+AxC+AxT+GxC+GxT+CxT
mismatchesPerSiteList=mismatchesPerSiteList+[NumberMismatches]
outputFile1.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber, segmentName,stepNumber,stepStart,stepStop,stepSize,nSNPsInCurrentStepFile,sum(mismatchesPerSiteList))))
outputFile1.close()
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
windowFileIndex = list(windowFileIndex)
numberOfWindows = len(windowFileIndex)
stepData = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepData = list(stepData)
numberOfSteps = len(stepData)
counter = 0
for counter in range(counter, numberOfWindows):
window = shlex.shlex(windowFileIndex[counter], posix = True)
window.whitespace += "\t"
window.whitespace_split = True
window = list(window)
windowNumber = int(window[0])
firstCoordinateInCurrentWindow = int(window[1])
lastCoordinateInCurrentWindow = int(window[2])
currentWindowSize = lastCoordinateInCurrentWindow - firstCoordinateInCurrentWindow +1
nSNPsInThisWindow = 0
nMismatchesInThisWindow = 0
counter2 = 0
for counter2 in range(counter2,numberOfSteps):
step = shlex.shlex(stepData[counter2], posix=True)
step.whitespace += "\t"
step.whitespace_split = True
step = list(step)
lastCoordinateInCurrentStep = int(step[4])
if lastCoordinateInCurrentStep < firstCoordinateInCurrentWindow: continue
elif lastCoordinateInCurrentStep <= lastCoordinateInCurrentWindow:
nSNPsInThisStep = int(step[6])
nMismatchesInThisStep = int(step[7])
nSNPsInThisWindow = nSNPsInThisWindow + nSNPsInThisStep
nMismatchesInThisWindow = nMismatchesInThisWindow + nMismatchesInThisStep
elif lastCoordinateInCurrentStep > lastCoordinateInCurrentWindow: break
if nSNPsInThisWindow ==0 :
S = 0
D = 0
else:
S = nSNPsInThisWindow/currentWindowSize
pi = nMismatchesInThisWindow/(currentWindowSize*numPairwiseComparisons)
print(nSNPsInThisWindow,nMismatchesInThisWindow,currentWindowSize,S,pi)
D = (pi-(S/a1))/math.sqrt(c1*S + c2*S*(S-1/currentWindowSize))
outputFile.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber,segmentName,segmentStartPos,segmentStopPos,windowNumber,firstCoordinateInCurrentWindow,lastCoordinateInCurrentWindow,currentWindowSize,nSNPsInThisWindow,S,D)))
A quick search shows that you do change your stepSize on line 110:
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
stepStop and stepStart appear to depend on your files' contents, so we can't debug it further.

Categories

Resources