Related
I am trying to create a script to split the pdf pages for the given page numbers/labels from the pdf, the script are producing the split pdf correctly, but few information are losing, and need to be correct
book mark is losing in the separated pdf, if original pdf contains bookmark
if pdf contains the page labels with Roman and arabic page numbers,
like prelims part start with i, ii, iii, iv ... then again main matter part start with Arabic number 1, 2 ,3... and so on, when passing the value of arabic number for split, it is splitting the prelim part (Roman page number), ie., (start = 5, end = 10 ), but it is splitting from (start = V, End = X)
how to correct the issue in the below script
MWE
import re
import regex
import sys
import os
from iPython.ErrorLog import *
from iPython.LaTeX_QC_validation import *
#from pdfrw import PdfReader, PdfWriter
from PyPDF4 import PdfFileWriter,PdfFileReader
from pdfrw import PdfReader, PdfWriter
from pagelabels import PageLabels, PageLabelScheme
pg_info = open('pageinfo.txt','r')
pgcnt=pg_info.read()
pg_info.close()
print(pgcnt)
pgcnt= re.sub(r'<Misc="([0-9]+)" StartPage="([^">].*)">\s*<Misc="(?:[0-9]+)" EndPage="([^">].*)"/>',r'<Misc="\1" StartPage="\2" EndPage="\3"/>',pgcnt,re.I | re.S| re.M)
print(pgcnt)
pno = []
def value(rno):
r = rno.upper()
if (r == 'I'):
return 1
if (r == 'V'):
return 5
if (r == 'X'):
return 10
if (r == 'L'):
return 50
if (r == 'C'):
return 100
if (r == 'D'):
return 500
if (r == 'M'):
return 1000
return -1
def romanToDecimal(str):
res = 0
i = 0
while (i < len(str)):
# Getting value of symbol s[i]
s1 = value(str[i])
if (i + 1 < len(str)):
# Getting value of symbol s[i + 1]
s2 = value(str[i + 1])
# Comparing both values
if (s1 >= s2):
# Value of current symbol is greater
# or equal to the next symbol
res = res + s1
i = i + 1
else:
# Value of current symbol is greater
# or equal to the next symbol
res = res + s2 - s1
i = i + 2
else:
res = res + s1
i = i + 1
return res
def get_pageInfo(pginfo):
global pno
for m in re.finditer(r'<Misc="([0-9]+)" StartPage="([^">].*)" EndPage="([^">].*)"/>',pginfo,re.I):
Start_page = m.group(2)
End_page = m.group(3)
x = Start_page
y = End_page
numeric_test = x.isnumeric()
if not numeric_test:
Start_page = romanToDecimal(Start_page)
else:
Start_page = int(Start_page)
numeric_test = y.isnumeric()
if not numeric_test:
End_page = romanToDecimal(End_page)
else:
End_page = int(End_page)
print(x, Start_page, y, End_page)
pno.append((Start_page,End_page))
return pno
pgdetails = get_pageInfo(pgcnt)
print(pgdetails)
def pdf_splitter(file,start,end,fcount):
fix_start = start
#we will save new splited pdf as "nameofpdf splitted.pdf"
#example if pdf name is "abc.pdf" then it will be saved as "abc splitted.pdf"
new_file_name = str(fcount)+".pdf"
read_file = PdfFileReader(open(file,"rb")) #read pdf
new_pdf = PdfFileWriter() #create write object
start-=1
try:
with open(new_file_name,"wb") as f:
for i in range(start, end):
new_pdf.addPage(read_file.getPage(i))
new_pdf.write(f)
i+=1
f.close()
print("PDF splitted Successfully")
reader = PdfReader(new_file_name)
labels = PageLabels.from_pdf(reader)
newlabel = PageLabelScheme(startpage=0, # the index of the page of the PDF where the labels will start
style="roman lowercase", # See options in PageLabelScheme.styles()
prefix="",
firstpagenum=fix_start) # number to attribute to the first page of this index
labels.append(newlabel) # Adding our page labels to the existing ones
labels.write(reader)
writer = PdfWriter()
writer.trailer = reader
writer.write(new_file_name)
except Exception as e:
print(e)
x = 0
for i in pgdetails:
x += 1
#pvalaue = i
Start,End = i
pdf_splitter('input.pdf',Start,End,x)
sys.exit()
and the page information file (txt) will contain the below information
<Misc="1" StartPage="i">
<Misc="1" EndPage="ii"/>
<Misc="2" StartPage="ii">
<Misc="2" EndPage="ii"/>
<Misc="3" StartPage="iv">
<Misc="3" EndPage="iv"/>
<Misc="4" StartPage="v">
<Misc="4" EndPage="vi"/>
<Misc="5" StartPage="vii">
<Misc="5" EndPage="xiv"/>
<Misc="6" StartPage="xv">
<Misc="6" EndPage="xv"/>
<Misc="7" StartPage="xvi">
<Misc="7" EndPage="xviii"/>
<Misc="8" StartPage="xix">
<Misc="8" EndPage="xx"/>
Thanks in Advance
This is what I have gotten while trying to run step 3 of this source code:
https://github.com/carykh/lazykh
Error:
Traceback (most recent call last):
File "C:\Users\User\Desktop\lazykh-main\code\scheduler.py", line 93, in
OS_nextIndex = originalScript.index(wordString,OS_IndexAt)+len(wordString)
ValueError: substring not found
Code:
import argparse
import os.path
import json
import numpy as np
import random
def addPhoneme(p, t):
global prevPhoneme
global f
if p != prevPhoneme:
strings[4] += (str.format('{0:.3f}', t)+",phoneme,"+p+"\n")
prevPhoneme = p
def pickNewPose(t):
global pose
global prevPose
global POSE_COUNT
global prevPhoneme
global f
newPose = -1
while newPose == -1 or newPose == pose or newPose == prevPose:
newPose = int(random.random()*POSE_COUNT)
prevPose = pose
pose = newPose
strings[3] += (str.format('{0:.3f}', t)+",pose,"+str(pose)+"\n")
prevPhoneme = "na"
strings = [""]*5
POSE_COUNT = 5
emotions = {}
emotions["explain"] = 0
emotions["happy"] = 1
emotions["sad"] = 2
emotions["angry"] = 3
emotions["confused"] = 4
emotions["rq"] = 5
mouthList = [["aa","a"],["ae","a"],["ah","a"],["ao","a"],["aw","au"],
["ay","ay"],["b","m"],["ch","t"],["d","t"],["dh","t"],
["eh","a"],["er","u"],["ey","ay"],["f","f"],["g","t"],
["hh","y"],["ih","a"],["iy","ay"],["jh","t"],["k","t"],
["l","y"],["m","m"],["n","t"],["ng","t"],["ow","au"],
["oy","ua"],["p","m"],["r","u"],["s","t"],["sh","t"],
["t","t"],["th","t"],["uh","u"],["uw","u"],["v","f"],
["w","u"],["y","y"],["z","t"],["zh","t"],
["oov","m"]] # For unknown phonemes, the stick figure will just have a closed mouth ("mmm")
mouths = {}
for x in mouthList:
mouths[x[0]] = x[1]
ENDING_PHONEME = "m"
STOPPERS = [",",";",".",":","!","?"]
parser = argparse.ArgumentParser(description='blah')
parser.add_argument('--input_file', type=str, help='the script')
args = parser.parse_args()
INPUT_FILE = args.input_file
f = open(INPUT_FILE+".txt","r+")
originalScript = f.read()
f.close()
f = open(INPUT_FILE+".json","r+")
fileData = f.read()
f.close()
data = json.loads(fileData)
WORD_COUNT = len(data['words'])
pose = -1
prevPose = -1
prevPhoneme = "na"
emotion = "0"
pararaph = 0
image = 0
OS_IndexAt = 0
pickNewPose(0)
strings[1] += "0,emotion,0\n"
strings[0] += "0,paragraph,0\n"
strings[2] += "0,image,0\n"
strings[4] += "0,phoneme,m\n"
for i in range(WORD_COUNT):
word = data['words'][i]
if "start" not in word:
continue
wordString = word["word"]
timeStart = word["start"]
OS_nextIndex = originalScript.index(wordString,OS_IndexAt)+len(wordString)
if "<" in originalScript[OS_IndexAt:]:
tagStart = originalScript.index("<",OS_IndexAt)
tagEnd = originalScript.index(">",OS_IndexAt)
if OS_nextIndex > tagStart and tagEnd >= OS_nextIndex:
OS_nextIndex = originalScript.index(wordString,tagEnd)+len(wordString)
nextDigest = originalScript[OS_IndexAt:OS_nextIndex]
if "\n" in nextDigest and data['words'][i-1]['case'] != 'not-found-in-audio' and (prevPhoneme == "a" or prevPhoneme == "f" or prevPhoneme == "u" or prevPhoneme == "y"):
addPhoneme("m", data['words'][i-1]["end"])
"""print(wordString)
print(str(OS_IndexAt)+", "+str(OS_nextIndex))
print(nextDigest)
print("")"""
pickedPose = False
for stopper in STOPPERS:
if stopper in nextDigest:
pickNewPose(timeStart)
pickedPose = True
if "<" in nextDigest:
leftIndex = nextDigest.index("<")+1
rightIndex = nextDigest.index(">")
emotion = emotions[nextDigest[leftIndex:rightIndex]]
strings[1] += (str.format('{0:.3f}', timeStart)+",emotion,"+str(emotion)+"\n")
prevPhoneme = "na"
if "\n\n" in nextDigest:
pararaph += 1
image += 1 # The line of the script advances 2 lines whenever we hit a /n/n.
strings[0] += (str.format('{0:.3f}', timeStart)+",paragraph,"+str(pararaph)+"\n")
prevPhoneme = "na"
if "\n" in nextDigest:
image += 1
strings[2] += (str.format('{0:.3f}', timeStart)+",image,"+str(image)+"\n")
prevPhoneme = "na"
if not pickedPose:
pickNewPose(timeStart) # A new image means we also need to have a new pose
phones = word["phones"]
timeAt = timeStart
for phone in phones:
timeAt += phone["duration"]
phoneString = phone["phone"]
if phoneString == "sil":
truePhone = "m"
else:
truePhone = mouths[phoneString[:phoneString.index("_")]]
if len(truePhone) == 2:
addPhoneme(truePhone[0], timeAt-phone["duration"])
addPhoneme(truePhone[1], timeAt-phone["duration"]*0.5)
else:
addPhoneme(truePhone, timeAt-phone["duration"])
OS_IndexAt = OS_nextIndex
f = open(INPUT_FILE+"_schedule.csv","w+")
for i in range(len(strings)):
f.write(strings[i])
if i < len(strings)-1:
f.write("SECTION\n")
f.flush()
f.close()
print(f"Done creating schedule for {INPUT_FILE}.")
The
ValueError: substring not found
occurs when you try to find the index of a substring in a string which does not contain it in the specified (or default) section, using the index function.
The index method takes 3 parameters:
value
start
end
and it searches for the value between start and end.
So, the error occurred because the substring was not found in the section where it was searched for. The line of
OS_nextIndex = originalScript.index(wordString,tagEnd)+len(wordString)
searches for wordString, starting from tagEnd and searches for the likes of
<span>yourwordstring</span>
, but in your case it was not found. You can do one of the following to solve the issue:
you can fix your input if it should always have a match for the search
you can handle the error when the index throws the error
you can use find instead, see https://bobbyhadz.com/blog/python-valueerror-substring-not-found
Note that find also has three parameters, as you can read from https://www.w3schools.com/python/ref_string_find.asp
I'm trying to learn search algorithms in order to prepare my master thesis, so I have a TSP problem in which I want to find the best and minimal route to visite all the states , I'm using a .txt file named cities__coordinates.txt that contains the coordinates for every state, so to read the data I've found this source code that has a class to read the data you'll find it here:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import copy
import re
import math
class Data():
'''
the format of solomon dataset
'''
def __init__(self):
self.customerNum = 0 # the number of customers
self.nodeNum = 0 # the sum of customers and depots
self.vehicleNum = 0
self.capacity = 0
self.cor_X = []
self.cor_Y = []
self.demand = []
self.readyTime = []
self.dueTime = []
self.serviceTime = []
self.disMatrix = {}
def read_data(self, path, customerNum, depotNum):
'''
function to read solomom data from .txt files, notice that it must be solomon dataset
INPUT
# data : class Data
# path : Data path
# customerNum : the number of customer
OutPut : none
'''
self.customerNum = customerNum
self.nodeNum = customerNum + depotNum
f = open('cities__coordinates.txt', 'r')
lines = f.readlines()
count = 0
for line in lines:
count = count + 1
if(count == 5):
line = line[:-1].strip()
str = re.split(r" +", line)
self.vehicleNum = float(str[0])
self.capacity = float(str[1])
elif(count >= 10 and count <= 10 + customerNum):
line = line[:-1]
str = re.split(r" +", line)
self.cor_X.append(float(str[2]))
self.cor_Y.append(float(str[3]))
self.demand.append(float(str[4]))
self.readyTime.append(float(str[5]))
self.dueTime.append(float(str[6]))
self.serviceTime.append(float(str[7]))
# compute the distance matrix
self.disMatrix = {}
for i in range(0, self.nodeNum):
dis_temp={}
for j in range(0, self.nodeNum):
dis_temp[j] = int(math.hypot(self.cor_X[i] - self.cor_X[j],self.cor_Y[i] - self.cor_Y[j]))
self.disMatrix[i] = dis_temp
def plot_nodes(self):
'''
Description: function to plot
'''
Graph = nx.DiGraph()
nodes_name = [str(x) for x in list(range(self.nodeNum))]
Graph.add_nodes_from(nodes_name)
cor_xy = np.array([self.cor_X,self.cor_Y]).T.astype(int)
pos_location = {nodes_name[i]:x for i,x in enumerate(cor_xy)}
nodes_color_dict = ['r'] + ['gray'] * (self.nodeNum-1)
nx.draw_networkx(Graph,pos_location,node_size=200,node_color=nodes_color_dict,labels=None)
plt.show(Graph)
def plot_route(self,route,color='k'):
Graph = nx.DiGraph()
nodes_name = [0]
cor_xy=[[self.cor_X[0] , self.cor_Y[0]]]
edge = []
edges = [[0,route[0]]]
for i in route :
nodes_name.append(i)
cor_xy.append([self.cor_X[i] , self.cor_Y[i]])
edge.append(i)
if len(edge) == 2 :
edges.append(copy.deepcopy(edge))
edge.pop(0)
edges.append([route[-1],0])
Graph.add_nodes_from(nodes_name)
Graph.add_edges_from(edges)
pos_location = {nodes_name[i]:x for i,x in enumerate(cor_xy)}
nodes_color_dict = ['r'] + ['gray'] * (len(route))
nx.draw_networkx(Graph,pos_location,node_size=200,node_color=nodes_color_dict,edge_color=color, labels=None)
plt.show(Graph)
so in read_data function I've changed the path to my .txt file , and for the code which will calculate all the distance and took the tabu search and all the staffs, here it is the code:
from itertools import combinations
import os,sys,copy
import numpy as np
import time
from Datareader import Data
import matplotlib.pyplot as plt
class Tabu():
def __init__(self,disMatrix,max_iters=200,maxTabuSize=20):
"""parameters definition"""
self.disMatrix = disMatrix
self.maxTabuSize = maxTabuSize
self.max_iters = max_iters
self.tabu_list=[]
def get_route_distance(self,route):
'''
Description: function to calculate total distance of a route. evaluate function.
parameters: route : list
return : total distance : folat
'''
routes = [0] + route + [0] # add the start and end point
total_distance = 0
for i,n in enumerate(routes):
if i != 0 :
total_distance = total_distance + self.disMatrix[last_pos][n]
last_pos = n
return total_distance
def exchange(self,s1,s2,arr):
"""
function to Swap positions of two elements in an arr
Args: int,int,list
s1 : target 1
s2 : target 2
arr : target array
Ouput: list
current_list : target array
"""
current_list = copy.deepcopy(arr)
index1 , index2 = current_list.index(s1) , current_list.index(s2) # get index
current_list[index1], current_list[index2]= arr[index2] , arr[index1]
return current_list
def generate_initial_solution(self,num=10,mode='greedy'):
"""
function to get the initial solution,there two different way to generate route_init.
Args:
num : int
the number of points
mode : string
"greedy" : advance step by choosing optimal one
"random" : randomly generate a series number
Ouput: list
s_init : initial solution route_init
"""
if mode == 'greedy':
route_init=[0]
for i in range(num):
best_distance = 10000000
for j in range(num+1):
if self.disMatrix[i][j] < best_distance and j not in route_init:
best_distance = self.disMatrix[i][j]
best_candidate = j
route_init.append(best_candidate)
route_init.remove(0)
if mode == 'random':
route_init = np.arange(1,num+1) #init solution from 1 to num
np.random.shuffle(route_init) #shuffle the list randomly
return list(route_init)
def tabu_search(self,s_init):
"""tabu search"""
s_best = s_init
bestCandidate = copy.deepcopy(s_best)
routes , temp_tabu = [] , [] # init
routes.append(s_best)
while(self.max_iters):
self.max_iters -= 1 # Number of iterations
neighbors = copy.deepcopy(s_best)
for s in combinations(neighbors, 2):
sCandidate = self.exchange(s[0],s[1],neighbors) # exchange number to generate candidates
if s not in self.tabu_list and self.get_route_distance(sCandidate) < self.get_route_distance(bestCandidate):
bestCandidate = sCandidate
temp_tabu = s
if self.get_route_distance(bestCandidate) < self.get_route_distance(s_best): # record the best solution
s_best = bestCandidate
if temp_tabu not in self.tabu_list:
self.tabu_list.append(temp_tabu)
if len(self.tabu_list) > self.maxTabuSize :
self.tabu_list.pop(0)
routes.append(bestCandidate)
return s_best, routes
if __name__ == "__main__":
data = Data()
data.read_data(path='cities__coordinates.txt',customerNum=100,depotNum=1) # change the path
""" Tabu :
disMatrix : the distance matrix from 0 to X , 0 represernt starting and stopping point。
for example: disMatrix = [[0,3,4,...
1,0,5,...
3,5,0,...]]
that means the distance from 0 to 0 is 0, from 0 to 1 is 3,... from 1 to 3 is 5....
max_iters : maximum iterations
maxTabuSize : maximum iterations
"""
tsp = Tabu(disMatrix=data.disMatrix,max_iters=10,maxTabuSize=10)
# two different way to generate initial solution
# num : the number of points
s_init = tsp.generate_initial_solution(num=10,mode='greedy') # mode = "greedy" or "random"
print('init route : ' , s_init)
print('init distance : ' , tsp.get_route_distance(s_init))
start = time.time()
best_route , routes = tsp.tabu_search(s_init) # tabu search
end = time.time()
print('best route : ' , best_route)
print('best best_distance : ' , tsp.get_route_distance(best_route))
print('the time cost : ',end - start )
# plot the result changes with iterations
results=[]
for i in routes:
results.append(tsp.get_route_distance(i))
plt.plot(np.arange(len(results)) , results)
plt.show()
# plot the route
data.plot_route(best_route)
when I execute it, it takes a little time and then it shows me this error :
Traceback (most recent call last):
File "C:/Users/malle/OneDrive/Desktop/TS.py", line 100, in <module>
data.read_data(path='cities__coordinates.txt',customerNum=100,depotNum=1) # change the path
File "C:/Users/malle/OneDrive/Desktop\Datareader.py", line 49, in read_data
self.cor_X.append(float(str[2]))
IndexError: list index out of range
anyone can help to resolve this problem please ?
i'm junho who is student at unis.
i tried to study this code for image preprocessing.
and i tried to change face recognization to label recongization to crop properly.
but Error occure like below code.
it seems like list problem. but i couldn't figure it out even though i tried to.
Is there anyone can know this problem??
this is code:
global_label_index = 0
global_label_number = [0 for x in range(1000)]
global_image_hash = []
def skew_angle(self):
return None
# read files in src_dir and crop label only and write it into des_dir
def crop_labels_dir(self,src_dir,des_dir,maxnum):
# training data will be written in $des_dir/training
# validation data will be written in $des_dir/validate
des_dir_training = os.path.join(des_dir,'training')
des_dir_validate = os.path.join(des_dir,'validate')
if not os.path.exists(des_dir):
os.makedirs(des_dir)
if not os.path.exists(des_dir_training):
os.makedirs(des_dir_training)
if not os.path.exists(des_dir_validate):
os.makedirs(des_dir_validate)
path,folder_name = os.path.split(src_dir)
label = folder_name
# create label file. it will contains file location
# and label for each file
training_file = open(des_dir+'/training_file.txt','a')
validate_file = open(des_dir+'/validate_file.txt','a')
files = self.getfiles(src_dir)
global global_label_index
cnt = 0
num = 0 # number of training data
for f in files:
rect = self.detect_label(f)
# replace ',' in file name to '.'
# because ',' is used for deliminator of image file name and its label
des_file_name = os.path.basename(f)
des_file_name = des_file_name.replace(',','_')
if rect != None:
# 70% of file will be stored in training data directory
if(cnt < 8):
des_file = os.path.join(des_dir_training,des_file_name)
# if we already have duplicated image, crop_face will return None
if self.crop_face(f, rect, des_file ) != None:
training_file.write("%s,%s,%d\n"%(des_file,label,global_label_index) )
num = num + 1
global_label_number[global_label_index] = num
cnt = cnt+1
if (num>=maxnum):
break
# 30% of files will be stored in validation data directory
else: # for validation data
des_file = os.path.join(des_dir_validate,des_file_name)
if self.crop_face(f, rect, des_file) != None:
validate_file.write("%s,%s,%d\n"%(des_file,label,global_label_index) )
cnt = cnt+1
if(cnt>9):
cnt = 0
#increase index for image label
global_label_index = global_label_index + 1
print('## label %s has %s of training data' %(global_label_index,num))
training_file.close()
validate_file.close()
def getdirs(self,dir):
dirs = []
for f in os.listdir(dir):
f=os.path.join(dir,f)
if os.path.isdir(f):
if not f.startswith('.'):
dirs.append(f)
return dirs
def crop_labels_rootdir(self,src_dir,des_dir,maxnum):
# crop file from sub-directoris in src_dir
dirs = self.getdirs(src_dir)
#list sub directory
for d in dirs:
print('[INFO] : ### Starting cropping in directory %s ###'%d)
self.crop_labels_dir(d, des_dir,maxnum)
#loop and run face crop
global global_label_number
print("number of datas per label ", global_label_number)
def main(argv):
srcdir= argv[1]
desdir = argv[2]
maxnum = int(argv[3])
detector = LabelDetector()
detector.crop_labels_rootdir(srcdir, desdir,maxnum)
if __name__ == "__main__":
main(sys.argv)
error code is this :
IndexError Traceback (most recent call last)
in ()
17
18 if name == "main":
---> 19 main(sys.argv)
<ipython-input-32-ab2ad6c296c9> in main(argv)
6 srcdir= argv[1]
7 desdir = argv[2]
----> 8 maxnum = int(argv[3])
9
10 detector = LabelDetector()
IndexError: list index out of range
I am working on a program that estimates the statistic Tajima's D in a series of sliding windows across a chromosome. The chromosome itself is also divided into a number of different regions with (hopefully) functional significance. The sliding window analysis is performed by my script on each region.
At the start of the program, I define the size of the sliding windows and the size of the steps that move from one window to the next. I import a file which contains the coordinates for each different chromosomal region, and import another file which contains all the SNP data I am working with (this is read line-by-line, as it is a large file). The program loops through the list of chromosomal locations. For each location, it generates an index of steps and windows for the analysis, partitions the SNP data into output files (corresponding with the steps), calculates key statistics for each step file, and combines these statistics to estimate Tajima's D for each window.
The program works well for small files of SNP data. It also works well for the first iteration over the first chromosomal break point. However, for large files of SNP data, the step size in the analysis is inexplicably decreased as the program iterates over each chromosomal regions. For the first chromosomal regions, the step size is 2500 nucleotides (this is what it is suppose to be). For the second chromosome segment, however, the step size is 1966, and for the third it is 732.
If anyone has any suggestions at to why this might be the case, please let me know. I am especially stumped as this program seems to work size for small files but not for larger ones.
My code is below:
import sys
import math
import fileinput
import shlex
import string
windowSize = int(500)
stepSize = int(250)
n = int(50) #number of individuals in the anaysis
SNP_file = open("SNPs-1.txt",'r')
SNP_file.readline()
breakpoints = open("C:/Users/gwilymh/Desktop/Python/Breakpoint coordinates.txt", 'r')
breakpoints = list(breakpoints)
numSegments = len(breakpoints)
# Open a file to store the Tajima's D results:
outputFile = open("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/Tajima's D estimates.txt", 'a')
outputFile.write(str("segmentNumber\tchrSegmentName\tsegmentStart\tsegmentStop\twindowNumber\twindowStart\twindowStop\tWindowSize\tnSNPs\tS\tD\n"))
#Calculating parameters a1, a2, b1, b2, c1 and c2
numPairwiseComparisons=n*((n-1)/2)
b1=(n+1)/(3*(n-1))
b2=(2*(n**2+n+3))/(9*n*(n-1))
num=list(range(1,n)) # n-1 values as a list
i=0
a1=0
for i in num:
a1=a1+(1/i)
i=i+1
j=0
a2=0
for j in num:
a2=a2+(1/j**2)
j=j+1
c1=(b1/a1)-(1/a1**2)
c2=(1/(a1**2+a2))*(b2 - ((n+2)/(a1*n))+ (a2/a1**2) )
counter6=0
#For each segment, assign a number and identify the start and stop coodrinates and the segment name
for counter6 in range(counter6,numSegments):
segment = shlex.shlex(breakpoints[counter6],posix = True)
segment.whitespace += '\t'
segment.whitespace_split = True
segment = list(segment)
segmentName = segment[0]
segmentNumber = int(counter6+1)
segmentStartPos = int(segment[1])
segmentStopPos = int(segment[2])
outputFile1 = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'a')
#Make output files to index the lcoations of each window within each segment
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
k = segmentStartPos - 1
windowNumber = 0
while (k+1) <=segmentStopPos:
windowStart = k+1
windowNumber = windowNumber+1
windowStop = k + windowSize
if windowStop > segmentStopPos:
windowStop = segmentStopPos
windowFileIndex.write(("%s\t%s\t%s\n")%(str(windowNumber),str(windowStart),str(windowStop)))
k=k+stepSize
windowFileIndex.close()
# Make output files for each step to export the corresponding SNP data into + an index of these output files
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
i = segmentStartPos-1
stepNumber = 0
while (i+1) <= segmentStopPos:
stepStart = i+1
stepNumber = stepNumber+1
stepStop = i+stepSize
if stepStop > segmentStopPos:
stepStop = segmentStopPos
stepFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepNumber))), 'a')
stepFileIndex.write(("%s\t%s\t%s\n")%(str(stepNumber),str(stepStart),str(stepStop)))
i=i+stepSize
stepFile.close()
stepFileIndex.close()
# Open the index file for each step in current chromosomal segment
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepFileIndex = list(stepFileIndex)
numSteps = len(stepFileIndex)
while 1:
currentSNP = SNP_file.readline()
if not currentSNP: break
currentSNP = shlex.shlex(currentSNP,posix=True)
currentSNP.whitespace += '\t'
currentSNP.whitespace_split = True
currentSNP = list(currentSNP)
SNPlocation = int(currentSNP[0])
if SNPlocation > segmentStopPos:break
stepIndexBin = int(((SNPlocation-segmentStartPos-1)/stepSize)+1)
#print(SNPlocation, stepIndexBin)
writeFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepIndexBin))), 'a')
writeFile.write((("%s\n")%(str(currentSNP[:]))))
writeFile.close()
counter3=0
for counter3 in range(counter3,numSteps):
# open up each step in the list of steps across the chromosomal segment:
L=shlex.shlex(stepFileIndex[counter3],posix=True)
L.whitespace += '\t'
L.whitespace_split = True
L=list(L)
#print(L)
stepNumber = int(L[0])
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
#Now open the file of SNPs corresponding with the window in question and convert it into a list:
currentStepFile = open(("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(counter3+1)),'r')
currentStepFile = list(currentStepFile)
nSNPsInCurrentStepFile = len(currentStepFile)
print("number of SNPs in this step is:", nSNPsInCurrentStepFile)
#print(currentStepFile)
if nSNPsInCurrentStepFile == 0:
mismatchesPerSiteList = [0]
else:
# For each line of the file, estimate the per site parameters relevent to Tajima's D
mismatchesPerSiteList = list()
counter4=0
for counter4 in range(counter4,nSNPsInCurrentStepFile):
CountA=0
CountG=0
CountC=0
CountT=0
x = counter4
lineOfData = currentStepFile[x]
counter5=0
for counter5 in range(0,len(lineOfData)):
if lineOfData[counter5]==("A" or "a"): CountA=CountA+1
elif lineOfData[counter5]==("G" or "g"): CountG=CountG+1
elif lineOfData[counter5]==("C" or "c"): CountC=CountC+1
elif lineOfData[counter5]==("T" or "t"): CountT=CountT+1
else: continue
AxG=CountA*CountG
AxC=CountA*CountC
AxT=CountA*CountT
GxC=CountG*CountC
GxT=CountG*CountT
CxT=CountC*CountT
NumberMismatches = AxG+AxC+AxT+GxC+GxT+CxT
mismatchesPerSiteList=mismatchesPerSiteList+[NumberMismatches]
outputFile1.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber, segmentName,stepNumber,stepStart,stepStop,stepSize,nSNPsInCurrentStepFile,sum(mismatchesPerSiteList))))
outputFile1.close()
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
windowFileIndex = list(windowFileIndex)
numberOfWindows = len(windowFileIndex)
stepData = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepData = list(stepData)
numberOfSteps = len(stepData)
counter = 0
for counter in range(counter, numberOfWindows):
window = shlex.shlex(windowFileIndex[counter], posix = True)
window.whitespace += "\t"
window.whitespace_split = True
window = list(window)
windowNumber = int(window[0])
firstCoordinateInCurrentWindow = int(window[1])
lastCoordinateInCurrentWindow = int(window[2])
currentWindowSize = lastCoordinateInCurrentWindow - firstCoordinateInCurrentWindow +1
nSNPsInThisWindow = 0
nMismatchesInThisWindow = 0
counter2 = 0
for counter2 in range(counter2,numberOfSteps):
step = shlex.shlex(stepData[counter2], posix=True)
step.whitespace += "\t"
step.whitespace_split = True
step = list(step)
lastCoordinateInCurrentStep = int(step[4])
if lastCoordinateInCurrentStep < firstCoordinateInCurrentWindow: continue
elif lastCoordinateInCurrentStep <= lastCoordinateInCurrentWindow:
nSNPsInThisStep = int(step[6])
nMismatchesInThisStep = int(step[7])
nSNPsInThisWindow = nSNPsInThisWindow + nSNPsInThisStep
nMismatchesInThisWindow = nMismatchesInThisWindow + nMismatchesInThisStep
elif lastCoordinateInCurrentStep > lastCoordinateInCurrentWindow: break
if nSNPsInThisWindow ==0 :
S = 0
D = 0
else:
S = nSNPsInThisWindow/currentWindowSize
pi = nMismatchesInThisWindow/(currentWindowSize*numPairwiseComparisons)
print(nSNPsInThisWindow,nMismatchesInThisWindow,currentWindowSize,S,pi)
D = (pi-(S/a1))/math.sqrt(c1*S + c2*S*(S-1/currentWindowSize))
outputFile.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber,segmentName,segmentStartPos,segmentStopPos,windowNumber,firstCoordinateInCurrentWindow,lastCoordinateInCurrentWindow,currentWindowSize,nSNPsInThisWindow,S,D)))
A quick search shows that you do change your stepSize on line 110:
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
stepStop and stepStart appear to depend on your files' contents, so we can't debug it further.