Combining wav files programmatically - Python - python

I am looking to combine 10 audio samples in various manners (format - wav probably, but this can be changed to any format as they will be pre-recorded).
from pydub import AudioSegment
sounds = []
sound1 = AudioSegment.from_wav("Dropbox/PIREAD/1.wav")
sound2 = AudioSegment.from_wav("Dropbox/PIREAD/2.wav")
sound3 = AudioSegment.from_wav("Dropbox/PIREAD/3.wav")
sound4 = AudioSegment.from_wav("Dropbox/PIREAD/4.wav")
sound5 = AudioSegment.from_wav("Dropbox/PIREAD/5.wav")
sound6 = AudioSegment.from_wav("Dropbox/PIREAD/6.wav")
sound7 = AudioSegment.from_wav("Dropbox/PIREAD/7.wav")
sound8 = AudioSegment.from_wav("Dropbox/PIREAD/8.wav")
sound9 = AudioSegment.from_wav("Dropbox/PIREAD/9.wav")
sound0 = AudioSegment.from_wav("Dropbox/PIREAD/0.wav")
sounds=[sound1,sound2,sound3,sound4,sound5,sound6,sound7,sound8,sound9,sound0]
combined_sounds = AudioSegment.empty()
for x in range(10):
for y in range(10):
combined_sounds += sounds[y]
combined_sounds.export("Dropbox/PIREAD/joinedFile.wav", format="wav")
This is literally me reading the numbers 0-9 and assembling them into one overall wav file.
It works - but it is slow once the loop is extended x=100, x=1000.
Q: How can I speed things up?
The actual order of the numbers will be read from a text$ - for example "354224848179261915075" which happens to be the 100th Fibonacci number.
Cheers
Glen

I believe it's slow because when you loop over x, you repeat operations (the loop over y) which could be computed before the loop over x, then assembled.

I looked into AudioSegment and found potentially useful method for you namely from_mono_audiosegments but it is limited to mono sounds and you will need to test if it is faster than += please compare time-wise these options, i.e.
import time
from pydub import AudioSegment
sounds = []
sound1 = AudioSegment.from_wav("Dropbox/PIREAD/1.wav")
sound2 = AudioSegment.from_wav("Dropbox/PIREAD/2.wav")
sound3 = AudioSegment.from_wav("Dropbox/PIREAD/3.wav")
sound4 = AudioSegment.from_wav("Dropbox/PIREAD/4.wav")
sound5 = AudioSegment.from_wav("Dropbox/PIREAD/5.wav")
sound6 = AudioSegment.from_wav("Dropbox/PIREAD/6.wav")
sound7 = AudioSegment.from_wav("Dropbox/PIREAD/7.wav")
sound8 = AudioSegment.from_wav("Dropbox/PIREAD/8.wav")
sound9 = AudioSegment.from_wav("Dropbox/PIREAD/9.wav")
sound0 = AudioSegment.from_wav("Dropbox/PIREAD/0.wav")
sounds=[sound1,sound2,sound3,sound4,sound5,sound6,sound7,sound8,sound9,sound0]
# option1 using +=
t1 = time.time()
combined_sounds1 = AudioSegment.empty()
for s in sounds
combined_sounds1 += s
t2 = time.time()
# end of option1
# option2 using from_mono_audiosegments
t3 = time.time()
combined_sounds2 = AudioSegment.from_mono_audiosegments(*sounds)
t4 = time.time()
# end of option2
print('option1 (seconds):',t2-t1)
print('option2 (seconds):',t4-t3)

Thanks for the suggestions and advice above. This is the final code I used and link to the resultant video (with ffmpeg visualisation):
# Program to display the Fibonacci sequence up to n-th term
from pydub import AudioSegment
combined_sounds = ""
sound1 = AudioSegment.from_wav("1_2.wav")
sound2 = AudioSegment.from_wav("2_2.wav")
sound3 = AudioSegment.from_wav("3_2.wav")
sound4 = AudioSegment.from_wav("4_2.wav")
sound5 = AudioSegment.from_wav("5_2.wav")
sound6 = AudioSegment.from_wav("6_2.wav")
sound7 = AudioSegment.from_wav("7_2.wav")
sound8 = AudioSegment.from_wav("8_2.wav")
sound9 = AudioSegment.from_wav("9_2.wav")
sound0 = AudioSegment.from_wav("0_2.wav")
nterms=1000
# first two terms
n1, n2 = 0, 1
count = 0
fib = ""
# check if the number of terms is valid
if nterms <= 0:
print("Please enter a positive integer")
# if there is only one term, return n1
elif nterms == 1:
print("Fibonacci sequence upto",nterms,":")
print(n1)
# generate fibonacci sequence
else:
print("Fibonacci sequence:")
while count < nterms:
#print(n1)
fib += str(n1)
nth = n1 + n2
# update values
n1 = n2
n2 = nth
count += 1
i=-36
j=0
fibs = [fib[i:i+1000] for i in range(0, len(fib), 1000)]
seg = 0
for a in fibs:
if seg == 2:
break
combined_sounds = AudioSegment.empty()
seg +=1
for x in a:
i,j = -36,0
s = eval("sound"+str(x))
s = s.apply_gain_stereo(i,j)
combined_sounds += s
i,j = j,i
combined_sounds.export("joinedFile"+str(seg)+".wav", format="wav")
This splits the output into 1000 digit wav files. The first 1000 Fibonacci terms produces nearly 15Gb of wavs!
Uploaded to YouTube: https://www.youtube.com/watch?v=U7Z_HOGqjlE
Thanks all.

Related

Python code for adding 3 corresponding lists elements and getting averages from user input

For my code I need help with "mean_times[]"
I want to take the corresponding list items from all first elements from "op_time", "ltime", and "ptime" add them up, then divide by three and then all second elements to add them up and divide by three, and so on as many times as the user inputs tasks.
The way I wrote my code is static. I allow for three instances of this. but the code allows the user to input as many tasks as they want. I think I need the code to append to "mean_times[]" as many times as there are user inputs.
Meaning to take "(float_optime[0] + float_ltime[0] + float_ptime[0]) / 3" and append that into mean_times[] then do the same for the second element, and so on as many times as there are tasks.
I'm not sure of the logic I need or what sort of loop I need to do to add then append the product into mean_times[]
import numpy as np
from tabulate import *
tasks = []
t = input("Label the TASKS (a, b, c,..., entering STOP would end the process): ")
tasks.append(t)
while t.lower() != 'stop':
t = input("Label the TASKS (a, b, c, ..., entering STOP would end the process): ")
tasks.append(t)
del tasks[-1]
op_time = []
o = input("Enter OPTIMISTIC time for each task: ")
op_time.append(o)
while o.lower() != 'stop':
o = input("Enter OPTIMISTIC time for each task: :")
op_time.append(o)
del op_time[-1]
ltime = []
lt = input("Enter MOST LIKELY time for each task: ")
ltime.append(lt)
while lt.lower() != 'stop':
lt = input("Enter MOST LIKELY time for each task: :")
ltime.append(lt)
del ltime[-1]
ptime = []
p = input("Enter PESSIMISTIC time for each task: ")
ptime.append(p)
while p.lower() != 'stop':
p = input("Enter PESSIMISTIC time for each task: :")
ptime.append(p)
del ptime[-1]
array_op_time = np.array(op_time)
float_op_time = array_op_time.astype(float)
array_ltime = np.array(ltime)
float_ltime = array_ltime.astype(float)
array_ptime = np.array(ptime)
float_ptime = array_ptime.astype(float)
mean_times = [(float_optime[0] + float_ltime[0] + float_ptime[0]) / 3, (float_optime[1] + float_ltime[1] + float_ptime[1]) / 3, (float_optime[2] + float_ltime[2] + float_ptime[2]) / 3]
array_mean_times = np.array(mean_times)
float_mean_times = array_mean_times.astype(float)
#some logic to append to mean_times
table = {"Task": tasks, "Optimistic": op_time, "Most Likely": ltime, "Pessimistic": ptime, "Mean Duration": mean_times}
print(tabulate(table, headers='keys', tablefmt='fancy_grid', stralign='center', numalign='center'))
Fixed the program
import numpy as np
from tabulate import tabulate
tasks = []
counter = 1
t = input("Label the TASKS (a, b, c,..., entering STOP would end the process): ")
tasks.append(t)
while t.lower() != 'stop':
t = input("Label the TASKS (a, b, c, ..., entering STOP would end the process): ")
tasks.append(t)
counter = counter + 1
del tasks[-1]
op_time = []
for x in range(counter-1):
o = input("Enter OPTIMISTIC time for each task: ")
op_time.append(o)
ltime = []
for x in range(counter-1):
lt = input("Enter MOST LIKELY time for each task: ")
ltime.append(lt)
ptime = []
for x in range(counter-1):
p = input("Enter PESSIMISTIC time for each task: ")
ptime.append(p)
array_op_time = np.array(op_time)
float_op_time = array_op_time.astype(float)
array_ltime = np.array(ltime)
float_ltime = array_ltime.astype(float)
array_ptime = np.array(ptime)
float_ptime = array_ptime.astype(float)
mean_times = []
for x in range(0, counter-1):
m_time = ((float(ltime[x]) + float(ptime[x])+float(op_time[x]))/3)
mean_times.append(m_time)
array_mean_times = np.array(mean_times)
float_mean_times = array_mean_times.astype(float)
table = {"Task": tasks, "Optimistic": op_time, "Most Likely": ltime, "Pessimistic": ptime, "Mean Duration": mean_times}
print(tabulate(table, headers='keys', tablefmt='fancy_grid', stralign='center', numalign='center'))
You keep data in numpy.array so you can do it in one line - without index.
float_mean_times = (float_op_time + float_ltime + float_ptime) / 3
And this gives directly new numpy.array so it doesn't need np.array(...).astype(float)
And if you have it as normal lists then you would need for-loop with zip()
mean_times = []
for o, l, t in zip(float_op_time, float_ltime, float_ptime):
average = (o + l + t) / 3
mean_times.append(average)
mean_times = np.array(mean_times).astype(float)
Frankly, I see one problem - user may give different number of values for different arrays and then first version will not work, and second version will calculate it only for the shortest number of data.
I would rather use one while-loop to make sure user put the same number of data for all lists.
import numpy as np
from tabulate import *
tasks = []
op_time = []
ltime = []
ptime = []
while True:
t = input("Label for single TASK: ")
if t == '!stop':
break
o = input("Enter OPTIMISTIC time for this single task: ")
if o == '!stop':
break
lt = input("Enter MOST LIKELY time for this single task: ")
if lt == '!stop':
break
p = input("Enter PESSIMISTIC time for this single task: ")
if pt == '!stop':
break
# it will add data only if user put all answers
# (and it will skip task when user skip at some question)
tasks.append(t)
op_time.append(o)
ltime.append(lt)
ptime.append(p)
# ---
float_op_time = np.array(op_time).astype(float)
float_ltime = np.array(ltime).astype(float)
float_ptime = np.array(ptime).astype(float)
float_mean_times = (float_optime + float_ltime + float_ptime) / 3
table = {
"Task": tasks,
"Optimistic": op_time,
"Most Likely": ltime,
"Pessimistic": ptime,
"Mean Duration": mean_times
}
print(tabulate(table, headers='keys', tablefmt='fancy_grid', stralign='center', numalign='center'))

how to insert the data into pixels faster?

I'm currently working on a steagnographic application,
and i'm taking each pixel value and embedding data into it one by one
this sequencial processing is taking a long time to process,
the code:
import config_loader
import numpy as np
from PIL import Image
import encryption
import time
def byte2bin(bytestring):
# print("\n from byte 2 bin\n")
# print(bytestring)
bitstring = bin(int.from_bytes(bytestring, byteorder="big"))
return bitstring[2:]
def insert_data_in_pixel(raw_data, string, ptr, bits=1): # this function takes a pixel's data and then converts it to
# binary and then change the last bit to the secret
color = bin(int(raw_data))[2:]
# old = color # troubleshooting lines
color = color[:len(color) - bits]
color = color + string[ptr: ptr + bits]
# print("original-> ", old,"| |added bits ",string[ptr: ptr+bits],"| |Modified-> ", color) # troubleshooting lines
return np.uint8(int(color, 2))
def insert_length(length, new_img): # inserts length of our secret and the length itself is obfuscated
secret_string_len = '<l>' + str(int(length / 4) + 16) + '<l>' # Added ambiguity
secret_string_len = ''.join(format(_, '08b') for _ in bytearray(str(secret_string_len), encoding='utf-8'))
length = len(secret_string_len)
str_len_ptr = 0
for y in range(length):
x = 0
if str_len_ptr < length:
new_img[x][y][0] = insert_data_in_pixel(new_img[x][y][0], secret_string_len, str_len_ptr, bits=3)
str_len_ptr += 3
if str_len_ptr == length:
break
new_img[x][y][1] = insert_data_in_pixel(new_img[x][y][1], secret_string_len, str_len_ptr, bits=3)
str_len_ptr += 3
if str_len_ptr == length:
break
new_img[x][y][2] = insert_data_in_pixel(new_img[x][y][2], secret_string_len, str_len_ptr, bits=2)
str_len_ptr += 2
if str_len_ptr == length:
break
def secret_Loader(): # loads secret from a file
with open('Message.txt', 'r', encoding='utf-8', errors='ignore') as file:
lines = file.readlines()
message = ''.join(lines)
key = config_loader.read('''data['key']''')
# print(key)
enc_message = encryption.encrypt(message, key)
return enc_message
def insert():
start = time.time()
image_path = config_loader.read('''data['environment']['cover_image']''')
photo = Image.open(image_path).convert('RGB') # just insert the image name here
data = np.asarray(photo).copy()
width, height = photo.size
secret = byte2bin(secret_Loader())
secret_pointer = 0
lensecret = len(secret)
insert_length(lensecret, data)
insertion = time.time()
for x in range(1, height):
for y in range(width):
if lensecret > secret_pointer:
# RED
data[x][y][0] = insert_data_in_pixel(data[x][y][0], secret, secret_pointer, bits=2)
secret_pointer += 2
if lensecret == secret_pointer:
break
# Green
data[x][y][1] = insert_data_in_pixel(data[x][y][1], secret, secret_pointer, bits=2)
secret_pointer += 2
if lensecret == secret_pointer:
break
# Blue
data[x][y][2] = insert_data_in_pixel(data[x][y][2], secret, secret_pointer, bits=1)
secret_pointer += 1
if lensecret == secret_pointer:
break
print("data insertion",time.time()-insertion)
generation = time.time()
# print(data)
data = Image.fromarray(data)
print("image generation in ", time.time()-generation)
# data.show()
_ = time.time()
data = data.save(r'stg.PNG')
print("saving time ", time.time()-_)
print('Exectuted in->', time.time() - start)
if __name__ == '__main__':
insert()
the timings
encryption in 1.0841524600982666
data insertion 9.439783811569214
image generation in 0.039893388748168945
saving time 6.283206939697266
Exectuted in-> 17.11327576637268
I thought about multithreading but that is unreliable as every bit in the data is important and it's position in the sequence is also important.
P.S the data insertion time is for 10000
lines of this
this is a message to test the limit of the program let's check when it breaks and how, also i'm running out of words0
so this isn't bad but if it can be improved how can i achieve it?

How to parallelize or use multi-cores to speed up a while loop?

I have a instance with 16-core processor and I have a while loop like below,
count = 200000
num = 0
pbar = tqdm(total=count)
lst = []
while num <= count:
random_folder = os.path.join(path, np.random.choice(os.listdir(path)))
file_path = os.path.join(path, np.random.choice(os.listdir(random_folder)))
if not os.path.isdir(file_path):
lst.append(file_path)
pbar.update(1)
num += 1
When I tried to run this code on a server, the estimated time is really long
0%| | 138/200000 [02:14<51:25:11, 1.08it/s]
I have tried to use numpy to get random choice but it's still slow. Is there any way I can take advantage of my multi-core cpu and speed up this while loop? It's just collecting random files from sub folders. Really appreciate any help. Thanks
Update:
path = "/home/user12/pdf_files"
def get_random_file(num_of_files):
count = 0
random_files = []
while count < num_of_files:
random_folder = os.path.join(path, random.choice(os.listdir(path)))
file_path = os.path.join(path, random.choice(os.listdir(random_folder)))
if not os.path.isdir(file_path):
resumes_list.append(file_path)
count += 1
return random_files
with Pool(16) as p:
random_files = p.map(get_random_file, (1000/16,))
You can use multi processing and use all cores at the same time.
See https://docs.python.org/3.8/library/multiprocessing.html
Something like this:
from multiprocessing import Pool
def get_random_file(num_of_files):
# your logic goes here
count = 0
random_files = []
while count < num_of_files:
count += 1
pass
#get random file and append to 'random_files'
return random_files
if __name__ == '__main__':
with Pool(16) as p:
num_of_files = [200000/16 for i in range(1,16)]
random_files = p.map(get_random_file,num_of_files)
# random_files is a list of lists - you need to merge them into one list

When i use a for loop with an array it doesn't work and uses the number of items instead of going item by item

Basically in the last for loop the k variable uses the number of items in the list and then I have a false and unique answer rather than multiple answers I want to do some sort of n roots of a complex number (if my question isn't clear sorry i'm not a native english speaker I'll do my best to make it clearer)
from math import *
deg = int(input("entrez le degré:"))
re = int(input("le réel:"))
im = int(input("l'imaginaire:"))
counter = 0
while counter < deg:
counter = counter + 1
kkk = []
kkk.append(counter)
r = sqrt(pow(re,2)+pow(im,2))
if im != 0:
teton = round(pi/radians(degrees(acos(re/r))),1)
else:
teton = round(pi/radians(degrees(acos(im/r))),1)
if round(r) != r:
r = "sqrt(",(pow(re,2)+pow(im,2)),")"
else:
r = r
teta = "pi/%s" %teton
print("z = ",r,"e^i",teta,)
for k in kkk:
if re != 0 or im != 0:
print(r,"e^i*2*",teta,"*",k,"pi")
else:
print(r,"^1/",deg,"e^i(",teta,"/",deg," +(2",k,"pi)/",deg)
print(k)
If I understood the problem correctly, you are saying that for loop is not iterating over all the items in the list kkk.
if you check your code the list kkk always have only one item as you are initializing and appending item in same loop.
please move below statement out of the first loop.
kkk = []
like below.
from math import *
deg = int(input("entrez le degré:"))
re = int(input("le réel:"))
im = int(input("l'imaginaire:"))
counter = 0
kkk = []
while counter < deg:
counter = counter + 1
kkk.append(counter)
r = sqrt(pow(re,2)+pow(im,2))
if im != 0:
teton = round(pi/radians(degrees(acos(re/r))),1)
else:
teton = round(pi/radians(degrees(acos(im/r))),1)
if round(r) != r:
r = "sqrt(",(pow(re,2)+pow(im,2)),")"
else:
r = r
teta = "pi/%s" %teton
print("z = ",r,"e^i",teta,)
for k in kkk:
if re != 0 or im != 0:
print(r,"e^i*2*",teta,"*",k,"pi")
else:
print(r,"^1/",deg,"e^i(",teta,"/",deg," +(2",k,"pi)/",deg)
print(k)

python is inexplicably shortening the step size with each iteration of a sliding window analysis

I am working on a program that estimates the statistic Tajima's D in a series of sliding windows across a chromosome. The chromosome itself is also divided into a number of different regions with (hopefully) functional significance. The sliding window analysis is performed by my script on each region.
At the start of the program, I define the size of the sliding windows and the size of the steps that move from one window to the next. I import a file which contains the coordinates for each different chromosomal region, and import another file which contains all the SNP data I am working with (this is read line-by-line, as it is a large file). The program loops through the list of chromosomal locations. For each location, it generates an index of steps and windows for the analysis, partitions the SNP data into output files (corresponding with the steps), calculates key statistics for each step file, and combines these statistics to estimate Tajima's D for each window.
The program works well for small files of SNP data. It also works well for the first iteration over the first chromosomal break point. However, for large files of SNP data, the step size in the analysis is inexplicably decreased as the program iterates over each chromosomal regions. For the first chromosomal regions, the step size is 2500 nucleotides (this is what it is suppose to be). For the second chromosome segment, however, the step size is 1966, and for the third it is 732.
If anyone has any suggestions at to why this might be the case, please let me know. I am especially stumped as this program seems to work size for small files but not for larger ones.
My code is below:
import sys
import math
import fileinput
import shlex
import string
windowSize = int(500)
stepSize = int(250)
n = int(50) #number of individuals in the anaysis
SNP_file = open("SNPs-1.txt",'r')
SNP_file.readline()
breakpoints = open("C:/Users/gwilymh/Desktop/Python/Breakpoint coordinates.txt", 'r')
breakpoints = list(breakpoints)
numSegments = len(breakpoints)
# Open a file to store the Tajima's D results:
outputFile = open("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/Tajima's D estimates.txt", 'a')
outputFile.write(str("segmentNumber\tchrSegmentName\tsegmentStart\tsegmentStop\twindowNumber\twindowStart\twindowStop\tWindowSize\tnSNPs\tS\tD\n"))
#Calculating parameters a1, a2, b1, b2, c1 and c2
numPairwiseComparisons=n*((n-1)/2)
b1=(n+1)/(3*(n-1))
b2=(2*(n**2+n+3))/(9*n*(n-1))
num=list(range(1,n)) # n-1 values as a list
i=0
a1=0
for i in num:
a1=a1+(1/i)
i=i+1
j=0
a2=0
for j in num:
a2=a2+(1/j**2)
j=j+1
c1=(b1/a1)-(1/a1**2)
c2=(1/(a1**2+a2))*(b2 - ((n+2)/(a1*n))+ (a2/a1**2) )
counter6=0
#For each segment, assign a number and identify the start and stop coodrinates and the segment name
for counter6 in range(counter6,numSegments):
segment = shlex.shlex(breakpoints[counter6],posix = True)
segment.whitespace += '\t'
segment.whitespace_split = True
segment = list(segment)
segmentName = segment[0]
segmentNumber = int(counter6+1)
segmentStartPos = int(segment[1])
segmentStopPos = int(segment[2])
outputFile1 = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'a')
#Make output files to index the lcoations of each window within each segment
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
k = segmentStartPos - 1
windowNumber = 0
while (k+1) <=segmentStopPos:
windowStart = k+1
windowNumber = windowNumber+1
windowStop = k + windowSize
if windowStop > segmentStopPos:
windowStop = segmentStopPos
windowFileIndex.write(("%s\t%s\t%s\n")%(str(windowNumber),str(windowStart),str(windowStop)))
k=k+stepSize
windowFileIndex.close()
# Make output files for each step to export the corresponding SNP data into + an index of these output files
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
i = segmentStartPos-1
stepNumber = 0
while (i+1) <= segmentStopPos:
stepStart = i+1
stepNumber = stepNumber+1
stepStop = i+stepSize
if stepStop > segmentStopPos:
stepStop = segmentStopPos
stepFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepNumber))), 'a')
stepFileIndex.write(("%s\t%s\t%s\n")%(str(stepNumber),str(stepStart),str(stepStop)))
i=i+stepSize
stepFile.close()
stepFileIndex.close()
# Open the index file for each step in current chromosomal segment
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepFileIndex = list(stepFileIndex)
numSteps = len(stepFileIndex)
while 1:
currentSNP = SNP_file.readline()
if not currentSNP: break
currentSNP = shlex.shlex(currentSNP,posix=True)
currentSNP.whitespace += '\t'
currentSNP.whitespace_split = True
currentSNP = list(currentSNP)
SNPlocation = int(currentSNP[0])
if SNPlocation > segmentStopPos:break
stepIndexBin = int(((SNPlocation-segmentStartPos-1)/stepSize)+1)
#print(SNPlocation, stepIndexBin)
writeFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepIndexBin))), 'a')
writeFile.write((("%s\n")%(str(currentSNP[:]))))
writeFile.close()
counter3=0
for counter3 in range(counter3,numSteps):
# open up each step in the list of steps across the chromosomal segment:
L=shlex.shlex(stepFileIndex[counter3],posix=True)
L.whitespace += '\t'
L.whitespace_split = True
L=list(L)
#print(L)
stepNumber = int(L[0])
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
#Now open the file of SNPs corresponding with the window in question and convert it into a list:
currentStepFile = open(("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(counter3+1)),'r')
currentStepFile = list(currentStepFile)
nSNPsInCurrentStepFile = len(currentStepFile)
print("number of SNPs in this step is:", nSNPsInCurrentStepFile)
#print(currentStepFile)
if nSNPsInCurrentStepFile == 0:
mismatchesPerSiteList = [0]
else:
# For each line of the file, estimate the per site parameters relevent to Tajima's D
mismatchesPerSiteList = list()
counter4=0
for counter4 in range(counter4,nSNPsInCurrentStepFile):
CountA=0
CountG=0
CountC=0
CountT=0
x = counter4
lineOfData = currentStepFile[x]
counter5=0
for counter5 in range(0,len(lineOfData)):
if lineOfData[counter5]==("A" or "a"): CountA=CountA+1
elif lineOfData[counter5]==("G" or "g"): CountG=CountG+1
elif lineOfData[counter5]==("C" or "c"): CountC=CountC+1
elif lineOfData[counter5]==("T" or "t"): CountT=CountT+1
else: continue
AxG=CountA*CountG
AxC=CountA*CountC
AxT=CountA*CountT
GxC=CountG*CountC
GxT=CountG*CountT
CxT=CountC*CountT
NumberMismatches = AxG+AxC+AxT+GxC+GxT+CxT
mismatchesPerSiteList=mismatchesPerSiteList+[NumberMismatches]
outputFile1.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber, segmentName,stepNumber,stepStart,stepStop,stepSize,nSNPsInCurrentStepFile,sum(mismatchesPerSiteList))))
outputFile1.close()
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
windowFileIndex = list(windowFileIndex)
numberOfWindows = len(windowFileIndex)
stepData = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepData = list(stepData)
numberOfSteps = len(stepData)
counter = 0
for counter in range(counter, numberOfWindows):
window = shlex.shlex(windowFileIndex[counter], posix = True)
window.whitespace += "\t"
window.whitespace_split = True
window = list(window)
windowNumber = int(window[0])
firstCoordinateInCurrentWindow = int(window[1])
lastCoordinateInCurrentWindow = int(window[2])
currentWindowSize = lastCoordinateInCurrentWindow - firstCoordinateInCurrentWindow +1
nSNPsInThisWindow = 0
nMismatchesInThisWindow = 0
counter2 = 0
for counter2 in range(counter2,numberOfSteps):
step = shlex.shlex(stepData[counter2], posix=True)
step.whitespace += "\t"
step.whitespace_split = True
step = list(step)
lastCoordinateInCurrentStep = int(step[4])
if lastCoordinateInCurrentStep < firstCoordinateInCurrentWindow: continue
elif lastCoordinateInCurrentStep <= lastCoordinateInCurrentWindow:
nSNPsInThisStep = int(step[6])
nMismatchesInThisStep = int(step[7])
nSNPsInThisWindow = nSNPsInThisWindow + nSNPsInThisStep
nMismatchesInThisWindow = nMismatchesInThisWindow + nMismatchesInThisStep
elif lastCoordinateInCurrentStep > lastCoordinateInCurrentWindow: break
if nSNPsInThisWindow ==0 :
S = 0
D = 0
else:
S = nSNPsInThisWindow/currentWindowSize
pi = nMismatchesInThisWindow/(currentWindowSize*numPairwiseComparisons)
print(nSNPsInThisWindow,nMismatchesInThisWindow,currentWindowSize,S,pi)
D = (pi-(S/a1))/math.sqrt(c1*S + c2*S*(S-1/currentWindowSize))
outputFile.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber,segmentName,segmentStartPos,segmentStopPos,windowNumber,firstCoordinateInCurrentWindow,lastCoordinateInCurrentWindow,currentWindowSize,nSNPsInThisWindow,S,D)))
A quick search shows that you do change your stepSize on line 110:
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
stepStop and stepStart appear to depend on your files' contents, so we can't debug it further.

Categories

Resources