Trying to convert the following to a def but doing something that's probably not allowed... What am I doing wrong and how could this be done better?
# Same for both
import math
import audioop
import alsaaudio

l_input = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK, card='default')
r_input = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK, card='default')
#
l, data = l_input.read()
if l > 0:
    # transform data to logarithmic scale
    lin_vu = (math.log(float(max(audioop.max(data, 2), 1))) - log_lo) / (log_hi - log_lo)
    # Calculate value
    lin_vu = min(max(int(lin_vu * 15), 0), 15)

l, data = r_input.read()
if l > 0:
    # transform data to logarithmic scale
    rin_vu = (math.log(float(max(audioop.max(data, 2), 1))) - log_lo) / (log_hi - log_lo)
    # Calculate value
    rin_vu = min(max(int(rin_vu * 15), 0), 15)
I was hoping to do something like this as I need to read 4 values, not just the two listed:
def readvu( src ):
    l, data = src.read()
    if l > 0:
        # transform data to logarithmic scale
        l_vu = (math.log(float(max(audioop.max(data, 2), 1))) - log_lo) / (log_hi - log_lo)
        # Calculate value
        l_vu = min(max(int(l_vu * 15), 0), 15)

lin_vu = readvu( 'l_input' );
rin_vu = readvu( 'r_input' );
But that yields the mentioned error...
The solution is obvious: if you call readvu('l_input'), your src becomes the string 'l_input', and calling .read() on a string will fail.
The call should be like
lin_vu = readvu(l_input)
rin_vu = readvu(r_input)
which passes the actual variables, not the strings.
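Note also that readvu never returns anything, so even with the correct argument lin_vu would end up as None. A minimal corrected sketch (assuming log_lo and log_hi are defined globally, as in your original code):

def readvu(src):
    l, data = src.read()
    if l > 0:
        # transform data to logarithmic scale
        vu = (math.log(float(max(audioop.max(data, 2), 1))) - log_lo) / (log_hi - log_lo)
        # clamp to the 0..15 range and hand the value back to the caller
        return min(max(int(vu * 15), 0), 15)
    return 0  # no data available yet (non-blocking capture)

lin_vu = readvu(l_input)
rin_vu = readvu(r_input)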
Change of thought brought me to the following:
# Convert log scale, calculate value & max
def vu_log(data, vu_max, max_t):
    # transform data to logarithmic scale
    vu = (math.log(float(max(audioop.max(data, 2), 1))) - log_lo) / (log_hi - log_lo)
    # Calculate value
    vu = min(max(int(vu * 15), 0), 15)
    if vu >= vu_max:
        max_t = 3 * vu_fps  # keep max for 3 seconds
        vu_max = vu
    else:
        max_t = max(0, max_t - 1)  # Reduce max timer by 1 until 0
    return (vu, vu_max, max_t)
# Fetch Left input VU from ALSA
l, data = l_input.read()
if l > 0:
    # Transform data to logarithmic scale and calculate value
    lin = vu_log(data, lin_max, lin_t)
    lin_vu = lin[0]   # Desired value
    lin_max = lin[1]  # Maximum value (last 3 sec)
    lin_t = lin[2]    # Used for tracking age of lin_max
Constructive suggestions always welcome... :)
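One small suggestion: since vu_log returns a tuple, you can unpack it in one step instead of indexing (same variables as above):

l, data = l_input.read()
if l > 0:
    # Transform data to logarithmic scale, update value, max and max-age in one go
    lin_vu, lin_max, lin_t = vu_log(data, lin_max, lin_t)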
I have a variable with zeros and ones. Each sequence of ones represents "a phase" I want to observe; each sequence of zeros represents the space/distance between these phases.
It may happen that a phase carries a sort of "impulse response", for example the echo of a voice: in this case we will have 1,1,1,1,0,0,1,1,1,0,0,0 as an output; the first sequence of ones is the shout we made, while the second one is just the echo caused by the shout.
So I made a function that does not take the echoes/responses of the main shout/action into account, converting the ones of the echo/response into zeros.
(1) If the sequence of zeros is greater than or equal to the input threshold nearby_thr, the function recognizes that the sequence of ones is an independent phase and it won't delete or change anything.
(2) If the sequence of zeros (between two sequences of ones) is smaller than the input threshold nearby_thr, the function recognizes that we have "an impulse response/echo" and does not take that into account. In fact it converts the ones into zeros.
I made a naive function that accomplishes this result, but I was wondering if pandas already has a function like that, or if it can be done in a few lines, without writing a "C-like" function.
Here's my code:
import pandas as pd
import matplotlib.pyplot as plt
# import utili_funzioni.util00 as ut0

x1 = pd.DataFrame([0,0,0,0,0,0,0,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1])
x2 = pd.DataFrame([0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1,0])

# rule = x1==1 ## counting number of consecutive ones
# cumsum_ones = rule.cumsum() - rule.cumsum().where(~rule).ffill().fillna(0).astype(int)

def detect_nearby_el_2(df, nearby_thr):
    global el2del
    j = 0
    enterOnce_if = 1
    reset_count_0s = 0
    start2detect = False
    count0s = 0  # init
    start2_getidxs = False  # if this is not true, it won't store idxs to delete
    el2del = []  # store idxs of elements to delete
    for i in range(df.shape[0]):
        x_i = df.iloc[i, 0]
        if x_i == 1 and j == 0:  # first phase (ones) has been detected
            start2detect = True
        if start2detect == True:  # first phase seen/detected, the wait has ended
            if x_i == 0:  # 1st phase detected and ended with "a zero"
                if reset_count_0s == 1:
                    count0s = 0
                    reset_count_0s = 0
                count0s += 1
                if enterOnce_if == 1:
                    start2_getidxs = True  # avoid deleting the first phase
                    enterOnce_if = 0  # (was 'enterOnce_0 = 0', an apparent typo)
        if start2_getidxs == True:  # avoid deleting the first phase
            if x_i == 1 and count0s < nearby_thr:
                print("this is NOT a new phase!")
                el2del = [*el2del, i]  # idxs to delete
                reset_count_0s = 1  # reset counter
            if x_i == 1 and count0s >= nearby_thr:
                print("this is a new phase!")  # nothing to delete
                reset_count_0s = 1  # reset counter
    return el2del

def convert_nearby_el_into_zeros(df, idx):
    df0 = df + 0  # copy, so that the original dataframe is not modified
    if len(idx) > 0:
        # df.drop(df.index[idx])  # to delete completely
        df0.iloc[idx] = 0
    else:
        print("no elements nearby to delete!!")
    return df0
######
x1_2del = detect_nearby_el_2(df=x1, nearby_thr=3)
x2_2del = detect_nearby_el_2(df=x2, nearby_thr=3)

## deleting nearby elements
x1_a = convert_nearby_el_into_zeros(df=x1, idx=x1_2del)
x2_a = convert_nearby_el_into_zeros(df=x2, idx=x2_2del)
## PLOTTING
# ut0.grayplt()
fig1 = plt.figure()
fig1.suptitle("x1",fontsize=20)
ax1 = fig1.add_subplot(1,2,1)
ax2 = fig1.add_subplot(1,2,2,sharey=ax1)
ax1.title.set_text("PRE-detect")
ax2.title.set_text("POST-detect")
line1, = ax1.plot(x1)
line2, = ax2.plot(x1_a)
fig2 = plt.figure()
fig2.suptitle("x2",fontsize=20)
ax1 = fig2.add_subplot(1,2,1)
ax2 = fig2.add_subplot(1,2,2,sharey=ax1)
ax1.title.set_text("PRE-detect")
ax2.title.set_text("POST-detect")
line1, = ax1.plot(x2)
line2, = ax2.plot(x2_a)
You can see that x1 has two "responses/echoes" that I want to ignore, while x2 has none; in fact nothing changed in x2.
My question is: how can this be accomplished in a few lines using pandas?
Thank you
Interesting problem, and I'm sure there's a more elegant solution out there, but here is my attempt - it's at least fairly performant:
x1 = pd.Series([0,0,0,0,0,0,0,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1])
x2 = pd.Series([0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1,0])

def remove_echos(series, threshold):
    # a starting point is a 1 preceded by a 0
    starting_points = (series == 1) & (series.shift() == 0)
    # parentheses matter here: & binds more tightly than ==
    echo_starting_points = starting_points & (series.shift(threshold) == 1)
    echo_starting_points = series[echo_starting_points].index
    # one-past-the-end sentinel, since .loc slicing is inclusive on both ends
    change_points = series[starting_points].index.to_list() + [series.index[-1] + 1]
    for (start, end) in zip(change_points, change_points[1:]):
        if start in echo_starting_points:
            series.loc[start:end - 1] = 0
    return series

x1 = remove_echos(x1, 3)
x2 = remove_echos(x2, 3)
(I changed x1 and x2 to be Series instead of DataFrame; it's easy to adapt this code to work with a df if you need to.)
Explanation: we define the "starting point" of each section as a 1 preceded by a 0. Of those, a starting point is an "echo" starting point if the point threshold places before it is a 1. (The assumption is that no phase is shorter than threshold.) For each echo starting point, we zero everything from it up to, but not including, the next starting point, or to the end of the Series.
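A quick check on the sample data (output from running the sketch above; x2 comes back unchanged, as expected):

print(x1.tolist())
# [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0]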
Let's say I wanted to call a function to do some calculation, but I also want to use that calculated value in a later function. When I return the value of the first function, can I not just send it to my next function? Here is an example of what I am talking about:
def add(x, y):
    addition = x + y
    return addition

def square(a):
    result = a * a
    return result

sum = add(1,4)
product = square(addition)
If I call the add function, it'll return 5 as the addition result. But I want to use that number 5 in the next function; can I just send it along as shown? In the main program I am working on, it does not work like this.
Edit: This is a sample of the code I am actually working on, which will give a better idea of what the problem is. The problem is when I send the mean to the calculateStdDev function.
#import libraries to be used
import time
import StatisticsCalculations

#global variables
mean = 0
stdDev = 0

#get file from user
fileChoice = input("Enter the .csv file name: ")
inputFile = open(fileChoice)
headers = inputFile.readline().strip('\n').split(',') #create headers for columns and strip unnecessary characters

#create a list with header-number of lists in it
dataColumns = []
for i in headers:
    dataColumns.append([]) #fills initial list with as many empty lists as there are columns

#counts how many rows there are and adds a column of data into each empty list
rowCount = 0
for row in inputFile:
    rowCount = rowCount + 1
    comps = row.strip().split(',') #components of data
    for j in range(len(comps)):
        dataColumns[j].append(float(comps[j])) #appends the jth entry into the jth column, separating data into categories

k = 0
for entry in dataColumns:
    print("{:>11}".format(headers[k]), "|", "{:>10.2f}".format(StatisticsCalculations.findMax(dataColumns[k])), "|",
          "{:>10.2f}".format(StatisticsCalculations.findMin(dataColumns[k])), "|", "{:>10.2f}".format(StatisticsCalculations.calculateMean(dataColumns[k], rowCount)), "|", "{:>10.2f}".format()) #format each data entry to be right aligned and correctly spaced in its column; the last format() is left empty, this is where the std dev should go
    #printing break line for each row
    k = k + 1 #counting until dataColumns is exhausted

inputFile.close()
And the StatisticsCalculations module:
import math

def calculateMean(data, rowCount):
    sumForMean = 0
    for entry in data:
        sumForMean = sumForMean + entry
    mean = sumForMean/rowCount
    return mean

def calculateStdDev(data, mean, rowCount, entry):
    stdDevSum = 0
    for x in data:
        stdDevSum = float(stdDevSum) + ((float(entry[x]) - mean)** 2) #getting sum of squared differences to be used in std dev formula
    stdDev = math.sqrt(stdDevSum / rowCount) #using stdDevSum for the remaining parts of the std dev formula
    return stdDev

def findMin(data):
    lowestNum = 1000
    for component in data:
        if component < lowestNum:
            lowestNum = component
    return lowestNum

def findMax(data):
    highestNum = -1
    for number in data:
        if number > highestNum:
            highestNum = number
    return highestNum
First of all, sum is the name of a built-in function, so you shouldn't shadow it with a variable.
You can do it this way:
def add(x, y):
    addition = x + y
    return addition

def square(a):
    result = a * a
    return result

s = add(1, 4)
product = square(s)
Or directly:
product = square(add(1, 4))
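The same pattern fixes the code in your edit. Note that calculateStdDev's entry parameter looks unintended: inside the loop, float(entry[x]) should presumably be float(x), since x already iterates over the data. A sketch under that assumption:

def calculateStdDev(data, mean, rowCount):
    stdDevSum = 0
    for x in data:
        stdDevSum += (float(x) - mean) ** 2  # sum of squared differences
    return math.sqrt(stdDevSum / rowCount)

Then, in the main program, capture the mean that calculateMean returns and pass that same value on:

columnMean = StatisticsCalculations.calculateMean(dataColumns[k], rowCount)
columnStdDev = StatisticsCalculations.calculateStdDev(dataColumns[k], columnMean, rowCount)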
I have a set of data which has an ID, a timestamp, and identifiers. I have to go through it, calculate the entropy, and save some other links for the data. At each step more identifiers are added to the identifiers dictionary, and I have to re-compute the entropy and append it. I have a really large amount of data, and the program gets stuck due to the growing number of identifiers and their entropy calculation after each step. I read the following solution, but it is about data consisting of numbers:
Incremental entropy computation
I have copied two functions from this page, but the incremental calculation of entropy gives different values than the classical full entropy calculation at every step.
Here is the code I have:
from math import log

# ---------------------------------------------------------------------#
# Functions copied from https://stackoverflow.com/questions/17104673/incremental-entropy-computation

# maps x to -x*log2(x) for x>0, and to 0 otherwise
h = lambda p: -p*log(p, 2) if p > 0 else 0

# entropy of the union of two samples with entropies H1, H2 and sizes S1, S2
def update(H1, S1, H2, S2):
    S = S1 + S2
    return 1.0*H1*S1/S + h(1.0*S1/S) + 1.0*H2*S2/S + h(1.0*S2/S)

# compute entropy using the classic equation
def entropy(L):
    n = 1.0*sum(L)
    return sum([h(x/n) for x in L])
# ---------------------------------------------------------------------#
# Below is the input data (actually I read it from a csv file)
input_data = [["1","2008-01-06T02:13:38Z","foo,bar"], ["2","2008-01-06T02:12:13Z","bar,blup"], ["3","2008-01-06T02:13:55Z","foo,bar"],
              ["4","2008-01-06T02:12:28Z","foo,xy"], ["5","2008-01-06T02:12:44Z","foo,bar"], ["6","2008-01-06T02:13:00Z","foo,bar"],
              ["7","2008-01-06T02:13:00Z","x,y"]]

total_identifiers = {}  # To store the occurrences of identifiers. Values show the number of occurrences
all_entropies = []      # Classical way of calculating entropy at every step
updated_entropies = []  # Incremental way of calculating entropy at every step

for item in input_data:
    temp = item[2].split(",")
    identifiers_sum = sum(total_identifiers.values())  # Sum of all identifiers
    old_entropy = 0 if all_entropies[-1:] == [] else all_entropies[-1]  # Get previous entropy calculation
    for identifier in temp:
        S_new = len(temp)  # sum of new samples
        temp_dictionary = {a: 1 for a in temp}  # Store current identifiers and their occurrence
        if identifier not in total_identifiers:
            total_identifiers[identifier] = 1
        else:
            total_identifiers[identifier] += 1
    current_entropy = entropy(total_identifiers.values())  # Entropy for current set of identifiers
    updated_entropy = update(old_entropy, identifiers_sum, current_entropy, S_new)
    updated_entropies.append(updated_entropy)
    entropy_value = entropy(total_identifiers.values())  # Classical entropy calculation for comparison. This step becomes too expensive with big data
    all_entropies.append(entropy_value)

print(total_identifiers)
print('Sum of Total Identifiers: ', identifiers_sum)  # Gives 12 while the sum is 14 ???
print("All Classical Entropies: ", all_entropies)  # print for comparison
print("All Updated Entropies: ", updated_entropies)
The other issue is that when I print the sum of total_identifiers, it gives 12 instead of 14! (Due to the very large amount of data, I read the actual file line by line and write the results directly to disk, and do not store anything in memory apart from the dictionary of identifiers.)
The code above uses Theorem 4; it seems to me that you want to use Theorem 5 instead (from the paper in the next paragraph).
Note, however, that if the number of identifiers is really the problem then the incremental approach below isn't going to work either---at some point the dictionaries are going to get too large.
Below you can find a proof-of-concept Python implementation that follows the description from Updating Formulas and Algorithms for Computing Entropy and Gini Index from Time-Changing Data Streams.
import collections
import math
import random

def log2(p):
    return math.log(p, 2) if p > 0 else 0

CountChange = collections.namedtuple('CountChange', ('label', 'change'))

class EntropyHolder:
    def __init__(self):
        self.counts_ = collections.defaultdict(int)
        self.entropy_ = 0
        self.sum_ = 0

    def update(self, count_changes):
        r = sum([change for _, change in count_changes])
        residual = self._compute_residual(count_changes)
        self.entropy_ = self.sum_ * (self.entropy_ - log2(self.sum_ / (self.sum_ + r))) / (self.sum_ + r) - residual
        self._update_counts(count_changes)
        return self.entropy_

    def _compute_residual(self, count_changes):
        r = sum([change for _, change in count_changes])
        residual = 0
        for label, change in count_changes:
            p_new = (self.counts_[label] + change) / (self.sum_ + r)
            p_old = self.counts_[label] / (self.sum_ + r)
            residual += p_new * log2(p_new) - p_old * log2(p_old)
        return residual

    def _update_counts(self, count_changes):
        for label, change in count_changes:
            self.sum_ += change
            self.counts_[label] += change

    def entropy(self):
        return self.entropy_

def naive_entropy(counts):
    s = sum(counts)
    return sum([-(r/s) * log2(r/s) for r in counts])

if __name__ == '__main__':
    print(naive_entropy([1, 1]))
    print(naive_entropy([1, 1, 1, 1]))

    entropy = EntropyHolder()
    freq = collections.defaultdict(int)
    for _ in range(100):
        index = random.randint(0, 5)
        entropy.update([CountChange(index, 1)])
        freq[index] += 1

    print(naive_entropy(freq.values()))
    print(entropy.entropy())
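For reference, the reason the question's Theorem 4 attempt drifts: update(H1, S1, H2, S2) computes the entropy of the union of two samples only when their label sets are disjoint. When the same identifiers reappear in both samples, the counts merge and the formula no longer applies. A quick sanity check (my own, using the h/update/entropy definitions from the question):

H1, S1 = entropy([2, 2]), 4  # first sample, counts {a: 2, b: 2}
H2, S2 = entropy([1, 1]), 2  # second sample, two labels

# disjoint labels {c: 1, d: 1}: the union formula matches the classical result
print(update(H1, S1, H2, S2))  # ~1.918
print(entropy([2, 2, 1, 1]))   # ~1.918

# overlapping labels {a: 1, b: 1}: the true union is {a: 3, b: 3}
print(entropy([3, 3]))         # 1.0, which update() cannot reproduce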
Thanks @blazs for providing the EntropyHolder class. That solves the problem. The idea is to import entropy_holder.py from https://gist.github.com/blazs/4fc78807a96976cc455f49fc0fb28738 and use it to store the previous entropy, updating it at every step when new identifiers come in.
So the minimum working code looks like this:
import entropy_holder

input_data = [["1","2008-01-06T02:13:38Z","foo,bar"], ["2","2008-01-06T02:12:13Z","bar,blup"], ["3","2008-01-06T02:13:55Z","foo,bar"],
              ["4","2008-01-06T02:12:28Z","foo,xy"], ["5","2008-01-06T02:12:44Z","foo,bar"], ["6","2008-01-06T02:13:00Z","foo,bar"],
              ["7","2008-01-06T02:13:00Z","x,y"]]

entropy = entropy_holder.EntropyHolder()  # This class holds the current entropy and the counts of identifiers
for item in input_data:
    for identifier in item[2].split(","):
        entropy.update([entropy_holder.CountChange(identifier, 1)])

print(entropy.entropy())
The entropy computed with blazs's incremental formulas is very close to the entropy calculated the classical way, and it saves iterating over all the data again and again.
I'm getting this error: 'TypeError: list indices must be integers, not float'
but the functions I'm using need to accept non-integer values, otherwise my results come out different...
Just to give you an idea: I have written some code that fits a Gaussian to some data with a single peak. To do this, I need an estimated value for sigma. To get that, I've written two functions that look at the data and use the x value of the peak to find two points (r_pos and l_pos), one on either side of the peak, where the data falls to a set threshold (thresh). From those I can get an estimated sigma (r_pos - l_pos).
This is all coming about from a piece of code that worked, but the mark sheet for my coursework says I need to use functions, so I'm trying to turn this:
I0 = max(y)
pos = y.index(I0)
print 'Peak value is', I0, 'Counts per sec at', x[pos], 'degrees(2theta)'
print pos, I0

#left position
thresh = 10
i = pos
while y[i] > thresh:
    i -= 1
l_pos = x[i]

#right position
thresh = 10
i = y.index(I0)
while y[i] > thresh:
    i += 1
r_pos = x[i]
print r_pos

sigma0 = r_pos - l_pos
print sigma0
Into something that uses functions that can be called etc. This is my attempt:
def Peak_Find(x, y):
    I0 = max(y)
    pos = y.index(I0)
    return I0, x[pos]

def R_Pos(thresh, position):
    i = position
    while y[i] > thresh:
        i += 0.1
    r_pos = x[i]
    return r_pos

peak_y, peak_x = Peak_Find(x, y)
Right_Position = R_Pos(10, peak_x)
(peak_y = 855.0 and peak_x = 32.1, by the way.)
It looks like you want to replace the line
i = position
with something like
i = x.index(position)
because position is a float, and you want its location in the list. You are using i to index into a list, and list indices must be ints, hence the .index method, which returns the (integer) location of the value in the list.
You are better off writing the program this way because then the variable names will actually match what is contained in the variables.
def Peak_Find(x, y):
    I0 = max(y)
    pos = y.index(I0)
    return I0, pos

def R_Pos(thresh, position):
    while y[position] > thresh:
        position += 1  # Not sure if this is what you want
    r_pos = x[position]
    return r_pos  # Not sure what you want here... this is the value at x, not the position
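With those versions, the calling code would look something like this (a sketch, assuming x and y are in scope as in the question):

peak_y, peak_pos = Peak_Find(x, y)  # Peak_Find now returns the index, not x[pos]
r_pos = R_Pos(10, peak_pos)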
this probably leads to scipy/numpy, but right now I'm happy with any functionality, as I couldn't find anything in those packages. I have a matrix that contains data for a multivariate distribution (let's say 2 variables, for the fun of it). Is there any function to compute (higher) moments of that? All I could find were numpy.mean() and numpy.cov() :o
Thanks :)
/edit:
So some more detail: I have multivariate data, that is, a matrix where rows are variables and columns are observations. Now I would like a simple way of computing the joint moments of that data, as defined in http://en.wikipedia.org/wiki/Central_moment#Multivariate_moments .
I'm pretty new to python/scipy, so I'm not sure I'd be the best person to code this one up, especially for the n-variable case (note that the Wikipedia definition is for n = 2), and I kind of expected there to be some out-of-the-box thing to use, as I thought this would be a standard problem.
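For reference, the Wikipedia definition can be written down directly with numpy broadcasting; a minimal sketch (my own, assuming data has one variable per row and one observation per column, as described above):

import numpy as np

def joint_central_moment(data, powers):
    # sample joint central moment E[(X1-mu1)**k1 * (X2-mu2)**k2 * ...]
    # data: array of shape (n_vars, n_obs); powers: one exponent per variable
    centered = data - data.mean(axis=1, keepdims=True)
    powers = np.asarray(powers).reshape(-1, 1)
    return np.prod(centered ** powers, axis=0).mean()

rng = np.random.default_rng(0)
data = rng.standard_normal((2, 1000))
print(joint_central_moment(data, (1, 1)))  # matches np.cov(data, bias=True)[0, 1]
print(joint_central_moment(data, (2, 2)))  # a higher-order joint moment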
/edit2:
Just for the future, in case someone wants to do something similar: the following code (which is still under review) should give the sample equivalent of the raw moments E(X^2), E(Y^2), etc. It only works for two variables right now, but it should be extendable if one feels the need. If you see some mistakes or unclean/unpythonic code, feel free to comment.
from numpy import *

# this function should return something like
# moments[0] = 1
# moments[1] = mean(X), mean(Y)
# moments[2] = 1/n*X'X, 1/n*X'Y, 1/n*Y'Y
# moments[3] = mean(X'X'X), mean(X'X'Y), mean(X'Y'Y), mean(Y'Y'Y)
# etc

def getRawMoments(data, moment, axis=0):
    a = moment
    if axis == 0:
        n = data.shape[1]
        X = matrix(data[0, :]).reshape((n, 1))
        Y = matrix(data[1, :]).reshape((n, 1))
    else:
        n = data.shape[0]
        X = matrix(data[:, 0]).reshape((n, 1))
        Y = matrix(data[:, 1]).reshape((n, 1))  # was reshape((n, 11)), a typo

    Z = hstack((X, Y))
    iota = ones((1, n))
    moments = {}
    moments[0] = 1

    # first, generate a huge matrix containing all x-y combinations
    # for every power-combination k,l such that k+l = i,
    # for all 0 <= i <= a (hence arange(1, a+1), matching the comment above)
    for i in arange(1, a + 1):
        if i == 2:
            moments[i] = moments[i-1] * Z
        # if odd, postmultiply with Z.T
        elif i % 2 == 1:
            moments[i] = kron(moments[i-1], Z.T)
        # else (even), split and postmultiply with Z
        elif i % 2 == 0:
            temp = moments[i-1]
            temp2 = temp[:, 0:n] * Z
            temp3 = temp[:, n:2*n] * Z
            moments[i] = hstack((temp2, temp3))

    # since we now have many duplicate moments,
    # such as x**2*y and x*y*x, filter out non-distinct elements
    momentsDistinct = {}
    momentsDistinct[0] = 1
    for i in arange(1, a + 1):
        if i % 2 == 0:
            data = 1.0/n * moments[i]
        elif i == 1:
            temp = moments[i]
            temp2 = temp[:, 0:n] * iota.T
            data = 1.0/n * hstack((temp2))
        else:
            temp = moments[i]
            temp2 = temp[:, 0:n] * iota.T
            temp3 = temp[:, n:2*n] * iota.T
            data = 1.0/n * hstack((temp2, temp3))
        momentsDistinct[i] = unique(data.flat)
    return momentsDistinct  # was 'momentsDistinct(result, axis=1)', which would call a dict