For a linear logistic regression model formulated as $P(y_i = 1 \mid x_i) = \frac{e^{x_i^T \beta}}{1 + e^{x_i^T \beta}}$, the likelihood is
$$L(\beta) = \frac{\exp\left(\sum_i y_i x_i^T \beta\right)}{\prod_i \left(1 + \exp(x_i^T \beta)\right)}.$$
I was trying to code the likelihood and plot it to see what it looks like. This is how I've implemented it:
# Define likelihood function
def likelihood(beta):
    # reshape correctly
    beta = np.array(beta).reshape(-1, 1)
    # X beta has x_i^T * \beta in every entry
    Xbeta = np.matmul(X, beta).flatten()
    exp_argument = np.multiply(newrows.y.values, Xbeta)
    numerator = np.exp(exp_argument.sum())
    denominator = np.prod(1 + np.exp(Xbeta))
    return numerator / denominator
# Prepare a grid
b1 = np.linspace(-10, -8.5, 100) # these values chosen by trial and error
b2 = np.linspace(4.5, 6.2, 100) # these values chosen by trial and error
B1, B2 = np.meshgrid(b1, b2)
# Evaluate function at the gridpoints
Zbeta = np.array([likelihood(thing) for thing in zip(B1.ravel(), B2.ravel())])
Zbeta = Zbeta.reshape(B1.shape)
# Surface plot
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(121, projection='3d')
ax.plot_surface(B1, B2, Zbeta)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax = fig.add_subplot(122)
ax.contour(B1, B2, Zbeta)
plt.show()
Here the matrix X contains "fake" Bernoulli observations derived from the well-known BeetleMortality dataset, which I've stored in a Pandas DataFrame called data. The original rows are essentially Binomial observations, so I've expanded them into Bernoulli observations as follows:
newrows = np.zeros((data.NumberExposed.sum(), 2))
for index, row in data.iterrows():
    obs = np.zeros(int(row.NumberExposed), dtype=int)
    obs[:int(row.NumberKilled)] = 1
    np.random.shuffle(obs)
    obs = obs.reshape(-1, 1)
    doses = np.repeat(row.Dose, len(obs)).reshape(-1, 1)
    obs = np.hstack((doses, obs))
    from_index = int(row.CumulativeN - row.NumberExposed)
    to_index = int(row.CumulativeN)
    newrows[from_index:to_index] = obs
# Save it as a dataframe
newrows = pd.DataFrame(newrows, columns=['x', 'y'])
# Notice that if you wanted to check that these calculations are correct use: newrows.groupby('x').sum()
newrows.head()
Everything seems to work fine, except that when I try to plot this I obtain the following plot:
This makes absolutely no sense. Any help? Maybe I'm implementing the likelihood function in the wrong way?
I have written code that returns the output I am after: two graphs with multiple lines on each. However, the code is slow and quite long. I am interested in any improvements that would help me produce these graphs faster and make the code more presentable.
Additionally, I would like to add axis names and titles to my graphs. Normally I would use plt.xlabel, plt.ylabel and plt.title, but I couldn't quite work out how to use them here. The aim is to add a line to each graph after each loop iteration (I have adapted this piece of code to do so).
I should note that I need to use Python for this task (so I cannot switch to anything else), and I do need the SymPy library to find the values that are plotted in my graphs.
My code so far is as follows:
import matplotlib.pyplot as plt
import sympy as sym
import numpy as np
sym.init_printing()
x, y = sym.symbols('x, y') # defining our unknown probabilities
al = np.arange(20,1000,5).reshape((196,1)) # values of alpha/beta
prob_of_strA = []
prob_of_strB = []
colours=['r','g','b','k','y']
pen_values = [[0,-5,-10,-25,-50],[0,-25,-50,-125,-250]]
fig1, ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
for j in range(0,len(pen_values[1])):
    for i in range(0,len(al)): # choosing the value of beta
        A = sym.Matrix([[10, 50], [int(al[i]), pen_values[0][j]]]) # defining matrix A
        B = sym.Matrix([[pen_values[1][j], 50], [int(al[i]), 10]]) # defining matrix B
        sigma_r = sym.Matrix([[x, 1-x]]) # defining the vector of probabilities
        sigma_c = sym.Matrix([y, 1-y]) # defining the vector of probabilities
        ts1 = A * sigma_c ; ts2 = sigma_r * B # defining our utilities
        y_sol = sym.solvers.solve(ts1[0] - ts1[1],y,dict = True) # solving for y
        x_sol = sym.solvers.solve(ts2[0] - ts2[1],x,dict = True) # solving for x
        prob_of_strA.append(y_sol[0][y]) # adding the value of y to the vector
        prob_of_strB.append(x_sol[0][x]) # adding the value of x to the vector
    ax1.plot(al,prob_of_strA,colours[j],label = ["penalty = " + str(pen_values[0][j])]) # plotting value of y for a given penalty value
    ax2.plot(al,prob_of_strB,colours[j],label = ["penalty = " + str(pen_values[1][j])]) # plotting value of x for a given penalty value
    ax1.legend() # showing the legend
    ax2.legend() # showing the legend
    prob_of_strA = [] # emptying the vector for the next round
    prob_of_strB = [] # emptying the vector for the next round
You can save a couple of lines by initializing your empty vectors inside the loop. You don't have to bother re-defining them at the end.
for j in range(0,len(pen_values[1])):
    prob_of_strA = []
    prob_of_strB = []
    for i in range(0,len(al)): # choosing the value of beta
        A = sym.Matrix([[10, 50], [int(al[i]), pen_values[0][j]]]) # defining matrix A
        B = sym.Matrix([[pen_values[1][j], 50], [int(al[i]), 10]]) # defining matrix B
        sigma_r = sym.Matrix([[x, 1-x]]) # defining the vector of probabilities
        sigma_c = sym.Matrix([y, 1-y]) # defining the vector of probabilities
        ts1 = A * sigma_c ; ts2 = sigma_r * B # defining our utilities
        y_sol = sym.solvers.solve(ts1[0] - ts1[1],y,dict = True) # solving for y
        x_sol = sym.solvers.solve(ts2[0] - ts2[1],x,dict = True) # solving for x
        prob_of_strA.append(y_sol[0][y]) # adding the value of y to the vector
        prob_of_strB.append(x_sol[0][x]) # adding the value of x to the vector
    ax1.plot(al,prob_of_strA,colours[j],label = ["penalty = " + str(pen_values[0][j])]) # plotting value of y for a given penalty value
    ax2.plot(al,prob_of_strB,colours[j],label = ["penalty = " + str(pen_values[1][j])]) # plotting value of x for a given penalty value
    ax1.legend() # showing the legend
    ax2.legend() # showing the legend
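For the axis names and titles you asked about, the fig/ax interface you are already using exposes them as methods on each Axes object. A minimal sketch (the label and title strings below are placeholders, not from the original code):
ax1.set_xlabel('alpha')                      # x-axis name
ax1.set_ylabel('probability of strategy A')  # y-axis name
ax1.set_title('Strategy A')                  # graph title
ax2.set_xlabel('alpha')
ax2.set_ylabel('probability of strategy B')
ax2.set_title('Strategy B')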
I'm creating a scatter plot from an xarray Dataset using
scat = ds.hvplot.scatter(x='a', y='b', groupby='c', height=900, width=900)
How can I add a regression line to this plot?
I'm also using the hook below to set some of the plot's properties, and I could add the Slope within the hook function, but I can't figure out how to access x and y from plot.state. This might also be completely the wrong way of doing it.
def hook(plot, element):
    print('plot.state: ', plot.state)
    print('plot.handles: ', sorted(plot.handles.keys()))
    par = np.polyfit(x, y, 1, full=True)
    gradient = par[0][0]
    y_intercept = par[0][1]
    slope = Slope(gradient=gradient, y_intercept=y_intercept,
                  line_color='orange', line_dash='dashed', line_width=3.5)
    plot.state.add_layout(slope)

scat = scat.opts(hooks=[hook])
HoloViews >= 1.13 now has support for adding a regression line to your plot, so you don't need hooks anymore.
1) You can either add the regression line yourself by specifying keywords slope and y_intercept:
import numpy as np
import holoviews as hv
hv.extension('bokeh')

gradient = 2
y_intercept = 15
# create random data
xpts = np.arange(0, 20)
ypts = gradient * xpts + y_intercept + np.random.normal(0, 4, 20)
scatter = hv.Scatter((xpts, ypts))
# create slope with hv.Slope()
slope = hv.Slope(gradient, y_intercept)
scatter.opts(size=10) * slope.opts(color='red', line_width=6)
2) Or you can have HoloViews calculate it for you with hv.Slope.from_scatter():
normal = hv.Scatter(np.random.randn(20, 2))
normal.opts(size=10) * hv.Slope.from_scatter(normal)
Resulting plot:
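To apply this to the hvplot scatter from the question (which, because of groupby='c', is a DynamicMap rather than a single Scatter element), one option is to map hv.Slope.from_scatter over it with the .apply accessor and overlay the result. This is only a sketch and I have not tested it against your data:
scat * scat.apply(hv.Slope.from_scatter)  # per-group regression line on top of the scatter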
The plot hook is given two arguments, the second of which is the element being displayed. Since the element contains the data being displayed, we can write a callback that computes the slope using the dimension_values method to get the values of the 'a' and 'b' dimensions in your data. Additionally, to avoid the Slope glyph being added multiple times, we can cache it on the plot and update its attributes:
import numpy as np
from bokeh.models import Slope

def hook(plot, element):
    x, y = element.dimension_values('a'), element.dimension_values('b')
    par = np.polyfit(x, y, 1, full=True)
    gradient = par[0][0]
    y_intercept = par[0][1]
    if 'slope' in plot.handles:
        # reuse the cached Slope annotation and just update its parameters
        slope = plot.handles['slope']
        slope.gradient = gradient
        slope.y_intercept = y_intercept
    else:
        # first call: create the Slope, cache it and add it to the bokeh figure
        slope = Slope(gradient=gradient, y_intercept=y_intercept,
                      line_color='orange', line_dash='dashed', line_width=3.5)
        plot.handles['slope'] = slope
        plot.state.add_layout(slope)
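Then attach the hook exactly as in the question:
scat = scat.opts(hooks=[hook])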
I am attempting to remove my probe's function from a signal using Fourier deconvolution, but I cannot get a correct output with test signals.
t = np.zeros(30)
t = np.append(t, np.arange(0, 20, 0.1))
sigma = 2
mu = 5.
g = 1/np.sqrt(2*np.pi*sigma**2) * np.exp(-(np.arange(mu-3*sigma,mu+3*sigma,0.1)-mu)**2/(2*sigma**2))
def pad_signals(s1, s2):
    size = t.size + g.size - 1
    size = int(2 ** np.ceil(np.log2(size)))
    s1 = np.pad(s1, ((size-s1.size)//2, int(np.ceil((size-s1.size)/2))), 'constant', constant_values=(0, 0))
    s2 = np.pad(s2, ((size-s2.size)//2, int(np.ceil((size-s2.size)/2))), 'constant', constant_values=(0, 0))
    return s1, s2

def decon_fourier_ratio(signal, removed_signal):
    signal, removed_signal = pad_signals(signal, removed_signal)
    recovered = np.fft.fftshift(np.fft.ifft(np.fft.fft(signal)/np.fft.fft(removed_signal)))
    return np.real(recovered)
gt = (np.convolve(t, g, mode='full') / g.sum())[:230]
tr = decon_fourier_ratio(gt, g)
fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True)
ax[0,0].plot(np.arange(0,np.fft.irfft(np.fft.rfft(t)).size), np.fft.irfft(np.fft.rfft(t)), label='thickness')
ax[0,1].plot(np.arange(0,np.fft.irfft(np.fft.rfft(g)).size), np.fft.irfft(np.fft.rfft(g)), label='probe shape')
ax[1,0].plot(np.arange(0,gt.size),gt, label='recorded signal')
ax[1,1].plot(np.arange(0,tr.size),tr, label='deconvolved signal')
plt.show()
The above script creates a demo sample (t) and a probe with a Gaussian shape (g). It then convolves them into a signal gt, which is what a sample would look like when probed. I pad the signals to the nearest power of two with pad_signals(), for efficiency and to fix any non-periodicity. Then I try to remove the Gaussian probe with decon_fourier_ratio(). As is clear from the images, I do not recover the initial thickness gradient. Any ideas why the deconvolution is not working?
Note: I have also tried SciPy's deconvolve, but that function only works for Gaussians of certain widths.
Any help is greatly appreciated,
Eric
Any reason you are not doing the full convolution? If I change the construction of gt to:
g /= g.sum() # so the deconvolved signal has the same amplitude
gt = np.convolve(t, g, mode='full')
Then I get the following plots:
I can't quite tell you why you're seeing this behavior, other than that the partial convolution is probably altering the frequency content. Alternatively, you can pad your input signal with zeros and use mode='same' to get the same behavior.
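A rough sketch of that zero-padding alternative, reusing t, g and decon_fourier_ratio from above (the amount of padding is my assumption, not something from the original answer):
import numpy as np

pad = g.size // 2                                # half the probe length on each side (assumed)
t_padded = np.pad(t, pad, mode='constant')       # zero-pad the input signal
gt_same = np.convolve(t_padded, g, mode='same')  # 'same' now keeps the padded length
tr_same = decon_fourier_ratio(gt_same, g)        # deconvolve exactly as before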
I have the histogram of my input data (in black) given in the following graph:
I'm trying to fit the Gamma distribution, but not to the whole data, only to the first bump of the histogram (the first mode). The green curve in the graph above corresponds to fitting the Gamma distribution to all the samples, using the following Python code, which makes use of scipy.stats.gamma:
img = IO.read(input_file)
data = img.flatten() + abs(np.min(img)) + 1
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins, patches = plt.hist(data, 1000, normed=True)
# slice histogram here
# estimation of the parameters of the gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(data, floc=0)
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
print('(alpha, beta): (%f, %f)' % (fit_alpha, fit_beta))
# plot estimated model
plt.plot(x, y, linewidth=2, color='g')
plt.show()
How can I restrict the fitting only to the interesting subset of this data?
Update1 (slicing):
I sliced the input data by keeping only values below the location of the histogram's peak, but the results were not really convincing:
This was achieved by inserting the following code below the # slice histogram here comment in the previous code:
max_data = bins[np.argmax(n)]
data = data[data < max_data]
Update2 (scipy.optimize.minimize):
The code below shows how scipy.optimize.minimize() is used to minimize an energy function to find (alpha, beta):
import matplotlib.pyplot as plt
import numpy as np
from geotiff.io import IO
from scipy.stats import gamma
from scipy.optimize import minimize
def truncated_gamma(x, max_data, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
# read image
img = IO.read(input_file)
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins = np.histogram(data, 100, normed=True)
# using minimize on a slice data below max of histogram
max_data = bins[np.argmax(n)]
data = data[data < max_data]
data = np.random.choice(data, 1000)
energy = lambda p: -np.sum(np.log(truncated_gamma(data, max_data, *p)))
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
# plot data histogram and model
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, 0, fit_beta)
plt.hist(data, 30, normed=True)
plt.plot(x, y, linewidth=2, color='g')
plt.show()
The algorithm above converged for a subset of data, and the output in o was:
x: array([ 16.66912781, 6.88105559])
But as can be seen on the screenshot below, the gamma plot doesn't fit the histogram:
You can use a general optimization tool such as scipy.optimize.minimize to fit a truncated version of the desired function, resulting in a nice fit:
First, the modified function:
def truncated_gamma(x, alpha, beta):
    gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
    norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
    return np.where(x < max_data, gammapdf / norm, 0)
This selects values from the gamma distribution where x < max_data, and zero elsewhere. The np.where part is not actually important here, because the data is exclusively to the left of max_data anyway. The key is normalization, because varying alpha and beta will change the area to the left of the truncation point in the original gamma.
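As a quick sanity check of that normalization (a sketch with hypothetical parameter values, not values taken from the data), the truncated density should integrate to one over (0, max_data) regardless of alpha and beta:
from scipy.stats import gamma
from scipy.integrate import quad

alpha, beta, max_data = 12.0, 800.0, 12000.0  # hypothetical values, for illustration only
area, _ = quad(lambda x: gamma.pdf(x, alpha, loc=0, scale=beta)
                         / gamma.cdf(max_data, alpha, loc=0, scale=beta),
               0, max_data)
print(area)  # ~1.0, so the truncated pdf is a proper density on (0, max_data)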
The rest is just optimization technicalities.
It's common practice to work with logarithms, so I used what's sometimes called "energy", i.e. the negative logarithm of the probability density.
energy = lambda p: -np.sum(np.log(truncated_gamma(data, *p)))
Minimize:
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
My output is (alpha, beta): (11.595208, 824.712481). Like the original, it is a maximum likelihood estimate.
If you're not happy with the convergence rate, you may want to:
- Select a sample from your rather big dataset:
data = np.random.choice(data, 10000)
- Try different algorithms using the method keyword argument.
Some optimization routines output a representation of the inverse Hessian, which is useful for uncertainty estimation. Enforcing nonnegativity of the parameters may also be a good idea.
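A sketch of both ideas with L-BFGS-B, which accepts bounds and exposes an approximate inverse Hessian (this reuses energy and initial_guess from above; the bounds are my choice, not part of the original answer):
from scipy.optimize import minimize
import numpy as np

o = minimize(energy, initial_guess, method='L-BFGS-B',
             bounds=[(1e-6, None), (1e-6, None)])  # keep alpha and beta positive
fit_alpha, fit_beta = o.x
approx_cov = o.hess_inv.todense()         # inverse-Hessian approximation as a dense array
param_std = np.sqrt(np.diag(approx_cov))  # rough standard errors for (alpha, beta)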
A log-scaled plot without truncation shows the entire distribution:
Here's another possible approach, using a dataset created manually in Excel that more or less matches the plot given.
Raw Data
Outline
1. Import the data into a Pandas dataframe.
2. Mask the indices after the max response index.
3. Create a mirror image of the remaining data.
4. Append the mirror image while leaving a buffer of empty space.
5. Fit the desired distribution to the modified data. Below I do a normal fit by the method of moments and adjust the amplitude and width.
Working Script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import data to dataframe.
df = pd.read_csv('sample.csv', header=0, index_col=0)
# Mask indices after index at max Y.
mask = df.index.values <= df.Y.argmax()
df = df.loc[mask, :]
scaled_y = 100*df.Y.values
# Create new df with mirror image of Y appended.
sep = 6
app_zeroes = np.append(scaled_y, np.zeros(sep, dtype=float))
mir_y = np.flipud(scaled_y)
new_y = np.append(app_zeroes, mir_y)
# Using Scipy-cookbook to fit a normal by method of moments.
idxs = np.arange(new_y.size)  # idxs = [0, 1, 2, ..., len(new_y)-1]
mid_idxs = idxs.mean()  # midpoint of the index range
# idxs - mid_idxs is symmetric about zero, e.g. [-53.5, -52.5, ..., 52.5, 53.5]
scaling_param = np.sqrt(np.abs(np.sum((idxs-mid_idxs)**2*new_y)/np.sum(new_y)))
# adjust amplitude
fmax = new_y.max()*1.2 # adjusted function max to 120% max y.
# adjust width
scaling_param = scaling_param*.7 # adjusted by 70%.
# Fit normal.
fit = lambda t: fmax*np.exp(-(t-mid_idxs)**2/(2*scaling_param**2))
# Plot results.
plt.plot(new_y, '.')
plt.plot(fit(idxs), '--')
plt.show()
Result
See the scipy-cookbook fitting data page for more on fitting a normal using method of moments.
I'm learning digital signal processing in order to implement filters, and am using Python to quickly test ideas. I've just started using the scipy.signal library to find the impulse response and frequency response of different filters.
Currently I am working through the book "Digital Signals, Processors and Noise" by Paul A. Lynn (1992) (and finding it an amazing resource for learning this stuff). In this book they have a filter with the transfer function
$$H(z) = \frac{z^5 - z^4 + z^3 - z^2}{z^5 + 0.54048z^4 - 0.62519z^3 - 0.66354z^2 + 0.60317z + 0.69341}.$$
I divided the numerator and denominator by $z^5$ in order to get the following equation in powers of $z^{-1}$:
$$H(z) = \frac{1 - z^{-1} + z^{-2} - z^{-3}}{1 + 0.54048z^{-1} - 0.62519z^{-2} - 0.66354z^{-3} + 0.60317z^{-4} + 0.69341z^{-5}}$$
I then implemented this with Scipy using:
NumeratorZcoefs = [1, -1, 1, -1]
DenominatorZcoefs = [1, 0.54048, -0.62519, -0.66354, 0.60317, 0.69341]
FreqResponse = scipy.signal.freqz(NumeratorZcoefs, DenominatorZcoefs)
fig = plt.figure(figsize = [8, 6])
ax = fig.add_subplot(111)
ax.plot(FreqResponse[0], abs(np.array(FreqResponse[1])))
ax.set_xlim(0, 2*np.pi)
ax.set_xlabel("$\Omega$")
which produces the plot shown below:
However in the book the frequency response is shown to be the following:
They are the same shape, but the ratio of the peaks at ~2.3 and ~0.5 is very different in the two plots. Could someone suggest why this is?
Edit:
To add to this, I've just implemented a function to calculate the frequency response by hand (by calculating the distances from the poles and zeros of the function). I get a similar ratio to the plot generated by scipy.signal, but the numbers are not the same. Does anyone know why this might be?
Implementation is as follows:
def H(omega):
    z1 = np.array([0, 0])   # zero at 0, 0
    z2 = np.array([0, 0])   # Another zero at 0, 0
    z3 = np.array([0, 1])   # zero at i
    z4 = np.array([0, -1])  # zero at -i
    z5 = np.array([1, 0])   # zero at 1
    z = np.array([z1, z2, z3, z4, z5])
    p1 = np.array([-0.8, 0])
    p = cmath.rect(0.98, np.pi/4)
    p2 = np.array([p.real, p.imag])
    p = cmath.rect(0.98, -np.pi/4)
    p3 = np.array([p.real, p.imag])
    p = cmath.rect(0.95, 5*np.pi/6)
    p4 = np.array([p.real, p.imag])
    p = cmath.rect(0.95, -5*np.pi/6)
    p5 = np.array([p.real, p.imag])
    p = np.array([p1, p2, p3, p4, p5])
    a = cmath.rect(1, omega)
    a_2dvector = np.array([a.real, a.imag])
    dz = z - a_2dvector
    dp = p - a_2dvector
    dzmag = []
    for dis in dz:
        dzmag.append(np.sqrt(dis.dot(dis)))
    dpmag = []
    for dis in dp:
        dpmag.append(np.sqrt(dis.dot(dis)))
    return np.product(dzmag) / np.product(dpmag)
I then plot the frequency response like so:
omegalist = np.linspace(0,2*np.pi,5000)
Hlist = []
for omega in omegalist:
    Hlist.append(H(omega))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(omegalist, Hlist)
ax.set_xlabel("$\Omega$")
ax.set_ylabel("$|H(\Omega)|$")
and get the following plot:
The SciPy-generated frequency response is correct. In any case, I wouldn't trust the book's figure, which appears to have been drawn by hand.
If you want to find the frequency response "manually", this can simply be done by defining a function that returns the original Z-transform and evaluating it on the unit circle, as follows:
import numpy as np
import matplotlib.pyplot as plt

def H(z):
    num = z**5 - z**4 + z**3 - z**2
    denom = z**5 + 0.54048*z**4 - 0.62519*z**3 - 0.66354*z**2 + 0.60317*z + 0.69341
    return num / denom

w_range = np.linspace(0, 2*np.pi, 1000)
plt.plot(w_range, np.abs(H(np.exp(1j*w_range))))
The result is exactly the same as SciPy.
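As a quick cross-check (a sketch reusing the coefficient lists from the question), evaluating scipy.signal.freqz on the same frequency grid and overlaying it on |H(z)| shows the two curves coincide:
import numpy as np
import scipy.signal
import matplotlib.pyplot as plt

w_range = np.linspace(0, 2*np.pi, 1000)
w, h = scipy.signal.freqz([1, -1, 1, -1],
                          [1, 0.54048, -0.62519, -0.66354, 0.60317, 0.69341],
                          worN=w_range)
plt.plot(w, np.abs(h), label='scipy.signal.freqz')
plt.plot(w_range, np.abs(H(np.exp(1j*w_range))), '--', label='|H(z)| on the unit circle')
plt.legend()
plt.show()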