I have implemented an algorithm (RLSR) that has two regularization factors. Depending on the values of these two factors, my cost function decreases or increases. Right now I visualize the error with plt.scatter, passing the error as the color, which results in:
The problem is that the values on my y-axis are very small, so as you can see they overlap and I cannot see some parts of my results.
alpha_list=[1e-11,1e-10,1e-10,5*1e-10,8*1e-10,1e-8,1e-8,5*1e-8,8*1e-6,1e-6,1e-6,5*1e-6,8*1e-6,1e-4,1e-4,5*1e-4,8*1e-4,1e-3,1e-3,5*1e-3,6*1e-3,8*1e-3]
I tried decreasing the transparency, but it didn't help much.
Here is how I implemented it:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

eigenvalues, alphaa = np.meshgrid(eigRange, alpha_list)
fig = plt.figure()
DatavmaxTrain = np.max(normCostTrain)
DatavminTrain = np.min(normCostTrain)
DatavmaxTest = np.max(normCostTest)
DatavminTest = np.min(normCostTest)
plt.subplot(211)
plt.scatter(eigenvalues, alphaa, s=130, c=normCostTrain, cmap=cm.PuOr,
            vmin=DatavminTrain, vmax=DatavmaxTrain, alpha=0.70)  # ----- for train
cb1 = plt.colorbar()
cb1.set_label("normalized square error")
plt.title("Train")
plt.xlabel("No. of Eigenvalues")
plt.ylabel("Regularization parameter")
So I am looking for a better way to visualize my data.
Thanks
How about plotting the log of the alpha_list values?
alpha_list = np.log(alpha_list)
There is still some overlap, but at least the values are more evenly spread out:
import matplotlib.pyplot as plt
import numpy as np
alpha_list=[1e-11,1e-10,1e-10,5*1e-10,8*1e-10,1e-8,1e-8,5*1e-8,8*1e-6,1e-6,1e-6,5*1e-6,8*1e-6,1e-4,1e-4,5*1e-4,8*1e-4,1e-3,1e-3,5*1e-3,6*1e-3,8*1e-3]
alpha_list = np.log(alpha_list)
eigRange = np.linspace(0,19,20)
eigenvalues,alphaa = np.meshgrid(eigRange,alpha_list )
normCostTrain = np.random.random((len(alpha_list),len(eigRange)))
fig = plt.figure()
DatavmaxTrain = np.max(normCostTrain)
DatavminTrain = np.min(normCostTrain)
plt.scatter(eigenvalues,alphaa,s = 130, c=normCostTrain,cmap=plt.get_cmap('PuOr'),
vmin=DatavminTrain, vmax=DatavmaxTrain, alpha=0.70) #-----for train
cb1=plt.colorbar()
cb1.set_label("normalized square error")
plt.title("Train ")
plt.xlabel("No. of Eigenvalues")
plt.ylabel("Log(Regularization parameter)")
plt.show()
This yields a plot with the points more evenly spread along the y-axis:
Here is an example of a 3D scatter plot of the same data, with the z-axis and the color both used to represent the "normalized square error".
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
alpha_list = [1e-11, 1e-10, 1e-10, 5*1e-10, 8*1e-10, 1e-8, 1e-8, 5*1e-8, 8*1e-6,
1e-6, 1e-6, 5*1e-6, 8*1e-6, 1e-4, 1e-4, 5*1e-4, 8*1e-4, 1e-3, 1e-3,
5*1e-3, 6*1e-3, 8*1e-3]
alpha_list = np.log(alpha_list)
eigRange = np.linspace(0, 19, 20)
eigenvalues, alphaa = np.meshgrid(eigRange, alpha_list )
eigenvalues = eigenvalues.ravel()
alphaa = alphaa.ravel()
normCostTrain = np.random.random((len(alpha_list), len(eigRange))).ravel()
DatavmaxTrain = np.max(normCostTrain)
DatavminTrain = np.min(normCostTrain)
PuOr = plt.get_cmap('PuOr')
ax.scatter(eigenvalues, alphaa, normCostTrain,
           c=normCostTrain.ravel(),
           s=30,
           cmap=PuOr,
           vmin=DatavminTrain,
           vmax=DatavmaxTrain,
           alpha=0.70)  # ----- for train
m = cm.ScalarMappable(cmap=PuOr)
m.set_array(normCostTrain)
cb1 = plt.colorbar(m, ax=ax)  # explicit mappable; ax is required on recent matplotlib
cb1.set_label("normalized square error")
plt.title("Train ")
ax.set_xlabel("No. of Eigenvalues")
ax.set_ylabel("Log(Regularization parameter)")
ax.set_zlabel("normalized square error")
plt.show()
I'm not sure if this is an improvement. The points are a bit jumbled together, but are distinguishable if you drag the mouse to rotate the plot.
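If rotating the figure by hand is inconvenient, here is a minimal sketch (assuming the fig and ax objects from the snippet above) that saves the same scene from a few fixed viewpoints instead:
# Save several fixed viewpoints of the 3D scatter so the jumbled points
# can be inspected without interactive rotation.
for i, azim in enumerate([-60, -30, 0, 30]):
    ax.view_init(elev=25, azim=azim)   # elevation and azimuth in degrees
    fig.savefig("train_3d_view_%d.png" % i)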
I attempted to plot Gaussian kernel density estimate curves along with the histograms of two data sets in Python.
However, in my script the estimation of the 95% limit (data1, marked by the red vertical line) and the 5% limit (data2, marked by the black vertical line) is very time-consuming: I have to test different limits by hand (detailed in the code below, where I change the upper limit) until the integral of the kernel density curve reaches 95% or 5%.
Could someone help me out here and suggest a way to fix this issue, or another approach to plot the kernel density curve along with its 95% and 5% probability limits?
Thank you!
My script is here.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats

data1 = result['95_24']  # data 1 (result is my dataframe)
data2 = result['5_24']   # data 2
def plot_prob_density(data1, data2, x_start1, x_end1):
    fig, (ax1) = plt.subplots(1, 1, figsize=(6, 5), sharey=False)
    unit = 1.5
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]
    # Histogram plot of the data
    ax1.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)
    ax1.hist(data2, bins=np.linspace(-20, 20, 40), density=True, color='k', alpha=0.4)
    # Kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
    kd_data2 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data2)
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    kd_vals_data2 = np.exp(kd_data2.score_samples(x))
    # Density plots
    ax1.plot(x, kd_vals_data1, color='r', label='$Na$', linewidth=2)
    ax1.plot(x, kd_vals_data2, color='k', label='$Λ$', linewidth=2)
    # Vertical lines at the limits found with get_probability below
    ax1.axvline(x=x_end1, color='red', linestyle='dashed', linewidth=3, label='$β_{95\%}$')
    ax1.axvline(x=x_start1, color='k', linestyle='dashed', linewidth=3, label='$β_{5\%}$')
    # Show the plots
    ax1.set_ylabel('Probability density', fontsize=12)
    ax1.set_xlabel('Beta', fontsize=12)
    ax1.set_xlim([-20, 20])
    ax1.set_ylim(0, 0.3)
    ax1.set_yticks([0, 0.1, 0.2, 0.3])
    ax1.set_xticks([-20, -10, 0, 10, 20])
    ax1.legend(fontsize=12, loc='upper left', frameon=False)
    fig.tight_layout()
    gc.collect()
    return kd_data1, kd_data2
# Calculation of the 95% and 5% limits for the data1 and data2 kernel density curves
def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Values in the range
    kd_vals = np.exp(kd.score_samples(x))  # PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
data2 = np.array(data2).reshape(-1, 1)
kd_data1, kd_data2 = plot_prob_density(data1, data2, x_start1=-2.2, x_end1=5.3)
# ##############################
print('Beta-95%: {}'
      .format(get_probability(start_value=-20,
                              end_value=5.3,
                              eval_points=1000,
                              kd=kd_data1)))
# Here I modify the end value each time and check the output; when it reaches
# 95% I take that value as the 95% limit. This is very tedious: I want to
# compute the 95% value directly, and likewise the 5% probability computed below.
print('Beta-5%: {}\n'
      .format(get_probability(start_value=-20,
                              end_value=-2.2,
                              eval_points=1000,
                              kd=kd_data2)))
####################################################################
plt.savefig("Ev_test.png")
The pictorial representation is also attached here.
Histogram and kernel density plot along with its 95% and 5% probability limits highlighted with red and black vertical bold lines:
Here is a possible way to fix this issue. Note that the method proposed in the question has an error in the percentile calculation, so I recommend not using it:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

fig = plt.figure(figsize=(4, 4), dpi=300)
ax = fig.add_subplot(111)
# Plot the histograms (data7 and data8 are the two data sets)
ax.hist(data8, bins=20, zorder=1, color="r", density=True, alpha=0.6)
ax.hist(data7, bins=20, zorder=1, color="black", density=True, alpha=0.6)
# Fit and plot the KDEs for various bandwidths
kde = sm.nonparametric.KDEUnivariate(data8)
kde1 = sm.nonparametric.KDEUnivariate(data7)
for bandwidth in [1.8]:
    kde.fit(bw=bandwidth)   # estimate the densities
    kde1.fit(bw=bandwidth)
    ax.plot(kde.support, kde.density, "-", lw=2, color="r", zorder=10, alpha=0.6, label="Data1")
    ax.plot(kde1.support, kde1.density, "-", lw=2, color="black", zorder=10, alpha=0.6, label="Data2")
ax.legend(loc="best")
ax.set_xlim([-20, 40])
ax.set_ylim([0, 0.3])
ax.grid(False)
# Probability calculation via the inverse CDF (icdf = quantile function)
quantiles_mesh = np.linspace(0, 1, len(kde.density))
fig = plt.figure(figsize=(2, 2), dpi=300)
plt.plot(quantiles_mesh, kde.icdf)
data_1_95 = np.percentile(kde.icdf, 95)   # 95% limit for data8 (Data1)
data_2_5 = np.percentile(kde1.icdf, 5)    # 5% limit for data7 (Data2)
ax.axvline(x=data_1_95, color='red', linestyle='dashed', linewidth=2)
ax.axvline(x=data_2_5, color='k', linestyle='dashed', linewidth=2)
# plt.savefig("KDE_Plot.png")
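For reference, the same cutoff can be computed without statsmodels. This is only a sketch using SciPy (it assumes data8 is the 1-D array used above): gaussian_kde exposes the CDF through integrate_box_1d, so the 95% point can be found with a root finder instead of reading it off kde.icdf.
import numpy as np
from scipy.optimize import brentq
from scipy.stats import gaussian_kde

# Invert the KDE's CDF numerically: find b with P(X <= b) = 0.95.
kde_sp = gaussian_kde(np.ravel(data8))
beta_95 = brentq(lambda b: kde_sp.integrate_box_1d(-np.inf, b) - 0.95, -50, 50)
print(beta_95)
The bracket [-50, 50] is an assumption based on the data range above; widen it if your values fall outside.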
I'm trying to plot a series of frequency spectra in a 3D space using PolyCollection. My goal is to set "facecolors" as a gradient, i.e., the higher the magnitude, the lighter the color.
Please see this image for reference (I am not looking for the fancy design, just the gradients).
I tried to use the cmap argument of PolyCollection, but I was unsuccessful.
I came this far with the following code adapted from here:
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
from mpl_toolkits.mplot3d import axes3d
import numpy as np
from scipy.ndimage import gaussian_filter1d
def plot_poly(magnitudes):
    freq_data = np.arange(magnitudes.shape[0])[:, None] * np.ones(magnitudes.shape[1])[None, :]
    mag_data = magnitudes
    rad_data = np.linspace(1, magnitudes.shape[1], magnitudes.shape[1])

    # Build one closed polygon (spectrum) per measurement
    verts = []
    for irad in range(len(rad_data)):
        xs = np.concatenate([[freq_data[0, irad]], freq_data[:, irad], [freq_data[-1, irad]]])
        ys = np.concatenate([[0], mag_data[:, irad], [0]])
        verts.append(list(zip(xs, ys)))

    poly = PolyCollection(verts, edgecolor='white', linewidths=0.5, cmap='Greys')
    poly.set_alpha(.7)

    fig = plt.figure(figsize=(24, 16))
    ax = fig.add_subplot(111, projection='3d', proj_type='ortho')
    ax.add_collection3d(poly, zs=rad_data, zdir='y')
    ax.set_xlim3d(freq_data.min(), freq_data.max())
    ax.set_xlabel('Frequency')
    ax.set_ylim3d(rad_data.min(), rad_data.max())
    ax.set_ylabel('Measurement')
    ax.set_zlabel('Magnitude')

    # Remove gray panes and axis grid
    ax.xaxis.pane.fill = False
    ax.xaxis.pane.set_edgecolor('white')
    ax.yaxis.pane.fill = False
    ax.yaxis.pane.set_edgecolor('white')
    ax.zaxis.pane.fill = False
    ax.zaxis.pane.set_edgecolor('white')
    ax.view_init(50, -60)
    plt.show()
sample_data = np.random.rand(2205, 4)
sample_data = gaussian_filter1d(sample_data, sigma=10, axis=0)  # just to smooth the curves
plot_poly(sample_data)
Besides the missing gradients I am happy with the output of the code above.
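For what it's worth, the cmap argument of PolyCollection only takes effect once a scalar array is attached with set_array, and it then assigns a single face color to each polygon rather than a gradient inside the polygon. Below is a minimal sketch of that per-polygon coloring, as a drop-in replacement for the poly = PolyCollection(...) lines inside plot_poly above (coloring by peak magnitude is my assumption):
# One uniform face color per polygon, driven by the colormap. This is not
# a true within-polygon gradient: a Collection maps one scalar (set via
# set_array) to one color per polygon.
peak = mag_data.max(axis=0)  # one scalar per measurement/polygon
poly = PolyCollection(verts, edgecolor='white', linewidths=0.5, cmap='Greys_r')
poly.set_array(peak)                    # activates the colormap
poly.set_clim(peak.min(), peak.max())   # higher peak -> lighter with Greys_r
poly.set_alpha(.7)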
I currently have a scale problem with a heatmap:
As you can see, at the beginning and at the end there is a temperature variation, but since it occurs over a very small distance and the scale is large, we can hardly see it.
Is there a way, or a function, to fix this and automatically apply a better scale: a fine scale where there is variation and a coarse scale where there is not?
Here is the code to generate this image :
x = np.linspace(0,L,Nx+1) #array for y-axis
t = np.linspace(0.0, t_fin,Nt+1) #array to plot the time in the title
x = np.round(x,2) #change decimals
t = np.round(t,5)
y = np.arange(T[Nt,:].shape[0]) #T[Nt,:] is an array that contains the temperature
my_yticks = x #change the number of points in the y-axis
frequency = 100
data = np.vstack(T[Nt,:]) #to use in imshow
df = pd.DataFrame(data)
fig = plt.figure(figsize=(3,9)) #plotting
titre = f"Température à {t[Nt]} s"
plt.ylabel('Profondeur en m')
plt.yticks(y[::frequency], my_yticks[::frequency])
im = plt.imshow(df, cmap='jet', aspect ='auto', interpolation='bilinear')
ax = plt.gca()
ax.get_xaxis().set_visible(False)
cb = plt.colorbar()
cb.set_label('Température en °C')
plt.title(titre)
If you have any questions, do not hesitate to ask.
Thank you!
You could use a logit scale on the y-axis. This won't, however, work with imshow, as the values must be strictly between 0 and 1. You could use pcolormesh instead.
import matplotlib.pyplot as plt
import numpy as np
M,N = 100,20
a = np.array(M*[15])
a[:3] = [0,5,10]
a[-3:] = [20,25,30]
a = np.outer(a, np.ones(N))
fig, (axl,axr) = plt.subplots(ncols=2, figsize=(3,6))
axl.imshow(a, cmap='jet', aspect ='auto', interpolation='bilinear', extent=(N,0,M,0))
axl.yaxis.set_ticklabels([f'{t/M*10:.3g}' for t in axl.yaxis.get_ticklocs()])
axl.get_xaxis().set_visible(False)
axl.set_title('linear')
eps = 1e-3
X, Y = np.meshgrid(np.linspace(0, 1, N), np.linspace(1-eps, eps, M))
cm = axr.pcolormesh(X, Y, a, cmap='jet', shading='gouraud')
axr.set_yscale('logit')
axr.yaxis.set_ticklabels([f'{10*(1-t):.3g}' for t in axr.yaxis.get_ticklocs()])
axr.get_xaxis().set_visible(False)
axr.set_title('logit')
cb = plt.colorbar(cm, pad=0.2)
Warning: setting the y tick labels to fixed values is only useful if you don't pan or zoom the image.
You can set the y-axis limits with ax.set_ylim([ymin, ymax]).
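If you do want working pan/zoom, a small sketch (reusing the logit-scale axes axr from above) is to compute the depth label from each tick position with a FuncFormatter instead of freezing the label strings:
import matplotlib.ticker as mticker

# Recompute the label from the tick location, so labels stay correct
# after panning or zooming.
axr.yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda t, pos: f'{10 * (1 - t):.3g}'))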
I have a heavily right-skewed histogram and would like to calculate the probability for a range of lifetime values (the area under the PDF curve), for instance the probability that the lifetime value is in (0, 0.01).
Dataframe consisting of LTV calculated by cumulative revenue/ cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried scikit-learn's KernelDensity; however, after fitting it to the histogram, it does not capture the over-represented 0s.
import gc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

def plot_prob_density(df_lunch, field, x_start, x_end):
    plt.figure(figsize=(10, 7))
    unit = 0
    x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
    # Plot the data using a normalized histogram
    plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
    # Do kernel density estimation
    kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch)
    # Plot the estimated density
    kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
    plt.plot(x, kd_vals_lunch, color='orange')
    plt.axvline(x=x_start, color='red', linestyle='dashed')
    plt.axvline(x=x_end, color='red', linestyle='dashed')
    # Show the plots
    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()
    gc.collect()
    return kd_lunch

kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1, 1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Values in the range
    kd_vals = np.exp(kd.score_samples(x))  # PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)

print('Probability of LTV 0-0.01 during LUNCH time: {}\n'
      .format(get_probability(start_value=0,
                              end_value=0.01,
                              eval_points=100,
                              kd=kd_lunch)))
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
Plot:
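(One possible direction for the zeros, sketched here under my own assumptions rather than taken from the answer below: treat LTV as a mixture of an exact point mass at 0 and a KDE fitted only on the positive values, then add the two contributions when computing an interval probability.)
import numpy as np
from sklearn.neighbors import KernelDensity

# Zero-inflated sketch: exact share of zeros plus a KDE on the positives.
# `final_df` and the bandwidth are assumptions, as in the snippet above.
ltv = final_df['LTV'].values
p_zero = np.mean(ltv == 0)                 # point mass at exactly 0
pos = ltv[ltv > 0].reshape(-1, 1)
kd_pos = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(pos)

def prob_interval(a, b, n=1000):
    # P(a <= LTV <= b) under the mixture model
    x = np.linspace(a, b, n)[:, np.newaxis]
    step = (b - a) / (n - 1)
    cont = np.sum(np.exp(kd_pos.score_samples(x))) * step
    return p_zero * (a <= 0 <= b) + (1 - p_zero) * cont

print(prob_interval(0, 0.01))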
I have used a more or less similar script for my work; here is my script, maybe it will be helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
    plt.figure(figsize=(4, 3.5))
    unit = 1.5
    x = np.linspace(-20, 20, 1000)[:, np.newaxis]
    # Plot the data using a normalized histogram
    plt.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)
    # Do kernel density estimation
    kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
    # Plot the estimated density
    kd_vals_data1 = np.exp(kd_data1.score_samples(x))
    plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth=2)
    plt.axvline(x=9.95, color='green', linestyle='dashed', linewidth=2.0, label='$β_o$')
    plt.axvline(x=1.9, color='black', linestyle='dashed', linewidth=2.0, label='$β_b$')
    plt.axvline(x=x_end, color='red', linestyle='dashed', linewidth=2, label='$β_{95\%}$')
    # Show the plot
    plt.xlabel('Beta', fontsize=10)
    plt.ylabel('Probability Density', fontsize=10)
    plt.title('02 hours window', fontsize=12)
    plt.xlim(-20, 20)
    plt.ylim(0, 0.3)
    plt.yticks([0, 0.1, 0.2, 0.3])
    plt.legend(fontsize=12, loc='upper left', frameon=False)
    plt.show()
    gc.collect()
    return kd_data1
def get_probability(start_value, end_value, eval_points, kd):
    # Number of evaluation points
    N = eval_points
    step = (end_value - start_value) / (N - 1)  # Step size
    x = np.linspace(start_value, end_value, N)[:, np.newaxis]  # Values in the range
    kd_vals = np.exp(kd.score_samples(x))  # PDF values for each x
    probability = np.sum(kd_vals * step)  # Approximate the integral of the PDF
    return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
.format(get_probability(start_value = -10,
end_value = 13,
eval_points = 1000,
kd = kd_data1)))
When illustrating gradient descent, we usually see a bowl-shaped graph like the one below. It is also said that using log loss instead of squared error makes it easier to find the minimum of the loss, because using squared error as the loss function may result in multiple local minima.
Therefore, I want to plot a bowl-shaped graph like the one below.
However, I only managed to plot the following.
Here is my code. Could anyone help me fix it? Thanks.
from mpl_toolkits.mplot3d.axes3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import math
fig, ax1 = plt.subplots(1, 1, figsize=(8, 5), subplot_kw={'projection': '3d'})
# Get the test data
x1 = 1
x2 = 1
y = 0.8
w = np.linspace(-10,10,100)
# w = np.random.random(100)
wl = np.linspace(-10,10,100)
# wl = np.random.random(100)
w1 = np.ones((100,100))
w2 = np.ones((100,100))
for idx in range(100):
    w1[idx] = w1[idx] * w
    w2[:, idx] = w2[:, idx] * wl

L = []
for i in range(w1.shape[0]):
    for j in range(w1.shape[1]):
        a = w1[i, j] * x1 + w2[i, j] * x2
        f = 1 / (1 + math.exp(-a))
        l = -(y * math.log(f) + (1 - y) * math.log(1 - f))
        # l = (1/2)*(f-y)**2
        L.append(l)
l = np.array(L).reshape(w1.shape)
ax1.plot_wireframe(w1,w2,l)
ax1.set_title("plot backpropogation")
plt.tight_layout()
plt.show()
The following ignores the formula from the question and is probably completely unrelated to the actual problem; it just shows how to plot a bowl.
One way to plot a bowl is to use a function that is rotationally symmetric about the z-axis.
For example:
from mpl_toolkits.mplot3d.axes3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
fig, ax1 = plt.subplots(figsize=(8, 5),
subplot_kw={'projection': '3d'})
alpha = 0.8
r = np.linspace(-alpha,alpha,100)
X,Y= np.meshgrid(r,r)
l = 1./(1+np.exp(-(X**2+Y**2)))
ax1.plot_wireframe(X,Y,l)
ax1.set_title("plot")
plt.show()
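For completeness, here is a sketch of the loss surface from the question itself, vectorized with NumPy instead of the double loop. Because this log loss depends on the weights only through the single linear combination w1*x1 + w2*x2, the surface is a valley that is flat along the lines w1 + w2 = const, not a rotationally symmetric bowl, which is why the question's plot does not look like the reference image.
from mpl_toolkits.mplot3d.axes3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

x1 = x2 = 1
y = 0.8
w = np.linspace(-10, 10, 100)
W1, W2 = np.meshgrid(w, w)
A = W1 * x1 + W2 * x2                            # the single linear combination
F = 1 / (1 + np.exp(-A))                         # sigmoid
L = -(y * np.log(F) + (1 - y) * np.log(1 - F))   # log loss

fig, ax1 = plt.subplots(figsize=(8, 5), subplot_kw={'projection': '3d'})
ax1.plot_wireframe(W1, W2, L)
ax1.set_title("log loss surface (a valley, not a bowl)")
plt.show()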