What does `log = True` actually do in matplotlib? - python

My MCVE:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
#normal sample
mu, sigma = 100, 15
x = mu + sigma*np.random.randn(10000)
#histogram
n, bins, patches = plt.hist(x, 50)
plt.axis([40, 160, 0, 800])
n2, bins2, patches2 = plt.hist(x,50,log=True)
n1 = np.log(n)
now why n2 is different from n1? I thought that the log = True scaled everything logarithmically.. but it doesn't. So what is it doing? Same thing happens for bins2 and bins1 = np.log(bins).
edit
This
mu, sigma = 100, 15
x = mu + sigma*np.random.randn(10000)
hist, bin_edges = np.histogram(x,50)
nwidth = bin_edges[7]-bin_edges[6] #just difference between two random bins
plt.bar(np.delete(bin_edges,len(bin_edges)-1),hist,nwidth) #so I have the right number of bins
plt.show()
gives me
then by doing
histsq = np.sqrt(np.log(hist))
plt.bar(np.delete(bin_edges,len(bin_edges)-1),histsq,nwidth)
plt.show()
I obtain

When you set log = True, the histogram axis (not return parameters) is in log scale. The return parameters (n,bins), ie the values of the bins and the edges of the bins are the same for log = True and log = False. What that means is n==n2 and bins == bins2 are both True.
See the code below to see if that is true
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
#normal sample
mu, sigma = 100, 15
x = mu + sigma*np.random.randn(10000)
#histogram
plt.subplot(3, 1, 1)
n, bins, patches = plt.hist(x, 50)
# plt.axis([40, 160, 0, 800])
plt.subplot(3, 1, 2)
n2, bins2, patches2 = plt.hist(x,50,log=True)
if (all(n==n2) and all(bins==bins2)):
print 'Return parameters of hist with log option or without log option are always the same'
#if you want to apply two transformation on the return data and visualize it.
n1 = np.log(n)
plt.subplot(3, 1, 3)
xx = bins[1:]
yy = np.sqrt(n1)
plt.bar(xx,yy, width = xx[1]-xx[0])
# square root of inverted parabola is not a linear line, but has some curvature
x = np.arange(-1,1.1,.1)
y = 10-(x)**2 # inverted parabola
y1 = np.sqrt(y) # square of inverted parabola
fig, ax1 = plt.subplots()
ax1.plot(x, y, 'b-')
# Make the y-axis label, ticks and tick labels match the line color.
ax1.set_ylabel('inverted parabola', color='b')
ax1.tick_params('y', colors='b')
ax2 = ax1.twinx()
ax2.plot(x, y1, 'r.')
ax2.set_ylabel('sqrt of inverted parabola', color='r')
ax2.tick_params('y', colors='r')
fig.tight_layout()
plt.show()
will return
Return parameters of hist method with log option or without log option are always the same

Related

Trying to rescale the plot to show small concentration differences

In the attached image, only one small dot is seen, representing the point source, but I want to be able to see the entire concentration spread across the map.
cmap = "jet"
fig = plt.figure()
proj = ccrs.PlateCarree()
ax = plt.axes(projection=proj)
cs = ax.contourf(
lon,
lat,
depo,
transform=ccrs.PlateCarree(),
cmap=cmap,
vmin=depo.min(),
vmax=depo.max(),
extend="max",
)
cbar = fig.colorbar(cs, shrink=0.8)
ax.coastlines()
ax.set_title("Conc (ng/m2) 2020-01-28 18.00")
ax.set_xticks([-40, 0, 40])
ax.set_xlabel("longitude [degE]", visible="true")
ax.set_yticks([10, 40, 70])
ax.set_ylabel("latitude [degN]", visible="true")
The problem is that you are scaling the color map with vmax to the largest occuring value in your dataset. Try something like vmax=0.1 * depo.max(). Sometimes, the mean value is also useful, e.g. vmax=1.5 * depo.mean() but that depends on the distribution of the values. Also the colors can be easiest specified with the levels parameter:
Here is a small example that you can use as starting point
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
x = np.linspace(-2, 2, 51) + 1
y = np.linspace(-2, 2, 51) - 0.5
xx, yy = np.meshgrid(x, y)
zz = 1 / (xx ** 2 + yy ** 2)
norm=mpl.colors.Normalize(vmin=-0.5, vmax=1))
cs = plt.contourf(xx, yy, zz, cmap='RdBu', levels=np.linspace(-0.5, 1, 10), extend='both')
cbar = plt.colorbar(cs, shrink=0.8)

Plotting a line between 2-D point to its corresponding value in 3-D

I am trying to plot a dashed line in a 3-D Matplotlib plot. I would like to get a dashed line between each (x_pt, y_pt) to its corresponding z_pt.
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'cm'
matplotlib.rcParams['axes.labelsize'] = 13
def z_function(x, y):
a = 1
b = 5.1/(4*np.pi**2)
c = 5/np.pi
r = 6
s = 10
t = 1/(8*np.pi)
return a*(y - b*x**2 + c*x - r)**2 + s*(1 - t)*np.cos(x) + s
x = np.linspace(-5, 10, 100)
y = np.linspace(0, 15, 100)
indexes = np.random.randint(0, 100, 5)
x_pt = x[indexes]
y_pt = y[indexes]
z_pt = z_function(x_pt, y_pt)
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(x_pt, y_pt, color='k', marker='x', depthshade=False)
ax.scatter(x_pt, y_pt, z_pt, color='k', marker='^', depthshade=False)
ax.set_xticks([-5, 0, 5, 10])
ax.set_yticks([0, 5, 10, 15])
ax.set_zticks([100, 200, 300])
ax.view_init(30, -120)
ax.set_xlabel(r'$x_1$')
ax.set_ylabel(r'$x_2$')
ax.zaxis.set_rotate_label(False)
ax.set_zlabel(r'$f(x)$', rotation=0)
ax.w_xaxis.pane.fill = False
ax.w_yaxis.pane.fill = False
ax.w_zaxis.pane.fill = False
plt.show()
Can anyone help me with this?
If I understand your problem correctly, you need to connect the point (x,y,0) to (x,y,z) like so:
for x_,y_,z_ in zip(x_pt, y_pt, z_pt):
ax.plot([x_,x_],[y_,y_],[0,z_], '--', c='grey')
It should be as simple as:
ax.plot(x_pt, y_pt, zs=z_pt, color='blue', marker='--', depthshade=False)
alternatively using:
ax.plot3D(x_pt, y_pt, z_pt, marker='--')
UPDATE:
You will need to create extra dummy coordinates for each point on the x-y axis, like so:
import numpy as np
n = 10 # number of points in the line
for i in len(x_pt):
x_range = np.linspace(0, x_pt[i], n)
y_range = np.linspace(0, y_pt[i], n)
ax.plot3D(x_range, y_range, [z_pt[i]]*n, marker='--')
NOTE: Untested

Calculating PDF given a histogram

I have a heavily right-skewed histogram and would like to calculate the probabilities for a range of Lifetimevalues (Area under the curve, the PDF). For instance, the probability that the Lifetime value is in (0-0.01)
Dataframe consisting of LTV calculated by cumulative revenue/ cumulative installs:
df['LTV'] is
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)
I have tried utilizing SKlearn's KernelDensity, however, after fitting it to the histogram it does not capture the over-represented 0s.
import gc
from sklearn.neighbors import KernelDensity
def plot_prob_density(df_lunch, field, x_start, x_end):
plt.figure(figsize = (10, 7))
unit = 0
x = np.linspace(df_lunch.min() - unit, df_lunch.max() + unit, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)
# Do kernel density estimation
kd_lunch = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch) #0.00187
# Plot the estimated densty
kd_vals_lunch = np.exp(kd_lunch.score_samples(x))
plt.plot(x, kd_vals_lunch, color='orange')
plt.axvline(x=x_start,color='red',linestyle='dashed')
plt.axvline(x=x_end,color='red',linestyle='dashed')
# Show the plots
plt.xlabel(field, fontsize=15)
plt.ylabel('Probability Density', fontsize=15)
plt.legend(fontsize=15)
plt.show()
gc.collect()
return kd_lunch
kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1,1), 'LTV', x_start=0, x_end=0.01)
Then finding the probabilities like this:
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
print('Probability of LTV 0-3 tips during LUNCH time: {}\n'
.format(get_probability(start_value = 0,
end_value = 0.01,
eval_points = 100,
kd = kd_lunch)))
However, this method does not yield the appropriate PDF values we were aiming for.
Any suggestions for alternative methods would be appreciated.
PLot:
I have used more or less similar script for my work, here is my script may be it will be helpful for you.
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
data1 = beta_95[0]
def plot_prob_density(data1, x_start, x_end):
plt.figure(figsize = (4, 3.5))
unit = 1.5
x = np.linspace(-20, 20, 1000)[:, np.newaxis]
# Plot the data using a normalized histogram
plt.hist(data1, bins=np.linspace(-20,20,40), density=True, color='r', alpha=0.4)
#plt.show
# Do kernel density estimation
kd_data1 = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)
# Plot the estimated densty
kd_vals_data1 = np.exp(kd_data1.score_samples(x))
plt.plot(x, kd_vals_data1, color='r', label='$N_a$', linewidth = 2)
plt.axvline(x=9.95,color='green',linestyle='dashed', linewidth = 2.0, label='$β_o$')
plt.axvline(x=1.9,color='black',linestyle='dashed', linewidth = 2.0, label='$β_b$')
plt.axvline(x=x_end,color='red',linestyle='dashed', linewidth = 2, label='$β_{95\%}$')
# Show the plots
plt.xlabel('Beta', fontsize=10)
plt.ylabel('Probability Density', fontsize=10)
plt.title('02 hours window', fontsize=12)
plt.xlim(-20, 20)
plt.ylim(0, 0.3)
plt.yticks([0, 0.1, 0.2, 0.3])
plt.legend(fontsize=12, loc='upper left', frameon=False)
plt.show()
gc.collect()
return kd_data1
def get_probability(start_value, end_value, eval_points, kd):
# Number of evaluation points
N = eval_points
step = (end_value - start_value) / (N - 1) # Step size
x = np.linspace(start_value, end_value, N)[:, np.newaxis] # Generate values in the range
kd_vals = np.exp(kd.score_samples(x)) # Get PDF values for each x
probability = np.sum(kd_vals * step) # Approximate the integral of the PDF
return probability.round(4)
data1 = np.array(data1).reshape(-1, 1)
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)
print('Beta-95%: {}\n'
.format(get_probability(start_value = -10,
end_value = 13,
eval_points = 1000,
kd = kd_data1)))

Reorient Histogram and Scatterplot with Trend Line

I have a dataset that looks similar to the one simulated in the code below. There are two sets of observations, one for those at X=0 and another for those at X>0.
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
X1 = np.random.normal(0, 1, 100)
X1 = X1 - np.min(X1)
Y1 = X1 + np.random.normal(0, 1, 100)
X0 = np.zeros(100)
Y0 = np.random.normal(0, 1.2, 100) + 2
X = np.concatenate((X1, X0))
Y = np.concatenate((Y1, Y0))
sns.distplot(Y0, color="orange")
plt.show()
sns.scatterplot(X, Y, hue = (X == 0), legend=False)
plt.show()
There are two plots: a histogram with KDE and a scatterplot.
I want to take the histogram with KDE, rotate it, and orient it appropriately with respect to the scatter plot. I would also like to add a trend line for each respective set of observations.
The ideal result would look something like this:
How do you do this in python, either using seaborn or matplotlib?
This can be done by combining plt.subplots with shared y-axis to keep the scale and sns plots. For trend line you need some additional computation, but you can use np for quick fitting. Here is an example how to achieve your goal, and here is jupyter notebook to play with.
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
# Prepare some data
np.random.seed(2020)
mean_Y1 = 0
std_Y1 = 1
size_Y1 = 100
X1 = np.random.normal(mean_Y1, std_Y1, size_Y1)
X1 = X1 - np.min(X1)
Y1 = X1 + np.random.normal(mean_Y1, std_Y1, size_Y1)
# this for computing trend line
Z = np.polyfit(X1, Y1, 1)
Y_ = np.poly1d(Z)(X1)
mean_Y0 = 2
std_Y0 = 1.2
size_Y0 = 100
X0 = np.zeros(100)
Y0 = np.random.normal(mean_Y0, std_Y0, size_Y0)
X = np.concatenate((X1, X0))
Y = np.concatenate((Y1, Y0))
# Now time for plotting
fig, axs = plt.subplots(1, 2,
sharey=True,
figsize=(10, 5),
gridspec_kw={'width_ratios': (1, 2)}
)
# control space between plots
fig.subplots_adjust(wspace=0.1)
# set the ticks for y-axis:
axs[0].yaxis.set_tick_params(left=False, labelleft=False, labelright=True)
# if you wish you can rotate xticks on the histogram with:
axs[0].xaxis.set_tick_params(rotation=90)
# plot histogram
dist = sns.distplot(Y0, color="orange", vertical=True, ax=axs[0])
# now we need to get the coordinate of the peak, we need this for mean line
line_data = dist.get_lines()[0].get_data()
max_Y0 = np.max(line_data[0])
# plotting the mean line
axs[0].plot([0, max_Y0], [mean_Y0, mean_Y0], '--', c='orange')
# inverting xaxis
axs[0].invert_xaxis()
# Plotting scatterpot
sns.scatterplot(X, Y, hue = (X == 0), legend=False, ax=axs[1])
# Plotting trend line
sns.lineplot(X1, Y_, ax=axs[1])
# Plotting mean again
axs[1].plot([0, max(X1)], [mean_Y0, mean_Y0], '--', c='orange')
plt.show()
Out:

Changing subplot size for displaying ellipse correctly

I'm trying to plot the confidence ellipse for uniformly distributed points. When plotting the ellipse and the scatter plot using Matplotlib, I find that a portion of the ellipse is clipped by the subplot. I tried implementing other solutions suggested here on SO, given here, here, here and here. but am not able to correct the displayed plot.
How do I change the size of this subplot in order to correctly and completely display the ellipse?
Code for generating ellipse: multidimensional confidence intervals
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
def eigsorted(cov):
vals, vecs = np.linalg.eigh(cov)
order = vals.argsort()[::-1]
return vals[order], vecs[:,order]
nstd = 2
fig = plt.figure(1, figsize=(12, 5))
#ax = plt.subplots(111)
ax = fig.add_subplot(111, aspect='equal')
test1 = np.random.uniform(40, 60, 1000)
test2 = np.random.uniform(100, 120, 1000)
cov = np.cov(test1, test2)
vals, vecs = eigsorted(cov)
theta = np.degrees(np.arctan2(*vecs[:,0][::-1]))
w, h = 2 * nstd * np.sqrt(vals)
ell = Ellipse(xy=(np.mean(test1), np.mean(test2)),
width=w, height=h,
angle=theta, color='black')
ell.set_facecolor('none')
ax.add_artist(ell)
plt.scatter(test1, test2)
plt.show()
As #ImportanceOfBeingEarnest rightly commented, a patch needed to be added for displaying the ellipse correctly. Updating the code below for future reference of others.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
def eigsorted(cov):
vals, vecs = np.linalg.eigh(cov)
order = vals.argsort()[::-1]
return vals[order], vecs[:,order]
nstd = 2
fig = plt.figure(1, figsize=(12, 5))
#ax = plt.subplots(111)
ax = fig.add_subplot(111, aspect='equal')
test1 = np.random.uniform(40, 60, 1000)
test2 = np.random.uniform(100, 120, 1000)
cov = np.cov(test1, test2)
vals, vecs = eigsorted(cov)
theta = np.degrees(np.arctan2(*vecs[:,0][::-1]))
w, h = 2 * nstd * np.sqrt(vals)
ell = Ellipse(xy=(np.mean(test1), np.mean(test2)),
width=w, height=h,
angle=theta, color='black')
ell.set_facecolor('none')
#Corrected code for displaying ellipse correctly
ax.add_patch(ell)
plt.scatter(test1, test2)
plt.show()

Categories

Resources