How can I get rid of the line in pylab histogram - python

I have this simplified code:
import numpy, pylab, random
# Draw 1000 samples from a Gaussian with mean 1 and standard deviation 0.2.
data = [ random.gauss(1, 0.2) for x in range(1000) ]
fig = pylab.figure()
# One weight of 1/len(data) per sample, so the bar heights sum to 1.
weights = numpy.ones_like(data)/float(len(data))
n, bins, patches =pylab.hist(data, bins=20, histtype='stepfilled',
weights=weights)
# Clip the x-axis to the outermost bin edges returned by hist().
pylab.xlim(min(bins), max(bins))
pylab.ylim(0, 1)
# With a single argument, plot() draws the bin-edge values as y against
# their index as x, which adds a line on top of the histogram.
p, = pylab.plot(bins)
pylab.savefig("test.png")
And the plot looks like this:
What is that green line? How can I remove it?

That's because `pylab.plot(bins)` plots the bin edges (which are x-axis values) as y-values against their index; that is the green line. You don't need the plot() call:
import numpy, pylab, random
# Histogram of 1000 Gaussian samples (mean 1, std 0.2), drawn without any
# additional plot() call.
samples = [random.gauss(1, 0.2) for _ in range(1000)]
fig = pylab.figure()

# Weight every sample by 1/N so the bar heights add up to 1.
sample_count = float(len(samples))
weights = numpy.ones_like(samples) / sample_count

n, bins, patches = pylab.hist(
    samples,
    bins=20,
    histtype='bar',
    weights=weights,
)

# Restrict the view to the span of the bin edges.
pylab.xlim(min(bins), max(bins))
pylab.ylim(0, 1)
pylab.show()

Related

Hist wrong binwidth with logarithmic x and y axis

I need to plot a histogram with both logarithmic y- and x-axes, but I'd also like the histogram's bins to be displayed with the same size.
How can I achieve this result with the following code (the x used is very long, so I have intentionally omitted it):
import math

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np

# NOTE: `x`, `x_label` and `y_label` are defined elsewhere (omitted from the
# question).
fig, ax1 = plt.subplots()
# log=True makes the count axis logarithmic. No `bins` argument is given, so
# the bins are equal-width in linear data space, not in log space.
hist, bins, _ = ax1.hist(x, log=True, color="red", rwidth=0.5)
plt.xscale("log")

np_x = np.array(x)
mean = np_x.mean()
print("np_x.mean() = " + str(mean))
# NOTE(review): the line is drawn at 1.1 * mean but labelled with the mean
# itself -- confirm that the offset is intended.
plt.axvline(mean * 1.1, color='lime', linestyle='dashed', linewidth=3,
            label='Mean: {:.2f}'.format(mean))

# Prepend a legend entry describing the binning to the existing handles.
handles, labels = ax1.get_legend_handles_labels()
binwidth = math.floor(bins[1] - bins[0])
mylabel = "Binwidth: {}".format(binwidth) + ", Bins: {}".format(len(hist))
red_patch = mpatches.Patch(color='red', label=mylabel)
handles = [red_patch] + handles
labels = [mylabel] + labels
ax1.legend(handles, labels)

plt.xlabel(x_label)
plt.ylabel(y_label)
plt.show()

Using drawstyle "steps-mid" together with x-log-scale causes step points to be non-centered

Matplotlib offers various options for the drawstyle. steps-mid does the following:
The steps variants connect the points with step-like lines, i.e. horizontal lines with vertical steps. [...]
'steps-mid': The step is halfway between the points.
This works fine when the x-scale is linear; however, when using a log-scale, it still seems to compute the step points by averaging in data-space rather than in log-space. This leads to data points not being centered between the steps.
import matplotlib.pyplot as plt
import numpy as np
# Ten x values evenly spaced in log10 between 1e0 and 1e10; y alternates
# 0, 1, 0, 1, ...
x = np.logspace(0, 10, num=10)
y = np.arange(x.size) % 2
fig, ax = plt.subplots()
ax.set_xscale('log')
# Step line with a square marker at every data point.
ax.plot(x, y, drawstyle='steps-mid', marker='s')
Is there a way to use step-like plotting together with x-log-scale such that the steps are centered between data points in log-space?
I don't know of a way other than building the steps correctly in log space yourself:
import matplotlib.pyplot as plt
import numpy as np
# Sample data: ten x values evenly spaced in log10 between 1e0 and 1e10,
# with y alternating 0, 1, 0, 1, ...
x = np.logspace(0, 10, num=10)
y = np.arange(x.size) % 2
def log_steps_mid(x, y, ax=None, **kwargs):
    """Plot ``y`` over ``x`` as a step line with steps centred in log10 space.

    The vertical steps are placed at the midpoint between neighbouring x
    values measured in log10 space (i.e. at their geometric mean), so every
    data point sits in the middle of its plateau on a log-scaled x-axis.

    Parameters
    ----------
    x : array-like
        Positive x coordinates, one per data point.
    y : array-like
        y values, same length as ``x``.
    ax : object with a ``plot`` method, optional
        Axes to draw on. Defaults to ``plt.gca()``.
    **kwargs
        Forwarded unchanged to ``ax.plot``.

    Returns
    -------
    Whatever ``ax.plot`` returns.
    """
    if ax is None:
        ax = plt.gca()

    x = np.asarray(x)
    y = np.asarray(y)
    if x.size == 0:
        # Nothing to step through; x[0] below would raise IndexError.
        return ax.plot([], [], **kwargs)

    x_log = np.log10(x)
    # Midpoints between consecutive points, computed in log space.
    x_log_mid = x_log[:-1] + np.diff(x_log) / 2
    x_mid = 10 ** x_log_mid

    # Each midpoint is visited twice (end of one plateau, start of the next);
    # the first and last data points close the outer plateaus.
    x_steps = np.hstack([x[0], np.repeat(x_mid, 2), x[-1]])
    # Each y value spans both ends of its plateau.
    y_steps = np.repeat(y, 2)
    return ax.plot(x_steps, y_steps, **kwargs)
fig, ax = plt.subplots()
ax.set_xscale('log')
# Markers only (empty line style) at the data points ...
ax.plot(x, y, ls='', marker='s', color='b')
# ... and the step line from log_steps_mid, in the same colour.
log_steps_mid(x, y, color='b')

How to add cross (X) on a heatmap cells like with R language?

I would like to add cross (X) on heatmap cells (depending on significance level, but the question is on adding the X).
Like in R-language (sig.level = XXX).
See the Python and R code used and the corresponding output images.
Thank you for your help.
# Draw the heatmap with the mask and correct aspect ratio
# `corr`, `mask` and `cmap` are defined elsewhere (not shown in this snippet).
# Colour scale is pinned to [-1, 1] and centred on 0; each cell is annotated
# with its value formatted to two decimals, in bold at size 15.
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, vmin=-1, vmax=1, square=True, linewidths=0.5, fmt=".2f",
cbar_kws={"shrink": .65, "orientation": "horizontal", "ticks":np.arange(-1, 1+1, 0.2)},
annot = True, annot_kws={"weight": 'bold', "size":15})
# R: correlation plot of the selected `wqw` columns, with the p-value matrix
# and significance level passed to corrplot.
# NOTE(review): the trailing comma after `col = ...` leaves an empty final
# argument -- confirm corrplot accepts it.
corrplot(cor(subset (wqw, select =
c(fixed.acidity:quality,ratio.sulfur.dioxide))),
# compute the p matrix
p.mat = cor.mtest(subset
(wqw, select = c(fixed.acidity:quality,ratio.sulfur.dioxide))),
# significance level 0.01
sig.level = 0.01,
# Method to display : color (could be circle, ...)
method = "color",
# color palette
col = colorRampPalette(c("#BB4444", "#EE9988",
"#FFFFFF", "#77AADD", "#4477AA"))(200),
)
```
The easy solution is to add a scatter plot with an X-shaped marker to cross out the unwanted cells.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
# Random 10x10 matrix whose upper triangle (diagonal included) is masked out.
data = np.random.rand(10, 10)
mask = np.zeros_like(data)
upper_triangle = np.triu_indices_from(mask)
mask[upper_triangle] = True
data_masked = np.ma.array(data, mask=mask)

fig, ax = plt.subplots()
im = ax.imshow(data_masked, origin="upper", cmap="YlGnBu")
fig.colorbar(im)

# Mark every unmasked cell whose value is below 0.4 with an X.
# Transposing before argwhere yields (column, row) pairs, i.e. the (x, y)
# order that scatter expects.
cross_x, cross_y = np.argwhere(data_masked.T < 0.4).T
ax.scatter(cross_x, cross_y, marker="x", color="black", s=100)
plt.show()
The drawback of this is that the markersize (s) is independent of the number of cells and needs to be adjusted for different figure sizes.
An alternative is hence to draw some lines (an X is just two crossed lines) at the respective positions. Here we create a function crossout(points, ax=None, scale=1, **kwargs), where scale is the fraction of each cell that the lines shall span.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
def crossout(points, ax=None, scale=1, **kwargs):
    """Draw an X (two crossed line segments) centred on each given point.

    Parameters
    ----------
    points : array-like
        ``(x, y)`` centres, shape ``(k, 2)``; a single ``(x, y)`` pair is
        accepted as well.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to ``plt.gca()``.
    scale : float, default 1
        Each stroke reaches ``scale / 2`` from the centre along both axes, so
        for unit-sized cells ``scale=1`` runs corner to corner.
    **kwargs
        Forwarded to ``LineCollection`` (e.g. ``color``, ``linewidths``).

    Returns
    -------
    LineCollection
        The collection that was added to ``ax``.
    """
    # Explicit None check rather than relying on the truthiness of an Axes.
    if ax is None:
        ax = plt.gca()

    # The two strokes of the X as offsets from the centre. The leading axis
    # of length 1 broadcasts over all points.
    stroke_a = np.array([[[1, 1], [-1, -1]]]) * scale / 2.
    stroke_b = np.array([[[-1, 1], [1, -1]]]) * scale / 2.

    # (k, 2) -> (k, 2, 1) -> (k, 1, 2): one centre per row, broadcastable
    # against the (1, 2, 2) stroke offsets.
    centres = np.atleast_3d(points).transpose(0, 2, 1)

    segments = np.concatenate((stroke_a + centres, stroke_b + centres), axis=0)
    collection = LineCollection(segments, **kwargs)
    ax.add_collection(collection)
    return collection
# Random 10x10 matrix whose upper triangle (diagonal included) is masked out.
data = np.random.rand(10, 10)
mask = np.zeros_like(data)
upper_triangle = np.triu_indices_from(mask)
mask[upper_triangle] = True
data_masked = np.ma.array(data, mask=mask)

fig, ax = plt.subplots()
im = ax.imshow(data_masked, origin="upper", cmap="YlGnBu")
fig.colorbar(im)

# (column, row) positions of the unmasked cells with a value below 0.4.
low_cells = np.argwhere(data_masked.T < 0.4)
crossout(low_cells, ax=ax, scale=0.8, color="black")
plt.show()
For scale=0.8 this looks like
Note that for a pcolormesh plot or a seaborn heatmap (which uses pcolormesh internally), one would need to add 0.5 to the point coordinates (not to the data values), because the cells are then centered on half-integers, i.e.
np.argwhere(data_masked.T < 0.4)+0.5

Align histogram bars - Python

I need to center the bars of a histogram.
# `array` (the input data) is not defined in this snippet.
x = array
y = [0,1,2,3,4,5,6,7,8,9,10]
# One bin per data value.
num_bins = len(array)
n, bins, patches = plt.hist(x, num_bins, facecolor='green', alpha=0.5)
barWidth=20
# NOTE(review): `x` is the data itself, not an Axes or pyplot object --
# presumably plt.bar(...) was intended; confirm.
x.bar(x, y, width=barWidth, align='center')
plt.show()
What I need, is that it looks like the one in this picture
I tried almost everything, but still can't go through.
Thank you all
For your task, I think it's better to calculate the histogram with NumPy and plot it with the bar function. Please refer to the following code and see how to use bin_edges.
import matplotlib.pyplot as plt
import numpy as np
num_samples = 100
num_bins = 10
lb, ub = 0, 10 # lower bound, upper bound

# create uniform samples in [lb, ub)
y = lb + np.random.random(num_samples) * (ub - lb)

# calculate histogram
hist, bin_edges = np.histogram(y, num_bins, range=(lb, ub))
width = (bin_edges[1] - bin_edges[0])

# plot histogram: each bar is centred on the LEFT edge of its bin, so the bar
# for bin [e, e + width) sits over the tick at e
plt.bar(bin_edges[:-1], hist, align='center',
        width=width, edgecolor='k', facecolor='green', alpha=0.5)
# put the ticks where the bars are centred, whatever lb, ub and num_bins are
plt.xticks(bin_edges[:-1])
# shift the view by half a bar so the first and last bars are fully visible
plt.xlim([lb-width/2, ub-width/2])
plt.show()

Histogram does not show up in f-distribution plot

I am trying to create F-distributed random numbers with given degrees of freedom d1 and d2, and to plot both a histogram of these random numbers and an idealised F-distribution curve. However, when I give small values to the degrees of freedom, the histogram does not show up. I am new to statistics and matplotlib, and I could not figure out how to deal with this problem.
This is my code:
def distF(request, distribution_id):
    """Render an F-distribution sample histogram plus its pdf as a PNG response.

    Looks up the ``Distribution`` with primary key ``distribution_id`` and
    reads the numerator degrees of freedom from ``var2``, the denominator
    degrees of freedom from ``var4`` and the sample size from ``var3``. It
    then draws a 50-bin density histogram of the random samples with the
    theoretical pdf as a dashed line.

    Parameters
    ----------
    request
        The incoming request (not used in the body).
    distribution_id
        Primary key passed to ``get_object_or_404``.

    Returns
    -------
    HttpResponse
        Response with content type ``image/png`` holding the rendered figure.
    """
    dist = get_object_or_404(Distribution, pk=distribution_id)
    dfd = dist.var4
    dfn = dist.var2
    x = np.random.f(dfn, dfd, size=dist.var3)
    num_bins = 50

    fig, ax = plt.subplots()
    try:
        print(x)
        # the histogram of the data; `density=` replaces the removed `normed=`
        ax.hist(x, num_bins, density=True)

        # Evaluate the pdf on (0, 5]; [1:] drops the grid point at exactly 0.
        y = np.linspace(0, 5, 1001)[1:]
        # Separate name so the model instance in `dist` is not rebound to a
        # frozen scipy distribution.
        f_dist = st.f(dfn, dfd, 0)
        #y = np.linspace(st.f.ppf(0.01, dfn, dfd), st.f.ppf(0.99, dfn, dfd), 100)
        ax.plot(y, f_dist.pdf(y), '--')

        ax.set_xlabel('Smarts')
        ax.set_ylabel('Probability density')
        ax.set_xlim([0, 4])
        ax.set_ylim([0, 3])
        fig.tight_layout()

        canvas = FigureCanvas(fig)
        response = HttpResponse(content_type='image/png')
        canvas.print_png(response)
    finally:
        # Always release the figure, even if plotting or rendering raised.
        plt.close(fig)
    return response
This is what the plots look like:
F-distribution plot with small df values
F-distribution plot with large df values
The problem is that the F distribution with a dfd of 1 spreads out hugely towards large numbers. So let's say you have values of 2000 or so in your array x, but only 50 bins between 0 and 2000. That makes each bin rather wide and hence rather low in height. If you want to limit your view to some low number anyway, it would be better to also limit the histogram to that number.
In the code below the limit would be 5 and the bin width is 0.2.
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
dfn = 10
dfd = 1
limit = 5
bin_width = 0.2
x = np.random.f(dfn, dfd, size=100)
# Bin edges 0, 0.2, ..., limit. linspace includes the end point, so the last
# bin reaches `limit` (np.arange(0, limit, 0.2) would stop one bin short).
bins = np.linspace(0, limit, int(round(limit / bin_width)) + 1)

fig, ax = plt.subplots()
# the histogram of the data; `density=` replaces the removed `normed=`
# NOTE: samples beyond `limit` fall outside the bins and are ignored, so the
# histogram is normalised over the samples inside [0, limit] only.
n, bins, patches = ax.hist(x, bins, density=True)

# Evaluate the pdf on (0, limit]; [1:] drops the grid point at exactly 0.
y = np.linspace(0, limit, 1001)[1:]
dist = st.f(dfn, dfd, 0)
ax.plot(y, dist.pdf(y), '--')

ax.set_xlabel('Smarts')
ax.set_ylabel('Probability density')
ax.set_xlim([0, limit])
fig.tight_layout()
plt.show()

Categories

Resources