I have a seaborn scatter plot (lmplot) with over 10K points. In order to perceive all the data, it works better when the plot size is larger (making the markers relatively small) and the alpha on the markers is low. However, this makes the markers on the legend difficult to distinguish. How does one set the marker size and marker alpha in Seaborn?
I see that g._legend has a markersize attribute, but directly setting it doesn't do anything.
Example
import numpy as np
import pandas as pd
import seaborn as sns
# Three Gaussian clusters of n_group points each, shifted to different centres.
n_group = 4000
pos = np.concatenate((np.random.randn(n_group, 2) + np.array([-1, -1]),
                      np.random.randn(n_group, 2) + np.array([0.2, 1.5]),
                      np.random.randn(n_group, 2) + np.array([0.6, -1.8])))
df = pd.DataFrame({"x": pos[:, 0], "y": pos[:, 1],
                   "label": np.repeat(range(3), n_group)})
# x/y/data are passed by keyword and the figure size via `height`: newer
# seaborn releases reject the positional form and the old `size` argument.
g = sns.lmplot(x="x", y="y", data=df, hue="label", fit_reg=False,
               height=8, scatter_kws={"alpha": 0.1})
g._legend.set_title("Clusters")
You can do this by setting the alpha values of the legend markers themselves. You can also use _sizes to set the marker sizes in the same for loop:
# Three Gaussian clusters of n_group points each, shifted to different centres.
n_group = 4000
pos = np.concatenate((np.random.randn(n_group, 2) + np.array([-1, -1]),
                      np.random.randn(n_group, 2) + np.array([0.2, 1.5]),
                      np.random.randn(n_group, 2) + np.array([0.6, -1.8])))
df = pd.DataFrame({"x": pos[:, 0], "y": pos[:, 1],
                   "label": np.repeat(range(3), n_group)})
# x/y/data are passed by keyword and the figure size via `height`: newer
# seaborn releases reject the positional form and the old `size` argument.
g = sns.lmplot(x="x", y="y", data=df, hue="label", fit_reg=False,
               height=8, scatter_kws={"alpha": 0.1})
g._legend.set_title("Clusters")
# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles;
# try the new name first and fall back to the old one on older releases.
legend_handles = getattr(g._legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles
for lh in legend_handles:
    # Make the legend markers fully opaque and larger than the plot markers.
    lh.set_alpha(1)
    lh._sizes = [50]
    # You can also use lh.set_sizes([50])
The above did not work for me with a seaborn lineplot. This did:
g = sns.lineplot(data=df, x='X', y='Y', hue='HUE', ci=False, style='STYLE',
                 markers=True, ms=16, dashes=False)
# Get the legend handles and labels from the Axes returned by lineplot,
# then set the marker size on every handle.
handles, labels = g.get_legend_handles_labels()
for h in handles:
    h.set_markersize(10)
# Replace the legend using the handles and labels from above.
lgnd = plt.legend(handles, labels, bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, title='TITLE')
Related
My question:
When plotting x and y values from a dataframe, the y values may be discrete numbers, such as an id_number or a category. A scatter plot then produces linearly spaced y-axis ticks, which can leave large vertical gaps between the plotted values, depending on how far apart the original values are.
What I need is to plot some category values (fixed discrete values) against time events (x-axis) in a scatter plot, but the values in the table are integers, not strings. As I don't know how to do this properly, the following is what I have achieved so far, using a modified copy of the original table with string values. Here is my test data (the original data is large):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtic
import matplotlib.category as mcat
# Reproducible test data: every point gets one of four discrete y values,
# stored once as integers (df) and once as strings (df_str).
np.random.seed(432987435)
nofpoints = 160
disc = [200, 240, 250, 290]
xval = np.arange(nofpoints)
yval = np.random.choice(disc, nofpoints)
yval_str = yval.astype(str)
yval, yval_str  # bare expression: no effect when run as a script
cval = np.random.random(nofpoints)
shared_columns = {'xval': xval, 'yval': yval, 'cval': cval}
df = pd.DataFrame(shared_columns)
# Same columns in the same order; only 'yval' is swapped for its string form.
df_str = pd.DataFrame({**shared_columns, 'yval': yval_str})
using usual plotting method
# Baseline plot: the original dataframe (df), with no string column, so the
# y axis is an ordinary numeric axis.
fig = plt.figure(figsize=(12, 6), dpi=128)
ax1 = fig.add_subplot(1, 1, 1)
#ax1.grid(True)
hollow_green = dict(marker='o', facecolor='None', edgecolor='g')
ax1.scatter('xval', 'yval', data=df, **hollow_green)
plt.show()
this is what we get
Note the large spacing between the values, and that the plotted points do not line up with the tick values. (I don't want to use a legend with a colormap to show the category, since colour is reserved for another purpose.)
with modified dataframe having string as yaxis value
# Same plot, but using the modified dataframe whose 'yval' column holds strings.
# As the output shows, the category order on the y axis is shuffled.
fig = plt.figure(figsize=(12, 6), dpi=128)
ax2 = fig.add_subplot(1, 1, 1)
hollow_black = dict(marker='o', facecolor='None', edgecolor='k')
ax2.scatter('xval', 'yval', data=df_str, **hollow_black)
plt.show()
to avoid shuffling
fig = plt.figure(figsize=(12, 6), dpi=128)
ax3 = fig.add_subplot(1, 1, 1)
# To keep the categories in a fixed order (no shuffling), register the sorted
# category strings as the axis units through matplotlib.category before plotting.
#ax3.grid(True)
disc_str = list(map(str, disc))
units = mcat.UnitData(sorted(disc_str))
y_axis = ax3.yaxis
y_axis.set_units(units)
# The locator and the formatter both read the same category -> position mapping.
category_mapping = units._mapping
y_axis.set_major_locator(mcat.StrCategoryLocator(category_mapping))
y_axis.set_major_formatter(mcat.StrCategoryFormatter(category_mapping))
hollow_yellow = dict(marker='o', facecolor='None', edgecolor='y')
ax3.scatter('xval', 'yval', data=df_str, **hollow_yellow)
plt.show()
Is there any way to achieve this without modifying the original table, i.e. to plot the integer category values as y-axis categories?
You can do it by replacing ax1.scatter with seaborn.stripplot:
# Draw the points as a strip plot on the existing Axes, one row per string category.
sns.stripplot(
    data=df, x='xval', y='yval_str', ax=ax1,
    marker='o', color='white', edgecolor='green', linewidth=1,
)
Before you do that, if you want y axis in a particular order, you should sort your df:
# Build the frame with both the integer and the string form of yval, then
# sort by the integer values in descending order.
unsorted_df = pd.DataFrame({
    'xval': xval,
    'yval': yval,
    'yval_str': yval_str,
    'cval': cval,
})
df = unsorted_df.sort_values(by='yval', ascending=False)
Complete Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Reproducible test data: 160 points, each with one of four discrete y values.
np.random.seed(432987435)
nofpoints = 160
disc = [200, 240, 250, 290]
xval = np.arange(nofpoints)
yval = np.random.choice(disc, nofpoints)
yval_str = yval.astype(str)
cval = np.random.random(nofpoints)
# Keep both forms of yval: integers to sort by, strings to plot as categories.
unsorted_df = pd.DataFrame({
    'xval': xval,
    'yval': yval,
    'yval_str': yval_str,
    'cval': cval,
})
df = unsorted_df.sort_values(by='yval', ascending=False)
fig = plt.figure(figsize=(12, 6), dpi=128)
ax1 = fig.add_subplot(1, 1, 1)
sns.stripplot(
    data=df, x='xval', y='yval_str', ax=ax1,
    marker='o', color='white', edgecolor='green', linewidth=1,
)
plt.show()
If you want perfectly horizontally aligned points, you have to pass jitter = False to sns.stripplot:
# Same strip plot, with jitter switched off.
sns.stripplot(
    data=df, x='xval', y='yval_str', ax=ax1,
    marker='o', color='white', edgecolor='green', linewidth=1,
    jitter=False,
)
I am trying to create a bar plot that looks like this:
x axis is the number of detectors hit in coincidence (i.e. multiplicity)
For each multiplicity I have several events. The y axis contains the average pulse height of each event. The colors should correspond to the number of hits that have the shown pulse heights and appeared in events with the respective multiplicity.
I have a dictionary that has multiplicities as keys and arrays of the average pulse heights as values:
# Multiplicity -> average pulse heights (the lists are elided placeholders).
averages = {
    2: [...],
    3: [...],
    4: [...],
    5: [...],
    6: [...],
}
# One bar call per multiplicity, passing that multiplicity's values as heights.
for multiplicity, pulse_heights in averages.items():
    plt.bar(multiplicity, pulse_heights, width=0.8)
I only know how to produce the simple version of a bar chart, which looks like this:
Can someone tell me how to make the bars "broken" to show all pulse heights, and how to add the color coding?
Not entirely clear but I think you want something like this
import seaborn as sns
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# Create some fake data that looks roughly like what you have:
# resample the tips data with replacement, weighted by a KDE of total_bill.
tips = sns.load_dataset("tips")
weights = stats.gaussian_kde(tips["total_bill"])(tips["total_bill"])
tips = tips.sample(frac=50, weights=weights, replace=True)
days = []
segments = []
counts = []
# For each day, collect the sorted unique bill values and, in the same
# sorted order, how often each value occurs.
for day, x in tips["total_bill"].groupby(tips["day"]):
    days.append(day)
    segments.append(np.sort(x.unique()))
    counts.append(x.value_counts().sort_index())
# Map from counts to colors
norm = mpl.colors.Normalize(0, np.concatenate(counts).max())
colors = [mpl.cm.viridis(norm(c)) for c in counts]
f, ax = plt.subplots()
# Draw each horizontal line
events = ax.eventplot(segments, colors=colors, orientation="vertical", zorder=.5)
# Give the first collection the shared norm so the colorbar built from it
# spans the full range of counts.
events[0].set_norm(norm)
f.colorbar(events[0])
# Add the mean/std for each x position
sns.pointplot(data=tips, x="day", y="total_bill", ci="sd", order=days, join=False, color=".1")
I took the question to need each horizontal line to represent each data value, but if you're satisfied with a histogram, this is two function calls in seaborn (>=0.11)
# Both layers plot total_bill against day from the same frame.
shared_vars = dict(data=tips, x="day", y="total_bill")
# Bivariate histogram: x is treated as discrete, y is binned.
sns.histplot(
    discrete=(True, False), binwidth=(1, .5),
    cmap="viridis", cbar=True, zorder=.5, alpha=.75,
    **shared_vars,
)
# Overlay a point with an "sd" error bar for each day, without joining lines.
sns.pointplot(
    ci="sd", order=days, join=False, color=".1",
    **shared_vars,
)
Here is a solution which uses imshow to produce the columnwise "color histograms":
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Create dummy data: one normally distributed sample per coincidence count.
coincidences = [2, 3, 4, 5, 6]
n_list = [10000, 8000, 6000, 4000, 2000]
mu_list = np.array([200, 300, 400, 500, 600])
scale = 100
averages = {c: np.random.normal(loc=mu_list[i], scale=scale, size=n_list[i])
            for i, c in enumerate(coincidences)}
# Calculate histogram for each column; after the transpose, each image
# column holds the bin counts of one coincidence value.
bins = np.linspace(0, 1000, 1000)
hist_img = np.array([np.histogram(averages[c], bins=bins)[0]
                     for c in coincidences]).T
# Create Normalized colormap
# norm = mpl.colors.Normalize()
norm = mpl.colors.LogNorm(vmin=1, vmax=hist_img.max())
sm = mpl.cm.ScalarMappable(cmap='viridis', norm=norm)
# Use colormap for img_hist and make zeros transparent (alpha channel = 0)
hist_img2 = sm.to_rgba(hist_img, bytes=True)
hist_img2[hist_img == 0, 3] = 0
# Plot
fig, ax = plt.subplots()
cc = ax.imshow(hist_img2, aspect='auto', interpolation='none', origin='lower',
               extent=[1.5, 6.5, 0, 1000])
# `sm` is a standalone ScalarMappable that is not attached to any Axes, so
# name the Axes to take space from explicitly; newer Matplotlib releases
# refuse to guess it.
fig.colorbar(sm, ax=ax)
# Overlay the mean and standard deviation of each column.
mean = [np.mean(averages[c]) for c in coincidences]
std = [np.std(averages[c]) for c in coincidences]
ax.errorbar(coincidences, mean, yerr=std, ls='', c='k', capsize=3, label='std')
ax.plot(coincidences, mean, ls='', marker='o', c='b', label='mean')
ax.legend()
I am trying to incorporate a gradient fill with multiple histograms using seaborn facet grid where the gradient is determined by the spread of values under each curve, not just by a sequence of row or col using hue. There are some links below that partly perform somewhat similar functions in python:
How to fill histogram with gradient color fills a diverging gradient but each histogram is independent of the others so comparison between histograms is somewhat void. Using the figure below each histogram should be relative to the others. Furthermore, it does not use the seaborn facet grid, which is the central question here.
How to generate series of histograms doesn't plot histograms. It just fills the area under a curve.
I've found a few images displaying what I'm hoping to execute but they all seem to be generated in R with nothing in python. My assumption is the functionality doesn't exist as yet using seaborn and I'll have to use R but I think this will be applicable for many users.
Using the code below, we can adjust the gradient by mapping hue to either row or col, but this doesn't take the area under the curve into account.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Create the data: 120 random values, tagged cyclically with A-D and X-Z.
rs = np.random.RandomState(1979)
x = rs.randn(120)
g = np.tile(list("ABCD"), 30)
h = np.tile(list("XYZ"), 40)
# Generate df
df = pd.DataFrame({'x': x, 'g': g, 'h': h})
# Initialize the FacetGrid object (note: this rebinds `g` from the label
# array to the grid, after the array has been copied into df).
pal = sns.cubehelix_palette(4, rot=-0.25, light=0.7)
g = sns.FacetGrid(df, col='h', hue='h', row='g', aspect=3, height=1, palette=pal)
# Draw the densities: a filled curve, a white outline, and a baseline at y = 0.
g = g.map(sns.kdeplot, 'x', shade=True, alpha=0.8, lw=1, bw=0.8)
g = g.map(sns.kdeplot, 'x', color='w', lw=1, bw=0.8)
g = g.map(plt.axhline, y=0, lw=1)
# Adjust title and axis labels directly
row_labels = ('L 1', 'L 2', 'L 3', 'L 4')
column_titles = ('Top 1', 'Top 2', 'Top 3')
for row, label in enumerate(row_labels):
    g.axes[row, 0].set_ylabel(label)
for row in range(len(row_labels)):
    for col, title in enumerate(column_titles):
        # Only the top row keeps a title; every other facet title is blanked.
        g.axes[row, col].set_title(title if row == 0 else '')
g.set_axis_labels(x_var='Total Amount')
g.set(yticks=[])
Out:
There is a gradient that can be adjusted for row or col but I'm hoping to pass this gradient to the area underneath each histogram curve. Similar to the figure above. So the area underneath each curve would be lighter when lower than zero and darker when higher than zero.
Even adjusting the area under the curve to the median value may suffice.
You can create an image gradient, and use the histogram itself as a clipping path for the image, so that the only visible part is the part under the curve.
As such, you can play around with any cmaps and normalization that are available when creating images.
Here is a quick example:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import PathPatch

# Create the data: 120 random values, tagged cyclically with A-D and X-Z.
rs = np.random.RandomState(1979)
x = rs.randn(120)
g = np.tile(list("ABCD"), 30)
h = np.tile(list("XYZ"), 40)
# Generate df
df = pd.DataFrame(dict(x=x, g=g, h=h))
# Initialize the FacetGrid object
pal = sns.cubehelix_palette(4, rot=-0.25, light=0.7)
g = sns.FacetGrid(df, col='h', hue='h', row='g', aspect=3, height=1, palette=pal)
# Draw the densities
g = g.map(sns.kdeplot, 'x', shade=True, alpha=0.8, lw=1, bw=0.8)
g = g.map(sns.kdeplot, 'x', color='w', lw=1, bw=0.8)
g = g.map(plt.axhline, y=0, lw=1)
# Clear every facet title first; the top row is re-titled below.
for ax in g.axes.flat:
    ax.set_title("")
# Adjust title and axis labels directly, numbering from 1
# ('L 1' ... 'L 4' for the rows, 'Top 1' ... 'Top 3' for the columns).
for i in range(4):
    g.axes[i, 0].set_ylabel('L {:d}'.format(i + 1))
for i in range(3):
    g.axes[0, i].set_title('Top {:d}'.format(i + 1))
# generate a gradient
cmap = 'coolwarm'
gradient = np.linspace(0, 1, 100)
for ax in g.axes.flat:
    # Stretch a two-row gradient image over the current axis limits ...
    im = ax.imshow(np.vstack([gradient, gradient]), aspect='auto', extent=[*ax.get_xlim(), *ax.get_ylim()], cmap=cmap, zorder=10)
    # ... and clip it to the first path of the first collection on the Axes,
    # so only the part of the image inside that outline remains visible.
    path = ax.collections[0].get_paths()[0]
    patch = PathPatch(path, transform=ax.transData)
    im.set_clip_path(patch)
g.set_axis_labels(x_var='Total Amount')
g.set(yticks=[])
I have managed to manipulate my plotting data to render the corresponding time series plot. But I am not quite satisfied with the current output because it is not easy to understand the newly generated plot.
my current data and my output:
here is my data looks like:
update
here is my sketch code that shaped above plot data:
# Total qty1 per date, then derive year and month columns from the dates.
df = df.groupby(['date'])['qty1'].sum().reset_index()
date_index = pd.DatetimeIndex(df['date'])
df['year'] = date_index.year
df['month'] = date_index.month
# Pivot to one row per year and one column per month; missing months become 0.
monthly_totals = df.groupby(['year', 'month'])['qty1'].sum()
plot_data = monthly_totals.unstack().fillna(0)
plot_data.plot(kind='line')
and based on this data, I am getting this plot:
But this is not what I expected.
desired plot:
Here is the plot that I actually want:
I couldn't produce this plot. How can I get it? Any ideas?
Is this what you are looking for ?
import pandas as pd
import matplotlib.pyplot as plt
import calendar
%matplotlib inline
df = pd.DataFrame(dic) #dic is the dictionary you provided in the github link
# Label the columns '1'..'12', then transpose so those labels become the
# index and each column holds one year.
df.columns = list(map(str, range(1, 13)))
df = df.T
df.columns = ['2014', '2015', '2016', '2017', '2018']
df['Avg'] = df.mean(axis=1)
fig, ax = plt.subplots(figsize=(15, 7))
months = df.index
# Three individual years plus the row-wise average as a thick dashed grey line.
ax.plot(months, df['2016'], marker='s', color='green', linewidth=1, label="2016")
ax.plot(months, df['2017'], "bo-", linewidth=1.5, label="2017")
ax.plot(months, df['2018'], marker='s', ms=10, color='red', linewidth=3, label="2018")
ax.plot(months, df['Avg'], "--", color='grey', linewidth=8, label="5-Yr-Avg")
ax.set_xlabel('\n Months\n', fontsize=25, color='lightslategrey')
ax.legend(frameon=False, loc="lower center", ncol=len(df.columns), fontsize=14)
ax.grid(axis='y')
# Replace the '1'..'12' tick labels with abbreviated month names.
ax.set_xticklabels([calendar.month_abbr[i] for i in range(1, 13)])
ax.tick_params(left=False, labelsize=13)
# Hide every spine except the bottom one.
for side in ('left', 'top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()
I would like to add cross (X) on heatmap cells (depending on significance level, but the question is on adding the X).
Like in R-language (sig.level = XXX).
See the Python and R code used and the corresponding output images.
Thank you for your help.
# Draw the heatmap with the mask and correct aspect ratio
# Horizontal, shrunken colorbar with ticks every 0.2 starting at -1.
colorbar_options = {"shrink": .65, "orientation": "horizontal", "ticks": np.arange(-1, 1+1, 0.2)}
# Bold, size-15 annotations, formatted with two decimals via fmt.
annotation_style = {"weight": 'bold', "size": 15}
sns.heatmap(
    corr, mask=mask, cmap=cmap, center=0, vmin=-1, vmax=1, square=True,
    linewidths=0.5, annot=True, fmt=".2f",
    annot_kws=annotation_style, cbar_kws=colorbar_options,
)
# Correlation plot of the selected wqw columns, with a p-value matrix and a
# significance level passed to corrplot.
corrplot(cor(subset (wqw, select =
c(fixed.acidity:quality,ratio.sulfur.dioxide))),
# compute the p matrix from the same column subset
p.mat = cor.mtest(subset
(wqw, select = c(fixed.acidity:quality,ratio.sulfur.dioxide))),
# significance level 0.01
sig.level = 0.01,
# Method to display: color (could be circle, ...)
method = "color",
# color palette: 200 colors interpolated between the five anchor colors
col = colorRampPalette(c("#BB4444", "#EE9988",
"#FFFFFF", "#77AADD", "#4477AA"))(200),
# NOTE(review): the trailing comma above passes an empty final argument - confirm corrplot accepts it
)
```
The easy solution is to add a scatter plot with an X-shaped marker to cross out the unwanted cells.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
data = np.random.rand(10, 10)
# Mask the upper triangle, so only the cells below the diagonal are drawn.
mask = np.zeros_like(data)
mask[np.triu_indices_from(mask)] = True
data_masked = np.ma.array(data, mask=mask)
fig, ax = plt.subplots()
im = ax.imshow(data_masked, cmap="YlGnBu", origin="upper")
fig.colorbar(im)
# argwhere on the transposed array gives (column, row) index pairs, which are
# the (x, y) positions of the cells to cross out.
cross_x, cross_y = np.argwhere(data_masked.T < 0.4).T
ax.scatter(cross_x, cross_y, marker="x", color="black", s=100)
plt.show()
The drawback of this is that the markersize (s) is independent of the number of cells and needs to be adjusted for different figure sizes.
An alternative is hence to draw lines (an X is two crossed lines) at the respective positions. Here we create a function crossout(points, ax=None, scale=1, **kwargs), where scale is the fraction of each cell that the lines should span.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
def crossout(points, ax=None, scale=1, **kwargs):
    """Draw an X made of two crossed line segments at each point.

    Parameters
    ----------
    points : array-like
        Positions to cross out. In the usage shown this is the (N, 2)
        result of ``np.argwhere``; other shapes are assumed to follow the
        same (x, y) layout -- TODO confirm.
    ax : Axes, optional
        Axes to draw on; ``plt.gca()`` is used when no Axes is passed.
    scale : number, optional
        Extent of each X along both axes, in data units; the segments
        reach ``scale / 2`` to either side of the point.
    **kwargs
        Passed through to ``LineCollection``.

    Returns
    -------
    The ``LineCollection`` that was added to the Axes.
    """
    if not ax:
        ax = plt.gca()
    # End-point offsets of the two diagonals, each of shape (1, 2, 2).
    rising = np.array([[[1, 1], [-1, -1]]]) * scale / 2.
    falling = np.array([[[-1, 1], [1, -1]]]) * scale / 2.
    # Give the points an extra axis so they broadcast against the offsets.
    centres = np.atleast_3d(points).transpose(0, 2, 1)
    segments = np.concatenate((rising + centres, falling + centres), axis=0)
    crosses = LineCollection(segments, **kwargs)
    ax.add_collection(crosses)
    return crosses
data = np.random.rand(10, 10)
# Mask the upper triangle, so only the cells below the diagonal are drawn.
mask = np.zeros_like(data)
mask[np.triu_indices_from(mask)] = True
data_masked = np.ma.array(data, mask=mask)
fig, ax = plt.subplots()
im = ax.imshow(data_masked, cmap="YlGnBu", origin="upper")
fig.colorbar(im)
# (column, row) index pairs of the cells below the threshold, i.e. the
# (x, y) positions to cross out.
low_cells = np.argwhere(data_masked.T < 0.4)
crossout(low_cells, ax=ax, scale=0.8, color="black")
plt.show()
For scale=0.8 this looks like
Note that for a pcolormesh plot or a seaborn heatmap (which uses pcolormesh internally), one would need to add 0.5 to the point coordinates, i.e.
np.argwhere(data_masked.T < 0.4)+0.5