Related
I started learning how to plot data in Python and I need help achieving the following:
I have the following example df6:
# Example frame with two numeric columns of equal length.
df6 = pd.DataFrame(
    {
        'emails': [50, 60, 30, 40, 90, 10, 0, 85],
        'delivered': [20, 16, 6, 15, 66, 6, 0, 55],
    }
)
df6
Looks like:
emails delivered
0 50 20
1 60 16
2 30 6
3 40 15
4 90 66
5 10 6
6 0 0
7 85 55
I need to plot emails vs. delivered in a four-quadrant chart. The X and Y ranges should extend slightly beyond the maximum values, and the quadrant lines should cross at the means of both columns.
What I did so far, used describe() to get the values of the df6 then:
# Scatter of emails (y) against delivered (x) with two dashed reference lines.
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
# The reference lines use hard-coded values read off df6.describe().
ax.axhline(y=45.6, color="black", linestyle="--")
ax.axvline(x=23, color="black", linestyle="--")
ax.plot(df6['delivered'], df6['emails'], "o")
# Leave 20 units of headroom beyond the largest value on each axis.
ax.set_xlim([0, df6['delivered'].max() + 20])
ax.set_ylim([0, df6['emails'].max() + 20])
plt.show()
I got the following output so far:
What I am looking for is a chart with the points scattered across just 4 groups (quadrants), each labelled with the total count of points in that quadrant:
I found it easier to normalize the data before plotting... UPDATE: Messed something up with counts, but the code is here to analyze my mistake.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scale = scaler.fit(df6)
# Standardize df6 column-wise so that each column mean maps to 0; the
# quadrant boundaries then become the lines x = 0 and y = 0.
norm_df = pd.DataFrame(scale.transform(df6), columns=df6.columns)

# The plot below uses x = delivered and y = emails. Points lying exactly on
# a mean (standardized value 0) are not counted in any quadrant.
emails_low = norm_df['emails'] < 0
emails_high = norm_df['emails'] > 0
delivered_low = norm_df['delivered'] < 0
delivered_high = norm_df['delivered'] > 0

quadrant_1 = int((emails_low & delivered_low).sum())    # bottom-left
display(quadrant_1)
quadrant_2 = int((emails_high & delivered_low).sum())   # top-left
display(quadrant_2)
quadrant_3 = int((emails_low & delivered_high).sum())   # bottom-right
display(quadrant_3)
quadrant_4 = int((emails_high & delivered_high).sum())  # top-right
display(quadrant_4)

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)
# Hide the frame and the tick axes; only the dashed cross remains.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
ax.axhline(y=0, color="black", linestyle="--")
ax.axvline(x=0, color="black", linestyle="--")
ax.plot(norm_df['delivered'], norm_df['emails'], "o")

# Hand-placed axis captions, since the real axes are hidden.
ax.text(0, -2.1, 'Delivered', horizontalalignment='center', verticalalignment='center')
ax.text(-2.1, 0, 'Emails', horizontalalignment='center', verticalalignment='center', rotation=90)

# Each label is drawn inside the quadrant whose points it counts
# (x = delivered, y = emails).
quadrant_labels = (
    (1, 1, quadrant_4),    # delivered > 0, emails > 0
    (-1, 1, quadrant_2),   # delivered < 0, emails > 0
    (-1, -1, quadrant_1),  # delivered < 0, emails < 0
    (1, -1, quadrant_3),   # delivered > 0, emails < 0
)
for x_pos, y_pos, n_points in quadrant_labels:
    ax.text(x_pos, y_pos, 'Count: ' + str(n_points),
            horizontalalignment='center', verticalalignment='center')

ax.set_xlim([-2, 2])
ax.set_ylim([-2, 2])
plt.show()
So to use the means in your plots you can start by simply modifying these 2 lines:
# Draw the dashed reference lines at the column means: the horizontal line at
# the mean of 'emails' (y axis), the vertical one at the mean of 'delivered'.
plt.axhline(y=df6['emails'].mean(), color="black", linestyle="--")
plt.axvline(x=df6['delivered'].mean(), color="black", linestyle="--")
We can then use DataFrame.value_counts to compute the counts:
# Flag every value as at-or-above its own column mean, then count the rows
# for each (emails, delivered) True/False combination.
counts = df6.transform(lambda s: s >= s.mean()).value_counts()
# Per-column minimum and maximum, used as anchor coordinates for the labels.
pos = df6.agg(['min', 'max'])
Here counts contains the values of each pair of above/below means:
emails delivered
False False 4
True False 2
True 2
and pos contains the x/y (or email/delivered) coordinates at which the boxes are placed:
emails delivered
min 0 0
max 90 66
So you can adjust pos to change the annotation placement.
Finally you want to do the annotation on the figure:
# Write each count into the corner of the quadrant it belongs to.
# `counts` is keyed by (emails >= mean, delivered >= mean); `pos` supplies the
# min/max coordinates, and the alignment keeps the text inside the data range.
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
for (eml, dlv), num in counts.items():
    ax.text(
        s=f'count: {num}',
        x=pos.loc['max' if dlv else 'min', 'delivered'],
        y=pos.loc['max' if eml else 'min', 'emails'],
        ha='right' if dlv else 'left',
        va='top' if eml else 'bottom',
    )
You are just missing the code for setting the position of the left/bottom spines:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df6 = pd.DataFrame({'emails': [50, 60, 30, 40, 90, 10, 0, 85],
                    'delivered': [20, 16, 6, 15, 66, 6, 0, 55]})
plt.plot(df6['delivered'], df6['emails'], "o")

# Count the points of the upper-left quadrant (below-average delivered,
# above-average emails), which is where the label is drawn. Each column is
# compared with its OWN mean.
count = np.count_nonzero(
    (df6['emails'] > df6['emails'].mean())
    & (df6['delivered'] < df6['delivered'].mean())
)
plt.annotate('count: %s' % count, (5, 60))

axes = plt.gca()
axes.spines['top'].set_visible(False)
axes.spines['right'].set_visible(False)
# Move the remaining spines so that they cross at the column means
# (x = delivered, y = emails).
axes.spines['left'].set_position(('data', df6['delivered'].mean()))
axes.spines['bottom'].set_position(('data', df6['emails'].mean()))
plt.show()
Here's another solution, with a more symmetric looking plot:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(
    {
        "emails": [50, 60, 30, 40, 90, 10, 0, 85],
        "delivered": [20, 16, 6, 15, 66, 6, 0, 55],
    }
)
plt.plot(df["delivered"], df["emails"], "o")

current_axes = plt.gca()
for hidden_side in ("top", "right"):
    current_axes.spines[hidden_side].set_visible(False)
# Make the two remaining spines cross at the column means.
current_axes.spines["left"].set_position(("data", df["delivered"].mean()))
current_axes.spines["bottom"].set_position(("data", df["emails"].mean()))
def get_lims(df, column, w=0.1):
    """Return axis limits for *column* that are symmetric around its mean.

    The half-width is the largest absolute distance between the mean and the
    column's minimum or maximum, widened by the fraction *w* as padding.

    Args:
        df: Frame holding the data.
        column: Name of the column to compute limits for.
        w: Extra margin, as a fraction of the half-width, added on each side.

    Returns:
        A two-element list ``[lower, upper]``.
    """
    mean = df[column].mean()
    max_diff = max(
        abs(df[column].max() - mean),
        abs(df[column].min() - mean),
    )
    return [mean - max_diff - max_diff * w, mean + max_diff + max_diff * w]
# Use limits centred on each column mean (x = delivered, y = emails).
plt.xlim(get_lims(df, "delivered"))
plt.ylim(get_lims(df, "emails"))
plt.show()
I have a dataframe where each column contains values considered "normal" if they fall within an interval, which is different for every column:
# The main df
# Each column has its own interval of "normal" values.
df = pd.DataFrame(
    {
        "A": [20, 10, 7, 39],
        "B": [1, 8, 12, 9],
        "C": [780, 800, 1200, 250],
    }
)
The df_info represents the intervals for each column of df.
So for example df_info["A"][0] is the min for the column df["A"] and df_info["A"][1] represents the max for the column df["A"] and so on.
# Row 0 holds the lower bound and row 1 the upper bound of each column of df.
df_info = pd.DataFrame(
    {
        "A": [22, 35],
        "B": [5, 10],
        "C": [850, 900],
    }
)
Thanks to this SO answer I was able to create a custom heatmap that prints values below the range in blue, values above the range in red and values within the range in white. Just remember that each column has a different range, so I normalized according to this:
# Rescale every column so that its allowed interval maps onto [0, 1]:
# values below the interval become negative, values above it exceed 1.
df_norm = pd.DataFrame()
for col in df:
    # Positional access: row 0 is the lower bound, row 1 the upper bound.
    col_min = df_info[col].iloc[0]
    col_max = df_info[col].iloc[1]
    df_norm[col] = (df[col] - col_min) / (col_max - col_min)
And finally printed my heatmap
# Overall range of the normalized data across all columns.
vmin = df_norm.min().min()
vmax = df_norm.max().max()
# Where the normalized values 0 and 1 fall on the colormap's own 0..1 scale.
# NOTE(review): assumes vmin < 0 and vmax > 1 so the anchor positions are
# increasing and stay inside [0, 1] — confirm for other data.
norm_zero = (0 - vmin) / (vmax - vmin)
norm_one = (1 - vmin) / (vmax - vmin)
colors = [[0, 'darkblue'],
          [norm_zero, 'white'],
          [norm_one, 'white'],
          [1, 'darkred']]
cmap = LinearSegmentedColormap.from_list('', colors)

fig, ax = plt.subplots()
# Plot the normalized frame on the axes created above and keep `ax` bound to
# those axes (chaining .set_facecolor() onto the call would rebind it to None).
sns.heatmap(data=df_norm,
            annot=True,
            annot_kws={'size': 'large'},
            mask=None,
            cmap=cmap,
            vmin=vmin,
            vmax=vmax,
            ax=ax)
ax.set_facecolor('white')
In the example you can see that the third column has values much higher/lower than the 0-1 interval (and than the first column), so they "absorb" all the shades of red and blue.
QUESTION:
What I want to obtain is use the entire shades of red/blue for each column or at least to reduce the perceptual difference between (for example) the first and third column.
I had thought of:
create a custom colormap where each colormap normalization is performed by column
use multiple colormaps, each one applied to a different column
applying a colormap mpl.colors.LogNorm but I'm not sure how to use it with my custom LinearSegmentedColormap
Using a mask per column, you could draw the heatmap column per column, each with its own colormap:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.cm import ScalarMappable
df = pd.DataFrame({"A": [20, 10, 7, 39],
                   "B": [1, 8, 12, 9],
                   "C": [780, 800, 1200, 250]})
# Row 0 is the lower bound and row 1 the upper bound for each column of df.
df_info = pd.DataFrame({"A": [22, 35],
                        "B": [5, 10],
                        "C": [850, 900]})

# Rescale every column so that its allowed interval maps onto [0, 1].
df_norm = pd.DataFrame()
for col in df:
    col_min = df_info[col].iloc[0]
    col_max = df_info[col].iloc[1]
    df_norm[col] = (df[col] - col_min) / (col_max - col_min)
fig, ax = plt.subplots()

# Draw the heatmap one column at a time so that every column gets a colormap
# stretched over its own value range.
for col in df:
    vmin = df_norm[col].min()
    vmax = df_norm[col].max()
    # Positions of the normalized values 0 and 1 inside this column's range.
    # NOTE(review): assumes the column has values both below 0 and above 1
    # after normalization, so the anchors are increasing within [0, 1].
    norm_zero = (0 - vmin) / (vmax - vmin)
    norm_one = (1 - vmin) / (vmax - vmin)
    colors = [[0, 'darkblue'],
              [norm_zero, 'white'],
              [norm_one, 'white'],
              [1, 'darkred']]
    cmap = LinearSegmentedColormap.from_list('', colors)
    # Mask every column except the one being drawn in this pass.
    mask = pd.DataFrame({other: other != col for other in df}, index=df.index)
    sns.heatmap(data=df_norm,
                annot=df.to_numpy(), annot_kws={'size': 'large'}, fmt="g",
                mask=mask,
                cmap=cmap, vmin=vmin, vmax=vmax, cbar=False, ax=ax)
ax.set_facecolor('white')

# A single schematic colorbar stands in for all columns, with the white band
# drawn between one third and two thirds of the bar.
colors = [[0, 'darkblue'],
          [1 / 3, 'white'],
          [2 / 3, 'white'],
          [1, 'darkred']]
cmap = LinearSegmentedColormap.from_list('', colors)
cbar = plt.colorbar(ScalarMappable(cmap=cmap), ax=ax, ticks=[0, 1 / 3, 2 / 3, 1])
cbar.ax.yaxis.set_ticklabels(['min\nlimit', 'min', 'max', 'max\nlimit'])
plt.tight_layout()
plt.show()
You can re-scale your df_norm before plotting:
# alternative method to scale: subtract the lower bound (row 0 of df_info)
# and divide by the interval width, for all columns at once
df_norm = (df - df_info.iloc[0]) / (df_info.iloc[1] - df_info.iloc[0])
# scale the norm: min-max rescale each column onto [0, 1]
df_plot = (df_norm - df_norm.min()) / (df_norm.max() - df_norm.min())
# heat map on the normalized `df_plot`
# use values in `df_norm` to annotate
# color bar doesn't make sense so we remove it
sns.heatmap(df_plot, annot=df_norm, cmap='RdBu_r', cbar=False)
Output:
I am using following code to make 5 bars on 3 different data sets a, b and c. How can I show all colors in each bar. I don't want their value to add up. For example, in first bar if the value of Green is 1, Yellow is 3 and Red is 6 I don't want the final value to be 10 rather it should be 6 but all colors should appear till their final value. I don't want to use transparent colors or only bar outlines.
import matplotlib.pyplot as plt
import numpy as np
a = [1, 2, 3, 4, 5]
b = [3, 4, 1, 10, 9]
c = [6, 7, 2, 4, 6]
ind = np.arange(len(a))

fig = plt.figure()
ax = fig.add_subplot(111)
# All three series are drawn at the same x positions, in this order.
for heights, series_label, face in (
    (a, 'Green', 'g'),
    (b, 'Yellow', 'y'),
    (c, 'Red', 'r'),
):
    ax.bar(x=ind, height=heights, width=0.35, align='center',
           label=series_label, facecolor=face)

plt.xticks(ind, a)
plt.xlabel('Coordination Number')
plt.ylabel('Frequency')
plt.legend()
plt.show()
The reference value for the 'a' column is 6, but it was unclear if it is the maximum value. I understood it to be the maximum value and calculated the composition ratio.
I created a stacked graph based on the results.
import numpy as np
import pandas as pd
a = [1, 2, 3, 4, 5]
b = [3, 4, 1, 10, 9]
c = [6, 7, 2, 4, 6]
ind = np.arange(len(a))
df = pd.DataFrame({'a': a, 'b': b, 'c': c}, index=ind)
# Row-wise sum and row-wise maximum of the three series.
df['total'] = df.sum(axis=1)
df['max'] = df[['a', 'b', 'c']].max(axis=1)
# Split each row's maximum among a/b/c in proportion to their share of the
# row total, so the three parts add up to that maximum.
for source_col, share_col in (('a', 'aa'), ('b', 'bb'), ('c', 'cc')):
    df[share_col] = df['max'] * (df[source_col] / df['total'])
df
a b c total max aa bb cc
0 1 3 6 10 6 0.600000 1.800000 3.600000
1 2 4 7 13 7 1.076923 2.153846 3.769231
2 3 1 2 6 3 1.500000 0.500000 1.000000
3 4 10 4 18 10 2.222222 5.555556 2.222222
4 5 9 6 20 9 2.250000 4.050000 2.700000
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
# Stack the three shares on top of each other: every series starts where the
# previous ones ended.
stack_base = 0
for share_col, series_label, face in (
    ('aa', 'Green', 'g'),
    ('bb', 'Yellow', 'y'),
    ('cc', 'Red', 'r'),
):
    ax.bar(x=ind, height=df.loc[:, share_col], bottom=stack_base, width=0.35,
           align='center', label=series_label, facecolor=face)
    stack_base = stack_base + df.loc[:, share_col]

plt.xticks(ind, a)
plt.xlabel('Coordination Number')
plt.ylabel('Frequency')
plt.legend()
plt.show()
If I understand your question correctly, you want to show all colour bars starting from the same zero baseline and grouped together under their corresponding Number?
I'll use bokeh for plotting, since it provides an easy way to "offset" each bar in the group. To vary the amount of visual offset for each bar, change the second parameter of the dodge function. For this combination of widths, 0.05 seemed like a nice value.
from bokeh.io import output_notebook, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
output_notebook()  # or output_file("chart.html") if not using Jupyter

x_axis_values = [str(x) for x in range(1, 6)]
data = {
    "Coordination Number": x_axis_values,
    "Green": [1, 2, 3, 4, 5],
    "Yellow": [3, 4, 1, 10, 9],
    "Red": [6, 7, 2, 4, 6],
}
src = ColumnDataSource(data=data)

# `plot_height` was removed in Bokeh 3.0; `height` is the supported name.
p = figure(
    x_range=x_axis_values, y_range=(0, 10), height=275,
    title="Offset Group Bar Chart", toolbar_location=None, tools="")

# One bar per colour at every x position, each shifted sideways by its dodge
# offset so all three stay visible from the common zero baseline.
for column, offset, fill in (
    ("Green", -0.05, "#8DD3C7"),
    ("Yellow", 0.0, "#FFD92F"),
    ("Red", 0.05, "#E15759"),
):
    p.vbar(
        x=dodge('Coordination Number', offset, range=p.x_range),
        top=column, width=0.2, source=src, color=fill, legend_label=column)

p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.axis_label = "Coordination Number"
p.yaxis.axis_label = "Frequency"
show(p)
I have the data in the following format:
X,Y,Z,Category
I used plotly to generate a scatter plot and then fit a curve through the scatter points using the following code.
from scipy.interpolate import griddata
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
# Scattered sample points.
x = np.asarray([3, 5, 9, 3, 3, 7, 6, 9, 1, 9])
y = np.asarray([4, 3, 3, 10, 8, 2, 4, 10, 9, 3])
z = np.asarray([1, 2, 4, 10, 1, 7, 10, 3, 1, 7])

# Regular 50 x 50 grid spanning the x and y extents of the samples.
xi = np.linspace(min(x), max(x), 50)
yi = np.linspace(min(y), max(y), 50)
X, Y = np.meshgrid(xi, yi)

# Cubic interpolation of z onto the grid; any NaN in the result becomes 0.
interpolated = griddata((x, y), z, (X, Y), method='cubic')
Z = np.nan_to_num(interpolated)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z)
ax.plot_surface(
    X, Y, Z,
    rstride=1, cstride=1, cmap=cm.coolwarm,
    linewidth=0, antialiased=False, alpha=0.4,
)
plt.show()
What i am looking to do is to color the plot according to categories something like this :
Where red represents the category 1 and Blue represents category 2.
So in order to get something like this, I need to generate a 2D array and then use a colormap/colorscale to color the categories accordingly.
The above output have been created using XLSTAT where it took category as the 4th col as the category.
Can someone explain to me how to generate the Z data that will let me color the categories differently?
I have tried to something like dividing the 2D matrix into halves 0's and half 1's and got output something like this.
Considering the following sample data :
x y z Category
3 4 1 Cat 1
5 3 2 cat2
9 3 4 cat2
3 10 10 cat3
3 8 1 cat3
7 2 7 cat2
6 4 10 Cat 1
9 10 3 Cat 4
1 9 1 Cat 1
9 3 7 cat2
I need to generate 2D Data that will represent the surface color and color the different categories with custom color
Just as griddata can be used to interpolate the 1D z array to a 2D grid, you can use griddata to interpolate a 1D color array to the same 2D grid:
# Translate every category label into its number from `colormap`.
color = [colormap[cat] for cat in category]
# Interpolate those numbers onto the same (X, Y) grid as z; nan_to_num
# replaces any NaN the interpolation produces with 0.
C = np.nan_to_num(griddata((x, y), color, (X, Y), method='cubic'))
Then you can use the colormap cm.coolwarm to map values in C to RGBA facecolors:
# Colour each face with the RGBA value that cmap assigns to the interpolated
# category number in C.
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cmap,
linewidth=0, antialiased=False, alpha=0.4, facecolors=cmap(C))
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
import scipy.interpolate as interpolate
# Scattered sample points and the category label of each one.
x = np.asarray([3, 5, 9, 3, 3, 7, 6, 9, 1, 9])
y = np.asarray([4, 3, 3, 10, 8, 2, 4, 10, 9, 3])
z = np.asarray([1, 2, 4, 10, 1, 7, 10, 3, 1, 7])
category = np.array([
    'Cat 1', 'cat2', 'cat2', 'cat3', 'cat3',
    'cat2', 'Cat 1', 'Cat 4', 'Cat 1', 'cat2',
])

# coolwarm: 0 --> blue, 1 --> red
# want: 'Cat 1' --> blue, 'cat2' --> red, 'cat3' --> ?, 'Cat 4' --> ?
colormap = {'Cat 1': 0, 'cat2': 1, 'cat3': 0.333, 'Cat 4': 0.666}
color = np.array([colormap[cat] for cat in category])

# Regular 50 x 50 grid spanning the x and y extents of the samples.
xi = np.linspace(min(x), max(x), 50)
yi = np.linspace(min(y), max(y), 50)
X, Y = np.meshgrid(xi, yi)

# Interpolate both the heights and the category numbers onto the grid.
points = (x, y)
Z = np.nan_to_num(interpolate.griddata(points, z, (X, Y), method='cubic'))
# NOTE(review): unlike Z, C is not passed through np.nan_to_num here —
# confirm whether NaN cells are intended.
C = interpolate.griddata(points, color, (X, Y), method='cubic')

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
cmap = cm.coolwarm
ax.scatter(x, y, z, c=color, cmap=cmap)
ax.plot_surface(
    X, Y, Z,
    rstride=1, cstride=1, cmap=cmap,
    linewidth=0, antialiased=False, alpha=0.4, facecolors=cmap(C),
)
plt.show()
I have three list which are X, Y, Z
X = [[0.67910803031180977, 0.1443997264255876], [0.57, 0.87], [0.545, 0.854], [0.645, 0.1254], [0.645, 0.1354], [0.62, 0.83], [0.6945, 0.144], [0.9945, 0.45244], [0.235, 0.7754], [0.7, 0.85]]
Y = [0, 1, -1, -1, -1, 1, -1, -1, -1, 1]
# List elements must be comma-separated to be valid Python.
Z = [0, 1, 1, 0, 0, 1, 0, 1, 1, 1]
Where,
X is the dataset,
Y is labelset where 0 means "Normal", 1 means "LL" and -1 means "Unlabelled"
Z is outputset in which labels from Y is propagated to unlabelled labels.
Now, I am trying to plot a figure where one subplot shows the dataset clustered by the label from Y each point belongs to, and another subplot shows the dataset with respect to Z.
I tried code from this example but i am not able to do it.
Please help.
I'm guessing at what you want, but here's an example of plotting the X values with colors determined by the Y and Z lists respectively. It's using a lot of default behavior -- color values between 0 and 1 get plotted into a default colorbar, iirc -- but you could make a more complicated function and pass a list of (rgb) or (rgba) values instead.
import matplotlib.pyplot as plt
from numpy import array
X = array([[0.67910803031180977, 0.1443997264255876], [0.57, 0.87],
           [0.545, 0.854], [0.645, 0.1254], [0.645, 0.1354], [0.62, 0.83],
           [0.6945, 0.144], [0.9945, 0.45244], [0.235, 0.7754], [0.7, 0.85]])
Y = [0, 1, -1, -1, -1, 1, -1, -1, -1, 1]
Z = [0, 1, 1, 0, 0, 1, 0, 1, 1, 1]

# for readability mostly
Xx = X.T[0]
Xy = X.T[1]

# Map the labels -1 / 0 / 1 to the colour values 0.2 / 0.5 / 0.8. These are
# real lists: on Python 3 `map` returns an iterator, which scatter cannot use
# as a colour sequence.
y_colors = [0.3 * label + 0.5 for label in Y]
z_colors = [0.3 * label + 0.5 for label in Z]

fig = plt.figure()
ax1 = fig.add_subplot(121)
ax1.scatter(Xx, Xy, c=y_colors, s=50, alpha=0.75)
ax1.set_xlabel('Y labels')
ax2 = fig.add_subplot(122)
ax2.scatter(Xx, Xy, c=z_colors, s=50, alpha=0.75)
ax2.set_xlabel('Z labels')
plt.show()