Not able to plot box plot separately - python

I have lot of feature in data and i want to make box plot for each feature. So for that
import pandas as pd
import seaborn as sns
plt.figure(figsize=(25,20))
for data in train_df.columns:
plt.subplot(7,4,i+1)
plt.subplots_adjust(hspace = 0.5, wspace = 0.5)
ax =sns.boxplot(train_df[data])
I did this
and the output is
All the plot are on one image i want something like
( not with skew graphs but with box plot )
What changes i need to do ?

In your code, I cannot see where the i is coming from and also it's not clear how ax was assigned.
Maybe try something like this, first an example data frame:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
train_df = pd.concat([pd.Series(np.random.normal(i,1,100)) for i in range(12)],axis=1)
Set up fig and a flattened ax for each subplot:
fig,ax = plt.subplots(4,3,figsize=(10,10))
ax = ax.flatten()
The most basic would be to call sns.boxplot assigning ax inside the function:
for i,data in enumerate(train_df.columns):
sns.boxplot(train_df[data],ax=ax[i])

Related

seaborn jointplot with same size plots

I'm doing a jointplot with a basemap, the problem is that when I add the basemap the main plot doesn't have the same size of the marginal plots. I've tried with different parameters without luck. Does anyone have an idea?
import seaborn as sns
import matplotlib.pyplot as plt
import contextily as ctx
import pandas as pd
##exaplme of the data
coords={'longitud':[-62.2037376443, -62.1263309099, -62.1111660957, -62.2094232682, -62.2373117384, -62.4837603464,
-62.4030570833, -62.3975699059, -62.7017114116, -62.7830883096, -62.7786038141, -62.7683234105, -62.7490101452,
-62.7709656745, -63.1002199219, -63.1890252191, -63.1183018549, -63.069960016, -62.7957745659, -63.1715687622,
-63.2156105034, -63.0634381954, -63.2243260588, -63.1153871895, -63.1068292891, -63.103945266, -63.046202785,
-63.1002257551, -63.2076065143, -62.9766391316, -62.9639256604, -62.9911452446, -62.9819984159, -62.9693649898,
-63.066770885, -62.9867441519, -62.9566360192, -62.962616287, -62.835080907, -63.0704805194, -62.8796906301,
-63.0725050601, -63.2224345145, -63.1609069526, -63.0614466072, -62.8847887504, -63.1093652381, -62.822694115,
-63.211982035, -63.1689040153],
'latitud':[8.54644405234, 8.54344899107, 8.54223724187, 8.54290207992, 8.49122679072, 8.48386575122, 8.46450360179,
8.46404720757, 8.35310083084, 8.31701565261, 8.30258604829, 8.29974870902, 8.29281679496, 8.28939264064, 8.28785272804,
8.28221439317, 8.27978694565, 8.27864159366, 8.27634987807, 8.27619269053, 8.27236343925, 8.27258932351, 8.26833993531,
8.267530064, 8.26446669791, 8.26266392333, 8.2641092051, 8.26208837315, 8.26034269744, 8.26123972942, 8.25789799656,
8.25825378832, 8.25833002805, 8.25914612933, 8.2540499893, 8.25347956867, 8.2540932736, 8.25405171513, 8.2478564527,
8.24561857662, 8.2440865055, 8.24256528837, 8.24089278, 8.23877286416, 8.23782626443, 8.23865421655, 8.23733824299,
8.23477115627, 8.23552604027, 8.24327920905]}
df = pd.DataFrame(coords)
OSM_C = 'http://c.tile.openstreetmap.org/{z}/{x}/{y}.png'
joint_axes = sns.jointplot(
x='longitud', y='latitud', data=df, ec="r", s=5)
ctx.add_basemap(joint_axes.ax_joint,crs=4326,attribution=False,url=OSM_C)
adjust(hspace=0, wspace=0)
#plt.tight_layout()
plt.show()
Here is an approach that:
removes the axes sharing in the y-direction to be able to change the aspect to 'datalim'
sets the aspect to 'equal', 'datalim'
sets the y data limits of the marginal plot to be the same as the joint plot; this seems to need a redraw
The following code shows the idea (using imshow, as I don't have contextily installed):
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
coords = {'longitud' : [-62.2037376443, -62.1263309099, -62.1111660957, -62.2094232682, -62.2373117384, -62.4837603464, -62.4030570833, -62.3975699059, -62.7017114116, -62.7830883096, -62.7786038141, -62.7683234105, -62.7490101452, -62.7709656745, -63.1002199219, -63.1890252191, -63.1183018549, -63.069960016, -62.7957745659, -63.1715687622, -63.2156105034, -63.0634381954, -63.2243260588, -63.1153871895, -63.1068292891, -63.103945266, -63.046202785, -63.1002257551, -63.2076065143, -62.9766391316, -62.9639256604, -62.9911452446, -62.9819984159, -62.9693649898, -63.066770885, -62.9867441519, -62.9566360192, -62.962616287, -62.835080907, -63.0704805194, -62.8796906301, -63.0725050601, -63.2224345145, -63.1609069526, -63.0614466072, -62.8847887504, -63.1093652381, -62.822694115, -63.211982035, -63.1689040153],
'latitud' : [8.54644405234, 8.54344899107, 8.54223724187, 8.54290207992, 8.49122679072, 8.48386575122, 8.46450360179, 8.46404720757, 8.35310083084, 8.31701565261, 8.30258604829, 8.29974870902, 8.29281679496, 8.28939264064, 8.28785272804, 8.28221439317, 8.27978694565, 8.27864159366, 8.27634987807, 8.27619269053, 8.27236343925, 8.27258932351, 8.26833993531, 8.267530064, 8.26446669791, 8.26266392333, 8.2641092051, 8.26208837315, 8.26034269744, 8.26123972942, 8.25789799656, 8.25825378832, 8.25833002805, 8.25914612933, 8.2540499893, 8.25347956867, 8.2540932736, 8.25405171513, 8.2478564527, 8.24561857662, 8.2440865055, 8.24256528837, 8.24089278, 8.23877286416, 8.23782626443, 8.23865421655, 8.23733824299, 8.23477115627, 8.23552604027, 8.24327920905]}
df = pd.DataFrame(coords)
g = sns.jointplot(data=df, x='longitud', y='latitud')
ctx.add_basemap(g.ax_joint,crs=4326,attribution=False,url=OSM_C)
# g.ax_joint.imshow(np.random.rand(20, 10), cmap='spring', interpolation='bicubic',
# extent=[df['longitud'].min(), df['longitud'].max(), df['latitud'].min(), df['latitud'].max()])
for axes in g.ax_joint.get_shared_y_axes():
for ax in axes:
g.ax_joint.get_shared_y_axes().remove(ax)
g.ax_joint.set_aspect('equal', 'datalim')
g.fig.canvas.draw()
g.ax_marg_y.set_ylim(g.ax_joint.get_ylim())
plt.show()
You can still combine this approach with changing the figure's width or height, or adding more whitespace on top or below.

Removing outliers from dataset identified in Matplotlib/Seaborn boxplot

I have produced a Boxplot/Swarmplot graph using Matplotlib/Seaborn in Pandas. Some outliers can been seen in the graph (as dots outside the "whiskers"/"fence" area). I am looking for a way to trim the dataset directly after they have been identified in the graph and without removing them from the original dataset. I do not want to simply hide the outlier dots.
Some methods have been recommended and pandas quantile looks promising but I am not sure how to implement these with the code I have been using.
My graph with the outliers.
The code I used to produce this graph. The data has been organized into the tidy format.
# Import libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn style
sns.set(style="whitegrid", palette="colorblind")
# load length tidy data
length_tidy = pd.read_csv('results/tidy/length_tidy.csv')
score_tidy = pd.read_csv('results/tidy/score_tidy.csv')
# Define and save boxplot and swarmplot for length data
fig, ax = plt.subplots(figsize=(10,6))
ax = sns.boxplot(x='Metric', y='Length', data=length_tidy, ax=ax)
ax = sns.swarmplot(x="Metric", y="Length", data=length_tidy, color=".25")
ax.set_xlabel('Condition')
ax.set_ylabel('Length in micrometers')
plt.savefig('statistics/boxplot/length_boxplot.png', dpi=300)
fig, ax = plt.subplots(figsize=(10,6))
ax = sns.boxplot(x='Metric', y='Score', data=score_tidy, ax=ax)
ax = sns.swarmplot(x="Metric", y="Score", data=score_tidy, color=".25")
ax.set_xlabel('Condition')
ax.set_ylabel('Score')
plt.savefig('statistics/boxplot/score_boxplot.png', dpi=300)
An example of some of the data I am working with in the CSV format.
Object,Metric,Length
M11,B2A10,1.807782
MT1,B2A10,3.2207116666666664
MT1,B2A1,3.57675
MT1,B2A2,2.9474600000000004
MT1,B2A3,2.247772857142857
MT1,B2A4,3.754455
MT1,B2A5,2.716282
MT1,B2A6,2.91325
MT1,B2A7,1.24806
MT1,B2A8,2.00371875
MT1,B2A9,1.5435599999999998
MT1,B2B1,2.2051515384615388
MT1,B2B2,1.5278873333333332
MT1,B2B3,1.7283750000000002
MT1,B2B4,1.4547385714285714
MT1,B2B5,3.237578333333333
MT1,B2B6,2.47016
MT1,B2B7,2.1185947777777776
MT1,B2B8,1.8502877777777773
MT10,B2A10,3.07143
MT10,B2A1,3.34361
MT10,B2A2,2.889958333333333
MT10,B2A3,2.22087
MT10,B2A4,2.87669
MT10,B2A5,1.6745005555555557
MT10,B2A7,2.09018
MT10,B2A8,2.4947450000000004
MT10,B2B1,1.849095882352941
MT10,B2B2,1.5291758000000002
MT10,B2B5,1.6423770999999998
MT10,B2B6,1.9680385714285715
MT10,B2B7,1.7207240000000001
MT10,B2B8,2.9618275
MT12,B2A10,1.7243058333333334
MT12,B2A1,3.3938900000000003
MT12,B2A2,2.00601
MT12,B2A3,2.1720200000000003
MT12,B2A4,2.452923333333333
MT12,B2A5,2.986948
MT12,B2A7,2.08466
MT12,B2A8,1.29047
MT12,B2B1,2.528839230769232
MT12,B2B2,1.4011425454545454
MT12,B2B5,1.626078333333333
MT12,B2B6,1.074394454545455
MT12,B2B7,2.0897078571428573
MT12,B2B8,1.4102533333333336

Matplotlib inline in Jupyter - how to contol when the plot is shown?

I have a function that creates a figure and for some reason it is shown in Jupyter notebook twice, even though I didn't run show at all. I pass the fig and ax as an output of this function, and plan to show it only later.
I get confused between plt, fig and ax functionaries and guess that the answer is hidden somewhere there.
Here is an anonymised version of my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
def plot_curve(dummydata):
# builds a chart
fig,ax = plt.subplots(1) # get subplots
fig.set_figheight(7)
fig.set_figwidth(12) #set shape
plt.plot(dummydata.x1, dummydata.y1,label = 'l1') #curve 1
plt.plot(dummydata.x2, dummydata.y2,label = 'l2') #curve2
plt.xlabel('xlabel') #labels
plt.ylabel('xlabel')
plt.yscale('linear') #scale and bounds
plt.ylim(0,100)
ymin,ymax= ax.get_ylim()
ax.axhline(1, color='k', linestyle=':', label = 'lab1') #guideline - horizontal
ax.axvline(2, color='r',linestyle='--', label = 'lab2') #guideline - vertical
ax.axvline(3, color='g',linestyle='--', label = 'lab3') #guideline - vertical
ax.arrow(1,2,3,0, head_width=0.1, head_length=0.01, fc='k', ec='k') # arrow
rect = mpl.patches.Rectangle((1,2), 2,3, alpha = 0.1, facecolor='yellow',
linewidth=0 , label= 'lab4') #yellow area patch
ax.add_patch(rect)
plt.legend()
plt.title('title')
return fig,ax
and then call it with:
for i in range(3):
dummydata = pd.DataFrame({
'x1':np.arange(1+i,100,0.1),
'y1':np.arange(11+i,110,0.1),
'x2':np.arange(1+i,100,0.1),
'y2':np.arange(21+i,120,0.1)
})
fig,ax = plot_curve(dummydata) #get the chart
What should I change to not show the figure by default, and show it only by my command?
Thanks
Try disabling matplotlib interactive mode using plt.ioff(). With interactive mode disabled the plots will only be shown with an explicit plt.show().
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
# Desactivate interactive mode
plt.ioff()
def plot_curve(dummydata):
# the same code as before
Then in another cell
for i in range(3):
dummydata = pd.DataFrame({
'x1':np.arange(1+i,100,0.1),
'y1':np.arange(11+i,110,0.1),
'x2':np.arange(1+i,100,0.1),
'y2':np.arange(21+i,120,0.1)
})
# I'am assuming this should not be in the for loop
# The plot will NOT be shown because we are not in interactive mode
fig, ax = plot_curve(dummydata) #get the chart
No plot will be shown yet.
Now in another cell
# Now ANY plot (figure) which was created and not shown yet will be finally shown
plt.show()
The plot is finally shown. Note that if you have created several plots all of them will be shown now.
Try this:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib
With this importing you should not see the figure after plotting.
But you can see the figure by writing fig to IPython cell:
dummydata = pd.DataFrame({
'x1':np.arange(1,100,0.1),
'y1':np.arange(11,110,0.1),
'x2':np.arange(1,100,0.1),
'y2':np.arange(21,120,0.1)
})
fig,ax = plot_curve(dummydata) #get the chart
fig # Will now plot the figure.
Is this the desired output?

Seaborn - Display Last Value / Label

I would like create an plot with to display the last value on line. But i can not create the plot with the last value on chart. Do you have an idea for to resolve my problem, thanks you !
Input :
DataFrame
Plot
Output :
Cross = Last Value In columns
Output Final
# import eikon as ek
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import seaborn as sns; sns.set()
import pylab
from scipy import *
from pylab import *
fichier = "P:/GESTION_RPSE/GES - Gestion Epargne Salariale/Dvp Python/Florian/Absolute
Performance/PLOT.csv"
df = pd.read_csv(fichier)
df = df.drop(columns=['Unnamed: 0'])
# sns.set()
plt.figure(figsize=(16, 10))
df = df.melt('Date', var_name='Company', value_name='Value')
#palette = sns.color_palette("husl",12)
ax = sns.lineplot(x="Date", y="Value", hue='Company', data=df).set_title("LaLaLa")
plt.show()
Do you just want to put an 'X' at the end of your lines?
If so, you could pass markerevery=[-1] to the call to lineplot(). However there are a few caveats:
You have to use style= instead of hue= otherwise, there are no markers drawn
Filled markers work better than unfilled markers (like "x"). You can just use markers=True to use the default markers, or pass a list markers=['s','d','o',etc...]
code:
fmri = sns.load_dataset("fmri")
fig, ax = plt.subplots()
ax = sns.lineplot(x="timepoint", y="signal",
style="event", data=fmri, ci=None, markers=True, markevery=[-1], markersize=10)

How to show label names in pandas groupby histogram plot

I can plot multiple histograms in a single plot using pandas but there are few things missing:
How to give the label.
I can only plot one figure, how to change it to layout=(3,1) or something else.
Also, in figure 1, all the bins are filled with solid colors, and its kind of difficult to know which is which, how to fill then with different markers (eg. crosses,slashes,etc)?
Here is the MWE:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('iris')
df.groupby('species')['sepal_length'].hist(alpha=0.7,label='species')
plt.legend()
Output:
To change layout I can use by keyword, but can't give them colors
HOW TO GIVE DIFFERENT COLORS?
df.hist('sepal_length',by='species',layout=(3,1))
plt.tight_layout()
Gives:
You can resolve to groupby:
fig,ax = plt.subplots()
hatches = ('\\', '//', '..') # fill pattern
for (i, d),hatch in zip(df.groupby('species'), hatches):
d['sepal_length'].hist(alpha=0.7, ax=ax, label=i, hatch=hatch)
ax.legend()
Output:
In pandas version 1.1.0 you can simply set the legend keyword to true.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('iris')
df.groupby('species')['sepal_length'].hist(alpha=0.7, legend = True)
output image
It's more code, but using pure matplotlib will always give you more control over the plots. For your second case:
import matplotlib.pyplot as plt
import numpy as np
from itertools import zip_longest
# Dictionary of color for each species
color_d = dict(zip_longest(df.species.unique(),
plt.rcParams['axes.prop_cycle'].by_key()['color']))
# Use the same bins for each
xmin = df.sepal_length.min()
xmax = df.sepal_length.max()
bins = np.linspace(xmin, xmax, 20)
# Set up correct number of subplots, space them out.
fig, ax = plt.subplots(nrows=df.species.nunique(), figsize=(4,8))
plt.subplots_adjust(hspace=0.4)
for i, (lab, gp) in enumerate(df.groupby('species')):
ax[i].hist(gp.sepal_length, ec='k', bins=bins, color=color_d[lab])
ax[i].set_title(lab)
# same xlim for each so we can see differences
ax[i].set_xlim(xmin, xmax)

Categories

Resources