I am trying to save a large dendrogram made from a large table (10000+ rows, 18 columns), and I came up with this code:
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import pandas as pd
# Question's script as posted: builds a Ward-linkage dendrogram from the last
# 18 columns of a tab-separated table and saves it as a PNG.
data = pd.read_csv("Input.txt", header = 0, index_col = None,\
sep = "\t", memory_map = True)
# Missing values are replaced with 0 before clustering.
data = data.fillna(0)
# Only the last 18 columns are used as clustering input.
Matrix = data.iloc[:,-18:]
Linkage_Matrix = linkage (Matrix, "ward")
fig=plt.figure(figsize=(20, 200))
#fig, ax = plt.subplots(1, 1, tight_layout=False)
# Axes placed by hand; the rectangle is [left, bottom, width, height] in
# figure fractions.
ax = fig.add_axes([0.1,0.1,0.75,0.75])
#fig.title('Hierarchical Clustering Dendrogram')
ax.set_title("Hierarchical Clustering Dendrogram")
ax.set_xlabel("distance")
# NOTE(review): this second set_xlabel call replaces "distance" on the x axis;
# set_ylabel was presumably intended - confirm.
ax.set_xlabel("name")
# No ax= argument is given, so dendrogram draws on the current axes.
dendrogram(
Linkage_Matrix,
orientation ="left",
leaf_rotation=0.,
leaf_font_size=12.,
labels = list(data.loc[:,"name"])
)
# NOTE(review): this re-labels the y ticks with the names in table order;
# verify that this matches the leaf order produced by dendrogram above.
ax.set_yticklabels(list(data.loc[:,"name"]), minor=False)
ax.yaxis.set_label_position('right')
ax.yaxis.tick_right()
plt.savefig("plt1.png", dpi = 320, format= "png", bbox_inches=None)
But unfortunately, it doesn't save the axis, even though I left some space as shown in these:
Matplotlib savefig does not save axes
Why is my xlabel cut off in my matplotlib plot?
Matplotlib savefig image trim
Plotting hierarchical clustering dendrograms for large data sets
Dendrogram generated by scipy-cluster customisation
I have a correct display in the console, which I can save, but the dpi are not good, and ideally I also would like to switch to svg to be able to set the level of readability afterwards.
Any insights would be greatly appreciated
Removing this line
ax = fig.add_axes([0.1,0.1,0.75,0.75])
and setting bbox_inches='tight' in plt.savefig() makes it work for me.
Also, since you are loading the data with pandas, note how you can declare the 'name' column as index and use these index values as labels.
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import pandas as pd
# Read the table with the 'name' column as index so the index values can be
# used directly as leaf labels; missing values become 0.
data = pd.read_csv('input.txt', header=0, index_col=['name'], sep="\t").fillna(0)
link_matrix = linkage(data, method='ward')

# Single axes on a 20 x 200 inch figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 200))
ax.set(
    title='Hierarchical Clustering Dendrogram',
    xlabel='distance',
    ylabel='name',
)

# Draw the tree on the axes created above, leaves labelled by index value.
dendrogram(
    link_matrix,
    orientation='left',
    leaf_rotation=0.,
    leaf_font_size=12.,
    labels=data.index.values,
    ax=ax,
)

# Put the y-axis label and ticks on the right-hand side.
ax.yaxis.set_label_position('right')
ax.yaxis.tick_right()

# bbox_inches='tight' makes the saved image include the labels and ticks.
fig.savefig('plt1.png', dpi=320, format='png', bbox_inches='tight')
Related
I'm doing a jointplot with a basemap, the problem is that when I add the basemap the main plot doesn't have the same size of the marginal plots. I've tried with different parameters without luck. Does anyone have an idea?
import seaborn as sns
import matplotlib.pyplot as plt
import contextily as ctx
import pandas as pd
## example of the data
coords={'longitud':[-62.2037376443, -62.1263309099, -62.1111660957, -62.2094232682, -62.2373117384, -62.4837603464,
-62.4030570833, -62.3975699059, -62.7017114116, -62.7830883096, -62.7786038141, -62.7683234105, -62.7490101452,
-62.7709656745, -63.1002199219, -63.1890252191, -63.1183018549, -63.069960016, -62.7957745659, -63.1715687622,
-63.2156105034, -63.0634381954, -63.2243260588, -63.1153871895, -63.1068292891, -63.103945266, -63.046202785,
-63.1002257551, -63.2076065143, -62.9766391316, -62.9639256604, -62.9911452446, -62.9819984159, -62.9693649898,
-63.066770885, -62.9867441519, -62.9566360192, -62.962616287, -62.835080907, -63.0704805194, -62.8796906301,
-63.0725050601, -63.2224345145, -63.1609069526, -63.0614466072, -62.8847887504, -63.1093652381, -62.822694115,
-63.211982035, -63.1689040153],
'latitud':[8.54644405234, 8.54344899107, 8.54223724187, 8.54290207992, 8.49122679072, 8.48386575122, 8.46450360179,
8.46404720757, 8.35310083084, 8.31701565261, 8.30258604829, 8.29974870902, 8.29281679496, 8.28939264064, 8.28785272804,
8.28221439317, 8.27978694565, 8.27864159366, 8.27634987807, 8.27619269053, 8.27236343925, 8.27258932351, 8.26833993531,
8.267530064, 8.26446669791, 8.26266392333, 8.2641092051, 8.26208837315, 8.26034269744, 8.26123972942, 8.25789799656,
8.25825378832, 8.25833002805, 8.25914612933, 8.2540499893, 8.25347956867, 8.2540932736, 8.25405171513, 8.2478564527,
8.24561857662, 8.2440865055, 8.24256528837, 8.24089278, 8.23877286416, 8.23782626443, 8.23865421655, 8.23733824299,
8.23477115627, 8.23552604027, 8.24327920905]}
# Question's script as posted: scatter joint plot of the coordinates with an
# OpenStreetMap basemap added to the central (joint) axes.
df = pd.DataFrame(coords)
# Tile URL template passed to contextily below.
OSM_C = 'http://c.tile.openstreetmap.org/{z}/{x}/{y}.png'
joint_axes = sns.jointplot(
x='longitud', y='latitud', data=df, ec="r", s=5)
# The basemap is added to the joint axes only, not to the marginal axes.
ctx.add_basemap(joint_axes.ax_joint,crs=4326,attribution=False,url=OSM_C)
# NOTE(review): `adjust` is neither defined nor imported in this snippet;
# possibly plt.subplots_adjust was meant - confirm.
adjust(hspace=0, wspace=0)
#plt.tight_layout()
plt.show()
Here is an approach that:
removes the axes sharing in the y-direction to be able to change the aspect to 'datalim'
sets the aspect to 'equal', 'datalim'
sets the y data limits of the marginal plot to be the same as the joint plot; this seems to need a redraw
The following code shows the idea (using imshow, as I don't have contextily installed):
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
coords = {'longitud' : [-62.2037376443, -62.1263309099, -62.1111660957, -62.2094232682, -62.2373117384, -62.4837603464, -62.4030570833, -62.3975699059, -62.7017114116, -62.7830883096, -62.7786038141, -62.7683234105, -62.7490101452, -62.7709656745, -63.1002199219, -63.1890252191, -63.1183018549, -63.069960016, -62.7957745659, -63.1715687622, -63.2156105034, -63.0634381954, -63.2243260588, -63.1153871895, -63.1068292891, -63.103945266, -63.046202785, -63.1002257551, -63.2076065143, -62.9766391316, -62.9639256604, -62.9911452446, -62.9819984159, -62.9693649898, -63.066770885, -62.9867441519, -62.9566360192, -62.962616287, -62.835080907, -63.0704805194, -62.8796906301, -63.0725050601, -63.2224345145, -63.1609069526, -63.0614466072, -62.8847887504, -63.1093652381, -62.822694115, -63.211982035, -63.1689040153],
'latitud' : [8.54644405234, 8.54344899107, 8.54223724187, 8.54290207992, 8.49122679072, 8.48386575122, 8.46450360179, 8.46404720757, 8.35310083084, 8.31701565261, 8.30258604829, 8.29974870902, 8.29281679496, 8.28939264064, 8.28785272804, 8.28221439317, 8.27978694565, 8.27864159366, 8.27634987807, 8.27619269053, 8.27236343925, 8.27258932351, 8.26833993531, 8.267530064, 8.26446669791, 8.26266392333, 8.2641092051, 8.26208837315, 8.26034269744, 8.26123972942, 8.25789799656, 8.25825378832, 8.25833002805, 8.25914612933, 8.2540499893, 8.25347956867, 8.2540932736, 8.25405171513, 8.2478564527, 8.24561857662, 8.2440865055, 8.24256528837, 8.24089278, 8.23877286416, 8.23782626443, 8.23865421655, 8.23733824299, 8.23477115627, 8.23552604027, 8.24327920905]}
df = pd.DataFrame(coords)
g = sns.jointplot(data=df, x='longitud', y='latitud')

# Stand-in background drawn with imshow over the data extent. This snippet
# neither imports contextily nor defines OSM_C, so the basemap call is left
# commented out; enable it once both are available.
# ctx.add_basemap(g.ax_joint, crs=4326, attribution=False, url=OSM_C)
g.ax_joint.imshow(
    np.random.rand(20, 10),
    cmap='spring',
    interpolation='bicubic',
    extent=[df['longitud'].min(), df['longitud'].max(),
            df['latitud'].min(), df['latitud'].max()],
)

# Remove every axes from the shared-y groups so that the joint axes can use
# the 'datalim' adjustable.
# NOTE: modifying this grouper is deprecated since Matplotlib 3.6; newer
# releases return a read-only view, so this step needs revisiting there.
for axes in g.ax_joint.get_shared_y_axes():
    for ax in axes:
        g.ax_joint.get_shared_y_axes().remove(ax)

g.ax_joint.set_aspect('equal', 'datalim')
# Redraw first so the joint axes' y limits reflect the new aspect, then copy
# them to the marginal y plot.
g.fig.canvas.draw()
g.ax_marg_y.set_ylim(g.ax_joint.get_ylim())
plt.show()
You can still combine this approach with changing the figure's width or height, or adding more whitespace on top or below.
I have a lot of features in my data and I want to make a box plot for each feature. So for that
import pandas as pd
import seaborn as sns
# Question's loop as posted, with the loop-body indentation restored.
# NOTE(review): `i` is never assigned and `plt` is never imported in this
# snippet; both are left exactly as the question shows them.
plt.figure(figsize=(25,20))
for data in train_df.columns:
    plt.subplot(7,4,i+1)
    plt.subplots_adjust(hspace = 0.5, wspace = 0.5)
    ax =sns.boxplot(train_df[data])
I did this
and the output is
All the plot are on one image i want something like
( not with skew graphs but with box plot )
What changes i need to do ?
In your code, I cannot see where the i is coming from and also it's not clear how ax was assigned.
Maybe try something like this, first an example data frame:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Example frame: 12 columns of 100 normal samples each (unit standard
# deviation), where column k is centred on k.
train_df = pd.concat(
    [
        pd.Series(np.random.normal(loc=mean, scale=1, size=100))
        for mean in range(12)
    ],
    axis=1,
)
Set up fig and a flattened ax for each subplot:
# 4 x 3 grid of subplots; the 2-D array of axes is flattened so that each
# subplot can be addressed with a single index.
fig, ax_grid = plt.subplots(nrows=4, ncols=3, figsize=(10, 10))
ax = ax_grid.flatten()
The most basic would be to call sns.boxplot assigning ax inside the function:
# One box plot per column, each drawn on its own subplot.
for i, data in enumerate(train_df.columns):
    # Pass the values as x= explicitly: the first positional parameter of
    # sns.boxplot changed from `x` to `data` in seaborn 0.12, so the keyword
    # keeps the same horizontal box plot on every version.
    sns.boxplot(x=train_df[data], ax=ax[i])
I have produced a Boxplot/Swarmplot graph using Matplotlib/Seaborn in Pandas. Some outliers can be seen in the graph (as dots outside the "whiskers"/"fence" area). I am looking for a way to trim the dataset directly after they have been identified in the graph and without removing them from the original dataset. I do not want to simply hide the outlier dots.
Some methods have been recommended and pandas quantile looks promising but I am not sure how to implement these with the code I have been using.
My graph with the outliers.
The code I used to produce this graph. The data has been organized into the tidy format.
# Import libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set the seaborn style and colour palette used by all plots below
sns.set(style="whitegrid", palette="colorblind")
# Load the tidy-format length and score tables
length_tidy = pd.read_csv('results/tidy/length_tidy.csv')
score_tidy = pd.read_csv('results/tidy/score_tidy.csv')
# Length data: box plot with a swarm plot on top, saved as PNG.
# swarmplot is given no ax=, so it draws on the current axes.
fig, ax = plt.subplots(figsize=(10,6))
ax = sns.boxplot(x='Metric', y='Length', data=length_tidy, ax=ax)
ax = sns.swarmplot(x="Metric", y="Length", data=length_tidy, color=".25")
ax.set_xlabel('Condition')
ax.set_ylabel('Length in micrometers')
plt.savefig('statistics/boxplot/length_boxplot.png', dpi=300)
# Score data: same box + swarm layout on a fresh figure, saved as PNG.
fig, ax = plt.subplots(figsize=(10,6))
ax = sns.boxplot(x='Metric', y='Score', data=score_tidy, ax=ax)
ax = sns.swarmplot(x="Metric", y="Score", data=score_tidy, color=".25")
ax.set_xlabel('Condition')
ax.set_ylabel('Score')
plt.savefig('statistics/boxplot/score_boxplot.png', dpi=300)
An example of some of the data I am working with in the CSV format.
Object,Metric,Length
M11,B2A10,1.807782
MT1,B2A10,3.2207116666666664
MT1,B2A1,3.57675
MT1,B2A2,2.9474600000000004
MT1,B2A3,2.247772857142857
MT1,B2A4,3.754455
MT1,B2A5,2.716282
MT1,B2A6,2.91325
MT1,B2A7,1.24806
MT1,B2A8,2.00371875
MT1,B2A9,1.5435599999999998
MT1,B2B1,2.2051515384615388
MT1,B2B2,1.5278873333333332
MT1,B2B3,1.7283750000000002
MT1,B2B4,1.4547385714285714
MT1,B2B5,3.237578333333333
MT1,B2B6,2.47016
MT1,B2B7,2.1185947777777776
MT1,B2B8,1.8502877777777773
MT10,B2A10,3.07143
MT10,B2A1,3.34361
MT10,B2A2,2.889958333333333
MT10,B2A3,2.22087
MT10,B2A4,2.87669
MT10,B2A5,1.6745005555555557
MT10,B2A7,2.09018
MT10,B2A8,2.4947450000000004
MT10,B2B1,1.849095882352941
MT10,B2B2,1.5291758000000002
MT10,B2B5,1.6423770999999998
MT10,B2B6,1.9680385714285715
MT10,B2B7,1.7207240000000001
MT10,B2B8,2.9618275
MT12,B2A10,1.7243058333333334
MT12,B2A1,3.3938900000000003
MT12,B2A2,2.00601
MT12,B2A3,2.1720200000000003
MT12,B2A4,2.452923333333333
MT12,B2A5,2.986948
MT12,B2A7,2.08466
MT12,B2A8,1.29047
MT12,B2B1,2.528839230769232
MT12,B2B2,1.4011425454545454
MT12,B2B5,1.626078333333333
MT12,B2B6,1.074394454545455
MT12,B2B7,2.0897078571428573
MT12,B2B8,1.4102533333333336
I can plot multiple histograms in a single plot using pandas but there are few things missing:
How to give the label.
I can only plot one figure, how to change it to layout=(3,1) or something else.
Also, in figure 1, all the bins are filled with solid colors, and it's kind of difficult to know which is which; how can I fill them with different markers (e.g. crosses, slashes, etc.)?
Here is the MWE:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the 'iris' example dataset through seaborn.
df = sns.load_dataset('iris')
# One sepal_length histogram per species; every histogram is given the same
# literal label 'species'.
df.groupby('species')['sepal_length'].hist(alpha=0.7,label='species')
plt.legend()
Output:
To change layout I can use by keyword, but can't give them colors
HOW TO GIVE DIFFERENT COLORS?
# by='species' draws one histogram per species; layout=(3,1) arranges them as
# 3 rows x 1 column.
df.hist('sepal_length',by='species',layout=(3,1))
plt.tight_layout()
Gives:
You can resolve to groupby:
fig, ax = plt.subplots()
hatches = ('\\', '//', '..')  # fill pattern, one per species
# Draw each species' histogram on the same axes, labelled with the species
# name and filled with its own hatch pattern.
for (species, group), hatch in zip(df.groupby('species'), hatches):
    group['sepal_length'].hist(alpha=0.7, ax=ax, label=species, hatch=hatch)
ax.legend()
Output:
In pandas version 1.1.0 you can simply set the legend keyword to true.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('iris')
# Histogram of sepal_length for each species; legend=True asks pandas to add
# the legend itself.
sepal_length_by_species = df.groupby('species')['sepal_length']
sepal_length_by_species.hist(alpha=0.7, legend=True)
output image
It's more code, but using pure matplotlib will always give you more control over the plots. For your second case:
import matplotlib.pyplot as plt
import numpy as np
from itertools import zip_longest
from itertools import cycle

# Dictionary of color for each species. cycle() repeats the palette when there
# are more species than colors, and zip() stops at the last species, so no
# placeholder None key is created.
palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_d = dict(zip(df.species.unique(), cycle(palette)))
# Use the same bins for each
xmin = df.sepal_length.min()
xmax = df.sepal_length.max()
bins = np.linspace(xmin, xmax, 20)
# Set up correct number of subplots, space them out.
# squeeze=False keeps the result a 2-D array even when there is only one
# species; the single column is then taken so ax[i] works in every case.
fig, ax = plt.subplots(nrows=df.species.nunique(), figsize=(4, 8), squeeze=False)
ax = ax[:, 0]
plt.subplots_adjust(hspace=0.4)
for i, (lab, gp) in enumerate(df.groupby('species')):
    ax[i].hist(gp.sepal_length, ec='k', bins=bins, color=color_d[lab])
    ax[i].set_title(lab)
    # same xlim for each so we can see differences
    ax[i].set_xlim(xmin, xmax)
I need to change the colors of the boxplot drawn using the pandas utility function. I can change most properties using the color argument but can't figure out how to change the facecolor of the box. Does anyone know how to do it?
import pandas as pd
import numpy as np
# Question's example as posted: 100 x 4 random values in columns A-D.
data = np.random.randn(100, 4)
labels = list("ABCD")
df = pd.DataFrame(data, columns=labels)
# Colors for the individual box-plot parts, keyed by part name.
props = dict(boxes="DarkGreen", whiskers="DarkOrange", medians="DarkBlue", caps="Gray")
df.plot.box(color=props)
While I still recommend seaborn and raw matplotlib over the plotting interface in pandas, it turns out that you can pass patch_artist=True as a kwarg to df.plot.box, which will pass it as a kwarg to df.plot, which will pass it as a kwarg to matplotlib.Axes.boxplot.
import pandas as pd
import numpy as np
# 100 x 4 random values in columns A-D.
labels = ["A", "B", "C", "D"]
data = np.random.randn(100, 4)
df = pd.DataFrame(data, columns=labels)

# Colors for the individual box-plot parts, keyed by part name.
props = {
    "boxes": "DarkGreen",
    "whiskers": "DarkOrange",
    "medians": "DarkBlue",
    "caps": "Gray",
}

# patch_artist=True is forwarded to matplotlib so that the boxes are drawn as
# patches, which can carry a face color.
df.plot.box(color=props, patch_artist=True)
As suggested, I ended up creating a function to plot this, using raw matplotlib.
def plot_boxplot(data, ax):
    """Draw a colored box plot of *data* on *ax*.

    Args:
        data: Table-like object exposing ``.values`` (the numbers handed to
            ``ax.boxplot``) and ``.columns`` (used as x tick labels), e.g. a
            pandas DataFrame.
        ax: Matplotlib axes (or compatible object) to draw on.

    Returns:
        None. The axes is modified in place.
    """
    # patch_artist=True makes the boxes patches, so they accept a facecolor.
    bp = ax.boxplot(data.values, patch_artist=True)
    for box in bp['boxes']:
        box.set(color='DarkGreen')
        box.set(facecolor='DarkGreen')
    for whisker in bp['whiskers']:
        whisker.set(color="DarkOrange")
    for cap in bp['caps']:
        cap.set(color="Gray")
    for median in bp['medians']:
        # White so the median stays visible on the dark green fill.
        median.set(color="white")
    # Dotted reference line at y = 0.
    ax.axhline(0, color="DarkBlue", linestyle=":")
    ax.set_xticklabels(data.columns)
I suggest using df.plot.box with patch_artist=True and return_type='both' (which returns the matplotlib axes the boxplot is drawn on and a dictionary whose values are the matplotlib Lines of the boxplot) in order to have the best customization possibilities.
For example, given this data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 100 rows of standard-normal samples in four columns named A-D.
samples = np.random.randn(100, 4)
df = pd.DataFrame(samples, columns=["A", "B", "C", "D"])
you can set a specific color for all the boxes:
fig, ax = plt.subplots(figsize=(9, 6))
# return_type='both' gives back the axes together with a dict of the drawn
# artists; with patch_artist=True its 'boxes' entries are fillable patches.
ax, props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
for patch in props['boxes']:
    patch.set_facecolor('lime')
plt.show()
you can set a specific color for each box:
# One face color per box, in column order.
colors = ['green', 'blue', 'yellow', 'red']
fig, ax = plt.subplots(figsize=(9, 6))
ax, props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
for patch, color in zip(props['boxes'], colors):
    patch.set_facecolor(color)
plt.show()
you can easily integrate a colormap:
# One random integer in 0..9 per box; these values drive the colormap.
colors = np.random.randint(0, 10, 4)
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap('rainbow')
# A single Normalize object is shared by the boxes and the colorbar; unlike
# the hand-written (c - min) / (max - min) it does not divide by zero when
# all values happen to be equal.
norm = plt.Normalize(colors.min(), colors.max())
colors_cm = [cm(norm(c)) for c in colors]
fig, ax = plt.subplots(figsize=(9, 6))
ax, props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
for patch, color in zip(props['boxes'], colors_cm):
    patch.set_facecolor(color)
# to add colorbar: the mappable already carries the norm and colormap, so no
# extra cmap= argument is passed to colorbar itself.
fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cm), ax=ax)
plt.show()