seaborn pairplot after converting an integer column to string - python

I am having trouble with seaborn.pairplot() in the code below.
I have a dataframe and, in one case, I have to convert one of its columns to string. After converting that column to string,
pairplot() no longer works properly.
How can I fix this issue?
Below is the code:
import numpy as np
from pandas import DataFrame
import seaborn as sns
%matplotlib inline
# Build a 5x4 frame of non-negative random floats with labelled rows and columns.
Index= ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
df_temp = DataFrame(abs(np.random.randn(5, 4)), index=Index, columns=Cols)
print(df_temp)
sns.pairplot(df_temp) # This works (every column is still numeric here)
# Convert one of the columns to string dtype
df_temp['A'] = df_temp['A'].astype(str)
sns.pairplot(df_temp) # Gives error (column 'A' now holds strings)
Complete error log - Error log

On the diagonal of a pairplot there are histograms. It is not possible to draw histograms from strings. Since I'm not sure what you would want to show on the diagonal instead in that case, let's leave the diagonal out and simply plot a pair grid from a dataframe which contains strings in one column:
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
import seaborn as sns
# Row labels and column names for the demo frame.
Index = ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
values = np.abs(np.random.randn(5, 4))
df = DataFrame(values, index=Index, columns=Cols)
# Overwrite column 'A' with one single-letter string per row.
df['A'] = ['V', 'W', 'X', 'Y', 'Z']
# Only the off-diagonal panels are drawn; the diagonal is left empty.
g = sns.PairGrid(df, vars=df.columns, height=2)
g.map_offdiag(sns.scatterplot)
plt.show()
If instead the aim is to just use numeric columns, you can filter the dataframe by dtype.
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
import seaborn as sns
# Row labels and column names for the demo frame.
Index = ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
values = np.abs(np.random.randn(5, 4))
df = DataFrame(values, index=Index, columns=Cols)
# Turn column 'A' into strings so that it is no longer numeric.
df['A'] = df['A'].astype(str)
# Keep only the numeric columns and plot those.
numeric_only = df.select_dtypes(include=[np.number])
sns.pairplot(numeric_only)
plt.show()

import numpy as np
from pandas import DataFrame
import seaborn as sns
%matplotlib inline
# Row labels and column names for the demo frame.
Index = ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
values = np.abs(np.random.randn(5, 4))
df_temp = DataFrame(values, index=Index, columns=Cols)
print(df_temp)
# Cast column 'A' to str so that it stops being a float column.
df_temp['A'] = df_temp['A'].astype(str)
You can find all the columns of type float and plot only those.
# Boolean Series indexed by column name: True where the column dtype is float.
# (DataFrame has no `.types` attribute; the per-column dtypes live in `.dtypes`.)
cols_to_plot = df_temp.dtypes == 'float'  # find the non-string columns
# Select the float columns by name and plot only those.
sns.pairplot(df_temp[cols_to_plot[cols_to_plot].index])

Related

How do you plot by Groupby?

I want to know how to plot a bar graph grouped by 'new_build', where the X axis shows the towns and the Y axis shows the percentage values from the calculation performed in the code below:
# Per-(new_build, town) row count divided by the total of those counts,
# i.e. each group's share of all counted rows.
df_House.groupby(['new_build', 'town'])['price_paid'].count()/df_House.groupby(['new_build', 'town'])['price_paid'].count().sum()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Small sample frame: one row per sale with its new_build flag, town and price.
data = [['a', 'x', 100], ['b', 'y', 200], ['c', 'z', 300], ['a', 'y', 400], ['b', 'z', 600], ['c', 'x', 100]]
df = pd.DataFrame(data, columns=['new_build', 'town', 'price_paid'])
# Total price_paid for every (new_build, town) pair.
df_town = df.groupby(['new_build', 'town']).agg({'price_paid': 'sum'})
# Total price_paid for every new_build group; this is the denominator of the
# percentage (it was referenced below but never defined).
state = df.groupby('new_build').agg({'price_paid': 'sum'})
# Divide each pair's total by its new_build total, matching on the
# 'new_build' index level, and scale to percent.
new_df = df_town.div(state, level='new_build') * 100
new_df.reset_index(inplace=True)
sns.set()
new_df.set_index(['new_build', 'town']).price_paid.plot(kind='bar', stacked=True)
plt.ylabel('% of price_paid')
Not sure if you need a stacked graph to better represent the data but to keep things less complicated, I have created a bar graph performing the task you requested.

How to save multiple Seaborn plots into single pdf file

So I'm trying to save multiple plots that I create in a for loop into a single PDF file. I've searched around on SO and pieced together some code that seems to work, except that it doesn't save the figures: it creates a PDF, but without anything in it.
Here's the code to reproduce it:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# 5x5 frame of random integers drawn from [0, 10), with columns 'a'..'e'.
dftest = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)),
columns=['a', 'b', 'c', 'd', 'e'])
from matplotlib.backends.backend_pdf import PdfPages
# Intent: one count plot per column, each saved as a page of count.pdf.
# NOTE(review): the indentation of the with/for bodies appears to have been
# lost in this listing.
with PdfPages('count.pdf') as pdf_pages:
df1 = dftest.select_dtypes([np.int, np.float, np.object])
for i, col in enumerate(df1.columns):
plt.figure(i)
countplot = sns.countplot(x=col, data=df1)
# This is the call the question is about: the asker reports that the
# resulting PDF has nothing in it.
pdf_pages.savefig(countplot.fig)
Saving the figure returned by plt.figure() works for me:
# Write one count plot per column of dftest, each on its own page of count.pdf.
with PdfPages('count.pdf') as pdf_pages:
    # The np.int / np.float / np.object aliases were removed from NumPy, so
    # select the numeric and object columns by dtype name instead.
    df1 = dftest.select_dtypes(include=['number', 'object'])
    for i, col in enumerate(df1.columns):
        # Keep a handle on the Figure itself and hand that to savefig.
        figu = plt.figure(i)
        sns.countplot(x=col, data=df1)
        pdf_pages.savefig(figu)

Pandas DataFrame plot: specify column from MultiIndex for secondary_y

I am plotting a multi-index columns DataFrame.
What is the syntax to specify the column(s) to be plotted on secondary_y using the .plot method of pandas DataFrame?
Setup
import numpy as np
import pandas as pd
# Two-level column index: ('A', 'B') x ('first', 'second') -> four columns.
mt_idx = pd.MultiIndex.from_product([['A', 'B'], ['first', 'second']])
# 20 rows of random integers drawn from [0, 10), one column per index entry.
df = pd.DataFrame(np.random.randint(0, 10, size=(20, len(mt_idx))), columns=mt_idx)
My Attempts
# Attempt 1: pass the column key as a tuple.
df.plot(secondary_y=('B', 'second'))
# Attempt 2: pass the column key as the string '(B, second)'.
df.plot(secondary_y='(B, second)')
None of the above worked, as all the lines were plotted on the principal y-axis.
One possible solution would be to plot each column group separately, then specify secondary_y=True for the one that belongs on the secondary axis. Doing it the following way requires you to specify the axes on which they will be plotted:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Sample data: four integer columns under a two-level column index.
mt_idx = pd.MultiIndex.from_product([['A', 'B'], ['first', 'second']])
samples = np.random.randint(0, 10, size=(20, len(mt_idx)))
df = pd.DataFrame(samples, columns=mt_idx)
# Both groups share one Axes; group 'B' is drawn against the secondary y-axis.
fig, ax = plt.subplots()
df['A'].plot(ax=ax)
df['B'].plot(ax=ax, secondary_y=True)
plt.show()
You might drop the upper column index level. If you don't want to modify the original dataframe, this could be done on a copy of it.
# Work on a copy so the original frame keeps its two-level columns.
df2 = df.copy()
# Collapse each (upper, lower) column key into a single 'upper_lower' label.
df2.columns = ['_'.join(levels) for levels in df2.columns]
df2.plot(secondary_y='B_second')

Plot duplication in Pandas Plot()

There is an issue with the plot() function in Pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Note the column labels: 'A' and 'B' each appear twice.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'A', 'B'])
ax = df.plot()
# Single-column legend with font size 6, anchored at (1, 1).
ax.legend(ncol=1, bbox_to_anchor=(1., 1, 0., 0), loc=2 , prop={'size':6})
This will make a plot with too many lines. Note however that half will be on top of each other. It seems to have something to do with the axis because when I do not use them the issue goes away.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Same kind of frame (duplicate 'A'/'B' labels), plotted without keeping the
# returned Axes or adding a custom legend.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'A', 'B'])
df.plot()
UPDATE
While not ideal for my use case, the issue can be fixed by using a MultiIndex:
# Two-level column index: upper level 'left', 'left', 'right', 'right' and
# lower level 'A', 'B', 'A', 'B', so every (High, Low) key is unique.
columns = pd.MultiIndex.from_arrays([np.hstack([ ['left']*2, ['right']*2]), ['A', 'B']*2], names=['High', 'Low'])
df = pd.DataFrame(np.random.randn(8, 4), columns=columns)
ax = df.plot()
# Single-column legend with font size 16, anchored at (1, 1).
ax.legend(ncol=1, bbox_to_anchor=(1., 1, 0., 0), loc=2 , prop={'size':16})
It has to do with your duplication of column names, not ax at all (if you call plt.legend after your second example you see the same extra lines). Having multiple columns with the same name is confusing the call to DataFrame.plot_frame.
If you change your columns to ['A', 'B', 'C', 'D'] instead, it's fine.

Color a heatmap in Python/Matplotlib according to requirement

I'm trying to make a heatmap with a specified requirement of the coloring. I want to set an interval for the data and judge that as ok and color it green, the rest of the results should be colored as red. Does anyone have a clue of how to do this??
I have attached a simple example using pandas and matplotlib for better understanding.
import numpy as np
from pandas import *
import matplotlib.pyplot as plt
# Row labels and column names for the demo frame.
Index= ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
# 5x4 array of non-negative random values.
data= abs(np.random.randn(5, 4))
df = DataFrame(data, index=Index, columns=Cols)
# Draw the frame's values as a grid of coloured cells.
plt.pcolor(df)
# Tick positions start at 0.5 and step by 1; the labels are the row and
# column names.
plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns)
plt.show()
There's more than one way to do this.
The easiest way is to just pass in a boolean array to pcolor and then choose a colormap where green is high and red is low.
For example:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Row labels and column names for the demo frame.
Index = ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
data = np.random.random((5, 4))
df = pd.DataFrame(data, index=Index, columns=Cols)
# Plot the True/False mask rather than the raw values, so only two colours
# from the red-to-green colormap are used.
above_cutoff = df > 0.5
plt.pcolor(above_cutoff, cmap='RdYlGn')
# Ticks sit at 0.5, 1.5, ... and carry the row / column names.
row_ticks = np.arange(len(df.index)) + 0.5
col_ticks = np.arange(len(df.columns)) + 0.5
plt.yticks(row_ticks, df.index)
plt.xticks(col_ticks, df.columns)
plt.show()
Alternatively, as @Cyber mentioned, you could make a two-color colormap based on your values and use it:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
# Row labels and column names for the demo frame.
Index = ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
data = np.random.random((5, 4))
df = pd.DataFrame(data, index=Index, columns=Cols)
# Two bins: values from 0 to 0.5 are drawn red, values from 0.5 to 1 green.
levels = [0, 0.5, 1]
cmap, norm = mcolors.from_levels_and_colors(levels, ['red', 'green'])
plt.pcolor(df, cmap=cmap, norm=norm)
# Ticks sit at 0.5, 1.5, ... and carry the row / column names.
row_ticks = np.arange(len(df.index)) + 0.5
col_ticks = np.arange(len(df.columns)) + 0.5
plt.yticks(row_ticks, df.index)
plt.xticks(col_ticks, df.columns)
plt.show()
(The color difference is just because the "RdYlGn" colormap uses darker greens and reds as its endpoints.)
On a side note, it's also considerably faster to use pcolormesh for this, rather than pcolor. For small arrays, it won't make a significant difference, but for large arrays pcolor is excessively slow. imshow is even faster yet, if you don't mind raster output. Use imshow(data, interpolation='nearest', aspect='auto', origin='lower') to match the defaults of pcolor and pcolormesh.
You can make a 2 color colormap.
Then you can set the cutoff value between red and green.

Categories

Resources