Seaborn catplot Sort by Count column - python

Problem
I have data that looks like the following:
Month
Product
SalesCount
1
4
94
1
6
38
1
2
56
1
7
47
I would like:
Display a histogram and sort them by SalesCount, from highest to lowest.
Display all labels and titles.
What I've Tried
import numpy as np
import seaborn as sns
rng = np.random.default_rng()
dft = pd.DataFrame({'Month': 1,
'Product': rng.choice(30, size=30, replace=False),
'SalesCount': np.random.randint(1, 100, 30),
})
# Try to sort the dataframe
#dft = dft.sort_values(by=['SalesCount'])
print(dft)
g = sns.catplot(data=dft, kind='bar', x='Product', y='SalesCount', height=6, aspect=1.8, facecolor=(0.3,0.3,0.7,1))
#, order=dft[['Product', 'SalesCount']].index
(g.set_axis_labels('Product', 'Count')
.set_titles('test'))
Which shows chart similar to this:
I have tried sorting the dataframe first (dft = dft.sort_values(by=['SalesCount'])) and also adding the order parameter (order=dft[['Product', 'SalesCount']].index) to the sns.catplot call. Neither of these attempts sorts the histogram.
The second issue I have is adding the title. I have tried calling .set_titles('test') on the FacetGrid instance returned by sns.catplot, but the title does not show up.
Thanks!

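You may need to make your Product column a string instead of an integer: seaborn orders numeric categories by their value regardless of row order, whereas string categories are drawn in the order in which they appear in the (sorted) dataframe. This should work.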
import numpy as np
import pandas as pd
import seaborn as sns

# Sample data: 30 distinct products for a single month, with random sales.
rng = np.random.default_rng()
dft = pd.DataFrame({
    'Month': 1,
    'Product': rng.choice(30, size=30, replace=False),
    'SalesCount': rng.integers(1, 100, size=30),
})

# The bars should run from the highest to the lowest SalesCount.
dft = dft.sort_values(by='SalesCount', ascending=False)
# Numeric categories are ordered by value; string labels keep the row order.
dft['Product'] = dft['Product'].astype(str)
print(dft)

g = sns.catplot(
    data=dft, kind='bar', x='Product', y='SalesCount',
    # Pass the sorted labels explicitly so the bar order never depends on
    # how seaborn infers the category order.
    order=dft['Product'].tolist(),
    height=6, aspect=1.8, facecolor=(0.3, 0.3, 0.7, 1),
)
# set_titles() only fills per-facet titles (row=/col=); this grid has a single
# un-faceted axes, so the title is set on the axes directly.
g.set_axis_labels('Product', 'Count').set(title='test')

Related

Pandas Seaborn FacetGrid same x labels on every plot, though different y values

I'm trying to display FacetGrid with barplots so that it displays data of yellow_cards count (y) and team name (x), divided by different football leagues (other plots should show other leagues, and other team names). The data is being counted correctly but the display shows only the first league on every plot.
Here's the code snippet i'm using to build the FacetGrid:
df_alt2_teams = df_alt2.groupby(['league', 'squad'])['cards_yellow', 'cards_red'].sum().reset_index()
df_alt2_teams = df_alt2_teams.sort_values(by=['cards_yellow', 'cards_red'], ascending=True)
g = sns.FacetGrid(df_alt2_teams, col='league', height=8, aspect=4)
g = g.map(sns.barplot, 'squad', 'cards_yellow', palette="flare", data=df_alt2_teams)
g.set(ylim=(0, 50))
g.set_xticklabels(rotation=90)
the data differs, but labels don't
data example:
index league squad cards_red cards_yellow
52 Ligue 1 Strasbourg 1.0 2.0
57 Premier League Brighton 1.0 3.0
If you do not need ordered bars, you can directly plot the dataframe and let seaborn do all the calculations:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Test data generation: raw (un-aggregated) card counts per squad.
n = 30
np.random.seed(123)
df_alt2 = pd.DataFrame({"index": np.arange(n),
                        "league": "Ligue 1",
                        "squad": np.random.choice(list("ABCDXYZ"), n),
                        "cards_red": np.random.randint(0, 3, n),
                        "cards_yellow": np.random.randint(0, 5, n)})
# Assign through a single .loc call; the chained form
# df.loc[:, "league"][mask] = ... may write to a temporary copy.
df_alt2.loc[df_alt2["squad"].isin(list("XYZ")), "league"] = "Premier League"

# Let seaborn sum the cards per squad; sharex=False gives every league
# its own set of squad labels on the x axis.
# NOTE: seaborn >= 0.12 spells ci=None as errorbar=None.
g = sns.catplot(data=df_alt2, x="squad", y="cards_yellow", col="league",
                kind="bar", estimator=sum, ci=None, sharex=False)
g.set(ylim=(0, 20))
g.set_xticklabels(rotation=90)
plt.tight_layout()
plt.show()
Sample output:
I would not know how to convince seaborn to order the bars by value. So in this case,
you might have to revert to your precalculation:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Test data generation: raw (un-aggregated) card counts per squad.
n = 30
np.random.seed(123)
df_alt2 = pd.DataFrame({"index": np.arange(n),
                        "league": "Ligue 1",
                        "squad": np.random.choice(list("ABCDXYZ"), n),
                        "cards_red": np.random.randint(0, 3, n),
                        "cards_yellow": np.random.randint(0, 5, n)})
# Assign through a single .loc call; the chained form
# df.loc[:, "league"][mask] = ... may write to a temporary copy.
df_alt2.loc[df_alt2["squad"].isin(list("XYZ")), "league"] = "Premier League"

# Aggregate first so the rows (and therefore the bars) can be sorted by value.
# Selecting several columns from a groupby needs a list, hence [[...]].
df_alt2_teams = (
    df_alt2.groupby(['league', 'squad'])[['cards_yellow', 'cards_red']]
    .sum()
    .reset_index()
)
df_alt2_teams = df_alt2_teams.sort_values(by=['cards_yellow', 'cards_red'], ascending=True)

# sharex=False gives every league its own set of squad labels on the x axis.
g = sns.catplot(data=df_alt2_teams, x="squad", y="cards_yellow", col="league",
                kind="bar", sharex=False)
g.set(ylim=(0, 20))
g.set_xticklabels(rotation=90)
plt.tight_layout()
plt.show()
Output:

Scatter and curve plot using matplotlib

I am trying to plot the evolution of the accuracy of NN models over time. I have an Excel file with data like the following:
and I wrote the following code to extract data and plot the scatter:
import pandas as pd
data = pd.read_excel (r'SOTA DNN.xlsx')
acc1 = pd.DataFrame(data, columns= ['Top-1-Acc'])
para = pd.DataFrame(data, columns= ['Parameters'])
dates = pd.to_datetime(data['Date'], format='%Y-%m-%d')
import matplotlib.pyplot as plt
plt.grid(True)
plt.ylim(40, 100)
plt.scatter(dates, acc1)
plt.show()
Is there a way to draw a line in the same figure to show only the ones achieving the maximum and print their names at the same time as in this example:
is it also possible to stretch the x-axis (for the dates)?
It is still not clear what you mean by "stretch the x-axis" and you did not provide your dataset, but here is a possible general approach:
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fake data generation; substitute this with your .xls import routine.
np.random.seed(1234)
n = 30
acc = np.concatenate([np.random.randint(0, 10, 10),
                      np.random.randint(0, 30, 10),
                      np.random.randint(0, 100, n - 20)])
date_range = pd.date_range("20190101", periods=n)
# Random 5-character model names, built with public NumPy API only
# (pandas._testing.rands_array is a private helper and not a stable import).
alphabet = list(string.ascii_letters + string.digits)
model = ["".join(np.random.choice(alphabet, size=5)) for _ in range(n)]
df = pd.DataFrame({"Model": model, "Date": date_range, "TopAcc": acc})
# Shuffle the rows so the example does not depend on pre-sorted input.
df = df.sample(frac=1).reset_index(drop=True)

# Now to the actual data modification.
# After sorting by date, a row is a new record holder when the running
# maximum increases, i.e. when the diff of cummax() is positive.
df = df.sort_values("Date").reset_index(drop=True)
df["Max"] = df.TopAcc.cummax().diff()
# diff() leaves NaN in the first row; the earliest model is always a record.
df.loc[0, "Max"] = 1
dfmax = df[df.Max > 0]

# Plot all data, followed by the best performers.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(df.Date, df.TopAcc, c="grey")
ax.plot(dfmax.Date, dfmax.TopAcc, marker="x", c="blue")

# Finally, annotate the best performers with their model name.
for _, xylabel in dfmax.iterrows():
    ax.text(xylabel.Date, xylabel.TopAcc, xylabel.Model, c="blue",
            horizontalalignment="right", verticalalignment="bottom")
plt.show()
Sample output:

for-Loop to creat LinePlots with seaborn in DataFrame

i am a beginner with coding with python and i have a question:
This code works fine to create a chart for a single column:
The Main DF is:
enter image description here
1- Removing Outliers:
def remove_outliers(df_in, col):
    """Return the rows of *df_in* whose *col* value lies within the Tukey fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` computed on
    ``df_in[col]``. Values exactly on a fence are kept, so a constant column
    (IQR of zero) is returned unchanged instead of being emptied.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input data; it is not modified.
    col : str
        Name of the numeric column used to detect outliers.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_in`` (all columns) that are not outliers in ``col``.
        Rows where ``col`` is NaN are dropped because they fail the comparison.
    """
    q1 = df_in[col].quantile(0.25)
    q3 = df_in[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Series.between is inclusive on both ends by default.
    inside_fences = df_in[col].between(lower_bound, upper_bound)
    return df_in.loc[inside_fences]
2- Define the Format of the Lineplot
rc={'axes.labelsize': 20, 'font.size': 20, 'legend.fontsize':20,'axes.titlesize':20,'xtick.labelsize': 14,'ytick.labelsize': 14, 'lines.linewidth':1, 'lines.markersize':7, 'xtick.major.pad':10}
sns.set(rc=rc)
3- Creat a Lineplot with seaborn:
df1_DH001= remove_outliers(main_df, 'DH001')[['DH 001','Datum']]
df1_DH001_chart= sns.scatterplot(x='Datum', y='DH 001', data=df1_DH001)
df1_DH001_chart= sns.lineplot(x='Datum', y='DH 001', data=df1_DH001, lw=3, color="b")
df1_DH001_chart.set(xlim=('1995','2019'), ylim=(0, 220) ,title='DH 001', ylabel='Nitrat mg/L', xlabel="Jahr")
df1_DH001_chart.xaxis.set_major_locator(mdates.YearLocator(1))
df1_DH001_chart.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
df1_DH001_chart
So I got this:
enter image description here
Now I would like to create a for-loop that produces the same plot with the same x-axis (Datum) for each of the other columns (there are 22 columns).
Could some one help me?
Import the following:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Create a sample DF:
data = {'day': ['Mon','Tue','Wed','Thu'],
'col1': [22000,25000,27000,35000],
'col2': [2200,2500,2700,3500],
}
df = pd.DataFrame(data)
Select only numeric columns from your DF or alternatively select the columns that you want to consider in the loop:
# 'number' matches every numeric dtype; the np.int / np.float aliases
# were removed from NumPy and raise AttributeError there.
df1 = df.select_dtypes(include='number')
Iterate through the columns and print a line plot with seaborn:
# Draw one seaborn line plot per column of df1, using 'day' as the x axis.
for i, col in enumerate(df1.columns):
    # Switch to (or create) figure number i so each column gets its own plot.
    plt.figure(i)
    sns.lineplot(x='day',y=col, data=df)
Then the following pictures will be shown:

multiple boxplots by date in index

My dataframe
index Dates Hours_played
0 2014-11-06 11
1 2014-12-06 4
2 2015-09-06 5
3 2015-97-06 5
Then, I set Dates as index:
Hours_played
Dates
2014-11-06 11
2014-12-06 4
2015-09-06 5
2015-97-06 5
The Problem: When I tried to create one box plot for each year found in index, I got both plots on the same grid.
df.loc['2014']['Hours_played'].plot.box(ylim=(0,200))
df.loc['2015']['Hours_played'].plot.box(ylim=(0,200))
I tried the following but the plot comes up empty:
data_2015 = df.loc['2015']['Hours_played']
data_2016 = df.loc['2016']['Hours_played']
data_to_plot = [data_2015, data_2016]
mpl_fig = plt.figure()
ax = mpl_fig.add_subplot(111)
ax.boxplot(data_to_plot)
ax.set_ylim(0,300)
Is it possible to have them in the same grid, one by the other?
A simple solution will be grouping by year first and then making boxplot:
import io
import matplotlib.pyplot as plt
import pandas as pd
# Re-create your sample data
s = """Dates,Hours_played
2014-11-06,11
2014-12-06,4
2015-09-06,5
2015-07-06,5"""
df = pd.read_table(io.StringIO(s), sep=',', index_col=0, parse_dates=True)
# The following codes are the answer relevant to your question.
df.groupby(df.index.year).boxplot()
plt.show()
Your second method ends up with an empty plot because matplotlib fails to recognize the pandas.DataFrame correctly. Try using the NumPy array representation instead:
import io
import matplotlib.pyplot as plt
import pandas as pd

# Re-create your sample data
s = """Dates,Hours_played
2014-11-06,11
2014-12-06,4
2015-09-06,5
2015-07-06,5"""
df = pd.read_csv(io.StringIO(s), index_col=0, parse_dates=True)

# The following code is the answer relevant to your question.
# Select the column as a Series so each entry is a 1-D array, and use
# to_numpy(): DataFrame.as_matrix() no longer exists in current pandas.
data_2014 = df.loc[df.index.year == 2014, 'Hours_played'].to_numpy()
data_2015 = df.loc[df.index.year == 2015, 'Hours_played'].to_numpy()
data_to_plot = [data_2014, data_2015]

mpl_fig = plt.figure()
ax = mpl_fig.add_subplot(111)
ax.boxplot(data_to_plot)
# Boxes are drawn at positions 1, 2, ...; label them with their year.
ax.set_xticklabels(['2014', '2015'])
plt.show()
To use subplots, you will need to plot them one by one:
import io
import matplotlib.pyplot as plt
import pandas as pd
# Re-create your sample data
s = """Dates,Hours_played
2014-11-06,11
2014-12-06,4
2015-09-06,5
2015-07-06,5"""
df = pd.read_table(io.StringIO(s), sep=',', parse_dates=[0])
df['Year'] = df.Dates.dt.year
df.set_index(['Year', 'Dates'], inplace=True)
# The following codes are the answer relevant to your question.
mpl_fig = plt.figure()
ax1 = mpl_fig.add_subplot(121)
ax1.boxplot(df.loc[2014]['Hours_played'], labels=[2014])
ax2 = mpl_fig.add_subplot(122)
ax2.boxplot(df.loc[2015]['Hours_played'], labels=[2015])
plt.show()
Let's reshape the data with Years in columns and boxplot:
df.set_index(['Dates',df.Dates.dt.year])['Hours_played'].unstack().boxplot()
If you want to put all the boxes in the same plot, you can do something like this:
import matplotlib.pyplot as plt
def setBoxColors(bp, num_plots):
    """Color every artist of the first *num_plots* boxes of a boxplot.

    Parameters
    ----------
    bp : dict
        The dictionary returned by ``boxplot``; the keys 'boxes', 'medians',
        'caps', 'whiskers' and 'fliers' are read.
    num_plots : int
        Number of boxes in ``bp`` to color. Box ``idx`` receives the
        ``idx``-th palette color; the palette repeats after three boxes.

    Notes
    -----
    Each box owns two caps and two whiskers but a single flier artist, so
    the fliers are indexed by ``idx`` rather than ``2 * idx``. Slices are
    used for the per-box groups so that a group that was not drawn
    (e.g. ``showfliers=False``) is skipped instead of raising IndexError.
    """
    palette = ['red', 'blue', 'green']
    for idx in range(num_plots):
        shade = palette[idx % len(palette)]
        artists = [bp['boxes'][idx], bp['medians'][idx]]
        artists += bp['caps'][2 * idx:2 * idx + 2]
        artists += bp['whiskers'][2 * idx:2 * idx + 2]
        artists += bp['fliers'][idx:idx + 1]
        for artist in artists:
            # Same effect as plt.setp(artist, color=shade).
            artist.set_color(shade)
# Some fake data to plot
A = [[1, 2, 5]]
B = [[3, 4, 5]]
C = [[1, 7, 10]]

fig, ax = plt.subplots()

# Successive plotting calls add to the same axes by default, so the removed
# plt.hold(True) call is not needed.
for samples, position in ((A, 2), (B, 6), (C, 10)):
    bp = ax.boxplot(samples, positions=[position], widths=0.6, patch_artist=True)
    setBoxColors(bp, 1)

# set axes limits and labels
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
# Fix the tick positions first, then attach one label to each of them.
ax.set_xticks([2, 6, 10])
ax.set_xticklabels(['A', 'B', 'C'])

# draw temporary legend: plot a red line only to obtain a legend handle,
# then hide the line itself
hB, = ax.plot([1, 1], 'r-')
ax.legend((hB,), ('Type1',))
hB.set_visible(False)
plt.show()
With the help of Scott Boston, Y. Luo, and yuhow5566, I was able to devise an interesting answer. From Scott, I learned that it's better not to index the Dates (keep them a regular column) for this type of boxplot; and from Y. Luo, I learned how to create a new column, while isolating the year from a datetime value.
df['Year'] = s['Dates'].dt.year
df.boxplot(column='Hours_played', by='Year', figsize=(9,9))

Color by Column Values in Matplotlib

One of my favorite aspects of using the ggplot2 library in R is the ability to easily specify aesthetics. I can quickly make a scatterplot and apply color associated with a specific column and I would love to be able to do this with python/pandas/matplotlib. I'm wondering if there are any convenience functions that people use to map colors to values using pandas dataframes and Matplotlib?
##ggplot scatterplot example with R dataframe, `df`, colored by col3
ggplot(data = df, aes(x=col1, y=col2, color=col3)) + geom_point()
##ideal situation with pandas dataframe, 'df', where colors are chosen by col3
df.plot(x=col1,y=col2,color=col3)
EDIT:
Thank you for your responses but I want to include a sample dataframe to clarify what I am asking. Two columns contain numerical data and the third is a categorical variable. The script I am thinking of will assign colors based on this value.
np.random.seed(250)
df = pd.DataFrame({'Height': np.append(np.random.normal(6, 0.25, size=5), np.random.normal(5.4, 0.25, size=5)),
'Weight': np.append(np.random.normal(180, 20, size=5), np.random.normal(140, 20, size=5)),
'Gender': ["Male","Male","Male","Male","Male",
"Female","Female","Female","Female","Female"]})
Height Weight Gender
0 5.824970 159.210508 Male
1 5.780403 180.294943 Male
2 6.318295 199.142201 Male
3 5.617211 157.813278 Male
4 6.340892 191.849944 Male
5 5.625131 139.588467 Female
6 4.950479 146.711220 Female
7 5.617245 121.571890 Female
8 5.556821 141.536028 Female
9 5.714171 134.396203 Female
Imports and Data
import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
# The module is imported as `sns`, so the bare name `seaborn` is undefined.
sns.set(style='ticks')
numpy.random.seed(0)
N = 37
_genders= ['Female', 'Male', 'Non-binary', 'No Response']
df = pandas.DataFrame({
'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
'Gender': numpy.random.choice(_genders, size=N)
})
Update August 2021
With seaborn 0.11.0, it's recommended to use new figure level functions like seaborn.relplot than to use FacetGrid directly.
sns.relplot(data=df, x='Weight (kg)', y='Height (cm)', hue='Gender', hue_order=_genders, aspect=1.61)
plt.show()
Update October 2015
Seaborn handles this use-case splendidly:
Map matplotlib.pyplot.scatter onto a seaborn.FacetGrid
fg = sns.FacetGrid(data=df, hue='Gender', hue_order=_genders, aspect=1.61)
fg.map(plt.scatter, 'Weight (kg)', 'Height (cm)').add_legend()
Which immediately outputs:
Old Answer
In this case, I would use matplotlib directly.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def dfScatter(df, xcol='Height', ycol='Weight', catcol='Gender'):
    """Scatter *ycol* against *xcol*, colored by the categories in *catcol*.

    Each distinct value of ``df[catcol]`` is mapped to an evenly spaced number
    in [0, 1]; matplotlib turns those numbers into colors through its
    colormap.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is only read; no helper column is added to it.
    xcol, ycol : str
        Names of the columns holding the x and y coordinates.
    catcol : str
        Name of the categorical column that selects the color.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure containing the scatter plot.
    """
    fig, ax = plt.subplots()
    categories = np.unique(df[catcol])
    positions = np.linspace(0, 1, len(categories))
    colordict = dict(zip(categories, positions))
    # Keep the per-point colors in a local Series instead of writing a
    # "Color" column into the caller's DataFrame.
    point_colors = df[catcol].map(colordict)
    ax.scatter(df[xcol], df[ycol], c=point_colors)
    return fig
if 1:
df = pd.DataFrame({'Height':np.random.normal(size=10),
'Weight':np.random.normal(size=10),
'Gender': ["Male","Male","Unknown","Male","Male",
"Female","Did not respond","Unknown","Female","Female"]})
fig = dfScatter(df)
fig.savefig('fig1.png')
And that gives me:
As far as I know, that color column can be any matplotlib-compatible color (RGBA tuples, HTML names, hex values, etc.).
I'm having trouble getting anything but numerical values to work with the colormaps.
Actually you could use ggplot for python:
from ggplot import *
import numpy as np
import pandas as pd
df = pd.DataFrame({'Height':np.random.randn(10),
'Weight':np.random.randn(10),
'Gender': ["Male","Male","Male","Male","Male",
"Female","Female","Female","Female","Female"]})
ggplot(aes(x='Height', y='Weight', color='Gender'), data=df) + geom_point()
https://seaborn.pydata.org/generated/seaborn.scatterplot.html
import numpy
import pandas
import seaborn as sns
numpy.random.seed(0)
N = 37
_genders= ['Female', 'Male', 'Non-binary', 'No Response']
df = pandas.DataFrame({
'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
'Gender': numpy.random.choice(_genders, size=N)
})
sns.scatterplot(data=df, x='Height (cm)', y='Weight (kg)', hue='Gender')
You can use the color parameter to the plot method to define the colors you want for each column. For example:
from pandas import DataFrame
data = DataFrame({'a':range(5),'b':range(1,6),'c':range(2,7)})
colors = ['yellowgreen','cyan','magenta']
data.plot(color=colors)
You can use color names or Color hex codes like '#000000' for black say. You can find all the defined color names in matplotlib's color.py file. Below is the link for the color.py file in matplotlib's github repo.
https://github.com/matplotlib/matplotlib/blob/master/lib/matplotlib/colors.py
The OP is coloring by a categorical column, but this answer is for coloring by a column that is numeric, or can be interpreted as numeric, such as a datetime dtype.
pandas.DataFrame.plot and matplotlib.pyplot.scatter can take a c or color parameter, which must be a color, a sequence of colors, or a sequence of numbers.
Tested in python 3.8, pandas 1.3.1, and matplotlib 3.4.2
Choosing Colormaps in Matplotlib for other valid cmap options.
Imports and Test Data
'Date' is already a datetime64[ns] dtype from DataReader
conda install -c anaconda pandas-datareader or pip install pandas-datareader depending on your environment.
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader as web # for data; not part of pandas
# The following lines read `ticker` (singular), so the name must match.
ticker = 'amzn'
df = web.DataReader(ticker, data_source='yahoo', start='2018-01-01', end='2021-01-01').reset_index()
df['ticker'] = ticker
Date High Low Open Close Volume Adj Close ticker
0 2018-01-02 1190.00000 1170.510010 1172.000000 1189.010010 2694500 1189.010010 amzn
1 2018-01-03 1205.48999 1188.300049 1188.300049 1204.199951 3108800 1204.199951 amzn
c as a number
pandas.DataFrame.plot
df.Date.dt.month creates a pandas.Series of month numbers
ax = df.plot(kind='scatter', x='Date', y='High', c=df.Date.dt.month, cmap='Set3', figsize=(11, 4), title='c parameter as a month number')
plt.show()
matplotlib.pyplot.scatter
fig, ax = plt.subplots(figsize=(11, 4))
ax.scatter(data=df, x='Date', y='High', c=df.Date.dt.month, cmap='Set3')
ax.set(title='c parameter as a month number', xlabel='Date', ylabel='High')
plt.show()
c as a datetime dtype
pandas.DataFrame.plot
ax = df.plot(kind='scatter', x='Date', y='High', c='Date', cmap='winter', figsize=(11, 4), title='c parameter as a datetime dtype')
plt.show()
matplotlib.pyplot.scatter
fig, ax = plt.subplots(figsize=(11, 4))
ax.scatter(data=df, x='Date', y='High', c='Date', cmap='winter')
ax.set(title='c parameter as a datetime dtype', xlabel='Date', ylabel='High')
plt.show()
Though not matplotlib, you can achieve this using plotly express:
import numpy as np
import pandas as pd
import plotly.express as px
df = pd.DataFrame({
'Height':np.random.normal(size=10),
'Weight':np.random.normal(size=10),
'Size': 1, # How large each point should be?
'Gender': ["Male","Male","Male","Male","Male","Female","Female","Female","Female","Female"]})
# Create your plot
px.scatter(df, x='Weight', y='Height', size='Size', color='Gender')
If creating in a notebook, you'll get an interactive output like the following:

Categories

Resources